Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 500 |
1 file changed, 330 insertions, 170 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index ab98dc6..99082fa 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -95,6 +95,8 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ int may_swap; + int swappiness; + int order; /* @@ -171,8 +173,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone, struct scan_control *sc, enum lru_list lru) { if (!scanning_global_lru(sc)) - return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, - zone_to_nid(zone), zone_idx(zone), BIT(lru)); + return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru); return zone_page_state(zone, NR_LRU_BASE + lru); } @@ -183,7 +184,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone, */ void register_shrinker(struct shrinker *shrinker) { - atomic_long_set(&shrinker->nr_in_batch, 0); + shrinker->nr = 0; down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); up_write(&shrinker_rwsem); @@ -252,8 +253,6 @@ unsigned long shrink_slab(struct shrink_control *shrink, int shrink_ret = 0; long nr; long new_nr; - long batch_size = shrinker->batch ? shrinker->batch - : SHRINK_BATCH; max_pass = do_shrinker_shrink(shrinker, shrink, 0); if (max_pass <= 0) @@ -264,7 +263,9 @@ unsigned long shrink_slab(struct shrink_control *shrink, * and zero it so that other concurrent shrinker invocations * don't also do this scanning work. */ - nr = atomic_long_xchg(&shrinker->nr_in_batch, 0); + do { + nr = shrinker->nr; + } while (cmpxchg(&shrinker->nr, nr, 0) != nr); total_scan = nr; delta = (4 * nr_pages_scanned) / shrinker->seeks; @@ -305,18 +306,19 @@ unsigned long shrink_slab(struct shrink_control *shrink, nr_pages_scanned, lru_pages, max_pass, delta, total_scan); - while (total_scan >= batch_size) { + while (total_scan >= SHRINK_BATCH) { + long this_scan = SHRINK_BATCH; int nr_before; nr_before = do_shrinker_shrink(shrinker, shrink, 0); shrink_ret = do_shrinker_shrink(shrinker, shrink, - batch_size); + this_scan); if (shrink_ret == -1) break; if (shrink_ret < nr_before) ret += nr_before - shrink_ret; - count_vm_events(SLABS_SCANNED, batch_size); - total_scan -= batch_size; + count_vm_events(SLABS_SCANNED, this_scan); + total_scan -= this_scan; cond_resched(); } @@ -326,11 +328,12 @@ unsigned long shrink_slab(struct shrink_control *shrink, * manner that handles concurrent updates. If we exhausted the * scan, there is no need to do an update. */ - if (total_scan > 0) - new_nr = atomic_long_add_return(total_scan, - &shrinker->nr_in_batch); - else - new_nr = atomic_long_read(&shrinker->nr_in_batch); + do { + nr = shrinker->nr; + new_nr = total_scan + nr; + if (total_scan <= 0) + break; + } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); } @@ -495,6 +498,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, return PAGE_ACTIVATE; } + /* + * Wait on writeback if requested to. This happens when + * direct reclaiming a large contiguous area and the + * first attempt to free a range of pages fails. + */ + if (PageWriteback(page) && + (sc->reclaim_mode & RECLAIM_MODE_SYNC)) + wait_on_page_writeback(page); + if (!PageWriteback(page)) { /* synchronous write or broken a_ops? 
*/ ClearPageReclaim(page); @@ -633,14 +645,13 @@ redo: lru = LRU_UNEVICTABLE; add_page_to_unevictable_list(page); /* - * When racing with an mlock or AS_UNEVICTABLE clearing - * (page is unlocked) make sure that if the other thread - * does not observe our setting of PG_lru and fails - * isolation/check_move_unevictable_pages, - * we see PG_mlocked/AS_UNEVICTABLE cleared below and move + * When racing with an mlock clearing (page is + * unlocked), make sure that if the other thread does + * not observe our setting of PG_lru and fails + * isolation, we see PG_mlocked cleared below and move * the page back to the evictable list. * - * The other side is TestClearPageMlocked() or shmem_lock(). + * The other side is TestClearPageMlocked(). */ smp_mb(); } @@ -715,15 +726,19 @@ static enum page_references page_check_references(struct page *page, */ SetPageReferenced(page); +#ifndef CONFIG_DMA_CMA + if (referenced_page) + return PAGEREF_ACTIVATE; +#else if (referenced_page || referenced_ptes > 1) return PAGEREF_ACTIVATE; /* * Activate file-backed executable pages after first usage. - */ + */ if (vm_flags & VM_EXEC) return PAGEREF_ACTIVATE; - +#endif return PAGEREF_KEEP; } @@ -755,12 +770,12 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages) /* * shrink_page_list() returns the number of reclaimed pages */ -static unsigned long shrink_page_list(struct list_head *page_list, +#ifndef CONFIG_ZRAM_FOR_ANDROID +static +#endif /* CONFIG_ZRAM_FOR_ANDROID */ +unsigned long shrink_page_list(struct list_head *page_list, struct zone *zone, - struct scan_control *sc, - int priority, - unsigned long *ret_nr_dirty, - unsigned long *ret_nr_writeback) + struct scan_control *sc) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); @@ -768,7 +783,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, unsigned long nr_dirty = 0; unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; - unsigned long nr_writeback = 0; cond_resched(); @@ -805,12 +819,13 @@ static unsigned long shrink_page_list(struct list_head *page_list, (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); if (PageWriteback(page)) { - nr_writeback++; /* - * Synchronous reclaim cannot queue pages for - * writeback due to the possibility of stack overflow - * but if it encounters a page under writeback, wait - * for the IO to complete. + * Synchronous reclaim is performed in two passes, + * first an asynchronous pass over the list to + * start parallel writeback, and a second synchronous + * pass to wait for the IO to complete. Wait here + * for any page for which writeback has already + * started. */ if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && may_enter_fs) @@ -866,25 +881,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PageDirty(page)) { nr_dirty++; - /* - * Only kswapd can writeback filesystem pages to - * avoid risk of stack overflow but do not writeback - * unless under significant pressure. - */ - if (page_is_file_cache(page) && - (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) { - /* - * Immediately reclaim when written back. 
- * Similar in principal to deactivate_page() - * except we already have the page isolated - * and know it's dirty - */ - inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); - SetPageReclaim(page); - - goto keep_locked; - } - if (references == PAGEREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs) @@ -1019,8 +1015,6 @@ keep_lumpy: list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); - *ret_nr_dirty += nr_dirty; - *ret_nr_writeback += nr_writeback; return nr_reclaimed; } @@ -1063,8 +1057,12 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) * unevictable; only give shrink_page_list evictable pages. */ if (PageUnevictable(page)) +#ifndef CONFIG_DMA_CMA return ret; - +#else + printk(KERN_ERR "%s[%d] Unevictable page %p\n", + __func__, __LINE__, page); +#endif ret = -EBUSY; /* @@ -1281,7 +1279,10 @@ static unsigned long isolate_pages_global(unsigned long nr, * clear_active_flags() is a helper for shrink_active_list(), clearing * any active bits from the pages in the list. */ -static unsigned long clear_active_flags(struct list_head *page_list, +#ifndef CONFIG_ZRAM_FOR_ANDROID +static +#endif /* CONFIG_ZRAM_FOR_ANDROID */ +unsigned long clear_active_flags(struct list_head *page_list, unsigned int *count) { int nr_active = 0; @@ -1351,6 +1352,40 @@ int isolate_lru_page(struct page *page) return ret; } +#ifdef CONFIG_ZRAM_FOR_ANDROID +/** + * isolate_lru_page_compcache - tries to isolate a page for compcache + * @page: page to isolate from its LRU list + * + * Isolates a @page from an LRU list, clears PageLRU,but + * does not adjusts the vmstat statistic + * Returns 0 if the page was removed from an LRU list. + * Returns -EBUSY if the page was not on an LRU list. + */ +int isolate_lru_page_compcache(struct page *page) +{ + int ret = -EBUSY; + + VM_BUG_ON(!page_count(page)); + + if (PageLRU(page)) { + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page)) { + int lru = page_lru(page); + ret = 0; + get_page(page); + ClearPageLRU(page); + list_del(&page->lru); + mem_cgroup_del_lru_list(page, lru); + } + spin_unlock_irq(&zone->lru_lock); + } + return ret; +} +#endif + /* * Are there way too many processes in the direct reclaim path already? */ @@ -1458,7 +1493,7 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone, } /* - * Returns true if a direct reclaim should wait on pages under writeback. + * Returns true if the caller should wait to clean dirty/writeback pages. * * If we are direct reclaiming for contiguous pages and we do not reclaim * everything in the list, try again and wait for writeback IO to complete. 
@@ -1480,7 +1515,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) return false; - /* If we have reclaimed everything on the isolated list, no stall */ + /* If we have relaimed everything on the isolated list, no stall */ if (nr_freed == nr_taken) return false; @@ -1512,8 +1547,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, unsigned long nr_taken; unsigned long nr_anon; unsigned long nr_file; - unsigned long nr_dirty = 0; - unsigned long nr_writeback = 0; isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; while (unlikely(too_many_isolated(zone, file, sc))) { @@ -1566,14 +1599,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority, - &nr_dirty, &nr_writeback); + nr_reclaimed = shrink_page_list(&page_list, zone, sc); /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { set_reclaim_mode(priority, sc, true); - nr_reclaimed += shrink_page_list(&page_list, zone, sc, - priority, &nr_dirty, &nr_writeback); + nr_reclaimed += shrink_page_list(&page_list, zone, sc); } local_irq_disable(); @@ -1583,32 +1614,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); - /* - * If reclaim is isolating dirty pages under writeback, it implies - * that the long-lived page allocation rate is exceeding the page - * laundering rate. Either the global limits are not being effective - * at throttling processes due to the page distribution throughout - * zones or there is heavy usage of a slow backing device. The - * only option is to throttle from reclaim context which is not ideal - * as there is no guarantee the dirtying process is throttled in the - * same way balance_dirty_pages() manages. - * - * This scales the number of dirty pages that must be under writeback - * before throttling depending on priority. It is a simple backoff - * function that has the most effect in the range DEF_PRIORITY to - * DEF_PRIORITY-2 which is the priority reclaim is considered to be - * in trouble and reclaim is considered to be in trouble. - * - * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle - * DEF_PRIORITY-1 50% must be PageWriteback - * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble - * ... 
- * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any - * isolated page is PageWriteback - */ - if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority))) - wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); - trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, zone_idx(zone), nr_scanned, nr_reclaimed, @@ -1617,6 +1622,44 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, return nr_reclaimed; } +#ifdef CONFIG_ZRAM_FOR_ANDROID +unsigned long +zone_id_shrink_pagelist(struct zone *zone, struct list_head *page_list) +{ + unsigned long nr_reclaimed = 0; + unsigned long nr_anon; + unsigned long nr_file; + + struct scan_control sc = { + .gfp_mask = GFP_USER, + .may_writepage = 1, + .nr_to_reclaim = SWAP_CLUSTER_MAX, + .may_unmap = 1, + .may_swap = 1, + .swappiness = vm_swappiness, + .order = 0, + .mem_cgroup = NULL, + .nodemask = NULL, + }; + + spin_lock_irq(&zone->lru_lock); + + update_isolated_counts(zone, &sc, &nr_anon, &nr_file, page_list); + + spin_unlock_irq(&zone->lru_lock); + + nr_reclaimed = shrink_page_list(page_list, zone, &sc); + + __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); + + putback_lru_pages(zone, &sc, nr_anon, nr_file, page_list); + + return nr_reclaimed; +} + +EXPORT_SYMBOL(zone_id_shrink_pagelist); +#endif /* CONFIG_ZRAM_FOR_ANDROID */ + /* * This moves pages from the active list to the inactive list. * @@ -1804,7 +1847,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) if (scanning_global_lru(sc)) low = inactive_anon_is_low_global(zone); else - low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone); + low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); return low; } #else @@ -1847,7 +1890,7 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) if (scanning_global_lru(sc)) low = inactive_file_is_low_global(zone); else - low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone); + low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); return low; } @@ -1874,13 +1917,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); } -static int vmscan_swappiness(struct scan_control *sc) -{ - if (scanning_global_lru(sc)) - return vm_swappiness; - return mem_cgroup_swappiness(sc->mem_cgroup); -} - /* * Determine how aggressively the anon and file LRU lists should be * scanned. The relative value of each set of LRU lists is determined @@ -1900,20 +1936,13 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, enum lru_list l; int noswap = 0; bool force_scan = false; + unsigned long nr_force_scan[2]; - /* - * If the zone or memcg is small, nr[l] can be 0. This - * results in no scanning on this priority and a potential - * priority drop. Global direct reclaim can go to the next - * zone and tends to have no problems. Global kswapd is for - * zone balancing and it needs to scan a minimum amount. When - * reclaiming for a memcg, a priority drop can cause high - * latencies, so it's better to scan a minimum amount there as - * well. 
- */ + /* kswapd does zone balancing and needs to scan this zone */ if (scanning_global_lru(sc) && current_is_kswapd() && zone->all_unreclaimable) force_scan = true; + /* memcg may have small limit and need to avoid priority drop */ if (!scanning_global_lru(sc)) force_scan = true; @@ -1923,6 +1952,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fraction[0] = 0; fraction[1] = 1; denominator = 1; + nr_force_scan[0] = 0; + nr_force_scan[1] = SWAP_CLUSTER_MAX; goto out; } @@ -1939,6 +1970,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fraction[0] = 1; fraction[1] = 0; denominator = 1; + nr_force_scan[0] = SWAP_CLUSTER_MAX; + nr_force_scan[1] = 0; goto out; } } @@ -1947,8 +1980,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ - anon_prio = vmscan_swappiness(sc); - file_prio = 200 - vmscan_swappiness(sc); + anon_prio = sc->swappiness; + file_prio = 200 - sc->swappiness; /* * OK, so we have swap space and a fair amount of page cache @@ -1977,28 +2010,43 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, * proportional to the fraction of recently scanned pages on * each list that were recently referenced and in active use. */ - ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); + ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); ap /= reclaim_stat->recent_rotated[0] + 1; - fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); + fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; spin_unlock_irq(&zone->lru_lock); fraction[0] = ap; fraction[1] = fp; denominator = ap + fp + 1; + if (force_scan) { + unsigned long scan = SWAP_CLUSTER_MAX; + nr_force_scan[0] = div64_u64(scan * ap, denominator); + nr_force_scan[1] = div64_u64(scan * fp, denominator); + } out: for_each_evictable_lru(l) { int file = is_file_lru(l); unsigned long scan; scan = zone_nr_lru_pages(zone, sc, l); - if (priority || noswap || !vmscan_swappiness(sc)) { + if (priority || noswap) { scan >>= priority; - if (!scan && force_scan) - scan = SWAP_CLUSTER_MAX; scan = div64_u64(scan * fraction[file], denominator); } + + /* + * If zone is small or memcg is small, nr[l] can be 0. + * This results no-scan on this priority and priority drop down. + * For global direct reclaim, it can visit next zone and tend + * not to have problems. For global kswapd, it's for zone + * balancing and it need to scan a small amounts. When using + * memcg, priority drop can cause big latency. So, it's better + * to scan small amount. See may_noscan above. + */ + if (!scan && force_scan) + scan = nr_force_scan[file]; nr[l] = scan; } } @@ -2339,8 +2387,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, */ writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; if (total_scanned > writeback_threshold) { - wakeup_flusher_threads(laptop_mode ? 0 : total_scanned, - WB_REASON_TRY_TO_FREE_PAGES); + wakeup_flusher_threads(laptop_mode ? 
0 : total_scanned); sc->may_writepage = 1; } @@ -2391,6 +2438,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, .may_swap = 1, + .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, .nodemask = nodemask, @@ -2414,6 +2462,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, + unsigned int swappiness, struct zone *zone, unsigned long *nr_scanned) { @@ -2423,6 +2472,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, + .swappiness = swappiness, .order = 0, .mem_cgroup = mem, }; @@ -2451,7 +2501,8 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, gfp_t gfp_mask, - bool noswap) + bool noswap, + unsigned int swappiness) { struct zonelist *zonelist; unsigned long nr_reclaimed; @@ -2461,6 +2512,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .may_unmap = 1, .may_swap = !noswap, .nr_to_reclaim = SWAP_CLUSTER_MAX, + .swappiness = swappiness, .order = 0, .mem_cgroup = mem_cont, .nodemask = NULL, /* we don't care the placement */ @@ -2611,6 +2663,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * we want to put equal scanning pressure on each zone. */ .nr_to_reclaim = ULONG_MAX, + .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, }; @@ -2852,8 +2905,6 @@ out: /* If balanced, clear the congested flag */ zone_clear_flag(zone, ZONE_CONGESTED); - if (i <= *classzone_idx) - balanced += zone->present_pages; } } @@ -3016,10 +3067,7 @@ static int kswapd(void *p) } } - tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); current->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); - return 0; } @@ -3098,11 +3146,16 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) struct reclaim_state reclaim_state; struct scan_control sc = { .gfp_mask = GFP_HIGHUSER_MOVABLE, +#if defined(CONFIG_SLP) && defined(CONFIG_FULL_PAGE_RECLAIM) + .may_swap = 0, +#else .may_swap = 1, +#endif .may_unmap = 1, .may_writepage = 1, .nr_to_reclaim = nr_to_reclaim, .hibernation_mode = 1, + .swappiness = vm_swappiness, .order = 0, }; struct shrink_control shrink = { @@ -3292,6 +3345,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .nr_to_reclaim = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, + .swappiness = vm_swappiness, .order = order, }; struct shrink_control shrink = { @@ -3440,66 +3494,158 @@ int page_evictable(struct page *page, struct vm_area_struct *vma) return 1; } -#ifdef CONFIG_SHMEM /** - * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list - * @pages: array of pages to check - * @nr_pages: number of pages to check + * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list + * @page: page to check evictability and move to appropriate lru list + * @zone: zone page is in * - * Checks pages for evictability and moves them to the appropriate lru list. + * Checks a page for evictability and moves the page to the appropriate + * zone lru list. * - * This function is only used for SysV IPC SHM_UNLOCK. + * Restrictions: zone->lru_lock must be held, page must be on LRU and must + * have PageUnevictable set. 
*/ -void check_move_unevictable_pages(struct page **pages, int nr_pages) +static void check_move_unevictable_page(struct page *page, struct zone *zone) { - struct zone *zone = NULL; - int pgscanned = 0; - int pgrescued = 0; - int i; + VM_BUG_ON(PageActive(page)); - for (i = 0; i < nr_pages; i++) { - struct page *page = pages[i]; - struct zone *pagezone; +retry: + ClearPageUnevictable(page); + if (page_evictable(page, NULL)) { + enum lru_list l = page_lru_base_type(page); - pgscanned++; - pagezone = page_zone(page); - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } + __dec_zone_state(zone, NR_UNEVICTABLE); + list_move(&page->lru, &zone->lru[l].list); + mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); + __inc_zone_state(zone, NR_INACTIVE_ANON + l); + __count_vm_event(UNEVICTABLE_PGRESCUED); + } else { + /* + * rotate unevictable list + */ + SetPageUnevictable(page); + list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); + mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); + if (page_evictable(page, NULL)) + goto retry; + } +} - if (!PageLRU(page) || !PageUnevictable(page)) - continue; +/** + * scan_mapping_unevictable_pages - scan an address space for evictable pages + * @mapping: struct address_space to scan for evictable pages + * + * Scan all pages in mapping. Check unevictable pages for + * evictability and move them to the appropriate zone lru list. + */ +void scan_mapping_unevictable_pages(struct address_space *mapping) +{ + pgoff_t next = 0; + pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + struct zone *zone; + struct pagevec pvec; - if (page_evictable(page, NULL)) { - enum lru_list lru = page_lru_base_type(page); + if (mapping->nrpages == 0) + return; - VM_BUG_ON(PageActive(page)); - ClearPageUnevictable(page); - __dec_zone_state(zone, NR_UNEVICTABLE); - list_move(&page->lru, &zone->lru[lru].list); - mem_cgroup_move_lists(page, LRU_UNEVICTABLE, lru); - __inc_zone_state(zone, NR_INACTIVE_ANON + lru); - pgrescued++; + pagevec_init(&pvec, 0); + while (next < end && + pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + int i; + int pg_scanned = 0; + + zone = NULL; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + pgoff_t page_index = page->index; + struct zone *pagezone = page_zone(page); + + pg_scanned++; + if (page_index > next) + next = page_index; + next++; + + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + + if (PageLRU(page) && PageUnevictable(page)) + check_move_unevictable_page(page, zone); } + if (zone) + spin_unlock_irq(&zone->lru_lock); + pagevec_release(&pvec); + + count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned); } - if (zone) { - __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); - __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); +} + +/** + * scan_zone_unevictable_pages - check unevictable list for evictable pages + * @zone - zone of which to scan the unevictable list + * + * Scan @zone's unevictable LRU lists to check for pages that have become + * evictable. Move those that have to @zone's inactive list where they + * become candidates for reclaim, unless shrink_inactive_zone() decides + * to reactivate them. Pages that are still unevictable are rotated + * back onto @zone's unevictable list. 
+ */ +#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ +static void scan_zone_unevictable_pages(struct zone *zone) +{ + struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; + unsigned long scan; + unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); + + while (nr_to_scan > 0) { + unsigned long batch_size = min(nr_to_scan, + SCAN_UNEVICTABLE_BATCH_SIZE); + + spin_lock_irq(&zone->lru_lock); + for (scan = 0; scan < batch_size; scan++) { + struct page *page = lru_to_page(l_unevictable); + + if (!trylock_page(page)) + continue; + + prefetchw_prev_lru_page(page, l_unevictable, flags); + + if (likely(PageLRU(page) && PageUnevictable(page))) + check_move_unevictable_page(page, zone); + + unlock_page(page); + } spin_unlock_irq(&zone->lru_lock); + + nr_to_scan -= batch_size; } } -#endif /* CONFIG_SHMEM */ -static void warn_scan_unevictable_pages(void) + +/** + * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages + * + * A really big hammer: scan all zones' unevictable LRU lists to check for + * pages that have become evictable. Move those back to the zones' + * inactive list where they become candidates for reclaim. + * This occurs when, e.g., we have unswappable pages on the unevictable lists, + * and we add swap to the system. As such, it runs in the context of a task + * that has possibly/probably made some previously unevictable pages + * evictable. + */ +static void scan_all_zones_unevictable_pages(void) { - printk_once(KERN_WARNING - "The scan_unevictable_pages sysctl/node-interface has been " - "disabled for lack of a legitimate use case. If you have " - "one, please send an email to linux-mm@kvack.org.\n"); + struct zone *zone; + + for_each_zone(zone) { + scan_zone_unevictable_pages(zone); + } } /* @@ -3512,8 +3658,11 @@ int scan_unevictable_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - warn_scan_unevictable_pages(); proc_doulongvec_minmax(table, write, buffer, length, ppos); + + if (write && *(unsigned long *)table->data) + scan_all_zones_unevictable_pages(); + scan_unevictable_pages = 0; return 0; } @@ -3528,7 +3677,6 @@ static ssize_t read_scan_unevictable_node(struct sys_device *dev, struct sysdev_attribute *attr, char *buf) { - warn_scan_unevictable_pages(); return sprintf(buf, "0\n"); /* always zero; should fit... */ } @@ -3536,7 +3684,19 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev, struct sysdev_attribute *attr, const char *buf, size_t count) { - warn_scan_unevictable_pages(); + struct zone *node_zones = NODE_DATA(dev->id)->node_zones; + struct zone *zone; + unsigned long res; + unsigned long req = strict_strtoul(buf, 10, &res); + + if (!req) + return 1; /* zero is no-op */ + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!populated_zone(zone)) + continue; + scan_zone_unevictable_pages(zone); + } return 1; } |
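
The shrink_slab() hunks above drop the atomic_long_xchg()/atomic_long_add_return() helpers and go back to open-coded cmpxchg() loops on a plain shrinker->nr counter: a reclaimer claims the whole deferred count, scans part of it in SHRINK_BATCH steps, and merges whatever it did not scan back into the counter while other CPUs may still be adding to it. Below is a minimal userspace sketch of that claim-and-merge pattern, assuming GCC/Clang __sync builtins in place of the kernel's cmpxchg(); the pending counter and the helper names are invented for illustration and are not part of the patch.

#include <stdio.h>

static long pending;	/* stands in for shrinker->nr */

/* Atomically take the whole deferred count and zero it. */
static long claim_pending(void)
{
	long nr;

	do {
		nr = pending;
	} while (__sync_val_compare_and_swap(&pending, nr, 0) != nr);
	return nr;
}

/* Merge the unscanned remainder back, preserving concurrent increments. */
static void return_unscanned(long leftover)
{
	long nr;

	do {
		nr = pending;
	} while (__sync_val_compare_and_swap(&pending, nr, nr + leftover) != nr);
}

int main(void)
{
	long total_scan, scanned;

	pending = 100;			/* work deferred by earlier calls */
	total_scan = claim_pending();	/* claim it all; pending is now 0 */
	scanned = 64;			/* pretend we scanned two batches of 32 */
	return_unscanned(total_scan - scanned);
	printf("deferred again: %ld\n", pending);	/* prints 36 */
	return 0;
}

Because the counter is zeroed while it is being worked on, deferred work added by concurrent callers in the meantime lands on the zeroed counter and survives the final merge rather than being lost.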
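
Further down, the get_scan_count() changes reinstate the per-scan_control swappiness (sc->swappiness instead of the removed vmscan_swappiness() helper) and add nr_force_scan[], a fallback target split in the same anon/file proportion as the main fractions so that small zones or memcgs still scan something at low priority. The following sketch, with made-up sample numbers, only mirrors the integer arithmetic visible in the hunks and is not kernel code.

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

int main(void)
{
	unsigned long swappiness = 60;		/* sc->swappiness */
	unsigned long anon_prio = swappiness;
	unsigned long file_prio = 200 - swappiness;

	/* recent_scanned / recent_rotated per LRU type; [0]=anon, [1]=file */
	unsigned long scanned[2] = { 4000, 12000 };
	unsigned long rotated[2] = { 1500, 800 };
	unsigned long lru_pages[2] = { 50000, 200000 };
	int priority = 10;
	int file;

	unsigned long ap = (anon_prio + 1) * (scanned[0] + 1) / (rotated[0] + 1);
	unsigned long fp = (file_prio + 1) * (scanned[1] + 1) / (rotated[1] + 1);
	unsigned long denominator = ap + fp + 1;

	for (file = 0; file <= 1; file++) {
		unsigned long frac = file ? fp : ap;
		unsigned long scan = (lru_pages[file] >> priority)
					* frac / denominator;
		unsigned long force = SWAP_CLUSTER_MAX * frac / denominator;

		/* force_scan fallback when the shifted target rounds to 0 */
		if (!scan)
			scan = force;
		printf("%s: scan %lu pages (force-scan share %lu)\n",
		       file ? "file" : "anon", scan, force);
	}
	return 0;
}

With these inputs the file LRU takes most of the pressure because, relative to how many were scanned, far fewer file pages than anonymous pages were rotated back (referenced); raising swappiness increases anon_prio relative to file_prio and shifts the split back toward anon.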