From 449dd6984d0e47643c04c807f609dd56d48d5bcc Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 3 Apr 2014 14:47:56 -0700 Subject: mm: keep page cache radix tree nodes in check Previously, page cache radix tree nodes were freed after reclaim emptied out their page pointers. But now reclaim stores shadow entries in their place, which are only reclaimed when the inodes themselves are reclaimed. This is problematic for bigger files that are still in use after they have a significant amount of their cache reclaimed, without any of those pages actually refaulting. The shadow entries will just sit there and waste memory. In the worst case, the shadow entries will accumulate until the machine runs out of memory. To get this under control, the VM will track radix tree nodes exclusively containing shadow entries on a per-NUMA node list. Per-NUMA rather than global because we expect the radix tree nodes themselves to be allocated node-locally and we want to reduce cross-node references of otherwise independent cache workloads. A simple shrinker will then reclaim these nodes on memory pressure. A few things need to be stored in the radix tree node to implement the shadow node LRU and allow tree deletions coming from the list: 1. There is no index available that would describe the reverse path from the node up to the tree root, which is needed to perform a deletion. To solve this, encode in each node its offset inside the parent. This can be stored in the unused upper bits of the same member that stores the node's height at no extra space cost. 2. The number of shadow entries needs to be counted in addition to the regular entries, to quickly detect when the node is ready to go to the shadow node LRU list. The current entry count is an unsigned int but the maximum number of entries is 64, so a shadow counter can easily be stored in the unused upper bits. 3. Tree modification needs tree lock and tree root, which are located in the address space, so store an address_space backpointer in the node. The parent pointer of the node is in a union with the 2-word rcu_head, so the backpointer comes at no extra cost as well. 4. The node needs to be linked to an LRU list, which requires a list head inside the node. This does increase the size of the node, but it does not change the number of objects that fit into a slab page. [akpm@linux-foundation.org: export the right function] Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Cc: Andrea Arcangeli Cc: Bob Liu Cc: Christoph Hellwig Cc: Dave Chinner Cc: Greg Thelen Cc: Hugh Dickins Cc: Jan Kara Cc: KOSAKI Motohiro Cc: Luigi Semenzato Cc: Mel Gorman Cc: Metin Doslu Cc: Michel Lespinasse Cc: Ozgun Erdogan Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Ryan Mallon Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 74 insertions(+), 16 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index a603c4d7d3c9..d6df3bacb0fb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -110,11 +110,17 @@ static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { - if (shadow) { - void **slot; + struct radix_tree_node *node; + unsigned long index; + unsigned int offset; + unsigned int tag; + void **slot; - slot = radix_tree_lookup_slot(&mapping->page_tree, page->index); - radix_tree_replace_slot(slot, shadow); + VM_BUG_ON(!PageLocked(page)); + + __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); + + if (shadow) { mapping->nrshadows++; /* * Make sure the nrshadows update is committed before @@ -123,9 +129,45 @@ static void page_cache_tree_delete(struct address_space *mapping, * same time and miss a shadow entry. */ smp_wmb(); - } else - radix_tree_delete(&mapping->page_tree, page->index); + } mapping->nrpages--; + + if (!node) { + /* Clear direct pointer tags in root node */ + mapping->page_tree.gfp_mask &= __GFP_BITS_MASK; + radix_tree_replace_slot(slot, shadow); + return; + } + + /* Clear tree tags for the removed page */ + index = page->index; + offset = index & RADIX_TREE_MAP_MASK; + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { + if (test_bit(offset, node->tags[tag])) + radix_tree_tag_clear(&mapping->page_tree, index, tag); + } + + /* Delete page, swap shadow entry */ + radix_tree_replace_slot(slot, shadow); + workingset_node_pages_dec(node); + if (shadow) + workingset_node_shadows_inc(node); + else + if (__radix_tree_delete_node(&mapping->page_tree, node)) + return; + + /* + * Track node that only contains shadow entries. + * + * Avoid acquiring the list_lru lock if already tracked. The + * list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_pages(node) && + list_empty(&node->private_list)) { + node->private_data = mapping; + list_lru_add(&workingset_shadow_nodes, &node->private_list); + } } /* @@ -471,27 +513,43 @@ EXPORT_SYMBOL_GPL(replace_page_cache_page); static int page_cache_tree_insert(struct address_space *mapping, struct page *page, void **shadowp) { + struct radix_tree_node *node; void **slot; int error; - slot = radix_tree_lookup_slot(&mapping->page_tree, page->index); - if (slot) { + error = __radix_tree_create(&mapping->page_tree, page->index, + &node, &slot); + if (error) + return error; + if (*slot) { void *p; p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); if (!radix_tree_exceptional_entry(p)) return -EEXIST; - radix_tree_replace_slot(slot, page); - mapping->nrshadows--; - mapping->nrpages++; if (shadowp) *shadowp = p; - return 0; + mapping->nrshadows--; + if (node) + workingset_node_shadows_dec(node); } - error = radix_tree_insert(&mapping->page_tree, page->index, page); - if (!error) - mapping->nrpages++; - return error; + radix_tree_replace_slot(slot, page); + mapping->nrpages++; + if (node) { + workingset_node_pages_inc(node); + /* + * Don't track node that contains actual pages. + * + * Avoid acquiring the list_lru lock if already + * untracked. The list_empty() test is safe as + * node->private_list is protected by + * mapping->tree_lock. + */ + if (!list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, + &node->private_list); + } + return 0; } static int __add_to_page_cache_locked(struct page *page, -- cgit v1.2.3