From 6168d0da2b479ce25a4647de194045de1bdd1f1d Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 15 Dec 2020 12:34:29 -0800 Subject: mm/lru: replace pgdat lru_lock with lruvec lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch moves per node lru_lock into lruvec, thus bring a lru_lock for each of memcg per node. So on a large machine, each of memcg don't have to suffer from per node pgdat->lru_lock competition. They could go fast with their self lru_lock. After move memcg charge before lru inserting, page isolation could serialize page's memcg, then per memcg lruvec lock is stable and could replace per node lru lock. In isolate_migratepages_block(), compact_unlock_should_abort and lock_page_lruvec_irqsave are open coded to work with compact_control. Also add a debug func in locking which may give some clues if there are sth out of hands. Daniel Jordan's testing show 62% improvement on modified readtwice case on his 2P * 10 core * 2 HT broadwell box. https://lore.kernel.org/lkml/20200915165807.kpp7uhiw7l3loofu@ca-dmjordan1.us.oracle.com/ Hugh Dickins helped on the patch polish, thanks! [alex.shi@linux.alibaba.com: fix comment typo] Link: https://lkml.kernel.org/r/5b085715-292a-4b43-50b3-d73dc90d1de5@linux.alibaba.com [alex.shi@linux.alibaba.com: use page_memcg()] Link: https://lkml.kernel.org/r/5a4c2b72-7ee8-2478-fc0e-85eb83aafec4@linux.alibaba.com Link: https://lkml.kernel.org/r/1604566549-62481-18-git-send-email-alex.shi@linux.alibaba.com Signed-off-by: Alex Shi Acked-by: Hugh Dickins Acked-by: Johannes Weiner Cc: Rong Chen Cc: Michal Hocko Cc: Vladimir Davydov Cc: Yang Shi Cc: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Daniel Jordan Cc: Alexander Duyck Cc: Andrea Arcangeli Cc: Andrey Ryabinin Cc: "Huang, Ying" Cc: Jann Horn Cc: Joonsoo Kim Cc: Kirill A. Shutemov Cc: Kirill A. Shutemov Cc: Mel Gorman Cc: Michal Hocko Cc: Mika Penttilä Cc: Minchan Kim Cc: Shakeel Butt Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 58 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mmzone.h | 3 ++- 2 files changed, 60 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f530d634f055..aa5d559853c2 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -491,6 +491,19 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); struct mem_cgroup *get_mem_cgroup_from_page(struct page *page); +struct lruvec *lock_page_lruvec(struct page *page); +struct lruvec *lock_page_lruvec_irq(struct page *page); +struct lruvec *lock_page_lruvec_irqsave(struct page *page, + unsigned long *flags); + +#ifdef CONFIG_DEBUG_VM +void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page); +#else +static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +{ +} +#endif + static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; @@ -996,6 +1009,31 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } +static inline struct lruvec *lock_page_lruvec(struct page *page) +{ + struct pglist_data *pgdat = page_pgdat(page); + + spin_lock(&pgdat->__lruvec.lru_lock); + return &pgdat->__lruvec; +} + +static inline struct lruvec *lock_page_lruvec_irq(struct page *page) +{ + struct pglist_data *pgdat = page_pgdat(page); + + spin_lock_irq(&pgdat->__lruvec.lru_lock); + return &pgdat->__lruvec; +} + +static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page, + unsigned long *flagsp) +{ + struct pglist_data *pgdat = page_pgdat(page); + + spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp); + return &pgdat->__lruvec; +} + static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, @@ -1215,6 +1253,10 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } + +static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +{ +} #endif /* CONFIG_MEMCG */ /* idx can be of type enum memcg_stat_item or node_stat_item */ @@ -1296,6 +1338,22 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); } +static inline void unlock_page_lruvec(struct lruvec *lruvec) +{ + spin_unlock(&lruvec->lru_lock); +} + +static inline void unlock_page_lruvec_irq(struct lruvec *lruvec) +{ + spin_unlock_irq(&lruvec->lru_lock); +} + +static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, + unsigned long flags) +{ + spin_unlock_irqrestore(&lruvec->lru_lock, flags); +} + #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 98a80c01d150..9da23c019dc5 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -276,6 +276,8 @@ enum lruvec_flags { struct lruvec { struct list_head lists[NR_LRU_LISTS]; + /* per lruvec lru_lock for memcg */ + spinlock_t lru_lock; /* * These track the cost of reclaiming one LRU - file or anon - * over the other. As the observed cost of reclaiming one LRU @@ -782,7 +784,6 @@ typedef struct pglist_data { /* Write-intensive fields used by page reclaim */ ZONE_PADDING(_pad1_) - spinlock_t lru_lock; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* -- cgit v1.2.3