From 88221c32f98551727cd4b8137fcc857a5294b97f Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 12 Dec 2014 16:54:12 -0800 Subject: rtc: snvs: fix build with CONFIG_PM_SLEEP disabled Commit 7654e9d4fd8f ("drivers/rtc/rtc-snvs: fix suspend/resume") replaces SIMPLE_DEV_PM_OPS with direct declaration of snvs_rtc_pm_ops, but does so outside #ifdef CONFIG_PM_SLEEP. This causes the driver build to fail if CONFIG_PM_SLEEP is not configured. Fixes: 7654e9d4fd8f ("drivers/rtc/rtc-snvs: fix suspend/resume") Signed-off-by: Guenter Roeck Cc: Sanchayan Maity Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-snvs.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/rtc/rtc-snvs.c b/drivers/rtc/rtc-snvs.c index 2cd8ffe5c698..942b267c6271 100644 --- a/drivers/rtc/rtc-snvs.c +++ b/drivers/rtc/rtc-snvs.c @@ -344,13 +344,20 @@ static int snvs_rtc_resume(struct device *dev) return 0; } -#endif static const struct dev_pm_ops snvs_rtc_pm_ops = { .suspend_noirq = snvs_rtc_suspend, .resume_noirq = snvs_rtc_resume, }; +#define SNVS_RTC_PM_OPS (&snvs_rtc_pm_ops) + +#else + +#define SNVS_RTC_PM_OPS NULL + +#endif + static const struct of_device_id snvs_dt_ids[] = { { .compatible = "fsl,sec-v4.0-mon-rtc-lp", }, { /* sentinel */ } @@ -361,7 +368,7 @@ static struct platform_driver snvs_rtc_driver = { .driver = { .name = "snvs_rtc", .owner = THIS_MODULE, - .pm = &snvs_rtc_pm_ops, + .pm = SNVS_RTC_PM_OPS, .of_match_table = snvs_dt_ids, }, .probe = snvs_rtc_probe, -- cgit v1.2.3 From 9dc00f4c4fdd1fab12a4da9a966e535b4284b0a4 Mon Sep 17 00:00:00 2001 From: Jesse Barnes Date: Fri, 12 Dec 2014 16:55:30 -0800 Subject: iommu/amd: use handle_mm_fault directly This could be useful for debug in the future if we want to track major/minor faults more closely, and also avoids the put_page trick we used with gup. In order to do this, we also track the task struct in the PASID state structure. This lets us update the appropriate task stats after the fault has been handled, and may aid with debug in the future as well. Signed-off-by: Jesse Barnes Tested-by: Oded Gabbay Cc: Joerg Roedel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/iommu/amd_iommu_v2.c | 84 ++++++++++++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 31 deletions(-) (limited to 'drivers') diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index 90d734bbf467..9c0d6e290097 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -513,45 +513,67 @@ static void finish_pri_tag(struct device_state *dev_state, spin_unlock_irqrestore(&pasid_state->lock, flags); } +static void handle_fault_error(struct fault *fault) +{ + int status; + + if (!fault->dev_state->inv_ppr_cb) { + set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); + return; + } + + status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev, + fault->pasid, + fault->address, + fault->flags); + switch (status) { + case AMD_IOMMU_INV_PRI_RSP_SUCCESS: + set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS); + break; + case AMD_IOMMU_INV_PRI_RSP_INVALID: + set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); + break; + case AMD_IOMMU_INV_PRI_RSP_FAIL: + set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE); + break; + default: + BUG(); + } +} + static void do_fault(struct work_struct *work) { struct fault *fault = container_of(work, struct fault, work); - int npages, write; - struct page *page; + struct mm_struct *mm; + struct vm_area_struct *vma; + u64 address; + int ret, write; write = !!(fault->flags & PPR_FAULT_WRITE); - down_read(&fault->state->mm->mmap_sem); - npages = get_user_pages(NULL, fault->state->mm, - fault->address, 1, write, 0, &page, NULL); - up_read(&fault->state->mm->mmap_sem); - - if (npages == 1) { - put_page(page); - } else if (fault->dev_state->inv_ppr_cb) { - int status; - - status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev, - fault->pasid, - fault->address, - fault->flags); - switch (status) { - case AMD_IOMMU_INV_PRI_RSP_SUCCESS: - set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS); - break; - case AMD_IOMMU_INV_PRI_RSP_INVALID: - set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); - break; - case AMD_IOMMU_INV_PRI_RSP_FAIL: - set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE); - break; - default: - BUG(); - } - } else { - set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); + mm = fault->state->mm; + address = fault->address; + + down_read(&mm->mmap_sem); + vma = find_extend_vma(mm, address); + if (!vma || address < vma->vm_start) { + /* failed to get a vma in the right range */ + up_read(&mm->mmap_sem); + handle_fault_error(fault); + goto out; + } + + ret = handle_mm_fault(mm, vma, address, write); + if (ret & VM_FAULT_ERROR) { + /* failed to service fault */ + up_read(&mm->mmap_sem); + handle_fault_error(fault); + goto out; } + up_read(&mm->mmap_sem); + +out: finish_pri_tag(fault->dev_state, fault->state, fault->tag); put_pasid_state(fault->state); -- cgit v1.2.3 From 71fbd556adde2795787a6ca4c16871e57efba847 Mon Sep 17 00:00:00 2001 From: Zhang Zhen Date: Fri, 12 Dec 2014 16:55:33 -0800 Subject: memory-hotplug: remove redundant call of page_to_pfn This is just a small optimization. The start_pfn can be obtained directly by phys_index << PFN_SECTION_SHIFT. So the call of page_to_pfn() is redundant and remove it. Signed-off-by: Zhang Zhen Acked-by: Yasuaki Ishimatsu Acked-by: David Rientjes Cc: Dave Hansen Cc: Wang Nan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 7c5d87191b28..85be040a21c8 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -228,8 +228,8 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t struct page *first_page; int ret; - first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT); - start_pfn = page_to_pfn(first_page); + start_pfn = phys_index << PFN_SECTION_SHIFT; + first_page = pfn_to_page(start_pfn); switch (action) { case MEM_ONLINE: -- cgit v1.2.3 From 6b4f7799c6a5703ac6b8c0649f4c22f00fa07513 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 12 Dec 2014 16:56:13 -0800 Subject: mm: vmscan: invoke slab shrinkers from shrink_zone() The slab shrinkers are currently invoked from the zonelist walkers in kswapd, direct reclaim, and zone reclaim, all of which roughly gauge the eligible LRU pages and assemble a nodemask to pass to NUMA-aware shrinkers, which then again have to walk over the nodemask. This is redundant code, extra runtime work, and fairly inaccurate when it comes to the estimation of actually scannable LRU pages. The code duplication will only get worse when making the shrinkers cgroup-aware and requiring them to have out-of-band cgroup hierarchy walks as well. Instead, invoke the shrinkers from shrink_zone(), which is where all reclaimers end up, to avoid this duplication. Take the count for eligible LRU pages out of get_scan_count(), which considers many more factors than just the availability of swap space, like zone_reclaimable_pages() currently does. Accumulate the number over all visited lruvecs to get the per-zone value. Some nodes have multiple zones due to memory addressing restrictions. To avoid putting too much pressure on the shrinkers, only invoke them once for each such node, using the class zone of the allocation as the pivot zone. For now, this integrates the slab shrinking better into the reclaim logic and gets rid of duplicative invocations from kswapd, direct reclaim, and zone reclaim. It also prepares for cgroup-awareness, allowing memcg-capable shrinkers to be added at the lruvec level without much duplication of both code and runtime work. This changes kswapd behavior, which used to invoke the shrinkers for each zone, but with scan ratios gathered from the entire node, resulting in meaningless pressure quantities on multi-zone nodes. Zone reclaim behavior also changes. It used to shrink slabs until the same amount of pages were shrunk as were reclaimed from the LRUs. Now it merely invokes the shrinkers once with the zone's scan ratio, which makes the shrinkers go easier on caches that implement aging and would prefer feeding back pressure from recently used slab objects to unused LRU pages. [vdavydov@parallels.com: assure class zone is populated] Signed-off-by: Johannes Weiner Cc: Dave Chinner Signed-off-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/staging/android/ashmem.c | 3 +- fs/drop_caches.c | 11 +- include/linux/mm.h | 6 +- include/linux/shrinker.h | 2 - mm/memory-failure.c | 11 +- mm/page_alloc.c | 6 +- mm/vmscan.c | 216 ++++++++++++++++----------------------- 7 files changed, 106 insertions(+), 149 deletions(-) (limited to 'drivers') diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c index ad4f5790a76f..46f8ef42559e 100644 --- a/drivers/staging/android/ashmem.c +++ b/drivers/staging/android/ashmem.c @@ -418,7 +418,7 @@ out: } /* - * ashmem_shrink - our cache shrinker, called from mm/vmscan.c :: shrink_slab + * ashmem_shrink - our cache shrinker, called from mm/vmscan.c * * 'nr_to_scan' is the number of objects to scan for freeing. * @@ -785,7 +785,6 @@ static long ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg) .nr_to_scan = LONG_MAX, }; ret = ashmem_shrink_count(&ashmem_shrinker, &sc); - nodes_setall(sc.nodes_to_scan); ashmem_shrink_scan(&ashmem_shrinker, &sc); } break; diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 1de7294aad20..2bc2c87f35e7 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -40,13 +40,14 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) static void drop_slab(void) { int nr_objects; - struct shrink_control shrink = { - .gfp_mask = GFP_KERNEL, - }; - nodes_setall(shrink.nodes_to_scan); do { - nr_objects = shrink_slab(&shrink, 1000, 1000); + int nid; + + nr_objects = 0; + for_each_online_node(nid) + nr_objects += shrink_node_slabs(GFP_KERNEL, nid, + 1000, 1000); } while (nr_objects > 10); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 8b8d77a1532f..c0a67b894c4c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2110,9 +2110,9 @@ int drop_caches_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); #endif -unsigned long shrink_slab(struct shrink_control *shrink, - unsigned long nr_pages_scanned, - unsigned long lru_pages); +unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, + unsigned long nr_scanned, + unsigned long nr_eligible); #ifndef CONFIG_MMU #define randomize_va_space 0 diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 68c097077ef0..f4aee75f00b1 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -18,8 +18,6 @@ struct shrink_control { */ unsigned long nr_to_scan; - /* shrink from these nodes */ - nodemask_t nodes_to_scan; /* current node being shrunk (for NUMA aware shrinkers) */ int nid; }; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6b94969d91c5..feb803bf3443 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -239,19 +239,14 @@ void shake_page(struct page *p, int access) } /* - * Only call shrink_slab here (which would also shrink other caches) if - * access is not potentially fatal. + * Only call shrink_node_slabs here (which would also shrink + * other caches) if access is not potentially fatal. */ if (access) { int nr; int nid = page_to_nid(p); do { - struct shrink_control shrink = { - .gfp_mask = GFP_KERNEL, - }; - node_set(nid, shrink.nodes_to_scan); - - nr = shrink_slab(&shrink, 1000, 1000); + nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000); if (page_count(p) == 1) break; } while (nr > 10); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c13b6b29add2..9d3870862293 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6284,9 +6284,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, if (!PageLRU(page)) found++; /* - * If there are RECLAIMABLE pages, we need to check it. - * But now, memory offline itself doesn't call shrink_slab() - * and it still to be fixed. + * If there are RECLAIMABLE pages, we need to check + * it. But now, memory offline itself doesn't call + * shrink_node_slabs() and it still to be fixed. */ /* * If the page is not RAM, page_count()should be 0. diff --git a/mm/vmscan.c b/mm/vmscan.c index a384339bf718..bd9a72bc4a1b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -229,9 +229,10 @@ EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128 -static unsigned long -shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, - unsigned long nr_pages_scanned, unsigned long lru_pages) +static unsigned long shrink_slabs(struct shrink_control *shrinkctl, + struct shrinker *shrinker, + unsigned long nr_scanned, + unsigned long nr_eligible) { unsigned long freed = 0; unsigned long long delta; @@ -255,9 +256,9 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); total_scan = nr; - delta = (4 * nr_pages_scanned) / shrinker->seeks; + delta = (4 * nr_scanned) / shrinker->seeks; delta *= freeable; - do_div(delta, lru_pages + 1); + do_div(delta, nr_eligible + 1); total_scan += delta; if (total_scan < 0) { pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", @@ -289,8 +290,8 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, total_scan = freeable * 2; trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, - nr_pages_scanned, lru_pages, - freeable, delta, total_scan); + nr_scanned, nr_eligible, + freeable, delta, total_scan); /* * Normally, we should not scan less than batch_size objects in one @@ -339,34 +340,37 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, return freed; } -/* - * Call the shrink functions to age shrinkable caches - * - * Here we assume it costs one seek to replace a lru page and that it also - * takes a seek to recreate a cache object. With this in mind we age equal - * percentages of the lru and ageable caches. This should balance the seeks - * generated by these structures. +/** + * shrink_node_slabs - shrink slab caches of a given node + * @gfp_mask: allocation context + * @nid: node whose slab caches to target + * @nr_scanned: pressure numerator + * @nr_eligible: pressure denominator * - * If the vm encountered mapped pages on the LRU it increase the pressure on - * slab to avoid swapping. + * Call the shrink functions to age shrinkable caches. * - * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. + * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, + * unaware shrinkers will receive a node id of 0 instead. * - * `lru_pages' represents the number of on-LRU pages in all the zones which - * are eligible for the caller's allocation attempt. It is used for balancing - * slab reclaim versus page reclaim. + * @nr_scanned and @nr_eligible form a ratio that indicate how much of + * the available objects should be scanned. Page reclaim for example + * passes the number of pages scanned and the number of pages on the + * LRU lists that it considered on @nid, plus a bias in @nr_scanned + * when it encountered mapped pages. The ratio is further biased by + * the ->seeks setting of the shrink function, which indicates the + * cost to recreate an object relative to that of an LRU page. * - * Returns the number of slab objects which we shrunk. + * Returns the number of reclaimed slab objects. */ -unsigned long shrink_slab(struct shrink_control *shrinkctl, - unsigned long nr_pages_scanned, - unsigned long lru_pages) +unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, + unsigned long nr_scanned, + unsigned long nr_eligible) { struct shrinker *shrinker; unsigned long freed = 0; - if (nr_pages_scanned == 0) - nr_pages_scanned = SWAP_CLUSTER_MAX; + if (nr_scanned == 0) + nr_scanned = SWAP_CLUSTER_MAX; if (!down_read_trylock(&shrinker_rwsem)) { /* @@ -380,20 +384,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl, } list_for_each_entry(shrinker, &shrinker_list, list) { - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { - shrinkctl->nid = 0; - freed += shrink_slab_node(shrinkctl, shrinker, - nr_pages_scanned, lru_pages); - continue; - } + struct shrink_control sc = { + .gfp_mask = gfp_mask, + .nid = nid, + }; - for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { - if (node_online(shrinkctl->nid)) - freed += shrink_slab_node(shrinkctl, shrinker, - nr_pages_scanned, lru_pages); + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + sc.nid = 0; - } + freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible); } + up_read(&shrinker_rwsem); out: cond_resched(); @@ -1876,7 +1877,8 @@ enum scan_balance { * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ static void get_scan_count(struct lruvec *lruvec, int swappiness, - struct scan_control *sc, unsigned long *nr) + struct scan_control *sc, unsigned long *nr, + unsigned long *lru_pages) { struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; @@ -2022,6 +2024,7 @@ out: some_scanned = false; /* Only use force_scan on second pass. */ for (pass = 0; !some_scanned && pass < 2; pass++) { + *lru_pages = 0; for_each_evictable_lru(lru) { int file = is_file_lru(lru); unsigned long size; @@ -2048,14 +2051,19 @@ out: case SCAN_FILE: case SCAN_ANON: /* Scan one type exclusively */ - if ((scan_balance == SCAN_FILE) != file) + if ((scan_balance == SCAN_FILE) != file) { + size = 0; scan = 0; + } break; default: /* Look ma, no brain */ BUG(); } + + *lru_pages += size; nr[lru] = scan; + /* * Skip the second pass and don't force_scan, * if we found something to scan. @@ -2069,7 +2077,7 @@ out: * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, - struct scan_control *sc) + struct scan_control *sc, unsigned long *lru_pages) { unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; @@ -2080,7 +2088,7 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, struct blk_plug plug; bool scan_adjusted; - get_scan_count(lruvec, swappiness, sc, nr); + get_scan_count(lruvec, swappiness, sc, nr, lru_pages); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); @@ -2258,7 +2266,8 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -static bool shrink_zone(struct zone *zone, struct scan_control *sc) +static bool shrink_zone(struct zone *zone, struct scan_control *sc, + bool is_classzone) { unsigned long nr_reclaimed, nr_scanned; bool reclaimable = false; @@ -2269,6 +2278,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) .zone = zone, .priority = sc->priority, }; + unsigned long zone_lru_pages = 0; struct mem_cgroup *memcg; nr_reclaimed = sc->nr_reclaimed; @@ -2276,13 +2286,15 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) memcg = mem_cgroup_iter(root, NULL, &reclaim); do { + unsigned long lru_pages; struct lruvec *lruvec; int swappiness; lruvec = mem_cgroup_zone_lruvec(zone, memcg); swappiness = mem_cgroup_swappiness(memcg); - shrink_lruvec(lruvec, swappiness, sc); + shrink_lruvec(lruvec, swappiness, sc, &lru_pages); + zone_lru_pages += lru_pages; /* * Direct reclaim and kswapd have to scan all memory @@ -2302,6 +2314,25 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) memcg = mem_cgroup_iter(root, memcg, &reclaim); } while (memcg); + /* + * Shrink the slab caches in the same proportion that + * the eligible LRU pages were scanned. + */ + if (global_reclaim(sc) && is_classzone) { + struct reclaim_state *reclaim_state; + + shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone), + sc->nr_scanned - nr_scanned, + zone_lru_pages); + + reclaim_state = current->reclaim_state; + if (reclaim_state) { + sc->nr_reclaimed += + reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } + } + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed); @@ -2376,12 +2407,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; - unsigned long lru_pages = 0; - struct reclaim_state *reclaim_state = current->reclaim_state; gfp_t orig_mask; - struct shrink_control shrink = { - .gfp_mask = sc->gfp_mask, - }; enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); bool reclaimable = false; @@ -2394,12 +2420,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (buffer_heads_over_limit) sc->gfp_mask |= __GFP_HIGHMEM; - nodes_clear(shrink.nodes_to_scan); - for_each_zone_zonelist_nodemask(zone, z, zonelist, - gfp_zone(sc->gfp_mask), sc->nodemask) { + requested_highidx, sc->nodemask) { + enum zone_type classzone_idx; + if (!populated_zone(zone)) continue; + + classzone_idx = requested_highidx; + while (!populated_zone(zone->zone_pgdat->node_zones + + classzone_idx)) + classzone_idx--; + /* * Take care memory controller reclaiming has small influence * to global LRU. @@ -2409,9 +2441,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) GFP_KERNEL | __GFP_HARDWALL)) continue; - lru_pages += zone_reclaimable_pages(zone); - node_set(zone_to_nid(zone), shrink.nodes_to_scan); - if (sc->priority != DEF_PRIORITY && !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ @@ -2450,7 +2479,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) /* need some check for avoid more shrink_zone() */ } - if (shrink_zone(zone, sc)) + if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx)) reclaimable = true; if (global_reclaim(sc) && @@ -2458,20 +2487,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) reclaimable = true; } - /* - * Don't shrink slabs when reclaiming memory from over limit cgroups - * but do shrink slab at least once when aborting reclaim for - * compaction to avoid unevenly scanning file/anon LRU pages over slab - * pages. - */ - if (global_reclaim(sc)) { - shrink_slab(&shrink, sc->nr_scanned, lru_pages); - if (reclaim_state) { - sc->nr_reclaimed += reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; - } - } - /* * Restore to original mask to avoid the impact on the caller if we * promoted it to __GFP_HIGHMEM. @@ -2736,6 +2751,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, }; struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); int swappiness = mem_cgroup_swappiness(memcg); + unsigned long lru_pages; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2751,7 +2767,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_lruvec(lruvec, swappiness, &sc); + shrink_lruvec(lruvec, swappiness, &sc, &lru_pages); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2932,15 +2948,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, static bool kswapd_shrink_zone(struct zone *zone, int classzone_idx, struct scan_control *sc, - unsigned long lru_pages, unsigned long *nr_attempted) { int testorder = sc->order; unsigned long balance_gap; - struct reclaim_state *reclaim_state = current->reclaim_state; - struct shrink_control shrink = { - .gfp_mask = sc->gfp_mask, - }; bool lowmem_pressure; /* Reclaim above the high watermark. */ @@ -2975,13 +2986,7 @@ static bool kswapd_shrink_zone(struct zone *zone, balance_gap, classzone_idx)) return true; - shrink_zone(zone, sc); - nodes_clear(shrink.nodes_to_scan); - node_set(zone_to_nid(zone), shrink.nodes_to_scan); - - reclaim_state->reclaimed_slab = 0; - shrink_slab(&shrink, sc->nr_scanned, lru_pages); - sc->nr_reclaimed += reclaim_state->reclaimed_slab; + shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); /* Account for the number of pages attempted to reclaim */ *nr_attempted += sc->nr_to_reclaim; @@ -3042,7 +3047,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, count_vm_event(PAGEOUTRUN); do { - unsigned long lru_pages = 0; unsigned long nr_attempted = 0; bool raise_priority = true; bool pgdat_needs_compaction = (order > 0); @@ -3102,8 +3106,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (!populated_zone(zone)) continue; - lru_pages += zone_reclaimable_pages(zone); - /* * If any zone is currently balanced then kswapd will * not call compaction as it is expected that the @@ -3159,8 +3161,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * that that high watermark would be met at 100% * efficiency. */ - if (kswapd_shrink_zone(zone, end_zone, &sc, - lru_pages, &nr_attempted)) + if (kswapd_shrink_zone(zone, end_zone, + &sc, &nr_attempted)) raise_priority = false; } @@ -3612,10 +3614,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), .may_swap = 1, }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; - unsigned long nr_slab_pages0, nr_slab_pages1; cond_resched(); /* @@ -3634,44 +3632,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * priorities until we have enough memory freed. */ do { - shrink_zone(zone, &sc); + shrink_zone(zone, &sc, true); } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); } - nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); - if (nr_slab_pages0 > zone->min_slab_pages) { - /* - * shrink_slab() does not currently allow us to determine how - * many pages were freed in this zone. So we take the current - * number of slab pages and shake the slab until it is reduced - * by the same nr_pages that we used for reclaiming unmapped - * pages. - */ - nodes_clear(shrink.nodes_to_scan); - node_set(zone_to_nid(zone), shrink.nodes_to_scan); - for (;;) { - unsigned long lru_pages = zone_reclaimable_pages(zone); - - /* No reclaimable slab or very low memory pressure */ - if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) - break; - - /* Freed enough memory */ - nr_slab_pages1 = zone_page_state(zone, - NR_SLAB_RECLAIMABLE); - if (nr_slab_pages1 + nr_pages <= nr_slab_pages0) - break; - } - - /* - * Update nr_reclaimed by the number of slab pages we - * reclaimed from this zone. - */ - nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); - if (nr_slab_pages1 < nr_slab_pages0) - sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1; - } - p->reclaim_state = NULL; current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); lockdep_clear_current_reclaim_state(); -- cgit v1.2.3 From b627cff3d308d3ccb3ec73a89260f5c7872e46a4 Mon Sep 17 00:00:00 2001 From: "karam.lee" Date: Fri, 12 Dec 2014 16:56:47 -0800 Subject: zram: remove bio parameter from zram_bvec_rw() Recently rw_page block device operation has been added. This patchset implements rw_page operation for zram block device and does some clean-up. This patch (of 3): Remove an unnecessary parameter(bio) from zram_bvec_rw() and zram_bvec_read(). zram_bvec_read() doesn't use a bio parameter, so remove it. zram_bvec_rw() calls a read/write operation not using bio, so a rw parameter replaces a bio parameter. Signed-off-by: karam.lee Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Matthew Wilcox Cc: Nitin Gupta Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3920ee45aa59..ac5be75c013f 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -453,7 +453,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) } static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, - u32 index, int offset, struct bio *bio) + u32 index, int offset) { int ret; struct page *page; @@ -645,14 +645,13 @@ out: } static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, struct bio *bio) + int offset, int rw) { int ret; - int rw = bio_data_dir(bio); if (rw == READ) { atomic64_inc(&zram->stats.num_reads); - ret = zram_bvec_read(zram, bvec, index, offset, bio); + ret = zram_bvec_read(zram, bvec, index, offset); } else { atomic64_inc(&zram->stats.num_writes); ret = zram_bvec_write(zram, bvec, index, offset); @@ -853,7 +852,7 @@ out: static void __zram_make_request(struct zram *zram, struct bio *bio) { - int offset; + int offset, rw; u32 index; struct bio_vec bvec; struct bvec_iter iter; @@ -868,6 +867,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) return; } + rw = bio_data_dir(bio); bio_for_each_segment(bvec, bio, iter) { int max_transfer_size = PAGE_SIZE - offset; @@ -882,15 +882,15 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) bv.bv_len = max_transfer_size; bv.bv_offset = bvec.bv_offset; - if (zram_bvec_rw(zram, &bv, index, offset, bio) < 0) + if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0) goto out; bv.bv_len = bvec.bv_len - max_transfer_size; bv.bv_offset += max_transfer_size; - if (zram_bvec_rw(zram, &bv, index + 1, 0, bio) < 0) + if (zram_bvec_rw(zram, &bv, index + 1, 0, rw) < 0) goto out; } else - if (zram_bvec_rw(zram, &bvec, index, offset, bio) < 0) + if (zram_bvec_rw(zram, &bvec, index, offset, rw) < 0) goto out; update_position(&index, &offset, &bvec); -- cgit v1.2.3 From 54850e73e86e3bc092680d1bdb84eb322f982ab1 Mon Sep 17 00:00:00 2001 From: "karam.lee" Date: Fri, 12 Dec 2014 16:56:50 -0800 Subject: zram: change parameter from vaild_io_request() This patch changes parameter of valid_io_request for common usage. The purpose of valid_io_request() is to determine if bio request is valid or not. This patch use I/O start address and size instead of a BIO parameter for common usage. Signed-off-by: karam.lee Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Matthew Wilcox Cc: Nitin Gupta Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index ac5be75c013f..98af4aae2618 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -287,19 +287,18 @@ static inline int is_partial_io(struct bio_vec *bvec) /* * Check if request is within bounds and aligned on zram logical blocks. */ -static inline int valid_io_request(struct zram *zram, struct bio *bio) +static inline int valid_io_request(struct zram *zram, + sector_t start, unsigned int size) { - u64 start, end, bound; + u64 end, bound; /* unaligned request */ - if (unlikely(bio->bi_iter.bi_sector & - (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) + if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) return 0; - if (unlikely(bio->bi_iter.bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) + if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) return 0; - start = bio->bi_iter.bi_sector; - end = start + (bio->bi_iter.bi_size >> SECTOR_SHIFT); + end = start + (size >> SECTOR_SHIFT); bound = zram->disksize >> SECTOR_SHIFT; /* out of range range */ if (unlikely(start >= bound || end > bound || start > end)) @@ -915,7 +914,8 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) if (unlikely(!init_done(zram))) goto error; - if (!valid_io_request(zram, bio)) { + if (!valid_io_request(zram, bio->bi_iter.bi_sector, + bio->bi_iter.bi_size)) { atomic64_inc(&zram->stats.invalid_io); goto error; } -- cgit v1.2.3 From 8c7f01025f7becd577bed9af93f974fc6798c5a3 Mon Sep 17 00:00:00 2001 From: "karam.lee" Date: Fri, 12 Dec 2014 16:56:53 -0800 Subject: zram: implement rw_page operation of zram This patch implements rw_page operation for zram block device. I implemented the feature in zram and tested it. Test bed was the G2, LG electronic mobile device, whtich has msm8974 processor and 2GB memory. With a memory allocation test program consuming memory, the system generates swap. Operating time of swap_write_page() was measured. -------------------------------------------------- | | operating time | improvement | | | (20 runs average) | | -------------------------------------------------- |with patch | 1061.15 us | +2.4% | -------------------------------------------------- |without patch| 1087.35 us | | -------------------------------------------------- Each test(with paged_io,with BIO) result set shows normal distribution and has equal variance. I mean the two values are valid result to compare. I can say operation with paged I/O(without BIO) is faster 2.4% with confidence level 95%. [minchan@kernel.org: make rw_page opeartion return 0] [minchan@kernel.org: rely on the bi_end_io for zram_rw_page fails] [sergey.senozhatsky@gmail.com: code cleanup] [minchan@kernel.org: add comment] Signed-off-by: karam.lee Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Matthew Wilcox Cc: Nitin Gupta Cc: Signed-off-by: Minchan Kim Signed-off-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 44 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'drivers') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 98af4aae2618..976eab6f35b9 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -945,8 +945,52 @@ static void zram_slot_free_notify(struct block_device *bdev, atomic64_inc(&zram->stats.notify_free); } +static int zram_rw_page(struct block_device *bdev, sector_t sector, + struct page *page, int rw) +{ + int offset, err; + u32 index; + struct zram *zram; + struct bio_vec bv; + + zram = bdev->bd_disk->private_data; + if (!valid_io_request(zram, sector, PAGE_SIZE)) { + atomic64_inc(&zram->stats.invalid_io); + return -EINVAL; + } + + down_read(&zram->init_lock); + if (unlikely(!init_done(zram))) { + err = -EIO; + goto out_unlock; + } + + index = sector >> SECTORS_PER_PAGE_SHIFT; + offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT; + + bv.bv_page = page; + bv.bv_len = PAGE_SIZE; + bv.bv_offset = 0; + + err = zram_bvec_rw(zram, &bv, index, offset, rw); +out_unlock: + up_read(&zram->init_lock); + /* + * If I/O fails, just return error(ie, non-zero) without + * calling page_endio. + * It causes resubmit the I/O with bio request by upper functions + * of rw_page(e.g., swap_readpage, __swap_writepage) and + * bio->bi_end_io does things to handle the error + * (e.g., SetPageError, set_page_dirty and extra works). + */ + if (err == 0) + page_endio(page, rw, 0); + return err; +} + static const struct block_device_operations zram_devops = { .swap_slot_free_notify = zram_slot_free_notify, + .rw_page = zram_rw_page, .owner = THIS_MODULE }; -- cgit v1.2.3 From d49b1c254c997195872a9e8913660a788298921e Mon Sep 17 00:00:00 2001 From: Mahendran Ganesh Date: Fri, 12 Dec 2014 16:57:04 -0800 Subject: mm/zram: correct ZRAM_ZERO flag bit position In struct zram_table_entry, the element *value* contains obj size and obj zram flags. Bit 0 to bit (ZRAM_FLAG_SHIFT - 1) represent obj size, and bit ZRAM_FLAG_SHIFT to the highest bit of unsigned long represent obj zram_flags. So the first zram flag(ZRAM_ZERO) should be from ZRAM_FLAG_SHIFT instead of (ZRAM_FLAG_SHIFT + 1). This patch fixes this cosmetic issue. Also fix a typo, "page in now accessed" -> "page is now accessed" Signed-off-by: Mahendran Ganesh Acked-by: Minchan Kim Acked-by: Weijie Yang Acked-by: Sergey Senozhatsky Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index c6ee271317f5..b05a816b09ac 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -66,8 +66,8 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; /* Flags for zram pages (table[page_no].value) */ enum zram_pageflags { /* Page consists entirely of zeros */ - ZRAM_ZERO = ZRAM_FLAG_SHIFT + 1, - ZRAM_ACCESS, /* page in now accessed */ + ZRAM_ZERO = ZRAM_FLAG_SHIFT, + ZRAM_ACCESS, /* page is now accessed */ __NR_ZRAM_PAGEFLAGS, }; -- cgit v1.2.3 From 083914eab96fabddfc715f0436e082eb375c5704 Mon Sep 17 00:00:00 2001 From: Ganesh Mahendran Date: Fri, 12 Dec 2014 16:57:13 -0800 Subject: zram: use DEVICE_ATTR_[RW|RO|WO] to define zram sys device attribute In current zram, we use DEVICE_ATTR() to define sys device attributes. SO, we need to set (S_IRUGO | S_IWUSR) permission and other arguments manually. Linux already provids the macro DEVICE_ATTR_[RW|RO|WO] to define sys device attribute. It is simple and readable. This patch uses kernel defined macro DEVICE_ATTR_[RW|RO|WO] to define zram device attribute. Signed-off-by: Ganesh Mahendran Acked-by: Jerome Marchand Cc: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) (limited to 'drivers') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 976eab6f35b9..bd8bda386e02 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -44,15 +44,14 @@ static const char *default_compressor = "lzo"; static unsigned int num_devices = 1; #define ZRAM_ATTR_RO(name) \ -static ssize_t zram_attr_##name##_show(struct device *d, \ +static ssize_t name##_show(struct device *d, \ struct device_attribute *attr, char *b) \ { \ struct zram *zram = dev_to_zram(d); \ return scnprintf(b, PAGE_SIZE, "%llu\n", \ (u64)atomic64_read(&zram->stats.name)); \ } \ -static struct device_attribute dev_attr_##name = \ - __ATTR(name, S_IRUGO, zram_attr_##name##_show, NULL); +static DEVICE_ATTR_RO(name); static inline int init_done(struct zram *zram) { @@ -994,20 +993,15 @@ static const struct block_device_operations zram_devops = { .owner = THIS_MODULE }; -static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR, - disksize_show, disksize_store); -static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); -static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); -static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); -static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); -static DEVICE_ATTR(mem_limit, S_IRUGO | S_IWUSR, mem_limit_show, - mem_limit_store); -static DEVICE_ATTR(mem_used_max, S_IRUGO | S_IWUSR, mem_used_max_show, - mem_used_max_store); -static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, - max_comp_streams_show, max_comp_streams_store); -static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR, - comp_algorithm_show, comp_algorithm_store); +static DEVICE_ATTR_RW(disksize); +static DEVICE_ATTR_RO(initstate); +static DEVICE_ATTR_WO(reset); +static DEVICE_ATTR_RO(orig_data_size); +static DEVICE_ATTR_RO(mem_used_total); +static DEVICE_ATTR_RW(mem_limit); +static DEVICE_ATTR_RW(mem_used_max); +static DEVICE_ATTR_RW(max_comp_streams); +static DEVICE_ATTR_RW(comp_algorithm); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); -- cgit v1.2.3