Diffstat (limited to 'mm/zswap.c')
-rw-r--r-- | mm/zswap.c | 231
1 file changed, 142 insertions, 89 deletions
diff --git a/mm/zswap.c b/mm/zswap.c
index 6504174fbc6a..204fb59da33c 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -43,7 +43,7 @@
 * statistics
 **********************************/
 /* The number of compressed pages currently stored in zswap */
-atomic_long_t zswap_stored_pages = ATOMIC_INIT(0);
+atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0);
 
 /*
  * The statistics below are not protected from concurrent access for
@@ -62,6 +62,8 @@ static u64 zswap_reject_reclaim_fail;
 static u64 zswap_reject_compress_fail;
 /* Compressed page was too big for the allocator to (optimally) store */
 static u64 zswap_reject_compress_poor;
+/* Load or writeback failed due to decompression failure */
+static u64 zswap_decompress_fail;
 /* Store failed because underlying allocator could not get memory */
 static u64 zswap_reject_alloc_fail;
 /* Store failed because the entry metadata could not be allocated (rare) */
@@ -881,18 +883,32 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
 {
 	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
 	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+	struct acomp_req *req;
+	struct crypto_acomp *acomp;
+	u8 *buffer;
+
+	if (IS_ERR_OR_NULL(acomp_ctx))
+		return 0;
 
 	mutex_lock(&acomp_ctx->mutex);
-	if (!IS_ERR_OR_NULL(acomp_ctx)) {
-		if (!IS_ERR_OR_NULL(acomp_ctx->req))
-			acomp_request_free(acomp_ctx->req);
-		acomp_ctx->req = NULL;
-		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
-			crypto_free_acomp(acomp_ctx->acomp);
-		kfree(acomp_ctx->buffer);
-	}
+	req = acomp_ctx->req;
+	acomp = acomp_ctx->acomp;
+	buffer = acomp_ctx->buffer;
+	acomp_ctx->req = NULL;
+	acomp_ctx->acomp = NULL;
+	acomp_ctx->buffer = NULL;
 	mutex_unlock(&acomp_ctx->mutex);
 
+	/*
+	 * Do the actual freeing after releasing the mutex to avoid subtle
+	 * locking dependencies causing deadlocks.
+	 */
+	if (!IS_ERR_OR_NULL(req))
+		acomp_request_free(req);
+	if (!IS_ERR_OR_NULL(acomp))
+		crypto_free_acomp(acomp);
+	kfree(buffer);
+
 	return 0;
 }
 
@@ -930,7 +946,6 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	unsigned int dlen = PAGE_SIZE;
 	unsigned long handle;
 	struct zpool *zpool;
-	char *buf;
 	gfp_t gfp;
 	u8 *dst;
 
@@ -965,17 +980,12 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 		goto unlock;
 
 	zpool = pool->zpool;
-	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
-	if (zpool_malloc_support_movable(zpool))
-		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
+	gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;
 	alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle);
 	if (alloc_ret)
 		goto unlock;
 
-	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
-	memcpy(buf, dst, dlen);
-	zpool_unmap_handle(zpool, handle);
-
+	zpool_obj_write(zpool, handle, dst, dlen);
 	entry->handle = handle;
 	entry->length = dlen;
 
@@ -991,41 +1001,49 @@ unlock:
 	return comp_ret == 0 && alloc_ret == 0;
 }
 
-static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
+static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 {
 	struct zpool *zpool = entry->pool->zpool;
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
-	u8 *src;
+	int decomp_ret, dlen;
+	u8 *src, *obj;
 
 	acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool);
-	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
+	obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer);
+
 	/*
-	 * If zpool_map_handle is atomic, we cannot reliably utilize its mapped buffer
-	 * to do crypto_acomp_decompress() which might sleep. In such cases, we must
-	 * resort to copying the buffer to a temporary one.
-	 * Meanwhile, zpool_map_handle() might return a non-linearly mapped buffer,
-	 * such as a kmap address of high memory or even ever a vmap address.
-	 * However, sg_init_one is only equipped to handle linearly mapped low memory.
-	 * In such cases, we also must copy the buffer to a temporary and lowmem one.
+	 * zpool_obj_read_begin() might return a kmap address of highmem when
+	 * acomp_ctx->buffer is not used. However, sg_init_one() does not
+	 * handle highmem addresses, so copy the object to acomp_ctx->buffer.
 	 */
-	if ((acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) ||
-	    !virt_addr_valid(src)) {
-		memcpy(acomp_ctx->buffer, src, entry->length);
+	if (virt_addr_valid(obj)) {
+		src = obj;
+	} else {
+		WARN_ON_ONCE(obj == acomp_ctx->buffer);
+		memcpy(acomp_ctx->buffer, obj, entry->length);
 		src = acomp_ctx->buffer;
-		zpool_unmap_handle(zpool, entry->handle);
 	}
 
 	sg_init_one(&input, src, entry->length);
 	sg_init_table(&output, 1);
 	sg_set_folio(&output, folio, PAGE_SIZE, 0);
 	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
-	BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
-	BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
+	decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
+	dlen = acomp_ctx->req->dlen;
 
-	if (src != acomp_ctx->buffer)
-		zpool_unmap_handle(zpool, entry->handle);
+	zpool_obj_read_end(zpool, entry->handle, obj);
 	acomp_ctx_put_unlock(acomp_ctx);
+
+	if (!decomp_ret && dlen == PAGE_SIZE)
+		return true;
+
+	zswap_decompress_fail++;
+	pr_alert_ratelimited("Decompression error from zswap (%d:%lu %s %u->%d)\n",
+			     swp_type(entry->swpentry),
+			     swp_offset(entry->swpentry),
+			     entry->pool->tfm_name, entry->length, dlen);
+	return false;
 }
 
 /*********************************
@@ -1051,14 +1069,21 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	struct folio *folio;
 	struct mempolicy *mpol;
 	bool folio_was_allocated;
+	struct swap_info_struct *si;
 	struct writeback_control wbc = {
 		.sync_mode = WB_SYNC_NONE,
 	};
+	int ret = 0;
 
 	/* try to allocate swap cache folio */
+	si = get_swap_device(swpentry);
+	if (!si)
+		return -EEXIST;
+
 	mpol = get_task_policy(current);
 	folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
-			NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
+				NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
+	put_swap_device(si);
 	if (!folio)
 		return -ENOMEM;
 
@@ -1070,8 +1095,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	 * and freed when invalidated by the concurrent shrinker anyway.
 	 */
 	if (!folio_was_allocated) {
-		folio_put(folio);
-		return -EEXIST;
+		ret = -EEXIST;
+		goto out;
 	}
 
 	/*
@@ -1084,14 +1109,17 @@
 	 * be dereferenced.
 	 */
 	tree = swap_zswap_tree(swpentry);
-	if (entry != xa_cmpxchg(tree, offset, entry, NULL, GFP_KERNEL)) {
-		delete_from_swap_cache(folio);
-		folio_unlock(folio);
-		folio_put(folio);
-		return -ENOMEM;
+	if (entry != xa_load(tree, offset)) {
+		ret = -ENOMEM;
+		goto out;
 	}
 
-	zswap_decompress(entry, folio);
+	if (!zswap_decompress(entry, folio)) {
+		ret = -EIO;
+		goto out;
+	}
+
+	xa_erase(tree, offset);
 
 	count_vm_event(ZSWPWB);
 	if (entry->objcg)
@@ -1107,9 +1135,14 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 
 	/* start writeback */
 	__swap_writepage(folio, &wbc);
-	folio_put(folio);
 
-	return 0;
+out:
+	if (ret && ret != -EEXIST) {
+		delete_from_swap_cache(folio);
+		folio_unlock(folio);
+	}
+	folio_put(folio);
+	return ret;
 }
 
 /*********************************
@@ -1445,9 +1478,9 @@ resched:
 * main API
 **********************************/
 
-static ssize_t zswap_store_page(struct page *page,
-				struct obj_cgroup *objcg,
-				struct zswap_pool *pool)
+static bool zswap_store_page(struct page *page,
+			     struct obj_cgroup *objcg,
+			     struct zswap_pool *pool)
 {
 	swp_entry_t page_swpentry = page_swap_entry(page);
 	struct zswap_entry *entry, *old;
@@ -1456,7 +1489,7 @@ static ssize_t zswap_store_page(struct page *page,
 	entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
 	if (!entry) {
 		zswap_reject_kmemcache_fail++;
-		return -EINVAL;
+		return false;
 	}
 
 	if (!zswap_compress(page, entry, pool))
@@ -1483,13 +1516,17 @@
 
 	/*
 	 * The entry is successfully compressed and stored in the tree, there is
-	 * no further possibility of failure. Grab refs to the pool and objcg.
-	 * These refs will be dropped by zswap_entry_free() when the entry is
-	 * removed from the tree.
+	 * no further possibility of failure. Grab refs to the pool and objcg,
+	 * charge zswap memory, and increment zswap_stored_pages.
+	 * The opposite actions will be performed by zswap_entry_free()
+	 * when the entry is removed from the tree.
 	 */
 	zswap_pool_get(pool);
-	if (objcg)
+	if (objcg) {
 		obj_cgroup_get(objcg);
+		obj_cgroup_charge_zswap(objcg, entry->length);
+	}
+	atomic_long_inc(&zswap_stored_pages);
 
 	/*
 	 * We finish initializing the entry while it's already in xarray.
@@ -1510,13 +1547,13 @@ static ssize_t zswap_store_page(struct page *page,
 		zswap_lru_add(&zswap_list_lru, entry);
 	}
 
-	return entry->length;
+	return true;
 
 store_failed:
 	zpool_free(pool->zpool, entry->handle);
 compress_failed:
 	zswap_entry_cache_free(entry);
-	return -EINVAL;
+	return false;
 }
 
 bool zswap_store(struct folio *folio)
@@ -1526,7 +1563,6 @@ bool zswap_store(struct folio *folio)
 	struct obj_cgroup *objcg = NULL;
 	struct mem_cgroup *memcg = NULL;
 	struct zswap_pool *pool;
-	size_t compressed_bytes = 0;
 	bool ret = false;
 	long index;
 
@@ -1564,20 +1600,14 @@ bool zswap_store(struct folio *folio)
 
 	for (index = 0; index < nr_pages; ++index) {
 		struct page *page = folio_page(folio, index);
-		ssize_t bytes;
 
-		bytes = zswap_store_page(page, objcg, pool);
-		if (bytes < 0)
+		if (!zswap_store_page(page, objcg, pool))
 			goto put_pool;
-		compressed_bytes += bytes;
 	}
 
-	if (objcg) {
-		obj_cgroup_charge_zswap(objcg, compressed_bytes);
+	if (objcg)
 		count_objcg_events(objcg, ZSWPOUT, nr_pages);
-	}
 
-	atomic_long_add(nr_pages, &zswap_stored_pages);
 	count_vm_events(ZSWPOUT, nr_pages);
 
 	ret = true;
@@ -1612,7 +1642,27 @@ check_old:
 	return ret;
 }
 
-bool zswap_load(struct folio *folio)
+/**
+ * zswap_load() - load a folio from zswap
+ * @folio: folio to load
+ *
+ * Return: 0 on success, with the folio unlocked and marked up-to-date, or one
+ * of the following error codes:
+ *
+ *  -EIO: if the swapped out content was in zswap, but could not be loaded
+ *  into the page due to a decompression failure. The folio is unlocked, but
+ *  NOT marked up-to-date, so that an IO error is emitted (e.g. do_swap_page()
+ *  will SIGBUS).
+ *
+ *  -EINVAL: if the swapped out content was in zswap, but the page belongs
+ *  to a large folio, which is not supported by zswap. The folio is unlocked,
+ *  but NOT marked up-to-date, so that an IO error is emitted (e.g.
+ *  do_swap_page() will SIGBUS).
+ *
+ *  -ENOENT: if the swapped out content was not in zswap. The folio remains
+ *  locked on return.
+ */
+int zswap_load(struct folio *folio)
 {
 	swp_entry_t swp = folio->swap;
 	pgoff_t offset = swp_offset(swp);
@@ -1623,18 +1673,32 @@ bool zswap_load(struct folio *folio)
 	VM_WARN_ON_ONCE(!folio_test_locked(folio));
 
 	if (zswap_never_enabled())
-		return false;
+		return -ENOENT;
 
 	/*
 	 * Large folios should not be swapped in while zswap is being used, as
 	 * they are not properly handled. Zswap does not properly load large
 	 * folios, and a large folio may only be partially in zswap.
-	 *
-	 * Return true without marking the folio uptodate so that an IO error is
-	 * emitted (e.g. do_swap_page() will sigbus).
 	 */
-	if (WARN_ON_ONCE(folio_test_large(folio)))
-		return true;
+	if (WARN_ON_ONCE(folio_test_large(folio))) {
+		folio_unlock(folio);
+		return -EINVAL;
+	}
+
+	entry = xa_load(tree, offset);
+	if (!entry)
+		return -ENOENT;
+
+	if (!zswap_decompress(entry, folio)) {
+		folio_unlock(folio);
+		return -EIO;
+	}
+
+	folio_mark_uptodate(folio);
+
+	count_vm_event(ZSWPIN);
+	if (entry->objcg)
+		count_objcg_events(entry->objcg, ZSWPIN, 1);
 
 	/*
 	 * When reading into the swapcache, invalidate our entry. The
@@ -1648,27 +1712,14 @@
 	 * files, which reads into a private page and may free it if
 	 * the fault fails. We remain the primary owner of the entry.)
	 */
-	if (swapcache)
-		entry = xa_erase(tree, offset);
-	else
-		entry = xa_load(tree, offset);
-
-	if (!entry)
-		return false;
-
-	zswap_decompress(entry, folio);
-
-	count_vm_event(ZSWPIN);
-	if (entry->objcg)
-		count_objcg_events(entry->objcg, ZSWPIN, 1);
-
 	if (swapcache) {
-		zswap_entry_free(entry);
 		folio_mark_dirty(folio);
+		xa_erase(tree, offset);
+		zswap_entry_free(entry);
 	}
 
-	folio_mark_uptodate(folio);
-	return true;
+	folio_unlock(folio);
+	return 0;
 }
 
 void zswap_invalidate(swp_entry_t swp)
@@ -1763,6 +1814,8 @@ static int zswap_debugfs_init(void)
 			   zswap_debugfs_root, &zswap_reject_compress_fail);
 	debugfs_create_u64("reject_compress_poor", 0444,
 			   zswap_debugfs_root, &zswap_reject_compress_poor);
+	debugfs_create_u64("decompress_fail", 0444,
+			   zswap_debugfs_root, &zswap_decompress_fail);
 	debugfs_create_u64("written_back_pages", 0444,
 			   zswap_debugfs_root, &zswap_written_back_pages);
 	debugfs_create_file("pool_total_size", 0444,
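For context, the kernel-doc block added above defines a three-way return contract for the new int-returning zswap_load(). The sketch below is not part of the patch: it is a hypothetical caller, assuming only the locking and uptodate behaviour described in that comment. The function name example_swap_read_folio() and the example_submit_swap_bio() helper are invented for illustration and do not exist in the kernel.

/*
 * Hypothetical illustration only -- not from the diff above. Shows how a
 * swap-in path could consume the zswap_load() return values documented in
 * the new kernel-doc comment.
 */
#include <linux/swap.h>
#include <linux/zswap.h>

static void example_submit_swap_bio(struct folio *folio);	/* hypothetical fallback I/O helper */

static void example_swap_read_folio(struct folio *folio)
{
	switch (zswap_load(folio)) {
	case 0:
		/* Decompressed from zswap; folio is unlocked and uptodate. */
		break;
	case -ENOENT:
		/*
		 * Not stored in zswap; the folio is still locked, so fall
		 * back to reading from the backing swap device.
		 */
		example_submit_swap_bio(folio);
		break;
	default:
		/*
		 * -EIO or -EINVAL: the folio was unlocked without being
		 * marked uptodate, so the fault path sees an I/O error
		 * (e.g. do_swap_page() will SIGBUS). Nothing more to do.
		 */
		break;
	}
}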