diff options
Diffstat (limited to 'include/linux/swap.h')
| -rw-r--r-- | include/linux/swap.h | 257 |
1 files changed, 96 insertions, 161 deletions
diff --git a/include/linux/swap.h b/include/linux/swap.h index a98c757400fe..7a09df6977a5 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -20,11 +20,8 @@ struct notifier_block; struct bio; -struct pagevec; - #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff -#define SWAP_FLAG_PRIO_SHIFT 0 #define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ #define SWAP_FLAG_DISCARD_ONCE 0x20000 /* discard swap area at swapon-time */ #define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */ @@ -74,14 +71,13 @@ static inline int current_is_kswapd(void) * to a special SWP_DEVICE_{READ|WRITE} entry. * * When a page is mapped by the device for exclusive access we set the CPU page - * table entries to special SWP_DEVICE_EXCLUSIVE_* entries. + * table entries to a special SWP_DEVICE_EXCLUSIVE entry. */ #ifdef CONFIG_DEVICE_PRIVATE -#define SWP_DEVICE_NUM 4 +#define SWP_DEVICE_NUM 3 #define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM) #define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1) -#define SWP_DEVICE_EXCLUSIVE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2) -#define SWP_DEVICE_EXCLUSIVE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3) +#define SWP_DEVICE_EXCLUSIVE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2) #else #define SWP_DEVICE_NUM 0 #endif @@ -210,7 +206,6 @@ enum { SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */ SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ - SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */ @@ -225,52 +220,6 @@ enum { #define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10) #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX -/* Bit flag in swap_map */ -#define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ -#define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */ - -/* Special value in first swap_map */ -#define SWAP_MAP_MAX 0x3e /* Max count */ -#define SWAP_MAP_BAD 0x3f /* Note page is bad */ -#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs */ - -/* Special value in each swap_map continuation */ -#define SWAP_CONT_MAX 0x7f /* Max count */ - -/* - * We use this to track usage of a cluster. A cluster is a block of swap disk - * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All - * free clusters are organized into a list. We fetch an entry from the list to - * get a free cluster. - * - * The flags field determines if a cluster is free. This is - * protected by cluster lock. - */ -struct swap_cluster_info { - spinlock_t lock; /* - * Protect swap_cluster_info fields - * other than list, and swap_info_struct->swap_map - * elements corresponding to the swap cluster. - */ - u16 count; - u8 flags; - u8 order; - struct list_head list; -}; - -/* All on-list cluster must have a non-zero flag. */ -enum swap_cluster_flags { - CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */ - CLUSTER_FLAG_FREE, - CLUSTER_FLAG_NONFULL, - CLUSTER_FLAG_FRAG, - /* Clusters with flags above are allocatable */ - CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG, - CLUSTER_FLAG_FULL, - CLUSTER_FLAG_DISCARD, - CLUSTER_FLAG_MAX, -}; - /* * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. This also prevents the @@ -286,12 +235,10 @@ enum swap_cluster_flags { #endif /* - * We assign a cluster to each CPU, so each CPU can allocate swap entry from - * its own cluster and swapout sequentially. The purpose is to optimize swapout - * throughput. + * We keep using same cluster for rotational device so IO will be sequential. + * The purpose is to optimize SWAP throughput on these device. */ -struct percpu_cluster { - local_lock_t lock; /* Protect the percpu_cluster above */ +struct swap_sequential_cluster { unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; @@ -304,8 +251,7 @@ struct swap_info_struct { signed short prio; /* swap priority of this type */ struct plist_node list; /* entry in swap_active_head */ signed char type; /* strange name for an index */ - unsigned int max; /* extent of the swap_map */ - unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + unsigned int max; /* size of this swap device */ unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ struct list_head free_clusters; /* free clusters list */ @@ -314,11 +260,9 @@ struct swap_info_struct { /* list of cluster that contains at least one free slot */ struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ - atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ - struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ - struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */ + struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ @@ -326,34 +270,18 @@ struct swap_info_struct { struct completion comp; /* seldom referenced */ spinlock_t lock; /* * protect map scan related fields like - * swap_map, lowest_bit, highest_bit, - * inuse_pages, cluster_next, - * cluster_nr, lowest_alloc, - * highest_alloc, free/discard cluster - * list. other fields are only changed + * inuse_pages and all cluster lists. + * Other fields are only changed * at swapon/swapoff, so are protected * by swap_lock. changing flags need * hold this lock and swap_lock. If * both locks need hold, hold swap_lock * first. */ - spinlock_t cont_lock; /* - * protect swap count continuation page - * list. - */ struct work_struct discard_work; /* discard worker */ struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ - struct plist_node avail_lists[]; /* - * entries in swap_avail_heads, one - * entry per node. - * Must be last as the number of the - * array is nr_node_ids, which is not - * a fixed value so have to allocate - * dynamically. - * And it has to be an array so that - * plist_for_each_* can work. - */ + struct plist_node avail_list; /* entry in swap_avail_head */ }; static inline swp_entry_t page_swap_entry(struct page *page) @@ -381,14 +309,24 @@ extern unsigned long totalreserve_pages; /* linux/mm/swap.c */ -void lru_note_cost(struct lruvec *lruvec, bool file, - unsigned int nr_io, unsigned int nr_rotated); +void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, + unsigned int nr_io, unsigned int nr_rotated); void lru_note_cost_refault(struct folio *); void folio_add_lru(struct folio *); void folio_add_lru_vma(struct folio *, struct vm_area_struct *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); +static inline bool folio_may_be_lru_cached(struct folio *folio) +{ + /* + * Holding PMD-sized folios in per-CPU LRU cache unbalances accounting. + * Holding small numbers of low-order mTHP folios in per-CPU LRU cache + * will be sensible, but nobody has implemented and tested that yet. + */ + return !folio_test_large(folio); +} + extern atomic_t lru_disable_count; static inline bool lru_cache_disabled(void) @@ -414,11 +352,16 @@ extern void swap_setup(void); extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx); #define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) #define MIN_SWAPPINESS 0 #define MAX_SWAPPINESS 200 + +/* Just reclaim from anon folios in proactive memory reclaim */ +#define SWAPPINESS_ANON_ONLY (MAX_SWAPPINESS + 1) + extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, @@ -432,6 +375,22 @@ extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; long remove_mapping(struct address_space *mapping, struct folio *folio); +#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) +extern int reclaim_register_node(struct node *node); +extern void reclaim_unregister_node(struct node *node); + +#else + +static inline int reclaim_register_node(struct node *node) +{ + return 0; +} + +static inline void reclaim_unregister_node(struct node *node) +{ +} +#endif /* CONFIG_SYSFS && CONFIG_NUMA */ + #ifdef CONFIG_NUMA extern int sysctl_min_unmapped_ratio; extern int sysctl_min_slab_ratio; @@ -455,13 +414,12 @@ static inline unsigned long total_swapcache_pages(void) } void free_swap_cache(struct folio *folio); -void free_page_and_swap_cache(struct page *); +void free_folio_and_swap_cache(struct folio *folio); void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; extern atomic_t nr_rotate_swap; -extern bool has_usable_swap(void); /* Swap 50% full? Release swapcache more aggressively.. */ static inline bool vm_swap_full(void) @@ -475,43 +433,46 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -swp_entry_t folio_alloc_swap(struct folio *folio); -bool folio_free_swap(struct folio *folio); -void put_swap_folio(struct folio *folio, swp_entry_t entry); -extern swp_entry_t get_swap_page_of_type(int); -extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order); -extern int add_swap_count_continuation(swp_entry_t, gfp_t); -extern void swap_shmem_alloc(swp_entry_t, int); -extern int swap_duplicate(swp_entry_t); -extern int swapcache_prepare(swp_entry_t entry, int nr); -extern void swap_free_nr(swp_entry_t entry, int nr_pages); -extern void swapcache_free_entries(swp_entry_t *entries, int n); -extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); -extern int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry); +extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); -struct swap_info_struct *swp_swap_info(swp_entry_t entry); struct backing_dev_info; -extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); -extern void exit_swap_address_space(unsigned int type); extern struct swap_info_struct *get_swap_device(swp_entry_t entry); sector_t swap_folio_sector(struct folio *folio); +/* + * If there is an existing swap slot reference (swap entry) and the caller + * guarantees that there is no race modification of it (e.g., PTL + * protecting the swap entry in page table; shmem's cmpxchg protects t + * he swap entry in shmem mapping), these two helpers below can be used + * to put/dup the entries directly. + * + * All entries must be allocated by folio_alloc_swap(). And they must have + * a swap count > 1. See comments of folio_*_swap helpers for more info. + */ +int swap_dup_entry_direct(swp_entry_t entry); +void swap_put_entries_direct(swp_entry_t entry, int nr); + +/* + * folio_free_swap tries to free the swap entries pinned by a swap cache + * folio, it has to be here to be called by other components. + */ +bool folio_free_swap(struct folio *folio); + +/* Allocate / free (hibernation) exclusive entries */ +swp_entry_t swap_alloc_hibernation_slot(int type); +void swap_free_hibernation_slot(swp_entry_t entry); + static inline void put_swap_device(struct swap_info_struct *si) { percpu_ref_put(&si->users); } #else /* CONFIG_SWAP */ -static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) -{ - return NULL; -} - static inline struct swap_info_struct *get_swap_device(swp_entry_t entry) { return NULL; @@ -528,45 +489,21 @@ static inline void put_swap_device(struct swap_info_struct *si) #define si_swapinfo(val) \ do { (val)->freeswap = (val)->totalswap = 0; } while (0) -/* only sparc can not include linux/pagemap.h in this file - * so leave put_page and release_pages undeclared... */ -#define free_page_and_swap_cache(page) \ - put_page(page) +#define free_folio_and_swap_cache(folio) \ + folio_put(folio) #define free_pages_and_swap_cache(pages, nr) \ release_pages((pages), (nr)); -static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr) -{ -} - static inline void free_swap_cache(struct folio *folio) { } -static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) -{ - return 0; -} - -static inline void swap_shmem_alloc(swp_entry_t swp, int nr) -{ -} - -static inline int swap_duplicate(swp_entry_t swp) -{ - return 0; -} - -static inline int swapcache_prepare(swp_entry_t swp, int nr) +static inline int swap_dup_entry_direct(swp_entry_t ent) { return 0; } -static inline void swap_free_nr(swp_entry_t entry, int nr_pages) -{ -} - -static inline void put_swap_folio(struct folio *folio, swp_entry_t swp) +static inline void swap_put_entries_direct(swp_entry_t ent, int nr) { } @@ -575,9 +512,9 @@ static inline int __swap_count(swp_entry_t entry) return 0; } -static inline int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) +static inline bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) { - return 0; + return false; } static inline int swp_swapcount(swp_entry_t entry) @@ -585,13 +522,6 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline swp_entry_t folio_alloc_swap(struct folio *folio) -{ - swp_entry_t entry; - entry.val = 0; - return entry; -} - static inline bool folio_free_swap(struct folio *folio) { return false; @@ -604,17 +534,6 @@ static inline int add_swap_extent(struct swap_info_struct *sis, return -EINVAL; } #endif /* CONFIG_SWAP */ - -static inline void free_swap_and_cache(swp_entry_t entry) -{ - free_swap_and_cache_nr(entry, 1); -} - -static inline void swap_free(swp_entry_t entry) -{ - swap_free_nr(entry, 1); -} - #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { @@ -628,6 +547,8 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) return READ_ONCE(memcg->swappiness); } + +void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid); #else static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) { @@ -650,7 +571,6 @@ static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) #endif #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) -void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry); int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) @@ -671,10 +591,6 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct folio *folio); #else -static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) -{ -} - static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { @@ -697,5 +613,24 @@ static inline bool mem_cgroup_swap_full(struct folio *folio) } #endif +/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to + * and including the specified highidx + * @zone: The current zone in the iterator + * @pgdat: The pgdat which node_zones are being iterated + * @idx: The index variable + * @highidx: The index of the highest zone to return + * + * This macro iterates through all managed zones up to and including the specified highidx. + * The zone iterator enters an invalid state after macro call and must be reinitialized + * before it can be used again. + */ +#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \ + for ((idx) = 0, (zone) = (pgdat)->node_zones; \ + (idx) <= (highidx); \ + (idx)++, (zone)++) \ + if (!managed_zone(zone)) \ + continue; \ + else + #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ |
