Merge tag 'for-5.5/block-20191121' of git://git.kernel.dk/linux-block

Pull core block updates from Jens Axboe: "Due to more granular branches, this one is small and will be followed with other core branches that add specific features. I meant to just have a core and drivers branch, but due to external dependencies we ended up adding a few more that are also core. The changes are: - Fixes and improvements for the zoned device support (Ajay, Damien) - sed-opal table writing and datastore UID (Revanth) - blk-cgroup (and bfq) stat fixes (Tejun) - Improvements to the block stats tracking (Pavel) - Fix for overrunning sysfs buffer for a large number of CPUs (Ming) - Optimization for small IO (Ming, Christoph) - Fix typo in RWH lifetime hint (Eugene) - Dead code removal and documentation (Bart) - Reduction in memory usage for queue and tag set (Bart) - Kerneldoc header documentation (André) - Device/partition revalidation fixes (Jan) - Stats tracking for flush requests (Konstantin) - Various other little fixes here and there (et al)" * tag 'for-5.5/block-20191121' of git://git.kernel.dk/linux-block: (48 commits) Revert "block: split bio if the only bvec's length is > SZ_4K" block: add iostat counters for flush requests block,bfq: Skip tracing hooks if possible block: sed-opal: Introduce SUM_SET_LIST parameter and append it using 'add_token_u64' blk-cgroup: cgroup_rstat_updated() shouldn't be called on cgroup1 block: Don't disable interrupts in trigger_softirq() sbitmap: Delete sbitmap_any_bit_clear() blk-mq: Delete blk_mq_has_free_tags() and blk_mq_can_queue() block: split bio if the only bvec's length is > SZ_4K block: still try to split bio if the bvec crosses pages blk-cgroup: separate out blkg_rwstat under CONFIG_BLK_CGROUP_RWSTAT blk-cgroup: reimplement basic IO stats using cgroup rstat blk-cgroup: remove now unused blkg_print_stat_{bytes|ios}_recursive() blk-throtl: stop using blkg->stat_bytes and ->stat_ios bfq-iosched: stop using blkg->stat_bytes and ->stat_ios bfq-iosched: relocate bfqg_*rwstat*() helpers block: add zone open, 
close and finish ioctl support block: add zone open, close and finish operations block: Simplify REQ_OP_ZONE_RESET_ALL handling block: Remove REQ_OP_ZONE_RESET plugging ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2019-11-25 10:59:41 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2019-11-25 10:59:41 -0800
commit: ff6814b078e33a4d26fee9ea80779c81a6744cd8 (patch)
tree: b8559e89e01cad7d59e41e485d5c20ac6bb2e7ec /include
parent: 6e7b06a4c88846c20c2cc01b370564a2423ff0d0 (diff)
parent: 1e279153dfd53e76006720df804d5935a6cbc6d5 (diff)
download: lwn-ff6814b078e33a4d26fee9ea80779c81a6744cd8.tar.gz
lwn-ff6814b078e33a4d26fee9ea80779c81a6744cd8.zip
10 files changed, 369 insertions, 242 deletions
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index bed9e43f9426..19394c77ed99 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -15,7 +15,9 @@
  */
 
 #include <linux/cgroup.h>
+#include <linux/percpu.h>
 #include <linux/percpu_counter.h>
+#include <linux/u64_stats_sync.h>
 #include <linux/seq_file.h>
 #include <linux/radix-tree.h>
 #include <linux/blkdev.h>
@@ -31,15 +33,12 @@
 
 #ifdef CONFIG_BLK_CGROUP
 
-enum blkg_rwstat_type {
-	BLKG_RWSTAT_READ,
-	BLKG_RWSTAT_WRITE,
-	BLKG_RWSTAT_SYNC,
-	BLKG_RWSTAT_ASYNC,
-	BLKG_RWSTAT_DISCARD,
+enum blkg_iostat_type {
+	BLKG_IOSTAT_READ,
+	BLKG_IOSTAT_WRITE,
+	BLKG_IOSTAT_DISCARD,
 
-	BLKG_RWSTAT_NR,
-	BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
+	BLKG_IOSTAT_NR,
 };
 
 struct blkcg_gq;
@@ -61,17 +60,15 @@ struct blkcg {
 #endif
 };
 
-/*
- * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
- * recursive.  Used to carry stats of dead children.
- */
-struct blkg_rwstat {
-	struct percpu_counter		cpu_cnt[BLKG_RWSTAT_NR];
-	atomic64_t			aux_cnt[BLKG_RWSTAT_NR];
+struct blkg_iostat {
+	u64				bytes[BLKG_IOSTAT_NR];
+	u64				ios[BLKG_IOSTAT_NR];
 };
 
-struct blkg_rwstat_sample {
-	u64				cnt[BLKG_RWSTAT_NR];
+struct blkg_iostat_set {
+	struct u64_stats_sync		sync;
+	struct blkg_iostat		cur;
+	struct blkg_iostat		last;
 };
 
 /*
@@ -127,8 +124,8 @@ struct blkcg_gq {
 	/* is this blkg online? protected by both blkcg and q locks */
 	bool				online;
 
-	struct blkg_rwstat		stat_bytes;
-	struct blkg_rwstat		stat_ios;
+	struct blkg_iostat_set __percpu	*iostat_cpu;
+	struct blkg_iostat_set		iostat;
 
 	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
 
@@ -202,13 +199,6 @@ int blkcg_activate_policy(struct request_queue *q,
 void blkcg_deactivate_policy(struct request_queue *q,
 			     const struct blkcg_policy *pol);
 
-static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat,
-		unsigned int idx)
-{
-	return atomic64_read(&rwstat->aux_cnt[idx]) +
-		percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]);
-}
-
 const char *blkg_dev_name(struct blkcg_gq *blkg);
 void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
 		       u64 (*prfill)(struct seq_file *,
@@ -216,17 +206,6 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
 		       const struct blkcg_policy *pol, int data,
 		       bool show_total);
 u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
-u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
-			 const struct blkg_rwstat_sample *rwstat);
-u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
-		       int off);
-int blkg_print_stat_bytes(struct seq_file *sf, void *v);
-int blkg_print_stat_ios(struct seq_file *sf, void *v);
-int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v);
-int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v);
-
-void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
-		int off, struct blkg_rwstat_sample *sum);
 
 struct blkg_conf_ctx {
 	struct gendisk			*disk;
@@ -578,128 +557,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
 		if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),	\
 					      (p_blkg)->q, false)))
 
-static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
-{
-	int i, ret;
-
-	for (i = 0; i < BLKG_RWSTAT_NR; i++) {
-		ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
-		if (ret) {
-			while (--i >= 0)
-				percpu_counter_destroy(&rwstat->cpu_cnt[i]);
-			return ret;
-		}
-		atomic64_set(&rwstat->aux_cnt[i], 0);
-	}
-	return 0;
-}
-
-static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
-{
-	int i;
-
-	for (i = 0; i < BLKG_RWSTAT_NR; i++)
-		percpu_counter_destroy(&rwstat->cpu_cnt[i]);
-}
-
-/**
- * blkg_rwstat_add - add a value to a blkg_rwstat
- * @rwstat: target blkg_rwstat
- * @op: REQ_OP and flags
- * @val: value to add
- *
- * Add @val to @rwstat.  The counters are chosen according to @rw.  The
- * caller is responsible for synchronizing calls to this function.
- */
-static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
-				   unsigned int op, uint64_t val)
-{
-	struct percpu_counter *cnt;
-
-	if (op_is_discard(op))
-		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD];
-	else if (op_is_write(op))
-		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
-	else
-		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
-
-	percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
-
-	if (op_is_sync(op))
-		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
-	else
-		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
-
-	percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
-}
-
-/**
- * blkg_rwstat_read - read the current values of a blkg_rwstat
- * @rwstat: blkg_rwstat to read
- *
- * Read the current snapshot of @rwstat and return it in the aux counts.
- */
-static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat,
-		struct blkg_rwstat_sample *result)
-{
-	int i;
-
-	for (i = 0; i < BLKG_RWSTAT_NR; i++)
-		result->cnt[i] =
-			percpu_counter_sum_positive(&rwstat->cpu_cnt[i]);
-}
-
-/**
- * blkg_rwstat_total - read the total count of a blkg_rwstat
- * @rwstat: blkg_rwstat to read
- *
- * Return the total count of @rwstat regardless of the IO direction.  This
- * function can be called without synchronization and takes care of u64
- * atomicity.
- */
-static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
-{
-	struct blkg_rwstat_sample tmp = { };
-
-	blkg_rwstat_read(rwstat, &tmp);
-	return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
-}
-
-/**
- * blkg_rwstat_reset - reset a blkg_rwstat
- * @rwstat: blkg_rwstat to reset
- */
-static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
-{
-	int i;
-
-	for (i = 0; i < BLKG_RWSTAT_NR; i++) {
-		percpu_counter_set(&rwstat->cpu_cnt[i], 0);
-		atomic64_set(&rwstat->aux_cnt[i], 0);
-	}
-}
-
-/**
- * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count
- * @to: the destination blkg_rwstat
- * @from: the source
- *
- * Add @from's count including the aux one to @to's aux count.
- */
-static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
-				       struct blkg_rwstat *from)
-{
-	u64 sum[BLKG_RWSTAT_NR];
-	int i;
-
-	for (i = 0; i < BLKG_RWSTAT_NR; i++)
-		sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]);
-
-	for (i = 0; i < BLKG_RWSTAT_NR; i++)
-		atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]),
-			     &to->aux_cnt[i]);
-}
-
 #ifdef CONFIG_BLK_DEV_THROTTLING
 extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 			   struct bio *bio);
@@ -745,15 +602,33 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 	throtl = blk_throtl_bio(q, blkg, bio);
 
 	if (!throtl) {
+		struct blkg_iostat_set *bis;
+		int rwd, cpu;
+
+		if (op_is_discard(bio->bi_opf))
+			rwd = BLKG_IOSTAT_DISCARD;
+		else if (op_is_write(bio->bi_opf))
+			rwd = BLKG_IOSTAT_WRITE;
+		else
+			rwd = BLKG_IOSTAT_READ;
+
+		cpu = get_cpu();
+		bis = per_cpu_ptr(blkg->iostat_cpu, cpu);
+		u64_stats_update_begin(&bis->sync);
+
 		/*
 		 * If the bio is flagged with BIO_QUEUE_ENTERED it means this
 		 * is a split bio and we would have already accounted for the
 		 * size of the bio.
 		 */
 		if (!bio_flagged(bio, BIO_QUEUE_ENTERED))
-			blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf,
-					bio->bi_iter.bi_size);
-		blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1);
+			bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
+		bis->cur.ios[rwd]++;
+
+		u64_stats_update_end(&bis->sync);
+		if (cgroup_subsys_on_dfl(io_cgrp_subsys))
+			cgroup_rstat_updated(blkg->blkcg->css.cgroup, cpu);
+		put_cpu();
 	}
 
 	blkcg_bio_issue_init(bio);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0bf056de5cc3..11cfd6470b1a 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -10,103 +10,239 @@ struct blk_mq_tags;
 struct blk_flush_queue;
 
 /**
- * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware block device
+ * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
+ * block device
  */
 struct blk_mq_hw_ctx {
 	struct {
+		/** @lock: Protects the dispatch list. */
 		spinlock_t		lock;
+		/**
+		 * @dispatch: Used for requests that are ready to be
+		 * dispatched to the hardware but for some reason (e.g. lack of
+		 * resources) could not be sent to the hardware. As soon as the
+		 * driver can send new requests, requests at this list will
+		 * be sent first for a fairer dispatch.
+		 */
 		struct list_head	dispatch;
-		unsigned long		state;		/* BLK_MQ_S_* flags */
+		 /**
+		  * @state: BLK_MQ_S_* flags. Defines the state of the hw
+		  * queue (active, scheduled to restart, stopped).
+		  */
+		unsigned long		state;
 	} ____cacheline_aligned_in_smp;
 
+	/**
+	 * @run_work: Used for scheduling a hardware queue run at a later time.
+	 */
 	struct delayed_work	run_work;
+	/** @cpumask: Map of available CPUs where this hctx can run. */
 	cpumask_var_t		cpumask;
+	/**
+	 * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
+	 * selection from @cpumask.
+	 */
 	int			next_cpu;
+	/**
+	 * @next_cpu_batch: Counter of how many work items are left in the batch
+	 * before changing to the next CPU.
+	 */
 	int			next_cpu_batch;
 
-	unsigned long		flags;		/* BLK_MQ_F_* flags */
+	/** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
+	unsigned long		flags;
 
+	/**
+	 * @sched_data: Pointer owned by the IO scheduler attached to a request
+	 * queue. It's up to the IO scheduler how to use this pointer.
+	 */
 	void			*sched_data;
+	/**
+	 * @queue: Pointer to the request queue that owns this hardware context.
+	 */
 	struct request_queue	*queue;
+	/** @fq: Queue of requests that need to perform a flush operation. */
 	struct blk_flush_queue	*fq;
 
+	/**
+	 * @driver_data: Pointer to data owned by the block driver that created
+	 * this hctx
+	 */
 	void			*driver_data;
 
+	/**
+	 * @ctx_map: Bitmap for each software queue. If bit is on, there is a
+	 * pending request in that software queue.
+	 */
 	struct sbitmap		ctx_map;
 
+	/**
+	 * @dispatch_from: Software queue to be used when no scheduler was
+	 * selected.
+	 */
 	struct blk_mq_ctx	*dispatch_from;
+	/**
+	 * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
+	 * decide if the hw_queue is busy using Exponential Weighted Moving
+	 * Average algorithm.
+	 */
 	unsigned int		dispatch_busy;
 
+	/** @type: HCTX_TYPE_* flags. Type of hardware queue. */
 	unsigned short		type;
+	/** @nr_ctx: Number of software queues. */
 	unsigned short		nr_ctx;
+	/** @ctxs: Array of software queues. */
 	struct blk_mq_ctx	**ctxs;
 
+	/** @dispatch_wait_lock: Lock for dispatch_wait queue. */
 	spinlock_t		dispatch_wait_lock;
+	/**
+	 * @dispatch_wait: Waitqueue to put requests when there is no tag
+	 * available at the moment, to wait for another try in the future.
+	 */
 	wait_queue_entry_t	dispatch_wait;
+
+	/**
+	 * @wait_index: Index of next available dispatch_wait queue to insert
+	 * requests.
+	 */
 	atomic_t		wait_index;
 
+	/**
+	 * @tags: Tags owned by the block driver. A tag at this set is only
+	 * assigned when a request is dispatched from a hardware queue.
+	 */
 	struct blk_mq_tags	*tags;
+	/**
+	 * @sched_tags: Tags owned by I/O scheduler. If there is an I/O
+	 * scheduler associated with a request queue, a tag is assigned when
+	 * that request is allocated. Else, this member is not used.
+	 */
 	struct blk_mq_tags	*sched_tags;
 
+	/** @queued: Number of queued requests. */
 	unsigned long		queued;
+	/** @run: Number of dispatched requests. */
 	unsigned long		run;
 #define BLK_MQ_MAX_DISPATCH_ORDER	7
+	/** @dispatched: Number of dispatch requests by queue. */
 	unsigned long		dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
 
+	/** @numa_node: NUMA node the storage adapter has been connected to. */
 	unsigned int		numa_node;
+	/** @queue_num: Index of this hardware queue. */
 	unsigned int		queue_num;
 
+	/**
+	 * @nr_active: Number of active requests. Only used when a tag set is
+	 * shared across request queues.
+	 */
 	atomic_t		nr_active;
 
+	/** @cpuhp_dead: List to store request if some CPU die. */
 	struct hlist_node	cpuhp_dead;
+	/** @kobj: Kernel object for sysfs. */
 	struct kobject		kobj;
 
+	/** @poll_considered: Count times blk_poll() was called. */
 	unsigned long		poll_considered;
+	/** @poll_invoked: Count how many requests blk_poll() polled. */
 	unsigned long		poll_invoked;
+	/** @poll_success: Count how many polled requests were completed. */
 	unsigned long		poll_success;
 
 #ifdef CONFIG_BLK_DEBUG_FS
+	/**
+	 * @debugfs_dir: debugfs directory for this hardware queue. Named
+	 * as cpu<cpu_number>.
+	 */
 	struct dentry		*debugfs_dir;
+	/** @sched_debugfs_dir:	debugfs directory for the scheduler. */
 	struct dentry		*sched_debugfs_dir;
 #endif
 
+	/** @hctx_list:	List of all hardware queues. */
 	struct list_head	hctx_list;
 
-	/* Must be the last member - see also blk_mq_hw_ctx_size(). */
+	/**
+	 * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
+	 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
+	 * blk_mq_hw_ctx_size().
+	 */
 	struct srcu_struct	srcu[0];
 };
 
+/**
+ * struct blk_mq_queue_map - Map software queues to hardware queues
+ * @mq_map:       CPU ID to hardware queue index map. This is an array
+ *	with nr_cpu_ids elements. Each element has a value in the range
+ *	[@queue_offset, @queue_offset + @nr_queues).
+ * @nr_queues:    Number of hardware queues to map CPU IDs onto.
+ * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
+ *	driver to map each hardware queue type (enum hctx_type) onto a distinct
+ *	set of hardware queues.
+ */
 struct blk_mq_queue_map {
 	unsigned int *mq_map;
 	unsigned int nr_queues;
 	unsigned int queue_offset;
 };
 
+/**
+ * enum hctx_type - Type of hardware queue
+ * @HCTX_TYPE_DEFAULT:	All I/O not otherwise accounted for.
+ * @HCTX_TYPE_READ:	Just for READ I/O.
+ * @HCTX_TYPE_POLL:	Polled I/O of any kind.
+ * @HCTX_MAX_TYPES:	Number of types of hctx.
+ */
 enum hctx_type {
-	HCTX_TYPE_DEFAULT,	/* all I/O not otherwise accounted for */
-	HCTX_TYPE_READ,		/* just for READ I/O */
-	HCTX_TYPE_POLL,		/* polled I/O of any kind */
+	HCTX_TYPE_DEFAULT,
+	HCTX_TYPE_READ,
+	HCTX_TYPE_POLL,
 
 	HCTX_MAX_TYPES,
 };
 
+/**
+ * struct blk_mq_tag_set - tag set that can be shared between request queues
+ * @map:	   One or more ctx -> hctx mappings. One map exists for each
+ *		   hardware queue type (enum hctx_type) that the driver wishes
+ *		   to support. There are no restrictions on maps being of the
+ *		   same size, and it's perfectly legal to share maps between
+ *		   types.
+ * @nr_maps:	   Number of elements in the @map array. A number in the range
+ *		   [1, HCTX_MAX_TYPES].
+ * @ops:	   Pointers to functions that implement block driver behavior.
+ * @nr_hw_queues:  Number of hardware queues supported by the block driver that
+ *		   owns this data structure.
+ * @queue_depth:   Number of tags per hardware queue, reserved tags included.
+ * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
+ *		   allocations.
+ * @cmd_size:	   Number of additional bytes to allocate per request. The block
+ *		   driver owns these additional bytes.
+ * @numa_node:	   NUMA node the storage adapter has been connected to.
+ * @timeout:	   Request processing timeout in jiffies.
+ * @flags:	   Zero or more BLK_MQ_F_* flags.
+ * @driver_data:   Pointer to data owned by the block driver that created this
+ *		   tag set.
+ * @tags:	   Tag sets. One tag set per hardware queue. Has @nr_hw_queues
+ *		   elements.
+ * @tag_list_lock: Serializes tag_list accesses.
+ * @tag_list:	   List of the request queues that use this tag set. See also
+ *		   request_queue.tag_set_list.
+ */
 struct blk_mq_tag_set {
-	/*
-	 * map[] holds ctx -> hctx mappings, one map exists for each type
-	 * that the driver wishes to support. There are no restrictions
-	 * on maps being of the same size, and it's perfectly legal to
-	 * share maps between types.
-	 */
 	struct blk_mq_queue_map	map[HCTX_MAX_TYPES];
-	unsigned int		nr_maps;	/* nr entries in map[] */
+	unsigned int		nr_maps;
 	const struct blk_mq_ops	*ops;
-	unsigned int		nr_hw_queues;	/* nr hw queues across maps */
-	unsigned int		queue_depth;	/* max hw supported */
+	unsigned int		nr_hw_queues;
+	unsigned int		queue_depth;
 	unsigned int		reserved_tags;
-	unsigned int		cmd_size;	/* per-request extra data */
+	unsigned int		cmd_size;
 	int			numa_node;
 	unsigned int		timeout;
-	unsigned int		flags;		/* BLK_MQ_F_* */
+	unsigned int		flags;
 	void			*driver_data;
 
 	struct blk_mq_tags	**tags;
@@ -115,6 +251,12 @@ struct blk_mq_tag_set {
 	struct list_head	tag_list;
 };
 
+/**
+ * struct blk_mq_queue_data - Data about a request inserted in a queue
+ *
+ * @rq:   Request pointer.
+ * @last: If it is the last request in the queue.
+ */
 struct blk_mq_queue_data {
 	struct request *rq;
 	bool last;
@@ -142,81 +284,101 @@ typedef bool (busy_fn)(struct request_queue *);
 typedef void (complete_fn)(struct request *);
 typedef void (cleanup_rq_fn)(struct request *);
 
-
+/**
+ * struct blk_mq_ops - Callback functions that implement block driver
+ * behaviour.
+ */
 struct blk_mq_ops {
-	/*
-	 * Queue request
+	/**
+	 * @queue_rq: Queue a new request from block IO.
 	 */
 	queue_rq_fn		*queue_rq;
 
-	/*
-	 * If a driver uses bd->last to judge when to submit requests to
-	 * hardware, it must define this function. In case of errors that
-	 * make us stop issuing further requests, this hook serves the
+	/**
+	 * @commit_rqs: If a driver uses bd->last to judge when to submit
+	 * requests to hardware, it must define this function. In case of errors
+	 * that make us stop issuing further requests, this hook serves the
 	 * purpose of kicking the hardware (which the last request otherwise
 	 * would have done).
 	 */
 	commit_rqs_fn		*commit_rqs;
 
-	/*
-	 * Reserve budget before queue request, once .queue_rq is
+	/**
+	 * @get_budget: Reserve budget before queue request, once .queue_rq is
 	 * run, it is driver's responsibility to release the
 	 * reserved budget. Also we have to handle failure case
 	 * of .get_budget for avoiding I/O deadlock.
 	 */
 	get_budget_fn		*get_budget;
+	/**
+	 * @put_budget: Release the reserved budget.
+	 */
 	put_budget_fn		*put_budget;
 
-	/*
-	 * Called on request timeout
+	/**
+	 * @timeout: Called on request timeout.
 	 */
 	timeout_fn		*timeout;
 
-	/*
-	 * Called to poll for completion of a specific tag.
+	/**
+	 * @poll: Called to poll for completion of a specific tag.
 	 */
 	poll_fn			*poll;
 
+	/**
+	 * @complete: Mark the request as complete.
+	 */
 	complete_fn		*complete;
 
-	/*
-	 * Called when the block layer side of a hardware queue has been
-	 * set up, allowing the driver to allocate/init matching structures.
-	 * Ditto for exit/teardown.
+	/**
+	 * @init_hctx: Called when the block layer side of a hardware queue has
+	 * been set up, allowing the driver to allocate/init matching
+	 * structures.
 	 */
 	init_hctx_fn		*init_hctx;
+	/**
+	 * @exit_hctx: Ditto for exit/teardown.
+	 */
 	exit_hctx_fn		*exit_hctx;
 
-	/*
-	 * Called for every command allocated by the block layer to allow
-	 * the driver to set up driver specific data.
+	/**
+	 * @init_request: Called for every command allocated by the block layer
+	 * to allow the driver to set up driver specific data.
 	 *
 	 * Tag greater than or equal to queue_depth is for setting up
 	 * flush request.
-	 *
-	 * Ditto for exit/teardown.
 	 */
 	init_request_fn		*init_request;
+	/**
+	 * @exit_request: Ditto for exit/teardown.
+	 */
 	exit_request_fn		*exit_request;
-	/* Called from inside blk_get_request() */
+
+	/**
+	 * @initialize_rq_fn: Called from inside blk_get_request().
+	 */
 	void (*initialize_rq_fn)(struct request *rq);
 
-	/*
-	 * Called before freeing one request which isn't completed yet,
-	 * and usually for freeing the driver private data
+	/**
+	 * @cleanup_rq: Called before freeing one request which isn't completed
+	 * yet, and usually for freeing the driver private data.
 	 */
 	cleanup_rq_fn		*cleanup_rq;
 
-	/*
-	 * If set, returns whether or not this queue currently is busy
+	/**
+	 * @busy: If set, returns whether or not this queue currently is busy.
 	 */
 	busy_fn			*busy;
 
+	/**
+	 * @map_queues: This allows drivers to specify their own queue mapping
+	 * by overriding the setup-time function that builds the mq_map.
+	 */
 	map_queues_fn		*map_queues;
 
 #ifdef CONFIG_BLK_DEBUG_FS
-	/*
-	 * Used by the debugfs implementation to show driver-specific
+	/**
+	 * @show_rq: Used by the debugfs implementation to show driver-specific
 	 * information about a request.
 	 */
 	void (*show_rq)(struct seq_file *m, struct request *rq);
@@ -262,7 +424,6 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 
 void blk_mq_free_request(struct request *rq);
-bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 
 bool blk_mq_queue_inflight(struct request_queue *q);
 
@@ -301,9 +462,25 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
 	return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
 }
 
+/**
+ * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
+ * @rq: target request.
+ */
+static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
+{
+	return READ_ONCE(rq->state);
+}
+
+static inline int blk_mq_request_started(struct request *rq)
+{
+	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
+}
+
+static inline int blk_mq_request_completed(struct request *rq)
+{
+	return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
+}
 
-int blk_mq_request_started(struct request *rq);
-int blk_mq_request_completed(struct request *rq);
 void blk_mq_start_request(struct request *rq);
 void blk_mq_end_request(struct request *rq, blk_status_t error);
 void __blk_mq_end_request(struct request *rq, blk_status_t error);
@@ -324,7 +501,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
 void blk_mq_quiesce_queue(struct request_queue *q);
 void blk_mq_unquiesce_queue(struct request_queue *q);
 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
-bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 		busy_tag_iter_fn *fn, void *priv);
@@ -343,14 +520,29 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q);
 
 unsigned int blk_mq_rq_cpu(struct request *rq);
 
-/*
+/**
+ * blk_mq_rq_from_pdu - cast a PDU to a request
+ * @pdu: the PDU (Protocol Data Unit) to be cast
+ *
+ * Return: request
+ *
  * Driver command data is immediately after the request. So subtract request
- * size to get back to the original request, add request size to get the PDU.
+ * size to get back to the original request.
  */
 static inline struct request *blk_mq_rq_from_pdu(void *pdu)
 {
 	return pdu - sizeof(struct request);
 }
+
+/**
+ * blk_mq_rq_to_pdu - cast a request to a PDU
+ * @rq: the request to be cast
+ *
+ * Return: pointer to the PDU
+ *
+ * Driver command data is immediately after the request. So add the request
+ * size to get the PDU.
+ */
 static inline void *blk_mq_rq_to_pdu(struct request *rq)
 {
 	return rq + 1;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d688b96d1d63..70254ae11769 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -153,10 +153,10 @@ struct bio {
 	unsigned short		bi_write_hint;
 	blk_status_t		bi_status;
 	u8			bi_partno;
+	atomic_t		__bi_remaining;
 
 	struct bvec_iter	bi_iter;
 
-	atomic_t		__bi_remaining;
 	bio_end_io_t		*bi_end_io;
 
 	void			*bi_private;
@@ -290,6 +290,12 @@ enum req_opf {
 	REQ_OP_ZONE_RESET_ALL	= 8,
 	/* write the zero filled sector many times */
 	REQ_OP_WRITE_ZEROES	= 9,
+	/* Open a zone */
+	REQ_OP_ZONE_OPEN	= 10,
+	/* Close a zone */
+	REQ_OP_ZONE_CLOSE	= 11,
+	/* Transition a zone to full */
+	REQ_OP_ZONE_FINISH	= 12,
 
 	/* SCSI passthrough using struct scsi_request */
 	REQ_OP_SCSI_IN		= 32,
@@ -371,6 +377,7 @@ enum stat_group {
 	STAT_READ,
 	STAT_WRITE,
 	STAT_DISCARD,
+	STAT_FLUSH,
 
 	NR_STAT_GROUPS
 };
@@ -417,6 +424,25 @@ static inline bool op_is_discard(unsigned int op)
 	return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
 }
 
+/*
+ * Check if a bio or request operation is a zone management operation, with
+ * the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case
+ * due to its different handling in the block layer and device response in
+ * case of command failure.
+ */
+static inline bool op_is_zone_mgmt(enum req_opf op)
+{
+	switch (op & REQ_OP_MASK) {
+	case REQ_OP_ZONE_RESET:
+	case REQ_OP_ZONE_OPEN:
+	case REQ_OP_ZONE_CLOSE:
+	case REQ_OP_ZONE_FINISH:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static inline int op_stat_group(unsigned int op)
 {
 	if (op_is_discard(op))
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f3ea78b0c91c..6a4f7abbdcf7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -360,14 +360,15 @@ extern unsigned int blkdev_nr_zones(struct block_device *bdev);
 extern int blkdev_report_zones(struct block_device *bdev,
 			       sector_t sector, struct blk_zone *zones,
 			       unsigned int *nr_zones);
-extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors,
-			      sector_t nr_sectors, gfp_t gfp_mask);
+extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
+			    sector_t sectors, sector_t nr_sectors,
+			    gfp_t gfp_mask);
 extern int blk_revalidate_disk_zones(struct gendisk *disk);
 
 extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
 				     unsigned int cmd, unsigned long arg);
-extern int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode,
-				    unsigned int cmd, unsigned long arg);
+extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
+				  unsigned int cmd, unsigned long arg);
 
 #else /* CONFIG_BLK_DEV_ZONED */
 
@@ -388,9 +389,9 @@ static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
 	return -ENOTTY;
 }
 
-static inline int blkdev_reset_zones_ioctl(struct block_device *bdev,
-					   fmode_t mode, unsigned int cmd,
-					   unsigned long arg)
+static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev,
+					 fmode_t mode, unsigned int cmd,
+					 unsigned long arg)
 {
 	return -ENOTTY;
 }
@@ -411,7 +412,6 @@ struct request_queue {
 
 	/* sw queues */
 	struct blk_mq_ctx __percpu	*queue_ctx;
-	unsigned int		nr_queues;
 
 	unsigned int		queue_depth;
 
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index a986ac12a848..e40d019c3d9d 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -216,15 +216,6 @@ int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
  */
 bool sbitmap_any_bit_set(const struct sbitmap *sb);
 
-/**
- * sbitmap_any_bit_clear() - Check for an unset bit in a &struct
- * sbitmap.
- * @sb: Bitmap to check.
- *
- * Return: true if any bit in the bitmap is clear, false otherwise.
- */
-bool sbitmap_any_bit_clear(const struct sbitmap *sb);
-
 #define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift)
 #define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U))
 
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index 53c28d750a45..1ac0d712a9c3 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -42,6 +42,7 @@ static inline bool is_sed_ioctl(unsigned int cmd)
 	case IOC_OPAL_PSID_REVERT_TPR:
 	case IOC_OPAL_MBR_DONE:
 	case IOC_OPAL_WRITE_SHADOW_MBR:
+	case IOC_OPAL_GENERIC_TABLE_RW:
 		return true;
 	}
 	return false;
diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h
index b048694070e2..37342a13c9cb 100644
--- a/include/trace/events/wbt.h
+++ b/include/trace/events/wbt.h
@@ -33,7 +33,8 @@ TRACE_EVENT(wbt_stat,
 	),
 
 	TP_fast_assign(
-		strncpy(__entry->name, dev_name(bdi->dev), 32);
+		strlcpy(__entry->name, dev_name(bdi->dev),
+			ARRAY_SIZE(__entry->name));
 		__entry->rmean		= stat[0].mean;
 		__entry->rmin		= stat[0].min;
 		__entry->rmax		= stat[0].max;
@@ -67,7 +68,8 @@ TRACE_EVENT(wbt_lat,
 	),
 
 	TP_fast_assign(
-		strncpy(__entry->name, dev_name(bdi->dev), 32);
+		strlcpy(__entry->name, dev_name(bdi->dev),
+			ARRAY_SIZE(__entry->name));
 		__entry->lat = div_u64(lat, 1000);
 	),
 
@@ -103,7 +105,8 @@ TRACE_EVENT(wbt_step,
 	),
 
 	TP_fast_assign(
-		strncpy(__entry->name, dev_name(bdi->dev), 32);
+		strlcpy(__entry->name, dev_name(bdi->dev),
+			ARRAY_SIZE(__entry->name));
 		__entry->msg	= msg;
 		__entry->step	= step;
 		__entry->window	= div_u64(window, 1000);
@@ -138,7 +141,8 @@ TRACE_EVENT(wbt_timer,
 	),
 
 	TP_fast_assign(
-		strncpy(__entry->name, dev_name(bdi->dev), 32);
+		strlcpy(__entry->name, dev_name(bdi->dev),
+			ARRAY_SIZE(__entry->name));
 		__entry->status		= status;
 		__entry->step		= step;
 		__entry->inflight	= inflight;
diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
index 498eec813494..0cdef67135f0 100644
--- a/include/uapi/linux/blkzoned.h
+++ b/include/uapi/linux/blkzoned.h
@@ -120,9 +120,11 @@ struct blk_zone_report {
 };
 
 /**
- * struct blk_zone_range - BLKRESETZONE ioctl request
- * @sector: starting sector of the first zone to issue reset write pointer
- * @nr_sectors: Total number of sectors of 1 or more zones to reset
+ * struct blk_zone_range - BLKRESETZONE/BLKOPENZONE/
+ *                         BLKCLOSEZONE/BLKFINISHZONE ioctl
+ *                         requests
+ * @sector: Starting sector of the first zone to operate on.
+ * @nr_sectors: Total number of sectors of all zones to operate on.
  */
 struct blk_zone_range {
 	__u64		sector;
@@ -139,10 +141,19 @@ struct blk_zone_range {
  *                sector range. The sector range must be zone aligned.
  * @BLKGETZONESZ: Get the device zone size in number of 512 B sectors.
  * @BLKGETNRZONES: Get the total number of zones of the device.
+ * @BLKOPENZONE: Open the zones in the specified sector range.
+ *               The 512 B sector range must be zone aligned.
+ * @BLKCLOSEZONE: Close the zones in the specified sector range.
+ *                The 512 B sector range must be zone aligned.
+ * @BLKFINISHZONE: Mark the zones as full in the specified sector range.
+ *                 The 512 B sector range must be zone aligned.
  */
 #define BLKREPORTZONE	_IOWR(0x12, 130, struct blk_zone_report)
 #define BLKRESETZONE	_IOW(0x12, 131, struct blk_zone_range)
 #define BLKGETZONESZ	_IOR(0x12, 132, __u32)
 #define BLKGETNRZONES	_IOR(0x12, 133, __u32)
+#define BLKOPENZONE	_IOW(0x12, 134, struct blk_zone_range)
+#define BLKCLOSEZONE	_IOW(0x12, 135, struct blk_zone_range)
+#define BLKFINISHZONE	_IOW(0x12, 136, struct blk_zone_range)
 
 #endif /* _UAPI_BLKZONED_H */
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 1d338357df8a..1f97b33c840e 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -58,7 +58,7 @@
  * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
  * used to clear any hints previously set.
  */
-#define RWF_WRITE_LIFE_NOT_SET	0
+#define RWH_WRITE_LIFE_NOT_SET	0
 #define RWH_WRITE_LIFE_NONE	1
 #define RWH_WRITE_LIFE_SHORT	2
 #define RWH_WRITE_LIFE_MEDIUM	3
@@ -66,6 +66,13 @@
 #define RWH_WRITE_LIFE_EXTREME	5
 
 /*
+ * The original RWF_ spelling is kept as an alias; it dates from the
+ * first versions of the patch set that introduced the feature, see
+ * commit v4.13-rc1~212^2~51.
+ */
+#define RWF_WRITE_LIFE_NOT_SET	RWH_WRITE_LIFE_NOT_SET
+
+/*
  * Types of directory notifications that may be requested.
  */
 #define DN_ACCESS	0x00000001	/* File accessed */
diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h
index c6d035fa1b6c..6f5af1a84213 100644
--- a/include/uapi/linux/sed-opal.h
+++ b/include/uapi/linux/sed-opal.h
@@ -113,6 +113,25 @@ struct opal_shadow_mbr {
 	__u64 size;
 };
 
+/* Opal table operations */
+enum opal_table_ops {
+	OPAL_READ_TABLE,
+	OPAL_WRITE_TABLE,
+};
+
+#define OPAL_UID_LENGTH 8
+struct opal_read_write_table {
+	struct opal_key key;
+	const __u64 data;
+	const __u8 table_uid[OPAL_UID_LENGTH];
+	__u64 offset;
+	__u64 size;
+#define OPAL_TABLE_READ (1 << OPAL_READ_TABLE)
+#define OPAL_TABLE_WRITE (1 << OPAL_WRITE_TABLE)
+	__u64 flags;
+	__u64 priv;
+};
+
 #define IOC_OPAL_SAVE		    _IOW('p', 220, struct opal_lock_unlock)
 #define IOC_OPAL_LOCK_UNLOCK	    _IOW('p', 221, struct opal_lock_unlock)
 #define IOC_OPAL_TAKE_OWNERSHIP	    _IOW('p', 222, struct opal_key)
@@ -128,5 +147,6 @@ struct opal_shadow_mbr {
 #define IOC_OPAL_PSID_REVERT_TPR    _IOW('p', 232, struct opal_key)
 #define IOC_OPAL_MBR_DONE           _IOW('p', 233, struct opal_mbr_done)
 #define IOC_OPAL_WRITE_SHADOW_MBR   _IOW('p', 234, struct opal_shadow_mbr)
+#define IOC_OPAL_GENERIC_TABLE_RW   _IOW('p', 235, struct opal_read_write_table)
 
 #endif /* _UAPI_SED_OPAL_H */
author	Linus Torvalds <torvalds@linux-foundation.org>	2019-11-25 10:59:41 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-11-25 10:59:41 -0800
commit	ff6814b078e33a4d26fee9ea80779c81a6744cd8 (patch)
tree	b8559e89e01cad7d59e41e485d5c20ac6bb2e7ec /include
parent	6e7b06a4c88846c20c2cc01b370564a2423ff0d0 (diff)
parent	1e279153dfd53e76006720df804d5935a6cbc6d5 (diff)
download	lwn-ff6814b078e33a4d26fee9ea80779c81a6744cd8.tar.gz lwn-ff6814b078e33a4d26fee9ea80779c81a6744cd8.zip