summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-11-25 10:59:41 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2019-11-25 10:59:41 -0800
commitff6814b078e33a4d26fee9ea80779c81a6744cd8 (patch)
treeb8559e89e01cad7d59e41e485d5c20ac6bb2e7ec /include
parent6e7b06a4c88846c20c2cc01b370564a2423ff0d0 (diff)
parent1e279153dfd53e76006720df804d5935a6cbc6d5 (diff)
downloadlwn-ff6814b078e33a4d26fee9ea80779c81a6744cd8.tar.gz
lwn-ff6814b078e33a4d26fee9ea80779c81a6744cd8.zip
Merge tag 'for-5.5/block-20191121' of git://git.kernel.dk/linux-block
Pull core block updates from Jens Axboe: "Due to more granular branches, this one is small and will be followed with other core branches that add specific features. I meant to just have a core and drivers branch, but external dependencies we ended up adding a few more that are also core. The changes are: - Fixes and improvements for the zoned device support (Ajay, Damien) - sed-opal table writing and datastore UID (Revanth) - blk-cgroup (and bfq) blk-cgroup stat fixes (Tejun) - Improvements to the block stats tracking (Pavel) - Fix for overruning sysfs buffer for large number of CPUs (Ming) - Optimization for small IO (Ming, Christoph) - Fix typo in RWH lifetime hint (Eugene) - Dead code removal and documentation (Bart) - Reduction in memory usage for queue and tag set (Bart) - Kerneldoc header documentation (André) - Device/partition revalidation fixes (Jan) - Stats tracking for flush requests (Konstantin) - Various other little fixes here and there (et al)" * tag 'for-5.5/block-20191121' of git://git.kernel.dk/linux-block: (48 commits) Revert "block: split bio if the only bvec's length is > SZ_4K" block: add iostat counters for flush requests block,bfq: Skip tracing hooks if possible block: sed-opal: Introduce SUM_SET_LIST parameter and append it using 'add_token_u64' blk-cgroup: cgroup_rstat_updated() shouldn't be called on cgroup1 block: Don't disable interrupts in trigger_softirq() sbitmap: Delete sbitmap_any_bit_clear() blk-mq: Delete blk_mq_has_free_tags() and blk_mq_can_queue() block: split bio if the only bvec's length is > SZ_4K block: still try to split bio if the bvec crosses pages blk-cgroup: separate out blkg_rwstat under CONFIG_BLK_CGROUP_RWSTAT blk-cgroup: reimplement basic IO stats using cgroup rstat blk-cgroup: remove now unused blkg_print_stat_{bytes|ios}_recursive() blk-throtl: stop using blkg->stat_bytes and ->stat_ios bfq-iosched: stop using blkg->stat_bytes and ->stat_ios bfq-iosched: relocate bfqg_*rwstat*() helpers block: add zone open, close and finish ioctl support block: add zone open, close and finish operations block: Simplify REQ_OP_ZONE_RESET_ALL handling block: Remove REQ_OP_ZONE_RESET plugging ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/blk-cgroup.h199
-rw-r--r--include/linux/blk-mq.h300
-rw-r--r--include/linux/blk_types.h28
-rw-r--r--include/linux/blkdev.h16
-rw-r--r--include/linux/sbitmap.h9
-rw-r--r--include/linux/sed-opal.h1
-rw-r--r--include/trace/events/wbt.h12
-rw-r--r--include/uapi/linux/blkzoned.h17
-rw-r--r--include/uapi/linux/fcntl.h9
-rw-r--r--include/uapi/linux/sed-opal.h20
10 files changed, 369 insertions, 242 deletions
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index bed9e43f9426..19394c77ed99 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -15,7 +15,9 @@
*/
#include <linux/cgroup.h>
+#include <linux/percpu.h>
#include <linux/percpu_counter.h>
+#include <linux/u64_stats_sync.h>
#include <linux/seq_file.h>
#include <linux/radix-tree.h>
#include <linux/blkdev.h>
@@ -31,15 +33,12 @@
#ifdef CONFIG_BLK_CGROUP
-enum blkg_rwstat_type {
- BLKG_RWSTAT_READ,
- BLKG_RWSTAT_WRITE,
- BLKG_RWSTAT_SYNC,
- BLKG_RWSTAT_ASYNC,
- BLKG_RWSTAT_DISCARD,
+enum blkg_iostat_type {
+ BLKG_IOSTAT_READ,
+ BLKG_IOSTAT_WRITE,
+ BLKG_IOSTAT_DISCARD,
- BLKG_RWSTAT_NR,
- BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
+ BLKG_IOSTAT_NR,
};
struct blkcg_gq;
@@ -61,17 +60,15 @@ struct blkcg {
#endif
};
-/*
- * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
- * recursive. Used to carry stats of dead children.
- */
-struct blkg_rwstat {
- struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR];
- atomic64_t aux_cnt[BLKG_RWSTAT_NR];
+struct blkg_iostat {
+ u64 bytes[BLKG_IOSTAT_NR];
+ u64 ios[BLKG_IOSTAT_NR];
};
-struct blkg_rwstat_sample {
- u64 cnt[BLKG_RWSTAT_NR];
+struct blkg_iostat_set {
+ struct u64_stats_sync sync;
+ struct blkg_iostat cur;
+ struct blkg_iostat last;
};
/*
@@ -127,8 +124,8 @@ struct blkcg_gq {
/* is this blkg online? protected by both blkcg and q locks */
bool online;
- struct blkg_rwstat stat_bytes;
- struct blkg_rwstat stat_ios;
+ struct blkg_iostat_set __percpu *iostat_cpu;
+ struct blkg_iostat_set iostat;
struct blkg_policy_data *pd[BLKCG_MAX_POLS];
@@ -202,13 +199,6 @@ int blkcg_activate_policy(struct request_queue *q,
void blkcg_deactivate_policy(struct request_queue *q,
const struct blkcg_policy *pol);
-static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat,
- unsigned int idx)
-{
- return atomic64_read(&rwstat->aux_cnt[idx]) +
- percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]);
-}
-
const char *blkg_dev_name(struct blkcg_gq *blkg);
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
u64 (*prfill)(struct seq_file *,
@@ -216,17 +206,6 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
const struct blkcg_policy *pol, int data,
bool show_total);
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
-u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
- const struct blkg_rwstat_sample *rwstat);
-u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
- int off);
-int blkg_print_stat_bytes(struct seq_file *sf, void *v);
-int blkg_print_stat_ios(struct seq_file *sf, void *v);
-int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v);
-int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v);
-
-void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
- int off, struct blkg_rwstat_sample *sum);
struct blkg_conf_ctx {
struct gendisk *disk;
@@ -578,128 +557,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \
(p_blkg)->q, false)))
-static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
-{
- int i, ret;
-
- for (i = 0; i < BLKG_RWSTAT_NR; i++) {
- ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
- if (ret) {
- while (--i >= 0)
- percpu_counter_destroy(&rwstat->cpu_cnt[i]);
- return ret;
- }
- atomic64_set(&rwstat->aux_cnt[i], 0);
- }
- return 0;
-}
-
-static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
-{
- int i;
-
- for (i = 0; i < BLKG_RWSTAT_NR; i++)
- percpu_counter_destroy(&rwstat->cpu_cnt[i]);
-}
-
-/**
- * blkg_rwstat_add - add a value to a blkg_rwstat
- * @rwstat: target blkg_rwstat
- * @op: REQ_OP and flags
- * @val: value to add
- *
- * Add @val to @rwstat. The counters are chosen according to @rw. The
- * caller is responsible for synchronizing calls to this function.
- */
-static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
- unsigned int op, uint64_t val)
-{
- struct percpu_counter *cnt;
-
- if (op_is_discard(op))
- cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD];
- else if (op_is_write(op))
- cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
- else
- cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
-
- percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
-
- if (op_is_sync(op))
- cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
- else
- cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
-
- percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
-}
-
-/**
- * blkg_rwstat_read - read the current values of a blkg_rwstat
- * @rwstat: blkg_rwstat to read
- *
- * Read the current snapshot of @rwstat and return it in the aux counts.
- */
-static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat,
- struct blkg_rwstat_sample *result)
-{
- int i;
-
- for (i = 0; i < BLKG_RWSTAT_NR; i++)
- result->cnt[i] =
- percpu_counter_sum_positive(&rwstat->cpu_cnt[i]);
-}
-
-/**
- * blkg_rwstat_total - read the total count of a blkg_rwstat
- * @rwstat: blkg_rwstat to read
- *
- * Return the total count of @rwstat regardless of the IO direction. This
- * function can be called without synchronization and takes care of u64
- * atomicity.
- */
-static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
-{
- struct blkg_rwstat_sample tmp = { };
-
- blkg_rwstat_read(rwstat, &tmp);
- return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
-}
-
-/**
- * blkg_rwstat_reset - reset a blkg_rwstat
- * @rwstat: blkg_rwstat to reset
- */
-static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
-{
- int i;
-
- for (i = 0; i < BLKG_RWSTAT_NR; i++) {
- percpu_counter_set(&rwstat->cpu_cnt[i], 0);
- atomic64_set(&rwstat->aux_cnt[i], 0);
- }
-}
-
-/**
- * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count
- * @to: the destination blkg_rwstat
- * @from: the source
- *
- * Add @from's count including the aux one to @to's aux count.
- */
-static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
- struct blkg_rwstat *from)
-{
- u64 sum[BLKG_RWSTAT_NR];
- int i;
-
- for (i = 0; i < BLKG_RWSTAT_NR; i++)
- sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]);
-
- for (i = 0; i < BLKG_RWSTAT_NR; i++)
- atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]),
- &to->aux_cnt[i]);
-}
-
#ifdef CONFIG_BLK_DEV_THROTTLING
extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
struct bio *bio);
@@ -745,15 +602,33 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
throtl = blk_throtl_bio(q, blkg, bio);
if (!throtl) {
+ struct blkg_iostat_set *bis;
+ int rwd, cpu;
+
+ if (op_is_discard(bio->bi_opf))
+ rwd = BLKG_IOSTAT_DISCARD;
+ else if (op_is_write(bio->bi_opf))
+ rwd = BLKG_IOSTAT_WRITE;
+ else
+ rwd = BLKG_IOSTAT_READ;
+
+ cpu = get_cpu();
+ bis = per_cpu_ptr(blkg->iostat_cpu, cpu);
+ u64_stats_update_begin(&bis->sync);
+
/*
* If the bio is flagged with BIO_QUEUE_ENTERED it means this
* is a split bio and we would have already accounted for the
* size of the bio.
*/
if (!bio_flagged(bio, BIO_QUEUE_ENTERED))
- blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf,
- bio->bi_iter.bi_size);
- blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1);
+ bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
+ bis->cur.ios[rwd]++;
+
+ u64_stats_update_end(&bis->sync);
+ if (cgroup_subsys_on_dfl(io_cgrp_subsys))
+ cgroup_rstat_updated(blkg->blkcg->css.cgroup, cpu);
+ put_cpu();
}
blkcg_bio_issue_init(bio);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0bf056de5cc3..11cfd6470b1a 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -10,103 +10,239 @@ struct blk_mq_tags;
struct blk_flush_queue;
/**
- * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware block device
+ * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
+ * block device
*/
struct blk_mq_hw_ctx {
struct {
+ /** @lock: Protects the dispatch list. */
spinlock_t lock;
+ /**
+ * @dispatch: Used for requests that are ready to be
+ * dispatched to the hardware but for some reason (e.g. lack of
+ * resources) could not be sent to the hardware. As soon as the
+ * driver can send new requests, requests at this list will
+ * be sent first for a fairer dispatch.
+ */
struct list_head dispatch;
- unsigned long state; /* BLK_MQ_S_* flags */
+ /**
+ * @state: BLK_MQ_S_* flags. Defines the state of the hw
+ * queue (active, scheduled to restart, stopped).
+ */
+ unsigned long state;
} ____cacheline_aligned_in_smp;
+ /**
+ * @run_work: Used for scheduling a hardware queue run at a later time.
+ */
struct delayed_work run_work;
+ /** @cpumask: Map of available CPUs where this hctx can run. */
cpumask_var_t cpumask;
+ /**
+ * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
+ * selection from @cpumask.
+ */
int next_cpu;
+ /**
+ * @next_cpu_batch: Counter of how many works left in the batch before
+ * changing to the next CPU.
+ */
int next_cpu_batch;
- unsigned long flags; /* BLK_MQ_F_* flags */
+ /** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
+ unsigned long flags;
+ /**
+ * @sched_data: Pointer owned by the IO scheduler attached to a request
+ * queue. It's up to the IO scheduler how to use this pointer.
+ */
void *sched_data;
+ /**
+ * @queue: Pointer to the request queue that owns this hardware context.
+ */
struct request_queue *queue;
+ /** @fq: Queue of requests that need to perform a flush operation. */
struct blk_flush_queue *fq;
+ /**
+ * @driver_data: Pointer to data owned by the block driver that created
+ * this hctx
+ */
void *driver_data;
+ /**
+ * @ctx_map: Bitmap for each software queue. If bit is on, there is a
+ * pending request in that software queue.
+ */
struct sbitmap ctx_map;
+ /**
+ * @dispatch_from: Software queue to be used when no scheduler was
+ * selected.
+ */
struct blk_mq_ctx *dispatch_from;
+ /**
+ * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
+ * decide if the hw_queue is busy using Exponential Weighted Moving
+ * Average algorithm.
+ */
unsigned int dispatch_busy;
+ /** @type: HCTX_TYPE_* flags. Type of hardware queue. */
unsigned short type;
+ /** @nr_ctx: Number of software queues. */
unsigned short nr_ctx;
+ /** @ctxs: Array of software queues. */
struct blk_mq_ctx **ctxs;
+ /** @dispatch_wait_lock: Lock for dispatch_wait queue. */
spinlock_t dispatch_wait_lock;
+ /**
+ * @dispatch_wait: Waitqueue to put requests when there is no tag
+ * available at the moment, to wait for another try in the future.
+ */
wait_queue_entry_t dispatch_wait;
+
+ /**
+ * @wait_index: Index of next available dispatch_wait queue to insert
+ * requests.
+ */
atomic_t wait_index;
+ /**
+ * @tags: Tags owned by the block driver. A tag at this set is only
+ * assigned when a request is dispatched from a hardware queue.
+ */
struct blk_mq_tags *tags;
+ /**
+ * @sched_tags: Tags owned by I/O scheduler. If there is an I/O
+ * scheduler associated with a request queue, a tag is assigned when
+ * that request is allocated. Else, this member is not used.
+ */
struct blk_mq_tags *sched_tags;
+ /** @queued: Number of queued requests. */
unsigned long queued;
+ /** @run: Number of dispatched requests. */
unsigned long run;
#define BLK_MQ_MAX_DISPATCH_ORDER 7
+ /** @dispatched: Number of dispatch requests by queue. */
unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
+ /** @numa_node: NUMA node the storage adapter has been connected to. */
unsigned int numa_node;
+ /** @queue_num: Index of this hardware queue. */
unsigned int queue_num;
+ /**
+ * @nr_active: Number of active requests. Only used when a tag set is
+ * shared across request queues.
+ */
atomic_t nr_active;
+ /** @cpuhp_dead: List to store request if some CPU die. */
struct hlist_node cpuhp_dead;
+ /** @kobj: Kernel object for sysfs. */
struct kobject kobj;
+ /** @poll_considered: Count times blk_poll() was called. */
unsigned long poll_considered;
+ /** @poll_invoked: Count how many requests blk_poll() polled. */
unsigned long poll_invoked;
+ /** @poll_success: Count how many polled requests were completed. */
unsigned long poll_success;
#ifdef CONFIG_BLK_DEBUG_FS
+ /**
+ * @debugfs_dir: debugfs directory for this hardware queue. Named
+ * as cpu<cpu_number>.
+ */
struct dentry *debugfs_dir;
+ /** @sched_debugfs_dir: debugfs directory for the scheduler. */
struct dentry *sched_debugfs_dir;
#endif
+ /** @hctx_list: List of all hardware queues. */
struct list_head hctx_list;
- /* Must be the last member - see also blk_mq_hw_ctx_size(). */
+ /**
+ * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
+ * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
+ * blk_mq_hw_ctx_size().
+ */
struct srcu_struct srcu[0];
};
+/**
+ * struct blk_mq_queue_map - Map software queues to hardware queues
+ * @mq_map: CPU ID to hardware queue index map. This is an array
+ * with nr_cpu_ids elements. Each element has a value in the range
+ * [@queue_offset, @queue_offset + @nr_queues).
+ * @nr_queues: Number of hardware queues to map CPU IDs onto.
+ * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
+ * driver to map each hardware queue type (enum hctx_type) onto a distinct
+ * set of hardware queues.
+ */
struct blk_mq_queue_map {
unsigned int *mq_map;
unsigned int nr_queues;
unsigned int queue_offset;
};
+/**
+ * enum hctx_type - Type of hardware queue
+ * @HCTX_TYPE_DEFAULT: All I/O not otherwise accounted for.
+ * @HCTX_TYPE_READ: Just for READ I/O.
+ * @HCTX_TYPE_POLL: Polled I/O of any kind.
+ * @HCTX_MAX_TYPES: Number of types of hctx.
+ */
enum hctx_type {
- HCTX_TYPE_DEFAULT, /* all I/O not otherwise accounted for */
- HCTX_TYPE_READ, /* just for READ I/O */
- HCTX_TYPE_POLL, /* polled I/O of any kind */
+ HCTX_TYPE_DEFAULT,
+ HCTX_TYPE_READ,
+ HCTX_TYPE_POLL,
HCTX_MAX_TYPES,
};
+/**
+ * struct blk_mq_tag_set - tag set that can be shared between request queues
+ * @map: One or more ctx -> hctx mappings. One map exists for each
+ * hardware queue type (enum hctx_type) that the driver wishes
+ * to support. There are no restrictions on maps being of the
+ * same size, and it's perfectly legal to share maps between
+ * types.
+ * @nr_maps: Number of elements in the @map array. A number in the range
+ * [1, HCTX_MAX_TYPES].
+ * @ops: Pointers to functions that implement block driver behavior.
+ * @nr_hw_queues: Number of hardware queues supported by the block driver that
+ * owns this data structure.
+ * @queue_depth: Number of tags per hardware queue, reserved tags included.
+ * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
+ * allocations.
+ * @cmd_size: Number of additional bytes to allocate per request. The block
+ * driver owns these additional bytes.
+ * @numa_node: NUMA node the storage adapter has been connected to.
+ * @timeout: Request processing timeout in jiffies.
+ * @flags: Zero or more BLK_MQ_F_* flags.
+ * @driver_data: Pointer to data owned by the block driver that created this
+ * tag set.
+ * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues
+ * elements.
+ * @tag_list_lock: Serializes tag_list accesses.
+ * @tag_list: List of the request queues that use this tag set. See also
+ * request_queue.tag_set_list.
+ */
struct blk_mq_tag_set {
- /*
- * map[] holds ctx -> hctx mappings, one map exists for each type
- * that the driver wishes to support. There are no restrictions
- * on maps being of the same size, and it's perfectly legal to
- * share maps between types.
- */
struct blk_mq_queue_map map[HCTX_MAX_TYPES];
- unsigned int nr_maps; /* nr entries in map[] */
+ unsigned int nr_maps;
const struct blk_mq_ops *ops;
- unsigned int nr_hw_queues; /* nr hw queues across maps */
- unsigned int queue_depth; /* max hw supported */
+ unsigned int nr_hw_queues;
+ unsigned int queue_depth;
unsigned int reserved_tags;
- unsigned int cmd_size; /* per-request extra data */
+ unsigned int cmd_size;
int numa_node;
unsigned int timeout;
- unsigned int flags; /* BLK_MQ_F_* */
+ unsigned int flags;
void *driver_data;
struct blk_mq_tags **tags;
@@ -115,6 +251,12 @@ struct blk_mq_tag_set {
struct list_head tag_list;
};
+/**
+ * struct blk_mq_queue_data - Data about a request inserted in a queue
+ *
+ * @rq: Request pointer.
+ * @last: If it is the last request in the queue.
+ */
struct blk_mq_queue_data {
struct request *rq;
bool last;
@@ -142,81 +284,101 @@ typedef bool (busy_fn)(struct request_queue *);
typedef void (complete_fn)(struct request *);
typedef void (cleanup_rq_fn)(struct request *);
-
+/**
+ * struct blk_mq_ops - Callback functions that implements block driver
+ * behaviour.
+ */
struct blk_mq_ops {
- /*
- * Queue request
+ /**
+ * @queue_rq: Queue a new request from block IO.
*/
queue_rq_fn *queue_rq;
- /*
- * If a driver uses bd->last to judge when to submit requests to
- * hardware, it must define this function. In case of errors that
- * make us stop issuing further requests, this hook serves the
+ /**
+ * @commit_rqs: If a driver uses bd->last to judge when to submit
+ * requests to hardware, it must define this function. In case of errors
+ * that make us stop issuing further requests, this hook serves the
* purpose of kicking the hardware (which the last request otherwise
* would have done).
*/
commit_rqs_fn *commit_rqs;
- /*
- * Reserve budget before queue request, once .queue_rq is
+ /**
+ * @get_budget: Reserve budget before queue request, once .queue_rq is
* run, it is driver's responsibility to release the
* reserved budget. Also we have to handle failure case
* of .get_budget for avoiding I/O deadlock.
*/
get_budget_fn *get_budget;
+ /**
+ * @put_budget: Release the reserved budget.
+ */
put_budget_fn *put_budget;
- /*
- * Called on request timeout
+ /**
+ * @timeout: Called on request timeout.
*/
timeout_fn *timeout;
- /*
- * Called to poll for completion of a specific tag.
+ /**
+ * @poll: Called to poll for completion of a specific tag.
*/
poll_fn *poll;
+ /**
+ * @complete: Mark the request as complete.
+ */
complete_fn *complete;
- /*
- * Called when the block layer side of a hardware queue has been
- * set up, allowing the driver to allocate/init matching structures.
- * Ditto for exit/teardown.
+ /**
+ * @init_hctx: Called when the block layer side of a hardware queue has
+ * been set up, allowing the driver to allocate/init matching
+ * structures.
*/
init_hctx_fn *init_hctx;
+ /**
+ * @exit_hctx: Ditto for exit/teardown.
+ */
exit_hctx_fn *exit_hctx;
- /*
- * Called for every command allocated by the block layer to allow
- * the driver to set up driver specific data.
+ /**
+ * @init_request: Called for every command allocated by the block layer
+ * to allow the driver to set up driver specific data.
*
* Tag greater than or equal to queue_depth is for setting up
* flush request.
- *
- * Ditto for exit/teardown.
*/
init_request_fn *init_request;
+ /**
+ * @exit_request: Ditto for exit/teardown.
+ */
exit_request_fn *exit_request;
- /* Called from inside blk_get_request() */
+
+ /**
+ * @initialize_rq_fn: Called from inside blk_get_request().
+ */
void (*initialize_rq_fn)(struct request *rq);
- /*
- * Called before freeing one request which isn't completed yet,
- * and usually for freeing the driver private data
+ /**
+ * @cleanup_rq: Called before freeing one request which isn't completed
+ * yet, and usually for freeing the driver private data.
*/
cleanup_rq_fn *cleanup_rq;
- /*
- * If set, returns whether or not this queue currently is busy
+ /**
+ * @busy: If set, returns whether or not this queue currently is busy.
*/
busy_fn *busy;
+ /**
+ * @map_queues: This allows drivers specify their own queue mapping by
+ * overriding the setup-time function that builds the mq_map.
+ */
map_queues_fn *map_queues;
#ifdef CONFIG_BLK_DEBUG_FS
- /*
- * Used by the debugfs implementation to show driver-specific
+ /**
+ * @show_rq: Used by the debugfs implementation to show driver-specific
* information about a request.
*/
void (*show_rq)(struct seq_file *m, struct request *rq);
@@ -262,7 +424,6 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
void blk_mq_free_request(struct request *rq);
-bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
bool blk_mq_queue_inflight(struct request_queue *q);
@@ -301,9 +462,25 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
}
+/**
+ * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
+ * @rq: target request.
+ */
+static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
+{
+ return READ_ONCE(rq->state);
+}
+
+static inline int blk_mq_request_started(struct request *rq)
+{
+ return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
+}
+
+static inline int blk_mq_request_completed(struct request *rq)
+{
+ return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
+}
-int blk_mq_request_started(struct request *rq);
-int blk_mq_request_completed(struct request *rq);
void blk_mq_start_request(struct request *rq);
void blk_mq_end_request(struct request *rq, blk_status_t error);
void __blk_mq_end_request(struct request *rq, blk_status_t error);
@@ -324,7 +501,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
-bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
busy_tag_iter_fn *fn, void *priv);
@@ -343,14 +520,29 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q);
unsigned int blk_mq_rq_cpu(struct request *rq);
-/*
+/**
+ * blk_mq_rq_from_pdu - cast a PDU to a request
+ * @pdu: the PDU (Protocol Data Unit) to be casted
+ *
+ * Return: request
+ *
* Driver command data is immediately after the request. So subtract request
- * size to get back to the original request, add request size to get the PDU.
+ * size to get back to the original request.
*/
static inline struct request *blk_mq_rq_from_pdu(void *pdu)
{
return pdu - sizeof(struct request);
}
+
+/**
+ * blk_mq_rq_to_pdu - cast a request to a PDU
+ * @rq: the request to be casted
+ *
+ * Return: pointer to the PDU
+ *
+ * Driver command data is immediately after the request. So add request to get
+ * the PDU.
+ */
static inline void *blk_mq_rq_to_pdu(struct request *rq)
{
return rq + 1;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d688b96d1d63..70254ae11769 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -153,10 +153,10 @@ struct bio {
unsigned short bi_write_hint;
blk_status_t bi_status;
u8 bi_partno;
+ atomic_t __bi_remaining;
struct bvec_iter bi_iter;
- atomic_t __bi_remaining;
bio_end_io_t *bi_end_io;
void *bi_private;
@@ -290,6 +290,12 @@ enum req_opf {
REQ_OP_ZONE_RESET_ALL = 8,
/* write the zero filled sector many times */
REQ_OP_WRITE_ZEROES = 9,
+ /* Open a zone */
+ REQ_OP_ZONE_OPEN = 10,
+ /* Close a zone */
+ REQ_OP_ZONE_CLOSE = 11,
+ /* Transition a zone to full */
+ REQ_OP_ZONE_FINISH = 12,
/* SCSI passthrough using struct scsi_request */
REQ_OP_SCSI_IN = 32,
@@ -371,6 +377,7 @@ enum stat_group {
STAT_READ,
STAT_WRITE,
STAT_DISCARD,
+ STAT_FLUSH,
NR_STAT_GROUPS
};
@@ -417,6 +424,25 @@ static inline bool op_is_discard(unsigned int op)
return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
}
+/*
+ * Check if a bio or request operation is a zone management operation, with
+ * the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case
+ * due to its different handling in the block layer and device response in
+ * case of command failure.
+ */
+static inline bool op_is_zone_mgmt(enum req_opf op)
+{
+ switch (op & REQ_OP_MASK) {
+ case REQ_OP_ZONE_RESET:
+ case REQ_OP_ZONE_OPEN:
+ case REQ_OP_ZONE_CLOSE:
+ case REQ_OP_ZONE_FINISH:
+ return true;
+ default:
+ return false;
+ }
+}
+
static inline int op_stat_group(unsigned int op)
{
if (op_is_discard(op))
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f3ea78b0c91c..6a4f7abbdcf7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -360,14 +360,15 @@ extern unsigned int blkdev_nr_zones(struct block_device *bdev);
extern int blkdev_report_zones(struct block_device *bdev,
sector_t sector, struct blk_zone *zones,
unsigned int *nr_zones);
-extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors,
- sector_t nr_sectors, gfp_t gfp_mask);
+extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
+ sector_t sectors, sector_t nr_sectors,
+ gfp_t gfp_mask);
extern int blk_revalidate_disk_zones(struct gendisk *disk);
extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg);
-extern int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long arg);
+extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg);
#else /* CONFIG_BLK_DEV_ZONED */
@@ -388,9 +389,9 @@ static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
return -ENOTTY;
}
-static inline int blkdev_reset_zones_ioctl(struct block_device *bdev,
- fmode_t mode, unsigned int cmd,
- unsigned long arg)
+static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev,
+ fmode_t mode, unsigned int cmd,
+ unsigned long arg)
{
return -ENOTTY;
}
@@ -411,7 +412,6 @@ struct request_queue {
/* sw queues */
struct blk_mq_ctx __percpu *queue_ctx;
- unsigned int nr_queues;
unsigned int queue_depth;
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index a986ac12a848..e40d019c3d9d 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -216,15 +216,6 @@ int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
*/
bool sbitmap_any_bit_set(const struct sbitmap *sb);
-/**
- * sbitmap_any_bit_clear() - Check for an unset bit in a &struct
- * sbitmap.
- * @sb: Bitmap to check.
- *
- * Return: true if any bit in the bitmap is clear, false otherwise.
- */
-bool sbitmap_any_bit_clear(const struct sbitmap *sb);
-
#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift)
#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U))
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index 53c28d750a45..1ac0d712a9c3 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -42,6 +42,7 @@ static inline bool is_sed_ioctl(unsigned int cmd)
case IOC_OPAL_PSID_REVERT_TPR:
case IOC_OPAL_MBR_DONE:
case IOC_OPAL_WRITE_SHADOW_MBR:
+ case IOC_OPAL_GENERIC_TABLE_RW:
return true;
}
return false;
diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h
index b048694070e2..37342a13c9cb 100644
--- a/include/trace/events/wbt.h
+++ b/include/trace/events/wbt.h
@@ -33,7 +33,8 @@ TRACE_EVENT(wbt_stat,
),
TP_fast_assign(
- strncpy(__entry->name, dev_name(bdi->dev), 32);
+ strlcpy(__entry->name, dev_name(bdi->dev),
+ ARRAY_SIZE(__entry->name));
__entry->rmean = stat[0].mean;
__entry->rmin = stat[0].min;
__entry->rmax = stat[0].max;
@@ -67,7 +68,8 @@ TRACE_EVENT(wbt_lat,
),
TP_fast_assign(
- strncpy(__entry->name, dev_name(bdi->dev), 32);
+ strlcpy(__entry->name, dev_name(bdi->dev),
+ ARRAY_SIZE(__entry->name));
__entry->lat = div_u64(lat, 1000);
),
@@ -103,7 +105,8 @@ TRACE_EVENT(wbt_step,
),
TP_fast_assign(
- strncpy(__entry->name, dev_name(bdi->dev), 32);
+ strlcpy(__entry->name, dev_name(bdi->dev),
+ ARRAY_SIZE(__entry->name));
__entry->msg = msg;
__entry->step = step;
__entry->window = div_u64(window, 1000);
@@ -138,7 +141,8 @@ TRACE_EVENT(wbt_timer,
),
TP_fast_assign(
- strncpy(__entry->name, dev_name(bdi->dev), 32);
+ strlcpy(__entry->name, dev_name(bdi->dev),
+ ARRAY_SIZE(__entry->name));
__entry->status = status;
__entry->step = step;
__entry->inflight = inflight;
diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
index 498eec813494..0cdef67135f0 100644
--- a/include/uapi/linux/blkzoned.h
+++ b/include/uapi/linux/blkzoned.h
@@ -120,9 +120,11 @@ struct blk_zone_report {
};
/**
- * struct blk_zone_range - BLKRESETZONE ioctl request
- * @sector: starting sector of the first zone to issue reset write pointer
- * @nr_sectors: Total number of sectors of 1 or more zones to reset
+ * struct blk_zone_range - BLKRESETZONE/BLKOPENZONE/
+ * BLKCLOSEZONE/BLKFINISHZONE ioctl
+ * requests
+ * @sector: Starting sector of the first zone to operate on.
+ * @nr_sectors: Total number of sectors of all zones to operate on.
*/
struct blk_zone_range {
__u64 sector;
@@ -139,10 +141,19 @@ struct blk_zone_range {
* sector range. The sector range must be zone aligned.
* @BLKGETZONESZ: Get the device zone size in number of 512 B sectors.
* @BLKGETNRZONES: Get the total number of zones of the device.
+ * @BLKOPENZONE: Open the zones in the specified sector range.
+ * The 512 B sector range must be zone aligned.
+ * @BLKCLOSEZONE: Close the zones in the specified sector range.
+ * The 512 B sector range must be zone aligned.
+ * @BLKFINISHZONE: Mark the zones as full in the specified sector range.
+ * The 512 B sector range must be zone aligned.
*/
#define BLKREPORTZONE _IOWR(0x12, 130, struct blk_zone_report)
#define BLKRESETZONE _IOW(0x12, 131, struct blk_zone_range)
#define BLKGETZONESZ _IOR(0x12, 132, __u32)
#define BLKGETNRZONES _IOR(0x12, 133, __u32)
+#define BLKOPENZONE _IOW(0x12, 134, struct blk_zone_range)
+#define BLKCLOSEZONE _IOW(0x12, 135, struct blk_zone_range)
+#define BLKFINISHZONE _IOW(0x12, 136, struct blk_zone_range)
#endif /* _UAPI_BLKZONED_H */
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 1d338357df8a..1f97b33c840e 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -58,7 +58,7 @@
* Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
* used to clear any hints previously set.
*/
-#define RWF_WRITE_LIFE_NOT_SET 0
+#define RWH_WRITE_LIFE_NOT_SET 0
#define RWH_WRITE_LIFE_NONE 1
#define RWH_WRITE_LIFE_SHORT 2
#define RWH_WRITE_LIFE_MEDIUM 3
@@ -66,6 +66,13 @@
#define RWH_WRITE_LIFE_EXTREME 5
/*
+ * The originally introduced spelling is remained from the first
+ * versions of the patch set that introduced the feature, see commit
+ * v4.13-rc1~212^2~51.
+ */
+#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET
+
+/*
* Types of directory notifications that may be requested.
*/
#define DN_ACCESS 0x00000001 /* File accessed */
diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h
index c6d035fa1b6c..6f5af1a84213 100644
--- a/include/uapi/linux/sed-opal.h
+++ b/include/uapi/linux/sed-opal.h
@@ -113,6 +113,25 @@ struct opal_shadow_mbr {
__u64 size;
};
+/* Opal table operations */
+enum opal_table_ops {
+ OPAL_READ_TABLE,
+ OPAL_WRITE_TABLE,
+};
+
+#define OPAL_UID_LENGTH 8
+struct opal_read_write_table {
+ struct opal_key key;
+ const __u64 data;
+ const __u8 table_uid[OPAL_UID_LENGTH];
+ __u64 offset;
+ __u64 size;
+#define OPAL_TABLE_READ (1 << OPAL_READ_TABLE)
+#define OPAL_TABLE_WRITE (1 << OPAL_WRITE_TABLE)
+ __u64 flags;
+ __u64 priv;
+};
+
#define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock)
#define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock)
#define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key)
@@ -128,5 +147,6 @@ struct opal_shadow_mbr {
#define IOC_OPAL_PSID_REVERT_TPR _IOW('p', 232, struct opal_key)
#define IOC_OPAL_MBR_DONE _IOW('p', 233, struct opal_mbr_done)
#define IOC_OPAL_WRITE_SHADOW_MBR _IOW('p', 234, struct opal_shadow_mbr)
+#define IOC_OPAL_GENERIC_TABLE_RW _IOW('p', 235, struct opal_read_write_table)
#endif /* _UAPI_SED_OPAL_H */