diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-11-01 12:30:07 -1000 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-11-01 12:30:07 -1000 |
commit | 90d624af2e5a9945eedd5cafd6ae6d88f32cc977 (patch) | |
tree | e936a0cc8f2b613f327ab08280dccbad664703cf | |
parent | 4de520f1fcefd4ebb7dddcf28bde1b330c2f6b5d (diff) | |
parent | 0c696bb38f4cc0f0f90a8e06ae1eda21a9630cd0 (diff) | |
download | lwn-90d624af2e5a9945eedd5cafd6ae6d88f32cc977.tar.gz lwn-90d624af2e5a9945eedd5cafd6ae6d88f32cc977.zip |
Merge tag 'for-6.7/block-2023-10-30' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:
- Improvements to the queue_rqs() support, and adding null_blk support
for that as well (Chengming)
- Series improving badblocks support (Coly)
- Key store support for sed-opal (Greg)
- IBM partition string handling improvements (Jan)
- Make number of ublk devices supported configurable (Mike)
- Cancelation improvements for ublk (Ming)
- MD pull requests via Song:
- Handle timeout in md-cluster, by Denis Plotnikov
- Cleanup pers->prepare_suspend, by Yu Kuai
- Rewrite mddev_suspend(), by Yu Kuai
- Simplify md_seq_ops, by Yu Kuai
- Reduce unnecessary locking array_state_store(), by Mariusz
Tkaczyk
- Make rdev add/remove independent from daemon thread, by Yu Kuai
- Refactor code around quiesce() and mddev_suspend(), by Yu Kuai
- NVMe pull request via Keith:
- nvme-auth updates (Mark)
- nvme-tcp tls (Hannes)
- nvme-fc annotaions (Kees)
- Misc cleanups and improvements (Jiapeng, Joel)
* tag 'for-6.7/block-2023-10-30' of git://git.kernel.dk/linux: (95 commits)
block: ublk_drv: Remove unused function
md: cleanup pers->prepare_suspend()
nvme-auth: allow mixing of secret and hash lengths
nvme-auth: use transformed key size to create resp
nvme-auth: alloc nvme_dhchap_key as single buffer
nvmet-tcp: use 'spin_lock_bh' for state_lock()
powerpc/pseries: PLPKS SED Opal keystore support
block: sed-opal: keystore access for SED Opal keys
block:sed-opal: SED Opal keystore
ublk: simplify aborting request
ublk: replace monitor with cancelable uring_cmd
ublk: quiesce request queue when aborting queue
ublk: rename mm_lock as lock
ublk: move ublk_cancel_dev() out of ub->mutex
ublk: make sure io cmd handled in submitter task context
ublk: don't get ublk device reference in ublk_abort_queue()
ublk: Make ublks_max configurable
ublk: Limit dev_id/ub_number values
md-cluster: check for timeout while a new disk adding
nvme: rework NVME_AUTH Kconfig selection
...
57 files changed, 3610 insertions, 1226 deletions
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 4ebf2ef2845d..afc0f6a61337 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -164,6 +164,12 @@ config PSERIES_PLPKS # This option is selected by in-kernel consumers that require # access to the PKS. +config PSERIES_PLPKS_SED + depends on PPC_PSERIES + bool + # This option is selected by in-kernel consumers that require + # access to the SED PKS keystore. + config PAPR_SCM depends on PPC_PSERIES && MEMORY_HOTPLUG && LIBNVDIMM tristate "Support for the PAPR Storage Class Memory interface" diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index 53c3b91af2f7..1476c5e4433c 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_PPC_SVM) += svm.o obj-$(CONFIG_FA_DUMP) += rtas-fadump.o obj-$(CONFIG_PSERIES_PLPKS) += plpks.o obj-$(CONFIG_PPC_SECURE_BOOT) += plpks-secvar.o +obj-$(CONFIG_PSERIES_PLPKS_SED) += plpks_sed_ops.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PPC_VAS) += vas.o vas-sysfs.o diff --git a/arch/powerpc/platforms/pseries/plpks_sed_ops.c b/arch/powerpc/platforms/pseries/plpks_sed_ops.c new file mode 100644 index 000000000000..7c873c9589ef --- /dev/null +++ b/arch/powerpc/platforms/pseries/plpks_sed_ops.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * POWER Platform specific code for non-volatile SED key access + * Copyright (C) 2022 IBM Corporation + * + * Define operations for SED Opal to read/write keys + * from POWER LPAR Platform KeyStore(PLPKS). + * + * Self Encrypting Drives(SED) key storage using PLPKS + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/ioctl.h> +#include <linux/sed-opal-key.h> +#include <asm/plpks.h> + +static bool plpks_sed_initialized = false; +static bool plpks_sed_available = false; + +/* + * structure that contains all SED data + */ +struct plpks_sed_object_data { + u_char version; + u_char pad1[7]; + u_long authority; + u_long range; + u_int key_len; + u_char key[32]; +}; + +#define PLPKS_SED_OBJECT_DATA_V0 0 +#define PLPKS_SED_MANGLED_LABEL "/default/pri" +#define PLPKS_SED_COMPONENT "sed-opal" +#define PLPKS_SED_KEY "opal-boot-pin" + +/* + * authority is admin1 and range is global + */ +#define PLPKS_SED_AUTHORITY 0x0000000900010001 +#define PLPKS_SED_RANGE 0x0000080200000001 + +static void plpks_init_var(struct plpks_var *var, char *keyname) +{ + if (!plpks_sed_initialized) { + plpks_sed_initialized = true; + plpks_sed_available = plpks_is_available(); + if (!plpks_sed_available) + pr_err("SED: plpks not available\n"); + } + + var->name = keyname; + var->namelen = strlen(keyname); + if (strcmp(PLPKS_SED_KEY, keyname) == 0) { + var->name = PLPKS_SED_MANGLED_LABEL; + var->namelen = strlen(keyname); + } + var->policy = PLPKS_WORLDREADABLE; + var->os = PLPKS_VAR_COMMON; + var->data = NULL; + var->datalen = 0; + var->component = PLPKS_SED_COMPONENT; +} + +/* + * Read the SED Opal key from PLPKS given the label + */ +int sed_read_key(char *keyname, char *key, u_int *keylen) +{ + struct plpks_var var; + struct plpks_sed_object_data data; + int ret; + u_int len; + + plpks_init_var(&var, keyname); + + if (!plpks_sed_available) + return -EOPNOTSUPP; + + var.data = (u8 *)&data; + var.datalen = sizeof(data); + + ret = plpks_read_os_var(&var); + if (ret != 0) + return ret; + + len = min_t(u16, be32_to_cpu(data.key_len), var.datalen); + memcpy(key, data.key, len); + key[len] = '\0'; + *keylen = len; + + return 0; +} + +/* + * Write the SED Opal key to PLPKS given the label + */ +int sed_write_key(char *keyname, char *key, u_int keylen) +{ + struct plpks_var var; + struct plpks_sed_object_data data; + struct plpks_var_name vname; + + plpks_init_var(&var, keyname); + + if (!plpks_sed_available) + return -EOPNOTSUPP; + + var.datalen = sizeof(struct plpks_sed_object_data); + var.data = (u8 *)&data; + + /* initialize SED object */ + data.version = PLPKS_SED_OBJECT_DATA_V0; + data.authority = cpu_to_be64(PLPKS_SED_AUTHORITY); + data.range = cpu_to_be64(PLPKS_SED_RANGE); + memset(&data.pad1, '\0', sizeof(data.pad1)); + data.key_len = cpu_to_be32(keylen); + memcpy(data.key, (char *)key, keylen); + + /* + * Key update requires remove first. The return value + * is ignored since it's okay if the key doesn't exist. + */ + vname.namelen = var.namelen; + vname.name = var.name; + plpks_remove_var(var.component, var.os, vname); + + return plpks_write_var(var); +} diff --git a/block/Kconfig b/block/Kconfig index f1364d1c0d93..55ae2286a4de 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -186,6 +186,7 @@ config BLK_SED_OPAL bool "Logic for interfacing with Opal enabled SEDs" depends on KEYS select PSERIES_PLPKS if PPC_PSERIES + select PSERIES_PLPKS_SED if PPC_PSERIES help Builds Logic for interfacing with Opal enabled controllers. Enabling this option enables users to setup/unlock/lock diff --git a/block/badblocks.c b/block/badblocks.c index 3afb550c0f7b..fc92d4e18aa3 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -16,119 +16,830 @@ #include <linux/types.h> #include <linux/slab.h> -/** - * badblocks_check() - check a given range for bad sectors - * @bb: the badblocks structure that holds all badblock information - * @s: sector (start) at which to check for badblocks - * @sectors: number of sectors to check for badblocks - * @first_bad: pointer to store location of the first badblock - * @bad_sectors: pointer to store number of badblocks after @first_bad +/* + * The purpose of badblocks set/clear is to manage bad blocks ranges which are + * identified by LBA addresses. * - * We can record which blocks on each device are 'bad' and so just - * fail those blocks, or that stripe, rather than the whole device. - * Entries in the bad-block table are 64bits wide. This comprises: - * Length of bad-range, in sectors: 0-511 for lengths 1-512 - * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) - * A 'shift' can be set so that larger blocks are tracked and - * consequently larger devices can be covered. - * 'Acknowledged' flag - 1 bit. - the most significant bit. + * When the caller of badblocks_set() wants to set a range of bad blocks, the + * setting range can be acked or unacked. And the setting range may merge, + * overwrite, skip the overlapped already set range, depends on who they are + * overlapped or adjacent, and the acknowledgment type of the ranges. It can be + * more complicated when the setting range covers multiple already set bad block + * ranges, with restrictions of maximum length of each bad range and the bad + * table space limitation. * - * Locking of the bad-block table uses a seqlock so badblocks_check - * might need to retry if it is very unlucky. - * We will sometimes want to check for bad blocks in a bi_end_io function, - * so we use the write_seqlock_irq variant. + * It is difficult and unnecessary to take care of all the possible situations, + * for setting a large range of bad blocks, we can handle it by dividing the + * large range into smaller ones when encounter overlap, max range length or + * bad table full conditions. Every time only a smaller piece of the bad range + * is handled with a limited number of conditions how it is interacted with + * possible overlapped or adjacent already set bad block ranges. Then the hard + * complicated problem can be much simpler to handle in proper way. * - * When looking for a bad block we specify a range and want to - * know if any block in the range is bad. So we binary-search - * to the last range that starts at-or-before the given endpoint, - * (or "before the sector after the target range") - * then see if it ends after the given start. + * When setting a range of bad blocks to the bad table, the simplified situations + * to be considered are, (The already set bad blocks ranges are naming with + * prefix E, and the setting bad blocks range is naming with prefix S) * - * Return: - * 0: there are no known bad blocks in the range - * 1: there are known bad block which are all acknowledged - * -1: there are bad blocks which have not yet been acknowledged in metadata. - * plus the start/length of the first bad section we overlap. + * 1) A setting range is not overlapped or adjacent to any other already set bad + * block range. + * +--------+ + * | S | + * +--------+ + * +-------------+ +-------------+ + * | E1 | | E2 | + * +-------------+ +-------------+ + * For this situation if the bad blocks table is not full, just allocate a + * free slot from the bad blocks table to mark the setting range S. The + * result is, + * +-------------+ +--------+ +-------------+ + * | E1 | | S | | E2 | + * +-------------+ +--------+ +-------------+ + * 2) A setting range starts exactly at a start LBA of an already set bad blocks + * range. + * 2.1) The setting range size < already set range size + * +--------+ + * | S | + * +--------+ + * +-------------+ + * | E | + * +-------------+ + * 2.1.1) If S and E are both acked or unacked range, the setting range S can + * be merged into existing bad range E. The result is, + * +-------------+ + * | S | + * +-------------+ + * 2.1.2) If S is unacked setting and E is acked, the setting will be denied, and + * the result is, + * +-------------+ + * | E | + * +-------------+ + * 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E. + * An extra slot from the bad blocks table will be allocated for S, and head + * of E will move to end of the inserted range S. The result is, + * +--------+----+ + * | S | E | + * +--------+----+ + * 2.2) The setting range size == already set range size + * 2.2.1) If S and E are both acked or unacked range, the setting range S can + * be merged into existing bad range E. The result is, + * +-------------+ + * | S | + * +-------------+ + * 2.2.2) If S is unacked setting and E is acked, the setting will be denied, and + * the result is, + * +-------------+ + * | E | + * +-------------+ + * 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of + bad blocks range E. The result is, + * +-------------+ + * | S | + * +-------------+ + * 2.3) The setting range size > already set range size + * +-------------------+ + * | S | + * +-------------------+ + * +-------------+ + * | E | + * +-------------+ + * For such situation, the setting range S can be treated as two parts, the + * first part (S1) is as same size as the already set range E, the second + * part (S2) is the rest of setting range. + * +-------------+-----+ +-------------+ +-----+ + * | S1 | S2 | | S1 | | S2 | + * +-------------+-----+ ===> +-------------+ +-----+ + * +-------------+ +-------------+ + * | E | | E | + * +-------------+ +-------------+ + * Now we only focus on how to handle the setting range S1 and already set + * range E, which are already explained in 2.2), for the rest S2 it will be + * handled later in next loop. + * 3) A setting range starts before the start LBA of an already set bad blocks + * range. + * +-------------+ + * | S | + * +-------------+ + * +-------------+ + * | E | + * +-------------+ + * For this situation, the setting range S can be divided into two parts, the + * first (S1) ends at the start LBA of already set range E, the second part + * (S2) starts exactly at a start LBA of the already set range E. + * +----+---------+ +----+ +---------+ + * | S1 | S2 | | S1 | | S2 | + * +----+---------+ ===> +----+ +---------+ + * +-------------+ +-------------+ + * | E | | E | + * +-------------+ +-------------+ + * Now only the first part S1 should be handled in this loop, which is in + * similar condition as 1). The rest part S2 has exact same start LBA address + * of the already set range E, they will be handled in next loop in one of + * situations in 2). + * 4) A setting range starts after the start LBA of an already set bad blocks + * range. + * 4.1) If the setting range S exactly matches the tail part of already set bad + * blocks range E, like the following chart shows, + * +---------+ + * | S | + * +---------+ + * +-------------+ + * | E | + * +-------------+ + * 4.1.1) If range S and E have same acknowledge value (both acked or unacked), + * they will be merged into one, the result is, + * +-------------+ + * | S | + * +-------------+ + * 4.1.2) If range E is acked and the setting range S is unacked, the setting + * request of S will be rejected, the result is, + * +-------------+ + * | E | + * +-------------+ + * 4.1.3) If range E is unacked, and the setting range S is acked, then S may + * overwrite the overlapped range of E, the result is, + * +---+---------+ + * | E | S | + * +---+---------+ + * 4.2) If the setting range S stays in middle of an already set range E, like + * the following chart shows, + * +----+ + * | S | + * +----+ + * +--------------+ + * | E | + * +--------------+ + * 4.2.1) If range S and E have same acknowledge value (both acked or unacked), + * they will be merged into one, the result is, + * +--------------+ + * | S | + * +--------------+ + * 4.2.2) If range E is acked and the setting range S is unacked, the setting + * request of S will be rejected, the result is also, + * +--------------+ + * | E | + * +--------------+ + * 4.2.3) If range E is unacked, and the setting range S is acked, then S will + * inserted into middle of E and split previous range E into two parts (E1 + * and E2), the result is, + * +----+----+----+ + * | E1 | S | E2 | + * +----+----+----+ + * 4.3) If the setting bad blocks range S is overlapped with an already set bad + * blocks range E. The range S starts after the start LBA of range E, and + * ends after the end LBA of range E, as the following chart shows, + * +-------------------+ + * | S | + * +-------------------+ + * +-------------+ + * | E | + * +-------------+ + * For this situation the range S can be divided into two parts, the first + * part (S1) ends at end range E, and the second part (S2) has rest range of + * origin S. + * +---------+---------+ +---------+ +---------+ + * | S1 | S2 | | S1 | | S2 | + * +---------+---------+ ===> +---------+ +---------+ + * +-------------+ +-------------+ + * | E | | E | + * +-------------+ +-------------+ + * Now in this loop the setting range S1 and already set range E can be + * handled as the situations 4.1), the rest range S2 will be handled in next + * loop and ignored in this loop. + * 5) A setting bad blocks range S is adjacent to one or more already set bad + * blocks range(s), and they are all acked or unacked range. + * 5.1) Front merge: If the already set bad blocks range E is before setting + * range S and they are adjacent, + * +------+ + * | S | + * +------+ + * +-------+ + * | E | + * +-------+ + * 5.1.1) When total size of range S and E <= BB_MAX_LEN, and their acknowledge + * values are same, the setting range S can front merges into range E. The + * result is, + * +--------------+ + * | S | + * +--------------+ + * 5.1.2) Otherwise these two ranges cannot merge, just insert the setting + * range S right after already set range E into the bad blocks table. The + * result is, + * +--------+------+ + * | E | S | + * +--------+------+ + * 6) Special cases which above conditions cannot handle + * 6.1) Multiple already set ranges may merge into less ones in a full bad table + * +-------------------------------------------------------+ + * | S | + * +-------------------------------------------------------+ + * |<----- BB_MAX_LEN ----->| + * +-----+ +-----+ +-----+ + * | E1 | | E2 | | E3 | + * +-----+ +-----+ +-----+ + * In the above example, when the bad blocks table is full, inserting the + * first part of setting range S will fail because no more available slot + * can be allocated from bad blocks table. In this situation a proper + * setting method should be go though all the setting bad blocks range and + * look for chance to merge already set ranges into less ones. When there + * is available slot from bad blocks table, re-try again to handle more + * setting bad blocks ranges as many as possible. + * +------------------------+ + * | S3 | + * +------------------------+ + * |<----- BB_MAX_LEN ----->| + * +-----+-----+-----+---+-----+--+ + * | S1 | S2 | + * +-----+-----+-----+---+-----+--+ + * The above chart shows although the first part (S3) cannot be inserted due + * to no-space in bad blocks table, but the following E1, E2 and E3 ranges + * can be merged with rest part of S into less range S1 and S2. Now there is + * 1 free slot in bad blocks table. + * +------------------------+-----+-----+-----+---+-----+--+ + * | S3 | S1 | S2 | + * +------------------------+-----+-----+-----+---+-----+--+ + * Since the bad blocks table is not full anymore, re-try again for the + * origin setting range S. Now the setting range S3 can be inserted into the + * bad blocks table with previous freed slot from multiple ranges merge. + * 6.2) Front merge after overwrite + * In the following example, in bad blocks table, E1 is an acked bad blocks + * range and E2 is an unacked bad blocks range, therefore they are not able + * to merge into a larger range. The setting bad blocks range S is acked, + * therefore part of E2 can be overwritten by S. + * +--------+ + * | S | acknowledged + * +--------+ S: 1 + * +-------+-------------+ E1: 1 + * | E1 | E2 | E2: 0 + * +-------+-------------+ + * With previous simplified routines, after overwriting part of E2 with S, + * the bad blocks table should be (E3 is remaining part of E2 which is not + * overwritten by S), + * acknowledged + * +-------+--------+----+ S: 1 + * | E1 | S | E3 | E1: 1 + * +-------+--------+----+ E3: 0 + * The above result is correct but not perfect. Range E1 and S in the bad + * blocks table are all acked, merging them into a larger one range may + * occupy less bad blocks table space and make badblocks_check() faster. + * Therefore in such situation, after overwriting range S, the previous range + * E1 should be checked for possible front combination. Then the ideal + * result can be, + * +----------------+----+ acknowledged + * | E1 | E3 | E1: 1 + * +----------------+----+ E3: 0 + * 6.3) Behind merge: If the already set bad blocks range E is behind the setting + * range S and they are adjacent. Normally we don't need to care about this + * because front merge handles this while going though range S from head to + * tail, except for the tail part of range S. When the setting range S are + * fully handled, all the above simplified routine doesn't check whether the + * tail LBA of range S is adjacent to the next already set range and not + * merge them even it is possible. + * +------+ + * | S | + * +------+ + * +-------+ + * | E | + * +-------+ + * For the above special situation, when the setting range S are all handled + * and the loop ends, an extra check is necessary for whether next already + * set range E is right after S and mergeable. + * 6.3.1) When total size of range E and S <= BB_MAX_LEN, and their acknowledge + * values are same, the setting range S can behind merges into range E. The + * result is, + * +--------------+ + * | S | + * +--------------+ + * 6.3.2) Otherwise these two ranges cannot merge, just insert the setting range + * S in front of the already set range E in the bad blocks table. The result + * is, + * +------+-------+ + * | S | E | + * +------+-------+ + * + * All the above 5 simplified situations and 3 special cases may cover 99%+ of + * the bad block range setting conditions. Maybe there is some rare corner case + * is not considered and optimized, it won't hurt if badblocks_set() fails due + * to no space, or some ranges are not merged to save bad blocks table space. + * + * Inside badblocks_set() each loop starts by jumping to re_insert label, every + * time for the new loop prev_badblocks() is called to find an already set range + * which starts before or at current setting range. Since the setting bad blocks + * range is handled from head to tail, most of the cases it is unnecessary to do + * the binary search inside prev_badblocks(), it is possible to provide a hint + * to prev_badblocks() for a fast path, then the expensive binary search can be + * avoided. In my test with the hint to prev_badblocks(), except for the first + * loop, all rested calls to prev_badblocks() can go into the fast path and + * return correct bad blocks table index immediately. + * + * + * Clearing a bad blocks range from the bad block table has similar idea as + * setting does, but much more simpler. The only thing needs to be noticed is + * when the clearing range hits middle of a bad block range, the existing bad + * block range will split into two, and one more item should be added into the + * bad block table. The simplified situations to be considered are, (The already + * set bad blocks ranges in bad block table are naming with prefix E, and the + * clearing bad blocks range is naming with prefix C) + * + * 1) A clearing range is not overlapped to any already set ranges in bad block + * table. + * +-----+ | +-----+ | +-----+ + * | C | | | C | | | C | + * +-----+ or +-----+ or +-----+ + * +---+ | +----+ +----+ | +---+ + * | E | | | E1 | | E2 | | | E | + * +---+ | +----+ +----+ | +---+ + * For the above situations, no bad block to be cleared and no failure + * happens, simply returns 0. + * 2) The clearing range hits middle of an already setting bad blocks range in + * the bad block table. + * +---+ + * | C | + * +---+ + * +-----------------+ + * | E | + * +-----------------+ + * In this situation if the bad block table is not full, the range E will be + * split into two ranges E1 and E2. The result is, + * +------+ +------+ + * | E1 | | E2 | + * +------+ +------+ + * 3) The clearing range starts exactly at same LBA as an already set bad block range + * from the bad block table. + * 3.1) Partially covered at head part + * +------------+ + * | C | + * +------------+ + * +-----------------+ + * | E | + * +-----------------+ + * For this situation, the overlapped already set range will update the + * start LBA to end of C and shrink the range to BB_LEN(E) - BB_LEN(C). No + * item deleted from bad block table. The result is, + * +----+ + * | E1 | + * +----+ + * 3.2) Exact fully covered + * +-----------------+ + * | C | + * +-----------------+ + * +-----------------+ + * | E | + * +-----------------+ + * For this situation the whole bad blocks range E will be cleared and its + * corresponded item is deleted from the bad block table. + * 4) The clearing range exactly ends at same LBA as an already set bad block + * range. + * +-------+ + * | C | + * +-------+ + * +-----------------+ + * | E | + * +-----------------+ + * For the above situation, the already set range E is updated to shrink its + * end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C). + * The result is, + * +---------+ + * | E | + * +---------+ + * 5) The clearing range is partially overlapped with an already set bad block + * range from the bad block table. + * 5.1) The already set bad block range is front overlapped with the clearing + * range. + * +----------+ + * | C | + * +----------+ + * +------------+ + * | E | + * +------------+ + * For such situation, the clearing range C can be treated as two parts. The + * first part ends at the start LBA of range E, and the second part starts at + * same LBA of range E. + * +----+-----+ +----+ +-----+ + * | C1 | C2 | | C1 | | C2 | + * +----+-----+ ===> +----+ +-----+ + * +------------+ +------------+ + * | E | | E | + * +------------+ +------------+ + * Now the first part C1 can be handled as condition 1), and the second part C2 can be + * handled as condition 3.1) in next loop. + * 5.2) The already set bad block range is behind overlaopped with the clearing + * range. + * +----------+ + * | C | + * +----------+ + * +------------+ + * | E | + * +------------+ + * For such situation, the clearing range C can be treated as two parts. The + * first part C1 ends at same end LBA of range E, and the second part starts + * at end LBA of range E. + * +----+-----+ +----+ +-----+ + * | C1 | C2 | | C1 | | C2 | + * +----+-----+ ===> +----+ +-----+ + * +------------+ +------------+ + * | E | | E | + * +------------+ +------------+ + * Now the first part clearing range C1 can be handled as condition 4), and + * the second part clearing range C2 can be handled as condition 1) in next + * loop. + * + * All bad blocks range clearing can be simplified into the above 5 situations + * by only handling the head part of the clearing range in each run of the + * while-loop. The idea is similar to bad blocks range setting but much + * simpler. */ -int badblocks_check(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) + +/* + * Find the range starts at-or-before 's' from bad table. The search + * starts from index 'hint' and stops at index 'hint_end' from the bad + * table. + */ +static int prev_by_hint(struct badblocks *bb, sector_t s, int hint) { - int hi; - int lo; + int hint_end = hint + 2; u64 *p = bb->page; - int rv; - sector_t target = s + sectors; - unsigned seq; + int ret = -1; - if (bb->shift > 0) { - /* round the start down, and the end up */ - s >>= bb->shift; - target += (1<<bb->shift) - 1; - target >>= bb->shift; + while ((hint < hint_end) && ((hint + 1) <= bb->count) && + (BB_OFFSET(p[hint]) <= s)) { + if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) { + ret = hint; + break; + } + hint++; + } + + return ret; +} + +/* + * Find the range starts at-or-before bad->start. If 'hint' is provided + * (hint >= 0) then search in the bad table from hint firstly. It is + * very probably the wanted bad range can be found from the hint index, + * then the unnecessary while-loop iteration can be avoided. + */ +static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad, + int hint) +{ + sector_t s = bad->start; + int ret = -1; + int lo, hi; + u64 *p; + + if (!bb->count) + goto out; + + if (hint >= 0) { + ret = prev_by_hint(bb, s, hint); + if (ret >= 0) + goto out; } - /* 'target' is now the first block after the bad range */ -retry: - seq = read_seqbegin(&bb->lock); lo = 0; - rv = 0; hi = bb->count; + p = bb->page; - /* Binary search between lo and hi for 'target' - * i.e. for the last range that starts before 'target' - */ - /* INVARIANT: ranges before 'lo' and at-or-after 'hi' - * are known not to be the last range before target. - * VARIANT: hi-lo is the number of possible - * ranges, and decreases until it reaches 1 - */ + /* The following bisect search might be unnecessary */ + if (BB_OFFSET(p[lo]) > s) + return -1; + if (BB_OFFSET(p[hi - 1]) <= s) + return hi - 1; + + /* Do bisect search in bad table */ while (hi - lo > 1) { - int mid = (lo + hi) / 2; + int mid = (lo + hi)/2; sector_t a = BB_OFFSET(p[mid]); - if (a < target) - /* This could still be the one, earlier ranges - * could not. - */ + if (a == s) { + ret = mid; + goto out; + } + + if (a < s) lo = mid; else - /* This and later ranges are definitely out. */ hi = mid; } - /* 'lo' might be the last that started before target, but 'hi' isn't */ - if (hi > lo) { - /* need to check all range that end after 's' to see if - * any are unacknowledged. + + if (BB_OFFSET(p[lo]) <= s) + ret = lo; +out: + return ret; +} + +/* + * Return 'true' if the range indicated by 'bad' can be backward merged + * with the bad range (from the bad table) index by 'behind'. + */ +static bool can_merge_behind(struct badblocks *bb, + struct badblocks_context *bad, int behind) +{ + sector_t sectors = bad->len; + sector_t s = bad->start; + u64 *p = bb->page; + + if ((s < BB_OFFSET(p[behind])) && + ((s + sectors) >= BB_OFFSET(p[behind])) && + ((BB_END(p[behind]) - s) <= BB_MAX_LEN) && + BB_ACK(p[behind]) == bad->ack) + return true; + return false; +} + +/* + * Do backward merge for range indicated by 'bad' and the bad range + * (from the bad table) indexed by 'behind'. The return value is merged + * sectors from bad->len. + */ +static int behind_merge(struct badblocks *bb, struct badblocks_context *bad, + int behind) +{ + sector_t sectors = bad->len; + sector_t s = bad->start; + u64 *p = bb->page; + int merged = 0; + + WARN_ON(s >= BB_OFFSET(p[behind])); + WARN_ON((s + sectors) < BB_OFFSET(p[behind])); + + if (s < BB_OFFSET(p[behind])) { + merged = BB_OFFSET(p[behind]) - s; + p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack); + + WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN); + } + + return merged; +} + +/* + * Return 'true' if the range indicated by 'bad' can be forward + * merged with the bad range (from the bad table) indexed by 'prev'. + */ +static bool can_merge_front(struct badblocks *bb, int prev, + struct badblocks_context *bad) +{ + sector_t s = bad->start; + u64 *p = bb->page; + + if (BB_ACK(p[prev]) == bad->ack && + (s < BB_END(p[prev]) || + (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN)))) + return true; + return false; +} + +/* + * Do forward merge for range indicated by 'bad' and the bad range + * (from bad table) indexed by 'prev'. The return value is sectors + * merged from bad->len. + */ +static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad) +{ + sector_t sectors = bad->len; + sector_t s = bad->start; + u64 *p = bb->page; + int merged = 0; + + WARN_ON(s > BB_END(p[prev])); + + if (s < BB_END(p[prev])) { + merged = min_t(sector_t, sectors, BB_END(p[prev]) - s); + } else { + merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev])); + if ((prev + 1) < bb->count && + merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) { + merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]); + } + + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + BB_LEN(p[prev]) + merged, bad->ack); + } + + return merged; +} + +/* + * 'Combine' is a special case which can_merge_front() is not able to + * handle: If a bad range (indexed by 'prev' from bad table) exactly + * starts as bad->start, and the bad range ahead of 'prev' (indexed by + * 'prev - 1' from bad table) exactly ends at where 'prev' starts, and + * the sum of their lengths does not exceed BB_MAX_LEN limitation, then + * these two bad range (from bad table) can be combined. + * + * Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad + * table can be combined. + */ +static bool can_combine_front(struct badblocks *bb, int prev, + struct badblocks_context *bad) +{ + u64 *p = bb->page; + + if ((prev > 0) && + (BB_OFFSET(p[prev]) == bad->start) && + (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) && + (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) && + (BB_ACK(p[prev - 1]) == BB_ACK(p[prev]))) + return true; + return false; +} + +/* + * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad + * table) into one larger bad range, and the new range is indexed by + * 'prev - 1'. + * The caller of front_combine() will decrease bb->count, therefore + * it is unnecessary to clear p[perv] after front merge. + */ +static void front_combine(struct badblocks *bb, int prev) +{ + u64 *p = bb->page; + + p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]), + BB_LEN(p[prev - 1]) + BB_LEN(p[prev]), + BB_ACK(p[prev])); + if ((prev + 1) < bb->count) + memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8); +} + +/* + * Return 'true' if the range indicated by 'bad' is exactly forward + * overlapped with the bad range (from bad table) indexed by 'front'. + * Exactly forward overlap means the bad range (from bad table) indexed + * by 'prev' does not cover the whole range indicated by 'bad'. + */ +static bool overlap_front(struct badblocks *bb, int front, + struct badblocks_context *bad) +{ + u64 *p = bb->page; + + if (bad->start >= BB_OFFSET(p[front]) && + bad->start < BB_END(p[front])) + return true; + return false; +} + +/* + * Return 'true' if the range indicated by 'bad' is exactly backward + * overlapped with the bad range (from bad table) indexed by 'behind'. + */ +static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad, + int behind) +{ + u64 *p = bb->page; + + if (bad->start < BB_OFFSET(p[behind]) && + (bad->start + bad->len) > BB_OFFSET(p[behind])) + return true; + return false; +} + +/* + * Return 'true' if the range indicated by 'bad' can overwrite the bad + * range (from bad table) indexed by 'prev'. + * + * The range indicated by 'bad' can overwrite the bad range indexed by + * 'prev' when, + * 1) The whole range indicated by 'bad' can cover partial or whole bad + * range (from bad table) indexed by 'prev'. + * 2) The ack value of 'bad' is larger or equal to the ack value of bad + * range 'prev'. + * + * If the overwriting doesn't cover the whole bad range (from bad table) + * indexed by 'prev', new range might be split from existing bad range, + * 1) The overwrite covers head or tail part of existing bad range, 1 + * extra bad range will be split and added into the bad table. + * 2) The overwrite covers middle of existing bad range, 2 extra bad + * ranges will be split (ahead and after the overwritten range) and + * added into the bad table. + * The number of extra split ranges of the overwriting is stored in + * 'extra' and returned for the caller. + */ +static bool can_front_overwrite(struct badblocks *bb, int prev, + struct badblocks_context *bad, int *extra) +{ + u64 *p = bb->page; + int len; + + WARN_ON(!overlap_front(bb, prev, bad)); + + if (BB_ACK(p[prev]) >= bad->ack) + return false; + + if (BB_END(p[prev]) <= (bad->start + bad->len)) { + len = BB_END(p[prev]) - bad->start; + if (BB_OFFSET(p[prev]) == bad->start) + *extra = 0; + else + *extra = 1; + + bad->len = len; + } else { + if (BB_OFFSET(p[prev]) == bad->start) + *extra = 1; + else + /* + * prev range will be split into two, beside the overwritten + * one, an extra slot needed from bad table. */ - while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { - if (BB_OFFSET(p[lo]) < target) { - /* starts before the end, and finishes after - * the start, so they must overlap - */ - if (rv != -1 && BB_ACK(p[lo])) - rv = 1; - else - rv = -1; - *first_bad = BB_OFFSET(p[lo]); - *bad_sectors = BB_LEN(p[lo]); - } - lo--; + *extra = 2; + } + + if ((bb->count + (*extra)) >= MAX_BADBLOCKS) + return false; + + return true; +} + +/* + * Do the overwrite from the range indicated by 'bad' to the bad range + * (from bad table) indexed by 'prev'. + * The previously called can_front_overwrite() will provide how many + * extra bad range(s) might be split and added into the bad table. All + * the splitting cases in the bad table will be handled here. + */ +static int front_overwrite(struct badblocks *bb, int prev, + struct badblocks_context *bad, int extra) +{ + u64 *p = bb->page; + sector_t orig_end = BB_END(p[prev]); + int orig_ack = BB_ACK(p[prev]); + + switch (extra) { + case 0: + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]), + bad->ack); + break; + case 1: + if (BB_OFFSET(p[prev]) == bad->start) { + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + bad->len, bad->ack); + memmove(p + prev + 2, p + prev + 1, + (bb->count - prev - 1) * 8); + p[prev + 1] = BB_MAKE(bad->start + bad->len, + orig_end - BB_END(p[prev]), + orig_ack); + } else { + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + bad->start - BB_OFFSET(p[prev]), + orig_ack); + /* + * prev +2 -> prev + 1 + 1, which is for, + * 1) prev + 1: the slot index of the previous one + * 2) + 1: one more slot for extra being 1. + */ + memmove(p + prev + 2, p + prev + 1, + (bb->count - prev - 1) * 8); + p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); } + break; + case 2: + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + bad->start - BB_OFFSET(p[prev]), + orig_ack); + /* + * prev + 3 -> prev + 1 + 2, which is for, + * 1) prev + 1: the slot index of the previous one + * 2) + 2: two more slots for extra being 2. + */ + memmove(p + prev + 3, p + prev + 1, + (bb->count - prev - 1) * 8); + p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); + p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]), + orig_end - BB_END(p[prev + 1]), + orig_ack); + break; + default: + break; } - if (read_seqretry(&bb->lock, seq)) - goto retry; + return bad->len; +} - return rv; +/* + * Explicitly insert a range indicated by 'bad' to the bad table, where + * the location is indexed by 'at'. + */ +static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad) +{ + u64 *p = bb->page; + int len; + + WARN_ON(badblocks_full(bb)); + + len = min_t(sector_t, bad->len, BB_MAX_LEN); + if (at < bb->count) + memmove(p + at + 1, p + at, (bb->count - at) * 8); + p[at] = BB_MAKE(bad->start, len, bad->ack); + + return len; } -EXPORT_SYMBOL_GPL(badblocks_check); static void badblocks_update_acked(struct badblocks *bb) { + bool unacked = false; u64 *p = bb->page; int i; - bool unacked = false; if (!bb->unacked_exist) return; @@ -144,281 +855,600 @@ static void badblocks_update_acked(struct badblocks *bb) bb->unacked_exist = 0; } -/** - * badblocks_set() - Add a range of bad blocks to the table. - * @bb: the badblocks structure that holds all badblock information - * @s: first sector to mark as bad - * @sectors: number of sectors to mark as bad - * @acknowledged: weather to mark the bad sectors as acknowledged - * - * This might extend the table, or might contract it if two adjacent ranges - * can be merged. We binary-search to find the 'insertion' point, then - * decide how best to handle it. - * - * Return: - * 0: success - * 1: failed to set badblocks (out of space) - */ -int badblocks_set(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) +/* Do exact work to set bad block range into the bad block table */ +static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) { - u64 *p; - int lo, hi; - int rv = 0; + int retried = 0, space_desired = 0; + int orig_len, len = 0, added = 0; + struct badblocks_context bad; + int prev = -1, hint = -1; + sector_t orig_start; unsigned long flags; + int rv = 0; + u64 *p; if (bb->shift < 0) /* badblocks are disabled */ return 1; + if (sectors == 0) + /* Invalid sectors number */ + return 1; + if (bb->shift) { /* round the start down, and the end up */ sector_t next = s + sectors; - s >>= bb->shift; - next += (1<<bb->shift) - 1; - next >>= bb->shift; + rounddown(s, bb->shift); + roundup(next, bb->shift); sectors = next - s; } write_seqlock_irqsave(&bb->lock, flags); + orig_start = s; + orig_len = sectors; + bad.ack = acknowledged; p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts at-or-before 's' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a <= s) - lo = mid; - else - hi = mid; +re_insert: + bad.start = s; + bad.len = sectors; + len = 0; + + if (badblocks_empty(bb)) { + len = insert_at(bb, 0, &bad); + bb->count++; + added++; + goto update_sectors; } - if (hi > lo && BB_OFFSET(p[lo]) > s) - hi = lo; - if (hi > lo) { - /* we found a range that might merge with the start - * of our new range - */ - sector_t a = BB_OFFSET(p[lo]); - sector_t e = a + BB_LEN(p[lo]); - int ack = BB_ACK(p[lo]); - - if (e >= s) { - /* Yes, we can merge with a previous range */ - if (s == a && s + sectors >= e) - /* new range covers old */ - ack = acknowledged; - else - ack = ack && acknowledged; - - if (e < s + sectors) - e = s + sectors; - if (e - a <= BB_MAX_LEN) { - p[lo] = BB_MAKE(a, e-a, ack); - s = e; + prev = prev_badblocks(bb, &bad, hint); + + /* start before all badblocks */ + if (prev < 0) { + if (!badblocks_full(bb)) { + /* insert on the first */ + if (bad.len > (BB_OFFSET(p[0]) - bad.start)) + bad.len = BB_OFFSET(p[0]) - bad.start; + len = insert_at(bb, 0, &bad); + bb->count++; + added++; + hint = 0; + goto update_sectors; + } + + /* No sapce, try to merge */ + if (overlap_behind(bb, &bad, 0)) { + if (can_merge_behind(bb, &bad, 0)) { + len = behind_merge(bb, &bad, 0); + added++; } else { - /* does not all fit in one range, - * make p[lo] maximal - */ - if (BB_LEN(p[lo]) != BB_MAX_LEN) - p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; + len = BB_OFFSET(p[0]) - s; + space_desired = 1; } - sectors = e - s; + hint = 0; + goto update_sectors; } + + /* no table space and give up */ + goto out; } - if (sectors && hi < bb->count) { - /* 'hi' points to the first range that starts after 's'. - * Maybe we can merge with the start of that range - */ - sector_t a = BB_OFFSET(p[hi]); - sector_t e = a + BB_LEN(p[hi]); - int ack = BB_ACK(p[hi]); - - if (a <= s + sectors) { - /* merging is possible */ - if (e <= s + sectors) { - /* full overlap */ - e = s + sectors; - ack = acknowledged; - } else - ack = ack && acknowledged; - - a = s; - if (e - a <= BB_MAX_LEN) { - p[hi] = BB_MAKE(a, e-a, ack); - s = e; - } else { - p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; + + /* in case p[prev-1] can be merged with p[prev] */ + if (can_combine_front(bb, prev, &bad)) { + front_combine(bb, prev); + bb->count--; + added++; + hint = prev; + goto update_sectors; + } + + if (overlap_front(bb, prev, &bad)) { + if (can_merge_front(bb, prev, &bad)) { + len = front_merge(bb, prev, &bad); + added++; + } else { + int extra = 0; + + if (!can_front_overwrite(bb, prev, &bad, &extra)) { + len = min_t(sector_t, + BB_END(p[prev]) - s, sectors); + hint = prev; + goto update_sectors; + } + + len = front_overwrite(bb, prev, &bad, extra); + added++; + bb->count += extra; + + if (can_combine_front(bb, prev, &bad)) { + front_combine(bb, prev); + bb->count--; } - sectors = e - s; - lo = hi; - hi++; } + hint = prev; + goto update_sectors; + } + + if (can_merge_front(bb, prev, &bad)) { + len = front_merge(bb, prev, &bad); + added++; + hint = prev; + goto update_sectors; } - if (sectors == 0 && hi < bb->count) { - /* we might be able to combine lo and hi */ - /* Note: 's' is at the end of 'lo' */ - sector_t a = BB_OFFSET(p[hi]); - int lolen = BB_LEN(p[lo]); - int hilen = BB_LEN(p[hi]); - int newlen = lolen + hilen - (s - a); - - if (s >= a && newlen < BB_MAX_LEN) { - /* yes, we can combine them */ - int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); - - p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); - memmove(p + hi, p + hi + 1, - (bb->count - hi - 1) * 8); - bb->count--; + + /* if no space in table, still try to merge in the covered range */ + if (badblocks_full(bb)) { + /* skip the cannot-merge range */ + if (((prev + 1) < bb->count) && + overlap_behind(bb, &bad, prev + 1) && + ((s + sectors) >= BB_END(p[prev + 1]))) { + len = BB_END(p[prev + 1]) - s; + hint = prev + 1; + goto update_sectors; } + + /* no retry any more */ + len = sectors; + space_desired = 1; + hint = -1; + goto update_sectors; } - while (sectors) { - /* didn't merge (it all). - * Need to add a range just before 'hi' - */ - if (bb->count >= MAX_BADBLOCKS) { - /* No room for more */ - rv = 1; - break; - } else { - int this_sectors = sectors; - memmove(p + hi + 1, p + hi, - (bb->count - hi) * 8); - bb->count++; + /* cannot merge and there is space in bad table */ + if ((prev + 1) < bb->count && + overlap_behind(bb, &bad, prev + 1)) + bad.len = min_t(sector_t, + bad.len, BB_OFFSET(p[prev + 1]) - bad.start); - if (this_sectors > BB_MAX_LEN) - this_sectors = BB_MAX_LEN; - p[hi] = BB_MAKE(s, this_sectors, acknowledged); - sectors -= this_sectors; - s += this_sectors; - } + len = insert_at(bb, prev + 1, &bad); + bb->count++; + added++; + hint = prev + 1; + +update_sectors: + s += len; + sectors -= len; + + if (sectors > 0) + goto re_insert; + + WARN_ON(sectors < 0); + + /* + * Check whether the following already set range can be + * merged. (prev < 0) condition is not handled here, + * because it's already complicated enough. + */ + if (prev >= 0 && + (prev + 1) < bb->count && + BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) && + (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN && + BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) { + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + BB_LEN(p[prev]) + BB_LEN(p[prev + 1]), + BB_ACK(p[prev])); + + if ((prev + 2) < bb->count) + memmove(p + prev + 1, p + prev + 2, + (bb->count - (prev + 2)) * 8); + bb->count--; + } + + if (space_desired && !badblocks_full(bb)) { + s = orig_start; + sectors = orig_len; + space_desired = 0; + if (retried++ < 3) + goto re_insert; + } + +out: + if (added) { + set_changed(bb); + + if (!acknowledged) + bb->unacked_exist = 1; + else + badblocks_update_acked(bb); } - bb->changed = 1; - if (!acknowledged) - bb->unacked_exist = 1; - else - badblocks_update_acked(bb); write_sequnlock_irqrestore(&bb->lock, flags); + if (!added) + rv = 1; + return rv; } -EXPORT_SYMBOL_GPL(badblocks_set); -/** - * badblocks_clear() - Remove a range of bad blocks to the table. - * @bb: the badblocks structure that holds all badblock information - * @s: first sector to mark as bad - * @sectors: number of sectors to mark as bad - * - * This may involve extending the table if we spilt a region, - * but it must not fail. So if the table becomes full, we just - * drop the remove request. - * - * Return: - * 0: success - * 1: failed to clear badblocks +/* + * Clear the bad block range from bad block table which is front overlapped + * with the clearing range. The return value is how many sectors from an + * already set bad block range are cleared. If the whole bad block range is + * covered by the clearing range and fully cleared, 'delete' is set as 1 for + * the caller to reduce bb->count. */ -int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +static int front_clear(struct badblocks *bb, int prev, + struct badblocks_context *bad, int *deleted) { - u64 *p; - int lo, hi; - sector_t target = s + sectors; + sector_t sectors = bad->len; + sector_t s = bad->start; + u64 *p = bb->page; + int cleared = 0; + + *deleted = 0; + if (s == BB_OFFSET(p[prev])) { + if (BB_LEN(p[prev]) > sectors) { + p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors, + BB_LEN(p[prev]) - sectors, + BB_ACK(p[prev])); + cleared = sectors; + } else { + /* BB_LEN(p[prev]) <= sectors */ + cleared = BB_LEN(p[prev]); + if ((prev + 1) < bb->count) + memmove(p + prev, p + prev + 1, + (bb->count - prev - 1) * 8); + *deleted = 1; + } + } else if (s > BB_OFFSET(p[prev])) { + if (BB_END(p[prev]) <= (s + sectors)) { + cleared = BB_END(p[prev]) - s; + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + s - BB_OFFSET(p[prev]), + BB_ACK(p[prev])); + } else { + /* Splitting is handled in front_splitting_clear() */ + BUG(); + } + } + + return cleared; +} + +/* + * Handle the condition that the clearing range hits middle of an already set + * bad block range from bad block table. In this condition the existing bad + * block range is split into two after the middle part is cleared. + */ +static int front_splitting_clear(struct badblocks *bb, int prev, + struct badblocks_context *bad) +{ + u64 *p = bb->page; + u64 end = BB_END(p[prev]); + int ack = BB_ACK(p[prev]); + sector_t sectors = bad->len; + sector_t s = bad->start; + + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + s - BB_OFFSET(p[prev]), + ack); + memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8); + p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack); + return sectors; +} + +/* Do the exact work to clear bad block range from the bad block table */ +static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +{ + struct badblocks_context bad; + int prev = -1, hint = -1; + int len = 0, cleared = 0; int rv = 0; + u64 *p; + + if (bb->shift < 0) + /* badblocks are disabled */ + return 1; + + if (sectors == 0) + /* Invalid sectors number */ + return 1; + + if (bb->shift) { + sector_t target; - if (bb->shift > 0) { /* When clearing we round the start up and the end down. * This should not matter as the shift should align with * the block size and no rounding should ever be needed. * However it is better the think a block is bad when it * isn't than to think a block is not bad when it is. */ - s += (1<<bb->shift) - 1; - s >>= bb->shift; - target >>= bb->shift; + target = s + sectors; + roundup(s, bb->shift); + rounddown(target, bb->shift); + sectors = target - s; } write_seqlock_irq(&bb->lock); + bad.ack = true; p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts before 'target' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a < target) - lo = mid; - else - hi = mid; +re_clear: + bad.start = s; + bad.len = sectors; + + if (badblocks_empty(bb)) { + len = sectors; + cleared++; + goto update_sectors; } - if (hi > lo) { - /* p[lo] is the last range that could overlap the - * current range. Earlier ranges could also overlap, - * but only this one can overlap the end of the range. - */ - if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) && - (BB_OFFSET(p[lo]) < target)) { - /* Partial overlap, leave the tail of this range */ - int ack = BB_ACK(p[lo]); - sector_t a = BB_OFFSET(p[lo]); - sector_t end = a + BB_LEN(p[lo]); - - if (a < s) { - /* we need to split this range */ - if (bb->count >= MAX_BADBLOCKS) { - rv = -ENOSPC; - goto out; - } - memmove(p+lo+1, p+lo, (bb->count - lo) * 8); - bb->count++; - p[lo] = BB_MAKE(a, s-a, ack); - lo++; - } - p[lo] = BB_MAKE(target, end - target, ack); - /* there is no longer an overlap */ - hi = lo; - lo--; - } - while (lo >= 0 && - (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) && - (BB_OFFSET(p[lo]) < target)) { - /* This range does overlap */ - if (BB_OFFSET(p[lo]) < s) { - /* Keep the early parts of this range. */ - int ack = BB_ACK(p[lo]); - sector_t start = BB_OFFSET(p[lo]); - - p[lo] = BB_MAKE(start, s - start, ack); - /* now low doesn't overlap, so.. */ - break; - } - lo--; + + + prev = prev_badblocks(bb, &bad, hint); + + /* Start before all badblocks */ + if (prev < 0) { + if (overlap_behind(bb, &bad, 0)) { + len = BB_OFFSET(p[0]) - s; + hint = 0; + } else { + len = sectors; } - /* 'lo' is strictly before, 'hi' is strictly after, - * anything between needs to be discarded + /* + * Both situations are to clear non-bad range, + * should be treated as successful */ - if (hi - lo > 1) { - memmove(p+lo+1, p+hi, (bb->count - hi) * 8); - bb->count -= (hi - lo - 1); + cleared++; + goto update_sectors; + } + + /* Start after all badblocks */ + if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) { + len = sectors; + cleared++; + goto update_sectors; + } + + /* Clear will split a bad record but the table is full */ + if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) && + (BB_END(p[prev]) > (bad.start + sectors))) { + len = sectors; + goto update_sectors; + } + + if (overlap_front(bb, prev, &bad)) { + if ((BB_OFFSET(p[prev]) < bad.start) && + (BB_END(p[prev]) > (bad.start + bad.len))) { + /* Splitting */ + if ((bb->count + 1) < MAX_BADBLOCKS) { + len = front_splitting_clear(bb, prev, &bad); + bb->count += 1; + cleared++; + } else { + /* No space to split, give up */ + len = sectors; + } + } else { + int deleted = 0; + + len = front_clear(bb, prev, &bad, &deleted); + bb->count -= deleted; + cleared++; + hint = prev; } + + goto update_sectors; + } + + /* Not front overlap, but behind overlap */ + if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { + len = BB_OFFSET(p[prev + 1]) - bad.start; + hint = prev + 1; + /* Clear non-bad range should be treated as successful */ + cleared++; + goto update_sectors; + } + + /* Not cover any badblocks range in the table */ + len = sectors; + /* Clear non-bad range should be treated as successful */ + cleared++; + +update_sectors: + s += len; + sectors -= len; + + if (sectors > 0) + goto re_clear; + + WARN_ON(sectors < 0); + + if (cleared) { + badblocks_update_acked(bb); + set_changed(bb); } - badblocks_update_acked(bb); - bb->changed = 1; -out: write_sequnlock_irq(&bb->lock); + + if (!cleared) + rv = 1; + return rv; } + +/* Do the exact work to check bad blocks range from the bad block table */ +static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + int unacked_badblocks, acked_badblocks; + int prev = -1, hint = -1, set = 0; + struct badblocks_context bad; + unsigned int seq; + int len, rv; + u64 *p; + + WARN_ON(bb->shift < 0 || sectors == 0); + + if (bb->shift > 0) { + sector_t target; + + /* round the start down, and the end up */ + target = s + sectors; + rounddown(s, bb->shift); + roundup(target, bb->shift); + sectors = target - s; + } + +retry: + seq = read_seqbegin(&bb->lock); + + p = bb->page; + unacked_badblocks = 0; + acked_badblocks = 0; + +re_check: + bad.start = s; + bad.len = sectors; + + if (badblocks_empty(bb)) { + len = sectors; + goto update_sectors; + } + + prev = prev_badblocks(bb, &bad, hint); + + /* start after all badblocks */ + if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) { + len = sectors; + goto update_sectors; + } + + if (overlap_front(bb, prev, &bad)) { + if (BB_ACK(p[prev])) + acked_badblocks++; + else + unacked_badblocks++; + + if (BB_END(p[prev]) >= (s + sectors)) + len = sectors; + else + len = BB_END(p[prev]) - s; + + if (set == 0) { + *first_bad = BB_OFFSET(p[prev]); + *bad_sectors = BB_LEN(p[prev]); + set = 1; + } + goto update_sectors; + } + + /* Not front overlap, but behind overlap */ + if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { + len = BB_OFFSET(p[prev + 1]) - bad.start; + hint = prev + 1; + goto update_sectors; + } + + /* not cover any badblocks range in the table */ + len = sectors; + +update_sectors: + s += len; + sectors -= len; + + if (sectors > 0) + goto re_check; + + WARN_ON(sectors < 0); + + if (unacked_badblocks > 0) + rv = -1; + else if (acked_badblocks > 0) + rv = 1; + else + rv = 0; + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return rv; +} + +/** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information + * @s: sector (start) at which to check for badblocks + * @sectors: number of sectors to check for badblocks + * @first_bad: pointer to store location of the first badblock + * @bad_sectors: pointer to store number of badblocks after @first_bad + * + * We can record which blocks on each device are 'bad' and so just + * fail those blocks, or that stripe, rather than the whole device. + * Entries in the bad-block table are 64bits wide. This comprises: + * Length of bad-range, in sectors: 0-511 for lengths 1-512 + * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) + * A 'shift' can be set so that larger blocks are tracked and + * consequently larger devices can be covered. + * 'Acknowledged' flag - 1 bit. - the most significant bit. + * + * Locking of the bad-block table uses a seqlock so badblocks_check + * might need to retry if it is very unlucky. + * We will sometimes want to check for bad blocks in a bi_end_io function, + * so we use the write_seqlock_irq variant. + * + * When looking for a bad block we specify a range and want to + * know if any block in the range is bad. So we binary-search + * to the last range that starts at-or-before the given endpoint, + * (or "before the sector after the target range") + * then see if it ends after the given start. + * + * Return: + * 0: there are no known bad blocks in the range + * 1: there are known bad block which are all acknowledged + * -1: there are bad blocks which have not yet been acknowledged in metadata. + * plus the start/length of the first bad section we overlap. + */ +int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + return _badblocks_check(bb, s, sectors, first_bad, bad_sectors); +} +EXPORT_SYMBOL_GPL(badblocks_check); + +/** + * badblocks_set() - Add a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information + * @s: first sector to mark as bad + * @sectors: number of sectors to mark as bad + * @acknowledged: weather to mark the bad sectors as acknowledged + * + * This might extend the table, or might contract it if two adjacent ranges + * can be merged. We binary-search to find the 'insertion' point, then + * decide how best to handle it. + * + * Return: + * 0: success + * 1: failed to set badblocks (out of space) + */ +int badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) +{ + return _badblocks_set(bb, s, sectors, acknowledged); +} +EXPORT_SYMBOL_GPL(badblocks_set); + +/** + * badblocks_clear() - Remove a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information + * @s: first sector to mark as bad + * @sectors: number of sectors to mark as bad + * + * This may involve extending the table if we spilt a region, + * but it must not fail. So if the table becomes full, we just + * drop the remove request. + * + * Return: + * 0: success + * 1: failed to clear badblocks + */ +int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +{ + return _badblocks_clear(bb, s, sectors); +} EXPORT_SYMBOL_GPL(badblocks_clear); /** diff --git a/block/blk-flush.c b/block/blk-flush.c index e73dc22d05c1..3f4d41952ef2 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -323,16 +323,9 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->mq_ctx = first_rq->mq_ctx; flush_rq->mq_hctx = first_rq->mq_hctx; - if (!q->elevator) { + if (!q->elevator) flush_rq->tag = first_rq->tag; - - /* - * We borrow data request's driver tag, so have to mark - * this flush request as INFLIGHT for avoiding double - * account of this driver tag - */ - flush_rq->rq_flags |= RQF_MQ_INFLIGHT; - } else + else flush_rq->internal_tag = first_rq->internal_tag; flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index c3b5930106b2..5cbeb9344f2f 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -246,7 +246,6 @@ static const char *const rqf_name[] = { RQF_NAME(STARTED), RQF_NAME(FLUSH_SEQ), RQF_NAME(MIXED_MERGE), - RQF_NAME(MQ_INFLIGHT), RQF_NAME(DONTPREP), RQF_NAME(SCHED_TAGS), RQF_NAME(USE_SCHED), diff --git a/block/blk-mq.c b/block/blk-mq.c index 1fafd54dce3c..e2d11183f62e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -426,6 +426,8 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) rq_list_add(data->cached_rq, rq); nr++; } + if (!(data->rq_flags & RQF_SCHED_TAGS)) + blk_mq_add_active_requests(data->hctx, nr); /* caller already holds a reference, add for remainder */ percpu_ref_get_many(&data->q->q_usage_counter, nr - 1); data->nr_tags -= nr; @@ -510,6 +512,8 @@ retry: goto retry; } + if (!(data->rq_flags & RQF_SCHED_TAGS)) + blk_mq_inc_active_requests(data->hctx); rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag); blk_mq_rq_time_init(rq, alloc_time_ns); return rq; @@ -669,6 +673,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, tag = blk_mq_get_tag(&data); if (tag == BLK_MQ_NO_TAG) goto out_queue_exit; + if (!(data.rq_flags & RQF_SCHED_TAGS)) + blk_mq_inc_active_requests(data.hctx); rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag); blk_mq_rq_time_init(rq, alloc_time_ns); rq->__data_len = 0; @@ -708,11 +714,10 @@ static void __blk_mq_free_request(struct request *rq) blk_pm_mark_last_busy(rq); rq->mq_hctx = NULL; - if (rq->rq_flags & RQF_MQ_INFLIGHT) - __blk_mq_dec_active_requests(hctx); - - if (rq->tag != BLK_MQ_NO_TAG) + if (rq->tag != BLK_MQ_NO_TAG) { + blk_mq_dec_active_requests(hctx); blk_mq_put_tag(hctx->tags, ctx, rq->tag); + } if (sched_tag != BLK_MQ_NO_TAG) blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag); blk_mq_sched_restart(hctx); @@ -1061,12 +1066,7 @@ static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx, { struct request_queue *q = hctx->queue; - /* - * All requests should have been marked as RQF_MQ_INFLIGHT, so - * update hctx->nr_active in batch - */ - if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) - __blk_mq_sub_active_requests(hctx, nr_tags); + blk_mq_sub_active_requests(hctx, nr_tags); blk_mq_put_tags(hctx->tags, tag_array, nr_tags); percpu_ref_put_many(&q->q_usage_counter, nr_tags); @@ -1259,6 +1259,7 @@ void blk_mq_start_request(struct request *rq) blk_add_timer(rq); WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); + rq->mq_hctx->tags->rqs[rq->tag] = rq; #ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) @@ -1748,7 +1749,7 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, return data.rq; } -static bool __blk_mq_alloc_driver_tag(struct request *rq) +bool __blk_mq_alloc_driver_tag(struct request *rq) { struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; @@ -1769,20 +1770,7 @@ static bool __blk_mq_alloc_driver_tag(struct request *rq) return false; rq->tag = tag + tag_offset; - return true; -} - -bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq) -{ - if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq)) - return false; - - if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && - !(rq->rq_flags & RQF_MQ_INFLIGHT)) { - rq->rq_flags |= RQF_MQ_INFLIGHT; - __blk_mq_inc_active_requests(hctx); - } - hctx->tags->rqs[rq->tag] = rq; + blk_mq_inc_active_requests(rq->mq_hctx); return true; } @@ -2794,13 +2782,8 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) * If we do, we can dispatch the whole plug list in one go. We * already know at this point that all requests belong to the * same queue, caller must ensure that's the case. - * - * Since we pass off the full list to the driver at this point, - * we do not increment the active request count for the queue. - * Bypass shared tags for now because of that. */ - if (q->mq_ops->queue_rqs && - !(rq->mq_hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { + if (q->mq_ops->queue_rqs) { blk_mq_run_dispatch_ops(q, __blk_mq_flush_plug_list(q, plug)); if (rq_list_empty(plug->mq_list)) diff --git a/block/blk-mq.h b/block/blk-mq.h index 1743857e0b01..f75a9ecfebde 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -271,12 +271,18 @@ static inline int blk_mq_get_rq_budget_token(struct request *rq) return -1; } -static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) +static inline void __blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, + int val) { if (blk_mq_is_shared_tags(hctx->flags)) - atomic_inc(&hctx->queue->nr_active_requests_shared_tags); + atomic_add(val, &hctx->queue->nr_active_requests_shared_tags); else - atomic_inc(&hctx->nr_active); + atomic_add(val, &hctx->nr_active); +} + +static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) +{ + __blk_mq_add_active_requests(hctx, 1); } static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, @@ -293,6 +299,32 @@ static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) __blk_mq_sub_active_requests(hctx, 1); } +static inline void blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, + int val) +{ + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_add_active_requests(hctx, val); +} + +static inline void blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) +{ + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_inc_active_requests(hctx); +} + +static inline void blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, + int val) +{ + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_sub_active_requests(hctx, val); +} + +static inline void blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) +{ + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_dec_active_requests(hctx); +} + static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) { if (blk_mq_is_shared_tags(hctx->flags)) @@ -302,13 +334,9 @@ static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq) { + blk_mq_dec_active_requests(hctx); blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag); rq->tag = BLK_MQ_NO_TAG; - - if (rq->rq_flags & RQF_MQ_INFLIGHT) { - rq->rq_flags &= ~RQF_MQ_INFLIGHT; - __blk_mq_dec_active_requests(hctx); - } } static inline void blk_mq_put_driver_tag(struct request *rq) @@ -319,19 +347,14 @@ static inline void blk_mq_put_driver_tag(struct request *rq) __blk_mq_put_driver_tag(rq->mq_hctx, rq); } -bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq); +bool __blk_mq_alloc_driver_tag(struct request *rq); static inline bool blk_mq_get_driver_tag(struct request *rq) { - struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - - if (rq->tag != BLK_MQ_NO_TAG && - !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { - hctx->tags->rqs[rq->tag] = rq; - return true; - } + if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq)) + return false; - return __blk_mq_get_driver_tag(hctx, rq); + return true; } static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 403756dbd50d..82d9c4c3fb41 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -61,6 +61,47 @@ static sector_t cchhb2blk(struct vtoc_cchhb *ptr, struct hd_geometry *geo) ptr->b; } +/* Volume Label Type/ID Length */ +#define DASD_VOL_TYPE_LEN 4 +#define DASD_VOL_ID_LEN 6 + +/* Volume Label Types */ +#define DASD_VOLLBL_TYPE_VOL1 0 +#define DASD_VOLLBL_TYPE_LNX1 1 +#define DASD_VOLLBL_TYPE_CMS1 2 + +struct dasd_vollabel { + char *type; + int idx; +}; + +static struct dasd_vollabel dasd_vollabels[] = { + [DASD_VOLLBL_TYPE_VOL1] = { + .type = "VOL1", + .idx = DASD_VOLLBL_TYPE_VOL1, + }, + [DASD_VOLLBL_TYPE_LNX1] = { + .type = "LNX1", + .idx = DASD_VOLLBL_TYPE_LNX1, + }, + [DASD_VOLLBL_TYPE_CMS1] = { + .type = "CMS1", + .idx = DASD_VOLLBL_TYPE_CMS1, + }, +}; + +static int get_label_by_type(const char *type) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(dasd_vollabels); i++) { + if (!memcmp(type, dasd_vollabels[i].type, DASD_VOL_TYPE_LEN)) + return dasd_vollabels[i].idx; + } + + return -1; +} + static int find_label(struct parsed_partitions *state, dasd_information2_t *info, struct hd_geometry *geo, @@ -70,12 +111,10 @@ static int find_label(struct parsed_partitions *state, char type[], union label_t *label) { - Sector sect; - unsigned char *data; sector_t testsect[3]; - unsigned char temp[5]; - int found = 0; int i, testcount; + Sector sect; + void *data; /* There a three places where we may find a valid label: * - on an ECKD disk it's block 2 @@ -103,31 +142,27 @@ static int find_label(struct parsed_partitions *state, if (data == NULL) continue; memcpy(label, data, sizeof(*label)); - memcpy(temp, data, 4); - temp[4] = 0; - EBCASC(temp, 4); + memcpy(type, data, DASD_VOL_TYPE_LEN); + EBCASC(type, DASD_VOL_TYPE_LEN); put_dev_sector(sect); - if (!strcmp(temp, "VOL1") || - !strcmp(temp, "LNX1") || - !strcmp(temp, "CMS1")) { - if (!strcmp(temp, "VOL1")) { - strncpy(type, label->vol.vollbl, 4); - strncpy(name, label->vol.volid, 6); - } else { - strncpy(type, label->lnx.vollbl, 4); - strncpy(name, label->lnx.volid, 6); - } - EBCASC(type, 4); - EBCASC(name, 6); + switch (get_label_by_type(type)) { + case DASD_VOLLBL_TYPE_VOL1: + memcpy(name, label->vol.volid, DASD_VOL_ID_LEN); + EBCASC(name, DASD_VOL_ID_LEN); + *labelsect = testsect[i]; + return 1; + case DASD_VOLLBL_TYPE_LNX1: + case DASD_VOLLBL_TYPE_CMS1: + memcpy(name, label->lnx.volid, DASD_VOL_ID_LEN); + EBCASC(name, DASD_VOL_ID_LEN); *labelsect = testsect[i]; - found = 1; + return 1; + default: break; } } - if (!found) - memset(label, 0, sizeof(*label)); - return found; + return 0; } static int find_vol1_partitions(struct parsed_partitions *state, @@ -297,8 +332,8 @@ int ibm_partition(struct parsed_partitions *state) sector_t nr_sectors; dasd_information2_t *info; struct hd_geometry *geo; - char type[5] = {0,}; - char name[7] = {0,}; + char type[DASD_VOL_TYPE_LEN + 1] = ""; + char name[DASD_VOL_ID_LEN + 1] = ""; sector_t labelsect; union label_t *label; @@ -330,18 +365,21 @@ int ibm_partition(struct parsed_partitions *state) info = NULL; } - if (find_label(state, info, geo, blocksize, &labelsect, name, type, - label)) { - if (!strncmp(type, "VOL1", 4)) { + if (find_label(state, info, geo, blocksize, &labelsect, name, type, label)) { + switch (get_label_by_type(type)) { + case DASD_VOLLBL_TYPE_VOL1: res = find_vol1_partitions(state, geo, blocksize, name, label); - } else if (!strncmp(type, "LNX1", 4)) { + break; + case DASD_VOLLBL_TYPE_LNX1: res = find_lnx1_partitions(state, geo, blocksize, name, label, labelsect, nr_sectors, info); - } else if (!strncmp(type, "CMS1", 4)) { + break; + case DASD_VOLLBL_TYPE_CMS1: res = find_cms1_partitions(state, geo, blocksize, name, label, labelsect); + break; } } else if (info) { /* diff --git a/block/sed-opal.c b/block/sed-opal.c index 04f38a3f5d95..3d9e9cd250bd 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -18,6 +18,7 @@ #include <linux/uaccess.h> #include <uapi/linux/sed-opal.h> #include <linux/sed-opal.h> +#include <linux/sed-opal-key.h> #include <linux/string.h> #include <linux/kdev_t.h> #include <linux/key.h> @@ -3018,7 +3019,13 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) if (ret) return ret; - /* update keyring with new password */ + /* update keyring and key store with new password */ + ret = sed_write_key(OPAL_AUTH_KEY, + opal_pw->new_user_pw.opal_key.key, + opal_pw->new_user_pw.opal_key.key_len); + if (ret != -EOPNOTSUPP) + pr_warn("error updating SED key: %d\n", ret); + ret = update_sed_opal_key(OPAL_AUTH_KEY, opal_pw->new_user_pw.opal_key.key, opal_pw->new_user_pw.opal_key.key_len); @@ -3291,6 +3298,8 @@ EXPORT_SYMBOL_GPL(sed_ioctl); static int __init sed_opal_init(void) { struct key *kr; + char init_sed_key[OPAL_KEY_MAX]; + int keylen = OPAL_KEY_MAX - 1; kr = keyring_alloc(".sed_opal", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), @@ -3303,6 +3312,11 @@ static int __init sed_opal_init(void) sed_opal_keyring = kr; - return 0; + if (sed_read_key(OPAL_AUTH_KEY, init_sed_key, &keylen) < 0) { + memset(init_sed_key, '\0', sizeof(init_sed_key)); + keylen = OPAL_KEY_MAX - 1; + } + + return update_sed_opal_key(OPAL_AUTH_KEY, init_sed_key, keylen); } late_initcall(sed_opal_init); diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c index 63773a90581d..c51ea95bc2ce 100644 --- a/drivers/block/aoe/aoenet.c +++ b/drivers/block/aoe/aoenet.c @@ -39,8 +39,7 @@ static struct ktstate kts; #ifndef MODULE static int __init aoe_iflist_setup(char *str) { - strncpy(aoe_iflist, str, IFLISTSZ); - aoe_iflist[IFLISTSZ - 1] = '\0'; + strscpy(aoe_iflist, str, IFLISTSZ); return 1; } diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 968090935eb2..22a3cf7f32e2 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1750,6 +1750,25 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, return null_handle_cmd(cmd, sector, nr_sectors, req_op(rq)); } +static void null_queue_rqs(struct request **rqlist) +{ + struct request *requeue_list = NULL; + struct request **requeue_lastp = &requeue_list; + struct blk_mq_queue_data bd = { }; + blk_status_t ret; + + do { + struct request *rq = rq_list_pop(rqlist); + + bd.rq = rq; + ret = null_queue_rq(rq->mq_hctx, &bd); + if (ret != BLK_STS_OK) + rq_list_add_tail(&requeue_lastp, rq); + } while (!rq_list_empty(*rqlist)); + + *rqlist = requeue_list; +} + static void cleanup_queue(struct nullb_queue *nq) { bitmap_free(nq->tag_map); @@ -1802,6 +1821,7 @@ static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, static const struct blk_mq_ops null_mq_ops = { .queue_rq = null_queue_rq, + .queue_rqs = null_queue_rqs, .complete = null_complete_rq, .timeout = null_timeout_rq, .poll = null_poll, @@ -1946,7 +1966,7 @@ static int null_gendisk_register(struct nullb *nullb) else disk->fops = &null_bio_ops; disk->private_data = nullb; - strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); + strscpy_pad(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); if (nullb->dev->zoned) { int ret = null_register_zoned_dev(nullb); diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 630ddfe6657b..83600b45e12a 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -75,6 +75,7 @@ struct ublk_rq_data { struct ublk_uring_cmd_pdu { struct ublk_queue *ubq; + u16 tag; }; /* @@ -115,6 +116,9 @@ struct ublk_uring_cmd_pdu { */ #define UBLK_IO_FLAG_NEED_GET_DATA 0x08 +/* atomic RW with ubq->cancel_lock */ +#define UBLK_IO_FLAG_CANCELED 0x80000000 + struct ublk_io { /* userspace buffer address from io cmd */ __u64 addr; @@ -138,13 +142,13 @@ struct ublk_queue { unsigned int max_io_sz; bool force_abort; bool timeout; + bool canceling; unsigned short nr_io_ready; /* how many ios setup */ + spinlock_t cancel_lock; struct ublk_device *dev; struct ublk_io ios[]; }; -#define UBLK_DAEMON_MONITOR_PERIOD (5 * HZ) - struct ublk_device { struct gendisk *ub_disk; @@ -166,7 +170,7 @@ struct ublk_device { struct mutex mutex; - spinlock_t mm_lock; + spinlock_t lock; struct mm_struct *mm; struct ublk_params params; @@ -175,11 +179,6 @@ struct ublk_device { unsigned int nr_queues_ready; unsigned int nr_privileged_daemon; - /* - * Our ubq->daemon may be killed without any notification, so - * monitor each queue's daemon periodically - */ - struct delayed_work monitor_work; struct work_struct quiesce_work; struct work_struct stop_work; }; @@ -190,10 +189,11 @@ struct ublk_params_header { __u32 types; }; +static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq); + static inline unsigned int ublk_req_build_flags(struct request *req); static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, int tag); - static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) { return ub->dev_info.flags & UBLK_F_USER_COPY; @@ -470,6 +470,7 @@ static DEFINE_MUTEX(ublk_ctl_mutex); * It can be extended to one per-user limit in future or even controlled * by cgroup. */ +#define UBLK_MAX_UBLKS UBLK_MINORS static unsigned int ublks_max = 64; static unsigned int ublks_added; /* protected by ublk_ctl_mutex */ @@ -1083,13 +1084,10 @@ static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, { WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); - if (!(io->flags & UBLK_IO_FLAG_ABORTED)) { - io->flags |= UBLK_IO_FLAG_ABORTED; - if (ublk_queue_can_use_recovery_reissue(ubq)) - blk_mq_requeue_request(req, false); - else - ublk_put_req_ref(ubq, req); - } + if (ublk_queue_can_use_recovery_reissue(ubq)) + blk_mq_requeue_request(req, false); + else + ublk_put_req_ref(ubq, req); } static void ubq_complete_io_cmd(struct ublk_io *io, int res, @@ -1118,8 +1116,6 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq, blk_mq_requeue_request(rq, false); else blk_mq_end_request(rq, BLK_STS_IOERR); - - mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0); } static inline void __ublk_rq_task_work(struct request *req, @@ -1212,15 +1208,6 @@ static inline void ublk_forward_io_cmds(struct ublk_queue *ubq, __ublk_rq_task_work(blk_mq_rq_from_pdu(data), issue_flags); } -static inline void ublk_abort_io_cmds(struct ublk_queue *ubq) -{ - struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); - struct ublk_rq_data *data, *tmp; - - llist_for_each_entry_safe(data, tmp, io_cmds, node) - __ublk_abort_rq(ubq, blk_mq_rq_from_pdu(data)); -} - static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags) { struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); @@ -1232,38 +1219,19 @@ static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags) static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) { struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); - struct ublk_io *io; - if (!llist_add(&data->node, &ubq->io_cmds)) - return; + if (llist_add(&data->node, &ubq->io_cmds)) { + struct ublk_io *io = &ubq->ios[rq->tag]; - io = &ubq->ios[rq->tag]; - /* - * If the check pass, we know that this is a re-issued request aborted - * previously in monitor_work because the ubq_daemon(cmd's task) is - * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore - * because this ioucmd's io_uring context may be freed now if no inflight - * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work. - * - * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing - * the tag). Then the request is re-started(allocating the tag) and we are here. - * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED - * guarantees that here is a re-issued request aborted previously. - */ - if (unlikely(io->flags & UBLK_IO_FLAG_ABORTED)) { - ublk_abort_io_cmds(ubq); - } else { - struct io_uring_cmd *cmd = io->cmd; - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - - pdu->ubq = ubq; - io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); + io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb); } } static enum blk_eh_timer_return ublk_timeout(struct request *rq) { struct ublk_queue *ubq = rq->mq_hctx->driver_data; + unsigned int nr_inflight = 0; + int i; if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) { if (!ubq->timeout) { @@ -1274,6 +1242,29 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq) return BLK_EH_DONE; } + if (!ubq_daemon_is_dying(ubq)) + return BLK_EH_RESET_TIMER; + + for (i = 0; i < ubq->q_depth; i++) { + struct ublk_io *io = &ubq->ios[i]; + + if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) + nr_inflight++; + } + + /* cancelable uring_cmd can't help us if all commands are in-flight */ + if (nr_inflight == ubq->q_depth) { + struct ublk_device *ub = ubq->dev; + + if (ublk_abort_requests(ub, ubq)) { + if (ublk_can_use_recovery(ub)) + schedule_work(&ub->quiesce_work); + else + schedule_work(&ub->stop_work); + } + return BLK_EH_DONE; + } + return BLK_EH_RESET_TIMER; } @@ -1301,13 +1292,12 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort)) return BLK_STS_IOERR; - blk_mq_start_request(bd->rq); - - if (unlikely(ubq_daemon_is_dying(ubq))) { + if (unlikely(ubq->canceling)) { __ublk_abort_rq(ubq, rq); return BLK_STS_OK; } + blk_mq_start_request(bd->rq); ublk_queue_cmd(ubq, rq); return BLK_STS_OK; @@ -1357,12 +1347,12 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT; int q_id, ret = 0; - spin_lock(&ub->mm_lock); + spin_lock(&ub->lock); if (!ub->mm) ub->mm = current->mm; if (current->mm != ub->mm) ret = -EINVAL; - spin_unlock(&ub->mm_lock); + spin_unlock(&ub->lock); if (ret) return ret; @@ -1411,17 +1401,14 @@ static void ublk_commit_completion(struct ublk_device *ub, } /* - * When ->ubq_daemon is exiting, either new request is ended immediately, - * or any queued io command is drained, so it is safe to abort queue - * lockless + * Called from ubq_daemon context via cancel fn, meantime quiesce ublk + * blk-mq queue, so we are called exclusively with blk-mq and ubq_daemon + * context, so everything is serialized. */ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) { int i; - if (!ublk_get_device(ub)) - return; - for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i]; @@ -1433,72 +1420,114 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) * will do it */ rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i); - if (rq) + if (rq && blk_mq_request_started(rq)) { + io->flags |= UBLK_IO_FLAG_ABORTED; __ublk_fail_req(ubq, io, rq); + } } } - ublk_put_device(ub); } -static void ublk_daemon_monitor_work(struct work_struct *work) +static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq) { - struct ublk_device *ub = - container_of(work, struct ublk_device, monitor_work.work); - int i; + struct gendisk *disk; - for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { - struct ublk_queue *ubq = ublk_get_queue(ub, i); + spin_lock(&ubq->cancel_lock); + if (ubq->canceling) { + spin_unlock(&ubq->cancel_lock); + return false; + } + ubq->canceling = true; + spin_unlock(&ubq->cancel_lock); - if (ubq_daemon_is_dying(ubq)) { - if (ublk_queue_can_use_recovery(ubq)) - schedule_work(&ub->quiesce_work); - else - schedule_work(&ub->stop_work); + spin_lock(&ub->lock); + disk = ub->ub_disk; + if (disk) + get_device(disk_to_dev(disk)); + spin_unlock(&ub->lock); - /* abort queue is for making forward progress */ - ublk_abort_queue(ub, ubq); - } - } + /* Our disk has been dead */ + if (!disk) + return false; - /* - * We can't schedule monitor work after ub's state is not UBLK_S_DEV_LIVE. - * after ublk_remove() or __ublk_quiesce_dev() is started. - * - * No need ub->mutex, monitor work are canceled after state is marked - * as not LIVE, so new state is observed reliably. - */ - if (ub->dev_info.state == UBLK_S_DEV_LIVE) - schedule_delayed_work(&ub->monitor_work, - UBLK_DAEMON_MONITOR_PERIOD); -} + /* Now we are serialized with ublk_queue_rq() */ + blk_mq_quiesce_queue(disk->queue); + /* abort queue is for making forward progress */ + ublk_abort_queue(ub, ubq); + blk_mq_unquiesce_queue(disk->queue); + put_device(disk_to_dev(disk)); -static inline bool ublk_queue_ready(struct ublk_queue *ubq) -{ - return ubq->nr_io_ready == ubq->q_depth; + return true; } -static void ublk_cmd_cancel_cb(struct io_uring_cmd *cmd, unsigned issue_flags) +static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io, + unsigned int issue_flags) { - io_uring_cmd_done(cmd, UBLK_IO_RES_ABORT, 0, issue_flags); + bool done; + + if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) + return; + + spin_lock(&ubq->cancel_lock); + done = !!(io->flags & UBLK_IO_FLAG_CANCELED); + if (!done) + io->flags |= UBLK_IO_FLAG_CANCELED; + spin_unlock(&ubq->cancel_lock); + + if (!done) + io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags); } -static void ublk_cancel_queue(struct ublk_queue *ubq) +/* + * The ublk char device won't be closed when calling cancel fn, so both + * ublk device and queue are guaranteed to be live + */ +static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, + unsigned int issue_flags) { - int i; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_queue *ubq = pdu->ubq; + struct task_struct *task; + struct ublk_device *ub; + bool need_schedule; + struct ublk_io *io; - if (!ublk_queue_ready(ubq)) + if (WARN_ON_ONCE(!ubq)) return; - for (i = 0; i < ubq->q_depth; i++) { - struct ublk_io *io = &ubq->ios[i]; + if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth)) + return; + + task = io_uring_cmd_get_task(cmd); + if (WARN_ON_ONCE(task && task != ubq->ubq_daemon)) + return; + + ub = ubq->dev; + need_schedule = ublk_abort_requests(ub, ubq); + + io = &ubq->ios[pdu->tag]; + WARN_ON_ONCE(io->cmd != cmd); + ublk_cancel_cmd(ubq, io, issue_flags); - if (io->flags & UBLK_IO_FLAG_ACTIVE) - io_uring_cmd_complete_in_task(io->cmd, - ublk_cmd_cancel_cb); + if (need_schedule) { + if (ublk_can_use_recovery(ub)) + schedule_work(&ub->quiesce_work); + else + schedule_work(&ub->stop_work); } +} - /* all io commands are canceled */ - ubq->nr_io_ready = 0; +static inline bool ublk_queue_ready(struct ublk_queue *ubq) +{ + return ubq->nr_io_ready == ubq->q_depth; +} + +static void ublk_cancel_queue(struct ublk_queue *ubq) +{ + int i; + + for (i = 0; i < ubq->q_depth; i++) + ublk_cancel_cmd(ubq, &ubq->ios[i], IO_URING_F_UNLOCKED); } /* Cancel all pending commands, must be called after del_gendisk() returns */ @@ -1545,16 +1574,6 @@ static void __ublk_quiesce_dev(struct ublk_device *ub) blk_mq_quiesce_queue(ub->ub_disk->queue); ublk_wait_tagset_rqs_idle(ub); ub->dev_info.state = UBLK_S_DEV_QUIESCED; - ublk_cancel_dev(ub); - /* we are going to release task_struct of ubq_daemon and resets - * ->ubq_daemon to NULL. So in monitor_work, check on ubq_daemon causes UAF. - * Besides, monitor_work is not necessary in QUIESCED state since we have - * already scheduled quiesce_work and quiesced all ubqs. - * - * Do not let monitor_work schedule itself if state it QUIESCED. And we cancel - * it here and re-schedule it in END_USER_RECOVERY to avoid UAF. - */ - cancel_delayed_work_sync(&ub->monitor_work); } static void ublk_quiesce_work_fn(struct work_struct *work) @@ -1568,6 +1587,7 @@ static void ublk_quiesce_work_fn(struct work_struct *work) __ublk_quiesce_dev(ub); unlock: mutex_unlock(&ub->mutex); + ublk_cancel_dev(ub); } static void ublk_unquiesce_dev(struct ublk_device *ub) @@ -1593,6 +1613,8 @@ static void ublk_unquiesce_dev(struct ublk_device *ub) static void ublk_stop_dev(struct ublk_device *ub) { + struct gendisk *disk; + mutex_lock(&ub->mutex); if (ub->dev_info.state == UBLK_S_DEV_DEAD) goto unlock; @@ -1602,14 +1624,18 @@ static void ublk_stop_dev(struct ublk_device *ub) ublk_unquiesce_dev(ub); } del_gendisk(ub->ub_disk); + + /* Sync with ublk_abort_queue() by holding the lock */ + spin_lock(&ub->lock); + disk = ub->ub_disk; ub->dev_info.state = UBLK_S_DEV_DEAD; ub->dev_info.ublksrv_pid = -1; - put_disk(ub->ub_disk); ub->ub_disk = NULL; + spin_unlock(&ub->lock); + put_disk(disk); unlock: - ublk_cancel_dev(ub); mutex_unlock(&ub->mutex); - cancel_delayed_work_sync(&ub->monitor_work); + ublk_cancel_dev(ub); } /* device can only be started after all IOs are ready */ @@ -1660,6 +1686,21 @@ static inline void ublk_fill_io_cmd(struct ublk_io *io, io->addr = buf_addr; } +static inline void ublk_prep_cancel(struct io_uring_cmd *cmd, + unsigned int issue_flags, + struct ublk_queue *ubq, unsigned int tag) +{ + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + + /* + * Safe to refer to @ubq since ublk_queue won't be died until its + * commands are completed + */ + pdu->ubq = ubq; + pdu->tag = tag; + io_uring_cmd_mark_cancelable(cmd, issue_flags); +} + static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags, const struct ublksrv_io_cmd *ub_cmd) @@ -1775,6 +1816,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, default: goto out; } + ublk_prep_cancel(cmd, issue_flags, ubq, tag); return -EIOCBQUEUED; out: @@ -1814,7 +1856,8 @@ fail_put: return NULL; } -static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, + unsigned int issue_flags) { /* * Not necessary for async retry, but let's keep it simple and always @@ -1828,9 +1871,33 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) .addr = READ_ONCE(ub_src->addr) }; + WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED); + return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd); } +static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + ublk_ch_uring_cmd_local(cmd, issue_flags); +} + +static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + if (unlikely(issue_flags & IO_URING_F_CANCEL)) { + ublk_uring_cmd_cancel_fn(cmd, issue_flags); + return 0; + } + + /* well-implemented server won't run into unlocked */ + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { + io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb); + return -EIOCBQUEUED; + } + + return ublk_ch_uring_cmd_local(cmd, issue_flags); +} + static inline bool ublk_check_ubuf_dir(const struct request *req, int ubuf_dir) { @@ -1962,6 +2029,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) void *ptr; int size; + spin_lock_init(&ubq->cancel_lock); ubq->flags = ub->dev_info.flags; ubq->q_id = q_id; ubq->q_depth = ub->dev_info.queue_depth; @@ -2026,7 +2094,8 @@ static int ublk_alloc_dev_number(struct ublk_device *ub, int idx) if (err == -ENOSPC) err = -EEXIST; } else { - err = idr_alloc(&ublk_index_idr, ub, 0, 0, GFP_NOWAIT); + err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS, + GFP_NOWAIT); } spin_unlock(&ublk_idr_lock); @@ -2151,8 +2220,6 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) if (wait_for_completion_interruptible(&ub->completion) != 0) return -EINTR; - schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD); - mutex_lock(&ub->mutex); if (ub->dev_info.state == UBLK_S_DEV_LIVE || test_bit(UB_STATE_USED, &ub->state)) { @@ -2305,6 +2372,12 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) return -EINVAL; } + if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) { + pr_warn("%s: dev id is too large. Max supported is %d\n", + __func__, UBLK_MAX_UBLKS - 1); + return -EINVAL; + } + ublk_dump_dev_info(&info); ret = mutex_lock_killable(&ublk_ctl_mutex); @@ -2320,10 +2393,9 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) if (!ub) goto out_unlock; mutex_init(&ub->mutex); - spin_lock_init(&ub->mm_lock); + spin_lock_init(&ub->lock); INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn); INIT_WORK(&ub->stop_work, ublk_stop_work_fn); - INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work); ret = ublk_alloc_dev_number(ub, header->dev_id); if (ret < 0) @@ -2569,13 +2641,15 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) int i; WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq))); + /* All old ioucmds have to be completed */ - WARN_ON_ONCE(ubq->nr_io_ready); + ubq->nr_io_ready = 0; /* old daemon is PF_EXITING, put it now */ put_task_struct(ubq->ubq_daemon); /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */ ubq->ubq_daemon = NULL; ubq->timeout = false; + ubq->canceling = false; for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i]; @@ -2661,7 +2735,6 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, __func__, header->dev_id); blk_mq_kick_requeue_list(ub->ub_disk->queue); ub->dev_info.state = UBLK_S_DEV_LIVE; - schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD); ret = 0; out_unlock: mutex_unlock(&ub->mutex); @@ -2932,7 +3005,22 @@ static void __exit ublk_exit(void) module_init(ublk_init); module_exit(ublk_exit); -module_param(ublks_max, int, 0444); +static int ublk_set_max_ublks(const char *buf, const struct kernel_param *kp) +{ + return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS); +} + +static int ublk_get_max_ublks(char *buf, const struct kernel_param *kp) +{ + return sysfs_emit(buf, "%u\n", ublks_max); +} + +static const struct kernel_param_ops ublk_max_ublks_ops = { + .set = ublk_set_max_ublks, + .get = ublk_get_max_ublks, +}; + +module_param_cb(ublks_max, &ublk_max_ublks_ops, &ublks_max, 0644); MODULE_PARM_DESC(ublks_max, "max number of ublk devices allowed to add(default: 64)"); MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>"); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 1fe011676d07..4689ac2e0c0e 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -470,8 +470,6 @@ static bool virtblk_prep_rq_batch(struct request *req) struct virtio_blk *vblk = req->mq_hctx->queue->queuedata; struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); - req->mq_hctx->tags->rqs[req->tag] = req; - return virtblk_prep_rq(req->mq_hctx, vblk, req, vbr) == BLK_STS_OK; } diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index cc2839805983..a5e07270e0d4 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -3655,7 +3655,6 @@ static struct ctl_table cdrom_table[] = { .mode = 0644, .proc_handler = cdrom_sysctl_handler }, - { } }; static struct ctl_table_header *cdrom_sysctl_header; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 9755788e8b78..91ebdcc6e9a8 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -749,7 +749,11 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r return ERR_PTR(-ENOMEM); } - mddev_init(&rs->md); + if (mddev_init(&rs->md)) { + kfree(rs); + ti->error = "Cannot initialize raid context"; + return ERR_PTR(-ENOMEM); + } rs->raid_disks = raid_devs; rs->delta_disks = 0; @@ -798,6 +802,7 @@ static void raid_set_free(struct raid_set *rs) dm_put_device(rs->ti, rs->dev[i].data_dev); } + mddev_destroy(&rs->md); kfree(rs); } @@ -3239,7 +3244,7 @@ size_check: set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); /* Has to be held on running the array */ - mddev_lock_nointr(&rs->md); + mddev_suspend_and_lock_nointr(&rs->md); r = md_run(&rs->md); rs->md.in_sync = 0; /* Assume already marked dirty */ if (r) { @@ -3263,7 +3268,6 @@ size_check: } } - mddev_suspend(&rs->md); set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags); /* Try to adjust the raid4/5/6 stripe cache size to the stripe size */ @@ -3793,9 +3797,7 @@ static void raid_postsuspend(struct dm_target *ti) if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery)) md_stop_writes(&rs->md); - mddev_lock_nointr(&rs->md); - mddev_suspend(&rs->md); - mddev_unlock(&rs->md); + mddev_suspend(&rs->md, false); } } @@ -4054,8 +4056,7 @@ static void raid_resume(struct dm_target *ti) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); mddev->ro = 0; mddev->in_sync = 0; - mddev_resume(mddev); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); } } diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c index 6eaa0eab40f9..4b80165afd23 100644 --- a/drivers/md/md-autodetect.c +++ b/drivers/md/md-autodetect.c @@ -175,7 +175,7 @@ static void __init md_setup_drive(struct md_setup_args *args) return; } - err = mddev_lock(mddev); + err = mddev_suspend_and_lock(mddev); if (err) { pr_err("md: failed to lock array %s\n", name); goto out_mddev_put; @@ -221,7 +221,7 @@ static void __init md_setup_drive(struct md_setup_args *args) if (err) pr_warn("md: starting %s failed\n", name); out_unlock: - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); out_mddev_put: mddev_put(mddev); } diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 6f9ff14971f9..9672f75c3050 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1861,7 +1861,7 @@ void md_bitmap_destroy(struct mddev *mddev) md_bitmap_wait_behind_writes(mddev); if (!mddev->serialize_policy) - mddev_destroy_serial_pool(mddev, NULL, true); + mddev_destroy_serial_pool(mddev, NULL); mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); @@ -1977,7 +1977,7 @@ int md_bitmap_load(struct mddev *mddev) goto out; rdev_for_each(rdev, mddev) - mddev_create_serial_pool(mddev, rdev, true); + mddev_create_serial_pool(mddev, rdev); if (mddev_is_clustered(mddev)) md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); @@ -2348,14 +2348,11 @@ location_store(struct mddev *mddev, const char *buf, size_t len) { int rv; - rv = mddev_lock(mddev); + rv = mddev_suspend_and_lock(mddev); if (rv) return rv; + if (mddev->pers) { - if (!mddev->pers->quiesce) { - rv = -EBUSY; - goto out; - } if (mddev->recovery || mddev->sync_thread) { rv = -EBUSY; goto out; @@ -2369,11 +2366,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len) rv = -EBUSY; goto out; } - if (mddev->pers) { - mddev_suspend(mddev); - md_bitmap_destroy(mddev); - mddev_resume(mddev); - } + + md_bitmap_destroy(mddev); mddev->bitmap_info.offset = 0; if (mddev->bitmap_info.file) { struct file *f = mddev->bitmap_info.file; @@ -2383,6 +2377,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len) } else { /* No bitmap, OK to set a location */ long long offset; + struct bitmap *bitmap; + if (strncmp(buf, "none", 4) == 0) /* nothing to be done */; else if (strncmp(buf, "file:", 5) == 0) { @@ -2406,25 +2402,20 @@ location_store(struct mddev *mddev, const char *buf, size_t len) rv = -EINVAL; goto out; } + mddev->bitmap_info.offset = offset; - if (mddev->pers) { - struct bitmap *bitmap; - bitmap = md_bitmap_create(mddev, -1); - mddev_suspend(mddev); - if (IS_ERR(bitmap)) - rv = PTR_ERR(bitmap); - else { - mddev->bitmap = bitmap; - rv = md_bitmap_load(mddev); - if (rv) - mddev->bitmap_info.offset = 0; - } - if (rv) { - md_bitmap_destroy(mddev); - mddev_resume(mddev); - goto out; - } - mddev_resume(mddev); + bitmap = md_bitmap_create(mddev, -1); + if (IS_ERR(bitmap)) { + rv = PTR_ERR(bitmap); + goto out; + } + + mddev->bitmap = bitmap; + rv = md_bitmap_load(mddev); + if (rv) { + mddev->bitmap_info.offset = 0; + md_bitmap_destroy(mddev); + goto out; } } } @@ -2437,7 +2428,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len) } rv = 0; out: - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); if (rv) return rv; return len; @@ -2546,7 +2537,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) if (backlog > COUNTER_MAX) return -EINVAL; - rv = mddev_lock(mddev); + rv = mddev_suspend_and_lock(mddev); if (rv) return rv; @@ -2571,16 +2562,16 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) if (!backlog && mddev->serial_info_pool) { /* serial_info_pool is not needed if backlog is zero */ if (!mddev->serialize_policy) - mddev_destroy_serial_pool(mddev, NULL, false); + mddev_destroy_serial_pool(mddev, NULL); } else if (backlog && !mddev->serial_info_pool) { /* serial_info_pool is needed since backlog is not zero */ rdev_for_each(rdev, mddev) - mddev_create_serial_pool(mddev, rdev, false); + mddev_create_serial_pool(mddev, rdev); } if (old_mwb != backlog) md_bitmap_update_sb(mddev->bitmap); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return len; } diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 1e26eb223349..8e36a0feec09 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -501,7 +501,7 @@ static void process_suspend_info(struct mddev *mddev, mddev->pers->quiesce(mddev, 0); } -static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) +static int process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) { char disk_uuid[64]; struct md_cluster_info *cinfo = mddev->cluster_info; @@ -509,6 +509,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) char raid_slot[16]; char *envp[] = {event_name, disk_uuid, raid_slot, NULL}; int len; + int res = 0; len = snprintf(disk_uuid, 64, "DEVICE_UUID="); sprintf(disk_uuid + len, "%pU", cmsg->uuid); @@ -517,9 +518,14 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) init_completion(&cinfo->newdisk_completion); set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp); - wait_for_completion_timeout(&cinfo->newdisk_completion, - NEW_DEV_TIMEOUT); + if (!wait_for_completion_timeout(&cinfo->newdisk_completion, + NEW_DEV_TIMEOUT)) { + pr_err("md-cluster(%s:%d): timeout on a new disk adding\n", + __func__, __LINE__); + res = -1; + } clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); + return res; } @@ -594,7 +600,8 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) le64_to_cpu(msg->high)); break; case NEWDISK: - process_add_new_disk(mddev, msg); + if (process_add_new_disk(mddev, msg)) + ret = -1; break; case REMOVE: process_remove_disk(mddev, msg); diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 71ac99646827..8eca7693b793 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -69,6 +69,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) if (!conf) return NULL; + /* + * conf->raid_disks is copy of mddev->raid_disks. The reason to + * keep a copy of mddev->raid_disks in struct linear_conf is, + * mddev->raid_disks may not be consistent with pointers number of + * conf->disks[] when it is updated in linear_add() and used to + * iterate old conf->disks[] earray in linear_congested(). + * Here conf->raid_disks is always consitent with number of + * pointers in conf->disks[] array, and mddev->private is updated + * with rcu_assign_pointer() in linear_addr(), such race can be + * avoided. + */ + conf->raid_disks = raid_disks; + cnt = 0; conf->array_sectors = 0; @@ -112,19 +125,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) conf->disks[i-1].end_sector + conf->disks[i].rdev->sectors; - /* - * conf->raid_disks is copy of mddev->raid_disks. The reason to - * keep a copy of mddev->raid_disks in struct linear_conf is, - * mddev->raid_disks may not be consistent with pointers number of - * conf->disks[] when it is updated in linear_add() and used to - * iterate old conf->disks[] earray in linear_congested(). - * Here conf->raid_disks is always consitent with number of - * pointers in conf->disks[] array, and mddev->private is updated - * with rcu_assign_pointer() in linear_addr(), such race can be - * avoided. - */ - conf->raid_disks = raid_disks; - return conf; out: @@ -183,7 +183,6 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) * in linear_congested(), therefore kfree_rcu() is used to free * oldconf until no one uses it anymore. */ - mddev_suspend(mddev); oldconf = rcu_dereference_protected(mddev->private, lockdep_is_held(&mddev->reconfig_mutex)); mddev->raid_disks++; @@ -192,7 +191,6 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) rcu_assign_pointer(mddev->private, newconf); md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); - mddev_resume(mddev); kfree_rcu(oldconf, rcu); return 0; } diff --git a/drivers/md/md-linear.h b/drivers/md/md-linear.h index 24e97db50ebb..5587eeedb882 100644 --- a/drivers/md/md-linear.h +++ b/drivers/md/md-linear.h @@ -12,6 +12,6 @@ struct linear_conf struct rcu_head rcu; sector_t array_sectors; int raid_disks; /* a copy of mddev->raid_disks */ - struct dev_info disks[]; + struct dev_info disks[] __counted_by(raid_disks); }; #endif diff --git a/drivers/md/md.c b/drivers/md/md.c index 839e79e567ee..411121a72df9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -91,6 +91,18 @@ static void mddev_detach(struct mddev *mddev); static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); static void md_wakeup_thread_directly(struct md_thread __rcu *thread); +enum md_ro_state { + MD_RDWR, + MD_RDONLY, + MD_AUTO_READ, + MD_MAX_STATE +}; + +static bool md_is_rdwr(struct mddev *mddev) +{ + return (mddev->ro == MD_RDWR); +} + /* * Default number of read corrections we'll attempt on an rdev * before ejecting it from the array. We divide the read error @@ -206,8 +218,7 @@ static int rdev_need_serial(struct md_rdev *rdev) * 1. rdev is the first device which return true from rdev_enable_serial. * 2. rdev is NULL, means we want to enable serialization for all rdevs. */ -void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend) +void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev) { int ret = 0; @@ -215,15 +226,12 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, !test_bit(CollisionCheck, &rdev->flags)) return; - if (!is_suspend) - mddev_suspend(mddev); - if (!rdev) ret = rdevs_init_serial(mddev); else ret = rdev_init_serial(rdev); if (ret) - goto abort; + return; if (mddev->serial_info_pool == NULL) { /* @@ -238,10 +246,6 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, pr_err("can't alloc memory pool for serialization\n"); } } - -abort: - if (!is_suspend) - mddev_resume(mddev); } /* @@ -250,8 +254,7 @@ abort: * 2. when bitmap is destroyed while policy is not enabled. * 3. for disable policy, the pool is destroyed only when no rdev needs it. */ -void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend) +void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) { if (rdev && !test_bit(CollisionCheck, &rdev->flags)) return; @@ -260,8 +263,6 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, struct md_rdev *temp; int num = 0; /* used to track if other rdevs need the pool */ - if (!is_suspend) - mddev_suspend(mddev); rdev_for_each(temp, mddev) { if (!rdev) { if (!mddev->serialize_policy || @@ -283,8 +284,6 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, mempool_destroy(mddev->serial_info_pool); mddev->serial_info_pool = NULL; } - if (!is_suspend) - mddev_resume(mddev); } } @@ -346,6 +345,10 @@ EXPORT_SYMBOL_GPL(md_new_event); static LIST_HEAD(all_mddevs); static DEFINE_SPINLOCK(all_mddevs_lock); +static bool is_md_suspended(struct mddev *mddev) +{ + return percpu_ref_is_dying(&mddev->active_io); +} /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. @@ -359,11 +362,11 @@ static bool is_suspended(struct mddev *mddev, struct bio *bio) return true; if (bio_data_dir(bio) != WRITE) return false; - if (mddev->suspend_lo >= mddev->suspend_hi) + if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi)) return false; - if (bio->bi_iter.bi_sector >= mddev->suspend_hi) + if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi)) return false; - if (bio_end_sector(bio) < mddev->suspend_lo) + if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo)) return false; return true; } @@ -431,42 +434,73 @@ static void md_submit_bio(struct bio *bio) md_handle_request(mddev, bio); } -/* mddev_suspend makes sure no new requests are submitted - * to the device, and that any requests that have been submitted - * are completely handled. - * Once mddev_detach() is called and completes, the module will be - * completely unused. +/* + * Make sure no new requests are submitted to the device, and any requests that + * have been submitted are completely handled. */ -void mddev_suspend(struct mddev *mddev) +int mddev_suspend(struct mddev *mddev, bool interruptible) { - struct md_thread *thread = rcu_dereference_protected(mddev->thread, - lockdep_is_held(&mddev->reconfig_mutex)); + int err = 0; - WARN_ON_ONCE(thread && current == thread->tsk); - if (mddev->suspended++) - return; - wake_up(&mddev->sb_wait); - set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); - percpu_ref_kill(&mddev->active_io); + /* + * hold reconfig_mutex to wait for normal io will deadlock, because + * other context can't update super_block, and normal io can rely on + * updating super_block. + */ + lockdep_assert_not_held(&mddev->reconfig_mutex); - if (mddev->pers->prepare_suspend) - mddev->pers->prepare_suspend(mddev); + if (interruptible) + err = mutex_lock_interruptible(&mddev->suspend_mutex); + else + mutex_lock(&mddev->suspend_mutex); + if (err) + return err; - wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); - clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); - wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); + if (mddev->suspended) { + WRITE_ONCE(mddev->suspended, mddev->suspended + 1); + mutex_unlock(&mddev->suspend_mutex); + return 0; + } + + percpu_ref_kill(&mddev->active_io); + if (interruptible) + err = wait_event_interruptible(mddev->sb_wait, + percpu_ref_is_zero(&mddev->active_io)); + else + wait_event(mddev->sb_wait, + percpu_ref_is_zero(&mddev->active_io)); + if (err) { + percpu_ref_resurrect(&mddev->active_io); + mutex_unlock(&mddev->suspend_mutex); + return err; + } + + /* + * For raid456, io might be waiting for reshape to make progress, + * allow new reshape to start while waiting for io to be done to + * prevent deadlock. + */ + WRITE_ONCE(mddev->suspended, mddev->suspended + 1); del_timer_sync(&mddev->safemode_timer); /* restrict memory reclaim I/O during raid array is suspend */ mddev->noio_flag = memalloc_noio_save(); + + mutex_unlock(&mddev->suspend_mutex); + return 0; } EXPORT_SYMBOL_GPL(mddev_suspend); void mddev_resume(struct mddev *mddev) { - lockdep_assert_held(&mddev->reconfig_mutex); - if (--mddev->suspended) + lockdep_assert_not_held(&mddev->reconfig_mutex); + + mutex_lock(&mddev->suspend_mutex); + WRITE_ONCE(mddev->suspended, mddev->suspended - 1); + if (mddev->suspended) { + mutex_unlock(&mddev->suspend_mutex); return; + } /* entred the memalloc scope from mddev_suspend() */ memalloc_noio_restore(mddev->noio_flag); @@ -477,6 +511,8 @@ void mddev_resume(struct mddev *mddev) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ + + mutex_unlock(&mddev->suspend_mutex); } EXPORT_SYMBOL_GPL(mddev_resume); @@ -616,34 +652,63 @@ static inline struct mddev *mddev_get(struct mddev *mddev) static void mddev_delayed_delete(struct work_struct *ws); +static void __mddev_put(struct mddev *mddev) +{ + if (mddev->raid_disks || !list_empty(&mddev->disks) || + mddev->ctime || mddev->hold_active) + return; + + /* Array is not configured at all, and not held active, so destroy it */ + set_bit(MD_DELETED, &mddev->flags); + + /* + * Call queue_work inside the spinlock so that flush_workqueue() after + * mddev_find will succeed in waiting for the work to be done. + */ + queue_work(md_misc_wq, &mddev->del_work); +} + void mddev_put(struct mddev *mddev) { if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; - if (!mddev->raid_disks && list_empty(&mddev->disks) && - mddev->ctime == 0 && !mddev->hold_active) { - /* Array is not configured at all, and not held active, - * so destroy it */ - set_bit(MD_DELETED, &mddev->flags); - /* - * Call queue_work inside the spinlock so that - * flush_workqueue() after mddev_find will succeed in waiting - * for the work to be done. - */ - INIT_WORK(&mddev->del_work, mddev_delayed_delete); - queue_work(md_misc_wq, &mddev->del_work); - } + __mddev_put(mddev); spin_unlock(&all_mddevs_lock); } static void md_safemode_timeout(struct timer_list *t); +static void md_start_sync(struct work_struct *ws); + +static void active_io_release(struct percpu_ref *ref) +{ + struct mddev *mddev = container_of(ref, struct mddev, active_io); + + wake_up(&mddev->sb_wait); +} + +static void no_op(struct percpu_ref *r) {} -void mddev_init(struct mddev *mddev) +int mddev_init(struct mddev *mddev) { + + if (percpu_ref_init(&mddev->active_io, active_io_release, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) + return -ENOMEM; + + if (percpu_ref_init(&mddev->writes_pending, no_op, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + percpu_ref_exit(&mddev->active_io); + return -ENOMEM; + } + + /* We want to start with the refcount at zero */ + percpu_ref_put(&mddev->writes_pending); + mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); mutex_init(&mddev->sync_mutex); + mutex_init(&mddev->suspend_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); @@ -662,9 +727,21 @@ void mddev_init(struct mddev *mddev) mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; + + INIT_WORK(&mddev->sync_work, md_start_sync); + INIT_WORK(&mddev->del_work, mddev_delayed_delete); + + return 0; } EXPORT_SYMBOL_GPL(mddev_init); +void mddev_destroy(struct mddev *mddev) +{ + percpu_ref_exit(&mddev->active_io); + percpu_ref_exit(&mddev->writes_pending); +} +EXPORT_SYMBOL_GPL(mddev_destroy); + static struct mddev *mddev_find_locked(dev_t unit) { struct mddev *mddev; @@ -708,13 +785,16 @@ static struct mddev *mddev_alloc(dev_t unit) new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); - mddev_init(new); + + error = mddev_init(new); + if (error) + goto out_free_new; spin_lock(&all_mddevs_lock); if (unit) { error = -EEXIST; if (mddev_find_locked(unit)) - goto out_free_new; + goto out_destroy_new; new->unit = unit; if (MAJOR(unit) == MD_MAJOR) new->md_minor = MINOR(unit); @@ -725,7 +805,7 @@ static struct mddev *mddev_alloc(dev_t unit) error = -ENODEV; new->unit = mddev_alloc_unit(); if (!new->unit) - goto out_free_new; + goto out_destroy_new; new->md_minor = MINOR(new->unit); new->hold_active = UNTIL_STOP; } @@ -733,8 +813,11 @@ static struct mddev *mddev_alloc(dev_t unit) list_add(&new->all_mddevs, &all_mddevs); spin_unlock(&all_mddevs_lock); return new; -out_free_new: + +out_destroy_new: spin_unlock(&all_mddevs_lock); + mddev_destroy(new); +out_free_new: kfree(new); return ERR_PTR(error); } @@ -745,6 +828,7 @@ static void mddev_free(struct mddev *mddev) list_del(&mddev->all_mddevs); spin_unlock(&all_mddevs_lock); + mddev_destroy(mddev); kfree(mddev); } @@ -2412,7 +2496,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) pr_debug("md: bind<%s>\n", b); if (mddev->raid_disks) - mddev_create_serial_pool(mddev, rdev, false); + mddev_create_serial_pool(mddev, rdev); if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; @@ -2464,7 +2548,7 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev) bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%pg>\n", rdev->bdev); - mddev_destroy_serial_pool(rdev->mddev, rdev, false); + mddev_destroy_serial_pool(rdev->mddev, rdev); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); @@ -2794,11 +2878,7 @@ static int add_bound_rdev(struct md_rdev *rdev) */ super_types[mddev->major_version]. validate_super(mddev, rdev); - if (add_journal) - mddev_suspend(mddev); err = mddev->pers->hot_add_disk(mddev, rdev); - if (add_journal) - mddev_resume(mddev); if (err) { md_kick_rdev_from_array(rdev); return err; @@ -2935,11 +3015,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); - mddev_create_serial_pool(rdev->mddev, rdev, false); + mddev_create_serial_pool(rdev->mddev, rdev); need_update_sb = true; err = 0; } else if (cmd_match(buf, "-writemostly")) { - mddev_destroy_serial_pool(rdev->mddev, rdev, false); + mddev_destroy_serial_pool(rdev->mddev, rdev); clear_bit(WriteMostly, &rdev->flags); need_update_sb = true; err = 0; @@ -3551,6 +3631,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); struct kernfs_node *kn = NULL; + bool suspend = false; ssize_t rv; struct mddev *mddev = rdev->mddev; @@ -3558,17 +3639,25 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + if (!mddev) + return -ENODEV; - if (entry->store == state_store && cmd_match(page, "remove")) - kn = sysfs_break_active_protection(kobj, attr); + if (entry->store == state_store) { + if (cmd_match(page, "remove")) + kn = sysfs_break_active_protection(kobj, attr); + if (cmd_match(page, "remove") || cmd_match(page, "re-add") || + cmd_match(page, "writemostly") || + cmd_match(page, "-writemostly")) + suspend = true; + } - rv = mddev ? mddev_lock(mddev) : -ENODEV; + rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); if (!rv) { if (rdev->mddev == NULL) rv = -ENODEV; else rv = entry->store(rdev, page, length); - mddev_unlock(mddev); + suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); } if (kn) @@ -3867,12 +3956,12 @@ level_store(struct mddev *mddev, const char *buf, size_t len) if (slen == 0 || slen >= sizeof(clevel)) return -EINVAL; - rv = mddev_lock(mddev); + rv = mddev_suspend_and_lock(mddev); if (rv) return rv; if (mddev->pers == NULL) { - strncpy(mddev->clevel, buf, slen); + memcpy(mddev->clevel, buf, slen); if (mddev->clevel[slen-1] == '\n') slen--; mddev->clevel[slen] = 0; @@ -3905,7 +3994,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) } /* Now find the new personality */ - strncpy(clevel, buf, slen); + memcpy(clevel, buf, slen); if (clevel[slen-1] == '\n') slen--; clevel[slen] = 0; @@ -3960,7 +4049,6 @@ level_store(struct mddev *mddev, const char *buf, size_t len) } /* Looks like we have a winner */ - mddev_suspend(mddev); mddev_detach(mddev); spin_lock(&mddev->lock); @@ -4046,14 +4134,13 @@ level_store(struct mddev *mddev, const char *buf, size_t len) blk_set_stacking_limits(&mddev->queue->limits); pers->run(mddev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - mddev_resume(mddev); if (!mddev->thread) md_update_sb(mddev, 1); sysfs_notify_dirent_safe(mddev->sysfs_level); md_new_event(); rv = len; out_unlock: - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return rv; } @@ -4361,6 +4448,18 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) int err = 0; enum array_state st = match_word(buf, array_states); + /* No lock dependent actions */ + switch (st) { + case suspended: /* not supported yet */ + case write_pending: /* cannot be set */ + case active_idle: /* cannot be set */ + case broken: /* cannot be set */ + case bad_word: + return -EINVAL; + default: + break; + } + if (mddev->pers && (st == active || st == clean) && mddev->ro != MD_RDONLY) { /* don't take reconfig_mutex when toggling between @@ -4385,23 +4484,16 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) err = mddev_lock(mddev); if (err) return err; - err = -EINVAL; - switch(st) { - case bad_word: - break; - case clear: - /* stopping an active array */ - err = do_md_stop(mddev, 0, NULL); - break; + + switch (st) { case inactive: - /* stopping an active array */ + /* stop an active array, return 0 otherwise */ if (mddev->pers) err = do_md_stop(mddev, 2, NULL); - else - err = 0; /* already inactive */ break; - case suspended: - break; /* not supported yet */ + case clear: + err = do_md_stop(mddev, 0, NULL); + break; case readonly: if (mddev->pers) err = md_set_readonly(mddev, NULL); @@ -4452,10 +4544,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) err = do_md_run(mddev); } break; - case write_pending: - case active_idle: - case broken: - /* these cannot be set */ + default: + err = -EINVAL; break; } @@ -4528,7 +4618,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) minor != MINOR(dev)) return -EOVERFLOW; - err = mddev_lock(mddev); + err = mddev_suspend_and_lock(mddev); if (err) return err; if (mddev->persistent) { @@ -4549,14 +4639,14 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) rdev = md_import_device(dev, -1, -1); if (IS_ERR(rdev)) { - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return PTR_ERR(rdev); } err = bind_rdev_to_array(rdev, mddev); out: if (err) export_rdev(rdev, mddev); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); if (!err) md_new_event(); return err ? err : len; @@ -4691,7 +4781,7 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len) size_t namelen = len-9; if (namelen >= sizeof(mddev->metadata_type)) namelen = sizeof(mddev->metadata_type)-1; - strncpy(mddev->metadata_type, buf+9, namelen); + memcpy(mddev->metadata_type, buf+9, namelen); mddev->metadata_type[namelen] = 0; if (namelen && mddev->metadata_type[namelen-1] == '\n') mddev->metadata_type[--namelen] = 0; @@ -4865,6 +4955,7 @@ action_store(struct mddev *mddev, const char *page, size_t len) /* A write to sync_action is enough to justify * canceling read-auto mode */ + flush_work(&mddev->sync_work); mddev->ro = MD_RDWR; md_wakeup_thread(mddev->sync_thread); } @@ -5121,7 +5212,8 @@ __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); static ssize_t suspend_lo_show(struct mddev *mddev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); + return sprintf(page, "%llu\n", + (unsigned long long)READ_ONCE(mddev->suspend_lo)); } static ssize_t @@ -5136,21 +5228,14 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) if (new != (sector_t)new) return -EINVAL; - err = mddev_lock(mddev); + err = mddev_suspend(mddev, true); if (err) return err; - err = -EINVAL; - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) - goto unlock; - mddev_suspend(mddev); - mddev->suspend_lo = new; + + WRITE_ONCE(mddev->suspend_lo, new); mddev_resume(mddev); - err = 0; -unlock: - mddev_unlock(mddev); - return err ?: len; + return len; } static struct md_sysfs_entry md_suspend_lo = __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); @@ -5158,7 +5243,8 @@ __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); static ssize_t suspend_hi_show(struct mddev *mddev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); + return sprintf(page, "%llu\n", + (unsigned long long)READ_ONCE(mddev->suspend_hi)); } static ssize_t @@ -5173,21 +5259,14 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) if (new != (sector_t)new) return -EINVAL; - err = mddev_lock(mddev); + err = mddev_suspend(mddev, true); if (err) return err; - err = -EINVAL; - if (mddev->pers == NULL) - goto unlock; - mddev_suspend(mddev); - mddev->suspend_hi = new; + WRITE_ONCE(mddev->suspend_hi, new); mddev_resume(mddev); - err = 0; -unlock: - mddev_unlock(mddev); - return err ?: len; + return len; } static struct md_sysfs_entry md_suspend_hi = __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); @@ -5434,7 +5513,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) if (value == mddev->serialize_policy) return len; - err = mddev_lock(mddev); + err = mddev_suspend_and_lock(mddev); if (err) return err; if (mddev->pers == NULL || (mddev->pers->level != 1)) { @@ -5443,15 +5522,13 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) goto unlock; } - mddev_suspend(mddev); if (value) - mddev_create_serial_pool(mddev, NULL, true); + mddev_create_serial_pool(mddev, NULL); else - mddev_destroy_serial_pool(mddev, NULL, true); + mddev_destroy_serial_pool(mddev, NULL); mddev->serialize_policy = value; - mddev_resume(mddev); unlock: - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return err ?: len; } @@ -5590,21 +5667,6 @@ static void mddev_delayed_delete(struct work_struct *ws) kobject_put(&mddev->kobj); } -static void no_op(struct percpu_ref *r) {} - -int mddev_init_writes_pending(struct mddev *mddev) -{ - if (mddev->writes_pending.percpu_count_ptr) - return 0; - if (percpu_ref_init(&mddev->writes_pending, no_op, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0) - return -ENOMEM; - /* We want to start with the refcount at zero */ - percpu_ref_put(&mddev->writes_pending); - return 0; -} -EXPORT_SYMBOL_GPL(mddev_init_writes_pending); - struct mddev *md_alloc(dev_t dev, char *name) { /* @@ -5776,12 +5838,6 @@ static void md_safemode_timeout(struct timer_list *t) } static int start_dirty_degraded; -static void active_io_release(struct percpu_ref *ref) -{ - struct mddev *mddev = container_of(ref, struct mddev, active_io); - - wake_up(&mddev->sb_wait); -} int md_run(struct mddev *mddev) { @@ -5862,15 +5918,10 @@ int md_run(struct mddev *mddev) nowait = nowait && bdev_nowait(rdev->bdev); } - err = percpu_ref_init(&mddev->active_io, active_io_release, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); - if (err) - return err; - if (!bioset_initialized(&mddev->bio_set)) { err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (err) - goto exit_active_io; + return err; } if (!bioset_initialized(&mddev->sync_set)) { err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); @@ -6067,8 +6118,6 @@ exit_sync_set: bioset_exit(&mddev->sync_set); exit_bio_set: bioset_exit(&mddev->bio_set); -exit_active_io: - percpu_ref_exit(&mddev->active_io); return err; } EXPORT_SYMBOL_GPL(md_run); @@ -6242,7 +6291,7 @@ static void __md_stop_writes(struct mddev *mddev) } /* disable policy to guarantee rdevs free resources for serialization */ mddev->serialize_policy = 0; - mddev_destroy_serial_pool(mddev, NULL, true); + mddev_destroy_serial_pool(mddev, NULL); } void md_stop_writes(struct mddev *mddev) @@ -6284,7 +6333,6 @@ static void __md_stop(struct mddev *mddev) module_put(pers->owner); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - percpu_ref_exit(&mddev->active_io); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); bioset_exit(&mddev->io_clone_set); @@ -6299,7 +6347,6 @@ void md_stop(struct mddev *mddev) */ __md_stop_writes(mddev); __md_stop(mddev); - percpu_ref_exit(&mddev->writes_pending); } EXPORT_SYMBOL_GPL(md_stop); @@ -6536,13 +6583,13 @@ static void autorun_devices(int part) if (IS_ERR(mddev)) break; - if (mddev_lock(mddev)) + if (mddev_suspend_and_lock(mddev)) pr_warn("md: %s locked, cannot run\n", mdname(mddev)); else if (mddev->raid_disks || mddev->major_version || !list_empty(&mddev->disks)) { pr_warn("md: %s already running, cannot run %pg\n", mdname(mddev), rdev0->bdev); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); } else { pr_debug("md: created %s\n", mdname(mddev)); mddev->persistent = 1; @@ -6552,7 +6599,7 @@ static void autorun_devices(int part) export_rdev(rdev, mddev); } autorun_array(mddev); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); } /* on success, candidates will be empty, on error * it won't... @@ -7102,7 +7149,6 @@ static int set_bitmap_file(struct mddev *mddev, int fd) struct bitmap *bitmap; bitmap = md_bitmap_create(mddev, -1); - mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; err = md_bitmap_load(mddev); @@ -7112,11 +7158,8 @@ static int set_bitmap_file(struct mddev *mddev, int fd) md_bitmap_destroy(mddev); fd = -1; } - mddev_resume(mddev); } else if (fd < 0) { - mddev_suspend(mddev); md_bitmap_destroy(mddev); - mddev_resume(mddev); } } if (fd < 0) { @@ -7405,7 +7448,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.space = mddev->bitmap_info.default_space; bitmap = md_bitmap_create(mddev, -1); - mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; rv = md_bitmap_load(mddev); @@ -7413,7 +7455,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = PTR_ERR(bitmap); if (rv) md_bitmap_destroy(mddev); - mddev_resume(mddev); } else { /* remove the bitmap */ if (!mddev->bitmap) { @@ -7438,9 +7479,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) module_put(md_cluster_mod); mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } - mddev_suspend(mddev); md_bitmap_destroy(mddev); - mddev_resume(mddev); mddev->bitmap_info.offset = 0; } } @@ -7511,6 +7550,20 @@ static inline bool md_ioctl_valid(unsigned int cmd) } } +static bool md_ioctl_need_suspend(unsigned int cmd) +{ + switch (cmd) { + case ADD_NEW_DISK: + case HOT_ADD_DISK: + case HOT_REMOVE_DISK: + case SET_BITMAP_FILE: + case SET_ARRAY_INFO: + return true; + default: + return false; + } +} + static int __md_set_array_info(struct mddev *mddev, void __user *argp) { mdu_array_info_t info; @@ -7639,7 +7692,12 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, mutex_unlock(&mddev->open_mutex); sync_blockdev(bdev); } - err = mddev_lock(mddev); + + if (!md_is_rdwr(mddev)) + flush_work(&mddev->sync_work); + + err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) : + mddev_lock(mddev); if (err) { pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", err, cmd); @@ -7767,7 +7825,10 @@ unlock: if (mddev->hold_active == UNTIL_IOCTL && err != -EINVAL) mddev->hold_active = 0; - mddev_unlock(mddev); + + md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) : + mddev_unlock(mddev); + out: if(did_set_md_closing) clear_bit(MD_CLOSING, &mddev->flags); @@ -7879,7 +7940,6 @@ static void md_free_disk(struct gendisk *disk) { struct mddev *mddev = disk->private_data; - percpu_ref_exit(&mddev->writes_pending); mddev_free(mddev); } @@ -8195,105 +8255,46 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) } static void *md_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(&all_mddevs_lock) { - struct list_head *tmp; - loff_t l = *pos; - struct mddev *mddev; + struct md_personality *pers; - if (l == 0x10000) { - ++*pos; - return (void *)2; - } - if (l > 0x10000) - return NULL; - if (!l--) - /* header */ - return (void*)1; + seq_puts(seq, "Personalities : "); + spin_lock(&pers_lock); + list_for_each_entry(pers, &pers_list, list) + seq_printf(seq, "[%s] ", pers->name); + + spin_unlock(&pers_lock); + seq_puts(seq, "\n"); + seq->poll_event = atomic_read(&md_event_count); spin_lock(&all_mddevs_lock); - list_for_each(tmp,&all_mddevs) - if (!l--) { - mddev = list_entry(tmp, struct mddev, all_mddevs); - if (!mddev_get(mddev)) - continue; - spin_unlock(&all_mddevs_lock); - return mddev; - } - spin_unlock(&all_mddevs_lock); - if (!l--) - return (void*)2;/* tail */ - return NULL; + + return seq_list_start(&all_mddevs, *pos); } static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct list_head *tmp; - struct mddev *next_mddev, *mddev = v; - struct mddev *to_put = NULL; - - ++*pos; - if (v == (void*)2) - return NULL; - - spin_lock(&all_mddevs_lock); - if (v == (void*)1) { - tmp = all_mddevs.next; - } else { - to_put = mddev; - tmp = mddev->all_mddevs.next; - } - - for (;;) { - if (tmp == &all_mddevs) { - next_mddev = (void*)2; - *pos = 0x10000; - break; - } - next_mddev = list_entry(tmp, struct mddev, all_mddevs); - if (mddev_get(next_mddev)) - break; - mddev = next_mddev; - tmp = mddev->all_mddevs.next; - } - spin_unlock(&all_mddevs_lock); - - if (to_put) - mddev_put(to_put); - return next_mddev; - + return seq_list_next(v, &all_mddevs, pos); } static void md_seq_stop(struct seq_file *seq, void *v) + __releases(&all_mddevs_lock) { - struct mddev *mddev = v; - - if (mddev && v != (void*)1 && v != (void*)2) - mddev_put(mddev); + status_unused(seq); + spin_unlock(&all_mddevs_lock); } static int md_seq_show(struct seq_file *seq, void *v) { - struct mddev *mddev = v; + struct mddev *mddev = list_entry(v, struct mddev, all_mddevs); sector_t sectors; struct md_rdev *rdev; - if (v == (void*)1) { - struct md_personality *pers; - seq_printf(seq, "Personalities : "); - spin_lock(&pers_lock); - list_for_each_entry(pers, &pers_list, list) - seq_printf(seq, "[%s] ", pers->name); - - spin_unlock(&pers_lock); - seq_printf(seq, "\n"); - seq->poll_event = atomic_read(&md_event_count); + if (!mddev_get(mddev)) return 0; - } - if (v == (void*)2) { - status_unused(seq); - return 0; - } + spin_unlock(&all_mddevs_lock); spin_lock(&mddev->lock); if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { seq_printf(seq, "%s : %sactive", mdname(mddev), @@ -8364,6 +8365,9 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } spin_unlock(&mddev->lock); + spin_lock(&all_mddevs_lock); + if (atomic_dec_and_test(&mddev->active)) + __mddev_put(mddev); return 0; } @@ -8563,6 +8567,7 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) BUG_ON(mddev->ro == MD_RDONLY); if (mddev->ro == MD_AUTO_READ) { /* need to switch to read/write */ + flush_work(&mddev->sync_work); mddev->ro = MD_RDWR; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -9148,12 +9153,90 @@ void md_do_sync(struct md_thread *thread) spin_unlock(&mddev->lock); wake_up(&resync_wait); - wake_up(&mddev->sb_wait); md_wakeup_thread(mddev->thread); return; } EXPORT_SYMBOL_GPL(md_do_sync); +static bool rdev_removeable(struct md_rdev *rdev) +{ + /* rdev is not used. */ + if (rdev->raid_disk < 0) + return false; + + /* There are still inflight io, don't remove this rdev. */ + if (atomic_read(&rdev->nr_pending)) + return false; + + /* + * An error occurred but has not yet been acknowledged by the metadata + * handler, don't remove this rdev. + */ + if (test_bit(Blocked, &rdev->flags)) + return false; + + /* Fautly rdev is not used, it's safe to remove it. */ + if (test_bit(Faulty, &rdev->flags)) + return true; + + /* Journal disk can only be removed if it's faulty. */ + if (test_bit(Journal, &rdev->flags)) + return false; + + /* + * 'In_sync' is cleared while 'raid_disk' is valid, which means + * replacement has just become active from pers->spare_active(), and + * then pers->hot_remove_disk() will replace this rdev with replacement. + */ + if (!test_bit(In_sync, &rdev->flags)) + return true; + + return false; +} + +static bool rdev_is_spare(struct md_rdev *rdev) +{ + return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags) && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags); +} + +static bool rdev_addable(struct md_rdev *rdev) +{ + /* rdev is already used, don't add it again. */ + if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || + test_bit(Faulty, &rdev->flags)) + return false; + + /* Allow to add journal disk. */ + if (test_bit(Journal, &rdev->flags)) + return true; + + /* Allow to add if array is read-write. */ + if (md_is_rdwr(rdev->mddev)) + return true; + + /* + * For read-only array, only allow to readd a rdev. And if bitmap is + * used, don't allow to readd a rdev that is too old. + */ + if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags)) + return true; + + return false; +} + +static bool md_spares_need_change(struct mddev *mddev) +{ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + if (rdev_removeable(rdev) || rdev_addable(rdev)) + return true; + return false; +} + static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this) { @@ -9186,12 +9269,8 @@ static int remove_and_add_spares(struct mddev *mddev, synchronize_rcu(); rdev_for_each(rdev, mddev) { if ((this == NULL || rdev == this) && - rdev->raid_disk >= 0 && - !test_bit(Blocked, &rdev->flags) && - ((test_bit(RemoveSynchronized, &rdev->flags) || - (!test_bit(In_sync, &rdev->flags) && - !test_bit(Journal, &rdev->flags))) && - atomic_read(&rdev->nr_pending)==0)) { + (test_bit(RemoveSynchronized, &rdev->flags) || + rdev_removeable(rdev))) { if (mddev->pers->hot_remove_disk( mddev, rdev) == 0) { sysfs_unlink_rdev(mddev, rdev); @@ -9213,25 +9292,12 @@ static int remove_and_add_spares(struct mddev *mddev, rdev_for_each(rdev, mddev) { if (this && this != rdev) continue; - if (test_bit(Candidate, &rdev->flags)) - continue; - if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags) && - !test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags)) + if (rdev_is_spare(rdev)) spares++; - if (rdev->raid_disk >= 0) - continue; - if (test_bit(Faulty, &rdev->flags)) + if (!rdev_addable(rdev)) continue; - if (!test_bit(Journal, &rdev->flags)) { - if (!md_is_rdwr(mddev) && - !(rdev->saved_raid_disk >= 0 && - !test_bit(Bitmap_sync, &rdev->flags))) - continue; - + if (!test_bit(Journal, &rdev->flags)) rdev->recovery_offset = 0; - } if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { /* failure here is OK */ sysfs_link_rdev(mddev, rdev); @@ -9247,9 +9313,86 @@ no_add: return spares; } +static bool md_choose_sync_action(struct mddev *mddev, int *spares) +{ + /* Check if reshape is in progress first. */ + if (mddev->reshape_position != MaxSector) { + if (mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev) != 0) + return false; + + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + return true; + } + + /* + * Remove any failed drives, then add spares if possible. Spares are + * also removed and re-added, to allow the personality to fail the + * re-add. + */ + *spares = remove_and_add_spares(mddev, NULL); + if (*spares) { + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + + /* Start new recovery. */ + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + return true; + } + + /* Check if recovery is in progress. */ + if (mddev->recovery_cp < MaxSector) { + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + return true; + } + + /* Delay to choose resync/check/repair in md_do_sync(). */ + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + return true; + + /* Nothing to be done */ + return false; +} + static void md_start_sync(struct work_struct *ws) { - struct mddev *mddev = container_of(ws, struct mddev, del_work); + struct mddev *mddev = container_of(ws, struct mddev, sync_work); + int spares = 0; + bool suspend = false; + + if (md_spares_need_change(mddev)) + suspend = true; + + suspend ? mddev_suspend_and_lock_nointr(mddev) : + mddev_lock_nointr(mddev); + + if (!md_is_rdwr(mddev)) { + /* + * On a read-only array we can: + * - remove failed devices + * - add already-in_sync devices if the array itself is in-sync. + * As we only add devices that are already in-sync, we can + * activate the spares immediately. + */ + remove_and_add_spares(mddev, NULL); + goto not_running; + } + + if (!md_choose_sync_action(mddev, &spares)) + goto not_running; + + if (!mddev->pers->sync_request) + goto not_running; + + /* + * We are adding a device or devices to an array which has the bitmap + * stored on all devices. So make sure all bitmap pages get written. + */ + if (spares) + md_bitmap_write_all(mddev->bitmap); rcu_assign_pointer(mddev->sync_thread, md_register_thread(md_do_sync, mddev, "resync")); @@ -9257,20 +9400,27 @@ static void md_start_sync(struct work_struct *ws) pr_warn("%s: could not start resync thread...\n", mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ - clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); - clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - wake_up(&resync_wait); - if (test_and_clear_bit(MD_RECOVERY_RECOVER, - &mddev->recovery)) - if (mddev->sysfs_action) - sysfs_notify_dirent_safe(mddev->sysfs_action); - } else - md_wakeup_thread(mddev->sync_thread); + goto not_running; + } + + suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); + md_wakeup_thread(mddev->sync_thread); sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(); + return; + +not_running: + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); + + wake_up(&resync_wait); + if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && + mddev->sysfs_action) + sysfs_notify_dirent_safe(mddev->sysfs_action); } /* @@ -9297,19 +9447,7 @@ static void md_start_sync(struct work_struct *ws) */ void md_check_recovery(struct mddev *mddev) { - if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) { - /* Write superblock - thread that called mddev_suspend() - * holds reconfig_mutex for us. - */ - set_bit(MD_UPDATING_SB, &mddev->flags); - smp_mb__after_atomic(); - if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags)) - md_update_sb(mddev, 0); - clear_bit_unlock(MD_UPDATING_SB, &mddev->flags); - wake_up(&mddev->sb_wait); - } - - if (is_md_suspended(mddev)) + if (READ_ONCE(mddev->suspended)) return; if (mddev->bitmap) @@ -9338,7 +9476,6 @@ void md_check_recovery(struct mddev *mddev) return; if (mddev_trylock(mddev)) { - int spares = 0; bool try_set_sync = mddev->safemode != 0; if (!mddev->external && mddev->safemode == 1) @@ -9346,30 +9483,43 @@ void md_check_recovery(struct mddev *mddev) if (!md_is_rdwr(mddev)) { struct md_rdev *rdev; + + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + /* sync_work already queued. */ + clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + goto unlock; + } + if (!mddev->external && mddev->in_sync) - /* 'Blocked' flag not needed as failed devices + /* + * 'Blocked' flag not needed as failed devices * will be recorded if array switched to read/write. * Leaving it set will prevent the device * from being removed. */ rdev_for_each(rdev, mddev) clear_bit(Blocked, &rdev->flags); - /* On a read-only array we can: - * - remove failed devices - * - add already-in_sync devices if the array itself - * is in-sync. - * As we only add devices that are already in-sync, - * we can activate the spares immediately. - */ - remove_and_add_spares(mddev, NULL); - /* There is no thread, but we need to call + + /* + * There is no thread, but we need to call * ->spare_active and clear saved_raid_disk */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); + + /* + * Let md_start_sync() to remove and add rdevs to the + * array. + */ + if (md_spares_need_change(mddev)) { + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + queue_work(md_misc_wq, &mddev->sync_work); + } + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); + goto unlock; } @@ -9425,56 +9575,14 @@ void md_check_recovery(struct mddev *mddev) clear_bit(MD_RECOVERY_INTR, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); - if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || - test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) - goto not_running; - /* no recovery is running. - * remove any failed drives, then - * add spares if possible. - * Spares are also removed and re-added, to allow - * the personality to fail the re-add. - */ - - if (mddev->reshape_position != MaxSector) { - if (mddev->pers->check_reshape == NULL || - mddev->pers->check_reshape(mddev) != 0) - /* Cannot proceed */ - goto not_running; - set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if ((spares = remove_and_add_spares(mddev, NULL))) { - clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); - clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if (mddev->recovery_cp < MaxSector) { - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - /* nothing to be done ... */ - goto not_running; - - if (mddev->pers->sync_request) { - if (spares) { - /* We are adding a device or devices to an array - * which has the bitmap stored on all devices. - * So make sure all bitmap pages get written - */ - md_bitmap_write_all(mddev->bitmap); - } - INIT_WORK(&mddev->del_work, md_start_sync); - queue_work(md_misc_wq, &mddev->del_work); - goto unlock; - } - not_running: - if (!mddev->sync_thread) { + if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && + !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { + queue_work(md_misc_wq, &mddev->sync_work); + } else { clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); wake_up(&resync_wait); - if (test_and_clear_bit(MD_RECOVERY_RECOVER, - &mddev->recovery)) - if (mddev->sysfs_action) - sysfs_notify_dirent_safe(mddev->sysfs_action); } + unlock: wake_up(&mddev->sb_wait); mddev_unlock(mddev); diff --git a/drivers/md/md.h b/drivers/md/md.h index 274e7d61d19f..ade83af123a2 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -246,10 +246,6 @@ struct md_cluster_info; * become failed. * @MD_HAS_PPL: The raid array has PPL feature set. * @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set. - * @MD_ALLOW_SB_UPDATE: md_check_recovery is allowed to update the metadata - * without taking reconfig_mutex. - * @MD_UPDATING_SB: md_check_recovery is updating the metadata without - * explicitly holding reconfig_mutex. * @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that * array is ready yet. * @MD_BROKEN: This is used to stop writes and mark array as failed. @@ -266,8 +262,6 @@ enum mddev_flags { MD_FAILFAST_SUPPORTED, MD_HAS_PPL, MD_HAS_MULTIPLE_PPLS, - MD_ALLOW_SB_UPDATE, - MD_UPDATING_SB, MD_NOT_READY, MD_BROKEN, MD_DELETED, @@ -314,6 +308,7 @@ struct mddev { unsigned long sb_flags; int suspended; + struct mutex suspend_mutex; struct percpu_ref active_io; int ro; int sysfs_active; /* set when sysfs deletes @@ -451,7 +446,10 @@ struct mddev { struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */ struct kernfs_node *sysfs_level; /*handle for 'level' */ - struct work_struct del_work; /* used for delayed sysfs removal */ + /* used for delayed sysfs removal */ + struct work_struct del_work; + /* used for register new sync thread */ + struct work_struct sync_work; /* "lock" protects: * flush_bio transition from NULL to !NULL @@ -565,23 +563,6 @@ enum recovery_flags { MD_RESYNCING_REMOTE, /* remote node is running resync thread */ }; -enum md_ro_state { - MD_RDWR, - MD_RDONLY, - MD_AUTO_READ, - MD_MAX_STATE -}; - -static inline bool md_is_rdwr(struct mddev *mddev) -{ - return (mddev->ro == MD_RDWR); -} - -static inline bool is_md_suspended(struct mddev *mddev) -{ - return percpu_ref_is_dying(&mddev->active_io); -} - static inline int __must_check mddev_lock(struct mddev *mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); @@ -641,7 +622,6 @@ struct md_personality int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); void (*update_reshape_pos) (struct mddev *mddev); - void (*prepare_suspend) (struct mddev *mddev); /* quiesce suspends or resumes internal processing. * 1 - stop new actions and wait for action io to complete * 0 - return to normal behaviour @@ -766,7 +746,6 @@ extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **t extern void md_wakeup_thread(struct md_thread __rcu *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); -extern int mddev_init_writes_pending(struct mddev *mddev); extern bool md_write_start(struct mddev *mddev, struct bio *bi); extern void md_write_inc(struct mddev *mddev, struct bio *bi); extern void md_write_end(struct mddev *mddev); @@ -793,7 +772,8 @@ extern int md_integrity_register(struct mddev *mddev); extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); -extern void mddev_init(struct mddev *mddev); +extern int mddev_init(struct mddev *mddev); +extern void mddev_destroy(struct mddev *mddev); struct mddev *md_alloc(dev_t dev, char *name); void mddev_put(struct mddev *mddev); extern int md_run(struct mddev *mddev); @@ -804,15 +784,14 @@ extern int md_rdev_init(struct md_rdev *rdev); extern void md_rdev_clear(struct md_rdev *rdev); extern void md_handle_request(struct mddev *mddev, struct bio *bio); -extern void mddev_suspend(struct mddev *mddev); +extern int mddev_suspend(struct mddev *mddev, bool interruptible); extern void mddev_resume(struct mddev *mddev); extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); -extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend); -extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend); +extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev); +extern void mddev_destroy_serial_pool(struct mddev *mddev, + struct md_rdev *rdev); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); @@ -850,6 +829,33 @@ static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio mddev->queue->limits.max_write_zeroes_sectors = 0; } +static inline int mddev_suspend_and_lock(struct mddev *mddev) +{ + int ret; + + ret = mddev_suspend(mddev, true); + if (ret) + return ret; + + ret = mddev_lock(mddev); + if (ret) + mddev_resume(mddev); + + return ret; +} + +static inline void mddev_suspend_and_lock_nointr(struct mddev *mddev) +{ + mddev_suspend(mddev, false); + mutex_lock(&mddev->reconfig_mutex); +} + +static inline void mddev_unlock_and_resume(struct mddev *mddev) +{ + mddev_unlock(mddev); + mddev_resume(mddev); +} + struct mdu_array_info_s; struct mdu_disk_info_s; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 2aabac773fe7..35d12948e0a9 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1345,6 +1345,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, int first_clone; int max_sectors; bool write_behind = false; + bool is_discard = (bio_op(bio) == REQ_OP_DISCARD); if (mddev_is_clustered(mddev) && md_cluster_ops->area_resyncing(mddev, WRITE, @@ -1405,7 +1406,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, * write-mostly, which means we could allocate write behind * bio later. */ - if (rdev && test_bit(WriteMostly, &rdev->flags)) + if (!is_discard && rdev && test_bit(WriteMostly, &rdev->flags)) write_behind = true; if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { @@ -3122,8 +3123,7 @@ static int raid1_run(struct mddev *mddev) mdname(mddev)); return -EIO; } - if (mddev_init_writes_pending(mddev) < 0) - return -ENOMEM; + /* * copy the already verified devices into our private RAID1 * bookkeeping area. [whatever we allocate in run(), diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 023413120851..a5927e98dc67 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4154,9 +4154,6 @@ static int raid10_run(struct mddev *mddev) sector_t min_offset_diff = 0; int first = 1; - if (mddev_init_writes_pending(mddev) < 0) - return -ENOMEM; - if (mddev->private == NULL) { conf = setup_conf(mddev); if (IS_ERR(conf)) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 518b7cfa78b9..6157f5beb9fe 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -327,8 +327,9 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space); void r5c_check_stripe_cache_usage(struct r5conf *conf) { int total_cached; + struct r5l_log *log = READ_ONCE(conf->log); - if (!r5c_is_writeback(conf->log)) + if (!r5c_is_writeback(log)) return; total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + @@ -344,7 +345,7 @@ void r5c_check_stripe_cache_usage(struct r5conf *conf) */ if (total_cached > conf->min_nr_stripes * 1 / 2 || atomic_read(&conf->empty_inactive_list_nr) > 0) - r5l_wake_reclaim(conf->log, 0); + r5l_wake_reclaim(log, 0); } /* @@ -353,7 +354,9 @@ void r5c_check_stripe_cache_usage(struct r5conf *conf) */ void r5c_check_cached_full_stripe(struct r5conf *conf) { - if (!r5c_is_writeback(conf->log)) + struct r5l_log *log = READ_ONCE(conf->log); + + if (!r5c_is_writeback(log)) return; /* @@ -363,7 +366,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf) if (atomic_read(&conf->r5c_cached_full_stripes) >= min(R5C_FULL_STRIPE_FLUSH_BATCH(conf), conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf))) - r5l_wake_reclaim(conf->log, 0); + r5l_wake_reclaim(log, 0); } /* @@ -396,7 +399,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf) */ static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) { - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); if (!r5c_is_writeback(log)) return 0; @@ -449,7 +452,7 @@ static inline void r5c_update_log_state(struct r5l_log *log) void r5c_make_stripe_write_out(struct stripe_head *sh) { struct r5conf *conf = sh->raid_conf; - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); BUG_ON(!r5c_is_writeback(log)); @@ -491,7 +494,7 @@ static void r5c_handle_parity_cached(struct stripe_head *sh) */ static void r5c_finish_cache_stripe(struct stripe_head *sh) { - struct r5l_log *log = sh->raid_conf->log; + struct r5l_log *log = READ_ONCE(sh->raid_conf->log); if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); @@ -683,7 +686,6 @@ static void r5c_disable_writeback_async(struct work_struct *work) disable_writeback_work); struct mddev *mddev = log->rdev->mddev; struct r5conf *conf = mddev->private; - int locked = 0; if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) return; @@ -692,14 +694,14 @@ static void r5c_disable_writeback_async(struct work_struct *work) /* wait superblock change before suspend */ wait_event(mddev->sb_wait, - conf->log == NULL || - (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && - (locked = mddev_trylock(mddev)))); - if (locked) { - mddev_suspend(mddev); + !READ_ONCE(conf->log) || + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); + + log = READ_ONCE(conf->log); + if (log) { + mddev_suspend(mddev, false); log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; mddev_resume(mddev); - mddev_unlock(mddev); } } @@ -1151,7 +1153,7 @@ static void r5l_run_no_space_stripes(struct r5l_log *log) static sector_t r5c_calculate_new_cp(struct r5conf *conf) { struct stripe_head *sh; - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); sector_t new_cp; unsigned long flags; @@ -1159,12 +1161,12 @@ static sector_t r5c_calculate_new_cp(struct r5conf *conf) return log->next_checkpoint; spin_lock_irqsave(&log->stripe_in_journal_lock, flags); - if (list_empty(&conf->log->stripe_in_journal_list)) { + if (list_empty(&log->stripe_in_journal_list)) { /* all stripes flushed */ spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); return log->next_checkpoint; } - sh = list_first_entry(&conf->log->stripe_in_journal_list, + sh = list_first_entry(&log->stripe_in_journal_list, struct stripe_head, r5c); new_cp = sh->log_start; spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); @@ -1399,7 +1401,7 @@ void r5c_flush_cache(struct r5conf *conf, int num) struct stripe_head *sh, *next; lockdep_assert_held(&conf->device_lock); - if (!conf->log) + if (!READ_ONCE(conf->log)) return; count = 0; @@ -1420,7 +1422,7 @@ void r5c_flush_cache(struct r5conf *conf, int num) static void r5c_do_reclaim(struct r5conf *conf) { - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); struct stripe_head *sh; int count = 0; unsigned long flags; @@ -1549,7 +1551,7 @@ static void r5l_reclaim_thread(struct md_thread *thread) { struct mddev *mddev = thread->mddev; struct r5conf *conf = mddev->private; - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); if (!log) return; @@ -1591,7 +1593,7 @@ void r5l_quiesce(struct r5l_log *log, int quiesce) bool r5l_log_disk_error(struct r5conf *conf) { - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); /* don't allow write if journal disk is missing */ if (!log) @@ -2583,9 +2585,7 @@ int r5c_journal_mode_set(struct mddev *mddev, int mode) mode == R5C_JOURNAL_MODE_WRITE_BACK) return -EINVAL; - mddev_suspend(mddev); conf->log->r5c_journal_mode = mode; - mddev_resume(mddev); pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", mdname(mddev), mode, r5c_journal_mode_str[mode]); @@ -2610,11 +2610,11 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev, if (strlen(r5c_journal_mode_str[mode]) == len && !strncmp(page, r5c_journal_mode_str[mode], len)) break; - ret = mddev_lock(mddev); + ret = mddev_suspend_and_lock(mddev); if (ret) return ret; ret = r5c_journal_mode_set(mddev, mode); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return ret ?: length; } @@ -2635,7 +2635,7 @@ int r5c_try_caching_write(struct r5conf *conf, struct stripe_head_state *s, int disks) { - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); int i; struct r5dev *dev; int to_cache = 0; @@ -2802,7 +2802,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, struct stripe_head_state *s) { - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); int i; int do_wakeup = 0; sector_t tree_index; @@ -2941,7 +2941,7 @@ int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh) /* check whether this big stripe is in write back cache. */ bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect) { - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); sector_t tree_index; void *slot; @@ -3049,14 +3049,14 @@ int r5l_start(struct r5l_log *log) void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev) { struct r5conf *conf = mddev->private; - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); if (!log) return; if ((raid5_calc_degraded(conf) > 0 || test_bit(Journal, &rdev->flags)) && - conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) + log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) schedule_work(&log->disable_writeback_work); } @@ -3145,7 +3145,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) spin_lock_init(&log->stripe_in_journal_lock); atomic_set(&log->stripe_in_journal_count, 0); - conf->log = log; + WRITE_ONCE(conf->log, log); set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); return 0; @@ -3173,7 +3173,7 @@ void r5l_exit_log(struct r5conf *conf) * 'reconfig_mutex' is held by caller, set 'confg->log' to NULL to * ensure disable_writeback_work wakes up and exits. */ - conf->log = NULL; + WRITE_ONCE(conf->log, NULL); wake_up(&conf->mddev->sb_wait); flush_work(&log->disable_writeback_work); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 284cd71bcc68..c84ccc97329b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -70,6 +70,8 @@ MODULE_PARM_DESC(devices_handle_discard_safely, "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); static struct workqueue_struct *raid5_wq; +static void raid5_quiesce(struct mddev *mddev, int quiesce); + static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) { int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK; @@ -2499,15 +2501,12 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) unsigned long cpu; int err = 0; - /* - * Never shrink. And mddev_suspend() could deadlock if this is called - * from raid5d. In that case, scribble_disks and scribble_sectors - * should equal to new_disks and new_sectors - */ + /* Never shrink. */ if (conf->scribble_disks >= new_disks && conf->scribble_sectors >= new_sectors) return 0; - mddev_suspend(conf->mddev); + + raid5_quiesce(conf->mddev, true); cpus_read_lock(); for_each_present_cpu(cpu) { @@ -2521,7 +2520,8 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) } cpus_read_unlock(); - mddev_resume(conf->mddev); + raid5_quiesce(conf->mddev, false); + if (!err) { conf->scribble_disks = new_disks; conf->scribble_sectors = new_sectors; @@ -5960,19 +5960,6 @@ out: return ret; } -static bool reshape_inprogress(struct mddev *mddev) -{ - return test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && - !test_bit(MD_RECOVERY_DONE, &mddev->recovery) && - !test_bit(MD_RECOVERY_INTR, &mddev->recovery); -} - -static bool reshape_disabled(struct mddev *mddev) -{ - return is_md_suspended(mddev) || !md_is_rdwr(mddev); -} - static enum stripe_result make_stripe_request(struct mddev *mddev, struct r5conf *conf, struct stripe_request_ctx *ctx, sector_t logical_sector, struct bio *bi) @@ -6004,8 +5991,7 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, if (ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) { spin_unlock_irq(&conf->device_lock); - ret = STRIPE_SCHEDULE_AND_RETRY; - goto out; + return STRIPE_SCHEDULE_AND_RETRY; } } spin_unlock_irq(&conf->device_lock); @@ -6084,15 +6070,6 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, out_release: raid5_release_stripe(sh); -out: - if (ret == STRIPE_SCHEDULE_AND_RETRY && !reshape_inprogress(mddev) && - reshape_disabled(mddev)) { - bi->bi_status = BLK_STS_IOERR; - ret = STRIPE_FAIL; - pr_err("md/raid456:%s: io failed across reshape position while reshape can't make progress.\n", - mdname(mddev)); - } - return ret; } @@ -7032,7 +7009,7 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len) new != roundup_pow_of_two(new)) return -EINVAL; - err = mddev_lock(mddev); + err = mddev_suspend_and_lock(mddev); if (err) return err; @@ -7056,7 +7033,6 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len) goto out_unlock; } - mddev_suspend(mddev); mutex_lock(&conf->cache_size_mutex); size = conf->max_nr_stripes; @@ -7071,10 +7047,9 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len) err = -ENOMEM; } mutex_unlock(&conf->cache_size_mutex); - mddev_resume(mddev); out_unlock: - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return err ?: len; } @@ -7160,7 +7135,7 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) return -EINVAL; new = !!new; - err = mddev_lock(mddev); + err = mddev_suspend_and_lock(mddev); if (err) return err; conf = mddev->private; @@ -7169,15 +7144,13 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) else if (new != conf->skip_copy) { struct request_queue *q = mddev->queue; - mddev_suspend(mddev); conf->skip_copy = new; if (new) blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); else blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q); - mddev_resume(mddev); } - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return err ?: len; } @@ -7232,15 +7205,13 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) if (new > 8192) return -EINVAL; - err = mddev_lock(mddev); + err = mddev_suspend_and_lock(mddev); if (err) return err; conf = mddev->private; if (!conf) err = -ENODEV; else if (new != conf->worker_cnt_per_group) { - mddev_suspend(mddev); - old_groups = conf->worker_groups; if (old_groups) flush_workqueue(raid5_wq); @@ -7257,9 +7228,8 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) kfree(old_groups[0].workers); kfree(old_groups); } - mddev_resume(mddev); } - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return err ?: len; } @@ -7785,9 +7755,6 @@ static int raid5_run(struct mddev *mddev) long long min_offset_diff = 0; int first = 1; - if (mddev_init_writes_pending(mddev) < 0) - return -ENOMEM; - if (mddev->recovery_cp != MaxSector) pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", mdname(mddev)); @@ -8568,8 +8535,8 @@ static int raid5_start_reshape(struct mddev *mddev) * the reshape wasn't running - like Discard or Read - have * completed. */ - mddev_suspend(mddev); - mddev_resume(mddev); + raid5_quiesce(mddev, true); + raid5_quiesce(mddev, false); /* Add some new drives, as many as will fit. * We know there are enough to make the newly sized array work. @@ -8984,12 +8951,12 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) struct r5conf *conf; int err; - err = mddev_lock(mddev); + err = mddev_suspend_and_lock(mddev); if (err) return err; conf = mddev->private; if (!conf) { - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return -ENODEV; } @@ -8999,19 +8966,14 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) err = log_init(conf, NULL, true); if (!err) { err = resize_stripes(conf, conf->pool_size); - if (err) { - mddev_suspend(mddev); + if (err) log_exit(conf); - mddev_resume(mddev); - } } } else err = -EINVAL; } else if (strncmp(buf, "resync", 6) == 0) { if (raid5_has_ppl(conf)) { - mddev_suspend(mddev); log_exit(conf); - mddev_resume(mddev); err = resize_stripes(conf, conf->pool_size); } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) && r5l_log_disk_error(conf)) { @@ -9024,11 +8986,9 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) break; } - if (!journal_dev_exists) { - mddev_suspend(mddev); + if (!journal_dev_exists) clear_bit(MD_HAS_JOURNAL, &mddev->flags); - mddev_resume(mddev); - } else /* need remove journal device first */ + else /* need remove journal device first */ err = -EBUSY; } else err = -EINVAL; @@ -9039,7 +8999,7 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) if (!err) md_update_sb(mddev, 1); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return err; } @@ -9051,22 +9011,6 @@ static int raid5_start(struct mddev *mddev) return r5l_start(conf->log); } -static void raid5_prepare_suspend(struct mddev *mddev) -{ - struct r5conf *conf = mddev->private; - - wait_event(mddev->sb_wait, !reshape_inprogress(mddev) || - percpu_ref_is_zero(&mddev->active_io)); - if (percpu_ref_is_zero(&mddev->active_io)) - return; - - /* - * Reshape is not in progress, and array is suspended, io that is - * waiting for reshpape can never be done. - */ - wake_up(&conf->wait_for_overlap); -} - static struct md_personality raid6_personality = { .name = "raid6", @@ -9087,7 +9031,6 @@ static struct md_personality raid6_personality = .check_reshape = raid6_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, - .prepare_suspend = raid5_prepare_suspend, .quiesce = raid5_quiesce, .takeover = raid6_takeover, .change_consistency_policy = raid5_change_consistency_policy, @@ -9112,7 +9055,6 @@ static struct md_personality raid5_personality = .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, - .prepare_suspend = raid5_prepare_suspend, .quiesce = raid5_quiesce, .takeover = raid5_takeover, .change_consistency_policy = raid5_change_consistency_policy, @@ -9138,7 +9080,6 @@ static struct md_personality raid4_personality = .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, - .prepare_suspend = raid5_prepare_suspend, .quiesce = raid5_quiesce, .takeover = raid4_takeover, .change_consistency_policy = raid5_change_consistency_policy, diff --git a/drivers/nvme/common/Kconfig b/drivers/nvme/common/Kconfig index 4514f44362dd..06c8df00d1e2 100644 --- a/drivers/nvme/common/Kconfig +++ b/drivers/nvme/common/Kconfig @@ -2,3 +2,16 @@ config NVME_COMMON tristate + +config NVME_KEYRING + bool + select KEYS + +config NVME_AUTH + bool + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_SHA256 + select CRYPTO_SHA512 + select CRYPTO_DH + select CRYPTO_DH_RFC7919_GROUPS diff --git a/drivers/nvme/common/Makefile b/drivers/nvme/common/Makefile index 720c625b8a52..0cbd0b0b8d49 100644 --- a/drivers/nvme/common/Makefile +++ b/drivers/nvme/common/Makefile @@ -4,4 +4,5 @@ ccflags-y += -I$(src) obj-$(CONFIG_NVME_COMMON) += nvme-common.o -nvme-common-y += auth.o +nvme-common-$(CONFIG_NVME_AUTH) += auth.o +nvme-common-$(CONFIG_NVME_KEYRING) += keyring.o diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index d90e4f0c08b7..a8e87dfbeab2 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -150,6 +150,14 @@ size_t nvme_auth_hmac_hash_len(u8 hmac_id) } EXPORT_SYMBOL_GPL(nvme_auth_hmac_hash_len); +u32 nvme_auth_key_struct_size(u32 key_len) +{ + struct nvme_dhchap_key key; + + return struct_size(&key, key, key_len); +} +EXPORT_SYMBOL_GPL(nvme_auth_key_struct_size); + struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret, u8 key_hash) { @@ -163,14 +171,9 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret, p = strrchr(secret, ':'); if (p) allocated_len = p - secret; - key = kzalloc(sizeof(*key), GFP_KERNEL); + key = nvme_auth_alloc_key(allocated_len, 0); if (!key) return ERR_PTR(-ENOMEM); - key->key = kzalloc(allocated_len, GFP_KERNEL); - if (!key->key) { - ret = -ENOMEM; - goto out_free_key; - } key_len = base64_decode(secret, allocated_len, key->key); if (key_len < 0) { @@ -187,14 +190,6 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret, goto out_free_secret; } - if (key_hash > 0 && - (key_len - 4) != nvme_auth_hmac_hash_len(key_hash)) { - pr_err("Mismatched key len %d for %s\n", key_len, - nvme_auth_hmac_name(key_hash)); - ret = -EINVAL; - goto out_free_secret; - } - /* The last four bytes is the CRC in little-endian format */ key_len -= 4; /* @@ -213,37 +208,51 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret, key->hash = key_hash; return key; out_free_secret: - kfree_sensitive(key->key); -out_free_key: - kfree(key); + nvme_auth_free_key(key); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(nvme_auth_extract_key); +struct nvme_dhchap_key *nvme_auth_alloc_key(u32 len, u8 hash) +{ + u32 num_bytes = nvme_auth_key_struct_size(len); + struct nvme_dhchap_key *key = kzalloc(num_bytes, GFP_KERNEL); + + if (key) { + key->len = len; + key->hash = hash; + } + return key; +} +EXPORT_SYMBOL_GPL(nvme_auth_alloc_key); + void nvme_auth_free_key(struct nvme_dhchap_key *key) { if (!key) return; - kfree_sensitive(key->key); - kfree(key); + kfree_sensitive(key); } EXPORT_SYMBOL_GPL(nvme_auth_free_key); -u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn) +struct nvme_dhchap_key *nvme_auth_transform_key( + struct nvme_dhchap_key *key, char *nqn) { const char *hmac_name; struct crypto_shash *key_tfm; struct shash_desc *shash; - u8 *transformed_key; - int ret; + struct nvme_dhchap_key *transformed_key; + int ret, key_len; - if (!key || !key->key) { + if (!key) { pr_warn("No key specified\n"); return ERR_PTR(-ENOKEY); } if (key->hash == 0) { - transformed_key = kmemdup(key->key, key->len, GFP_KERNEL); - return transformed_key ? transformed_key : ERR_PTR(-ENOMEM); + key_len = nvme_auth_key_struct_size(key->len); + transformed_key = kmemdup(key, key_len, GFP_KERNEL); + if (!transformed_key) + return ERR_PTR(-ENOMEM); + return transformed_key; } hmac_name = nvme_auth_hmac_name(key->hash); if (!hmac_name) { @@ -253,7 +262,7 @@ u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn) key_tfm = crypto_alloc_shash(hmac_name, 0, 0); if (IS_ERR(key_tfm)) - return (u8 *)key_tfm; + return ERR_CAST(key_tfm); shash = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(key_tfm), @@ -263,7 +272,8 @@ u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn) goto out_free_key; } - transformed_key = kzalloc(crypto_shash_digestsize(key_tfm), GFP_KERNEL); + key_len = crypto_shash_digestsize(key_tfm); + transformed_key = nvme_auth_alloc_key(key_len, key->hash); if (!transformed_key) { ret = -ENOMEM; goto out_free_shash; @@ -282,7 +292,7 @@ u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn) ret = crypto_shash_update(shash, "NVMe-over-Fabrics", 17); if (ret < 0) goto out_free_transformed_key; - ret = crypto_shash_final(shash, transformed_key); + ret = crypto_shash_final(shash, transformed_key->key); if (ret < 0) goto out_free_transformed_key; @@ -292,7 +302,7 @@ u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn) return transformed_key; out_free_transformed_key: - kfree_sensitive(transformed_key); + nvme_auth_free_key(transformed_key); out_free_shash: kfree(shash); out_free_key: diff --git a/drivers/nvme/common/keyring.c b/drivers/nvme/common/keyring.c new file mode 100644 index 000000000000..f8d9a208397b --- /dev/null +++ b/drivers/nvme/common/keyring.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023 Hannes Reinecke, SUSE Labs + */ + +#include <linux/module.h> +#include <linux/seq_file.h> +#include <linux/key.h> +#include <linux/key-type.h> +#include <keys/user-type.h> +#include <linux/nvme.h> +#include <linux/nvme-tcp.h> +#include <linux/nvme-keyring.h> + +static struct key *nvme_keyring; + +key_serial_t nvme_keyring_id(void) +{ + return nvme_keyring->serial; +} +EXPORT_SYMBOL_GPL(nvme_keyring_id); + +static void nvme_tls_psk_describe(const struct key *key, struct seq_file *m) +{ + seq_puts(m, key->description); + seq_printf(m, ": %u", key->datalen); +} + +static bool nvme_tls_psk_match(const struct key *key, + const struct key_match_data *match_data) +{ + const char *match_id; + size_t match_len; + + if (!key->description) { + pr_debug("%s: no key description\n", __func__); + return false; + } + match_len = strlen(key->description); + pr_debug("%s: id %s len %zd\n", __func__, key->description, match_len); + + if (!match_data->raw_data) { + pr_debug("%s: no match data\n", __func__); + return false; + } + match_id = match_data->raw_data; + pr_debug("%s: match '%s' '%s' len %zd\n", + __func__, match_id, key->description, match_len); + return !memcmp(key->description, match_id, match_len); +} + +static int nvme_tls_psk_match_preparse(struct key_match_data *match_data) +{ + match_data->lookup_type = KEYRING_SEARCH_LOOKUP_ITERATE; + match_data->cmp = nvme_tls_psk_match; + return 0; +} + +static struct key_type nvme_tls_psk_key_type = { + .name = "psk", + .flags = KEY_TYPE_NET_DOMAIN, + .preparse = user_preparse, + .free_preparse = user_free_preparse, + .match_preparse = nvme_tls_psk_match_preparse, + .instantiate = generic_key_instantiate, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = nvme_tls_psk_describe, + .read = user_read, +}; + +static struct key *nvme_tls_psk_lookup(struct key *keyring, + const char *hostnqn, const char *subnqn, + int hmac, bool generated) +{ + char *identity; + size_t identity_len = (NVMF_NQN_SIZE) * 2 + 11; + key_ref_t keyref; + key_serial_t keyring_id; + + identity = kzalloc(identity_len, GFP_KERNEL); + if (!identity) + return ERR_PTR(-ENOMEM); + + snprintf(identity, identity_len, "NVMe0%c%02d %s %s", + generated ? 'G' : 'R', hmac, hostnqn, subnqn); + + if (!keyring) + keyring = nvme_keyring; + keyring_id = key_serial(keyring); + pr_debug("keyring %x lookup tls psk '%s'\n", + keyring_id, identity); + keyref = keyring_search(make_key_ref(keyring, true), + &nvme_tls_psk_key_type, + identity, false); + if (IS_ERR(keyref)) { + pr_debug("lookup tls psk '%s' failed, error %ld\n", + identity, PTR_ERR(keyref)); + kfree(identity); + return ERR_PTR(-ENOKEY); + } + kfree(identity); + + return key_ref_to_ptr(keyref); +} + +/* + * NVMe PSK priority list + * + * 'Retained' PSKs (ie 'generated == false') + * should be preferred to 'generated' PSKs, + * and SHA-384 should be preferred to SHA-256. + */ +struct nvme_tls_psk_priority_list { + bool generated; + enum nvme_tcp_tls_cipher cipher; +} nvme_tls_psk_prio[] = { + { .generated = false, + .cipher = NVME_TCP_TLS_CIPHER_SHA384, }, + { .generated = false, + .cipher = NVME_TCP_TLS_CIPHER_SHA256, }, + { .generated = true, + .cipher = NVME_TCP_TLS_CIPHER_SHA384, }, + { .generated = true, + .cipher = NVME_TCP_TLS_CIPHER_SHA256, }, +}; + +/* + * nvme_tls_psk_default - Return the preferred PSK to use for TLS ClientHello + */ +key_serial_t nvme_tls_psk_default(struct key *keyring, + const char *hostnqn, const char *subnqn) +{ + struct key *tls_key; + key_serial_t tls_key_id; + int prio; + + for (prio = 0; prio < ARRAY_SIZE(nvme_tls_psk_prio); prio++) { + bool generated = nvme_tls_psk_prio[prio].generated; + enum nvme_tcp_tls_cipher cipher = nvme_tls_psk_prio[prio].cipher; + + tls_key = nvme_tls_psk_lookup(keyring, hostnqn, subnqn, + cipher, generated); + if (!IS_ERR(tls_key)) { + tls_key_id = tls_key->serial; + key_put(tls_key); + return tls_key_id; + } + } + return 0; +} +EXPORT_SYMBOL_GPL(nvme_tls_psk_default); + +int nvme_keyring_init(void) +{ + int err; + + nvme_keyring = keyring_alloc(".nvme", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + current_cred(), + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + (KEY_USR_ALL & ~KEY_USR_SETATTR), + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + if (IS_ERR(nvme_keyring)) + return PTR_ERR(nvme_keyring); + + err = register_key_type(&nvme_tls_psk_key_type); + if (err) { + key_put(nvme_keyring); + return err; + } + return 0; +} +EXPORT_SYMBOL_GPL(nvme_keyring_init); + +void nvme_keyring_exit(void) +{ + unregister_key_type(&nvme_tls_psk_key_type); + key_revoke(nvme_keyring); + key_put(nvme_keyring); +} +EXPORT_SYMBOL_GPL(nvme_keyring_exit); diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 2f6a7f8c94e8..48f7d72de5e9 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -92,16 +92,26 @@ config NVME_TCP If unsure, say N. -config NVME_AUTH +config NVME_TCP_TLS + bool "NVMe over Fabrics TCP TLS encryption support" + depends on NVME_TCP + select NVME_COMMON + select NVME_KEYRING + select NET_HANDSHAKE + select KEYS + help + Enables TLS encryption for NVMe TCP using the netlink handshake API. + + The TLS handshake daemon is availble at + https://github.com/oracle/ktls-utils. + + If unsure, say N. + +config NVME_HOST_AUTH bool "NVM Express over Fabrics In-Band Authentication" depends on NVME_CORE select NVME_COMMON - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_SHA256 - select CRYPTO_SHA512 - select CRYPTO_DH - select CRYPTO_DH_RFC7919_GROUPS + select NVME_AUTH help This provides support for NVMe over Fabrics In-Band Authentication. diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index c7c3cf202d12..6414ec968f99 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -17,7 +17,7 @@ nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o -nvme-core-$(CONFIG_NVME_AUTH) += auth.o +nvme-core-$(CONFIG_NVME_HOST_AUTH) += auth.o nvme-y += pci.o diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 064592a5d546..eaefebb2a799 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -23,6 +23,7 @@ struct nvme_dhchap_queue_context { struct nvme_ctrl *ctrl; struct crypto_shash *shash_tfm; struct crypto_kpp *dh_tfm; + struct nvme_dhchap_key *transformed_key; void *buf; int qid; int error; @@ -36,7 +37,6 @@ struct nvme_dhchap_queue_context { u8 c1[64]; u8 c2[64]; u8 response[64]; - u8 *host_response; u8 *ctrl_key; u8 *host_key; u8 *sess_key; @@ -428,12 +428,12 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl, dev_dbg(ctrl->device, "%s: qid %d host response seq %u transaction %d\n", __func__, chap->qid, chap->s1, chap->transaction); - if (!chap->host_response) { - chap->host_response = nvme_auth_transform_key(ctrl->host_key, + if (!chap->transformed_key) { + chap->transformed_key = nvme_auth_transform_key(ctrl->host_key, ctrl->opts->host->nqn); - if (IS_ERR(chap->host_response)) { - ret = PTR_ERR(chap->host_response); - chap->host_response = NULL; + if (IS_ERR(chap->transformed_key)) { + ret = PTR_ERR(chap->transformed_key); + chap->transformed_key = NULL; return ret; } } else { @@ -442,7 +442,7 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl, } ret = crypto_shash_setkey(chap->shash_tfm, - chap->host_response, ctrl->host_key->len); + chap->transformed_key->key, chap->transformed_key->len); if (ret) { dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n", chap->qid, ret); @@ -508,19 +508,19 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl, struct nvme_dhchap_queue_context *chap) { SHASH_DESC_ON_STACK(shash, chap->shash_tfm); - u8 *ctrl_response; + struct nvme_dhchap_key *transformed_key; u8 buf[4], *challenge = chap->c2; int ret; - ctrl_response = nvme_auth_transform_key(ctrl->ctrl_key, + transformed_key = nvme_auth_transform_key(ctrl->ctrl_key, ctrl->opts->subsysnqn); - if (IS_ERR(ctrl_response)) { - ret = PTR_ERR(ctrl_response); + if (IS_ERR(transformed_key)) { + ret = PTR_ERR(transformed_key); return ret; } ret = crypto_shash_setkey(chap->shash_tfm, - ctrl_response, ctrl->ctrl_key->len); + transformed_key->key, transformed_key->len); if (ret) { dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n", chap->qid, ret); @@ -586,7 +586,7 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl, out: if (challenge != chap->c2) kfree(challenge); - kfree(ctrl_response); + nvme_auth_free_key(transformed_key); return ret; } @@ -648,8 +648,8 @@ gen_sesskey: static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap) { - kfree_sensitive(chap->host_response); - chap->host_response = NULL; + nvme_auth_free_key(chap->transformed_key); + chap->transformed_key = NULL; kfree_sensitive(chap->host_key); chap->host_key = NULL; chap->host_key_len = 0; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 21783aa2ee8e..62612f87aafa 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -25,6 +25,7 @@ #include "nvme.h" #include "fabrics.h" #include <linux/nvme-auth.h> +#include <linux/nvme-keyring.h> #define CREATE_TRACE_POINTS #include "trace.h" @@ -420,7 +421,7 @@ void nvme_complete_rq(struct request *req) nvme_failover_req(req); return; case AUTHENTICATE: -#ifdef CONFIG_NVME_AUTH +#ifdef CONFIG_NVME_HOST_AUTH queue_work(nvme_wq, &ctrl->dhchap_auth_work); nvme_retry_req(req); #else @@ -4399,7 +4400,7 @@ static void nvme_free_ctrl(struct device *dev) if (!subsys || ctrl->instance != subsys->instance) ida_free(&nvme_instance_ida, ctrl->instance); - + key_put(ctrl->tls_key); nvme_free_cels(ctrl); nvme_mpath_uninit(ctrl); nvme_auth_stop(ctrl); @@ -4723,12 +4724,16 @@ static int __init nvme_core_init(void) result = PTR_ERR(nvme_ns_chr_class); goto unregister_generic_ns; } - - result = nvme_init_auth(); + result = nvme_keyring_init(); if (result) goto destroy_ns_chr; + result = nvme_init_auth(); + if (result) + goto keyring_exit; return 0; +keyring_exit: + nvme_keyring_exit(); destroy_ns_chr: class_destroy(nvme_ns_chr_class); unregister_generic_ns: @@ -4752,6 +4757,7 @@ out: static void __exit nvme_core_exit(void) { nvme_exit_auth(); + nvme_keyring_exit(); class_destroy(nvme_ns_chr_class); class_destroy(nvme_subsys_class); class_destroy(nvme_class); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 8175d49f2909..4673ead69c5f 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -12,6 +12,7 @@ #include <linux/seq_file.h> #include "nvme.h" #include "fabrics.h" +#include <linux/nvme-keyring.h> static LIST_HEAD(nvmf_transports); static DECLARE_RWSEM(nvmf_transports_rwsem); @@ -622,6 +623,23 @@ static struct nvmf_transport_ops *nvmf_lookup_transport( return NULL; } +static struct key *nvmf_parse_key(int key_id) +{ + struct key *key; + + if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) { + pr_err("TLS is not supported\n"); + return ERR_PTR(-EINVAL); + } + + key = key_lookup(key_id); + if (!IS_ERR(key)) + pr_err("key id %08x not found\n", key_id); + else + pr_debug("Using key id %08x\n", key_id); + return key; +} + static const match_table_t opt_tokens = { { NVMF_OPT_TRANSPORT, "transport=%s" }, { NVMF_OPT_TRADDR, "traddr=%s" }, @@ -643,10 +661,17 @@ static const match_table_t opt_tokens = { { NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" }, { NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" }, { NVMF_OPT_TOS, "tos=%d" }, +#ifdef CONFIG_NVME_TCP_TLS + { NVMF_OPT_KEYRING, "keyring=%d" }, + { NVMF_OPT_TLS_KEY, "tls_key=%d" }, +#endif { NVMF_OPT_FAIL_FAST_TMO, "fast_io_fail_tmo=%d" }, { NVMF_OPT_DISCOVERY, "discovery" }, { NVMF_OPT_DHCHAP_SECRET, "dhchap_secret=%s" }, { NVMF_OPT_DHCHAP_CTRL_SECRET, "dhchap_ctrl_secret=%s" }, +#ifdef CONFIG_NVME_TCP_TLS + { NVMF_OPT_TLS, "tls" }, +#endif { NVMF_OPT_ERR, NULL } }; @@ -657,9 +682,10 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, char *options, *o, *p; int token, ret = 0; size_t nqnlen = 0; - int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO; + int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO, key_id; uuid_t hostid; char hostnqn[NVMF_NQN_SIZE]; + struct key *key; /* Set defaults */ opts->queue_size = NVMF_DEF_QUEUE_SIZE; @@ -671,6 +697,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->hdr_digest = false; opts->data_digest = false; opts->tos = -1; /* < 0 == use transport default */ + opts->tls = false; + opts->tls_key = NULL; + opts->keyring = NULL; options = o = kstrdup(buf, GFP_KERNEL); if (!options) @@ -924,6 +953,32 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } opts->tos = token; break; + case NVMF_OPT_KEYRING: + if (match_int(args, &key_id) || key_id <= 0) { + ret = -EINVAL; + goto out; + } + key = nvmf_parse_key(key_id); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto out; + } + key_put(opts->keyring); + opts->keyring = key; + break; + case NVMF_OPT_TLS_KEY: + if (match_int(args, &key_id) || key_id <= 0) { + ret = -EINVAL; + goto out; + } + key = nvmf_parse_key(key_id); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto out; + } + key_put(opts->tls_key); + opts->tls_key = key; + break; case NVMF_OPT_DISCOVERY: opts->discovery_nqn = true; break; @@ -955,6 +1010,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, kfree(opts->dhchap_ctrl_secret); opts->dhchap_ctrl_secret = p; break; + case NVMF_OPT_TLS: + if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) { + pr_err("TLS is not supported\n"); + ret = -EINVAL; + goto out; + } + opts->tls = true; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); @@ -1156,6 +1219,8 @@ static int nvmf_check_allowed_opts(struct nvmf_ctrl_options *opts, void nvmf_free_options(struct nvmf_ctrl_options *opts) { nvmf_host_put(opts->host); + key_put(opts->keyring); + key_put(opts->tls_key); kfree(opts->transport); kfree(opts->traddr); kfree(opts->trsvcid); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 82e7a27ffbde..fbaee5a7be19 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -70,6 +70,9 @@ enum { NVMF_OPT_DISCOVERY = 1 << 22, NVMF_OPT_DHCHAP_SECRET = 1 << 23, NVMF_OPT_DHCHAP_CTRL_SECRET = 1 << 24, + NVMF_OPT_TLS = 1 << 25, + NVMF_OPT_KEYRING = 1 << 26, + NVMF_OPT_TLS_KEY = 1 << 27, }; /** @@ -102,6 +105,9 @@ enum { * @dhchap_secret: DH-HMAC-CHAP secret * @dhchap_ctrl_secret: DH-HMAC-CHAP controller secret for bi-directional * authentication + * @keyring: Keyring to use for key lookups + * @tls_key: TLS key for encrypted connections (TCP) + * @tls: Start TLS encrypted connections (TCP) * @disable_sqflow: disable controller sq flow control * @hdr_digest: generate/verify header digest (TCP) * @data_digest: generate/verify data digest (TCP) @@ -128,6 +134,9 @@ struct nvmf_ctrl_options { struct nvmf_host *host; char *dhchap_secret; char *dhchap_ctrl_secret; + struct key *keyring; + struct key *tls_key; + bool tls; bool disable_sqflow; bool hdr_digest; bool data_digest; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f35647c470af..39a90b7cb125 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -349,7 +349,7 @@ struct nvme_ctrl { struct work_struct ana_work; #endif -#ifdef CONFIG_NVME_AUTH +#ifdef CONFIG_NVME_HOST_AUTH struct work_struct dhchap_auth_work; struct mutex dhchap_auth_mutex; struct nvme_dhchap_queue_context *dhchap_ctxs; @@ -357,6 +357,7 @@ struct nvme_ctrl { struct nvme_dhchap_key *ctrl_key; u16 transaction; #endif + struct key *tls_key; /* Power saving configuration */ u64 ps_max_latency_us; @@ -1048,7 +1049,7 @@ static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl) return ctrl->sgls & ((1 << 0) | (1 << 1)); } -#ifdef CONFIG_NVME_AUTH +#ifdef CONFIG_NVME_HOST_AUTH int __init nvme_init_auth(void); void __exit nvme_exit_auth(void); int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 3f0c9ee09a12..507bc149046d 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -924,7 +924,6 @@ static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req) if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true))) return false; - req->mq_hctx->tags->rqs[req->tag] = req; return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK; } diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 212e1b05d298..c6b7fbd4d34d 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -409,7 +409,7 @@ static ssize_t dctype_show(struct device *dev, } static DEVICE_ATTR_RO(dctype); -#ifdef CONFIG_NVME_AUTH +#ifdef CONFIG_NVME_HOST_AUTH static ssize_t nvme_ctrl_dhchap_secret_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -527,6 +527,19 @@ static DEVICE_ATTR(dhchap_ctrl_secret, S_IRUGO | S_IWUSR, nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store); #endif +#ifdef CONFIG_NVME_TCP_TLS +static ssize_t tls_key_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (!ctrl->tls_key) + return 0; + return sysfs_emit(buf, "%08x", key_serial(ctrl->tls_key)); +} +static DEVICE_ATTR_RO(tls_key); +#endif + static struct attribute *nvme_dev_attrs[] = { &dev_attr_reset_controller.attr, &dev_attr_rescan_controller.attr, @@ -550,10 +563,13 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_kato.attr, &dev_attr_cntrltype.attr, &dev_attr_dctype.attr, -#ifdef CONFIG_NVME_AUTH +#ifdef CONFIG_NVME_HOST_AUTH &dev_attr_dhchap_secret.attr, &dev_attr_dhchap_ctrl_secret.attr, #endif +#ifdef CONFIG_NVME_TCP_TLS + &dev_attr_tls_key.attr, +#endif NULL }; @@ -577,12 +593,17 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, return 0; if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts) return 0; -#ifdef CONFIG_NVME_AUTH +#ifdef CONFIG_NVME_HOST_AUTH if (a == &dev_attr_dhchap_secret.attr && !ctrl->opts) return 0; if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts) return 0; #endif +#ifdef CONFIG_NVME_TCP_TLS + if (a == &dev_attr_tls_key.attr && + (!ctrl->opts || strcmp(ctrl->opts->transport, "tcp"))) + return 0; +#endif return a->mode; } diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 5b332d9f87fc..4714a902f4ca 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -8,9 +8,14 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/err.h> +#include <linux/key.h> #include <linux/nvme-tcp.h> +#include <linux/nvme-keyring.h> #include <net/sock.h> #include <net/tcp.h> +#include <net/tls.h> +#include <net/tls_prot.h> +#include <net/handshake.h> #include <linux/blk-mq.h> #include <crypto/hash.h> #include <net/busy_poll.h> @@ -31,6 +36,16 @@ static int so_priority; module_param(so_priority, int, 0644); MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority"); +#ifdef CONFIG_NVME_TCP_TLS +/* + * TLS handshake timeout + */ +static int tls_handshake_timeout = 10; +module_param(tls_handshake_timeout, int, 0644); +MODULE_PARM_DESC(tls_handshake_timeout, + "nvme TLS handshake timeout in seconds (default 10)"); +#endif + #ifdef CONFIG_DEBUG_LOCK_ALLOC /* lockdep can detect a circular dependency of the form * sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock @@ -146,7 +161,10 @@ struct nvme_tcp_queue { struct ahash_request *snd_hash; __le32 exp_ddgst; __le32 recv_ddgst; - +#ifdef CONFIG_NVME_TCP_TLS + struct completion tls_complete; + int tls_err; +#endif struct page_frag_cache pf_cache; void (*state_change)(struct sock *); @@ -1338,7 +1356,9 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) } noreclaim_flag = memalloc_noreclaim_save(); - sock_release(queue->sock); + /* ->sock will be released by fput() */ + fput(queue->sock->file); + queue->sock = NULL; memalloc_noreclaim_restore(noreclaim_flag); kfree(queue->pdu); @@ -1350,6 +1370,8 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) { struct nvme_tcp_icreq_pdu *icreq; struct nvme_tcp_icresp_pdu *icresp; + char cbuf[CMSG_LEN(sizeof(char))] = {}; + u8 ctype; struct msghdr msg = {}; struct kvec iov; bool ctrl_hdgst, ctrl_ddgst; @@ -1381,17 +1403,35 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) iov.iov_base = icreq; iov.iov_len = sizeof(*icreq); ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); - if (ret < 0) + if (ret < 0) { + pr_warn("queue %d: failed to send icreq, error %d\n", + nvme_tcp_queue_id(queue), ret); goto free_icresp; + } memset(&msg, 0, sizeof(msg)); iov.iov_base = icresp; iov.iov_len = sizeof(*icresp); + if (queue->ctrl->ctrl.opts->tls) { + msg.msg_control = cbuf; + msg.msg_controllen = sizeof(cbuf); + } ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); - if (ret < 0) + if (ret < 0) { + pr_warn("queue %d: failed to receive icresp, error %d\n", + nvme_tcp_queue_id(queue), ret); goto free_icresp; - + } + if (queue->ctrl->ctrl.opts->tls) { + ctype = tls_get_record_type(queue->sock->sk, + (struct cmsghdr *)cbuf); + if (ctype != TLS_RECORD_TYPE_DATA) { + pr_err("queue %d: unhandled TLS record %d\n", + nvme_tcp_queue_id(queue), ctype); + return -ENOTCONN; + } + } ret = -EINVAL; if (icresp->hdr.type != nvme_tcp_icresp) { pr_err("queue %d: bad type returned %d\n", @@ -1507,11 +1547,99 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue) queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); } -static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid) +#ifdef CONFIG_NVME_TCP_TLS +static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid) +{ + struct nvme_tcp_queue *queue = data; + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + struct key *tls_key; + + dev_dbg(ctrl->ctrl.device, "queue %d: TLS handshake done, key %x, status %d\n", + qid, pskid, status); + + if (status) { + queue->tls_err = -status; + goto out_complete; + } + + tls_key = key_lookup(pskid); + if (IS_ERR(tls_key)) { + dev_warn(ctrl->ctrl.device, "queue %d: Invalid key %x\n", + qid, pskid); + queue->tls_err = -ENOKEY; + } else { + ctrl->ctrl.tls_key = tls_key; + queue->tls_err = 0; + } + +out_complete: + complete(&queue->tls_complete); +} + +static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl, + struct nvme_tcp_queue *queue, + key_serial_t pskid) +{ + int qid = nvme_tcp_queue_id(queue); + int ret; + struct tls_handshake_args args; + unsigned long tmo = tls_handshake_timeout * HZ; + key_serial_t keyring = nvme_keyring_id(); + + dev_dbg(nctrl->device, "queue %d: start TLS with key %x\n", + qid, pskid); + memset(&args, 0, sizeof(args)); + args.ta_sock = queue->sock; + args.ta_done = nvme_tcp_tls_done; + args.ta_data = queue; + args.ta_my_peerids[0] = pskid; + args.ta_num_peerids = 1; + if (nctrl->opts->keyring) + keyring = key_serial(nctrl->opts->keyring); + args.ta_keyring = keyring; + args.ta_timeout_ms = tls_handshake_timeout * 1000; + queue->tls_err = -EOPNOTSUPP; + init_completion(&queue->tls_complete); + ret = tls_client_hello_psk(&args, GFP_KERNEL); + if (ret) { + dev_err(nctrl->device, "queue %d: failed to start TLS: %d\n", + qid, ret); + return ret; + } + ret = wait_for_completion_interruptible_timeout(&queue->tls_complete, tmo); + if (ret <= 0) { + if (ret == 0) + ret = -ETIMEDOUT; + + dev_err(nctrl->device, + "queue %d: TLS handshake failed, error %d\n", + qid, ret); + tls_handshake_cancel(queue->sock->sk); + } else { + dev_dbg(nctrl->device, + "queue %d: TLS handshake complete, error %d\n", + qid, queue->tls_err); + ret = queue->tls_err; + } + return ret; +} +#else +static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl, + struct nvme_tcp_queue *queue, + key_serial_t pskid) +{ + return -EPROTONOSUPPORT; +} +#endif + +static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, + key_serial_t pskid) { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; int ret, rcv_pdu_size; + struct file *sock_file; mutex_init(&queue->queue_lock); queue->ctrl = ctrl; @@ -1534,6 +1662,11 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid) goto err_destroy_mutex; } + sock_file = sock_alloc_file(queue->sock, O_CLOEXEC, NULL); + if (IS_ERR(sock_file)) { + ret = PTR_ERR(sock_file); + goto err_destroy_mutex; + } nvme_tcp_reclassify_socket(queue->sock); /* Single syn retry */ @@ -1624,6 +1757,13 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid) goto err_rcv_pdu; } + /* If PSKs are configured try to start TLS */ + if (pskid) { + ret = nvme_tcp_start_tls(nctrl, queue, pskid); + if (ret) + goto err_init_connect; + } + ret = nvme_tcp_init_connection(queue); if (ret) goto err_init_connect; @@ -1640,7 +1780,8 @@ err_crypto: if (queue->hdr_digest || queue->data_digest) nvme_tcp_free_crypto(queue); err_sock: - sock_release(queue->sock); + /* ->sock will be released by fput() */ + fput(queue->sock->file); queue->sock = NULL; err_destroy_mutex: mutex_destroy(&queue->send_mutex); @@ -1772,10 +1913,25 @@ out_stop_queues: static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) { int ret; + key_serial_t pskid = 0; - ret = nvme_tcp_alloc_queue(ctrl, 0); + if (ctrl->opts->tls) { + if (ctrl->opts->tls_key) + pskid = key_serial(ctrl->opts->tls_key); + else + pskid = nvme_tls_psk_default(ctrl->opts->keyring, + ctrl->opts->host->nqn, + ctrl->opts->subsysnqn); + if (!pskid) { + dev_err(ctrl->device, "no valid PSK found\n"); + ret = -ENOKEY; + goto out_free_queue; + } + } + + ret = nvme_tcp_alloc_queue(ctrl, 0, pskid); if (ret) - return ret; + goto out_free_queue; ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl)); if (ret) @@ -1792,8 +1948,13 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) { int i, ret; + if (ctrl->opts->tls && !ctrl->tls_key) { + dev_err(ctrl->device, "no PSK negotiated\n"); + return -ENOKEY; + } for (i = 1; i < ctrl->queue_count; i++) { - ret = nvme_tcp_alloc_queue(ctrl, i); + ret = nvme_tcp_alloc_queue(ctrl, i, + key_serial(ctrl->tls_key)); if (ret) goto out_free_queues; } @@ -2621,7 +2782,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = { NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | - NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE, + NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE | NVMF_OPT_TLS | + NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY, .create_ctrl = nvme_tcp_create_ctrl, }; diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 79fc64035ee3..fa479c9f5c3d 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -84,16 +84,26 @@ config NVME_TARGET_TCP If unsure, say N. +config NVME_TARGET_TCP_TLS + bool "NVMe over Fabrics TCP target TLS encryption support" + depends on NVME_TARGET_TCP + select NVME_COMMON + select NVME_KEYRING + select NET_HANDSHAKE + select KEYS + help + Enables TLS encryption for the NVMe TCP target using the netlink handshake API. + + The TLS handshake daemon is available at + https://github.com/oracle/ktls-utils. + + If unsure, say N. + config NVME_TARGET_AUTH bool "NVMe over Fabrics In-band Authentication support" depends on NVME_TARGET select NVME_COMMON - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_SHA256 - select CRYPTO_SHA512 - select CRYPTO_DH - select CRYPTO_DH_RFC7919_GROUPS + select NVME_AUTH help This enables support for NVMe over Fabrics In-band Authentication diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index 4dcddcf95279..3ddbc3880cac 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -267,7 +267,8 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, struct shash_desc *shash; struct nvmet_ctrl *ctrl = req->sq->ctrl; const char *hash_name; - u8 *challenge = req->sq->dhchap_c1, *host_response; + u8 *challenge = req->sq->dhchap_c1; + struct nvme_dhchap_key *transformed_key; u8 buf[4]; int ret; @@ -291,14 +292,15 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, goto out_free_tfm; } - host_response = nvme_auth_transform_key(ctrl->host_key, ctrl->hostnqn); - if (IS_ERR(host_response)) { - ret = PTR_ERR(host_response); + transformed_key = nvme_auth_transform_key(ctrl->host_key, + ctrl->hostnqn); + if (IS_ERR(transformed_key)) { + ret = PTR_ERR(transformed_key); goto out_free_tfm; } - ret = crypto_shash_setkey(shash_tfm, host_response, - ctrl->host_key->len); + ret = crypto_shash_setkey(shash_tfm, transformed_key->key, + transformed_key->len); if (ret) goto out_free_response; @@ -365,7 +367,7 @@ out: kfree(challenge); kfree(shash); out_free_response: - kfree_sensitive(host_response); + nvme_auth_free_key(transformed_key); out_free_tfm: crypto_free_shash(shash_tfm); return 0; @@ -378,7 +380,8 @@ int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response, struct shash_desc *shash; struct nvmet_ctrl *ctrl = req->sq->ctrl; const char *hash_name; - u8 *challenge = req->sq->dhchap_c2, *ctrl_response; + u8 *challenge = req->sq->dhchap_c2; + struct nvme_dhchap_key *transformed_key; u8 buf[4]; int ret; @@ -402,15 +405,15 @@ int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response, goto out_free_tfm; } - ctrl_response = nvme_auth_transform_key(ctrl->ctrl_key, + transformed_key = nvme_auth_transform_key(ctrl->ctrl_key, ctrl->subsysnqn); - if (IS_ERR(ctrl_response)) { - ret = PTR_ERR(ctrl_response); + if (IS_ERR(transformed_key)) { + ret = PTR_ERR(transformed_key); goto out_free_tfm; } - ret = crypto_shash_setkey(shash_tfm, ctrl_response, - ctrl->ctrl_key->len); + ret = crypto_shash_setkey(shash_tfm, transformed_key->key, + transformed_key->len); if (ret) goto out_free_response; @@ -474,7 +477,7 @@ out: kfree(challenge); kfree(shash); out_free_response: - kfree_sensitive(ctrl_response); + nvme_auth_free_key(transformed_key); out_free_tfm: crypto_free_shash(shash_tfm); return 0; diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 907143870da5..9eed6e6765ea 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -15,6 +15,7 @@ #ifdef CONFIG_NVME_TARGET_AUTH #include <linux/nvme-auth.h> #endif +#include <linux/nvme-keyring.h> #include <crypto/hash.h> #include <crypto/kpp.h> @@ -159,10 +160,14 @@ static const struct nvmet_type_name_map nvmet_addr_treq[] = { { NVMF_TREQ_NOT_REQUIRED, "not required" }, }; +static inline u8 nvmet_port_disc_addr_treq_mask(struct nvmet_port *port) +{ + return (port->disc_addr.treq & ~NVME_TREQ_SECURE_CHANNEL_MASK); +} + static ssize_t nvmet_addr_treq_show(struct config_item *item, char *page) { - u8 treq = to_nvmet_port(item)->disc_addr.treq & - NVME_TREQ_SECURE_CHANNEL_MASK; + u8 treq = nvmet_port_disc_addr_treq_secure_channel(to_nvmet_port(item)); int i; for (i = 0; i < ARRAY_SIZE(nvmet_addr_treq); i++) { @@ -178,7 +183,7 @@ static ssize_t nvmet_addr_treq_store(struct config_item *item, const char *page, size_t count) { struct nvmet_port *port = to_nvmet_port(item); - u8 treq = port->disc_addr.treq & ~NVME_TREQ_SECURE_CHANNEL_MASK; + u8 treq = nvmet_port_disc_addr_treq_mask(port); int i; if (nvmet_is_port_enabled(port, __func__)) @@ -193,6 +198,20 @@ static ssize_t nvmet_addr_treq_store(struct config_item *item, return -EINVAL; found: + if (port->disc_addr.trtype == NVMF_TRTYPE_TCP && + port->disc_addr.tsas.tcp.sectype == NVMF_TCP_SECTYPE_TLS13) { + switch (nvmet_addr_treq[i].type) { + case NVMF_TREQ_NOT_SPECIFIED: + pr_debug("treq '%s' not allowed for TLS1.3\n", + nvmet_addr_treq[i].name); + return -EINVAL; + case NVMF_TREQ_NOT_REQUIRED: + pr_warn("Allow non-TLS connections while TLS1.3 is enabled\n"); + break; + default: + break; + } + } treq |= nvmet_addr_treq[i].type; port->disc_addr.treq = treq; return count; @@ -303,6 +322,11 @@ static void nvmet_port_init_tsas_rdma(struct nvmet_port *port) port->disc_addr.tsas.rdma.cms = NVMF_RDMA_CMS_RDMA_CM; } +static void nvmet_port_init_tsas_tcp(struct nvmet_port *port, int sectype) +{ + port->disc_addr.tsas.tcp.sectype = sectype; +} + static ssize_t nvmet_addr_trtype_store(struct config_item *item, const char *page, size_t count) { @@ -325,11 +349,99 @@ found: port->disc_addr.trtype = nvmet_transport[i].type; if (port->disc_addr.trtype == NVMF_TRTYPE_RDMA) nvmet_port_init_tsas_rdma(port); + else if (port->disc_addr.trtype == NVMF_TRTYPE_TCP) + nvmet_port_init_tsas_tcp(port, NVMF_TCP_SECTYPE_NONE); return count; } CONFIGFS_ATTR(nvmet_, addr_trtype); +static const struct nvmet_type_name_map nvmet_addr_tsas_tcp[] = { + { NVMF_TCP_SECTYPE_NONE, "none" }, + { NVMF_TCP_SECTYPE_TLS13, "tls1.3" }, +}; + +static const struct nvmet_type_name_map nvmet_addr_tsas_rdma[] = { + { NVMF_RDMA_QPTYPE_CONNECTED, "connected" }, + { NVMF_RDMA_QPTYPE_DATAGRAM, "datagram" }, +}; + +static ssize_t nvmet_addr_tsas_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + int i; + + if (port->disc_addr.trtype == NVMF_TRTYPE_TCP) { + for (i = 0; i < ARRAY_SIZE(nvmet_addr_tsas_tcp); i++) { + if (port->disc_addr.tsas.tcp.sectype == nvmet_addr_tsas_tcp[i].type) + return sprintf(page, "%s\n", nvmet_addr_tsas_tcp[i].name); + } + } else if (port->disc_addr.trtype == NVMF_TRTYPE_RDMA) { + for (i = 0; i < ARRAY_SIZE(nvmet_addr_tsas_rdma); i++) { + if (port->disc_addr.tsas.rdma.qptype == nvmet_addr_tsas_rdma[i].type) + return sprintf(page, "%s\n", nvmet_addr_tsas_rdma[i].name); + } + } + return sprintf(page, "reserved\n"); +} + +static ssize_t nvmet_addr_tsas_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + u8 treq = nvmet_port_disc_addr_treq_mask(port); + u8 sectype; + int i; + + if (nvmet_is_port_enabled(port, __func__)) + return -EACCES; + + if (port->disc_addr.trtype != NVMF_TRTYPE_TCP) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(nvmet_addr_tsas_tcp); i++) { + if (sysfs_streq(page, nvmet_addr_tsas_tcp[i].name)) { + sectype = nvmet_addr_tsas_tcp[i].type; + goto found; + } + } + + pr_err("Invalid value '%s' for tsas\n", page); + return -EINVAL; + +found: + if (sectype == NVMF_TCP_SECTYPE_TLS13) { + if (!IS_ENABLED(CONFIG_NVME_TARGET_TCP_TLS)) { + pr_err("TLS is not supported\n"); + return -EINVAL; + } + if (!port->keyring) { + pr_err("TLS keyring not configured\n"); + return -EINVAL; + } + } + + nvmet_port_init_tsas_tcp(port, sectype); + /* + * If TLS is enabled TREQ should be set to 'required' per default + */ + if (sectype == NVMF_TCP_SECTYPE_TLS13) { + u8 sc = nvmet_port_disc_addr_treq_secure_channel(port); + + if (sc == NVMF_TREQ_NOT_SPECIFIED) + treq |= NVMF_TREQ_REQUIRED; + else + treq |= sc; + } else { + treq |= NVMF_TREQ_NOT_SPECIFIED; + } + port->disc_addr.treq = treq; + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_tsas); + /* * Namespace structures & file operation functions below */ @@ -1731,6 +1843,7 @@ static void nvmet_port_release(struct config_item *item) flush_workqueue(nvmet_wq); list_del(&port->global_entry); + key_put(port->keyring); kfree(port->ana_state); kfree(port); } @@ -1741,6 +1854,7 @@ static struct configfs_attribute *nvmet_port_attrs[] = { &nvmet_attr_addr_traddr, &nvmet_attr_addr_trsvcid, &nvmet_attr_addr_trtype, + &nvmet_attr_addr_tsas, &nvmet_attr_param_inline_data_size, #ifdef CONFIG_BLK_DEV_INTEGRITY &nvmet_attr_param_pi_enable, @@ -1779,6 +1893,14 @@ static struct config_group *nvmet_ports_make(struct config_group *group, return ERR_PTR(-ENOMEM); } + if (nvme_keyring_id()) { + port->keyring = key_lookup(nvme_keyring_id()); + if (IS_ERR(port->keyring)) { + pr_warn("NVMe keyring not available, disabling TLS\n"); + port->keyring = NULL; + } + } + for (i = 1; i <= NVMET_MAX_ANAGRPS; i++) { if (i == NVMET_DEFAULT_ANA_GRPID) port->ana_state[1] = NVME_ANA_OPTIMIZED; diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 1ab6601fdd5c..bd59990b5250 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -146,7 +146,8 @@ struct nvmet_fc_tgt_queue { struct workqueue_struct *work_q; struct kref ref; struct rcu_head rcu; - struct nvmet_fc_fcp_iod fod[]; /* array of fcp_iods */ + /* array of fcp_iods */ + struct nvmet_fc_fcp_iod fod[] __counted_by(sqsize); } __aligned(sizeof(unsigned long long)); struct nvmet_fc_hostport { diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 360e385be33b..6c8acebe1a1a 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -159,6 +159,7 @@ struct nvmet_port { struct config_group ana_groups_group; struct nvmet_ana_group ana_default_group; enum nvme_ana_state *ana_state; + struct key *keyring; void *priv; bool enabled; int inline_data_size; @@ -179,6 +180,16 @@ static inline struct nvmet_port *ana_groups_to_port( ana_groups_group); } +static inline u8 nvmet_port_disc_addr_treq_secure_channel(struct nvmet_port *port) +{ + return (port->disc_addr.treq & NVME_TREQ_SECURE_CHANNEL_MASK); +} + +static inline bool nvmet_port_secure_channel_required(struct nvmet_port *port) +{ + return nvmet_port_disc_addr_treq_secure_channel(port) == NVMF_TREQ_REQUIRED; +} + struct nvmet_ctrl { struct nvmet_subsys *subsys; struct nvmet_sq **sqs; diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 197fc2ecb164..92b74d0b8686 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -8,9 +8,14 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/err.h> +#include <linux/key.h> #include <linux/nvme-tcp.h> +#include <linux/nvme-keyring.h> #include <net/sock.h> #include <net/tcp.h> +#include <net/tls.h> +#include <net/tls_prot.h> +#include <net/handshake.h> #include <linux/inet.h> #include <linux/llist.h> #include <crypto/hash.h> @@ -66,6 +71,16 @@ device_param_cb(idle_poll_period_usecs, &set_param_ops, MODULE_PARM_DESC(idle_poll_period_usecs, "nvmet tcp io_work poll till idle time period in usecs: Default 0"); +#ifdef CONFIG_NVME_TARGET_TCP_TLS +/* + * TLS handshake timeout + */ +static int tls_handshake_timeout = 10; +module_param(tls_handshake_timeout, int, 0644); +MODULE_PARM_DESC(tls_handshake_timeout, + "nvme TLS handshake timeout in seconds (default 10)"); +#endif + #define NVMET_TCP_RECV_BUDGET 8 #define NVMET_TCP_SEND_BUDGET 8 #define NVMET_TCP_IO_WORK_BUDGET 64 @@ -104,6 +119,7 @@ struct nvmet_tcp_cmd { u32 pdu_len; u32 pdu_recv; int sg_idx; + char recv_cbuf[CMSG_LEN(sizeof(char))]; struct msghdr recv_msg; struct bio_vec *iov; u32 flags; @@ -122,8 +138,10 @@ struct nvmet_tcp_cmd { enum nvmet_tcp_queue_state { NVMET_TCP_Q_CONNECTING, + NVMET_TCP_Q_TLS_HANDSHAKE, NVMET_TCP_Q_LIVE, NVMET_TCP_Q_DISCONNECTING, + NVMET_TCP_Q_FAILED, }; struct nvmet_tcp_queue { @@ -132,6 +150,7 @@ struct nvmet_tcp_queue { struct work_struct io_work; struct nvmet_cq nvme_cq; struct nvmet_sq nvme_sq; + struct kref kref; /* send state */ struct nvmet_tcp_cmd *cmds; @@ -155,6 +174,10 @@ struct nvmet_tcp_queue { struct ahash_request *snd_hash; struct ahash_request *rcv_hash; + /* TLS state */ + key_serial_t tls_pskid; + struct delayed_work tls_handshake_tmo_work; + unsigned long poll_end; spinlock_t state_lock; @@ -910,8 +933,10 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) iov.iov_base = icresp; iov.iov_len = sizeof(*icresp); ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); - if (ret < 0) + if (ret < 0) { + queue->state = NVMET_TCP_Q_FAILED; return ret; /* queue removal will cleanup */ + } queue->state = NVMET_TCP_Q_LIVE; nvmet_prepare_receive_pdu(queue); @@ -1096,20 +1121,65 @@ static inline bool nvmet_tcp_pdu_valid(u8 type) return false; } +static int nvmet_tcp_tls_record_ok(struct nvmet_tcp_queue *queue, + struct msghdr *msg, char *cbuf) +{ + struct cmsghdr *cmsg = (struct cmsghdr *)cbuf; + u8 ctype, level, description; + int ret = 0; + + ctype = tls_get_record_type(queue->sock->sk, cmsg); + switch (ctype) { + case 0: + break; + case TLS_RECORD_TYPE_DATA: + break; + case TLS_RECORD_TYPE_ALERT: + tls_alert_recv(queue->sock->sk, msg, &level, &description); + if (level == TLS_ALERT_LEVEL_FATAL) { + pr_err("queue %d: TLS Alert desc %u\n", + queue->idx, description); + ret = -ENOTCONN; + } else { + pr_warn("queue %d: TLS Alert desc %u\n", + queue->idx, description); + ret = -EAGAIN; + } + break; + default: + /* discard this record type */ + pr_err("queue %d: TLS record %d unhandled\n", + queue->idx, ctype); + ret = -EAGAIN; + break; + } + return ret; +} + static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue) { struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr; - int len; + int len, ret; struct kvec iov; + char cbuf[CMSG_LEN(sizeof(char))] = {}; struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; recv: iov.iov_base = (void *)&queue->pdu + queue->offset; iov.iov_len = queue->left; + if (queue->tls_pskid) { + msg.msg_control = cbuf; + msg.msg_controllen = sizeof(cbuf); + } len = kernel_recvmsg(queue->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); if (unlikely(len < 0)) return len; + if (queue->tls_pskid) { + ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf); + if (ret < 0) + return ret; + } queue->offset += len; queue->left -= len; @@ -1162,16 +1232,22 @@ static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd) static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue) { struct nvmet_tcp_cmd *cmd = queue->cmd; - int ret; + int len, ret; while (msg_data_left(&cmd->recv_msg)) { - ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg, + len = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg, cmd->recv_msg.msg_flags); - if (ret <= 0) - return ret; + if (len <= 0) + return len; + if (queue->tls_pskid) { + ret = nvmet_tcp_tls_record_ok(cmd->queue, + &cmd->recv_msg, cmd->recv_cbuf); + if (ret < 0) + return ret; + } - cmd->pdu_recv += ret; - cmd->rbytes_done += ret; + cmd->pdu_recv += len; + cmd->rbytes_done += len; } if (queue->data_digest) { @@ -1189,20 +1265,30 @@ static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue) static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue) { struct nvmet_tcp_cmd *cmd = queue->cmd; - int ret; + int ret, len; + char cbuf[CMSG_LEN(sizeof(char))] = {}; struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; struct kvec iov = { .iov_base = (void *)&cmd->recv_ddgst + queue->offset, .iov_len = queue->left }; - ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, + if (queue->tls_pskid) { + msg.msg_control = cbuf; + msg.msg_controllen = sizeof(cbuf); + } + len = kernel_recvmsg(queue->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); - if (unlikely(ret < 0)) - return ret; + if (unlikely(len < 0)) + return len; + if (queue->tls_pskid) { + ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf); + if (ret < 0) + return ret; + } - queue->offset += ret; - queue->left -= ret; + queue->offset += len; + queue->left -= len; if (queue->left) return -EAGAIN; @@ -1280,14 +1366,27 @@ done: return ret; } +static void nvmet_tcp_release_queue(struct kref *kref) +{ + struct nvmet_tcp_queue *queue = + container_of(kref, struct nvmet_tcp_queue, kref); + + WARN_ON(queue->state != NVMET_TCP_Q_DISCONNECTING); + queue_work(nvmet_wq, &queue->release_work); +} + static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue) { - spin_lock(&queue->state_lock); + spin_lock_bh(&queue->state_lock); + if (queue->state == NVMET_TCP_Q_TLS_HANDSHAKE) { + /* Socket closed during handshake */ + tls_handshake_cancel(queue->sock->sk); + } if (queue->state != NVMET_TCP_Q_DISCONNECTING) { queue->state = NVMET_TCP_Q_DISCONNECTING; - queue_work(nvmet_wq, &queue->release_work); + kref_put(&queue->kref, nvmet_tcp_release_queue); } - spin_unlock(&queue->state_lock); + spin_unlock_bh(&queue->state_lock); } static inline void nvmet_tcp_arm_queue_deadline(struct nvmet_tcp_queue *queue) @@ -1369,6 +1468,10 @@ static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue, if (!c->r2t_pdu) goto out_free_data; + if (queue->state == NVMET_TCP_Q_TLS_HANDSHAKE) { + c->recv_msg.msg_control = c->recv_cbuf; + c->recv_msg.msg_controllen = sizeof(c->recv_cbuf); + } c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; list_add_tail(&c->entry, &queue->free_list); @@ -1482,6 +1585,7 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w) mutex_unlock(&nvmet_tcp_queue_mutex); nvmet_tcp_restore_socket_callbacks(queue); + cancel_delayed_work_sync(&queue->tls_handshake_tmo_work); cancel_work_sync(&queue->io_work); /* stop accepting incoming data */ queue->rcv_state = NVMET_TCP_RECV_ERR; @@ -1490,12 +1594,12 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w) nvmet_sq_destroy(&queue->nvme_sq); cancel_work_sync(&queue->io_work); nvmet_tcp_free_cmd_data_in_buffers(queue); - sock_release(queue->sock); + /* ->sock will be released by fput() */ + fput(queue->sock->file); nvmet_tcp_free_cmds(queue); if (queue->hdr_digest || queue->data_digest) nvmet_tcp_free_crypto(queue); ida_free(&nvmet_tcp_queue_ida, queue->idx); - page = virt_to_head_page(queue->pf_cache.va); __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias); kfree(queue); @@ -1509,8 +1613,13 @@ static void nvmet_tcp_data_ready(struct sock *sk) read_lock_bh(&sk->sk_callback_lock); queue = sk->sk_user_data; - if (likely(queue)) - queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); + if (likely(queue)) { + if (queue->data_ready) + queue->data_ready(sk); + if (queue->state != NVMET_TCP_Q_TLS_HANDSHAKE) + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, + &queue->io_work); + } read_unlock_bh(&sk->sk_callback_lock); } @@ -1618,31 +1727,174 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) return ret; } -static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, +#ifdef CONFIG_NVME_TARGET_TCP_TLS +static int nvmet_tcp_try_peek_pdu(struct nvmet_tcp_queue *queue) +{ + struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr; + int len, ret; + struct kvec iov = { + .iov_base = (u8 *)&queue->pdu + queue->offset, + .iov_len = sizeof(struct nvme_tcp_hdr), + }; + char cbuf[CMSG_LEN(sizeof(char))] = {}; + struct msghdr msg = { + .msg_control = cbuf, + .msg_controllen = sizeof(cbuf), + .msg_flags = MSG_PEEK, + }; + + if (nvmet_port_secure_channel_required(queue->port->nport)) + return 0; + + len = kernel_recvmsg(queue->sock, &msg, &iov, 1, + iov.iov_len, msg.msg_flags); + if (unlikely(len < 0)) { + pr_debug("queue %d: peek error %d\n", + queue->idx, len); + return len; + } + + ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf); + if (ret < 0) + return ret; + + if (len < sizeof(struct nvme_tcp_hdr)) { + pr_debug("queue %d: short read, %d bytes missing\n", + queue->idx, (int)iov.iov_len - len); + return -EAGAIN; + } + pr_debug("queue %d: hdr type %d hlen %d plen %d size %d\n", + queue->idx, hdr->type, hdr->hlen, hdr->plen, + (int)sizeof(struct nvme_tcp_icreq_pdu)); + if (hdr->type == nvme_tcp_icreq && + hdr->hlen == sizeof(struct nvme_tcp_icreq_pdu) && + hdr->plen == (__le32)sizeof(struct nvme_tcp_icreq_pdu)) { + pr_debug("queue %d: icreq detected\n", + queue->idx); + return len; + } + return 0; +} + +static void nvmet_tcp_tls_handshake_done(void *data, int status, + key_serial_t peerid) +{ + struct nvmet_tcp_queue *queue = data; + + pr_debug("queue %d: TLS handshake done, key %x, status %d\n", + queue->idx, peerid, status); + spin_lock_bh(&queue->state_lock); + if (WARN_ON(queue->state != NVMET_TCP_Q_TLS_HANDSHAKE)) { + spin_unlock_bh(&queue->state_lock); + return; + } + if (!status) { + queue->tls_pskid = peerid; + queue->state = NVMET_TCP_Q_CONNECTING; + } else + queue->state = NVMET_TCP_Q_FAILED; + spin_unlock_bh(&queue->state_lock); + + cancel_delayed_work_sync(&queue->tls_handshake_tmo_work); + if (status) + nvmet_tcp_schedule_release_queue(queue); + else + nvmet_tcp_set_queue_sock(queue); + kref_put(&queue->kref, nvmet_tcp_release_queue); +} + +static void nvmet_tcp_tls_handshake_timeout(struct work_struct *w) +{ + struct nvmet_tcp_queue *queue = container_of(to_delayed_work(w), + struct nvmet_tcp_queue, tls_handshake_tmo_work); + + pr_warn("queue %d: TLS handshake timeout\n", queue->idx); + /* + * If tls_handshake_cancel() fails we've lost the race with + * nvmet_tcp_tls_handshake_done() */ + if (!tls_handshake_cancel(queue->sock->sk)) + return; + spin_lock_bh(&queue->state_lock); + if (WARN_ON(queue->state != NVMET_TCP_Q_TLS_HANDSHAKE)) { + spin_unlock_bh(&queue->state_lock); + return; + } + queue->state = NVMET_TCP_Q_FAILED; + spin_unlock_bh(&queue->state_lock); + nvmet_tcp_schedule_release_queue(queue); + kref_put(&queue->kref, nvmet_tcp_release_queue); +} + +static int nvmet_tcp_tls_handshake(struct nvmet_tcp_queue *queue) +{ + int ret = -EOPNOTSUPP; + struct tls_handshake_args args; + + if (queue->state != NVMET_TCP_Q_TLS_HANDSHAKE) { + pr_warn("cannot start TLS in state %d\n", queue->state); + return -EINVAL; + } + + kref_get(&queue->kref); + pr_debug("queue %d: TLS ServerHello\n", queue->idx); + memset(&args, 0, sizeof(args)); + args.ta_sock = queue->sock; + args.ta_done = nvmet_tcp_tls_handshake_done; + args.ta_data = queue; + args.ta_keyring = key_serial(queue->port->nport->keyring); + args.ta_timeout_ms = tls_handshake_timeout * 1000; + + ret = tls_server_hello_psk(&args, GFP_KERNEL); + if (ret) { + kref_put(&queue->kref, nvmet_tcp_release_queue); + pr_err("failed to start TLS, err=%d\n", ret); + } else { + queue_delayed_work(nvmet_wq, &queue->tls_handshake_tmo_work, + tls_handshake_timeout * HZ); + } + return ret; +} +#endif + +static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, struct socket *newsock) { struct nvmet_tcp_queue *queue; + struct file *sock_file = NULL; int ret; queue = kzalloc(sizeof(*queue), GFP_KERNEL); - if (!queue) - return -ENOMEM; + if (!queue) { + ret = -ENOMEM; + goto out_release; + } INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work); INIT_WORK(&queue->io_work, nvmet_tcp_io_work); + kref_init(&queue->kref); queue->sock = newsock; queue->port = port; queue->nr_cmds = 0; spin_lock_init(&queue->state_lock); - queue->state = NVMET_TCP_Q_CONNECTING; + if (queue->port->nport->disc_addr.tsas.tcp.sectype == + NVMF_TCP_SECTYPE_TLS13) + queue->state = NVMET_TCP_Q_TLS_HANDSHAKE; + else + queue->state = NVMET_TCP_Q_CONNECTING; INIT_LIST_HEAD(&queue->free_list); init_llist_head(&queue->resp_list); INIT_LIST_HEAD(&queue->resp_send_list); + sock_file = sock_alloc_file(queue->sock, O_CLOEXEC, NULL); + if (IS_ERR(sock_file)) { + ret = PTR_ERR(sock_file); + goto out_free_queue; + } + queue->idx = ida_alloc(&nvmet_tcp_queue_ida, GFP_KERNEL); if (queue->idx < 0) { ret = queue->idx; - goto out_free_queue; + goto out_sock; } ret = nvmet_tcp_alloc_cmd(queue, &queue->connect); @@ -1659,11 +1911,33 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list); mutex_unlock(&nvmet_tcp_queue_mutex); +#ifdef CONFIG_NVME_TARGET_TCP_TLS + INIT_DELAYED_WORK(&queue->tls_handshake_tmo_work, + nvmet_tcp_tls_handshake_timeout); + if (queue->state == NVMET_TCP_Q_TLS_HANDSHAKE) { + struct sock *sk = queue->sock->sk; + + /* Restore the default callbacks before starting upcall */ + read_lock_bh(&sk->sk_callback_lock); + sk->sk_user_data = NULL; + sk->sk_data_ready = port->data_ready; + read_unlock_bh(&sk->sk_callback_lock); + if (!nvmet_tcp_try_peek_pdu(queue)) { + if (!nvmet_tcp_tls_handshake(queue)) + return; + /* TLS handshake failed, terminate the connection */ + goto out_destroy_sq; + } + /* Not a TLS connection, continue with normal processing */ + queue->state = NVMET_TCP_Q_CONNECTING; + } +#endif + ret = nvmet_tcp_set_queue_sock(queue); if (ret) goto out_destroy_sq; - return 0; + return; out_destroy_sq: mutex_lock(&nvmet_tcp_queue_mutex); list_del_init(&queue->queue_list); @@ -1673,9 +1947,14 @@ out_free_connect: nvmet_tcp_free_cmd(&queue->connect); out_ida_remove: ida_free(&nvmet_tcp_queue_ida, queue->idx); +out_sock: + fput(queue->sock->file); out_free_queue: kfree(queue); - return ret; +out_release: + pr_err("failed to allocate queue, error %d\n", ret); + if (!sock_file) + sock_release(newsock); } static void nvmet_tcp_accept_work(struct work_struct *w) @@ -1692,11 +1971,7 @@ static void nvmet_tcp_accept_work(struct work_struct *w) pr_warn("failed to accept err=%d\n", ret); return; } - ret = nvmet_tcp_alloc_queue(port, newsock); - if (ret) { - pr_err("failed to allocate queue\n"); - sock_release(newsock); - } + nvmet_tcp_alloc_queue(port, newsock); } } diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h index 2426276b9bd3..670f2dae692f 100644 --- a/include/linux/badblocks.h +++ b/include/linux/badblocks.h @@ -15,6 +15,7 @@ #define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) #define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) #define BB_ACK(x) (!!((x) & BB_ACK_MASK)) +#define BB_END(x) (BB_OFFSET(x) + BB_LEN(x)) #define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) /* Bad block numbers are stored sorted in a single page. @@ -41,6 +42,12 @@ struct badblocks { sector_t size; /* in sectors */ }; +struct badblocks_context { + sector_t start; + sector_t len; + int ack; +}; + int badblocks_check(struct badblocks *bb, sector_t s, int sectors, sector_t *first_bad, int *bad_sectors); int badblocks_set(struct badblocks *bb, sector_t s, int sectors, @@ -63,4 +70,27 @@ static inline void devm_exit_badblocks(struct device *dev, struct badblocks *bb) } badblocks_exit(bb); } + +static inline int badblocks_full(struct badblocks *bb) +{ + return (bb->count >= MAX_BADBLOCKS); +} + +static inline int badblocks_empty(struct badblocks *bb) +{ + return (bb->count == 0); +} + +static inline void set_changed(struct badblocks *bb) +{ + if (bb->changed != 1) + bb->changed = 1; +} + +static inline void clear_changed(struct badblocks *bb) +{ + if (bb->changed != 0) + bb->changed = 0; +} + #endif diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 958ed7e89b30..1ab3081c82ed 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -32,8 +32,6 @@ typedef __u32 __bitwise req_flags_t; #define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4)) /* merge of different types, fail separately */ #define RQF_MIXED_MERGE ((__force req_flags_t)(1 << 5)) -/* track inflight for MQ */ -#define RQF_MQ_INFLIGHT ((__force req_flags_t)(1 << 6)) /* don't call prep for this one */ #define RQF_DONTPREP ((__force req_flags_t)(1 << 7)) /* use hctx->sched_tags */ diff --git a/include/linux/key.h b/include/linux/key.h index 938d7ecfb495..943a432da3ae 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -515,6 +515,7 @@ extern void key_init(void); #define key_init() do { } while(0) #define key_free_user_ns(ns) do { } while(0) #define key_remove_domain(d) do { } while(0) +#define key_lookup(k) NULL #endif /* CONFIG_KEYS */ #endif /* __KERNEL__ */ diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index dcb8030062dd..c1d0bc5d9624 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -9,9 +9,9 @@ #include <crypto/kpp.h> struct nvme_dhchap_key { - u8 *key; size_t len; u8 hash; + u8 key[]; }; u32 nvme_auth_get_seqnum(void); @@ -24,10 +24,13 @@ const char *nvme_auth_digest_name(u8 hmac_id); size_t nvme_auth_hmac_hash_len(u8 hmac_id); u8 nvme_auth_hmac_id(const char *hmac_name); +u32 nvme_auth_key_struct_size(u32 key_len); struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret, u8 key_hash); void nvme_auth_free_key(struct nvme_dhchap_key *key); -u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn); +struct nvme_dhchap_key *nvme_auth_alloc_key(u32 len, u8 hash); +struct nvme_dhchap_key *nvme_auth_transform_key( + struct nvme_dhchap_key *key, char *nqn); int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key); int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len, u8 *challenge, u8 *aug, size_t hlen); diff --git a/include/linux/nvme-keyring.h b/include/linux/nvme-keyring.h new file mode 100644 index 000000000000..4efea9dd967c --- /dev/null +++ b/include/linux/nvme-keyring.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Hannes Reinecke, SUSE Labs + */ + +#ifndef _NVME_KEYRING_H +#define _NVME_KEYRING_H + +#ifdef CONFIG_NVME_KEYRING + +key_serial_t nvme_tls_psk_default(struct key *keyring, + const char *hostnqn, const char *subnqn); + +key_serial_t nvme_keyring_id(void); +int nvme_keyring_init(void); +void nvme_keyring_exit(void); + +#else + +static inline key_serial_t nvme_tls_psk_default(struct key *keyring, + const char *hostnqn, const char *subnqn) +{ + return 0; +} +static inline key_serial_t nvme_keyring_id(void) +{ + return 0; +} +static inline int nvme_keyring_init(void) +{ + return 0; +} +static inline void nvme_keyring_exit(void) {} + +#endif /* !CONFIG_NVME_KEYRING */ +#endif /* _NVME_KEYRING_H */ diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h index 57ebe1267f7f..e07e8978d691 100644 --- a/include/linux/nvme-tcp.h +++ b/include/linux/nvme-tcp.h @@ -18,6 +18,12 @@ enum nvme_tcp_pfv { NVME_TCP_PFV_1_0 = 0x0, }; +enum nvme_tcp_tls_cipher { + NVME_TCP_TLS_CIPHER_INVALID = 0, + NVME_TCP_TLS_CIPHER_SHA256 = 1, + NVME_TCP_TLS_CIPHER_SHA384 = 2, +}; + enum nvme_tcp_fatal_error_status { NVME_TCP_FES_INVALID_PDU_HDR = 0x01, NVME_TCP_FES_PDU_SEQ_ERR = 0x02, diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 26dd3f859d9d..a7ba74babad7 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -108,6 +108,13 @@ enum { NVMF_RDMA_CMS_RDMA_CM = 1, /* Sockets based endpoint addressing */ }; +/* TSAS SECTYPE for TCP transport */ +enum { + NVMF_TCP_SECTYPE_NONE = 0, /* No Security */ + NVMF_TCP_SECTYPE_TLS12 = 1, /* TLSv1.2, NVMe-oF 1.1 and NVMe-TCP 3.6.1.1 */ + NVMF_TCP_SECTYPE_TLS13 = 2, /* TLSv1.3, NVMe-oF 1.1 and NVMe-TCP 3.6.1.1 */ +}; + #define NVME_AQ_DEPTH 32 #define NVME_NR_AEN_COMMANDS 1 #define NVME_AQ_BLK_MQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS) @@ -1493,6 +1500,9 @@ struct nvmf_disc_rsp_page_entry { __u16 pkey; __u8 resv10[246]; } rdma; + struct tcp { + __u8 sectype; + } tcp; } tsas; }; diff --git a/include/linux/sed-opal-key.h b/include/linux/sed-opal-key.h new file mode 100644 index 000000000000..0ca03054e8f6 --- /dev/null +++ b/include/linux/sed-opal-key.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * SED key operations. + * + * Copyright (C) 2023 IBM Corporation + * + * These are the accessor functions (read/write) for SED Opal + * keys. Specific keystores can provide overrides. + * + */ + +#include <linux/kernel.h> + +#ifdef CONFIG_PSERIES_PLPKS_SED +int sed_read_key(char *keyname, char *key, u_int *keylen); +int sed_write_key(char *keyname, char *key, u_int keylen); +#else +static inline +int sed_read_key(char *keyname, char *key, u_int *keylen) { + return -EOPNOTSUPP; +} +static inline +int sed_write_key(char *keyname, char *key, u_int keylen) { + return -EOPNOTSUPP; +} +#endif diff --git a/security/keys/key.c b/security/keys/key.c index 5c0c7df833f8..0260a1902922 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -693,6 +693,7 @@ error: spin_unlock(&key_serial_lock); return key; } +EXPORT_SYMBOL(key_lookup); /* * Find and lock the specified key type against removal. |