From f3396c58fd8442850e759843457d78b6ec3a9589 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 13 Feb 2015 08:23:09 -0500
Subject: dm crypt: use unbound workqueue for request processing

Use unbound workqueue by default so that work is automatically balanced
between available CPUs.  The original behavior of encrypting using the
same cpu that IO was submitted on can still be enabled by setting the
optional 'same_cpu_crypt' table argument.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 49 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 15 deletions(-)

(limited to 'drivers/md/dm-crypt.c')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 08981be7baa1..5063c901c0f5 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -108,7 +108,7 @@ struct iv_tcw_private {
  * Crypt: maps a linear range of a block device
  * and encrypts / decrypts at the same time.
  */
-enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
+enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, DM_CRYPT_SAME_CPU };
 
 /*
  * The fields in here must be read only after initialization.
@@ -1688,7 +1688,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	char dummy;
 
 	static struct dm_arg _args[] = {
-		{0, 1, "Invalid number of feature args"},
+		{0, 2, "Invalid number of feature args"},
 	};
 
 	if (argc < 5) {
@@ -1788,15 +1788,23 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		if (ret)
 			goto bad;
 
-		opt_string = dm_shift_arg(&as);
+		while (opt_params--) {
+			opt_string = dm_shift_arg(&as);
+			if (!opt_string) {
+				ti->error = "Not enough feature arguments";
+				goto bad;
+			}
 
-		if (opt_params == 1 && opt_string &&
-		    !strcasecmp(opt_string, "allow_discards"))
-			ti->num_discard_bios = 1;
-		else if (opt_params) {
-			ret = -EINVAL;
-			ti->error = "Invalid feature arguments";
-			goto bad;
+			if (!strcasecmp(opt_string, "allow_discards"))
+				ti->num_discard_bios = 1;
+
+			else if (!strcasecmp(opt_string, "same_cpu_crypt"))
+				set_bit(DM_CRYPT_SAME_CPU, &cc->flags);
+
+			else {
+				ti->error = "Invalid feature arguments";
+				goto bad;
+			}
 		}
 	}
 
@@ -1807,8 +1815,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
-	cc->crypt_queue = alloc_workqueue("kcryptd",
-					  WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+	if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
+		cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+	else
+		cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
+						  num_online_cpus());
 	if (!cc->crypt_queue) {
 		ti->error = "Couldn't create kcryptd queue";
 		goto bad;
@@ -1860,6 +1871,7 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
 {
 	struct crypt_config *cc = ti->private;
 	unsigned i, sz = 0;
+	int num_feature_args = 0;
 
 	switch (type) {
 	case STATUSTYPE_INFO:
@@ -1878,8 +1890,15 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
 		DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset,
 				cc->dev->name, (unsigned long long)cc->start);
 
-		if (ti->num_discard_bios)
-			DMEMIT(" 1 allow_discards");
+		num_feature_args += !!ti->num_discard_bios;
+		num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags);
+		if (num_feature_args) {
+			DMEMIT(" %d", num_feature_args);
+			if (ti->num_discard_bios)
+				DMEMIT(" allow_discards");
+			if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
+				DMEMIT(" same_cpu_crypt");
+		}
 
 		break;
 	}
@@ -1976,7 +1995,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
 
 static struct target_type crypt_target = {
 	.name   = "crypt",
-	.version = {1, 13, 0},
+	.version = {1, 14, 0},
 	.module = THIS_MODULE,
 	.ctr    = crypt_ctr,
 	.dtr    = crypt_dtr,
-- 
cgit v1.2.3


From cf2f1abfbd0dba701f7f16ef619e4d2485de3366 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 13 Feb 2015 08:23:52 -0500
Subject: dm crypt: don't allocate pages for a partial request

Change crypt_alloc_buffer so that it only ever allocates pages for a
full request.  This is a prerequisite for the commit "dm crypt: offload
writes to thread".

This change simplifies the dm-crypt code at the expense of reduced
throughput in low memory conditions (where allocation for a partial
request is most useful).

Note: the next commit ("dm crypt: avoid deadlock in mempools") is needed
to fix a theoretical deadlock.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 139 +++++++++++---------------------------------------
 1 file changed, 30 insertions(+), 109 deletions(-)

(limited to 'drivers/md/dm-crypt.c')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 5063c901c0f5..6199245ea6a6 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -58,7 +58,6 @@ struct dm_crypt_io {
 	atomic_t io_pending;
 	int error;
 	sector_t sector;
-	struct dm_crypt_io *base_io;
 } CRYPTO_MINALIGN_ATTR;
 
 struct dm_crypt_request {
@@ -172,7 +171,6 @@ struct crypt_config {
 };
 
 #define MIN_IOS        16
-#define MIN_POOL_PAGES 32
 
 static struct kmem_cache *_crypt_io_pool;
 
@@ -946,14 +944,13 @@ static int crypt_convert(struct crypt_config *cc,
 	return 0;
 }
 
+static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone);
+
 /*
  * Generate a new unfragmented bio with the given size
  * This should never violate the device limitations
- * May return a smaller bio when running out of pages, indicated by
- * *out_of_pages set to 1.
  */
-static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
-				      unsigned *out_of_pages)
+static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size)
 {
 	struct crypt_config *cc = io->cc;
 	struct bio *clone;
@@ -961,41 +958,27 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
 	gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
 	unsigned i, len;
 	struct page *page;
+	struct bio_vec *bvec;
 
 	clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
 	if (!clone)
 		return NULL;
 
 	clone_init(io, clone);
-	*out_of_pages = 0;
 
 	for (i = 0; i < nr_iovecs; i++) {
 		page = mempool_alloc(cc->page_pool, gfp_mask);
-		if (!page) {
-			*out_of_pages = 1;
-			break;
-		}
-
-		/*
-		 * If additional pages cannot be allocated without waiting,
-		 * return a partially-allocated bio.  The caller will then try
-		 * to allocate more bios while submitting this partial bio.
-		 */
-		gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
 
 		len = (size > PAGE_SIZE) ? PAGE_SIZE : size;
 
-		if (!bio_add_page(clone, page, len, 0)) {
-			mempool_free(page, cc->page_pool);
-			break;
-		}
+		bvec = &clone->bi_io_vec[clone->bi_vcnt++];
+		bvec->bv_page = page;
+		bvec->bv_len = len;
+		bvec->bv_offset = 0;
 
-		size -= len;
-	}
+		clone->bi_iter.bi_size += len;
 
-	if (!clone->bi_iter.bi_size) {
-		bio_put(clone);
-		return NULL;
+		size -= len;
 	}
 
 	return clone;
@@ -1020,7 +1003,6 @@ static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc,
 	io->base_bio = bio;
 	io->sector = sector;
 	io->error = 0;
-	io->base_io = NULL;
 	io->ctx.req = NULL;
 	atomic_set(&io->io_pending, 0);
 }
@@ -1033,13 +1015,11 @@ static void crypt_inc_pending(struct dm_crypt_io *io)
 /*
  * One of the bios was finished. Check for completion of
  * the whole request and correctly clean up the buffer.
- * If base_io is set, wait for the last fragment to complete.
  */
 static void crypt_dec_pending(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->cc;
 	struct bio *base_bio = io->base_bio;
-	struct dm_crypt_io *base_io = io->base_io;
 	int error = io->error;
 
 	if (!atomic_dec_and_test(&io->io_pending))
@@ -1050,13 +1030,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
 	if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size))
 		mempool_free(io, cc->io_pool);
 
-	if (likely(!base_io))
-		bio_endio(base_bio, error);
-	else {
-		if (error && !base_io->error)
-			base_io->error = error;
-		crypt_dec_pending(base_io);
-	}
+	bio_endio(base_bio, error);
 }
 
 /*
@@ -1192,10 +1166,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->cc;
 	struct bio *clone;
-	struct dm_crypt_io *new_io;
 	int crypt_finished;
-	unsigned out_of_pages = 0;
-	unsigned remaining = io->base_bio->bi_iter.bi_size;
 	sector_t sector = io->sector;
 	int r;
 
@@ -1205,80 +1176,30 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 	crypt_inc_pending(io);
 	crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, sector);
 
-	/*
-	 * The allocated buffers can be smaller than the whole bio,
-	 * so repeat the whole process until all the data can be handled.
-	 */
-	while (remaining) {
-		clone = crypt_alloc_buffer(io, remaining, &out_of_pages);
-		if (unlikely(!clone)) {
-			io->error = -ENOMEM;
-			break;
-		}
-
-		io->ctx.bio_out = clone;
-		io->ctx.iter_out = clone->bi_iter;
-
-		remaining -= clone->bi_iter.bi_size;
-		sector += bio_sectors(clone);
-
-		crypt_inc_pending(io);
-
-		r = crypt_convert(cc, &io->ctx);
-		if (r < 0)
-			io->error = -EIO;
-
-		crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
-
-		/* Encryption was already finished, submit io now */
-		if (crypt_finished) {
-			kcryptd_crypt_write_io_submit(io, 0);
-
-			/*
-			 * If there was an error, do not try next fragments.
-			 * For async, error is processed in async handler.
-			 */
-			if (unlikely(r < 0))
-				break;
+	clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size);
+	if (unlikely(!clone)) {
+		io->error = -EIO;
+		goto dec;
+	}
 
-			io->sector = sector;
-		}
+	io->ctx.bio_out = clone;
+	io->ctx.iter_out = clone->bi_iter;
 
-		/*
-		 * Out of memory -> run queues
-		 * But don't wait if split was due to the io size restriction
-		 */
-		if (unlikely(out_of_pages))
-			congestion_wait(BLK_RW_ASYNC, HZ/100);
+	sector += bio_sectors(clone);
 
-		/*
-		 * With async crypto it is unsafe to share the crypto context
-		 * between fragments, so switch to a new dm_crypt_io structure.
-		 */
-		if (unlikely(!crypt_finished && remaining)) {
-			new_io = mempool_alloc(cc->io_pool, GFP_NOIO);
-			crypt_io_init(new_io, io->cc, io->base_bio, sector);
-			crypt_inc_pending(new_io);
-			crypt_convert_init(cc, &new_io->ctx, NULL,
-					   io->base_bio, sector);
-			new_io->ctx.iter_in = io->ctx.iter_in;
-
-			/*
-			 * Fragments after the first use the base_io
-			 * pending count.
-			 */
-			if (!io->base_io)
-				new_io->base_io = io;
-			else {
-				new_io->base_io = io->base_io;
-				crypt_inc_pending(io->base_io);
-				crypt_dec_pending(io);
-			}
+	crypt_inc_pending(io);
+	r = crypt_convert(cc, &io->ctx);
+	if (r)
+		io->error = -EIO;
+	crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
 
-			io = new_io;
-		}
+	/* Encryption was already finished, submit io now */
+	if (crypt_finished) {
+		kcryptd_crypt_write_io_submit(io, 0);
+		io->sector = sector;
 	}
 
+dec:
 	crypt_dec_pending(io);
 }
 
@@ -1746,7 +1667,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		      sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size,
 		      ARCH_KMALLOC_MINALIGN);
 
-	cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
+	cc->page_pool = mempool_create_page_pool(BIO_MAX_PAGES, 0);
 	if (!cc->page_pool) {
 		ti->error = "Cannot allocate page mempool";
 		goto bad;
-- 
cgit v1.2.3


From 7145c241a1bf2841952c3e297c4080b357b3e52d Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 13 Feb 2015 08:24:41 -0500
Subject: dm crypt: avoid deadlock in mempools

Fix a theoretical deadlock introduced in the previous commit ("dm crypt:
don't allocate pages for a partial request").

The function crypt_alloc_buffer may be called concurrently.  If we allocate
from the mempool concurrently, there is a possibility of deadlock.  For
example, if we have mempool of 256 pages, two processes, each wanting
256, pages allocate from the mempool concurrently, it may deadlock in a
situation where both processes have allocated 128 pages and the mempool
is exhausted.

To avoid such a scenario we allocate the pages under a mutex.  In order
to not degrade performance with excessive locking, we try non-blocking
allocations without a mutex first and if that fails, we fallback to a
blocking allocations with a mutex.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 41 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 5 deletions(-)

(limited to 'drivers/md/dm-crypt.c')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 6199245ea6a6..fa1dba1d06f7 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -124,6 +124,7 @@ struct crypt_config {
 	mempool_t *req_pool;
 	mempool_t *page_pool;
 	struct bio_set *bs;
+	struct mutex bio_alloc_lock;
 
 	struct workqueue_struct *io_queue;
 	struct workqueue_struct *crypt_queue;
@@ -949,27 +950,51 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone);
 /*
  * Generate a new unfragmented bio with the given size
  * This should never violate the device limitations
+ *
+ * This function may be called concurrently. If we allocate from the mempool
+ * concurrently, there is a possibility of deadlock. For example, if we have
+ * mempool of 256 pages, two processes, each wanting 256, pages allocate from
+ * the mempool concurrently, it may deadlock in a situation where both processes
+ * have allocated 128 pages and the mempool is exhausted.
+ *
+ * In order to avoid this scenario we allocate the pages under a mutex.
+ *
+ * In order to not degrade performance with excessive locking, we try
+ * non-blocking allocations without a mutex first but on failure we fallback
+ * to blocking allocations with a mutex.
  */
 static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size)
 {
 	struct crypt_config *cc = io->cc;
 	struct bio *clone;
 	unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
-	unsigned i, len;
+	gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM;
+	unsigned i, len, remaining_size;
 	struct page *page;
 	struct bio_vec *bvec;
 
+retry:
+	if (unlikely(gfp_mask & __GFP_WAIT))
+		mutex_lock(&cc->bio_alloc_lock);
+
 	clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
 	if (!clone)
-		return NULL;
+		goto return_clone;
 
 	clone_init(io, clone);
 
+	remaining_size = size;
+
 	for (i = 0; i < nr_iovecs; i++) {
 		page = mempool_alloc(cc->page_pool, gfp_mask);
+		if (!page) {
+			crypt_free_buffer_pages(cc, clone);
+			bio_put(clone);
+			gfp_mask |= __GFP_WAIT;
+			goto retry;
+		}
 
-		len = (size > PAGE_SIZE) ? PAGE_SIZE : size;
+		len = (remaining_size > PAGE_SIZE) ? PAGE_SIZE : remaining_size;
 
 		bvec = &clone->bi_io_vec[clone->bi_vcnt++];
 		bvec->bv_page = page;
@@ -978,9 +1003,13 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size)
 
 		clone->bi_iter.bi_size += len;
 
-		size -= len;
+		remaining_size -= len;
 	}
 
+return_clone:
+	if (unlikely(gfp_mask & __GFP_WAIT))
+		mutex_unlock(&cc->bio_alloc_lock);
+
 	return clone;
 }
 
@@ -1679,6 +1708,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
+	mutex_init(&cc->bio_alloc_lock);
+
 	ret = -EINVAL;
 	if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) {
 		ti->error = "Invalid iv_offset sector";
-- 
cgit v1.2.3


From 94f5e0243c48aa01441c987743dc468e2d6eaca2 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 13 Feb 2015 08:25:26 -0500
Subject: dm crypt: remove unused io_pool and _crypt_io_pool

The previous commit ("dm crypt: don't allocate pages for a partial
request") stopped using the io_pool slab mempool and backing
_crypt_io_pool kmem cache.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 24 ++----------------------
 1 file changed, 2 insertions(+), 22 deletions(-)

(limited to 'drivers/md/dm-crypt.c')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index fa1dba1d06f7..c29daf417aaf 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -120,7 +120,6 @@ struct crypt_config {
 	 * pool for per bio private data, crypto requests and
 	 * encryption requeusts/buffer pages
 	 */
-	mempool_t *io_pool;
 	mempool_t *req_pool;
 	mempool_t *page_pool;
 	struct bio_set *bs;
@@ -173,8 +172,6 @@ struct crypt_config {
 
 #define MIN_IOS        16
 
-static struct kmem_cache *_crypt_io_pool;
-
 static void clone_init(struct dm_crypt_io *, struct bio *);
 static void kcryptd_queue_crypt(struct dm_crypt_io *io);
 static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
@@ -1056,8 +1053,6 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
 
 	if (io->ctx.req)
 		crypt_free_req(cc, io->ctx.req, base_bio);
-	if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size))
-		mempool_free(io, cc->io_pool);
 
 	bio_endio(base_bio, error);
 }
@@ -1445,8 +1440,6 @@ static void crypt_dtr(struct dm_target *ti)
 		mempool_destroy(cc->page_pool);
 	if (cc->req_pool)
 		mempool_destroy(cc->req_pool);
-	if (cc->io_pool)
-		mempool_destroy(cc->io_pool);
 
 	if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
 		cc->iv_gen_ops->dtr(cc);
@@ -1660,13 +1653,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	if (ret < 0)
 		goto bad;
 
-	ret = -ENOMEM;
-	cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool);
-	if (!cc->io_pool) {
-		ti->error = "Cannot allocate crypt io mempool";
-		goto bad;
-	}
-
 	cc->dmreq_start = sizeof(struct ablkcipher_request);
 	cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
 	cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request));
@@ -1684,6 +1670,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		iv_size_padding = crypto_ablkcipher_alignmask(any_tfm(cc));
 	}
 
+	ret = -ENOMEM;
 	cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
 			sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size);
 	if (!cc->req_pool) {
@@ -1965,15 +1952,9 @@ static int __init dm_crypt_init(void)
 {
 	int r;
 
-	_crypt_io_pool = KMEM_CACHE(dm_crypt_io, 0);
-	if (!_crypt_io_pool)
-		return -ENOMEM;
-
 	r = dm_register_target(&crypt_target);
-	if (r < 0) {
+	if (r < 0)
 		DMERR("register failed %d", r);
-		kmem_cache_destroy(_crypt_io_pool);
-	}
 
 	return r;
 }
@@ -1981,7 +1962,6 @@ static int __init dm_crypt_init(void)
 static void __exit dm_crypt_exit(void)
 {
 	dm_unregister_target(&crypt_target);
-	kmem_cache_destroy(_crypt_io_pool);
 }
 
 module_init(dm_crypt_init);
-- 
cgit v1.2.3


From dc2676210c425ee8e5cb1bec5bc84d004ddf4179 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 13 Feb 2015 08:25:59 -0500
Subject: dm crypt: offload writes to thread

Submitting write bios directly in the encryption thread caused serious
performance degradation.  On a multiprocessor machine, encryption requests
finish in a different order than they were submitted.  Consequently, write
requests would be submitted in a different order and it could cause severe
performance degradation.

Move the submission of write requests to a separate thread so that the
requests can be sorted before submitting.  But this commit improves
dm-crypt performance even without having dm-crypt perform request
sorting (in particular it enables IO schedulers like CFQ to sort more
effectively).

Note: it is required that a previous commit ("dm crypt: don't allocate
pages for a partial request") be applied before applying this patch.
Otherwise, this commit could introduce a crash.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 114 +++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 94 insertions(+), 20 deletions(-)

(limited to 'drivers/md/dm-crypt.c')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index c29daf417aaf..8c0e36b1d0ed 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include <linux/crypto.h>
 #include <linux/workqueue.h>
+#include <linux/kthread.h>
 #include <linux/backing-dev.h>
 #include <linux/atomic.h>
 #include <linux/scatterlist.h>
@@ -58,6 +59,8 @@ struct dm_crypt_io {
 	atomic_t io_pending;
 	int error;
 	sector_t sector;
+
+	struct list_head list;
 } CRYPTO_MINALIGN_ATTR;
 
 struct dm_crypt_request {
@@ -128,6 +131,10 @@ struct crypt_config {
 	struct workqueue_struct *io_queue;
 	struct workqueue_struct *crypt_queue;
 
+	struct task_struct *write_thread;
+	wait_queue_head_t write_thread_wait;
+	struct list_head write_thread_list;
+
 	char *cipher;
 	char *cipher_string;
 
@@ -1136,37 +1143,89 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
 	return 0;
 }
 
+static void kcryptd_io_read_work(struct work_struct *work)
+{
+	struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
+
+	crypt_inc_pending(io);
+	if (kcryptd_io_read(io, GFP_NOIO))
+		io->error = -ENOMEM;
+	crypt_dec_pending(io);
+}
+
+static void kcryptd_queue_read(struct dm_crypt_io *io)
+{
+	struct crypt_config *cc = io->cc;
+
+	INIT_WORK(&io->work, kcryptd_io_read_work);
+	queue_work(cc->io_queue, &io->work);
+}
+
 static void kcryptd_io_write(struct dm_crypt_io *io)
 {
 	struct bio *clone = io->ctx.bio_out;
+
 	generic_make_request(clone);
 }
 
-static void kcryptd_io(struct work_struct *work)
+static int dmcrypt_write(void *data)
 {
-	struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
+	struct crypt_config *cc = data;
+	while (1) {
+		struct list_head local_list;
+		struct blk_plug plug;
 
-	if (bio_data_dir(io->base_bio) == READ) {
-		crypt_inc_pending(io);
-		if (kcryptd_io_read(io, GFP_NOIO))
-			io->error = -ENOMEM;
-		crypt_dec_pending(io);
-	} else
-		kcryptd_io_write(io);
-}
+		DECLARE_WAITQUEUE(wait, current);
 
-static void kcryptd_queue_io(struct dm_crypt_io *io)
-{
-	struct crypt_config *cc = io->cc;
+		spin_lock_irq(&cc->write_thread_wait.lock);
+continue_locked:
 
-	INIT_WORK(&io->work, kcryptd_io);
-	queue_work(cc->io_queue, &io->work);
+		if (!list_empty(&cc->write_thread_list))
+			goto pop_from_list;
+
+		__set_current_state(TASK_INTERRUPTIBLE);
+		__add_wait_queue(&cc->write_thread_wait, &wait);
+
+		spin_unlock_irq(&cc->write_thread_wait.lock);
+
+		if (unlikely(kthread_should_stop())) {
+			set_task_state(current, TASK_RUNNING);
+			remove_wait_queue(&cc->write_thread_wait, &wait);
+			break;
+		}
+
+		schedule();
+
+		set_task_state(current, TASK_RUNNING);
+		spin_lock_irq(&cc->write_thread_wait.lock);
+		__remove_wait_queue(&cc->write_thread_wait, &wait);
+		goto continue_locked;
+
+pop_from_list:
+		local_list = cc->write_thread_list;
+		local_list.next->prev = &local_list;
+		local_list.prev->next = &local_list;
+		INIT_LIST_HEAD(&cc->write_thread_list);
+
+		spin_unlock_irq(&cc->write_thread_wait.lock);
+
+		blk_start_plug(&plug);
+		do {
+			struct dm_crypt_io *io = container_of(local_list.next,
+						struct dm_crypt_io, list);
+			list_del(&io->list);
+			kcryptd_io_write(io);
+		} while (!list_empty(&local_list));
+		blk_finish_plug(&plug);
+	}
+	return 0;
 }
 
 static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
 {
 	struct bio *clone = io->ctx.bio_out;
 	struct crypt_config *cc = io->cc;
+	unsigned long flags;
 
 	if (unlikely(io->error < 0)) {
 		crypt_free_buffer_pages(cc, clone);
@@ -1180,10 +1239,10 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
 
 	clone->bi_iter.bi_sector = cc->start + io->sector;
 
-	if (async)
-		kcryptd_queue_io(io);
-	else
-		generic_make_request(clone);
+	spin_lock_irqsave(&cc->write_thread_wait.lock, flags);
+	list_add_tail(&io->list, &cc->write_thread_list);
+	wake_up_locked(&cc->write_thread_wait);
+	spin_unlock_irqrestore(&cc->write_thread_wait.lock, flags);
 }
 
 static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
@@ -1426,6 +1485,9 @@ static void crypt_dtr(struct dm_target *ti)
 	if (!cc)
 		return;
 
+	if (cc->write_thread)
+		kthread_stop(cc->write_thread);
+
 	if (cc->io_queue)
 		destroy_workqueue(cc->io_queue);
 	if (cc->crypt_queue)
@@ -1764,6 +1826,18 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
+	init_waitqueue_head(&cc->write_thread_wait);
+	INIT_LIST_HEAD(&cc->write_thread_list);
+
+	cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write");
+	if (IS_ERR(cc->write_thread)) {
+		ret = PTR_ERR(cc->write_thread);
+		cc->write_thread = NULL;
+		ti->error = "Couldn't spawn write thread";
+		goto bad;
+	}
+	wake_up_process(cc->write_thread);
+
 	ti->num_flush_bios = 1;
 	ti->discard_zeroes_data_unsupported = true;
 
@@ -1798,7 +1872,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
 
 	if (bio_data_dir(io->base_bio) == READ) {
 		if (kcryptd_io_read(io, GFP_NOWAIT))
-			kcryptd_queue_io(io);
+			kcryptd_queue_read(io);
 	} else
 		kcryptd_queue_crypt(io);
 
-- 
cgit v1.2.3


From 0f5d8e6ee758f7023e4353cca75d785b2d4f6abe Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 13 Feb 2015 08:27:08 -0500
Subject: dm crypt: add 'submit_from_crypt_cpus' option

Make it possible to disable offloading writes by setting the optional
'submit_from_crypt_cpus' table argument.

There are some situations where offloading write bios from the
encryption threads to a single thread degrades performance
significantly.

The default is to offload write bios to the same thread because it
benefits CFQ to have writes submitted using the same IO context.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 Documentation/device-mapper/dm-crypt.txt | 10 +++++++++-
 drivers/md/dm-crypt.c                    | 16 ++++++++++++++--
 2 files changed, 23 insertions(+), 3 deletions(-)

(limited to 'drivers/md/dm-crypt.c')

diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
index 571f24ffc91c..ad697781f9ac 100644
--- a/Documentation/device-mapper/dm-crypt.txt
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -51,7 +51,7 @@ Parameters: <cipher> <key> <iv_offset> <device path> \
     Otherwise #opt_params is the number of following arguments.
 
     Example of optional parameters section:
-        2 allow_discards same_cpu_crypt
+        3 allow_discards same_cpu_crypt submit_from_crypt_cpus
 
 allow_discards
     Block discard requests (a.k.a. TRIM) are passed through the crypt device.
@@ -68,6 +68,14 @@ same_cpu_crypt
     The default is to use an unbound workqueue so that encryption work
     is automatically balanced between available CPUs.
 
+submit_from_crypt_cpus
+    Disable offloading writes to a separate thread after encryption.
+    There are some situations where offloading write bios from the
+    encryption threads to a single thread degrades performance
+    significantly.  The default is to offload write bios to the same
+    thread because it benefits CFQ to have writes submitted using the
+    same context.
+
 Example scripts
 ===============
 LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 8c0e36b1d0ed..4519a7c0098c 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -110,7 +110,8 @@ struct iv_tcw_private {
  * Crypt: maps a linear range of a block device
  * and encrypts / decrypts at the same time.
  */
-enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, DM_CRYPT_SAME_CPU };
+enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
+	     DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD };
 
 /*
  * The fields in here must be read only after initialization.
@@ -1239,6 +1240,11 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
 
 	clone->bi_iter.bi_sector = cc->start + io->sector;
 
+	if (likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) {
+		generic_make_request(clone);
+		return;
+	}
+
 	spin_lock_irqsave(&cc->write_thread_wait.lock, flags);
 	list_add_tail(&io->list, &cc->write_thread_list);
 	wake_up_locked(&cc->write_thread_wait);
@@ -1693,7 +1699,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	char dummy;
 
 	static struct dm_arg _args[] = {
-		{0, 2, "Invalid number of feature args"},
+		{0, 3, "Invalid number of feature args"},
 	};
 
 	if (argc < 5) {
@@ -1802,6 +1808,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 			else if (!strcasecmp(opt_string, "same_cpu_crypt"))
 				set_bit(DM_CRYPT_SAME_CPU, &cc->flags);
 
+			else if (!strcasecmp(opt_string, "submit_from_crypt_cpus"))
+				set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+
 			else {
 				ti->error = "Invalid feature arguments";
 				goto bad;
@@ -1905,12 +1914,15 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
 
 		num_feature_args += !!ti->num_discard_bios;
 		num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags);
+		num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
 		if (num_feature_args) {
 			DMEMIT(" %d", num_feature_args);
 			if (ti->num_discard_bios)
 				DMEMIT(" allow_discards");
 			if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
 				DMEMIT(" same_cpu_crypt");
+			if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags))
+				DMEMIT(" submit_from_crypt_cpus");
 		}
 
 		break;
-- 
cgit v1.2.3


From b3c5fd3052492f1b8d060799d4f18be5a5438add Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 13 Feb 2015 08:27:41 -0500
Subject: dm crypt: sort writes

Write requests are sorted in a red-black tree structure and are
submitted in the sorted order.

In theory the sorting should be performed by the underlying disk
scheduler, however, in practice the disk scheduler only accepts and
sorts a finite number of requests.  To allow the sorting of all
requests, dm-crypt needs to implement its own sorting.

The overhead associated with rbtree-based sorting is considered
negligible so it is not used conditionally.  Even on SSD sorting can be
beneficial since in-order request dispatch promotes lower latency IO
completion to the upper layers.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 51 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 36 insertions(+), 15 deletions(-)

(limited to 'drivers/md/dm-crypt.c')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4519a7c0098c..713a96237a80 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -22,6 +22,7 @@
 #include <linux/backing-dev.h>
 #include <linux/atomic.h>
 #include <linux/scatterlist.h>
+#include <linux/rbtree.h>
 #include <asm/page.h>
 #include <asm/unaligned.h>
 #include <crypto/hash.h>
@@ -60,7 +61,7 @@ struct dm_crypt_io {
 	int error;
 	sector_t sector;
 
-	struct list_head list;
+	struct rb_node rb_node;
 } CRYPTO_MINALIGN_ATTR;
 
 struct dm_crypt_request {
@@ -134,7 +135,7 @@ struct crypt_config {
 
 	struct task_struct *write_thread;
 	wait_queue_head_t write_thread_wait;
-	struct list_head write_thread_list;
+	struct rb_root write_tree;
 
 	char *cipher;
 	char *cipher_string;
@@ -1169,11 +1170,15 @@ static void kcryptd_io_write(struct dm_crypt_io *io)
 	generic_make_request(clone);
 }
 
+#define crypt_io_from_node(node) rb_entry((node), struct dm_crypt_io, rb_node)
+
 static int dmcrypt_write(void *data)
 {
 	struct crypt_config *cc = data;
+	struct dm_crypt_io *io;
+
 	while (1) {
-		struct list_head local_list;
+		struct rb_root write_tree;
 		struct blk_plug plug;
 
 		DECLARE_WAITQUEUE(wait, current);
@@ -1181,7 +1186,7 @@ static int dmcrypt_write(void *data)
 		spin_lock_irq(&cc->write_thread_wait.lock);
 continue_locked:
 
-		if (!list_empty(&cc->write_thread_list))
+		if (!RB_EMPTY_ROOT(&cc->write_tree))
 			goto pop_from_list;
 
 		__set_current_state(TASK_INTERRUPTIBLE);
@@ -1203,20 +1208,22 @@ continue_locked:
 		goto continue_locked;
 
 pop_from_list:
-		local_list = cc->write_thread_list;
-		local_list.next->prev = &local_list;
-		local_list.prev->next = &local_list;
-		INIT_LIST_HEAD(&cc->write_thread_list);
-
+		write_tree = cc->write_tree;
+		cc->write_tree = RB_ROOT;
 		spin_unlock_irq(&cc->write_thread_wait.lock);
 
+		BUG_ON(rb_parent(write_tree.rb_node));
+
+		/*
+		 * Note: we cannot walk the tree here with rb_next because
+		 * the structures may be freed when kcryptd_io_write is called.
+		 */
 		blk_start_plug(&plug);
 		do {
-			struct dm_crypt_io *io = container_of(local_list.next,
-						struct dm_crypt_io, list);
-			list_del(&io->list);
+			io = crypt_io_from_node(rb_first(&write_tree));
+			rb_erase(&io->rb_node, &write_tree);
 			kcryptd_io_write(io);
-		} while (!list_empty(&local_list));
+		} while (!RB_EMPTY_ROOT(&write_tree));
 		blk_finish_plug(&plug);
 	}
 	return 0;
@@ -1227,6 +1234,8 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
 	struct bio *clone = io->ctx.bio_out;
 	struct crypt_config *cc = io->cc;
 	unsigned long flags;
+	sector_t sector;
+	struct rb_node **rbp, *parent;
 
 	if (unlikely(io->error < 0)) {
 		crypt_free_buffer_pages(cc, clone);
@@ -1246,7 +1255,19 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
 	}
 
 	spin_lock_irqsave(&cc->write_thread_wait.lock, flags);
-	list_add_tail(&io->list, &cc->write_thread_list);
+	rbp = &cc->write_tree.rb_node;
+	parent = NULL;
+	sector = io->sector;
+	while (*rbp) {
+		parent = *rbp;
+		if (sector < crypt_io_from_node(parent)->sector)
+			rbp = &(*rbp)->rb_left;
+		else
+			rbp = &(*rbp)->rb_right;
+	}
+	rb_link_node(&io->rb_node, parent, rbp);
+	rb_insert_color(&io->rb_node, &cc->write_tree);
+
 	wake_up_locked(&cc->write_thread_wait);
 	spin_unlock_irqrestore(&cc->write_thread_wait.lock, flags);
 }
@@ -1836,7 +1857,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	init_waitqueue_head(&cc->write_thread_wait);
-	INIT_LIST_HEAD(&cc->write_thread_list);
+	cc->write_tree = RB_ROOT;
 
 	cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write");
 	if (IS_ERR(cc->write_thread)) {
-- 
cgit v1.2.3