From 297e3d854784821d3b8ff3ae117f20d71f125504 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Mon, 27 Mar 2017 10:51:37 -0700
Subject: blk-throttle: make throtl_slice tunable

throtl_slice is important for blk-throttling. It's called a slice internally, but it is really the time window over which blk-throttling samples data. blk-throttling makes decisions based on these samples; an example is bandwidth measurement, where a cgroup's bandwidth is measured over the time interval of throtl_slice. A small throtl_slice means cgroups have smoother throughput but burn more CPU. The default value is 100ms, which is not appropriate for all disks: a fast SSD can dispatch a lot of IOs in 100ms. This patch makes it tunable. Since throtl_slice isn't a time slice, the sysfs name 'throttle_sample_time' reflects its character better.

Signed-off-by: Shaohua Li
Signed-off-by: Jens Axboe
---
 Documentation/block/queue-sysfs.txt | 6 ++++++
 1 file changed, 6 insertions(+)
(limited to 'Documentation')

diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index c0a3bb5a6e4e..b7f6bdc96d73 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -192,5 +192,11 @@ scaling back writes. Writing a value of '0' to this file disables the feature. Writing a value of '-1' to this file resets the value to the default setting.
+throttle_sample_time (RW)
+-------------------------
+This is the time window over which blk-throttle samples data, in
+milliseconds. blk-throttle makes decisions based on the samples. A lower
+value means cgroups have smoother throughput, but higher CPU overhead.
+This file exists only when CONFIG_BLK_DEV_THROTTLING_LOW is enabled.
 Jens Axboe , February 2009

-- cgit v1.2.3

From 48920ff2a5a940cd07d12cc79e4a2c75f1185aee Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Wed, 5 Apr 2017 19:21:23 +0200
Subject: block: remove the discard_zeroes_data flag

Now that we use the proper REQ_OP_WRITE_ZEROES operation everywhere, we can kill this hack.

Signed-off-by: Christoph Hellwig
Reviewed-by: Martin K. Petersen
Reviewed-by: Hannes Reinecke
Signed-off-by: Jens Axboe
---
 Documentation/ABI/testing/sysfs-block | 10 ++-----
 Documentation/block/queue-sysfs.txt | 5 ----
 block/blk-lib.c | 7 +----
 block/blk-settings.c | 3 ---
 block/blk-sysfs.c | 2 +-
 block/compat_ioctl.c | 2 +-
 block/ioctl.c | 2 +-
 drivers/block/drbd/drbd_main.c | 2 --
 drivers/block/drbd/drbd_nl.c | 7 +----
 drivers/block/loop.c | 2 --
 drivers/block/mtip32xx/mtip32xx.c | 1 -
 drivers/block/nbd.c | 1 -
 drivers/md/dm-cache-target.c | 1 -
 drivers/md/dm-crypt.c | 1 -
 drivers/md/dm-raid.c | 6 ++---
 drivers/md/dm-raid1.c | 1 -
 drivers/md/dm-table.c | 19 -------------
 drivers/md/dm-thin.c | 2 --
 drivers/md/raid5.c | 50 +++++++++++------------------------
 drivers/scsi/sd.c | 5 ----
 drivers/target/target_core_device.c | 2 +-
 include/linux/blkdev.h | 15 -----------
 include/linux/device-mapper.h | 5 ----
 23 files changed, 27 insertions(+), 124 deletions(-)
(limited to 'Documentation')

diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index 2da04ce6aeef..dea212db9df3 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -213,14 +213,8 @@ What: /sys/block/<disk>/queue/discard_zeroes_data Date: May 2011 Contact: Martin K. Petersen Description:
- Devices that support discard functionality may return
- stale or random data when a previously discarded block
- is read back.
This can cause problems if the filesystem - expects discarded blocks to be explicitly cleared. If a - device reports that it deterministically returns zeroes - when a discarded area is read the discard_zeroes_data - parameter will be set to one. Otherwise it will be 0 and - the result of reading a discarded area is undefined. + Will always return 0. Don't rely on any specific behavior + for discards, and don't read this file. What: /sys/block//queue/write_same_max_bytes Date: January 2012 diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index b7f6bdc96d73..2c1e67058fd3 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -43,11 +43,6 @@ large discards are issued, setting this value lower will make Linux issue smaller discards and potentially help reduce latencies induced by large discard operations. -discard_zeroes_data (RO) ------------------------- -When read, this file will show if the discarded block are zeroed by the -device or not. If its value is '1' the blocks are zeroed otherwise not. - hw_sector_size (RO) ------------------- This is the hardware sector size of the device, in bytes. diff --git a/block/blk-lib.c b/block/blk-lib.c index b0c6c4bcf441..e8caecd71688 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -37,17 +37,12 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, return -ENXIO; if (flags & BLKDEV_DISCARD_SECURE) { - if (flags & BLKDEV_DISCARD_ZERO) - return -EOPNOTSUPP; if (!blk_queue_secure_erase(q)) return -EOPNOTSUPP; op = REQ_OP_SECURE_ERASE; } else { if (!blk_queue_discard(q)) return -EOPNOTSUPP; - if ((flags & BLKDEV_DISCARD_ZERO) && - !q->limits.discard_zeroes_data) - return -EOPNOTSUPP; op = REQ_OP_DISCARD; } @@ -126,7 +121,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, &bio); if (!ret && bio) { ret = submit_bio_wait(bio); - if (ret == -EOPNOTSUPP && !(flags & BLKDEV_DISCARD_ZERO)) + if (ret == -EOPNOTSUPP) ret = 0; bio_put(bio); } diff --git a/block/blk-settings.c b/block/blk-settings.c index 1e7174ffc9d4..4fa81ed383ca 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -103,7 +103,6 @@ void blk_set_default_limits(struct queue_limits *lim) lim->discard_granularity = 0; lim->discard_alignment = 0; lim->discard_misaligned = 0; - lim->discard_zeroes_data = 0; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); lim->alignment_offset = 0; @@ -127,7 +126,6 @@ void blk_set_stacking_limits(struct queue_limits *lim) blk_set_default_limits(lim); /* Inherit limits from component devices */ - lim->discard_zeroes_data = 1; lim->max_segments = USHRT_MAX; lim->max_discard_segments = 1; lim->max_hw_sectors = UINT_MAX; @@ -609,7 +607,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->io_opt = lcm_not_zero(t->io_opt, b->io_opt); t->cluster &= b->cluster; - t->discard_zeroes_data &= b->discard_zeroes_data; /* Physical block size a multiple of the logical block size? 
*/ if (t->physical_block_size & (t->logical_block_size - 1)) { diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index c47db43a40cc..fc20489f0d2b 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -208,7 +208,7 @@ static ssize_t queue_discard_max_store(struct request_queue *q, static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) { - return queue_var_show(queue_discard_zeroes_data(q), page); + return queue_var_show(0, page); } static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 570021a0dc1c..04325b81c2b4 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -685,7 +685,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKALIGNOFF: return compat_put_int(arg, bdev_alignment_offset(bdev)); case BLKDISCARDZEROES: - return compat_put_uint(arg, bdev_discard_zeroes_data(bdev)); + return compat_put_uint(arg, 0); case BLKFLSBUF: case BLKROSET: case BLKDISCARD: diff --git a/block/ioctl.c b/block/ioctl.c index 8ea00a41be01..0de02ee67eed 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -547,7 +547,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKALIGNOFF: return put_int(arg, bdev_alignment_offset(bdev)); case BLKDISCARDZEROES: - return put_uint(arg, bdev_discard_zeroes_data(bdev)); + return put_uint(arg, 0); case BLKSECTGET: max_sectors = min_t(unsigned int, USHRT_MAX, queue_max_sectors(bdev_get_queue(bdev))); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 8e62d9f65510..84455c365f57 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -931,7 +931,6 @@ void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct r p->qlim->io_min = cpu_to_be32(queue_io_min(q)); p->qlim->io_opt = cpu_to_be32(queue_io_opt(q)); p->qlim->discard_enabled = blk_queue_discard(q); - p->qlim->discard_zeroes_data = queue_discard_zeroes_data(q); p->qlim->write_same_capable = !!q->limits.max_write_same_sectors; } else { q = device->rq_queue; @@ -941,7 +940,6 @@ void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct r p->qlim->io_min = cpu_to_be32(queue_io_min(q)); p->qlim->io_opt = cpu_to_be32(queue_io_opt(q)); p->qlim->discard_enabled = 0; - p->qlim->discard_zeroes_data = 0; p->qlim->write_same_capable = 0; } } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index e4516d3b971d..02255a0d68b9 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1199,10 +1199,6 @@ static void decide_on_discard_support(struct drbd_device *device, struct drbd_connection *connection = first_peer_device(device)->connection; bool can_do = b ? 
blk_queue_discard(b) : true; - if (can_do && b && !b->limits.discard_zeroes_data && !discard_zeroes_if_aligned) { - can_do = false; - drbd_info(device, "discard_zeroes_data=0 and discard_zeroes_if_aligned=no: disabling discards\n"); - } if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) { can_do = false; drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n"); @@ -1484,8 +1480,7 @@ static void sanitize_disk_conf(struct drbd_device *device, struct disk_conf *dis if (disk_conf->al_extents > drbd_al_extents_max(nbc)) disk_conf->al_extents = drbd_al_extents_max(nbc); - if (!blk_queue_discard(q) - || (!q->limits.discard_zeroes_data && !disk_conf->discard_zeroes_if_aligned)) { + if (!blk_queue_discard(q)) { if (disk_conf->rs_discard_granularity) { disk_conf->rs_discard_granularity = 0; /* disable feature */ drbd_info(device, "rs_discard_granularity feature disabled\n"); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 3bb04c1a4ba1..3081d83d2ea3 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -828,7 +828,6 @@ static void loop_config_discard(struct loop_device *lo) q->limits.discard_alignment = 0; blk_queue_max_discard_sectors(q, 0); blk_queue_max_write_zeroes_sectors(q, 0); - q->limits.discard_zeroes_data = 0; queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); return; } @@ -837,7 +836,6 @@ static void loop_config_discard(struct loop_device *lo) q->limits.discard_alignment = 0; blk_queue_max_discard_sectors(q, UINT_MAX >> 9); blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9); - q->limits.discard_zeroes_data = 1; queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); } diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 30076e7753bc..05e3e664ea1b 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -4025,7 +4025,6 @@ skip_create_disk: dd->queue->limits.discard_granularity = 4096; blk_queue_max_discard_sectors(dd->queue, MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES); - dd->queue->limits.discard_zeroes_data = 0; } /* Set the capacity of the device in 512 byte sectors. 
*/ diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 03ae72985c79..b02f2362fdf7 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1110,7 +1110,6 @@ static int nbd_dev_add(int index) queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue); disk->queue->limits.discard_granularity = 512; blk_queue_max_discard_sectors(disk->queue, UINT_MAX); - disk->queue->limits.discard_zeroes_data = 0; blk_queue_max_hw_sectors(disk->queue, 65536); disk->queue->limits.max_sectors = 256; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 9c689b34e6e7..975922c8f231 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2773,7 +2773,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) ti->num_discard_bios = 1; ti->discards_supported = true; - ti->discard_zeroes_data_unsupported = true; ti->split_discard_bios = false; cache->features = ca->features; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 389a3637ffcc..ef1d836bd81b 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2030,7 +2030,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) wake_up_process(cc->write_thread); ti->num_flush_bios = 1; - ti->discard_zeroes_data_unsupported = true; return 0; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index f8564d63982f..468f1380de1d 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2813,7 +2813,9 @@ static void configure_discard_support(struct raid_set *rs) /* Assume discards not supported until after checks below. */ ti->discards_supported = false; - /* RAID level 4,5,6 require discard_zeroes_data for data integrity! */ + /* + * XXX: RAID level 4,5,6 require zeroing for safety. + */ raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6); for (i = 0; i < rs->raid_disks; i++) { @@ -2827,8 +2829,6 @@ static void configure_discard_support(struct raid_set *rs) return; if (raid456) { - if (!q->limits.discard_zeroes_data) - return; if (!devices_handle_discard_safely) { DMERR("raid456 discard support disabled due to discard_zeroes_data uncertainty."); DMERR("Set dm-raid.devices_handle_discard_safely=Y to override."); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 2ddc2d20e62d..a95cbb80fb34 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1124,7 +1124,6 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->per_io_data_size = sizeof(struct dm_raid1_bio_record); - ti->discard_zeroes_data_unsupported = true; ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0); if (!ms->kmirrord_wq) { diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 5cd665c91ead..958275aca008 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1449,22 +1449,6 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) return false; } -static bool dm_table_discard_zeroes_data(struct dm_table *t) -{ - struct dm_target *ti; - unsigned i = 0; - - /* Ensure that all targets supports discard_zeroes_data. 
*/ - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); - - if (ti->discard_zeroes_data_unsupported) - return false; - } - - return true; -} - static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1620,9 +1604,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } blk_queue_write_cache(q, wc, fua); - if (!dm_table_discard_zeroes_data(t)) - q->limits.discard_zeroes_data = 0; - /* Ensure that all underlying devices are non-rotational. */ if (dm_table_all_devices_attribute(t, device_is_nonrot)) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 2b266a2b5035..a5f1916f621a 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3263,7 +3263,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) * them down to the data device. The thin device's discard * processing will cause mappings to be removed from the btree. */ - ti->discard_zeroes_data_unsupported = true; if (pf.discard_enabled && pf.discard_passdown) { ti->num_discard_bios = 1; @@ -4119,7 +4118,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->per_io_data_size = sizeof(struct dm_thin_endio_hook); /* In case the pool supports discards, pass them on. */ - ti->discard_zeroes_data_unsupported = true; if (tc->pool->pf.discard_enabled) { ti->discards_supported = true; ti->num_discard_bios = 1; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1725a54042bb..2efdb0d67460 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7227,7 +7227,6 @@ static int raid5_run(struct mddev *mddev) if (mddev->queue) { int chunk_size; - bool discard_supported = true; /* read-ahead size must cover two whole stripes, which * is 2 * (datadisks) * chunksize where 'n' is the * number of raid devices @@ -7263,12 +7262,6 @@ static int raid5_run(struct mddev *mddev) blk_queue_max_discard_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS); - /* - * unaligned part of discard request will be ignored, so can't - * guarantee discard_zeroes_data - */ - mddev->queue->limits.discard_zeroes_data = 0; - blk_queue_max_write_same_sectors(mddev->queue, 0); blk_queue_max_write_zeroes_sectors(mddev->queue, 0); @@ -7277,35 +7270,24 @@ static int raid5_run(struct mddev *mddev) rdev->data_offset << 9); disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->new_data_offset << 9); - /* - * discard_zeroes_data is required, otherwise data - * could be lost. Consider a scenario: discard a stripe - * (the stripe could be inconsistent if - * discard_zeroes_data is 0); write one disk of the - * stripe (the stripe could be inconsistent again - * depending on which disks are used to calculate - * parity); the disk is broken; The stripe data of this - * disk is lost. - */ - if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || - !bdev_get_queue(rdev->bdev)-> - limits.discard_zeroes_data) - discard_supported = false; - /* Unfortunately, discard_zeroes_data is not currently - * a guarantee - just a hint. So we only allow DISCARD - * if the sysadmin has confirmed that only safe devices - * are in use by setting a module parameter. 
- */ - if (!devices_handle_discard_safely) { - if (discard_supported) { - pr_info("md/raid456: discard support disabled due to uncertainty.\n"); - pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n"); - } - discard_supported = false; - } } - if (discard_supported && + /* + * zeroing is required, otherwise data + * could be lost. Consider a scenario: discard a stripe + * (the stripe could be inconsistent if + * discard_zeroes_data is 0); write one disk of the + * stripe (the stripe could be inconsistent again + * depending on which disks are used to calculate + * parity); the disk is broken; The stripe data of this + * disk is lost. + * + * We only allow DISCARD if the sysadmin has confirmed that + * only safe devices are in use by setting a module parameter. + * A better idea might be to turn DISCARD into WRITE_ZEROES + * requests, as that is required to be safe. + */ + if (devices_handle_discard_safely && mddev->queue->limits.max_discard_sectors >= (stripe >> 9) && mddev->queue->limits.discard_granularity >= stripe) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 001593ed0444..bcb0cb020fd2 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -644,8 +644,6 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) unsigned int logical_block_size = sdkp->device->sector_size; unsigned int max_blocks = 0; - q->limits.discard_zeroes_data = 0; - /* * When LBPRZ is reported, discard alignment and granularity * must be fixed to the logical block size. Otherwise the block @@ -681,19 +679,16 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) case SD_LBP_WS16: max_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS16_BLOCKS); - q->limits.discard_zeroes_data = sdkp->lbprz; break; case SD_LBP_WS10: max_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS10_BLOCKS); - q->limits.discard_zeroes_data = sdkp->lbprz; break; case SD_LBP_ZERO: max_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS10_BLOCKS); - q->limits.discard_zeroes_data = 1; break; } diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index c754ae33bf7b..d2f089cfa9ae 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -851,7 +851,7 @@ bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib, attrib->unmap_granularity = q->limits.discard_granularity / block_size; attrib->unmap_granularity_alignment = q->limits.discard_alignment / block_size; - attrib->unmap_zeroes_data = q->limits.discard_zeroes_data; + attrib->unmap_zeroes_data = 0; return true; } EXPORT_SYMBOL(target_configure_unmap_from_queue); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 21a30f011674..ec993573e0a8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -339,7 +339,6 @@ struct queue_limits { unsigned char misaligned; unsigned char discard_misaligned; unsigned char cluster; - unsigned char discard_zeroes_data; unsigned char raid_partial_stripes_expensive; enum blk_zoned_model zoned; }; @@ -1341,7 +1340,6 @@ extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page); #define BLKDEV_DISCARD_SECURE (1 << 0) /* issue a secure erase */ -#define BLKDEV_DISCARD_ZERO (1 << 1) /* must reliably zero data */ extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); @@ -1541,19 +1539,6 
@@ static inline int bdev_discard_alignment(struct block_device *bdev) return q->limits.discard_alignment; } -static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) -{ - if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1) - return 1; - - return 0; -} - -static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev) -{ - return queue_discard_zeroes_data(bdev_get_queue(bdev)); -} - static inline unsigned int bdev_write_same(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 3829bee2302a..c7ea33e38fb9 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -296,11 +296,6 @@ struct dm_target { * on max_io_len boundary. */ bool split_discard_bios:1; - - /* - * Set if this target does not return zeroes on discarded blocks. - */ - bool discard_zeroes_data_unsupported:1; }; /* Each target can link one of these into the table */ -- cgit v1.2.3 From 84253394927c4352652d0b118ad9583f5646959b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Apr 2017 13:28:46 +0200 Subject: remove the mg_disk driver This drivers was added in 2008, but as far as a I can tell we never had a single platform that actually registered resources for the platform driver. It's also been unmaintained for a long time and apparently has a ATA mode that can be driven using the IDE/libata subsystem. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- Documentation/blockdev/mflash.txt | 84 --- drivers/block/Kconfig | 17 - drivers/block/Makefile | 1 - drivers/block/mg_disk.c | 1110 ------------------------------------- include/linux/mg_disk.h | 45 -- 5 files changed, 1257 deletions(-) delete mode 100644 Documentation/blockdev/mflash.txt delete mode 100644 drivers/block/mg_disk.c delete mode 100644 include/linux/mg_disk.h (limited to 'Documentation') diff --git a/Documentation/blockdev/mflash.txt b/Documentation/blockdev/mflash.txt deleted file mode 100644 index f7e050551487..000000000000 --- a/Documentation/blockdev/mflash.txt +++ /dev/null @@ -1,84 +0,0 @@ -This document describes m[g]flash support in linux. - -Contents - 1. Overview - 2. Reserved area configuration - 3. Example of mflash platform driver registration - -1. Overview - -Mflash and gflash are embedded flash drive. The only difference is mflash is -MCP(Multi Chip Package) device. These two device operate exactly same way. -So the rest mflash repersents mflash and gflash altogether. - -Internally, mflash has nand flash and other hardware logics and supports -2 different operation (ATA, IO) modes. ATA mode doesn't need any new -driver and currently works well under standard IDE subsystem. Actually it's -one chip SSD. IO mode is ATA-like custom mode for the host that doesn't have -IDE interface. - -Following are brief descriptions about IO mode. -A. IO mode based on ATA protocol and uses some custom command. (read confirm, -write confirm) -B. IO mode uses SRAM bus interface. -C. IO mode supports 4kB boot area, so host can boot from mflash. - -2. Reserved area configuration -If host boot from mflash, usually needs raw area for boot loader image. All of -the mflash's block device operation will be taken this value as start offset. -Note that boot loader's size of reserved area and kernel configuration value -must be same. - -3. Example of mflash platform driver registration -Working mflash is very straight forward. 
Adding platform device stuff to board -configuration file is all. Here is some pseudo example. - -static struct mg_drv_data mflash_drv_data = { - /* If you want to polling driver set to 1 */ - .use_polling = 0, - /* device attribution */ - .dev_attr = MG_BOOT_DEV -}; - -static struct resource mg_mflash_rsc[] = { - /* Base address of mflash */ - [0] = { - .start = 0x08000000, - .end = 0x08000000 + SZ_64K - 1, - .flags = IORESOURCE_MEM - }, - /* mflash interrupt pin */ - [1] = { - .start = IRQ_GPIO(84), - .end = IRQ_GPIO(84), - .flags = IORESOURCE_IRQ - }, - /* mflash reset pin */ - [2] = { - .start = 43, - .end = 43, - .name = MG_RST_PIN, - .flags = IORESOURCE_IO - }, - /* mflash reset-out pin - * If you use mflash as storage device (i.e. other than MG_BOOT_DEV), - * should assign this */ - [3] = { - .start = 51, - .end = 51, - .name = MG_RSTOUT_PIN, - .flags = IORESOURCE_IO - } -}; - -static struct platform_device mflash_dev = { - .name = MG_DEV_NAME, - .id = -1, - .dev = { - .platform_data = &mflash_drv_data, - }, - .num_resources = ARRAY_SIZE(mg_mflash_rsc), - .resource = mg_mflash_rsc -}; - -platform_device_register(&mflash_dev); diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index a1c2e816128f..ebe8c1a6195e 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -434,23 +434,6 @@ config ATA_OVER_ETH This driver provides Support for ATA over Ethernet block devices like the Coraid EtherDrive (R) Storage Blade. -config MG_DISK - tristate "mGine mflash, gflash support" - depends on ARM && GPIOLIB - help - mGine mFlash(gFlash) block device driver - -config MG_DISK_RES - int "Size of reserved area before MBR" - depends on MG_DISK - default 0 - help - Define size of reserved area that usually used for boot. Unit is KB. - All of the block device operation will be taken this value as start - offset - Examples: - 1024 => 1 MB - config SUNVDC tristate "Sun Virtual Disk Client support" depends on SUN_LDOMS diff --git a/drivers/block/Makefile b/drivers/block/Makefile index b12c772bbeb3..5ceead8b52d7 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -19,7 +19,6 @@ obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o obj-$(CONFIG_BLK_DEV_DAC960) += DAC960.o obj-$(CONFIG_XILINX_SYSACE) += xsysace.o obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o -obj-$(CONFIG_MG_DISK) += mg_disk.o obj-$(CONFIG_SUNVDC) += sunvdc.o obj-$(CONFIG_BLK_DEV_SKD) += skd.o obj-$(CONFIG_BLK_DEV_OSD) += osdblk.o diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c deleted file mode 100644 index e88e7b06c616..000000000000 --- a/drivers/block/mg_disk.c +++ /dev/null @@ -1,1110 +0,0 @@ -/* - * drivers/block/mg_disk.c - * - * Support for the mGine m[g]flash IO mode. - * Based on legacy hd.c - * - * (c) 2008 mGine Co.,LTD - * (c) 2008 unsik Kim - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MG_RES_SEC (CONFIG_MG_DISK_RES << 1) - -/* name for block device */ -#define MG_DISK_NAME "mgd" - -#define MG_DISK_MAJ 0 -#define MG_DISK_MAX_PART 16 -#define MG_SECTOR_SIZE 512 -#define MG_MAX_SECTS 256 - -/* Register offsets */ -#define MG_BUFF_OFFSET 0x8000 -#define MG_REG_OFFSET 0xC000 -#define MG_REG_FEATURE (MG_REG_OFFSET + 2) /* write case */ -#define MG_REG_ERROR (MG_REG_OFFSET + 2) /* read case */ -#define MG_REG_SECT_CNT (MG_REG_OFFSET + 4) -#define MG_REG_SECT_NUM (MG_REG_OFFSET + 6) -#define MG_REG_CYL_LOW (MG_REG_OFFSET + 8) -#define MG_REG_CYL_HIGH (MG_REG_OFFSET + 0xA) -#define MG_REG_DRV_HEAD (MG_REG_OFFSET + 0xC) -#define MG_REG_COMMAND (MG_REG_OFFSET + 0xE) /* write case */ -#define MG_REG_STATUS (MG_REG_OFFSET + 0xE) /* read case */ -#define MG_REG_DRV_CTRL (MG_REG_OFFSET + 0x10) -#define MG_REG_BURST_CTRL (MG_REG_OFFSET + 0x12) - -/* handy status */ -#define MG_STAT_READY (ATA_DRDY | ATA_DSC) -#define MG_READY_OK(s) (((s) & (MG_STAT_READY | (ATA_BUSY | ATA_DF | \ - ATA_ERR))) == MG_STAT_READY) - -/* error code for others */ -#define MG_ERR_NONE 0 -#define MG_ERR_TIMEOUT 0x100 -#define MG_ERR_INIT_STAT 0x101 -#define MG_ERR_TRANSLATION 0x102 -#define MG_ERR_CTRL_RST 0x103 -#define MG_ERR_INV_STAT 0x104 -#define MG_ERR_RSTOUT 0x105 - -#define MG_MAX_ERRORS 6 /* Max read/write errors */ - -/* command */ -#define MG_CMD_RD 0x20 -#define MG_CMD_WR 0x30 -#define MG_CMD_SLEEP 0x99 -#define MG_CMD_WAKEUP 0xC3 -#define MG_CMD_ID 0xEC -#define MG_CMD_WR_CONF 0x3C -#define MG_CMD_RD_CONF 0x40 - -/* operation mode */ -#define MG_OP_CASCADE (1 << 0) -#define MG_OP_CASCADE_SYNC_RD (1 << 1) -#define MG_OP_CASCADE_SYNC_WR (1 << 2) -#define MG_OP_INTERLEAVE (1 << 3) - -/* synchronous */ -#define MG_BURST_LAT_4 (3 << 4) -#define MG_BURST_LAT_5 (4 << 4) -#define MG_BURST_LAT_6 (5 << 4) -#define MG_BURST_LAT_7 (6 << 4) -#define MG_BURST_LAT_8 (7 << 4) -#define MG_BURST_LEN_4 (1 << 1) -#define MG_BURST_LEN_8 (2 << 1) -#define MG_BURST_LEN_16 (3 << 1) -#define MG_BURST_LEN_32 (4 << 1) -#define MG_BURST_LEN_CONT (0 << 1) - -/* timeout value (unit: ms) */ -#define MG_TMAX_CONF_TO_CMD 1 -#define MG_TMAX_WAIT_RD_DRQ 10 -#define MG_TMAX_WAIT_WR_DRQ 500 -#define MG_TMAX_RST_TO_BUSY 10 -#define MG_TMAX_HDRST_TO_RDY 500 -#define MG_TMAX_SWRST_TO_RDY 500 -#define MG_TMAX_RSTOUT 3000 - -#define MG_DEV_MASK (MG_BOOT_DEV | MG_STORAGE_DEV | MG_STORAGE_DEV_SKIP_RST) - -/* main structure for mflash driver */ -struct mg_host { - struct device *dev; - - struct request_queue *breq; - struct request *req; - spinlock_t lock; - struct gendisk *gd; - - struct timer_list timer; - void (*mg_do_intr) (struct mg_host *); - - u16 id[ATA_ID_WORDS]; - - u16 cyls; - u16 heads; - u16 sectors; - u32 n_sectors; - u32 nres_sectors; - - void __iomem *dev_base; - unsigned int irq; - unsigned int rst; - unsigned int rstout; - - u32 major; - u32 error; -}; - -/* - * Debugging macro and defines - */ -#undef DO_MG_DEBUG -#ifdef DO_MG_DEBUG -# define MG_DBG(fmt, args...) \ - printk(KERN_DEBUG "%s:%d "fmt, __func__, __LINE__, ##args) -#else /* CONFIG_MG_DEBUG */ -# define MG_DBG(fmt, args...) 
do { } while (0) -#endif /* CONFIG_MG_DEBUG */ - -static void mg_request(struct request_queue *); - -static bool mg_end_request(struct mg_host *host, int err, unsigned int nr_bytes) -{ - if (__blk_end_request(host->req, err, nr_bytes)) - return true; - - host->req = NULL; - return false; -} - -static bool mg_end_request_cur(struct mg_host *host, int err) -{ - return mg_end_request(host, err, blk_rq_cur_bytes(host->req)); -} - -static void mg_dump_status(const char *msg, unsigned int stat, - struct mg_host *host) -{ - char *name = MG_DISK_NAME; - - if (host->req) - name = host->req->rq_disk->disk_name; - - printk(KERN_ERR "%s: %s: status=0x%02x { ", name, msg, stat & 0xff); - if (stat & ATA_BUSY) - printk("Busy "); - if (stat & ATA_DRDY) - printk("DriveReady "); - if (stat & ATA_DF) - printk("WriteFault "); - if (stat & ATA_DSC) - printk("SeekComplete "); - if (stat & ATA_DRQ) - printk("DataRequest "); - if (stat & ATA_CORR) - printk("CorrectedError "); - if (stat & ATA_ERR) - printk("Error "); - printk("}\n"); - if ((stat & ATA_ERR) == 0) { - host->error = 0; - } else { - host->error = inb((unsigned long)host->dev_base + MG_REG_ERROR); - printk(KERN_ERR "%s: %s: error=0x%02x { ", name, msg, - host->error & 0xff); - if (host->error & ATA_BBK) - printk("BadSector "); - if (host->error & ATA_UNC) - printk("UncorrectableError "); - if (host->error & ATA_IDNF) - printk("SectorIdNotFound "); - if (host->error & ATA_ABORTED) - printk("DriveStatusError "); - if (host->error & ATA_AMNF) - printk("AddrMarkNotFound "); - printk("}"); - if (host->error & (ATA_BBK | ATA_UNC | ATA_IDNF | ATA_AMNF)) { - if (host->req) - printk(", sector=%u", - (unsigned int)blk_rq_pos(host->req)); - } - printk("\n"); - } -} - -static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec) -{ - u8 status; - unsigned long expire, cur_jiffies; - struct mg_drv_data *prv_data = host->dev->platform_data; - - host->error = MG_ERR_NONE; - expire = jiffies + msecs_to_jiffies(msec); - - /* These 2 times dummy status read prevents reading invalid - * status. A very little time (3 times of mflash operating clk) - * is required for busy bit is set. Use dummy read instead of - * busy wait, because mflash's PLL is machine dependent. - */ - if (prv_data->use_polling) { - status = inb((unsigned long)host->dev_base + MG_REG_STATUS); - status = inb((unsigned long)host->dev_base + MG_REG_STATUS); - } - - status = inb((unsigned long)host->dev_base + MG_REG_STATUS); - - do { - cur_jiffies = jiffies; - if (status & ATA_BUSY) { - if (expect == ATA_BUSY) - break; - } else { - /* Check the error condition! 
*/ - if (status & ATA_ERR) { - mg_dump_status("mg_wait", status, host); - break; - } - - if (expect == MG_STAT_READY) - if (MG_READY_OK(status)) - break; - - if (expect == ATA_DRQ) - if (status & ATA_DRQ) - break; - } - if (!msec) { - mg_dump_status("not ready", status, host); - return MG_ERR_INV_STAT; - } - - status = inb((unsigned long)host->dev_base + MG_REG_STATUS); - } while (time_before(cur_jiffies, expire)); - - if (time_after_eq(cur_jiffies, expire) && msec) - host->error = MG_ERR_TIMEOUT; - - return host->error; -} - -static unsigned int mg_wait_rstout(u32 rstout, u32 msec) -{ - unsigned long expire; - - expire = jiffies + msecs_to_jiffies(msec); - while (time_before(jiffies, expire)) { - if (gpio_get_value(rstout) == 1) - return MG_ERR_NONE; - msleep(10); - } - - return MG_ERR_RSTOUT; -} - -static void mg_unexpected_intr(struct mg_host *host) -{ - u32 status = inb((unsigned long)host->dev_base + MG_REG_STATUS); - - mg_dump_status("mg_unexpected_intr", status, host); -} - -static irqreturn_t mg_irq(int irq, void *dev_id) -{ - struct mg_host *host = dev_id; - void (*handler)(struct mg_host *) = host->mg_do_intr; - - spin_lock(&host->lock); - - host->mg_do_intr = NULL; - del_timer(&host->timer); - if (!handler) - handler = mg_unexpected_intr; - handler(host); - - spin_unlock(&host->lock); - - return IRQ_HANDLED; -} - -/* local copy of ata_id_string() */ -static void mg_id_string(const u16 *id, unsigned char *s, - unsigned int ofs, unsigned int len) -{ - unsigned int c; - - BUG_ON(len & 1); - - while (len > 0) { - c = id[ofs] >> 8; - *s = c; - s++; - - c = id[ofs] & 0xff; - *s = c; - s++; - - ofs++; - len -= 2; - } -} - -/* local copy of ata_id_c_string() */ -static void mg_id_c_string(const u16 *id, unsigned char *s, - unsigned int ofs, unsigned int len) -{ - unsigned char *p; - - mg_id_string(id, s, ofs, len - 1); - - p = s + strnlen(s, len - 1); - while (p > s && p[-1] == ' ') - p--; - *p = '\0'; -} - -static int mg_get_disk_id(struct mg_host *host) -{ - u32 i; - s32 err; - const u16 *id = host->id; - struct mg_drv_data *prv_data = host->dev->platform_data; - char fwrev[ATA_ID_FW_REV_LEN + 1]; - char model[ATA_ID_PROD_LEN + 1]; - char serial[ATA_ID_SERNO_LEN + 1]; - - if (!prv_data->use_polling) - outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); - - outb(MG_CMD_ID, (unsigned long)host->dev_base + MG_REG_COMMAND); - err = mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_RD_DRQ); - if (err) - return err; - - for (i = 0; i < (MG_SECTOR_SIZE >> 1); i++) - host->id[i] = le16_to_cpu(inw((unsigned long)host->dev_base + - MG_BUFF_OFFSET + i * 2)); - - outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); - err = mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD); - if (err) - return err; - - if ((id[ATA_ID_FIELD_VALID] & 1) == 0) - return MG_ERR_TRANSLATION; - - host->n_sectors = ata_id_u32(id, ATA_ID_LBA_CAPACITY); - host->cyls = id[ATA_ID_CYLS]; - host->heads = id[ATA_ID_HEADS]; - host->sectors = id[ATA_ID_SECTORS]; - - if (MG_RES_SEC && host->heads && host->sectors) { - /* modify cyls, n_sectors */ - host->cyls = (host->n_sectors - MG_RES_SEC) / - host->heads / host->sectors; - host->nres_sectors = host->n_sectors - host->cyls * - host->heads * host->sectors; - host->n_sectors -= host->nres_sectors; - } - - mg_id_c_string(id, fwrev, ATA_ID_FW_REV, sizeof(fwrev)); - mg_id_c_string(id, model, ATA_ID_PROD, sizeof(model)); - mg_id_c_string(id, serial, ATA_ID_SERNO, sizeof(serial)); - printk(KERN_INFO "mg_disk: model: %s\n", model); - printk(KERN_INFO "mg_disk: firm: 
%.8s\n", fwrev); - printk(KERN_INFO "mg_disk: serial: %s\n", serial); - printk(KERN_INFO "mg_disk: %d + reserved %d sectors\n", - host->n_sectors, host->nres_sectors); - - if (!prv_data->use_polling) - outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); - - return err; -} - - -static int mg_disk_init(struct mg_host *host) -{ - struct mg_drv_data *prv_data = host->dev->platform_data; - s32 err; - u8 init_status; - - /* hdd rst low */ - gpio_set_value(host->rst, 0); - err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY); - if (err) - return err; - - /* hdd rst high */ - gpio_set_value(host->rst, 1); - err = mg_wait(host, MG_STAT_READY, MG_TMAX_HDRST_TO_RDY); - if (err) - return err; - - /* soft reset on */ - outb(ATA_SRST | (prv_data->use_polling ? ATA_NIEN : 0), - (unsigned long)host->dev_base + MG_REG_DRV_CTRL); - err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY); - if (err) - return err; - - /* soft reset off */ - outb(prv_data->use_polling ? ATA_NIEN : 0, - (unsigned long)host->dev_base + MG_REG_DRV_CTRL); - err = mg_wait(host, MG_STAT_READY, MG_TMAX_SWRST_TO_RDY); - if (err) - return err; - - init_status = inb((unsigned long)host->dev_base + MG_REG_STATUS) & 0xf; - - if (init_status == 0xf) - return MG_ERR_INIT_STAT; - - return err; -} - -static void mg_bad_rw_intr(struct mg_host *host) -{ - if (host->req) - if (++host->req->errors >= MG_MAX_ERRORS || - host->error == MG_ERR_TIMEOUT) - mg_end_request_cur(host, -EIO); -} - -static unsigned int mg_out(struct mg_host *host, - unsigned int sect_num, - unsigned int sect_cnt, - unsigned int cmd, - void (*intr_addr)(struct mg_host *)) -{ - struct mg_drv_data *prv_data = host->dev->platform_data; - - if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) - return host->error; - - if (!prv_data->use_polling) { - host->mg_do_intr = intr_addr; - mod_timer(&host->timer, jiffies + 3 * HZ); - } - if (MG_RES_SEC) - sect_num += MG_RES_SEC; - outb((u8)sect_cnt, (unsigned long)host->dev_base + MG_REG_SECT_CNT); - outb((u8)sect_num, (unsigned long)host->dev_base + MG_REG_SECT_NUM); - outb((u8)(sect_num >> 8), (unsigned long)host->dev_base + - MG_REG_CYL_LOW); - outb((u8)(sect_num >> 16), (unsigned long)host->dev_base + - MG_REG_CYL_HIGH); - outb((u8)((sect_num >> 24) | ATA_LBA | ATA_DEVICE_OBS), - (unsigned long)host->dev_base + MG_REG_DRV_HEAD); - outb(cmd, (unsigned long)host->dev_base + MG_REG_COMMAND); - return MG_ERR_NONE; -} - -static void mg_read_one(struct mg_host *host, struct request *req) -{ - u16 *buff = (u16 *)bio_data(req->bio); - u32 i; - - for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) - *buff++ = inw((unsigned long)host->dev_base + MG_BUFF_OFFSET + - (i << 1)); -} - -static void mg_read(struct request *req) -{ - struct mg_host *host = req->rq_disk->private_data; - - if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req), - MG_CMD_RD, NULL) != MG_ERR_NONE) - mg_bad_rw_intr(host); - - MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", - blk_rq_sectors(req), blk_rq_pos(req), bio_data(req->bio)); - - do { - if (mg_wait(host, ATA_DRQ, - MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) { - mg_bad_rw_intr(host); - return; - } - - mg_read_one(host, req); - - outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + - MG_REG_COMMAND); - } while (mg_end_request(host, 0, MG_SECTOR_SIZE)); -} - -static void mg_write_one(struct mg_host *host, struct request *req) -{ - u16 *buff = (u16 *)bio_data(req->bio); - u32 i; - - for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) - outw(*buff++, (unsigned long)host->dev_base + MG_BUFF_OFFSET + - (i << 1)); -} - -static void 
mg_write(struct request *req) -{ - struct mg_host *host = req->rq_disk->private_data; - unsigned int rem = blk_rq_sectors(req); - - if (mg_out(host, blk_rq_pos(req), rem, - MG_CMD_WR, NULL) != MG_ERR_NONE) { - mg_bad_rw_intr(host); - return; - } - - MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", - rem, blk_rq_pos(req), bio_data(req->bio)); - - if (mg_wait(host, ATA_DRQ, - MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { - mg_bad_rw_intr(host); - return; - } - - do { - mg_write_one(host, req); - - outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + - MG_REG_COMMAND); - - rem--; - if (rem > 1 && mg_wait(host, ATA_DRQ, - MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { - mg_bad_rw_intr(host); - return; - } else if (mg_wait(host, MG_STAT_READY, - MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { - mg_bad_rw_intr(host); - return; - } - } while (mg_end_request(host, 0, MG_SECTOR_SIZE)); -} - -static void mg_read_intr(struct mg_host *host) -{ - struct request *req = host->req; - u32 i; - - /* check status */ - do { - i = inb((unsigned long)host->dev_base + MG_REG_STATUS); - if (i & ATA_BUSY) - break; - if (!MG_READY_OK(i)) - break; - if (i & ATA_DRQ) - goto ok_to_read; - } while (0); - mg_dump_status("mg_read_intr", i, host); - mg_bad_rw_intr(host); - mg_request(host->breq); - return; - -ok_to_read: - mg_read_one(host, req); - - MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", - blk_rq_pos(req), blk_rq_sectors(req) - 1, bio_data(req->bio)); - - /* send read confirm */ - outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); - - if (mg_end_request(host, 0, MG_SECTOR_SIZE)) { - /* set handler if read remains */ - host->mg_do_intr = mg_read_intr; - mod_timer(&host->timer, jiffies + 3 * HZ); - } else /* goto next request */ - mg_request(host->breq); -} - -static void mg_write_intr(struct mg_host *host) -{ - struct request *req = host->req; - u32 i; - bool rem; - - /* check status */ - do { - i = inb((unsigned long)host->dev_base + MG_REG_STATUS); - if (i & ATA_BUSY) - break; - if (!MG_READY_OK(i)) - break; - if ((blk_rq_sectors(req) <= 1) || (i & ATA_DRQ)) - goto ok_to_write; - } while (0); - mg_dump_status("mg_write_intr", i, host); - mg_bad_rw_intr(host); - mg_request(host->breq); - return; - -ok_to_write: - if ((rem = mg_end_request(host, 0, MG_SECTOR_SIZE))) { - /* write 1 sector and set handler if remains */ - mg_write_one(host, req); - MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", - blk_rq_pos(req), blk_rq_sectors(req), bio_data(req->bio)); - host->mg_do_intr = mg_write_intr; - mod_timer(&host->timer, jiffies + 3 * HZ); - } - - /* send write confirm */ - outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); - - if (!rem) - mg_request(host->breq); -} - -static void mg_times_out(unsigned long data) -{ - struct mg_host *host = (struct mg_host *)data; - char *name; - - spin_lock_irq(&host->lock); - - if (!host->req) - goto out_unlock; - - host->mg_do_intr = NULL; - - name = host->req->rq_disk->disk_name; - printk(KERN_DEBUG "%s: timeout\n", name); - - host->error = MG_ERR_TIMEOUT; - mg_bad_rw_intr(host); - -out_unlock: - mg_request(host->breq); - spin_unlock_irq(&host->lock); -} - -static void mg_request_poll(struct request_queue *q) -{ - struct mg_host *host = q->queuedata; - - while (1) { - if (!host->req) { - host->req = blk_fetch_request(q); - if (!host->req) - break; - } - - switch (req_op(host->req)) { - case REQ_OP_READ: - mg_read(host->req); - break; - case REQ_OP_WRITE: - mg_write(host->req); - break; - default: - mg_end_request_cur(host, -EIO); - break; - } - } -} - 
-static unsigned int mg_issue_req(struct request *req, - struct mg_host *host, - unsigned int sect_num, - unsigned int sect_cnt) -{ - switch (req_op(host->req)) { - case REQ_OP_READ: - if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr) - != MG_ERR_NONE) { - mg_bad_rw_intr(host); - return host->error; - } - break; - case REQ_OP_WRITE: - /* TODO : handler */ - outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); - if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr) - != MG_ERR_NONE) { - mg_bad_rw_intr(host); - return host->error; - } - del_timer(&host->timer); - mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ); - outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); - if (host->error) { - mg_bad_rw_intr(host); - return host->error; - } - mg_write_one(host, req); - mod_timer(&host->timer, jiffies + 3 * HZ); - outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + - MG_REG_COMMAND); - break; - default: - mg_end_request_cur(host, -EIO); - break; - } - return MG_ERR_NONE; -} - -/* This function also called from IRQ context */ -static void mg_request(struct request_queue *q) -{ - struct mg_host *host = q->queuedata; - struct request *req; - u32 sect_num, sect_cnt; - - while (1) { - if (!host->req) { - host->req = blk_fetch_request(q); - if (!host->req) - break; - } - req = host->req; - - /* check unwanted request call */ - if (host->mg_do_intr) - return; - - del_timer(&host->timer); - - sect_num = blk_rq_pos(req); - /* deal whole segments */ - sect_cnt = blk_rq_sectors(req); - - /* sanity check */ - if (sect_num >= get_capacity(req->rq_disk) || - ((sect_num + sect_cnt) > - get_capacity(req->rq_disk))) { - printk(KERN_WARNING - "%s: bad access: sector=%d, count=%d\n", - req->rq_disk->disk_name, - sect_num, sect_cnt); - mg_end_request_cur(host, -EIO); - continue; - } - - if (!mg_issue_req(req, host, sect_num, sect_cnt)) - return; - } -} - -static int mg_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - struct mg_host *host = bdev->bd_disk->private_data; - - geo->cylinders = (unsigned short)host->cyls; - geo->heads = (unsigned char)host->heads; - geo->sectors = (unsigned char)host->sectors; - return 0; -} - -static const struct block_device_operations mg_disk_ops = { - .getgeo = mg_getgeo -}; - -#ifdef CONFIG_PM_SLEEP -static int mg_suspend(struct device *dev) -{ - struct mg_drv_data *prv_data = dev->platform_data; - struct mg_host *host = prv_data->host; - - if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) - return -EIO; - - if (!prv_data->use_polling) - outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); - - outb(MG_CMD_SLEEP, (unsigned long)host->dev_base + MG_REG_COMMAND); - /* wait until mflash deep sleep */ - msleep(1); - - if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) { - if (!prv_data->use_polling) - outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); - return -EIO; - } - - return 0; -} - -static int mg_resume(struct device *dev) -{ - struct mg_drv_data *prv_data = dev->platform_data; - struct mg_host *host = prv_data->host; - - if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) - return -EIO; - - outb(MG_CMD_WAKEUP, (unsigned long)host->dev_base + MG_REG_COMMAND); - /* wait until mflash wakeup */ - msleep(1); - - if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) - return -EIO; - - if (!prv_data->use_polling) - outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); - - return 0; -} -#endif - -static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume); - -static int mg_probe(struct platform_device 
*plat_dev) -{ - struct mg_host *host; - struct resource *rsc; - struct mg_drv_data *prv_data = plat_dev->dev.platform_data; - int err = 0; - - if (!prv_data) { - printk(KERN_ERR "%s:%d fail (no driver_data)\n", - __func__, __LINE__); - err = -EINVAL; - goto probe_err; - } - - /* alloc mg_host */ - host = kzalloc(sizeof(struct mg_host), GFP_KERNEL); - if (!host) { - printk(KERN_ERR "%s:%d fail (no memory for mg_host)\n", - __func__, __LINE__); - err = -ENOMEM; - goto probe_err; - } - host->major = MG_DISK_MAJ; - - /* link each other */ - prv_data->host = host; - host->dev = &plat_dev->dev; - - /* io remap */ - rsc = platform_get_resource(plat_dev, IORESOURCE_MEM, 0); - if (!rsc) { - printk(KERN_ERR "%s:%d platform_get_resource fail\n", - __func__, __LINE__); - err = -EINVAL; - goto probe_err_2; - } - host->dev_base = ioremap(rsc->start, resource_size(rsc)); - if (!host->dev_base) { - printk(KERN_ERR "%s:%d ioremap fail\n", - __func__, __LINE__); - err = -EIO; - goto probe_err_2; - } - MG_DBG("dev_base = 0x%x\n", (u32)host->dev_base); - - /* get reset pin */ - rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO, - MG_RST_PIN); - if (!rsc) { - printk(KERN_ERR "%s:%d get reset pin fail\n", - __func__, __LINE__); - err = -EIO; - goto probe_err_3; - } - host->rst = rsc->start; - - /* init rst pin */ - err = gpio_request(host->rst, MG_RST_PIN); - if (err) - goto probe_err_3; - gpio_direction_output(host->rst, 1); - - /* reset out pin */ - if (!(prv_data->dev_attr & MG_DEV_MASK)) { - err = -EINVAL; - goto probe_err_3a; - } - - if (prv_data->dev_attr != MG_BOOT_DEV) { - rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO, - MG_RSTOUT_PIN); - if (!rsc) { - printk(KERN_ERR "%s:%d get reset-out pin fail\n", - __func__, __LINE__); - err = -EIO; - goto probe_err_3a; - } - host->rstout = rsc->start; - err = gpio_request(host->rstout, MG_RSTOUT_PIN); - if (err) - goto probe_err_3a; - gpio_direction_input(host->rstout); - } - - /* disk reset */ - if (prv_data->dev_attr == MG_STORAGE_DEV) { - /* If POR seq. 
not yet finished, wait */ - err = mg_wait_rstout(host->rstout, MG_TMAX_RSTOUT); - if (err) - goto probe_err_3b; - err = mg_disk_init(host); - if (err) { - printk(KERN_ERR "%s:%d fail (err code : %d)\n", - __func__, __LINE__, err); - err = -EIO; - goto probe_err_3b; - } - } - - /* get irq resource */ - if (!prv_data->use_polling) { - host->irq = platform_get_irq(plat_dev, 0); - if (host->irq == -ENXIO) { - err = host->irq; - goto probe_err_3b; - } - err = request_irq(host->irq, mg_irq, - IRQF_TRIGGER_RISING, - MG_DEV_NAME, host); - if (err) { - printk(KERN_ERR "%s:%d fail (request_irq err=%d)\n", - __func__, __LINE__, err); - goto probe_err_3b; - } - - } - - /* get disk id */ - err = mg_get_disk_id(host); - if (err) { - printk(KERN_ERR "%s:%d fail (err code : %d)\n", - __func__, __LINE__, err); - err = -EIO; - goto probe_err_4; - } - - err = register_blkdev(host->major, MG_DISK_NAME); - if (err < 0) { - printk(KERN_ERR "%s:%d register_blkdev fail (err code : %d)\n", - __func__, __LINE__, err); - goto probe_err_4; - } - if (!host->major) - host->major = err; - - spin_lock_init(&host->lock); - - if (prv_data->use_polling) - host->breq = blk_init_queue(mg_request_poll, &host->lock); - else - host->breq = blk_init_queue(mg_request, &host->lock); - - if (!host->breq) { - err = -ENOMEM; - printk(KERN_ERR "%s:%d (blk_init_queue) fail\n", - __func__, __LINE__); - goto probe_err_5; - } - host->breq->queuedata = host; - - /* mflash is random device, thanx for the noop */ - err = elevator_change(host->breq, "noop"); - if (err) { - printk(KERN_ERR "%s:%d (elevator_init) fail\n", - __func__, __LINE__); - goto probe_err_6; - } - blk_queue_max_hw_sectors(host->breq, MG_MAX_SECTS); - blk_queue_logical_block_size(host->breq, MG_SECTOR_SIZE); - - setup_timer(&host->timer, mg_times_out, (unsigned long)host); - - host->gd = alloc_disk(MG_DISK_MAX_PART); - if (!host->gd) { - printk(KERN_ERR "%s:%d (alloc_disk) fail\n", - __func__, __LINE__); - err = -ENOMEM; - goto probe_err_7; - } - host->gd->major = host->major; - host->gd->first_minor = 0; - host->gd->fops = &mg_disk_ops; - host->gd->queue = host->breq; - host->gd->private_data = host; - sprintf(host->gd->disk_name, MG_DISK_NAME"a"); - - set_capacity(host->gd, host->n_sectors); - - add_disk(host->gd); - - return err; - -probe_err_7: - del_timer_sync(&host->timer); -probe_err_6: - blk_cleanup_queue(host->breq); -probe_err_5: - unregister_blkdev(host->major, MG_DISK_NAME); -probe_err_4: - if (!prv_data->use_polling) - free_irq(host->irq, host); -probe_err_3b: - gpio_free(host->rstout); -probe_err_3a: - gpio_free(host->rst); -probe_err_3: - iounmap(host->dev_base); -probe_err_2: - kfree(host); -probe_err: - return err; -} - -static int mg_remove(struct platform_device *plat_dev) -{ - struct mg_drv_data *prv_data = plat_dev->dev.platform_data; - struct mg_host *host = prv_data->host; - int err = 0; - - /* delete timer */ - del_timer_sync(&host->timer); - - /* remove disk */ - if (host->gd) { - del_gendisk(host->gd); - put_disk(host->gd); - } - /* remove queue */ - if (host->breq) - blk_cleanup_queue(host->breq); - - /* unregister blk device */ - unregister_blkdev(host->major, MG_DISK_NAME); - - /* free irq */ - if (!prv_data->use_polling) - free_irq(host->irq, host); - - /* free reset-out pin */ - if (prv_data->dev_attr != MG_BOOT_DEV) - gpio_free(host->rstout); - - /* free rst pin */ - if (host->rst) - gpio_free(host->rst); - - /* unmap io */ - if (host->dev_base) - iounmap(host->dev_base); - - /* free mg_host */ - kfree(host); - - return err; -} - -static 
struct platform_driver mg_disk_driver = { - .probe = mg_probe, - .remove = mg_remove, - .driver = { - .name = MG_DEV_NAME, - .pm = &mg_pm, - } -}; - -/**************************************************************************** - * - * Module stuff - * - ****************************************************************************/ - -static int __init mg_init(void) -{ - printk(KERN_INFO "mGine mflash driver, (c) 2008 mGine Co.\n"); - return platform_driver_register(&mg_disk_driver); -} - -static void __exit mg_exit(void) -{ - printk(KERN_INFO "mflash driver : bye bye\n"); - platform_driver_unregister(&mg_disk_driver); -} - -module_init(mg_init); -module_exit(mg_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("unsik Kim "); -MODULE_DESCRIPTION("mGine m[g]flash device driver"); diff --git a/include/linux/mg_disk.h b/include/linux/mg_disk.h deleted file mode 100644 index e11f4d9f1c2e..000000000000 --- a/include/linux/mg_disk.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * include/linux/mg_disk.c - * - * Private data for mflash platform driver - * - * (c) 2008 mGine Co.,LTD - * (c) 2008 unsik Kim - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef __MG_DISK_H__ -#define __MG_DISK_H__ - -/* name for platform device */ -#define MG_DEV_NAME "mg_disk" - -/* names of GPIO resource */ -#define MG_RST_PIN "mg_rst" -/* except MG_BOOT_DEV, reset-out pin should be assigned */ -#define MG_RSTOUT_PIN "mg_rstout" - -/* device attribution */ -/* use mflash as boot device */ -#define MG_BOOT_DEV (1 << 0) -/* use mflash as storage device */ -#define MG_STORAGE_DEV (1 << 1) -/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */ -#define MG_STORAGE_DEV_SKIP_RST (1 << 2) - -/* private driver data */ -struct mg_drv_data { - /* disk resource */ - u32 use_polling; - - /* device attribution */ - u32 dev_attr; - - /* internally used */ - void *host; -}; - -#endif -- cgit v1.2.3 From 00e043936e9a1c274c29366c7ecd9e17c79418e6 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 14 Apr 2017 01:00:02 -0700 Subject: blk-mq: introduce Kyber multiqueue I/O scheduler The Kyber I/O scheduler is an I/O scheduler for fast devices designed to scale to multiple queues. Users configure only two knobs, the target read and synchronous write latencies, and the scheduler tunes itself to achieve that latency goal. The implementation is based on "tokens", built on top of the scalable bitmap library. Tokens serve as a mechanism for limiting requests. There are two tiers of tokens: queueing tokens and dispatch tokens. A queueing token is required to allocate a request. In fact, these tokens are actually the blk-mq internal scheduler tags, but the scheduler manages the allocation directly in order to implement its policy. Dispatch tokens are device-wide and split up into two scheduling domains: reads vs. writes. Each hardware queue dispatches batches round-robin between the scheduling domains as long as tokens are available for that domain. These tokens can be used as the mechanism to enable various policies. The policy Kyber uses is inspired by active queue management techniques for network routing, similar to blk-wbt. The scheduler monitors latencies and scales the number of dispatch tokens accordingly. Queueing tokens are used to prevent starvation of synchronous requests by asynchronous requests. 
Various extensions are possible, including better heuristics and ionice support. The new scheduler isn't set as the default yet. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- Documentation/block/kyber-iosched.txt | 14 + block/Kconfig.iosched | 9 + block/Makefile | 1 + block/kyber-iosched.c | 719 ++++++++++++++++++++++++++++++++++ 4 files changed, 743 insertions(+) create mode 100644 Documentation/block/kyber-iosched.txt create mode 100644 block/kyber-iosched.c (limited to 'Documentation') diff --git a/Documentation/block/kyber-iosched.txt b/Documentation/block/kyber-iosched.txt new file mode 100644 index 000000000000..e94feacd7edc --- /dev/null +++ b/Documentation/block/kyber-iosched.txt @@ -0,0 +1,14 @@ +Kyber I/O scheduler tunables +=========================== + +The only two tunables for the Kyber scheduler are the target latencies for +reads and synchronous writes. Kyber will throttle requests in order to meet +these target latencies. + +read_lat_nsec +------------- +Target latency for reads (in nanoseconds). + +write_lat_nsec +-------------- +Target latency for synchronous writes (in nanoseconds). diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 58fc8684788d..916e69c68fa4 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -69,6 +69,15 @@ config MQ_IOSCHED_DEADLINE ---help--- MQ version of the deadline IO scheduler. +config MQ_IOSCHED_KYBER + tristate "Kyber I/O scheduler" + default y + ---help--- + The Kyber I/O scheduler is a low-overhead scheduler suitable for + multiqueue and other fast devices. Given target latencies for reads and + synchronous writes, it will self-tune queue depths to achieve that + goal. + endmenu endif diff --git a/block/Makefile b/block/Makefile index 081bb680789b..6146d2eaaeaa 100644 --- a/block/Makefile +++ b/block/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o +obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c new file mode 100644 index 000000000000..fe4af5b97c0e --- /dev/null +++ b/block/kyber-iosched.c @@ -0,0 +1,719 @@ +/* + * The Kyber I/O scheduler. Controls latency by throttling queue depths using + * scalable techniques. + * + * Copyright (C) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include +#include +#include + +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-sched.h" +#include "blk-mq-tag.h" +#include "blk-stat.h" + +/* Scheduling domains. */ +enum { + KYBER_READ, + KYBER_SYNC_WRITE, + KYBER_OTHER, /* Async writes, discard, etc. 
*/ + KYBER_NUM_DOMAINS, +}; + +enum { + KYBER_MIN_DEPTH = 256, + + /* + * In order to prevent starvation of synchronous requests by a flood of + * asynchronous requests, we reserve 25% of requests for synchronous + * operations. + */ + KYBER_ASYNC_PERCENT = 75, +}; + +/* + * Initial device-wide depths for each scheduling domain. + * + * Even for fast devices with lots of tags like NVMe, you can saturate + * the device with only a fraction of the maximum possible queue depth. + * So, we cap these to a reasonable value. + */ +static const unsigned int kyber_depth[] = { + [KYBER_READ] = 256, + [KYBER_SYNC_WRITE] = 128, + [KYBER_OTHER] = 64, +}; + +/* + * Scheduling domain batch sizes. We favor reads. + */ +static const unsigned int kyber_batch_size[] = { + [KYBER_READ] = 16, + [KYBER_SYNC_WRITE] = 8, + [KYBER_OTHER] = 8, +}; + +struct kyber_queue_data { + struct request_queue *q; + + struct blk_stat_callback *cb; + + /* + * The device is divided into multiple scheduling domains based on the + * request type. Each domain has a fixed number of in-flight requests of + * that type device-wide, limited by these tokens. + */ + struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS]; + + /* + * Async request percentage, converted to per-word depth for + * sbitmap_get_shallow(). + */ + unsigned int async_depth; + + /* Target latencies in nanoseconds. */ + u64 read_lat_nsec, write_lat_nsec; +}; + +struct kyber_hctx_data { + spinlock_t lock; + struct list_head rqs[KYBER_NUM_DOMAINS]; + unsigned int cur_domain; + unsigned int batching; + wait_queue_t domain_wait[KYBER_NUM_DOMAINS]; + atomic_t wait_index[KYBER_NUM_DOMAINS]; +}; + +static unsigned int rq_sched_domain(const struct request *rq) +{ + unsigned int op = rq->cmd_flags; + + if ((op & REQ_OP_MASK) == REQ_OP_READ) + return KYBER_READ; + else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op)) + return KYBER_SYNC_WRITE; + else + return KYBER_OTHER; +} + +enum { + NONE = 0, + GOOD = 1, + GREAT = 2, + BAD = -1, + AWFUL = -2, +}; + +#define IS_GOOD(status) ((status) > 0) +#define IS_BAD(status) ((status) < 0) + +static int kyber_lat_status(struct blk_stat_callback *cb, + unsigned int sched_domain, u64 target) +{ + u64 latency; + + if (!cb->stat[sched_domain].nr_samples) + return NONE; + + latency = cb->stat[sched_domain].mean; + if (latency >= 2 * target) + return AWFUL; + else if (latency > target) + return BAD; + else if (latency <= target / 2) + return GREAT; + else /* (latency <= target) */ + return GOOD; +} + +/* + * Adjust the read or synchronous write depth given the status of reads and + * writes. The goal is that the latencies of the two domains are fair (i.e., if + * one is good, then the other is good). + */ +static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd, + unsigned int sched_domain, int this_status, + int other_status) +{ + unsigned int orig_depth, depth; + + /* + * If this domain had no samples, or reads and writes are both good or + * both bad, don't adjust the depth. 
+ */ + if (this_status == NONE || + (IS_GOOD(this_status) && IS_GOOD(other_status)) || + (IS_BAD(this_status) && IS_BAD(other_status))) + return; + + orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth; + + if (other_status == NONE) { + depth++; + } else { + switch (this_status) { + case GOOD: + if (other_status == AWFUL) + depth -= max(depth / 4, 1U); + else + depth -= max(depth / 8, 1U); + break; + case GREAT: + if (other_status == AWFUL) + depth /= 2; + else + depth -= max(depth / 4, 1U); + break; + case BAD: + depth++; + break; + case AWFUL: + if (other_status == GREAT) + depth += 2; + else + depth++; + break; + } + } + + depth = clamp(depth, 1U, kyber_depth[sched_domain]); + if (depth != orig_depth) + sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth); +} + +/* + * Adjust the depth of other requests given the status of reads and synchronous + * writes. As long as either domain is doing fine, we don't throttle, but if + * both domains are doing badly, we throttle heavily. + */ +static void kyber_adjust_other_depth(struct kyber_queue_data *kqd, + int read_status, int write_status, + bool have_samples) +{ + unsigned int orig_depth, depth; + int status; + + orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth; + + if (read_status == NONE && write_status == NONE) { + depth += 2; + } else if (have_samples) { + if (read_status == NONE) + status = write_status; + else if (write_status == NONE) + status = read_status; + else + status = max(read_status, write_status); + switch (status) { + case GREAT: + depth += 2; + break; + case GOOD: + depth++; + break; + case BAD: + depth -= max(depth / 4, 1U); + break; + case AWFUL: + depth /= 2; + break; + } + } + + depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]); + if (depth != orig_depth) + sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth); +} + +/* + * Apply heuristics for limiting queue depths based on gathered latency + * statistics. + */ +static void kyber_stat_timer_fn(struct blk_stat_callback *cb) +{ + struct kyber_queue_data *kqd = cb->data; + int read_status, write_status; + + read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec); + write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec); + + kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status); + kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status); + kyber_adjust_other_depth(kqd, read_status, write_status, + cb->stat[KYBER_OTHER].nr_samples != 0); + + /* + * Continue monitoring latencies if we aren't hitting the targets or + * we're still throttling other requests. + */ + if (!blk_stat_is_active(kqd->cb) && + ((IS_BAD(read_status) || IS_BAD(write_status) || + kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER]))) + blk_stat_activate_msecs(kqd->cb, 100); +} + +static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd) +{ + /* + * All of the hardware queues have the same depth, so we can just grab + * the shift of the first one. 
+ */ + return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift; +} + +static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) +{ + struct kyber_queue_data *kqd; + unsigned int max_tokens; + unsigned int shift; + int ret = -ENOMEM; + int i; + + kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node); + if (!kqd) + goto err; + kqd->q = q; + + kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain, + KYBER_NUM_DOMAINS, kqd); + if (!kqd->cb) + goto err_kqd; + + /* + * The maximum number of tokens for any scheduling domain is at least + * the queue depth of a single hardware queue. If the hardware doesn't + * have many tags, still provide a reasonable number. + */ + max_tokens = max_t(unsigned int, q->tag_set->queue_depth, + KYBER_MIN_DEPTH); + for (i = 0; i < KYBER_NUM_DOMAINS; i++) { + WARN_ON(!kyber_depth[i]); + WARN_ON(!kyber_batch_size[i]); + ret = sbitmap_queue_init_node(&kqd->domain_tokens[i], + max_tokens, -1, false, GFP_KERNEL, + q->node); + if (ret) { + while (--i >= 0) + sbitmap_queue_free(&kqd->domain_tokens[i]); + goto err_cb; + } + sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]); + } + + shift = kyber_sched_tags_shift(kqd); + kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; + + kqd->read_lat_nsec = 2000000ULL; + kqd->write_lat_nsec = 10000000ULL; + + return kqd; + +err_cb: + blk_stat_free_callback(kqd->cb); +err_kqd: + kfree(kqd); +err: + return ERR_PTR(ret); +} + +static int kyber_init_sched(struct request_queue *q, struct elevator_type *e) +{ + struct kyber_queue_data *kqd; + struct elevator_queue *eq; + + eq = elevator_alloc(q, e); + if (!eq) + return -ENOMEM; + + kqd = kyber_queue_data_alloc(q); + if (IS_ERR(kqd)) { + kobject_put(&eq->kobj); + return PTR_ERR(kqd); + } + + eq->elevator_data = kqd; + q->elevator = eq; + + blk_stat_add_callback(q, kqd->cb); + + return 0; +} + +static void kyber_exit_sched(struct elevator_queue *e) +{ + struct kyber_queue_data *kqd = e->elevator_data; + struct request_queue *q = kqd->q; + int i; + + blk_stat_remove_callback(q, kqd->cb); + + for (i = 0; i < KYBER_NUM_DOMAINS; i++) + sbitmap_queue_free(&kqd->domain_tokens[i]); + blk_stat_free_callback(kqd->cb); + kfree(kqd); +} + +static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + struct kyber_hctx_data *khd; + int i; + + khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node); + if (!khd) + return -ENOMEM; + + spin_lock_init(&khd->lock); + + for (i = 0; i < KYBER_NUM_DOMAINS; i++) { + INIT_LIST_HEAD(&khd->rqs[i]); + INIT_LIST_HEAD(&khd->domain_wait[i].task_list); + atomic_set(&khd->wait_index[i], 0); + } + + khd->cur_domain = 0; + khd->batching = 0; + + hctx->sched_data = khd; + + return 0; +} + +static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + kfree(hctx->sched_data); +} + +static int rq_get_domain_token(struct request *rq) +{ + return (long)rq->elv.priv[0]; +} + +static void rq_set_domain_token(struct request *rq, int token) +{ + rq->elv.priv[0] = (void *)(long)token; +} + +static void rq_clear_domain_token(struct kyber_queue_data *kqd, + struct request *rq) +{ + unsigned int sched_domain; + int nr; + + nr = rq_get_domain_token(rq); + if (nr != -1) { + sched_domain = rq_sched_domain(rq); + sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr, + rq->mq_ctx->cpu); + } +} + +static struct request *kyber_get_request(struct request_queue *q, + unsigned int op, + struct blk_mq_alloc_data *data) +{ + struct kyber_queue_data *kqd = 
q->elevator->elevator_data; + struct request *rq; + + /* + * We use the scheduler tags as per-hardware queue queueing tokens. + * Async requests can be limited at this stage. + */ + if (!op_is_sync(op)) + data->shallow_depth = kqd->async_depth; + + rq = __blk_mq_alloc_request(data, op); + if (rq) + rq_set_domain_token(rq, -1); + return rq; +} + +static void kyber_put_request(struct request *rq) +{ + struct request_queue *q = rq->q; + struct kyber_queue_data *kqd = q->elevator->elevator_data; + + rq_clear_domain_token(kqd, rq); + blk_mq_finish_request(rq); +} + +static void kyber_completed_request(struct request *rq) +{ + struct request_queue *q = rq->q; + struct kyber_queue_data *kqd = q->elevator->elevator_data; + unsigned int sched_domain; + u64 now, latency, target; + + /* + * Check if this request met our latency goal. If not, quickly gather + * some statistics and start throttling. + */ + sched_domain = rq_sched_domain(rq); + switch (sched_domain) { + case KYBER_READ: + target = kqd->read_lat_nsec; + break; + case KYBER_SYNC_WRITE: + target = kqd->write_lat_nsec; + break; + default: + return; + } + + /* If we are already monitoring latencies, don't check again. */ + if (blk_stat_is_active(kqd->cb)) + return; + + now = __blk_stat_time(ktime_to_ns(ktime_get())); + if (now < blk_stat_time(&rq->issue_stat)) + return; + + latency = now - blk_stat_time(&rq->issue_stat); + + if (latency > target) + blk_stat_activate_msecs(kqd->cb, 10); +} + +static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd, + struct blk_mq_hw_ctx *hctx) +{ + LIST_HEAD(rq_list); + struct request *rq, *next; + + blk_mq_flush_busy_ctxs(hctx, &rq_list); + list_for_each_entry_safe(rq, next, &rq_list, queuelist) { + unsigned int sched_domain; + + sched_domain = rq_sched_domain(rq); + list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]); + } +} + +static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags, + void *key) +{ + struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private); + + list_del_init(&wait->task_list); + blk_mq_run_hw_queue(hctx, true); + return 1; +} + +static int kyber_get_domain_token(struct kyber_queue_data *kqd, + struct kyber_hctx_data *khd, + struct blk_mq_hw_ctx *hctx) +{ + unsigned int sched_domain = khd->cur_domain; + struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain]; + wait_queue_t *wait = &khd->domain_wait[sched_domain]; + struct sbq_wait_state *ws; + int nr; + + nr = __sbitmap_queue_get(domain_tokens); + if (nr >= 0) + return nr; + + /* + * If we failed to get a domain token, make sure the hardware queue is + * run when one becomes available. Note that this is serialized on + * khd->lock, but we still need to be careful about the waker. + */ + if (list_empty_careful(&wait->task_list)) { + init_waitqueue_func_entry(wait, kyber_domain_wake); + wait->private = hctx; + ws = sbq_wait_ptr(domain_tokens, + &khd->wait_index[sched_domain]); + add_wait_queue(&ws->wait, wait); + + /* + * Try again in case a token was freed before we got on the wait + * queue. 
+ */ + nr = __sbitmap_queue_get(domain_tokens); + } + return nr; +} + +static struct request * +kyber_dispatch_cur_domain(struct kyber_queue_data *kqd, + struct kyber_hctx_data *khd, + struct blk_mq_hw_ctx *hctx, + bool *flushed) +{ + struct list_head *rqs; + struct request *rq; + int nr; + + rqs = &khd->rqs[khd->cur_domain]; + rq = list_first_entry_or_null(rqs, struct request, queuelist); + + /* + * If there wasn't already a pending request and we haven't flushed the + * software queues yet, flush the software queues and check again. + */ + if (!rq && !*flushed) { + kyber_flush_busy_ctxs(khd, hctx); + *flushed = true; + rq = list_first_entry_or_null(rqs, struct request, queuelist); + } + + if (rq) { + nr = kyber_get_domain_token(kqd, khd, hctx); + if (nr >= 0) { + khd->batching++; + rq_set_domain_token(rq, nr); + list_del_init(&rq->queuelist); + return rq; + } + } + + /* There were either no pending requests or no tokens. */ + return NULL; +} + +static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx) +{ + struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; + struct kyber_hctx_data *khd = hctx->sched_data; + bool flushed = false; + struct request *rq; + int i; + + spin_lock(&khd->lock); + + /* + * First, if we are still entitled to batch, try to dispatch a request + * from the batch. + */ + if (khd->batching < kyber_batch_size[khd->cur_domain]) { + rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed); + if (rq) + goto out; + } + + /* + * Either, + * 1. We were no longer entitled to a batch. + * 2. The domain we were batching didn't have any requests. + * 3. The domain we were batching was out of tokens. + * + * Start another batch. Note that this wraps back around to the original + * domain if no other domains have requests or tokens. 
+ */ + khd->batching = 0; + for (i = 0; i < KYBER_NUM_DOMAINS; i++) { + if (khd->cur_domain == KYBER_NUM_DOMAINS - 1) + khd->cur_domain = 0; + else + khd->cur_domain++; + + rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed); + if (rq) + goto out; + } + + rq = NULL; +out: + spin_unlock(&khd->lock); + return rq; +} + +static bool kyber_has_work(struct blk_mq_hw_ctx *hctx) +{ + struct kyber_hctx_data *khd = hctx->sched_data; + int i; + + for (i = 0; i < KYBER_NUM_DOMAINS; i++) { + if (!list_empty_careful(&khd->rqs[i])) + return true; + } + return false; +} + +#define KYBER_LAT_SHOW_STORE(op) \ +static ssize_t kyber_##op##_lat_show(struct elevator_queue *e, \ + char *page) \ +{ \ + struct kyber_queue_data *kqd = e->elevator_data; \ + \ + return sprintf(page, "%llu\n", kqd->op##_lat_nsec); \ +} \ + \ +static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \ + const char *page, size_t count) \ +{ \ + struct kyber_queue_data *kqd = e->elevator_data; \ + unsigned long long nsec; \ + int ret; \ + \ + ret = kstrtoull(page, 10, &nsec); \ + if (ret) \ + return ret; \ + \ + kqd->op##_lat_nsec = nsec; \ + \ + return count; \ +} +KYBER_LAT_SHOW_STORE(read); +KYBER_LAT_SHOW_STORE(write); +#undef KYBER_LAT_SHOW_STORE + +#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store) +static struct elv_fs_entry kyber_sched_attrs[] = { + KYBER_LAT_ATTR(read), + KYBER_LAT_ATTR(write), + __ATTR_NULL +}; +#undef KYBER_LAT_ATTR + +static struct elevator_type kyber_sched = { + .ops.mq = { + .init_sched = kyber_init_sched, + .exit_sched = kyber_exit_sched, + .init_hctx = kyber_init_hctx, + .exit_hctx = kyber_exit_hctx, + .get_request = kyber_get_request, + .put_request = kyber_put_request, + .completed_request = kyber_completed_request, + .dispatch_request = kyber_dispatch_request, + .has_work = kyber_has_work, + }, + .uses_mq = true, + .elevator_attrs = kyber_sched_attrs, + .elevator_name = "kyber", + .elevator_owner = THIS_MODULE, +}; + +static int __init kyber_init(void) +{ + return elv_register(&kyber_sched); +} + +static void __exit kyber_exit(void) +{ + elv_unregister(&kyber_sched); +} + +module_init(kyber_init); +module_exit(kyber_exit); + +MODULE_AUTHOR("Omar Sandoval"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Kyber I/O scheduler"); -- cgit v1.2.3 From a4bd217b432685d6a177c28a2af187f041c473b7 Mon Sep 17 00:00:00 2001 From: Javier González Date: Sat, 15 Apr 2017 20:55:50 +0200 Subject: lightnvm: physical block device (pblk) target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces pblk, a host-side translation layer for Open-Channel SSDs to expose them like block devices. The translation layer allows data placement decisions, and I/O scheduling to be managed by the host, enabling users to optimize the SSD for their specific workloads. An open-channel SSD has a set of LUNs (parallel units) and a collection of blocks. Each block can be read in any order, but writes must be sequential. Writes may also fail, and if a block requires it, must also be reset before new writes can be applied. To manage the constraints, pblk maintains a logical to physical address (L2P) table, write cache, garbage collection logic, recovery scheme, and logic to rate-limit user I/Os versus garbage collection I/Os. The L2P table is fully-associative and manages sectors at a 4KB granularity. Pblk stores the L2P table in two places, in the out-of-band area of the media and on the last page of a line. 
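As a rough illustration of such a flat, fully-associative mapping at 4KB granularity, here is a stand-alone user-space sketch. The demo_* names are invented for the example; pblk's real table, its locking, and its on-media copies are handled in pblk-core.c and pblk-recovery.c below.

/* Minimal sketch of a flat L2P table at 4KB sector granularity. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_EMPTY_PPA	UINT64_MAX	/* sector has never been written */

struct demo_l2p {
	uint64_t *map;		/* indexed by 4KB logical sector (lba) */
	uint64_t nr_secs;
};

static int demo_l2p_init(struct demo_l2p *l2p, uint64_t nr_secs)
{
	uint64_t i;

	l2p->map = malloc(nr_secs * sizeof(*l2p->map));
	if (!l2p->map)
		return -1;
	for (i = 0; i < nr_secs; i++)
		l2p->map[i] = DEMO_EMPTY_PPA;
	l2p->nr_secs = nr_secs;
	return 0;
}

/* Point @lba at a new physical sector; return the old one so the caller
 * can mark it invalid for garbage collection. */
static uint64_t demo_l2p_update(struct demo_l2p *l2p, uint64_t lba, uint64_t ppa)
{
	uint64_t old = l2p->map[lba];

	l2p->map[lba] = ppa;
	return old;
}

int main(void)
{
	struct demo_l2p l2p;

	if (demo_l2p_init(&l2p, 1024))
		return 1;
	demo_l2p_update(&l2p, 7, 100);		/* first write of lba 7 */
	printf("old ppa: %llu\n",
	       (unsigned long long)demo_l2p_update(&l2p, 7, 205)); /* rewrite */
	free(l2p.map);
	return 0;
}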
In case of a power failure, pblk will perform a scan to recover the L2P table. The user data is organized into lines. A line is data striped across blocks and LUNs. The lines enable the host to reduce the amount of metadata to maintain besides the user data and make it easier to implement RAID or erasure coding in the future. pblk implements multi-tenant support and can be instantiated multiple times on the same drive. Each instance owns a portion of the SSD - both regarding I/O bandwidth and capacity - providing I/O isolation for each case. Finally, pblk also exposes a sysfs interface that allows user-space to peek into the internals of pblk. The interface is available at /dev/block/*/pblk/ where * is the block device name exposed. This work also contains contributions from: Matias Bjørling Simon A. F. Lund Young Tack Jin Huaicheng Li Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- Documentation/lightnvm/pblk.txt | 21 + drivers/lightnvm/Kconfig | 9 + drivers/lightnvm/Makefile | 5 + drivers/lightnvm/pblk-cache.c | 114 +++ drivers/lightnvm/pblk-core.c | 1655 ++++++++++++++++++++++++++++++++++++++ drivers/lightnvm/pblk-gc.c | 555 +++++++++++++ drivers/lightnvm/pblk-init.c | 949 ++++++++++++++++++++++ drivers/lightnvm/pblk-map.c | 136 ++++ drivers/lightnvm/pblk-rb.c | 852 ++++++++++++++++++++ drivers/lightnvm/pblk-read.c | 529 ++++++++++++ drivers/lightnvm/pblk-recovery.c | 998 +++++++++++++++++++++++ drivers/lightnvm/pblk-rl.c | 182 +++++ drivers/lightnvm/pblk-sysfs.c | 507 ++++++++++++ drivers/lightnvm/pblk-write.c | 411 ++++++++++ drivers/lightnvm/pblk.h | 1121 ++++++++++++++++++++++ 15 files changed, 8044 insertions(+) create mode 100644 Documentation/lightnvm/pblk.txt create mode 100644 drivers/lightnvm/pblk-cache.c create mode 100644 drivers/lightnvm/pblk-core.c create mode 100644 drivers/lightnvm/pblk-gc.c create mode 100644 drivers/lightnvm/pblk-init.c create mode 100644 drivers/lightnvm/pblk-map.c create mode 100644 drivers/lightnvm/pblk-rb.c create mode 100644 drivers/lightnvm/pblk-read.c create mode 100644 drivers/lightnvm/pblk-recovery.c create mode 100644 drivers/lightnvm/pblk-rl.c create mode 100644 drivers/lightnvm/pblk-sysfs.c create mode 100644 drivers/lightnvm/pblk-write.c create mode 100644 drivers/lightnvm/pblk.h (limited to 'Documentation') diff --git a/Documentation/lightnvm/pblk.txt b/Documentation/lightnvm/pblk.txt new file mode 100644 index 000000000000..1040ed1cec81 --- /dev/null +++ b/Documentation/lightnvm/pblk.txt @@ -0,0 +1,21 @@ +pblk: Physical Block Device Target +================================== + +pblk implements a fully associative, host-based FTL that exposes a traditional +block I/O interface. Its primary responsibilities are: + + - Map logical addresses onto physical addresses (4KB granularity) in a + logical-to-physical (L2P) table. + - Maintain the integrity and consistency of the L2P table as well as its + recovery from normal tear down and power outage. + - Deal with controller- and media-specific constraints. + - Handle I/O errors. + - Implement garbage collection. + - Maintain consistency across the I/O stack during synchronization points. + +For more information please refer to: + + http://lightnvm.io + +which maintains updated FAQs, manual pages, technical documentation, tools, +contacts, etc. 
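The sequential-write constraint within a line, and the valid-sector accounting that feeds garbage collection, can also be illustrated with a small stand-alone sketch. The demo_* names are invented for the example; the real line state machine and GC lists live in pblk.h, pblk-core.c and pblk-gc.c in the hunks that follow.

/* Sketch of a "line" write pointer and its valid-sector count. */
#include <stdio.h>

struct demo_line {
	unsigned int sec_in_line;	/* usable sectors in this line */
	unsigned int cur_sec;		/* write pointer, only moves forward */
	unsigned int vsc;		/* valid sector count */
};

/* Writes inside a line are strictly sequential: just advance the pointer. */
static int demo_line_alloc_sec(struct demo_line *line, unsigned int *sec)
{
	if (line->cur_sec >= line->sec_in_line)
		return -1;		/* line is full, caller must open a new one */
	*sec = line->cur_sec++;
	line->vsc++;
	return 0;
}

/* An overwrite elsewhere invalidates the old copy; lines with few valid
 * sectors are the cheapest to garbage collect. */
static void demo_line_invalidate(struct demo_line *line)
{
	if (line->vsc)
		line->vsc--;
}

int main(void)
{
	struct demo_line line = { .sec_in_line = 8, .cur_sec = 0, .vsc = 0 };
	unsigned int sec;

	while (!demo_line_alloc_sec(&line, &sec))
		;			/* fill the line sequentially */
	demo_line_invalidate(&line);	/* one sector overwritten elsewhere */
	printf("valid sectors left: %u of %u\n", line.vsc, line.sec_in_line);
	return 0;
}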
diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index 052714106b7b..ead61a93cb4e 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig @@ -33,4 +33,13 @@ config NVM_RRPC host. The target is implemented using a linear mapping table and cost-based garbage collection. It is optimized for 4K IO sizes. +config NVM_PBLK + tristate "Physical Block Device Open-Channel SSD target" + ---help--- + Allows an open-channel SSD to be exposed as a block device to the + host. The target assumes the device exposes raw flash and must be + explicitly managed by the host. + + Please note the disk format is considered EXPERIMENTAL for now. + endif # NVM diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile index b2a39e2d2895..82d1a117fb27 100644 --- a/drivers/lightnvm/Makefile +++ b/drivers/lightnvm/Makefile @@ -4,3 +4,8 @@ obj-$(CONFIG_NVM) := core.o obj-$(CONFIG_NVM_RRPC) += rrpc.o +obj-$(CONFIG_NVM_PBLK) += pblk.o +pblk-y := pblk-init.o pblk-core.o pblk-rb.o \ + pblk-write.o pblk-cache.o pblk-read.o \ + pblk-gc.o pblk-recovery.o pblk-map.o \ + pblk-rl.o pblk-sysfs.o diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c new file mode 100644 index 000000000000..59bcea88db84 --- /dev/null +++ b/drivers/lightnvm/pblk-cache.c @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez + * Matias Bjorling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * pblk-cache.c - pblk's write cache + */ + +#include "pblk.h" + +int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags) +{ + struct pblk_w_ctx w_ctx; + sector_t lba = pblk_get_lba(bio); + unsigned int bpos, pos; + int nr_entries = pblk_get_secs(bio); + int i, ret; + + /* Update the write buffer head (mem) with the entries that we can + * write. The write in itself cannot fail, so there is no need to + * rollback from here on. + */ +retry: + ret = pblk_rb_may_write_user(&pblk->rwb, bio, nr_entries, &bpos); + if (ret == NVM_IO_REQUEUE) { + io_schedule(); + goto retry; + } + + if (unlikely(!bio_has_data(bio))) + goto out; + + w_ctx.flags = flags; + pblk_ppa_set_empty(&w_ctx.ppa); + + for (i = 0; i < nr_entries; i++) { + void *data = bio_data(bio); + + w_ctx.lba = lba + i; + + pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + i); + pblk_rb_write_entry_user(&pblk->rwb, data, w_ctx, pos); + + bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE); + } + +#ifdef CONFIG_NVM_DEBUG + atomic_long_add(nr_entries, &pblk->inflight_writes); + atomic_long_add(nr_entries, &pblk->req_writes); +#endif + +out: + pblk_write_should_kick(pblk); + return ret; +} + +/* + * On GC the incoming lbas are not necessarily sequential. Also, some of the + * lbas might not be valid entries, which are marked as empty by the GC thread + */ +int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list, + unsigned int nr_entries, unsigned int nr_rec_entries, + struct pblk_line *gc_line, unsigned long flags) +{ + struct pblk_w_ctx w_ctx; + unsigned int bpos, pos; + int i, valid_entries; + + /* Update the write buffer head (mem) with the entries that we can + * write. 
The write in itself cannot fail, so there is no need to + * rollback from here on. + */ +retry: + if (!pblk_rb_may_write_gc(&pblk->rwb, nr_rec_entries, &bpos)) { + io_schedule(); + goto retry; + } + + w_ctx.flags = flags; + pblk_ppa_set_empty(&w_ctx.ppa); + + for (i = 0, valid_entries = 0; i < nr_entries; i++) { + if (lba_list[i] == ADDR_EMPTY) + continue; + + w_ctx.lba = lba_list[i]; + + pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries); + pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_line, pos); + + data += PBLK_EXPOSED_PAGE_SIZE; + valid_entries++; + } + + WARN_ONCE(nr_rec_entries != valid_entries, + "pblk: inconsistent GC write\n"); + +#ifdef CONFIG_NVM_DEBUG + atomic_long_add(valid_entries, &pblk->inflight_writes); + atomic_long_add(valid_entries, &pblk->recov_gc_writes); +#endif + + pblk_write_should_kick(pblk); + return NVM_IO_OK; +} diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c new file mode 100644 index 000000000000..a2bcd098babc --- /dev/null +++ b/drivers/lightnvm/pblk-core.c @@ -0,0 +1,1655 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez + * Matias Bjorling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * pblk-core.c - pblk's core functionality + * + */ + +#include "pblk.h" +#include + +static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, + struct ppa_addr *ppa) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + int pos = pblk_dev_ppa_to_pos(geo, *ppa); + + pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos); + atomic_long_inc(&pblk->erase_failed); + + if (test_and_set_bit(pos, line->blk_bitmap)) + pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n", + line->id, pos); + + pblk_line_run_ws(pblk, NULL, ppa, pblk_line_mark_bb); +} + +static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct pblk_line *line; + + line = &pblk->lines[pblk_dev_ppa_to_line(rqd->ppa_addr)]; + atomic_dec(&line->left_seblks); + + if (rqd->error) { + struct ppa_addr *ppa; + + ppa = kmalloc(sizeof(struct ppa_addr), GFP_ATOMIC); + if (!ppa) + return; + + *ppa = rqd->ppa_addr; + pblk_mark_bb(pblk, line, ppa); + } +} + +/* Erase completion assumes that only one block is erased at the time */ +static void pblk_end_io_erase(struct nvm_rq *rqd) +{ + struct pblk *pblk = rqd->private; + + up(&pblk->erase_sem); + __pblk_end_io_erase(pblk, rqd); + mempool_free(rqd, pblk->r_rq_pool); +} + +static void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, + u64 paddr) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct list_head *move_list = NULL; + + /* Lines being reclaimed (GC'ed) cannot be invalidated. Before the L2P + * table is modified with reclaimed sectors, a check is done to endure + * that newer updates are not overwritten. 
+ */ + spin_lock(&line->lock); + if (line->state == PBLK_LINESTATE_GC || + line->state == PBLK_LINESTATE_FREE) { + spin_unlock(&line->lock); + return; + } + + if (test_and_set_bit(paddr, line->invalid_bitmap)) { + WARN_ONCE(1, "pblk: double invalidate\n"); + spin_unlock(&line->lock); + return; + } + line->vsc--; + + if (line->state == PBLK_LINESTATE_CLOSED) + move_list = pblk_line_gc_list(pblk, line); + spin_unlock(&line->lock); + + if (move_list) { + spin_lock(&l_mg->gc_lock); + spin_lock(&line->lock); + /* Prevent moving a line that has just been chosen for GC */ + if (line->state == PBLK_LINESTATE_GC || + line->state == PBLK_LINESTATE_FREE) { + spin_unlock(&line->lock); + spin_unlock(&l_mg->gc_lock); + return; + } + spin_unlock(&line->lock); + + list_move_tail(&line->list, move_list); + spin_unlock(&l_mg->gc_lock); + } +} + +void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa) +{ + struct pblk_line *line; + u64 paddr; + int line_id; + +#ifdef CONFIG_NVM_DEBUG + /* Callers must ensure that the ppa points to a device address */ + BUG_ON(pblk_addr_in_cache(ppa)); + BUG_ON(pblk_ppa_empty(ppa)); +#endif + + line_id = pblk_tgt_ppa_to_line(ppa); + line = &pblk->lines[line_id]; + paddr = pblk_dev_ppa_to_line_addr(pblk, ppa); + + __pblk_map_invalidate(pblk, line, paddr); +} + +void pblk_map_pad_invalidate(struct pblk *pblk, struct pblk_line *line, + u64 paddr) +{ + __pblk_map_invalidate(pblk, line, paddr); + + pblk_rb_sync_init(&pblk->rwb, NULL); + line->left_ssecs--; + if (!line->left_ssecs) + pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws); + pblk_rb_sync_end(&pblk->rwb, NULL); +} + +static void pblk_invalidate_range(struct pblk *pblk, sector_t slba, + unsigned int nr_secs) +{ + sector_t lba; + + spin_lock(&pblk->trans_lock); + for (lba = slba; lba < slba + nr_secs; lba++) { + struct ppa_addr ppa; + + ppa = pblk_trans_map_get(pblk, lba); + + if (!pblk_addr_in_cache(ppa) && !pblk_ppa_empty(ppa)) + pblk_map_invalidate(pblk, ppa); + + pblk_ppa_set_empty(&ppa); + pblk_trans_map_set(pblk, lba, ppa); + } + spin_unlock(&pblk->trans_lock); +} + +struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw) +{ + mempool_t *pool; + struct nvm_rq *rqd; + int rq_size; + + if (rw == WRITE) { + pool = pblk->w_rq_pool; + rq_size = pblk_w_rq_size; + } else { + pool = pblk->r_rq_pool; + rq_size = pblk_r_rq_size; + } + + rqd = mempool_alloc(pool, GFP_KERNEL); + memset(rqd, 0, rq_size); + + return rqd; +} + +void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw) +{ + mempool_t *pool; + + if (rw == WRITE) + pool = pblk->w_rq_pool; + else + pool = pblk->r_rq_pool; + + mempool_free(rqd, pool); +} + +void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, + int nr_pages) +{ + struct bio_vec bv; + int i; + + WARN_ON(off + nr_pages != bio->bi_vcnt); + + bio_advance(bio, off * PBLK_EXPOSED_PAGE_SIZE); + for (i = off; i < nr_pages + off; i++) { + bv = bio->bi_io_vec[i]; + mempool_free(bv.bv_page, pblk->page_pool); + } +} + +int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, + int nr_pages) +{ + struct request_queue *q = pblk->dev->q; + struct page *page; + int i, ret; + + for (i = 0; i < nr_pages; i++) { + page = mempool_alloc(pblk->page_pool, flags); + if (!page) + goto err; + + ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0); + if (ret != PBLK_EXPOSED_PAGE_SIZE) { + pr_err("pblk: could not add page to bio\n"); + mempool_free(page, pblk->page_pool); + goto err; + } + } + + return 0; +err: + pblk_bio_free_pages(pblk, bio, 0, i - 1); + return -1; 
+} + +static void pblk_write_kick(struct pblk *pblk) +{ + wake_up_process(pblk->writer_ts); + mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(1000)); +} + +void pblk_write_timer_fn(unsigned long data) +{ + struct pblk *pblk = (struct pblk *)data; + + /* kick the write thread every tick to flush outstanding data */ + pblk_write_kick(pblk); +} + +void pblk_write_should_kick(struct pblk *pblk) +{ + unsigned int secs_avail = pblk_rb_read_count(&pblk->rwb); + + if (secs_avail >= pblk->min_write_pgs) + pblk_write_kick(pblk); +} + +void pblk_end_bio_sync(struct bio *bio) +{ + struct completion *waiting = bio->bi_private; + + complete(waiting); +} + +void pblk_end_io_sync(struct nvm_rq *rqd) +{ + struct completion *waiting = rqd->private; + + complete(waiting); +} + +void pblk_flush_writer(struct pblk *pblk) +{ + struct bio *bio; + int ret; + DECLARE_COMPLETION_ONSTACK(wait); + + bio = bio_alloc(GFP_KERNEL, 1); + if (!bio) + return; + + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_OP_FLUSH); + bio->bi_private = &wait; + bio->bi_end_io = pblk_end_bio_sync; + + ret = pblk_write_to_cache(pblk, bio, 0); + if (ret == NVM_IO_OK) { + if (!wait_for_completion_io_timeout(&wait, + msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { + pr_err("pblk: flush cache timed out\n"); + } + } else if (ret != NVM_IO_DONE) { + pr_err("pblk: tear down bio failed\n"); + } + + if (bio->bi_error) + pr_err("pblk: flush sync write failed (%u)\n", bio->bi_error); + + bio_put(bio); +} + +struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct list_head *move_list = NULL; + + if (!line->vsc) { + if (line->gc_group != PBLK_LINEGC_FULL) { + line->gc_group = PBLK_LINEGC_FULL; + move_list = &l_mg->gc_full_list; + } + } else if (line->vsc < lm->mid_thrs) { + if (line->gc_group != PBLK_LINEGC_HIGH) { + line->gc_group = PBLK_LINEGC_HIGH; + move_list = &l_mg->gc_high_list; + } + } else if (line->vsc < lm->high_thrs) { + if (line->gc_group != PBLK_LINEGC_MID) { + line->gc_group = PBLK_LINEGC_MID; + move_list = &l_mg->gc_mid_list; + } + } else if (line->vsc < line->sec_in_line) { + if (line->gc_group != PBLK_LINEGC_LOW) { + line->gc_group = PBLK_LINEGC_LOW; + move_list = &l_mg->gc_low_list; + } + } else if (line->vsc == line->sec_in_line) { + if (line->gc_group != PBLK_LINEGC_EMPTY) { + line->gc_group = PBLK_LINEGC_EMPTY; + move_list = &l_mg->gc_empty_list; + } + } else { + line->state = PBLK_LINESTATE_CORRUPT; + line->gc_group = PBLK_LINEGC_NONE; + move_list = &l_mg->corrupt_list; + pr_err("pblk: corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n", + line->id, line->vsc, + line->sec_in_line, + lm->high_thrs, lm->mid_thrs); + } + + return move_list; +} + +void pblk_discard(struct pblk *pblk, struct bio *bio) +{ + sector_t slba = pblk_get_lba(bio); + sector_t nr_secs = pblk_get_secs(bio); + + pblk_invalidate_range(pblk, slba, nr_secs); +} + +struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba) +{ + struct ppa_addr ppa; + + spin_lock(&pblk->trans_lock); + ppa = pblk_trans_map_get(pblk, lba); + spin_unlock(&pblk->trans_lock); + + return ppa; +} + +void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd) +{ + atomic_long_inc(&pblk->write_failed); +#ifdef CONFIG_NVM_DEBUG + pblk_print_failed_rqd(pblk, rqd, rqd->error); +#endif +} + +void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd) +{ + /* Empty page read is not necessarily an error (e.g., L2P recovery) */ 
+ if (rqd->error == NVM_RSP_ERR_EMPTYPAGE) { + atomic_long_inc(&pblk->read_empty); + return; + } + + switch (rqd->error) { + case NVM_RSP_WARN_HIGHECC: + atomic_long_inc(&pblk->read_high_ecc); + break; + case NVM_RSP_ERR_FAILECC: + case NVM_RSP_ERR_FAILCRC: + atomic_long_inc(&pblk->read_failed); + break; + default: + pr_err("pblk: unknown read error:%d\n", rqd->error); + } +#ifdef CONFIG_NVM_DEBUG + pblk_print_failed_rqd(pblk, rqd, rqd->error); +#endif +} + +int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct nvm_tgt_dev *dev = pblk->dev; + +#ifdef CONFIG_NVM_DEBUG + struct ppa_addr *ppa_list; + + ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr; + if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) { + WARN_ON(1); + return -EINVAL; + } + + if (rqd->opcode == NVM_OP_PWRITE) { + struct pblk_line *line; + struct ppa_addr ppa; + int i; + + for (i = 0; i < rqd->nr_ppas; i++) { + ppa = ppa_list[i]; + line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; + + spin_lock(&line->lock); + if (line->state != PBLK_LINESTATE_OPEN) { + pr_err("pblk: bad ppa: line:%d,state:%d\n", + line->id, line->state); + WARN_ON(1); + spin_unlock(&line->lock); + return -EINVAL; + } + spin_unlock(&line->lock); + } + } +#endif + return nvm_submit_io(dev, rqd); +} + +struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, + unsigned int nr_secs, unsigned int len, + gfp_t gfp_mask) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + void *kaddr = data; + struct page *page; + struct bio *bio; + int i, ret; + + if (l_mg->emeta_alloc_type == PBLK_KMALLOC_META) + return bio_map_kern(dev->q, kaddr, len, gfp_mask); + + bio = bio_kmalloc(gfp_mask, nr_secs); + if (!bio) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < nr_secs; i++) { + page = vmalloc_to_page(kaddr); + if (!page) { + pr_err("pblk: could not map vmalloc bio\n"); + bio_put(bio); + bio = ERR_PTR(-ENOMEM); + goto out; + } + + ret = bio_add_pc_page(dev->q, bio, page, PAGE_SIZE, 0); + if (ret != PAGE_SIZE) { + pr_err("pblk: could not add page to bio\n"); + bio_put(bio); + bio = ERR_PTR(-ENOMEM); + goto out; + } + + kaddr += PAGE_SIZE; + } +out: + return bio; +} + +int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, + unsigned long secs_to_flush) +{ + int max = pblk->max_write_pgs; + int min = pblk->min_write_pgs; + int secs_to_sync = 0; + + if (secs_avail >= max) + secs_to_sync = max; + else if (secs_avail >= min) + secs_to_sync = min * (secs_avail / min); + else if (secs_to_flush) + secs_to_sync = min; + + return secs_to_sync; +} + +static u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, + int nr_secs) +{ + u64 addr; + int i; + + /* logic error: ppa out-of-bounds. 
Prevent generating bad address */ + if (line->cur_sec + nr_secs > pblk->lm.sec_per_line) { + WARN(1, "pblk: page allocation out of bounds\n"); + nr_secs = pblk->lm.sec_per_line - line->cur_sec; + } + + line->cur_sec = addr = find_next_zero_bit(line->map_bitmap, + pblk->lm.sec_per_line, line->cur_sec); + for (i = 0; i < nr_secs; i++, line->cur_sec++) + WARN_ON(test_and_set_bit(line->cur_sec, line->map_bitmap)); + + return addr; +} + +u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) +{ + u64 addr; + + /* Lock needed in case a write fails and a recovery needs to remap + * failed write buffer entries + */ + spin_lock(&line->lock); + addr = __pblk_alloc_page(pblk, line, nr_secs); + line->left_msecs -= nr_secs; + WARN(line->left_msecs < 0, "pblk: page allocation out of bounds\n"); + spin_unlock(&line->lock); + + return addr; +} + +/* + * Submit emeta to one LUN in the raid line at the time to avoid a deadlock when + * taking the per LUN semaphore. + */ +static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line, + u64 paddr, int dir) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + struct bio *bio; + struct nvm_rq rqd; + struct ppa_addr *ppa_list; + dma_addr_t dma_ppa_list; + void *emeta = line->emeta; + int min = pblk->min_write_pgs; + int left_ppas = lm->emeta_sec; + int id = line->id; + int rq_ppas, rq_len; + int cmd_op, bio_op; + int flags; + int i, j; + int ret; + DECLARE_COMPLETION_ONSTACK(wait); + + if (dir == WRITE) { + bio_op = REQ_OP_WRITE; + cmd_op = NVM_OP_PWRITE; + flags = pblk_set_progr_mode(pblk, WRITE); + } else if (dir == READ) { + bio_op = REQ_OP_READ; + cmd_op = NVM_OP_PREAD; + flags = pblk_set_read_mode(pblk); + } else + return -EINVAL; + + ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_ppa_list); + if (!ppa_list) + return -ENOMEM; + +next_rq: + memset(&rqd, 0, sizeof(struct nvm_rq)); + + rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); + rq_len = rq_ppas * geo->sec_size; + + bio = pblk_bio_map_addr(pblk, emeta, rq_ppas, rq_len, GFP_KERNEL); + if (IS_ERR(bio)) { + ret = PTR_ERR(bio); + goto free_rqd_dma; + } + + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, bio_op, 0); + + rqd.bio = bio; + rqd.opcode = cmd_op; + rqd.flags = flags; + rqd.nr_ppas = rq_ppas; + rqd.ppa_list = ppa_list; + rqd.dma_ppa_list = dma_ppa_list; + rqd.end_io = pblk_end_io_sync; + rqd.private = &wait; + + if (dir == WRITE) { + for (i = 0; i < rqd.nr_ppas; ) { + spin_lock(&line->lock); + paddr = __pblk_alloc_page(pblk, line, min); + spin_unlock(&line->lock); + for (j = 0; j < min; j++, i++, paddr++) + rqd.ppa_list[i] = + addr_to_gen_ppa(pblk, paddr, id); + } + } else { + for (i = 0; i < rqd.nr_ppas; ) { + struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id); + int pos = pblk_dev_ppa_to_pos(geo, ppa); + + while (test_bit(pos, line->blk_bitmap)) { + paddr += min; + if (pblk_boundary_paddr_checks(pblk, paddr)) { + pr_err("pblk: corrupt emeta line:%d\n", + line->id); + bio_put(bio); + ret = -EINTR; + goto free_rqd_dma; + } + + ppa = addr_to_gen_ppa(pblk, paddr, id); + pos = pblk_dev_ppa_to_pos(geo, ppa); + } + + if (pblk_boundary_paddr_checks(pblk, paddr + min)) { + pr_err("pblk: corrupt emeta line:%d\n", + line->id); + bio_put(bio); + ret = -EINTR; + goto free_rqd_dma; + } + + for (j = 0; j < min; j++, i++, paddr++) + rqd.ppa_list[i] = + addr_to_gen_ppa(pblk, paddr, line->id); + } + } + + ret = pblk_submit_io(pblk, &rqd); + if (ret) { + pr_err("pblk: emeta 
I/O submission failed: %d\n", ret); + bio_put(bio); + goto free_rqd_dma; + } + + if (!wait_for_completion_io_timeout(&wait, + msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { + pr_err("pblk: emeta I/O timed out\n"); + } + reinit_completion(&wait); + + bio_put(bio); + + if (rqd.error) { + if (dir == WRITE) + pblk_log_write_err(pblk, &rqd); + else + pblk_log_read_err(pblk, &rqd); + } + + emeta += rq_len; + left_ppas -= rq_ppas; + if (left_ppas) + goto next_rq; +free_rqd_dma: + nvm_dev_dma_free(dev->parent, ppa_list, dma_ppa_list); + return ret; +} + +u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + int bit; + + /* This usually only happens on bad lines */ + bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); + if (bit >= lm->blk_per_line) + return -1; + + return bit * geo->sec_per_pl; +} + +static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, + u64 paddr, int dir) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct pblk_line_meta *lm = &pblk->lm; + struct bio *bio; + struct nvm_rq rqd; + __le64 *lba_list = NULL; + int i, ret; + int cmd_op, bio_op; + int flags; + DECLARE_COMPLETION_ONSTACK(wait); + + if (dir == WRITE) { + bio_op = REQ_OP_WRITE; + cmd_op = NVM_OP_PWRITE; + flags = pblk_set_progr_mode(pblk, WRITE); + lba_list = pblk_line_emeta_to_lbas(line->emeta); + } else if (dir == READ) { + bio_op = REQ_OP_READ; + cmd_op = NVM_OP_PREAD; + flags = pblk_set_read_mode(pblk); + } else + return -EINVAL; + + memset(&rqd, 0, sizeof(struct nvm_rq)); + + rqd.ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, + &rqd.dma_ppa_list); + if (!rqd.ppa_list) + return -ENOMEM; + + bio = bio_map_kern(dev->q, line->smeta, lm->smeta_len, GFP_KERNEL); + if (IS_ERR(bio)) { + ret = PTR_ERR(bio); + goto free_ppa_list; + } + + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, bio_op, 0); + + rqd.bio = bio; + rqd.opcode = cmd_op; + rqd.flags = flags; + rqd.nr_ppas = lm->smeta_sec; + rqd.end_io = pblk_end_io_sync; + rqd.private = &wait; + + for (i = 0; i < lm->smeta_sec; i++, paddr++) { + rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); + if (dir == WRITE) + lba_list[paddr] = cpu_to_le64(ADDR_EMPTY); + } + + /* + * This I/O is sent by the write thread when a line is replace. Since + * the write thread is the only one sending write and erase commands, + * there is no need to take the LUN semaphore. 
+ */ + ret = pblk_submit_io(pblk, &rqd); + if (ret) { + pr_err("pblk: smeta I/O submission failed: %d\n", ret); + bio_put(bio); + goto free_ppa_list; + } + + if (!wait_for_completion_io_timeout(&wait, + msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { + pr_err("pblk: smeta I/O timed out\n"); + } + + if (rqd.error) { + if (dir == WRITE) + pblk_log_write_err(pblk, &rqd); + else + pblk_log_read_err(pblk, &rqd); + } + +free_ppa_list: + nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list); + + return ret; +} + +int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line) +{ + u64 bpaddr = pblk_line_smeta_start(pblk, line); + + return pblk_line_submit_smeta_io(pblk, line, bpaddr, READ); +} + +int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line) +{ + return pblk_line_submit_emeta_io(pblk, line, line->emeta_ssec, READ); +} + +static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd, + struct ppa_addr ppa) +{ + rqd->opcode = NVM_OP_ERASE; + rqd->ppa_addr = ppa; + rqd->nr_ppas = 1; + rqd->flags = pblk_set_progr_mode(pblk, ERASE); + rqd->bio = NULL; +} + +static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) +{ + struct nvm_rq rqd; + int ret; + DECLARE_COMPLETION_ONSTACK(wait); + + memset(&rqd, 0, sizeof(struct nvm_rq)); + + pblk_setup_e_rq(pblk, &rqd, ppa); + + rqd.end_io = pblk_end_io_sync; + rqd.private = &wait; + + /* The write thread schedules erases so that it minimizes disturbances + * with writes. Thus, there is no need to take the LUN semaphore. + */ + ret = pblk_submit_io(pblk, &rqd); + if (ret) { + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + + pr_err("pblk: could not sync erase line:%d,blk:%d\n", + pblk_dev_ppa_to_line(ppa), + pblk_dev_ppa_to_pos(geo, ppa)); + + rqd.error = ret; + goto out; + } + + if (!wait_for_completion_io_timeout(&wait, + msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { + pr_err("pblk: sync erase timed out\n"); + } + +out: + rqd.private = pblk; + __pblk_end_io_erase(pblk, &rqd); + + return 0; +} + +int pblk_line_erase(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_meta *lm = &pblk->lm; + struct ppa_addr ppa; + int bit = -1; + + /* Erase one block at the time and only erase good blocks */ + while ((bit = find_next_zero_bit(line->erase_bitmap, lm->blk_per_line, + bit + 1)) < lm->blk_per_line) { + ppa = pblk->luns[bit].bppa; /* set ch and lun */ + ppa.g.blk = line->id; + + /* If the erase fails, the block is bad and should be marked */ + line->left_eblks--; + WARN_ON(test_and_set_bit(bit, line->erase_bitmap)); + + if (pblk_blk_erase_sync(pblk, ppa)) { + pr_err("pblk: failed to erase line %d\n", line->id); + return -ENOMEM; + } + } + + return 0; +} + +/* For now lines are always assumed full lines. Thus, smeta former and current + * lun bitmaps are omitted. 
+ */ +static int pblk_line_set_metadata(struct pblk *pblk, struct pblk_line *line, + struct pblk_line *cur) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct line_smeta *smeta = line->smeta; + struct line_emeta *emeta = line->emeta; + int nr_blk_line; + + /* After erasing the line, new bad blocks might appear and we risk + * having an invalid line + */ + nr_blk_line = lm->blk_per_line - + bitmap_weight(line->blk_bitmap, lm->blk_per_line); + if (nr_blk_line < lm->min_blk_line) { + spin_lock(&l_mg->free_lock); + spin_lock(&line->lock); + line->state = PBLK_LINESTATE_BAD; + spin_unlock(&line->lock); + + list_add_tail(&line->list, &l_mg->bad_list); + spin_unlock(&l_mg->free_lock); + + pr_debug("pblk: line %d is bad\n", line->id); + + return 0; + } + + /* Run-time metadata */ + line->lun_bitmap = ((void *)(smeta)) + sizeof(struct line_smeta); + + /* Mark LUNs allocated in this line (all for now) */ + bitmap_set(line->lun_bitmap, 0, lm->lun_bitmap_len); + + smeta->header.identifier = cpu_to_le32(PBLK_MAGIC); + memcpy(smeta->header.uuid, pblk->instance_uuid, 16); + smeta->header.id = cpu_to_le32(line->id); + smeta->header.type = cpu_to_le16(line->type); + smeta->header.version = cpu_to_le16(1); + + /* Start metadata */ + smeta->seq_nr = cpu_to_le64(line->seq_nr); + smeta->window_wr_lun = cpu_to_le32(geo->nr_luns); + + /* Fill metadata among lines */ + if (cur) { + memcpy(line->lun_bitmap, cur->lun_bitmap, lm->lun_bitmap_len); + smeta->prev_id = cpu_to_le32(cur->id); + cur->emeta->next_id = cpu_to_le32(line->id); + } else { + smeta->prev_id = cpu_to_le32(PBLK_LINE_EMPTY); + } + + /* All smeta must be set at this point */ + smeta->header.crc = cpu_to_le32(pblk_calc_meta_header_crc(pblk, smeta)); + smeta->crc = cpu_to_le32(pblk_calc_smeta_crc(pblk, smeta)); + + /* End metadata */ + memcpy(&emeta->header, &smeta->header, sizeof(struct line_header)); + emeta->seq_nr = cpu_to_le64(line->seq_nr); + emeta->nr_lbas = cpu_to_le64(line->sec_in_line); + emeta->nr_valid_lbas = cpu_to_le64(0); + emeta->next_id = cpu_to_le32(PBLK_LINE_EMPTY); + emeta->crc = cpu_to_le32(0); + emeta->prev_id = smeta->prev_id; + + return 1; +} + +/* For now lines are always assumed full lines. Thus, smeta former and current + * lun bitmaps are omitted. 
+ */ +static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, + int init) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + int nr_bb = 0; + u64 off; + int bit = -1; + + line->sec_in_line = lm->sec_per_line; + + /* Capture bad block information on line mapping bitmaps */ + while ((bit = find_next_bit(line->blk_bitmap, lm->blk_per_line, + bit + 1)) < lm->blk_per_line) { + off = bit * geo->sec_per_pl; + bitmap_shift_left(l_mg->bb_aux, l_mg->bb_template, off, + lm->sec_per_line); + bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux, + lm->sec_per_line); + line->sec_in_line -= geo->sec_per_blk; + if (bit >= lm->emeta_bb) + nr_bb++; + } + + /* Mark smeta metadata sectors as bad sectors */ + bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); + off = bit * geo->sec_per_pl; +retry_smeta: + bitmap_set(line->map_bitmap, off, lm->smeta_sec); + line->sec_in_line -= lm->smeta_sec; + line->smeta_ssec = off; + line->cur_sec = off + lm->smeta_sec; + + if (init && pblk_line_submit_smeta_io(pblk, line, off, WRITE)) { + pr_debug("pblk: line smeta I/O failed. Retry\n"); + off += geo->sec_per_pl; + goto retry_smeta; + } + + bitmap_copy(line->invalid_bitmap, line->map_bitmap, lm->sec_per_line); + + /* Mark emeta metadata sectors as bad sectors. We need to consider bad + * blocks to make sure that there are enough sectors to store emeta + */ + bit = lm->sec_per_line; + off = lm->sec_per_line - lm->emeta_sec; + bitmap_set(line->invalid_bitmap, off, lm->emeta_sec); + while (nr_bb) { + off -= geo->sec_per_pl; + if (!test_bit(off, line->invalid_bitmap)) { + bitmap_set(line->invalid_bitmap, off, geo->sec_per_pl); + nr_bb--; + } + } + + line->sec_in_line -= lm->emeta_sec; + line->emeta_ssec = off; + line->vsc = line->left_ssecs = line->left_msecs = line->sec_in_line; + + if (lm->sec_per_line - line->sec_in_line != + bitmap_weight(line->invalid_bitmap, lm->sec_per_line)) { + spin_lock(&line->lock); + line->state = PBLK_LINESTATE_BAD; + spin_unlock(&line->lock); + + list_add_tail(&line->list, &l_mg->bad_list); + pr_err("pblk: unexpected line %d is bad\n", line->id); + + return 0; + } + + return 1; +} + +static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_meta *lm = &pblk->lm; + + line->map_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC); + if (!line->map_bitmap) + return -ENOMEM; + memset(line->map_bitmap, 0, lm->sec_bitmap_len); + + /* invalid_bitmap is special since it is used when line is closed. 
No + * need to zeroized; it will be initialized using bb info form + * map_bitmap + */ + line->invalid_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC); + if (!line->invalid_bitmap) { + mempool_free(line->map_bitmap, pblk->line_meta_pool); + return -ENOMEM; + } + + spin_lock(&line->lock); + if (line->state != PBLK_LINESTATE_FREE) { + spin_unlock(&line->lock); + WARN(1, "pblk: corrupted line state\n"); + return -EINTR; + } + line->state = PBLK_LINESTATE_OPEN; + spin_unlock(&line->lock); + + /* Bad blocks do not need to be erased */ + bitmap_copy(line->erase_bitmap, line->blk_bitmap, lm->blk_per_line); + line->left_eblks = line->blk_in_line; + atomic_set(&line->left_seblks, line->left_eblks); + + kref_init(&line->ref); + + return 0; +} + +int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + int ret; + + spin_lock(&l_mg->free_lock); + l_mg->data_line = line; + list_del(&line->list); + spin_unlock(&l_mg->free_lock); + + ret = pblk_line_prepare(pblk, line); + if (ret) { + list_add(&line->list, &l_mg->free_list); + return ret; + } + + pblk_rl_free_lines_dec(&pblk->rl, line); + + if (!pblk_line_init_bb(pblk, line, 0)) { + list_add(&line->list, &l_mg->free_list); + return -EINTR; + } + + return 0; +} + +void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line) +{ + mempool_free(line->map_bitmap, pblk->line_meta_pool); + line->map_bitmap = NULL; + line->smeta = NULL; + line->emeta = NULL; +} + +struct pblk_line *pblk_line_get(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line *line = NULL; + int bit; + + lockdep_assert_held(&l_mg->free_lock); + +retry_get: + if (list_empty(&l_mg->free_list)) { + pr_err("pblk: no free lines\n"); + goto out; + } + + line = list_first_entry(&l_mg->free_list, struct pblk_line, list); + list_del(&line->list); + l_mg->nr_free_lines--; + + bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); + if (unlikely(bit >= lm->blk_per_line)) { + spin_lock(&line->lock); + line->state = PBLK_LINESTATE_BAD; + spin_unlock(&line->lock); + + list_add_tail(&line->list, &l_mg->bad_list); + + pr_debug("pblk: line %d is bad\n", line->id); + goto retry_get; + } + + if (pblk_line_prepare(pblk, line)) { + pr_err("pblk: failed to prepare line %d\n", line->id); + list_add(&line->list, &l_mg->free_list); + return NULL; + } + +out: + return line; +} + +static struct pblk_line *pblk_line_retry(struct pblk *pblk, + struct pblk_line *line) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line *retry_line; + + spin_lock(&l_mg->free_lock); + retry_line = pblk_line_get(pblk); + if (!retry_line) { + spin_unlock(&l_mg->free_lock); + return NULL; + } + + retry_line->smeta = line->smeta; + retry_line->emeta = line->emeta; + retry_line->meta_line = line->meta_line; + retry_line->map_bitmap = line->map_bitmap; + retry_line->invalid_bitmap = line->invalid_bitmap; + + line->map_bitmap = NULL; + line->invalid_bitmap = NULL; + line->smeta = NULL; + line->emeta = NULL; + spin_unlock(&l_mg->free_lock); + + if (pblk_line_erase(pblk, retry_line)) + return NULL; + + pblk_rl_free_lines_dec(&pblk->rl, retry_line); + + l_mg->data_line = retry_line; + + return retry_line; +} + +struct pblk_line *pblk_line_get_first_data(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line *line; + int meta_line; + int is_next = 0; + + spin_lock(&l_mg->free_lock); + line = pblk_line_get(pblk); + if (!line) { + 
spin_unlock(&l_mg->free_lock);
+		return NULL;
+	}
+
+	line->seq_nr = l_mg->d_seq_nr++;
+	line->type = PBLK_LINETYPE_DATA;
+	l_mg->data_line = line;
+
+	meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+	set_bit(meta_line, &l_mg->meta_bitmap);
+	line->smeta = l_mg->sline_meta[meta_line].meta;
+	line->emeta = l_mg->eline_meta[meta_line].meta;
+	line->meta_line = meta_line;
+
+	/* Allocate next line for preparation */
+	l_mg->data_next = pblk_line_get(pblk);
+	if (l_mg->data_next) {
+		l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
+		l_mg->data_next->type = PBLK_LINETYPE_DATA;
+		is_next = 1;
+	}
+	spin_unlock(&l_mg->free_lock);
+
+	pblk_rl_free_lines_dec(&pblk->rl, line);
+	if (is_next)
+		pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
+
+	if (pblk_line_erase(pblk, line))
+		return NULL;
+
+retry_setup:
+	if (!pblk_line_set_metadata(pblk, line, NULL)) {
+		line = pblk_line_retry(pblk, line);
+		if (!line)
+			return NULL;
+
+		goto retry_setup;
+	}
+
+	if (!pblk_line_init_bb(pblk, line, 1)) {
+		line = pblk_line_retry(pblk, line);
+		if (!line)
+			return NULL;
+
+		goto retry_setup;
+	}
+
+	return line;
+}
+
+struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line *cur, *new;
+	unsigned int left_seblks;
+	int meta_line;
+	int is_next = 0;
+
+	cur = l_mg->data_line;
+	new = l_mg->data_next;
+	if (!new)
+		return NULL;
+	l_mg->data_line = new;
+
+retry_line:
+	left_seblks = atomic_read(&new->left_seblks);
+	if (left_seblks) {
+		/* If line is not fully erased, erase it */
+		if (new->left_eblks) {
+			if (pblk_line_erase(pblk, new))
+				return NULL;
+		} else {
+			io_schedule();
+		}
+		goto retry_line;
+	}
+
+	spin_lock(&l_mg->free_lock);
+	/* Allocate next line for preparation */
+	l_mg->data_next = pblk_line_get(pblk);
+	if (l_mg->data_next) {
+		l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
+		l_mg->data_next->type = PBLK_LINETYPE_DATA;
+		is_next = 1;
+	}
+
+retry_meta:
+	meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+	if (meta_line == PBLK_DATA_LINES) {
+		spin_unlock(&l_mg->free_lock);
+		io_schedule();
+		spin_lock(&l_mg->free_lock);
+		goto retry_meta;
+	}
+
+	set_bit(meta_line, &l_mg->meta_bitmap);
+	new->smeta = l_mg->sline_meta[meta_line].meta;
+	new->emeta = l_mg->eline_meta[meta_line].meta;
+	new->meta_line = meta_line;
+
+	memset(new->smeta, 0, lm->smeta_len);
+	memset(new->emeta, 0, lm->emeta_len);
+	spin_unlock(&l_mg->free_lock);
+
+	if (is_next)
+		pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
+
+retry_setup:
+	if (!pblk_line_set_metadata(pblk, new, cur)) {
+		new = pblk_line_retry(pblk, new);
+		if (!new)
+			return NULL;
+
+		goto retry_setup;
+	}
+
+	if (!pblk_line_init_bb(pblk, new, 1)) {
+		new = pblk_line_retry(pblk, new);
+		if (!new)
+			return NULL;
+
+		goto retry_setup;
+	}
+
+	return new;
+}
+
+void pblk_line_free(struct pblk *pblk, struct pblk_line *line)
+{
+	if (line->map_bitmap)
+		mempool_free(line->map_bitmap, pblk->line_meta_pool);
+	if (line->invalid_bitmap)
+		mempool_free(line->invalid_bitmap, pblk->line_meta_pool);
+
+	line->map_bitmap = NULL;
+	line->invalid_bitmap = NULL;
+}
+
+void pblk_line_put(struct kref *ref)
+{
+	struct pblk_line *line = container_of(ref, struct pblk_line, ref);
+	struct pblk *pblk = line->pblk;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+
+	spin_lock(&line->lock);
+	WARN_ON(line->state != PBLK_LINESTATE_GC);
+	line->state = PBLK_LINESTATE_FREE;
+	line->gc_group = PBLK_LINEGC_NONE;
+	pblk_line_free(pblk, line);
+ spin_unlock(&line->lock); + + spin_lock(&l_mg->free_lock); + list_add_tail(&line->list, &l_mg->free_list); + l_mg->nr_free_lines++; + spin_unlock(&l_mg->free_lock); + + pblk_rl_free_lines_inc(&pblk->rl, line); +} + +int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) +{ + struct nvm_rq *rqd; + int err; + + rqd = mempool_alloc(pblk->r_rq_pool, GFP_KERNEL); + memset(rqd, 0, pblk_r_rq_size); + + pblk_setup_e_rq(pblk, rqd, ppa); + + rqd->end_io = pblk_end_io_erase; + rqd->private = pblk; + + /* The write thread schedules erases so that it minimizes disturbances + * with writes. Thus, there is no need to take the LUN semaphore. + */ + err = pblk_submit_io(pblk, rqd); + if (err) { + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + + pr_err("pblk: could not async erase line:%d,blk:%d\n", + pblk_dev_ppa_to_line(ppa), + pblk_dev_ppa_to_pos(geo, ppa)); + } + + return err; +} + +struct pblk_line *pblk_line_get_data(struct pblk *pblk) +{ + return pblk->l_mg.data_line; +} + +struct pblk_line *pblk_line_get_data_next(struct pblk *pblk) +{ + return pblk->l_mg.data_next; +} + +int pblk_line_is_full(struct pblk_line *line) +{ + return (line->left_msecs == 0); +} + +void pblk_line_close(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct list_head *move_list; + + line->emeta->crc = cpu_to_le32(pblk_calc_emeta_crc(pblk, line->emeta)); + + if (pblk_line_submit_emeta_io(pblk, line, line->cur_sec, WRITE)) + pr_err("pblk: line %d close I/O failed\n", line->id); + + WARN(!bitmap_full(line->map_bitmap, line->sec_in_line), + "pblk: corrupt closed line %d\n", line->id); + + spin_lock(&l_mg->free_lock); + WARN_ON(!test_and_clear_bit(line->meta_line, &l_mg->meta_bitmap)); + spin_unlock(&l_mg->free_lock); + + spin_lock(&l_mg->gc_lock); + spin_lock(&line->lock); + WARN_ON(line->state != PBLK_LINESTATE_OPEN); + line->state = PBLK_LINESTATE_CLOSED; + move_list = pblk_line_gc_list(pblk, line); + + list_add_tail(&line->list, move_list); + + mempool_free(line->map_bitmap, pblk->line_meta_pool); + line->map_bitmap = NULL; + line->smeta = NULL; + line->emeta = NULL; + + spin_unlock(&line->lock); + spin_unlock(&l_mg->gc_lock); +} + +void pblk_line_close_ws(struct work_struct *work) +{ + struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, + ws); + struct pblk *pblk = line_ws->pblk; + struct pblk_line *line = line_ws->line; + + pblk_line_close(pblk, line); + mempool_free(line_ws, pblk->line_ws_pool); +} + +void pblk_line_mark_bb(struct work_struct *work) +{ + struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, + ws); + struct pblk *pblk = line_ws->pblk; + struct nvm_tgt_dev *dev = pblk->dev; + struct ppa_addr *ppa = line_ws->priv; + int ret; + + ret = nvm_set_tgt_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD); + if (ret) { + struct pblk_line *line; + int pos; + + line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)]; + pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa); + + pr_err("pblk: failed to mark bb, line:%d, pos:%d\n", + line->id, pos); + } + + kfree(ppa); + mempool_free(line_ws, pblk->line_ws_pool); +} + +void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, + void (*work)(struct work_struct *)) +{ + struct pblk_line_ws *line_ws; + + line_ws = mempool_alloc(pblk->line_ws_pool, GFP_ATOMIC); + if (!line_ws) + return; + + line_ws->pblk = pblk; + line_ws->line = line; + line_ws->priv = priv; + + INIT_WORK(&line_ws->ws, work); + queue_work(pblk->kw_wq, &line_ws->ws); +} + +void pblk_down_rq(struct 
pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+		  unsigned long *lun_bitmap)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_lun *rlun;
+	int lun_id = ppa_list[0].g.ch * geo->luns_per_chnl + ppa_list[0].g.lun;
+	int ret;
+
+	/*
+	 * Only send one inflight I/O per LUN. Since we map at a page
+	 * granularity, all ppas in the I/O will map to the same LUN
+	 */
+#ifdef CONFIG_NVM_DEBUG
+	int i;
+
+	for (i = 1; i < nr_ppas; i++)
+		WARN_ON(ppa_list[0].g.lun != ppa_list[i].g.lun ||
+				ppa_list[0].g.ch != ppa_list[i].g.ch);
+#endif
+	/* If the LUN has been locked for this same request, do not attempt to
+	 * lock it again
+	 */
+	if (test_and_set_bit(lun_id, lun_bitmap))
+		return;
+
+	rlun = &pblk->luns[lun_id];
+	ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(5000));
+	if (ret) {
+		switch (ret) {
+		case -ETIME:
+			pr_err("pblk: lun semaphore timed out\n");
+			break;
+		case -EINTR:
+			pr_err("pblk: lun semaphore timed out\n");
+			break;
+		}
+	}
+}
+
+void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+		unsigned long *lun_bitmap)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_lun *rlun;
+	int nr_luns = geo->nr_luns;
+	int bit = -1;
+
+	while ((bit = find_next_bit(lun_bitmap, nr_luns, bit + 1)) < nr_luns) {
+		rlun = &pblk->luns[bit];
+		up(&rlun->wr_sem);
+	}
+
+	kfree(lun_bitmap);
+}
+
+void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
+{
+	struct ppa_addr l2p_ppa;
+
+	/* logic error: lba out-of-bounds. Ignore update */
+	if (!(lba < pblk->rl.nr_secs)) {
+		WARN(1, "pblk: corrupted L2P map request\n");
+		return;
+	}
+
+	spin_lock(&pblk->trans_lock);
+	l2p_ppa = pblk_trans_map_get(pblk, lba);
+
+	if (!pblk_addr_in_cache(l2p_ppa) && !pblk_ppa_empty(l2p_ppa))
+		pblk_map_invalidate(pblk, l2p_ppa);
+
+	pblk_trans_map_set(pblk, lba, ppa);
+	spin_unlock(&pblk->trans_lock);
+}
+
+void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
+{
+#ifdef CONFIG_NVM_DEBUG
+	/* Callers must ensure that the ppa points to a cache address */
+	BUG_ON(!pblk_addr_in_cache(ppa));
+	BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa)));
+#endif
+
+	pblk_update_map(pblk, lba, ppa);
+}
+
+int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
+		       struct pblk_line *gc_line)
+{
+	struct ppa_addr l2p_ppa;
+	int ret = 1;
+
+#ifdef CONFIG_NVM_DEBUG
+	/* Callers must ensure that the ppa points to a cache address */
+	BUG_ON(!pblk_addr_in_cache(ppa));
+	BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa)));
+#endif
+
+	/* logic error: lba out-of-bounds.
Ignore update */ + if (!(lba < pblk->rl.nr_secs)) { + WARN(1, "pblk: corrupted L2P map request\n"); + return 0; + } + + spin_lock(&pblk->trans_lock); + l2p_ppa = pblk_trans_map_get(pblk, lba); + + /* Prevent updated entries to be overwritten by GC */ + if (pblk_addr_in_cache(l2p_ppa) || pblk_ppa_empty(l2p_ppa) || + pblk_tgt_ppa_to_line(l2p_ppa) != gc_line->id) { + ret = 0; + goto out; + } + + pblk_trans_map_set(pblk, lba, ppa); +out: + spin_unlock(&pblk->trans_lock); + return ret; +} + +void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, + struct ppa_addr entry_line) +{ + struct ppa_addr l2p_line; + +#ifdef CONFIG_NVM_DEBUG + /* Callers must ensure that the ppa points to a device address */ + BUG_ON(pblk_addr_in_cache(ppa)); +#endif + /* Invalidate and discard padded entries */ + if (lba == ADDR_EMPTY) { +#ifdef CONFIG_NVM_DEBUG + atomic_long_inc(&pblk->padded_wb); +#endif + pblk_map_invalidate(pblk, ppa); + return; + } + + /* logic error: lba out-of-bounds. Ignore update */ + if (!(lba < pblk->rl.nr_secs)) { + WARN(1, "pblk: corrupted L2P map request\n"); + return; + } + + spin_lock(&pblk->trans_lock); + l2p_line = pblk_trans_map_get(pblk, lba); + + /* Do not update L2P if the cacheline has been updated. In this case, + * the mapped ppa must be invalidated + */ + if (l2p_line.ppa != entry_line.ppa) { + if (!pblk_ppa_empty(ppa)) + pblk_map_invalidate(pblk, ppa); + goto out; + } + +#ifdef CONFIG_NVM_DEBUG + WARN_ON(!pblk_addr_in_cache(l2p_line) && !pblk_ppa_empty(l2p_line)); +#endif + + pblk_trans_map_set(pblk, lba, ppa); +out: + spin_unlock(&pblk->trans_lock); +} + +void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, + sector_t blba, int nr_secs) +{ + int i; + + spin_lock(&pblk->trans_lock); + for (i = 0; i < nr_secs; i++) + ppas[i] = pblk_trans_map_get(pblk, blba + i); + spin_unlock(&pblk->trans_lock); +} + +void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, + u64 *lba_list, int nr_secs) +{ + sector_t lba; + int i; + + spin_lock(&pblk->trans_lock); + for (i = 0; i < nr_secs; i++) { + lba = lba_list[i]; + if (lba == ADDR_EMPTY) { + ppas[i].ppa = ADDR_EMPTY; + } else { + /* logic error: lba out-of-bounds. Ignore update */ + if (!(lba < pblk->rl.nr_secs)) { + WARN(1, "pblk: corrupted L2P map request\n"); + continue; + } + ppas[i] = pblk_trans_map_get(pblk, lba); + } + } + spin_unlock(&pblk->trans_lock); +} diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c new file mode 100644 index 000000000000..9b147cfd8a41 --- /dev/null +++ b/drivers/lightnvm/pblk-gc.c @@ -0,0 +1,555 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez + * Matias Bjorling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * pblk-gc.c - pblk's garbage collector + */ + +#include "pblk.h" +#include + +static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq) +{ + kfree(gc_rq->data); + kfree(gc_rq->lba_list); + kfree(gc_rq); +} + +static int pblk_gc_write(struct pblk *pblk) +{ + struct pblk_gc *gc = &pblk->gc; + struct pblk_gc_rq *gc_rq, *tgc_rq; + LIST_HEAD(w_list); + + spin_lock(&gc->w_lock); + if (list_empty(&gc->w_list)) { + spin_unlock(&gc->w_lock); + return 1; + } + + list_for_each_entry_safe(gc_rq, tgc_rq, &gc->w_list, list) { + list_move_tail(&gc_rq->list, &w_list); + gc->w_entries--; + } + spin_unlock(&gc->w_lock); + + list_for_each_entry_safe(gc_rq, tgc_rq, &w_list, list) { + pblk_write_gc_to_cache(pblk, gc_rq->data, gc_rq->lba_list, + gc_rq->nr_secs, gc_rq->secs_to_gc, + gc_rq->line, PBLK_IOTYPE_GC); + + kref_put(&gc_rq->line->ref, pblk_line_put); + + list_del(&gc_rq->list); + pblk_gc_free_gc_rq(gc_rq); + } + + return 0; +} + +static void pblk_gc_writer_kick(struct pblk_gc *gc) +{ + wake_up_process(gc->gc_writer_ts); +} + +/* + * Responsible for managing all memory related to a gc request. Also in case of + * failure + */ +static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_line *line, + u64 *lba_list, unsigned int nr_secs) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_gc *gc = &pblk->gc; + struct pblk_gc_rq *gc_rq; + void *data; + unsigned int secs_to_gc; + int ret = NVM_IO_OK; + + data = kmalloc(nr_secs * geo->sec_size, GFP_KERNEL); + if (!data) { + ret = NVM_IO_ERR; + goto free_lba_list; + } + + /* Read from GC victim block */ + if (pblk_submit_read_gc(pblk, lba_list, data, nr_secs, + &secs_to_gc, line)) { + ret = NVM_IO_ERR; + goto free_data; + } + + if (!secs_to_gc) + goto free_data; + + gc_rq = kmalloc(sizeof(struct pblk_gc_rq), GFP_KERNEL); + if (!gc_rq) { + ret = NVM_IO_ERR; + goto free_data; + } + + gc_rq->line = line; + gc_rq->data = data; + gc_rq->lba_list = lba_list; + gc_rq->nr_secs = nr_secs; + gc_rq->secs_to_gc = secs_to_gc; + + kref_get(&line->ref); + +retry: + spin_lock(&gc->w_lock); + if (gc->w_entries > 256) { + spin_unlock(&gc->w_lock); + usleep_range(256, 1024); + goto retry; + } + gc->w_entries++; + list_add_tail(&gc_rq->list, &gc->w_list); + spin_unlock(&gc->w_lock); + + pblk_gc_writer_kick(&pblk->gc); + + return NVM_IO_OK; + +free_data: + kfree(data); +free_lba_list: + kfree(lba_list); + + return ret; +} + +static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct list_head *move_list; + + spin_lock(&line->lock); + WARN_ON(line->state != PBLK_LINESTATE_GC); + line->state = PBLK_LINESTATE_CLOSED; + move_list = pblk_line_gc_list(pblk, line); + spin_unlock(&line->lock); + + if (move_list) { + spin_lock(&l_mg->gc_lock); + list_add_tail(&line->list, move_list); + spin_unlock(&l_mg->gc_lock); + } +} + +static void pblk_gc_line_ws(struct work_struct *work) +{ + struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, + ws); + struct pblk *pblk = line_ws->pblk; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line *line = line_ws->line; + struct pblk_line_meta *lm = &pblk->lm; + __le64 *lba_list = line_ws->priv; + u64 *gc_list; + int sec_left; + int nr_ppas, bit; + int put_line = 1; + + pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id); + + spin_lock(&line->lock); + sec_left = line->vsc; + if (!sec_left) { + /* Lines are erased before being used (l_mg->data_/log_next) */ + spin_unlock(&line->lock); + goto out; + } 
+ spin_unlock(&line->lock); + + if (sec_left < 0) { + pr_err("pblk: corrupted GC line (%d)\n", line->id); + put_line = 0; + pblk_put_line_back(pblk, line); + goto out; + } + + bit = -1; +next_rq: + gc_list = kmalloc_array(pblk->max_write_pgs, sizeof(u64), GFP_KERNEL); + if (!gc_list) { + put_line = 0; + pblk_put_line_back(pblk, line); + goto out; + } + + nr_ppas = 0; + do { + bit = find_next_zero_bit(line->invalid_bitmap, lm->sec_per_line, + bit + 1); + if (bit > line->emeta_ssec) + break; + + gc_list[nr_ppas++] = le64_to_cpu(lba_list[bit]); + } while (nr_ppas < pblk->max_write_pgs); + + if (unlikely(!nr_ppas)) { + kfree(gc_list); + goto out; + } + + if (pblk_gc_move_valid_secs(pblk, line, gc_list, nr_ppas)) { + pr_err("pblk: could not GC all sectors: line:%d (%d/%d/%d)\n", + line->id, line->vsc, + nr_ppas, nr_ppas); + put_line = 0; + pblk_put_line_back(pblk, line); + goto out; + } + + sec_left -= nr_ppas; + if (sec_left > 0) + goto next_rq; + +out: + pblk_mfree(line->emeta, l_mg->emeta_alloc_type); + mempool_free(line_ws, pblk->line_ws_pool); + atomic_dec(&pblk->gc.inflight_gc); + if (put_line) + kref_put(&line->ref, pblk_line_put); +} + +static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line_ws *line_ws; + __le64 *lba_list; + int ret; + + line_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL); + line->emeta = pblk_malloc(lm->emeta_len, l_mg->emeta_alloc_type, + GFP_KERNEL); + if (!line->emeta) { + pr_err("pblk: cannot use GC emeta\n"); + goto fail_free_ws; + } + + ret = pblk_line_read_emeta(pblk, line); + if (ret) { + pr_err("pblk: line %d read emeta failed (%d)\n", line->id, ret); + goto fail_free_emeta; + } + + /* If this read fails, it means that emeta is corrupted. For now, leave + * the line untouched. TODO: Implement a recovery routine that scans and + * moves all sectors on the line. + */ + lba_list = pblk_recov_get_lba_list(pblk, line->emeta); + if (!lba_list) { + pr_err("pblk: could not interpret emeta (line %d)\n", line->id); + goto fail_free_emeta; + } + + line_ws->pblk = pblk; + line_ws->line = line; + line_ws->priv = lba_list; + + INIT_WORK(&line_ws->ws, pblk_gc_line_ws); + queue_work(pblk->gc.gc_reader_wq, &line_ws->ws); + + return 0; + +fail_free_emeta: + pblk_mfree(line->emeta, l_mg->emeta_alloc_type); +fail_free_ws: + mempool_free(line_ws, pblk->line_ws_pool); + pblk_put_line_back(pblk, line); + + return 1; +} + +static void pblk_gc_lines(struct pblk *pblk, struct list_head *gc_list) +{ + struct pblk_line *line, *tline; + + list_for_each_entry_safe(line, tline, gc_list, list) { + if (pblk_gc_line(pblk, line)) + pr_err("pblk: failed to GC line %d\n", line->id); + list_del(&line->list); + } +} + +/* + * Lines with no valid sectors will be returned to the free list immediately. If + * GC is activated - either because the free block count is under the determined + * threshold, or because it is being forced from user space - only lines with a + * high count of invalid sectors will be recycled. 
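+ *
+ * Lines are then taken from the GC group lists in priority order (high, mid,
+ * low invalid-sector count) until enough blocks are reclaimable to satisfy
+ * the rate-limiter threshold, or until the number of in-flight GC jobs
+ * exceeds gc_jobs_active.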
+ */ +static void pblk_gc_run(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_gc *gc = &pblk->gc; + struct pblk_line *line, *tline; + unsigned int nr_blocks_free, nr_blocks_need; + struct list_head *group_list; + int run_gc, gc_group = 0; + int prev_gc = 0; + int inflight_gc = atomic_read(&gc->inflight_gc); + LIST_HEAD(gc_list); + + spin_lock(&l_mg->gc_lock); + list_for_each_entry_safe(line, tline, &l_mg->gc_full_list, list) { + spin_lock(&line->lock); + WARN_ON(line->state != PBLK_LINESTATE_CLOSED); + line->state = PBLK_LINESTATE_GC; + spin_unlock(&line->lock); + + list_del(&line->list); + kref_put(&line->ref, pblk_line_put); + } + spin_unlock(&l_mg->gc_lock); + + nr_blocks_need = pblk_rl_gc_thrs(&pblk->rl); + nr_blocks_free = pblk_rl_nr_free_blks(&pblk->rl); + run_gc = (nr_blocks_need > nr_blocks_free || gc->gc_forced); + +next_gc_group: + group_list = l_mg->gc_lists[gc_group++]; + spin_lock(&l_mg->gc_lock); + while (run_gc && !list_empty(group_list)) { + /* No need to queue up more GC lines than we can handle */ + if (!run_gc || inflight_gc > gc->gc_jobs_active) { + spin_unlock(&l_mg->gc_lock); + pblk_gc_lines(pblk, &gc_list); + return; + } + + line = list_first_entry(group_list, struct pblk_line, list); + nr_blocks_free += line->blk_in_line; + + spin_lock(&line->lock); + WARN_ON(line->state != PBLK_LINESTATE_CLOSED); + line->state = PBLK_LINESTATE_GC; + list_move_tail(&line->list, &gc_list); + atomic_inc(&gc->inflight_gc); + inflight_gc++; + spin_unlock(&line->lock); + + prev_gc = 1; + run_gc = (nr_blocks_need > nr_blocks_free || gc->gc_forced); + } + spin_unlock(&l_mg->gc_lock); + + pblk_gc_lines(pblk, &gc_list); + + if (!prev_gc && pblk->rl.rb_state > gc_group && + gc_group < PBLK_NR_GC_LISTS) + goto next_gc_group; +} + + +static void pblk_gc_kick(struct pblk *pblk) +{ + struct pblk_gc *gc = &pblk->gc; + + wake_up_process(gc->gc_ts); + pblk_gc_writer_kick(gc); + mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS)); +} + +static void pblk_gc_timer(unsigned long data) +{ + struct pblk *pblk = (struct pblk *)data; + + pblk_gc_kick(pblk); +} + +static int pblk_gc_ts(void *data) +{ + struct pblk *pblk = data; + + while (!kthread_should_stop()) { + pblk_gc_run(pblk); + set_current_state(TASK_INTERRUPTIBLE); + io_schedule(); + } + + return 0; +} + +static int pblk_gc_writer_ts(void *data) +{ + struct pblk *pblk = data; + + while (!kthread_should_stop()) { + if (!pblk_gc_write(pblk)) + continue; + set_current_state(TASK_INTERRUPTIBLE); + io_schedule(); + } + + return 0; +} + +static void pblk_gc_start(struct pblk *pblk) +{ + pblk->gc.gc_active = 1; + + pr_debug("pblk: gc start\n"); +} + +int pblk_gc_status(struct pblk *pblk) +{ + struct pblk_gc *gc = &pblk->gc; + int ret; + + spin_lock(&gc->lock); + ret = gc->gc_active; + spin_unlock(&gc->lock); + + return ret; +} + +static void __pblk_gc_should_start(struct pblk *pblk) +{ + struct pblk_gc *gc = &pblk->gc; + + lockdep_assert_held(&gc->lock); + + if (gc->gc_enabled && !gc->gc_active) + pblk_gc_start(pblk); +} + +void pblk_gc_should_start(struct pblk *pblk) +{ + struct pblk_gc *gc = &pblk->gc; + + spin_lock(&gc->lock); + __pblk_gc_should_start(pblk); + spin_unlock(&gc->lock); +} + +/* + * If flush_wq == 1 then no lock should be held by the caller since + * flush_workqueue can sleep + */ +static void pblk_gc_stop(struct pblk *pblk, int flush_wq) +{ + spin_lock(&pblk->gc.lock); + pblk->gc.gc_active = 0; + spin_unlock(&pblk->gc.lock); + + pr_debug("pblk: gc stop\n"); +} + +void 
pblk_gc_should_stop(struct pblk *pblk) +{ + struct pblk_gc *gc = &pblk->gc; + + if (gc->gc_active && !gc->gc_forced) + pblk_gc_stop(pblk, 0); +} + +void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, + int *gc_active) +{ + struct pblk_gc *gc = &pblk->gc; + + spin_lock(&gc->lock); + *gc_enabled = gc->gc_enabled; + *gc_active = gc->gc_active; + spin_unlock(&gc->lock); +} + +void pblk_gc_sysfs_force(struct pblk *pblk, int force) +{ + struct pblk_gc *gc = &pblk->gc; + int rsv = 0; + + spin_lock(&gc->lock); + if (force) { + gc->gc_enabled = 1; + rsv = 64; + } + pblk_rl_set_gc_rsc(&pblk->rl, rsv); + gc->gc_forced = force; + __pblk_gc_should_start(pblk); + spin_unlock(&gc->lock); +} + +int pblk_gc_init(struct pblk *pblk) +{ + struct pblk_gc *gc = &pblk->gc; + int ret; + + gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts"); + if (IS_ERR(gc->gc_ts)) { + pr_err("pblk: could not allocate GC main kthread\n"); + return PTR_ERR(gc->gc_ts); + } + + gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk, + "pblk-gc-writer-ts"); + if (IS_ERR(gc->gc_writer_ts)) { + pr_err("pblk: could not allocate GC writer kthread\n"); + ret = PTR_ERR(gc->gc_writer_ts); + goto fail_free_main_kthread; + } + + setup_timer(&gc->gc_timer, pblk_gc_timer, (unsigned long)pblk); + mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS)); + + gc->gc_active = 0; + gc->gc_forced = 0; + gc->gc_enabled = 1; + gc->gc_jobs_active = 8; + gc->w_entries = 0; + atomic_set(&gc->inflight_gc, 0); + + gc->gc_reader_wq = alloc_workqueue("pblk-gc-reader-wq", + WQ_MEM_RECLAIM | WQ_UNBOUND, gc->gc_jobs_active); + if (!gc->gc_reader_wq) { + pr_err("pblk: could not allocate GC reader workqueue\n"); + ret = -ENOMEM; + goto fail_free_writer_kthread; + } + + spin_lock_init(&gc->lock); + spin_lock_init(&gc->w_lock); + INIT_LIST_HEAD(&gc->w_list); + + return 0; + +fail_free_main_kthread: + kthread_stop(gc->gc_ts); +fail_free_writer_kthread: + kthread_stop(gc->gc_writer_ts); + + return ret; +} + +void pblk_gc_exit(struct pblk *pblk) +{ + struct pblk_gc *gc = &pblk->gc; + + flush_workqueue(gc->gc_reader_wq); + + del_timer(&gc->gc_timer); + pblk_gc_stop(pblk, 1); + + if (gc->gc_ts) + kthread_stop(gc->gc_ts); + + if (pblk->gc.gc_reader_wq) + destroy_workqueue(pblk->gc.gc_reader_wq); + + if (gc->gc_writer_ts) + kthread_stop(gc->gc_writer_ts); +} diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c new file mode 100644 index 000000000000..94653b1f1300 --- /dev/null +++ b/drivers/lightnvm/pblk-init.c @@ -0,0 +1,949 @@ +/* + * Copyright (C) 2015 IT University of Copenhagen (rrpc.c) + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez + * Matias Bjorling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Implementation of a physical block-device target for Open-channel SSDs. + * + * pblk-init.c - pblk's initialization. 
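+ * Sets up the LUN mapping, line metadata, core mempools, the L2P table, the
+ * write buffer and the write and GC kthreads.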
+ */ + +#include "pblk.h" + +static struct kmem_cache *pblk_blk_ws_cache, *pblk_rec_cache, *pblk_r_rq_cache, + *pblk_w_rq_cache, *pblk_line_meta_cache; +static DECLARE_RWSEM(pblk_lock); + +static int pblk_rw_io(struct request_queue *q, struct pblk *pblk, + struct bio *bio) +{ + int ret; + + /* Read requests must be <= 256kb due to NVMe's 64 bit completion bitmap + * constraint. Writes can be of arbitrary size. + */ + if (bio_data_dir(bio) == READ) { + blk_queue_split(q, &bio, q->bio_split); + ret = pblk_submit_read(pblk, bio); + if (ret == NVM_IO_DONE && bio_flagged(bio, BIO_CLONED)) + bio_put(bio); + + return ret; + } + + /* Prevent deadlock in the case of a modest LUN configuration and large + * user I/Os. Unless stalled, the rate limiter leaves at least 256KB + * available for user I/O. + */ + if (unlikely(pblk_get_secs(bio) >= pblk_rl_sysfs_rate_show(&pblk->rl))) + blk_queue_split(q, &bio, q->bio_split); + + return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER); +} + +static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio) +{ + struct pblk *pblk = q->queuedata; + + if (bio_op(bio) == REQ_OP_DISCARD) { + pblk_discard(pblk, bio); + if (!(bio->bi_opf & REQ_PREFLUSH)) { + bio_endio(bio); + return BLK_QC_T_NONE; + } + } + + switch (pblk_rw_io(q, pblk, bio)) { + case NVM_IO_ERR: + bio_io_error(bio); + break; + case NVM_IO_DONE: + bio_endio(bio); + break; + } + + return BLK_QC_T_NONE; +} + +static void pblk_l2p_free(struct pblk *pblk) +{ + vfree(pblk->trans_map); +} + +static int pblk_l2p_init(struct pblk *pblk) +{ + sector_t i; + struct ppa_addr ppa; + int entry_size = 8; + + if (pblk->ppaf_bitsize < 32) + entry_size = 4; + + pblk->trans_map = vmalloc(entry_size * pblk->rl.nr_secs); + if (!pblk->trans_map) + return -ENOMEM; + + pblk_ppa_set_empty(&ppa); + + for (i = 0; i < pblk->rl.nr_secs; i++) + pblk_trans_map_set(pblk, i, ppa); + + return 0; +} + +static void pblk_rwb_free(struct pblk *pblk) +{ + if (pblk_rb_tear_down_check(&pblk->rwb)) + pr_err("pblk: write buffer error on tear down\n"); + + pblk_rb_data_free(&pblk->rwb); + vfree(pblk_rb_entries_ref(&pblk->rwb)); +} + +static int pblk_rwb_init(struct pblk *pblk) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_rb_entry *entries; + unsigned long nr_entries; + unsigned int power_size, power_seg_sz; + + nr_entries = pblk_rb_calculate_size(pblk->pgs_in_buffer); + + entries = vzalloc(nr_entries * sizeof(struct pblk_rb_entry)); + if (!entries) + return -ENOMEM; + + power_size = get_count_order(nr_entries); + power_seg_sz = get_count_order(geo->sec_size); + + return pblk_rb_init(&pblk->rwb, entries, power_size, power_seg_sz); +} + +/* Minimum pages needed within a lun */ +#define PAGE_POOL_SIZE 16 +#define ADDR_POOL_SIZE 64 + +static int pblk_set_ppaf(struct pblk *pblk) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct nvm_addr_format ppaf = geo->ppaf; + int power_len; + + /* Re-calculate channel and lun format to adapt to configuration */ + power_len = get_count_order(geo->nr_chnls); + if (1 << power_len != geo->nr_chnls) { + pr_err("pblk: supports only power-of-two channel config.\n"); + return -EINVAL; + } + ppaf.ch_len = power_len; + + power_len = get_count_order(geo->luns_per_chnl); + if (1 << power_len != geo->luns_per_chnl) { + pr_err("pblk: supports only power-of-two LUN config.\n"); + return -EINVAL; + } + ppaf.lun_len = power_len; + + pblk->ppaf.sec_offset = 0; + pblk->ppaf.pln_offset = ppaf.sect_len; + pblk->ppaf.ch_offset = 
pblk->ppaf.pln_offset + ppaf.pln_len; + pblk->ppaf.lun_offset = pblk->ppaf.ch_offset + ppaf.ch_len; + pblk->ppaf.pg_offset = pblk->ppaf.lun_offset + ppaf.lun_len; + pblk->ppaf.blk_offset = pblk->ppaf.pg_offset + ppaf.pg_len; + pblk->ppaf.sec_mask = (1ULL << ppaf.sect_len) - 1; + pblk->ppaf.pln_mask = ((1ULL << ppaf.pln_len) - 1) << + pblk->ppaf.pln_offset; + pblk->ppaf.ch_mask = ((1ULL << ppaf.ch_len) - 1) << + pblk->ppaf.ch_offset; + pblk->ppaf.lun_mask = ((1ULL << ppaf.lun_len) - 1) << + pblk->ppaf.lun_offset; + pblk->ppaf.pg_mask = ((1ULL << ppaf.pg_len) - 1) << + pblk->ppaf.pg_offset; + pblk->ppaf.blk_mask = ((1ULL << ppaf.blk_len) - 1) << + pblk->ppaf.blk_offset; + + pblk->ppaf_bitsize = pblk->ppaf.blk_offset + ppaf.blk_len; + + return 0; +} + +static int pblk_init_global_caches(struct pblk *pblk) +{ + char cache_name[PBLK_CACHE_NAME_LEN]; + + down_write(&pblk_lock); + pblk_blk_ws_cache = kmem_cache_create("pblk_blk_ws", + sizeof(struct pblk_line_ws), 0, 0, NULL); + if (!pblk_blk_ws_cache) { + up_write(&pblk_lock); + return -ENOMEM; + } + + pblk_rec_cache = kmem_cache_create("pblk_rec", + sizeof(struct pblk_rec_ctx), 0, 0, NULL); + if (!pblk_rec_cache) { + kmem_cache_destroy(pblk_blk_ws_cache); + up_write(&pblk_lock); + return -ENOMEM; + } + + pblk_r_rq_cache = kmem_cache_create("pblk_r_rq", pblk_r_rq_size, + 0, 0, NULL); + if (!pblk_r_rq_cache) { + kmem_cache_destroy(pblk_blk_ws_cache); + kmem_cache_destroy(pblk_rec_cache); + up_write(&pblk_lock); + return -ENOMEM; + } + + pblk_w_rq_cache = kmem_cache_create("pblk_w_rq", pblk_w_rq_size, + 0, 0, NULL); + if (!pblk_w_rq_cache) { + kmem_cache_destroy(pblk_blk_ws_cache); + kmem_cache_destroy(pblk_rec_cache); + kmem_cache_destroy(pblk_r_rq_cache); + up_write(&pblk_lock); + return -ENOMEM; + } + + snprintf(cache_name, sizeof(cache_name), "pblk_line_m_%s", + pblk->disk->disk_name); + pblk_line_meta_cache = kmem_cache_create(cache_name, + pblk->lm.sec_bitmap_len, 0, 0, NULL); + if (!pblk_line_meta_cache) { + kmem_cache_destroy(pblk_blk_ws_cache); + kmem_cache_destroy(pblk_rec_cache); + kmem_cache_destroy(pblk_r_rq_cache); + kmem_cache_destroy(pblk_w_rq_cache); + up_write(&pblk_lock); + return -ENOMEM; + } + up_write(&pblk_lock); + + return 0; +} + +static int pblk_core_init(struct pblk *pblk) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + int max_write_ppas; + int mod; + + pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE); + max_write_ppas = pblk->min_write_pgs * geo->nr_luns; + pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ? 
+ max_write_ppas : nvm_max_phys_sects(dev); + pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg * + geo->nr_planes * geo->nr_luns; + + if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) { + pr_err("pblk: cannot support device max_phys_sect\n"); + return -EINVAL; + } + + div_u64_rem(geo->sec_per_blk, pblk->min_write_pgs, &mod); + if (mod) { + pr_err("pblk: bad configuration of sectors/pages\n"); + return -EINVAL; + } + + if (pblk_init_global_caches(pblk)) + return -ENOMEM; + + pblk->page_pool = mempool_create_page_pool(PAGE_POOL_SIZE, 0); + if (!pblk->page_pool) + return -ENOMEM; + + pblk->line_ws_pool = mempool_create_slab_pool(geo->nr_luns, + pblk_blk_ws_cache); + if (!pblk->line_ws_pool) + goto free_page_pool; + + pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache); + if (!pblk->rec_pool) + goto free_blk_ws_pool; + + pblk->r_rq_pool = mempool_create_slab_pool(64, pblk_r_rq_cache); + if (!pblk->r_rq_pool) + goto free_rec_pool; + + pblk->w_rq_pool = mempool_create_slab_pool(64, pblk_w_rq_cache); + if (!pblk->w_rq_pool) + goto free_r_rq_pool; + + pblk->line_meta_pool = + mempool_create_slab_pool(16, pblk_line_meta_cache); + if (!pblk->line_meta_pool) + goto free_w_rq_pool; + + pblk->kw_wq = alloc_workqueue("pblk-aux-wq", + WQ_MEM_RECLAIM | WQ_UNBOUND, 1); + if (!pblk->kw_wq) + goto free_line_meta_pool; + + if (pblk_set_ppaf(pblk)) + goto free_kw_wq; + + if (pblk_rwb_init(pblk)) + goto free_kw_wq; + + INIT_LIST_HEAD(&pblk->compl_list); + return 0; + +free_kw_wq: + destroy_workqueue(pblk->kw_wq); +free_line_meta_pool: + mempool_destroy(pblk->line_meta_pool); +free_w_rq_pool: + mempool_destroy(pblk->w_rq_pool); +free_r_rq_pool: + mempool_destroy(pblk->r_rq_pool); +free_rec_pool: + mempool_destroy(pblk->rec_pool); +free_blk_ws_pool: + mempool_destroy(pblk->line_ws_pool); +free_page_pool: + mempool_destroy(pblk->page_pool); + return -ENOMEM; +} + +static void pblk_core_free(struct pblk *pblk) +{ + if (pblk->kw_wq) + destroy_workqueue(pblk->kw_wq); + + mempool_destroy(pblk->page_pool); + mempool_destroy(pblk->line_ws_pool); + mempool_destroy(pblk->rec_pool); + mempool_destroy(pblk->r_rq_pool); + mempool_destroy(pblk->w_rq_pool); + mempool_destroy(pblk->line_meta_pool); + + kmem_cache_destroy(pblk_blk_ws_cache); + kmem_cache_destroy(pblk_rec_cache); + kmem_cache_destroy(pblk_r_rq_cache); + kmem_cache_destroy(pblk_w_rq_cache); + kmem_cache_destroy(pblk_line_meta_cache); +} + +static void pblk_luns_free(struct pblk *pblk) +{ + kfree(pblk->luns); +} + +static void pblk_lines_free(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line *line; + int i; + + spin_lock(&l_mg->free_lock); + for (i = 0; i < l_mg->nr_lines; i++) { + line = &pblk->lines[i]; + + pblk_line_free(pblk, line); + kfree(line->blk_bitmap); + kfree(line->erase_bitmap); + } + spin_unlock(&l_mg->free_lock); +} + +static void pblk_line_meta_free(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + int i; + + kfree(l_mg->bb_template); + kfree(l_mg->bb_aux); + + for (i = 0; i < PBLK_DATA_LINES; i++) { + pblk_mfree(l_mg->sline_meta[i].meta, l_mg->smeta_alloc_type); + pblk_mfree(l_mg->eline_meta[i].meta, l_mg->emeta_alloc_type); + } + + kfree(pblk->lines); +} + +static int pblk_bb_discovery(struct nvm_tgt_dev *dev, struct pblk_lun *rlun) +{ + struct nvm_geo *geo = &dev->geo; + struct ppa_addr ppa; + u8 *blks; + int nr_blks, ret; + + nr_blks = geo->blks_per_lun * geo->plane_mode; + blks = kmalloc(nr_blks, GFP_KERNEL); + if (!blks) + return -ENOMEM; + + ppa.ppa = 0; + 
ppa.g.ch = rlun->bppa.g.ch; + ppa.g.lun = rlun->bppa.g.lun; + + ret = nvm_get_tgt_bb_tbl(dev, ppa, blks); + if (ret) + goto out; + + nr_blks = nvm_bb_tbl_fold(dev->parent, blks, nr_blks); + if (nr_blks < 0) { + kfree(blks); + ret = nr_blks; + } + + rlun->bb_list = blks; + +out: + return ret; +} + +static int pblk_bb_line(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_lun *rlun; + int bb_cnt = 0; + int i; + + line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL); + if (!line->blk_bitmap) + return -ENOMEM; + + line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL); + if (!line->erase_bitmap) { + kfree(line->blk_bitmap); + return -ENOMEM; + } + + for (i = 0; i < lm->blk_per_line; i++) { + rlun = &pblk->luns[i]; + if (rlun->bb_list[line->id] == NVM_BLK_T_FREE) + continue; + + set_bit(i, line->blk_bitmap); + bb_cnt++; + } + + return bb_cnt; +} + +static int pblk_luns_init(struct pblk *pblk, struct ppa_addr *luns) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_lun *rlun; + int i, ret; + + /* TODO: Implement unbalanced LUN support */ + if (geo->luns_per_chnl < 0) { + pr_err("pblk: unbalanced LUN config.\n"); + return -EINVAL; + } + + pblk->luns = kcalloc(geo->nr_luns, sizeof(struct pblk_lun), GFP_KERNEL); + if (!pblk->luns) + return -ENOMEM; + + for (i = 0; i < geo->nr_luns; i++) { + /* Stripe across channels */ + int ch = i % geo->nr_chnls; + int lun_raw = i / geo->nr_chnls; + int lunid = lun_raw + ch * geo->luns_per_chnl; + + rlun = &pblk->luns[i]; + rlun->bppa = luns[lunid]; + + sema_init(&rlun->wr_sem, 1); + + ret = pblk_bb_discovery(dev, rlun); + if (ret) { + while (--i >= 0) + kfree(pblk->luns[i].bb_list); + return ret; + } + } + + return 0; +} + +static int pblk_lines_configure(struct pblk *pblk, int flags) +{ + struct pblk_line *line = NULL; + int ret = 0; + + if (!(flags & NVM_TARGET_FACTORY)) { + line = pblk_recov_l2p(pblk); + if (IS_ERR(line)) { + pr_err("pblk: could not recover l2p table\n"); + ret = -EFAULT; + } + } + + if (!line) { + /* Configure next line for user data */ + line = pblk_line_get_first_data(pblk); + if (!line) { + pr_err("pblk: line list corrupted\n"); + ret = -EFAULT; + } + } + + return ret; +} + +/* See comment over struct line_emeta definition */ +static unsigned int calc_emeta_len(struct pblk *pblk, struct pblk_line_meta *lm) +{ + return (sizeof(struct line_emeta) + + ((lm->sec_per_line - lm->emeta_sec) * sizeof(u64)) + + (pblk->l_mg.nr_lines * sizeof(u32)) + + lm->blk_bitmap_len); +} + +static void pblk_set_provision(struct pblk *pblk, long nr_free_blks) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + sector_t provisioned; + + pblk->over_pct = 20; + + provisioned = nr_free_blks; + provisioned *= (100 - pblk->over_pct); + sector_div(provisioned, 100); + + /* Internally pblk manages all free blocks, but all calculations based + * on user capacity consider only provisioned blocks + */ + pblk->rl.total_blocks = nr_free_blks; + pblk->rl.nr_secs = nr_free_blks * geo->sec_per_blk; + pblk->capacity = provisioned * geo->sec_per_blk; + atomic_set(&pblk->rl.free_blocks, nr_free_blks); +} + +static int pblk_lines_init(struct pblk *pblk) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line *line; + unsigned int smeta_len, emeta_len; + long nr_bad_blks, nr_meta_blks, nr_free_blks; + int bb_distance; + int i; + 
int ret = 0; + + lm->sec_per_line = geo->sec_per_blk * geo->nr_luns; + lm->blk_per_line = geo->nr_luns; + lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); + lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long); + lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); + lm->high_thrs = lm->sec_per_line / 2; + lm->mid_thrs = lm->sec_per_line / 4; + + /* Calculate necessary pages for smeta. See comment over struct + * line_smeta definition + */ + lm->smeta_len = sizeof(struct line_smeta) + + PBLK_LINE_NR_LUN_BITMAP * lm->lun_bitmap_len; + + i = 1; +add_smeta_page: + lm->smeta_sec = i * geo->sec_per_pl; + lm->smeta_len = lm->smeta_sec * geo->sec_size; + + smeta_len = sizeof(struct line_smeta) + + PBLK_LINE_NR_LUN_BITMAP * lm->lun_bitmap_len; + if (smeta_len > lm->smeta_len) { + i++; + goto add_smeta_page; + } + + /* Calculate necessary pages for emeta. See comment over struct + * line_emeta definition + */ + i = 1; +add_emeta_page: + lm->emeta_sec = i * geo->sec_per_pl; + lm->emeta_len = lm->emeta_sec * geo->sec_size; + + emeta_len = calc_emeta_len(pblk, lm); + if (emeta_len > lm->emeta_len) { + i++; + goto add_emeta_page; + } + lm->emeta_bb = geo->nr_luns - i; + + nr_meta_blks = (lm->smeta_sec + lm->emeta_sec + + (geo->sec_per_blk / 2)) / geo->sec_per_blk; + lm->min_blk_line = nr_meta_blks + 1; + + l_mg->nr_lines = geo->blks_per_lun; + l_mg->log_line = l_mg->data_line = NULL; + l_mg->l_seq_nr = l_mg->d_seq_nr = 0; + l_mg->nr_free_lines = 0; + bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES); + + /* smeta is always small enough to fit on a kmalloc memory allocation, + * emeta depends on the number of LUNs allocated to the pblk instance + */ + l_mg->smeta_alloc_type = PBLK_KMALLOC_META; + for (i = 0; i < PBLK_DATA_LINES; i++) { + l_mg->sline_meta[i].meta = kmalloc(lm->smeta_len, GFP_KERNEL); + if (!l_mg->sline_meta[i].meta) + while (--i >= 0) { + kfree(l_mg->sline_meta[i].meta); + ret = -ENOMEM; + goto fail; + } + } + + if (lm->emeta_len > KMALLOC_MAX_CACHE_SIZE) { + l_mg->emeta_alloc_type = PBLK_VMALLOC_META; + + for (i = 0; i < PBLK_DATA_LINES; i++) { + l_mg->eline_meta[i].meta = vmalloc(lm->emeta_len); + if (!l_mg->eline_meta[i].meta) + while (--i >= 0) { + vfree(l_mg->eline_meta[i].meta); + ret = -ENOMEM; + goto fail; + } + } + } else { + l_mg->emeta_alloc_type = PBLK_KMALLOC_META; + + for (i = 0; i < PBLK_DATA_LINES; i++) { + l_mg->eline_meta[i].meta = + kmalloc(lm->emeta_len, GFP_KERNEL); + if (!l_mg->eline_meta[i].meta) + while (--i >= 0) { + kfree(l_mg->eline_meta[i].meta); + ret = -ENOMEM; + goto fail; + } + } + } + + l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL); + if (!l_mg->bb_template) + goto fail_free_meta; + + l_mg->bb_aux = kzalloc(lm->sec_bitmap_len, GFP_KERNEL); + if (!l_mg->bb_aux) + goto fail_free_bb_template; + + bb_distance = (geo->nr_luns) * geo->sec_per_pl; + for (i = 0; i < lm->sec_per_line; i += bb_distance) + bitmap_set(l_mg->bb_template, i, geo->sec_per_pl); + + INIT_LIST_HEAD(&l_mg->free_list); + INIT_LIST_HEAD(&l_mg->corrupt_list); + INIT_LIST_HEAD(&l_mg->bad_list); + INIT_LIST_HEAD(&l_mg->gc_full_list); + INIT_LIST_HEAD(&l_mg->gc_high_list); + INIT_LIST_HEAD(&l_mg->gc_mid_list); + INIT_LIST_HEAD(&l_mg->gc_low_list); + INIT_LIST_HEAD(&l_mg->gc_empty_list); + + l_mg->gc_lists[0] = &l_mg->gc_high_list; + l_mg->gc_lists[1] = &l_mg->gc_mid_list; + l_mg->gc_lists[2] = &l_mg->gc_low_list; + + spin_lock_init(&l_mg->free_lock); + spin_lock_init(&l_mg->gc_lock); + + pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct 
pblk_line), + GFP_KERNEL); + if (!pblk->lines) + goto fail_free_bb_aux; + + nr_free_blks = 0; + for (i = 0; i < l_mg->nr_lines; i++) { + line = &pblk->lines[i]; + + line->pblk = pblk; + line->id = i; + line->type = PBLK_LINETYPE_FREE; + line->state = PBLK_LINESTATE_FREE; + line->gc_group = PBLK_LINEGC_NONE; + spin_lock_init(&line->lock); + + nr_bad_blks = pblk_bb_line(pblk, line); + if (nr_bad_blks < 0 || nr_bad_blks > lm->blk_per_line) + goto fail_free_lines; + + line->blk_in_line = lm->blk_per_line - nr_bad_blks; + if (line->blk_in_line < lm->min_blk_line) { + line->state = PBLK_LINESTATE_BAD; + list_add_tail(&line->list, &l_mg->bad_list); + continue; + } + + nr_free_blks += line->blk_in_line; + + l_mg->nr_free_lines++; + list_add_tail(&line->list, &l_mg->free_list); + } + + pblk_set_provision(pblk, nr_free_blks); + + sema_init(&pblk->erase_sem, 1); + + /* Cleanup per-LUN bad block lists - managed within lines on run-time */ + for (i = 0; i < geo->nr_luns; i++) + kfree(pblk->luns[i].bb_list); + + return 0; +fail_free_lines: + kfree(pblk->lines); +fail_free_bb_aux: + kfree(l_mg->bb_aux); +fail_free_bb_template: + kfree(l_mg->bb_template); +fail_free_meta: + for (i = 0; i < PBLK_DATA_LINES; i++) { + pblk_mfree(l_mg->sline_meta[i].meta, l_mg->smeta_alloc_type); + pblk_mfree(l_mg->eline_meta[i].meta, l_mg->emeta_alloc_type); + } +fail: + for (i = 0; i < geo->nr_luns; i++) + kfree(pblk->luns[i].bb_list); + + return ret; +} + +static int pblk_writer_init(struct pblk *pblk) +{ + setup_timer(&pblk->wtimer, pblk_write_timer_fn, (unsigned long)pblk); + mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100)); + + pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t"); + if (IS_ERR(pblk->writer_ts)) { + pr_err("pblk: could not allocate writer kthread\n"); + return 1; + } + + return 0; +} + +static void pblk_writer_stop(struct pblk *pblk) +{ + if (pblk->writer_ts) + kthread_stop(pblk->writer_ts); + del_timer(&pblk->wtimer); +} + +static void pblk_free(struct pblk *pblk) +{ + pblk_luns_free(pblk); + pblk_lines_free(pblk); + pblk_line_meta_free(pblk); + pblk_core_free(pblk); + pblk_l2p_free(pblk); + + kfree(pblk); +} + +static void pblk_tear_down(struct pblk *pblk) +{ + pblk_flush_writer(pblk); + pblk_writer_stop(pblk); + pblk_rb_sync_l2p(&pblk->rwb); + pblk_recov_pad(pblk); + pblk_rwb_free(pblk); + pblk_rl_free(&pblk->rl); + + pr_debug("pblk: consistent tear down\n"); +} + +static void pblk_exit(void *private) +{ + struct pblk *pblk = private; + + down_write(&pblk_lock); + pblk_gc_exit(pblk); + pblk_tear_down(pblk); + pblk_free(pblk); + up_write(&pblk_lock); +} + +static sector_t pblk_capacity(void *private) +{ + struct pblk *pblk = private; + + return pblk->capacity * NR_PHY_IN_LOG; +} + +static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, + int flags) +{ + struct nvm_geo *geo = &dev->geo; + struct request_queue *bqueue = dev->q; + struct request_queue *tqueue = tdisk->queue; + struct pblk *pblk; + int ret; + + if (dev->identity.dom & NVM_RSP_L2P) { + pr_err("pblk: device-side L2P table not supported. 
(%x)\n", + dev->identity.dom); + return ERR_PTR(-EINVAL); + } + + pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL); + if (!pblk) + return ERR_PTR(-ENOMEM); + + pblk->dev = dev; + pblk->disk = tdisk; + + spin_lock_init(&pblk->trans_lock); + spin_lock_init(&pblk->lock); + + if (flags & NVM_TARGET_FACTORY) + pblk_setup_uuid(pblk); + +#ifdef CONFIG_NVM_DEBUG + atomic_long_set(&pblk->inflight_writes, 0); + atomic_long_set(&pblk->padded_writes, 0); + atomic_long_set(&pblk->padded_wb, 0); + atomic_long_set(&pblk->nr_flush, 0); + atomic_long_set(&pblk->req_writes, 0); + atomic_long_set(&pblk->sub_writes, 0); + atomic_long_set(&pblk->sync_writes, 0); + atomic_long_set(&pblk->compl_writes, 0); + atomic_long_set(&pblk->inflight_reads, 0); + atomic_long_set(&pblk->sync_reads, 0); + atomic_long_set(&pblk->recov_writes, 0); + atomic_long_set(&pblk->recov_writes, 0); + atomic_long_set(&pblk->recov_gc_writes, 0); +#endif + + atomic_long_set(&pblk->read_failed, 0); + atomic_long_set(&pblk->read_empty, 0); + atomic_long_set(&pblk->read_high_ecc, 0); + atomic_long_set(&pblk->read_failed_gc, 0); + atomic_long_set(&pblk->write_failed, 0); + atomic_long_set(&pblk->erase_failed, 0); + + ret = pblk_luns_init(pblk, dev->luns); + if (ret) { + pr_err("pblk: could not initialize luns\n"); + goto fail; + } + + ret = pblk_lines_init(pblk); + if (ret) { + pr_err("pblk: could not initialize lines\n"); + goto fail_free_luns; + } + + ret = pblk_core_init(pblk); + if (ret) { + pr_err("pblk: could not initialize core\n"); + goto fail_free_line_meta; + } + + ret = pblk_l2p_init(pblk); + if (ret) { + pr_err("pblk: could not initialize maps\n"); + goto fail_free_core; + } + + ret = pblk_lines_configure(pblk, flags); + if (ret) { + pr_err("pblk: could not configure lines\n"); + goto fail_free_l2p; + } + + ret = pblk_writer_init(pblk); + if (ret) { + pr_err("pblk: could not initialize write thread\n"); + goto fail_free_lines; + } + + ret = pblk_gc_init(pblk); + if (ret) { + pr_err("pblk: could not initialize gc\n"); + goto fail_stop_writer; + } + + /* inherit the size from the underlying device */ + blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue)); + blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue)); + + blk_queue_write_cache(tqueue, true, false); + + tqueue->limits.discard_granularity = geo->pgs_per_blk * geo->pfpg_size; + tqueue->limits.discard_alignment = 0; + blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9); + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, tqueue); + + pr_info("pblk init: luns:%u, lines:%d, secs:%llu, buf entries:%u\n", + geo->nr_luns, pblk->l_mg.nr_lines, + (unsigned long long)pblk->rl.nr_secs, + pblk->rwb.nr_entries); + + wake_up_process(pblk->writer_ts); + return pblk; + +fail_stop_writer: + pblk_writer_stop(pblk); +fail_free_lines: + pblk_lines_free(pblk); +fail_free_l2p: + pblk_l2p_free(pblk); +fail_free_core: + pblk_core_free(pblk); +fail_free_line_meta: + pblk_line_meta_free(pblk); +fail_free_luns: + pblk_luns_free(pblk); +fail: + kfree(pblk); + return ERR_PTR(ret); +} + +/* physical block device target */ +static struct nvm_tgt_type tt_pblk = { + .name = "pblk", + .version = {1, 0, 0}, + + .make_rq = pblk_make_rq, + .capacity = pblk_capacity, + + .init = pblk_init, + .exit = pblk_exit, + + .sysfs_init = pblk_sysfs_init, + .sysfs_exit = pblk_sysfs_exit, +}; + +static int __init pblk_module_init(void) +{ + return nvm_register_tgt_type(&tt_pblk); +} + +static void pblk_module_exit(void) +{ + nvm_unregister_tgt_type(&tt_pblk); +} + +module_init(pblk_module_init); 
+module_exit(pblk_module_exit); +MODULE_AUTHOR("Javier Gonzalez "); +MODULE_AUTHOR("Matias Bjorling "); +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("Physical Block-Device for Open-Channel SSDs"); diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c new file mode 100644 index 000000000000..3f8bab4c4d5c --- /dev/null +++ b/drivers/lightnvm/pblk-map.c @@ -0,0 +1,136 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez + * Matias Bjorling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * pblk-map.c - pblk's lba-ppa mapping strategy + * + */ + +#include "pblk.h" + +static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, + struct ppa_addr *ppa_list, + unsigned long *lun_bitmap, + struct pblk_sec_meta *meta_list, + unsigned int valid_secs) +{ + struct pblk_line *line = pblk_line_get_data(pblk); + struct line_emeta *emeta = line->emeta; + struct pblk_w_ctx *w_ctx; + __le64 *lba_list = pblk_line_emeta_to_lbas(emeta); + u64 paddr; + int nr_secs = pblk->min_write_pgs; + int i; + + paddr = pblk_alloc_page(pblk, line, nr_secs); + + for (i = 0; i < nr_secs; i++, paddr++) { + /* ppa to be sent to the device */ + ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); + + /* Write context for target bio completion on write buffer. Note + * that the write buffer is protected by the sync backpointer, + * and a single writer thread have access to each specific entry + * at a time. Thus, it is safe to modify the context for the + * entry we are setting up for submission without taking any + * lock or memory barrier. + */ + if (i < valid_secs) { + kref_get(&line->ref); + w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i); + w_ctx->ppa = ppa_list[i]; + meta_list[i].lba = cpu_to_le64(w_ctx->lba); + lba_list[paddr] = cpu_to_le64(w_ctx->lba); + le64_add_cpu(&line->emeta->nr_valid_lbas, 1); + } else { + meta_list[i].lba = cpu_to_le64(ADDR_EMPTY); + lba_list[paddr] = cpu_to_le64(ADDR_EMPTY); + pblk_map_pad_invalidate(pblk, line, paddr); + } + } + + if (pblk_line_is_full(line)) { + line = pblk_line_replace_data(pblk); + if (!line) + return; + } + + pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap); +} + +void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, + unsigned long *lun_bitmap, unsigned int valid_secs, + unsigned int off) +{ + struct pblk_sec_meta *meta_list = rqd->meta_list; + unsigned int map_secs; + int min = pblk->min_write_pgs; + int i; + + for (i = off; i < rqd->nr_ppas; i += min) { + map_secs = (i + min > valid_secs) ? 
(valid_secs % min) : min; + pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i], + lun_bitmap, &meta_list[i], map_secs); + } +} + +/* only if erase_ppa is set, acquire erase semaphore */ +void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, + unsigned int sentry, unsigned long *lun_bitmap, + unsigned int valid_secs, struct ppa_addr *erase_ppa) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line *e_line = pblk_line_get_data_next(pblk); + struct pblk_sec_meta *meta_list = rqd->meta_list; + unsigned int map_secs; + int min = pblk->min_write_pgs; + int i, erase_lun; + + for (i = 0; i < rqd->nr_ppas; i += min) { + map_secs = (i + min > valid_secs) ? (valid_secs % min) : min; + pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i], + lun_bitmap, &meta_list[i], map_secs); + + erase_lun = rqd->ppa_list[i].g.lun * geo->nr_chnls + + rqd->ppa_list[i].g.ch; + + if (!test_bit(erase_lun, e_line->erase_bitmap)) { + if (down_trylock(&pblk->erase_sem)) + continue; + + set_bit(erase_lun, e_line->erase_bitmap); + e_line->left_eblks--; + *erase_ppa = rqd->ppa_list[i]; + erase_ppa->g.blk = e_line->id; + + /* Avoid evaluating e_line->left_eblks */ + return pblk_map_rq(pblk, rqd, sentry, lun_bitmap, + valid_secs, i + min); + } + } + + /* Erase blocks that are bad in this line but might not be in next */ + if (unlikely(ppa_empty(*erase_ppa))) { + struct pblk_line_meta *lm = &pblk->lm; + + i = find_first_zero_bit(e_line->erase_bitmap, lm->blk_per_line); + if (i == lm->blk_per_line) + return; + + set_bit(i, e_line->erase_bitmap); + e_line->left_eblks--; + *erase_ppa = pblk->luns[i].bppa; /* set ch and lun */ + erase_ppa->g.blk = e_line->id; + } +} diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c new file mode 100644 index 000000000000..045384ddc1f9 --- /dev/null +++ b/drivers/lightnvm/pblk-rb.c @@ -0,0 +1,852 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez + * + * Based upon the circular ringbuffer. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * pblk-rb.c - pblk's write buffer + */ + +#include + +#include "pblk.h" + +static DECLARE_RWSEM(pblk_rb_lock); + +void pblk_rb_data_free(struct pblk_rb *rb) +{ + struct pblk_rb_pages *p, *t; + + down_write(&pblk_rb_lock); + list_for_each_entry_safe(p, t, &rb->pages, list) { + free_pages((unsigned long)page_address(p->pages), p->order); + list_del(&p->list); + kfree(p); + } + up_write(&pblk_rb_lock); +} + +/* + * Initialize ring buffer. 
The data and metadata buffers must be previously + * allocated and their size must be a power of two + * (Documentation/circular-buffers.txt) + */ +int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base, + unsigned int power_size, unsigned int power_seg_sz) +{ + struct pblk *pblk = container_of(rb, struct pblk, rwb); + unsigned int init_entry = 0; + unsigned int alloc_order = power_size; + unsigned int max_order = MAX_ORDER - 1; + unsigned int order, iter; + + down_write(&pblk_rb_lock); + rb->entries = rb_entry_base; + rb->seg_size = (1 << power_seg_sz); + rb->nr_entries = (1 << power_size); + rb->mem = rb->subm = rb->sync = rb->l2p_update = 0; + rb->sync_point = EMPTY_ENTRY; + + spin_lock_init(&rb->w_lock); + spin_lock_init(&rb->s_lock); + + INIT_LIST_HEAD(&rb->pages); + + if (alloc_order >= max_order) { + order = max_order; + iter = (1 << (alloc_order - max_order)); + } else { + order = alloc_order; + iter = 1; + } + + do { + struct pblk_rb_entry *entry; + struct pblk_rb_pages *page_set; + void *kaddr; + unsigned long set_size; + int i; + + page_set = kmalloc(sizeof(struct pblk_rb_pages), GFP_KERNEL); + if (!page_set) { + up_write(&pblk_rb_lock); + return -ENOMEM; + } + + page_set->order = order; + page_set->pages = alloc_pages(GFP_KERNEL, order); + if (!page_set->pages) { + kfree(page_set); + pblk_rb_data_free(rb); + up_write(&pblk_rb_lock); + return -ENOMEM; + } + kaddr = page_address(page_set->pages); + + entry = &rb->entries[init_entry]; + entry->data = kaddr; + entry->cacheline = pblk_cacheline_to_addr(init_entry++); + entry->w_ctx.flags = PBLK_WRITABLE_ENTRY; + + set_size = (1 << order); + for (i = 1; i < set_size; i++) { + entry = &rb->entries[init_entry]; + entry->cacheline = pblk_cacheline_to_addr(init_entry++); + entry->data = kaddr + (i * rb->seg_size); + entry->w_ctx.flags = PBLK_WRITABLE_ENTRY; + bio_list_init(&entry->w_ctx.bios); + } + + list_add_tail(&page_set->list, &rb->pages); + iter--; + } while (iter > 0); + up_write(&pblk_rb_lock); + +#ifdef CONFIG_NVM_DEBUG + atomic_set(&rb->inflight_sync_point, 0); +#endif + + /* + * Initialize rate-limiter, which controls access to the write buffer + * but user and GC I/O + */ + pblk_rl_init(&pblk->rl, rb->nr_entries); + + return 0; +} + +/* + * pblk_rb_calculate_size -- calculate the size of the write buffer + */ +unsigned int pblk_rb_calculate_size(unsigned int nr_entries) +{ + /* Alloc a write buffer that can at least fit 128 entries */ + return (1 << max(get_count_order(nr_entries), 7)); +} + +void *pblk_rb_entries_ref(struct pblk_rb *rb) +{ + return rb->entries; +} + +static void clean_wctx(struct pblk_w_ctx *w_ctx) +{ + int flags; + +try: + flags = READ_ONCE(w_ctx->flags); + if (!(flags & PBLK_SUBMITTED_ENTRY)) + goto try; + + /* Release flags on context. Protect from writes and reads */ + smp_store_release(&w_ctx->flags, PBLK_WRITABLE_ENTRY); + pblk_ppa_set_empty(&w_ctx->ppa); +} + +#define pblk_rb_ring_count(head, tail, size) CIRC_CNT(head, tail, size) +#define pblk_rb_ring_space(rb, head, tail, size) \ + (CIRC_SPACE(head, tail, size)) + +/* + * Buffer space is calculated with respect to the back pointer signaling + * synchronized entries to the media. 
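+ * (i.e. CIRC_SPACE(mem, sync, nr_entries): an entry is only reused once its
+ * data has been synced to the device.)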
+ */ +static unsigned int pblk_rb_space(struct pblk_rb *rb) +{ + unsigned int mem = READ_ONCE(rb->mem); + unsigned int sync = READ_ONCE(rb->sync); + + return pblk_rb_ring_space(rb, mem, sync, rb->nr_entries); +} + +/* + * Buffer count is calculated with respect to the submission entry signaling the + * entries that are available to send to the media + */ +unsigned int pblk_rb_read_count(struct pblk_rb *rb) +{ + unsigned int mem = READ_ONCE(rb->mem); + unsigned int subm = READ_ONCE(rb->subm); + + return pblk_rb_ring_count(mem, subm, rb->nr_entries); +} + +unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries) +{ + unsigned int subm; + + subm = READ_ONCE(rb->subm); + /* Commit read means updating submission pointer */ + smp_store_release(&rb->subm, + (subm + nr_entries) & (rb->nr_entries - 1)); + + return subm; +} + +static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd, + unsigned int to_update) +{ + struct pblk *pblk = container_of(rb, struct pblk, rwb); + struct pblk_line *line; + struct pblk_rb_entry *entry; + struct pblk_w_ctx *w_ctx; + unsigned int i; + + for (i = 0; i < to_update; i++) { + entry = &rb->entries[*l2p_upd]; + w_ctx = &entry->w_ctx; + + pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa, + entry->cacheline); + + line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)]; + kref_put(&line->ref, pblk_line_put); + clean_wctx(w_ctx); + *l2p_upd = (*l2p_upd + 1) & (rb->nr_entries - 1); + } + + return 0; +} + +/* + * When we move the l2p_update pointer, we update the l2p table - lookups will + * point to the physical address instead of to the cacheline in the write buffer + * from this moment on. + */ +static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries, + unsigned int mem, unsigned int sync) +{ + unsigned int space, count; + int ret = 0; + + lockdep_assert_held(&rb->w_lock); + + /* Update l2p only as buffer entries are being overwritten */ + space = pblk_rb_ring_space(rb, mem, rb->l2p_update, rb->nr_entries); + if (space > nr_entries) + goto out; + + count = nr_entries - space; + /* l2p_update used exclusively under rb->w_lock */ + ret = __pblk_rb_update_l2p(rb, &rb->l2p_update, count); + +out: + return ret; +} + +/* + * Update the l2p entry for all sectors stored on the write buffer. This means + * that all future lookups to the l2p table will point to a device address, not + * to the cacheline in the write buffer. + */ +void pblk_rb_sync_l2p(struct pblk_rb *rb) +{ + unsigned int sync; + unsigned int to_update; + + spin_lock(&rb->w_lock); + + /* Protect from reads and writes */ + sync = smp_load_acquire(&rb->sync); + + to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries); + __pblk_rb_update_l2p(rb, &rb->l2p_update, to_update); + + spin_unlock(&rb->w_lock); +} + +/* + * Write @nr_entries to ring buffer from @data buffer if there is enough space. + * Typically, 4KB data chunks coming from a bio will be copied to the ring + * buffer, thus the write will fail if not all incoming data can be copied. 
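+ * The space check itself is done by the pblk_rb_may_write* helpers below;
+ * the write paths here only copy data once space has been reserved.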
+ * + */ +static void __pblk_rb_write_entry(struct pblk_rb *rb, void *data, + struct pblk_w_ctx w_ctx, + struct pblk_rb_entry *entry) +{ + memcpy(entry->data, data, rb->seg_size); + + entry->w_ctx.lba = w_ctx.lba; + entry->w_ctx.ppa = w_ctx.ppa; +} + +void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, + struct pblk_w_ctx w_ctx, unsigned int ring_pos) +{ + struct pblk *pblk = container_of(rb, struct pblk, rwb); + struct pblk_rb_entry *entry; + int flags; + + entry = &rb->entries[ring_pos]; + flags = READ_ONCE(entry->w_ctx.flags); +#ifdef CONFIG_NVM_DEBUG + /* Caller must guarantee that the entry is free */ + BUG_ON(!(flags & PBLK_WRITABLE_ENTRY)); +#endif + + __pblk_rb_write_entry(rb, data, w_ctx, entry); + + pblk_update_map_cache(pblk, w_ctx.lba, entry->cacheline); + flags = w_ctx.flags | PBLK_WRITTEN_DATA; + + /* Release flags on write context. Protect from writes */ + smp_store_release(&entry->w_ctx.flags, flags); +} + +void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, + struct pblk_w_ctx w_ctx, struct pblk_line *gc_line, + unsigned int ring_pos) +{ + struct pblk *pblk = container_of(rb, struct pblk, rwb); + struct pblk_rb_entry *entry; + int flags; + + entry = &rb->entries[ring_pos]; + flags = READ_ONCE(entry->w_ctx.flags); +#ifdef CONFIG_NVM_DEBUG + /* Caller must guarantee that the entry is free */ + BUG_ON(!(flags & PBLK_WRITABLE_ENTRY)); +#endif + + __pblk_rb_write_entry(rb, data, w_ctx, entry); + + if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, gc_line)) + entry->w_ctx.lba = ADDR_EMPTY; + + flags = w_ctx.flags | PBLK_WRITTEN_DATA; + + /* Release flags on write context. Protect from writes */ + smp_store_release(&entry->w_ctx.flags, flags); +} + +static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio, + unsigned int pos) +{ + struct pblk_rb_entry *entry; + unsigned int subm, sync_point; + int flags; + + subm = READ_ONCE(rb->subm); + +#ifdef CONFIG_NVM_DEBUG + atomic_inc(&rb->inflight_sync_point); +#endif + + if (pos == subm) + return 0; + + sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1); + entry = &rb->entries[sync_point]; + + flags = READ_ONCE(entry->w_ctx.flags); + flags |= PBLK_FLUSH_ENTRY; + + /* Release flags on context. 
Protect from writes */ + smp_store_release(&entry->w_ctx.flags, flags); + + /* Protect syncs */ + smp_store_release(&rb->sync_point, sync_point); + + spin_lock_irq(&rb->s_lock); + bio_list_add(&entry->w_ctx.bios, bio); + spin_unlock_irq(&rb->s_lock); + + return 1; +} + +static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, + unsigned int *pos) +{ + unsigned int mem; + unsigned int sync; + + sync = READ_ONCE(rb->sync); + mem = READ_ONCE(rb->mem); + + if (pblk_rb_ring_space(rb, mem, sync, rb->nr_entries) < nr_entries) + return 0; + + if (pblk_rb_update_l2p(rb, nr_entries, mem, sync)) + return 0; + + *pos = mem; + + return 1; +} + +static int pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, + unsigned int *pos) +{ + if (!__pblk_rb_may_write(rb, nr_entries, pos)) + return 0; + + /* Protect from read count */ + smp_store_release(&rb->mem, (*pos + nr_entries) & (rb->nr_entries - 1)); + return 1; +} + +static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries, + unsigned int *pos, struct bio *bio, + int *io_ret) +{ + unsigned int mem; + + if (!__pblk_rb_may_write(rb, nr_entries, pos)) + return 0; + + mem = (*pos + nr_entries) & (rb->nr_entries - 1); + *io_ret = NVM_IO_DONE; + + if (bio->bi_opf & REQ_PREFLUSH) { + struct pblk *pblk = container_of(rb, struct pblk, rwb); + +#ifdef CONFIG_NVM_DEBUG + atomic_long_inc(&pblk->nr_flush); +#endif + if (pblk_rb_sync_point_set(&pblk->rwb, bio, mem)) + *io_ret = NVM_IO_OK; + } + + /* Protect from read count */ + smp_store_release(&rb->mem, mem); + return 1; +} + +/* + * Atomically check that (i) there is space on the write buffer for the + * incoming I/O, and (ii) the current I/O type has enough budget in the write + * buffer (rate-limiter). + */ +int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio, + unsigned int nr_entries, unsigned int *pos) +{ + struct pblk *pblk = container_of(rb, struct pblk, rwb); + int flush_done; + + spin_lock(&rb->w_lock); + if (!pblk_rl_user_may_insert(&pblk->rl, nr_entries)) { + spin_unlock(&rb->w_lock); + return NVM_IO_REQUEUE; + } + + if (!pblk_rb_may_write_flush(rb, nr_entries, pos, bio, &flush_done)) { + spin_unlock(&rb->w_lock); + return NVM_IO_REQUEUE; + } + + pblk_rl_user_in(&pblk->rl, nr_entries); + spin_unlock(&rb->w_lock); + + return flush_done; +} + +/* + * Look at pblk_rb_may_write_user comment + */ +int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries, + unsigned int *pos) +{ + struct pblk *pblk = container_of(rb, struct pblk, rwb); + + spin_lock(&rb->w_lock); + if (!pblk_rl_gc_may_insert(&pblk->rl, nr_entries)) { + spin_unlock(&rb->w_lock); + return 0; + } + + if (!pblk_rb_may_write(rb, nr_entries, pos)) { + spin_unlock(&rb->w_lock); + return 0; + } + + pblk_rl_gc_in(&pblk->rl, nr_entries); + spin_unlock(&rb->w_lock); + + return 1; +} + +/* + * The caller of this function must ensure that the backpointer will not + * overwrite the entries passed on the list. 
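+ * This is used by the recovery path to re-submit buffer entries whose
+ * original write failed.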
+ */ +unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio, + struct list_head *list, + unsigned int max) +{ + struct pblk_rb_entry *entry, *tentry; + struct page *page; + unsigned int read = 0; + int ret; + + list_for_each_entry_safe(entry, tentry, list, index) { + if (read > max) { + pr_err("pblk: too many entries on list\n"); + goto out; + } + + page = virt_to_page(entry->data); + if (!page) { + pr_err("pblk: could not allocate write bio page\n"); + goto out; + } + + ret = bio_add_page(bio, page, rb->seg_size, 0); + if (ret != rb->seg_size) { + pr_err("pblk: could not add page to write bio\n"); + goto out; + } + + list_del(&entry->index); + read++; + } + +out: + return read; +} + +/* + * Read available entries on rb and add them to the given bio. To avoid a memory + * copy, a page reference to the write buffer is used to be added to the bio. + * + * This function is used by the write thread to form the write bio that will + * persist data on the write buffer to the media. + */ +unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct bio *bio, + struct pblk_c_ctx *c_ctx, + unsigned int pos, + unsigned int nr_entries, + unsigned int count) +{ + struct pblk *pblk = container_of(rb, struct pblk, rwb); + struct pblk_rb_entry *entry; + struct page *page; + unsigned int pad = 0, read = 0, to_read = nr_entries; + unsigned int user_io = 0, gc_io = 0; + unsigned int i; + int flags; + int ret; + + if (count < nr_entries) { + pad = nr_entries - count; + to_read = count; + } + + c_ctx->sentry = pos; + c_ctx->nr_valid = to_read; + c_ctx->nr_padded = pad; + + for (i = 0; i < to_read; i++) { + entry = &rb->entries[pos]; + + /* A write has been allowed into the buffer, but data is still + * being copied to it. It is ok to busy wait. + */ +try: + flags = READ_ONCE(entry->w_ctx.flags); + if (!(flags & PBLK_WRITTEN_DATA)) + goto try; + + if (flags & PBLK_IOTYPE_USER) + user_io++; + else if (flags & PBLK_IOTYPE_GC) + gc_io++; + else + WARN(1, "pblk: unknown IO type\n"); + + page = virt_to_page(entry->data); + if (!page) { + pr_err("pblk: could not allocate write bio page\n"); + flags &= ~PBLK_WRITTEN_DATA; + flags |= PBLK_SUBMITTED_ENTRY; + /* Release flags on context. Protect from writes */ + smp_store_release(&entry->w_ctx.flags, flags); + goto out; + } + + ret = bio_add_page(bio, page, rb->seg_size, 0); + if (ret != rb->seg_size) { + pr_err("pblk: could not add page to write bio\n"); + flags &= ~PBLK_WRITTEN_DATA; + flags |= PBLK_SUBMITTED_ENTRY; + /* Release flags on context. Protect from writes */ + smp_store_release(&entry->w_ctx.flags, flags); + goto out; + } + + if (flags & PBLK_FLUSH_ENTRY) { + unsigned int sync_point; + + sync_point = READ_ONCE(rb->sync_point); + if (sync_point == pos) { + /* Protect syncs */ + smp_store_release(&rb->sync_point, EMPTY_ENTRY); + } + + flags &= ~PBLK_FLUSH_ENTRY; +#ifdef CONFIG_NVM_DEBUG + atomic_dec(&rb->inflight_sync_point); +#endif + } + + flags &= ~PBLK_WRITTEN_DATA; + flags |= PBLK_SUBMITTED_ENTRY; + + /* Release flags on context. Protect from writes */ + smp_store_release(&entry->w_ctx.flags, flags); + + pos = (pos + 1) & (rb->nr_entries - 1); + } + + read = to_read; + pblk_rl_out(&pblk->rl, user_io, gc_io); +#ifdef CONFIG_NVM_DEBUG + atomic_long_add(pad, &((struct pblk *) + (container_of(rb, struct pblk, rwb)))->padded_writes); +#endif +out: + return read; +} + +/* + * Copy to bio only if the lba matches the one on the given cache entry. 
+ * Otherwise, it means that the entry has been overwritten, and the bio should + * be directed to disk. + */ +int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, + u64 pos, int bio_iter) +{ + struct pblk_rb_entry *entry; + struct pblk_w_ctx *w_ctx; + void *data; + int flags; + int ret = 1; + + spin_lock(&rb->w_lock); + +#ifdef CONFIG_NVM_DEBUG + /* Caller must ensure that the access will not cause an overflow */ + BUG_ON(pos >= rb->nr_entries); +#endif + entry = &rb->entries[pos]; + w_ctx = &entry->w_ctx; + flags = READ_ONCE(w_ctx->flags); + + /* Check if the entry has been overwritten or is scheduled to be */ + if (w_ctx->lba != lba || flags & PBLK_WRITABLE_ENTRY) { + ret = 0; + goto out; + } + + /* Only advance the bio if it hasn't been advanced already. If advanced, + * this bio is at least a partial bio (i.e., it has partially been + * filled with data from the cache). If part of the data resides on the + * media, we will read later on + */ + if (unlikely(!bio->bi_iter.bi_idx)) + bio_advance(bio, bio_iter * PBLK_EXPOSED_PAGE_SIZE); + + data = bio_data(bio); + memcpy(data, entry->data, rb->seg_size); + +out: + spin_unlock(&rb->w_lock); + return ret; +} + +struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos) +{ + unsigned int entry = pos & (rb->nr_entries - 1); + + return &rb->entries[entry].w_ctx; +} + +unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags) + __acquires(&rb->s_lock) +{ + if (flags) + spin_lock_irqsave(&rb->s_lock, *flags); + else + spin_lock_irq(&rb->s_lock); + + return rb->sync; +} + +void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags) + __releases(&rb->s_lock) +{ + lockdep_assert_held(&rb->s_lock); + + if (flags) + spin_unlock_irqrestore(&rb->s_lock, *flags); + else + spin_unlock_irq(&rb->s_lock); +} + +unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries) +{ + unsigned int sync; + unsigned int i; + + lockdep_assert_held(&rb->s_lock); + + sync = READ_ONCE(rb->sync); + + for (i = 0; i < nr_entries; i++) + sync = (sync + 1) & (rb->nr_entries - 1); + + /* Protect from counts */ + smp_store_release(&rb->sync, sync); + + return sync; +} + +unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb) +{ + unsigned int subm, sync_point; + unsigned int count; + + /* Protect syncs */ + sync_point = smp_load_acquire(&rb->sync_point); + if (sync_point == EMPTY_ENTRY) + return 0; + + subm = READ_ONCE(rb->subm); + + /* The sync point itself counts as a sector to sync */ + count = pblk_rb_ring_count(sync_point, subm, rb->nr_entries) + 1; + + return count; +} + +/* + * Scan from the current position of the sync pointer to find the entry that + * corresponds to the given ppa. This is necessary since write requests can be + * completed out of order. The assumption is that the ppa is close to the sync + * pointer thus the search will not take long. + * + * The caller of this function must guarantee that the sync pointer will no + * reach the entry while it is using the metadata associated with it. With this + * assumption in mind, there is no need to take the sync lock. 
+ */ +struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb, + struct ppa_addr *ppa) +{ + unsigned int sync, subm, count; + unsigned int i; + + sync = READ_ONCE(rb->sync); + subm = READ_ONCE(rb->subm); + count = pblk_rb_ring_count(subm, sync, rb->nr_entries); + + for (i = 0; i < count; i++) + sync = (sync + 1) & (rb->nr_entries - 1); + + return NULL; +} + +int pblk_rb_tear_down_check(struct pblk_rb *rb) +{ + struct pblk_rb_entry *entry; + int i; + int ret = 0; + + spin_lock(&rb->w_lock); + spin_lock_irq(&rb->s_lock); + + if ((rb->mem == rb->subm) && (rb->subm == rb->sync) && + (rb->sync == rb->l2p_update) && + (rb->sync_point == EMPTY_ENTRY)) { + goto out; + } + + if (!rb->entries) { + ret = 1; + goto out; + } + + for (i = 0; i < rb->nr_entries; i++) { + entry = &rb->entries[i]; + + if (!entry->data) { + ret = 1; + goto out; + } + } + +out: + spin_unlock(&rb->w_lock); + spin_unlock_irq(&rb->s_lock); + + return ret; +} + +unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos) +{ + return (pos & (rb->nr_entries - 1)); +} + +int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos) +{ + return (pos >= rb->nr_entries); +} + +ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf) +{ + struct pblk *pblk = container_of(rb, struct pblk, rwb); + struct pblk_c_ctx *c; + ssize_t offset; + int queued_entries = 0; + + spin_lock_irq(&rb->s_lock); + list_for_each_entry(c, &pblk->compl_list, list) + queued_entries++; + spin_unlock_irq(&rb->s_lock); + + if (rb->sync_point != EMPTY_ENTRY) + offset = scnprintf(buf, PAGE_SIZE, + "%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n", + rb->nr_entries, + rb->mem, + rb->subm, + rb->sync, + rb->l2p_update, +#ifdef CONFIG_NVM_DEBUG + atomic_read(&rb->inflight_sync_point), +#else + 0, +#endif + rb->sync_point, + pblk_rb_read_count(rb), + pblk_rb_space(rb), + pblk_rb_sync_point_count(rb), + queued_entries); + else + offset = scnprintf(buf, PAGE_SIZE, + "%u\t%u\t%u\t%u\t%u\t%u\tNULL - %u/%u/%u - %d\n", + rb->nr_entries, + rb->mem, + rb->subm, + rb->sync, + rb->l2p_update, +#ifdef CONFIG_NVM_DEBUG + atomic_read(&rb->inflight_sync_point), +#else + 0, +#endif + pblk_rb_read_count(rb), + pblk_rb_space(rb), + pblk_rb_sync_point_count(rb), + queued_entries); + + return offset; +} diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c new file mode 100644 index 000000000000..eff0982c076f --- /dev/null +++ b/drivers/lightnvm/pblk-read.c @@ -0,0 +1,529 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez + * Matias Bjorling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * pblk-read.c - pblk's read path + */ + +#include "pblk.h" + +/* + * There is no guarantee that the value read from cache has not been updated and + * resides at another location in the cache. We guarantee though that if the + * value is read from the cache, it belongs to the mapped lba. In order to + * guarantee and order between writes and reads are ordered, a flush must be + * issued. 
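+ * In other words, a read racing with a write to the same lba may return
+ * either the old or the new data for that lba unless a flush orders them.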
+ */ +static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio, + sector_t lba, struct ppa_addr ppa, + int bio_iter) +{ +#ifdef CONFIG_NVM_DEBUG + /* Callers must ensure that the ppa points to a cache address */ + BUG_ON(pblk_ppa_empty(ppa)); + BUG_ON(!pblk_addr_in_cache(ppa)); +#endif + + return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba, + pblk_addr_to_cacheline(ppa), bio_iter); +} + +static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, + unsigned long *read_bitmap) +{ + struct bio *bio = rqd->bio; + struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS]; + sector_t blba = pblk_get_lba(bio); + int nr_secs = rqd->nr_ppas; + int advanced_bio = 0; + int i, j = 0; + + /* logic error: lba out-of-bounds. Ignore read request */ + if (!(blba + nr_secs < pblk->rl.nr_secs)) { + WARN_ON("pblk: read lbas out of bounds\n"); + return; + } + + pblk_lookup_l2p_seq(pblk, ppas, blba, nr_secs); + + for (i = 0; i < nr_secs; i++) { + struct ppa_addr p = ppas[i]; + sector_t lba = blba + i; + +retry: + if (pblk_ppa_empty(p)) { + WARN_ON(test_and_set_bit(i, read_bitmap)); + continue; + } + + /* Try to read from write buffer. The address is later checked + * on the write buffer to prevent retrieving overwritten data. + */ + if (pblk_addr_in_cache(p)) { + if (!pblk_read_from_cache(pblk, bio, lba, p, i)) { + pblk_lookup_l2p_seq(pblk, &p, lba, 1); + goto retry; + } + WARN_ON(test_and_set_bit(i, read_bitmap)); + advanced_bio = 1; + } else { + /* Read from media non-cached sectors */ + rqd->ppa_list[j++] = p; + } + + if (advanced_bio) + bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE); + } + +#ifdef CONFIG_NVM_DEBUG + atomic_long_add(nr_secs, &pblk->inflight_reads); +#endif +} + +static int pblk_submit_read_io(struct pblk *pblk, struct nvm_rq *rqd) +{ + int err; + + rqd->flags = pblk_set_read_mode(pblk); + + err = pblk_submit_io(pblk, rqd); + if (err) + return NVM_IO_ERR; + + return NVM_IO_OK; +} + +static void pblk_end_io_read(struct nvm_rq *rqd) +{ + struct pblk *pblk = rqd->private; + struct nvm_tgt_dev *dev = pblk->dev; + struct pblk_r_ctx *r_ctx = nvm_rq_to_pdu(rqd); + struct bio *bio = rqd->bio; + + if (rqd->error) + pblk_log_read_err(pblk, rqd); +#ifdef CONFIG_NVM_DEBUG + else + WARN_ONCE(bio->bi_error, "pblk: corrupted read error\n"); +#endif + + if (rqd->nr_ppas > 1) + nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list); + + bio_put(bio); + if (r_ctx->orig_bio) { +#ifdef CONFIG_NVM_DEBUG + WARN_ONCE(r_ctx->orig_bio->bi_error, + "pblk: corrupted read bio\n"); +#endif + bio_endio(r_ctx->orig_bio); + bio_put(r_ctx->orig_bio); + } + +#ifdef CONFIG_NVM_DEBUG + atomic_long_add(rqd->nr_ppas, &pblk->sync_reads); + atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads); +#endif + + pblk_free_rqd(pblk, rqd, READ); +} + +static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, + unsigned int bio_init_idx, + unsigned long *read_bitmap) +{ + struct bio *new_bio, *bio = rqd->bio; + struct bio_vec src_bv, dst_bv; + void *ppa_ptr = NULL; + void *src_p, *dst_p; + dma_addr_t dma_ppa_list = 0; + int nr_secs = rqd->nr_ppas; + int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); + int i, ret, hole; + DECLARE_COMPLETION_ONSTACK(wait); + + new_bio = bio_alloc(GFP_KERNEL, nr_holes); + if (!new_bio) { + pr_err("pblk: could not alloc read bio\n"); + return NVM_IO_ERR; + } + + if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes)) + goto err; + + if (nr_holes != new_bio->bi_vcnt) { + pr_err("pblk: malformed bio\n"); + goto err; + } + + new_bio->bi_iter.bi_sector = 0; /* internal bio */ 
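+	/* The new bio reads only the holes; its data is copied back into
+	 * the original bio below */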
+ bio_set_op_attrs(new_bio, REQ_OP_READ, 0); + new_bio->bi_private = &wait; + new_bio->bi_end_io = pblk_end_bio_sync; + + rqd->bio = new_bio; + rqd->nr_ppas = nr_holes; + rqd->end_io = NULL; + + if (unlikely(nr_secs > 1 && nr_holes == 1)) { + ppa_ptr = rqd->ppa_list; + dma_ppa_list = rqd->dma_ppa_list; + rqd->ppa_addr = rqd->ppa_list[0]; + } + + ret = pblk_submit_read_io(pblk, rqd); + if (ret) { + bio_put(rqd->bio); + pr_err("pblk: read IO submission failed\n"); + goto err; + } + + if (!wait_for_completion_io_timeout(&wait, + msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { + pr_err("pblk: partial read I/O timed out\n"); + } + + if (rqd->error) { + atomic_long_inc(&pblk->read_failed); +#ifdef CONFIG_NVM_DEBUG + pblk_print_failed_rqd(pblk, rqd, rqd->error); +#endif + } + + if (unlikely(nr_secs > 1 && nr_holes == 1)) { + rqd->ppa_list = ppa_ptr; + rqd->dma_ppa_list = dma_ppa_list; + } + + /* Fill the holes in the original bio */ + i = 0; + hole = find_first_zero_bit(read_bitmap, nr_secs); + do { + src_bv = new_bio->bi_io_vec[i++]; + dst_bv = bio->bi_io_vec[bio_init_idx + hole]; + + src_p = kmap_atomic(src_bv.bv_page); + dst_p = kmap_atomic(dst_bv.bv_page); + + memcpy(dst_p + dst_bv.bv_offset, + src_p + src_bv.bv_offset, + PBLK_EXPOSED_PAGE_SIZE); + + kunmap_atomic(src_p); + kunmap_atomic(dst_p); + + mempool_free(src_bv.bv_page, pblk->page_pool); + + hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1); + } while (hole < nr_secs); + + bio_put(new_bio); + + /* Complete the original bio and associated request */ + rqd->bio = bio; + rqd->nr_ppas = nr_secs; + rqd->private = pblk; + + bio_endio(bio); + pblk_end_io_read(rqd); + return NVM_IO_OK; + +err: + /* Free allocated pages in new bio */ + pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt); + rqd->private = pblk; + pblk_end_io_read(rqd); + return NVM_IO_ERR; +} + +static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, + unsigned long *read_bitmap) +{ + struct bio *bio = rqd->bio; + struct ppa_addr ppa; + sector_t lba = pblk_get_lba(bio); + + /* logic error: lba out-of-bounds. Ignore read request */ + if (!(lba < pblk->rl.nr_secs)) { + WARN_ON("pblk: read lba out of bounds\n"); + return; + } + + pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); + +#ifdef CONFIG_NVM_DEBUG + atomic_long_inc(&pblk->inflight_reads); +#endif + +retry: + if (pblk_ppa_empty(ppa)) { + WARN_ON(test_and_set_bit(0, read_bitmap)); + return; + } + + /* Try to read from write buffer. The address is later checked on the + * write buffer to prevent retrieving overwritten data. + */ + if (pblk_addr_in_cache(ppa)) { + if (!pblk_read_from_cache(pblk, bio, lba, ppa, 0)) { + pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); + goto retry; + } + WARN_ON(test_and_set_bit(0, read_bitmap)); + } else { + rqd->ppa_addr = ppa; + } +} + +int pblk_submit_read(struct pblk *pblk, struct bio *bio) +{ + struct nvm_tgt_dev *dev = pblk->dev; + int nr_secs = pblk_get_secs(bio); + struct nvm_rq *rqd; + unsigned long read_bitmap; /* Max 64 ppas per request */ + unsigned int bio_init_idx; + int ret = NVM_IO_ERR; + + if (nr_secs > PBLK_MAX_REQ_ADDRS) + return NVM_IO_ERR; + + bitmap_zero(&read_bitmap, nr_secs); + + rqd = pblk_alloc_rqd(pblk, READ); + if (IS_ERR(rqd)) { + pr_err_ratelimited("pblk: not able to alloc rqd"); + return NVM_IO_ERR; + } + + rqd->opcode = NVM_OP_PREAD; + rqd->bio = bio; + rqd->nr_ppas = nr_secs; + rqd->private = pblk; + rqd->end_io = pblk_end_io_read; + + /* Save the index for this bio's start. This is needed in case + * we need to fill a partial read. 
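+	 * The saved index is used to locate the original bio's pages when the
+	 * sectors read from the device are copied back into them.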
+ */ + bio_init_idx = pblk_get_bi_idx(bio); + + if (nr_secs > 1) { + rqd->ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, + &rqd->dma_ppa_list); + if (!rqd->ppa_list) { + pr_err("pblk: not able to allocate ppa list\n"); + goto fail_rqd_free; + } + + pblk_read_ppalist_rq(pblk, rqd, &read_bitmap); + } else { + pblk_read_rq(pblk, rqd, &read_bitmap); + } + + bio_get(bio); + if (bitmap_full(&read_bitmap, nr_secs)) { + bio_endio(bio); + pblk_end_io_read(rqd); + return NVM_IO_OK; + } + + /* All sectors are to be read from the device */ + if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) { + struct bio *int_bio = NULL; + struct pblk_r_ctx *r_ctx = nvm_rq_to_pdu(rqd); + + /* Clone read bio to deal with read errors internally */ + int_bio = bio_clone_bioset(bio, GFP_KERNEL, fs_bio_set); + if (!int_bio) { + pr_err("pblk: could not clone read bio\n"); + return NVM_IO_ERR; + } + + rqd->bio = int_bio; + r_ctx->orig_bio = bio; + + ret = pblk_submit_read_io(pblk, rqd); + if (ret) { + pr_err("pblk: read IO submission failed\n"); + if (int_bio) + bio_put(int_bio); + return ret; + } + + return NVM_IO_OK; + } + + /* The read bio request could be partially filled by the write buffer, + * but there are some holes that need to be read from the drive. + */ + ret = pblk_fill_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap); + if (ret) { + pr_err("pblk: failed to perform partial read\n"); + return ret; + } + + return NVM_IO_OK; + +fail_rqd_free: + pblk_free_rqd(pblk, rqd, READ); + return ret; +} + +static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, + struct pblk_line *line, u64 *lba_list, + unsigned int nr_secs) +{ + struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS]; + int valid_secs = 0; + int i; + + pblk_lookup_l2p_rand(pblk, ppas, lba_list, nr_secs); + + for (i = 0; i < nr_secs; i++) { + if (pblk_addr_in_cache(ppas[i]) || ppas[i].g.blk != line->id || + pblk_ppa_empty(ppas[i])) { + lba_list[i] = ADDR_EMPTY; + continue; + } + + rqd->ppa_list[valid_secs++] = ppas[i]; + } + +#ifdef CONFIG_NVM_DEBUG + atomic_long_add(valid_secs, &pblk->inflight_reads); +#endif + return valid_secs; +} + +static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, + struct pblk_line *line, sector_t lba) +{ + struct ppa_addr ppa; + int valid_secs = 0; + + /* logic error: lba out-of-bounds */ + if (!(lba < pblk->rl.nr_secs)) { + WARN_ON("pblk: read lba out of bounds\n"); + goto out; + } + + if (lba == ADDR_EMPTY) + goto out; + + spin_lock(&pblk->trans_lock); + ppa = pblk_trans_map_get(pblk, lba); + spin_unlock(&pblk->trans_lock); + + /* Ignore updated values until the moment */ + if (pblk_addr_in_cache(ppa) || ppa.g.blk != line->id || + pblk_ppa_empty(ppa)) + goto out; + + rqd->ppa_addr = ppa; + valid_secs = 1; + +#ifdef CONFIG_NVM_DEBUG + atomic_long_inc(&pblk->inflight_reads); +#endif + +out: + return valid_secs; +} + +int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, + unsigned int nr_secs, unsigned int *secs_to_gc, + struct pblk_line *line) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct request_queue *q = dev->q; + struct bio *bio; + struct nvm_rq rqd; + int ret, data_len; + DECLARE_COMPLETION_ONSTACK(wait); + + memset(&rqd, 0, sizeof(struct nvm_rq)); + + if (nr_secs > 1) { + rqd.ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, + &rqd.dma_ppa_list); + if (!rqd.ppa_list) + return NVM_IO_ERR; + + *secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, line, lba_list, + nr_secs); + if (*secs_to_gc == 1) { + struct ppa_addr ppa; + + ppa = rqd.ppa_list[0]; + 
nvm_dev_dma_free(dev->parent, rqd.ppa_list, + rqd.dma_ppa_list); + rqd.ppa_addr = ppa; + } + } else { + *secs_to_gc = read_rq_gc(pblk, &rqd, line, lba_list[0]); + } + + if (!(*secs_to_gc)) + goto out; + + data_len = (*secs_to_gc) * geo->sec_size; + bio = bio_map_kern(q, data, data_len, GFP_KERNEL); + if (IS_ERR(bio)) { + pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio)); + goto err_free_dma; + } + + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, REQ_OP_READ, 0); + + rqd.opcode = NVM_OP_PREAD; + rqd.end_io = pblk_end_io_sync; + rqd.private = &wait; + rqd.nr_ppas = *secs_to_gc; + rqd.bio = bio; + + ret = pblk_submit_read_io(pblk, &rqd); + if (ret) { + bio_endio(bio); + pr_err("pblk: GC read request failed\n"); + goto err_free_dma; + } + + if (!wait_for_completion_io_timeout(&wait, + msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { + pr_err("pblk: GC read I/O timed out\n"); + } + + if (rqd.error) { + atomic_long_inc(&pblk->read_failed_gc); +#ifdef CONFIG_NVM_DEBUG + pblk_print_failed_rqd(pblk, &rqd, rqd.error); +#endif + } + +#ifdef CONFIG_NVM_DEBUG + atomic_long_add(*secs_to_gc, &pblk->sync_reads); + atomic_long_add(*secs_to_gc, &pblk->recov_gc_reads); + atomic_long_sub(*secs_to_gc, &pblk->inflight_reads); +#endif + +out: + if (rqd.nr_ppas > 1) + nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list); + return NVM_IO_OK; + +err_free_dma: + if (rqd.nr_ppas > 1) + nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list); + return NVM_IO_ERR; +} diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c new file mode 100644 index 000000000000..0d50f415cfde --- /dev/null +++ b/drivers/lightnvm/pblk-recovery.c @@ -0,0 +1,998 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial: Javier Gonzalez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * pblk-recovery.c - pblk's recovery path + */ + +#include "pblk.h" + +void pblk_submit_rec(struct work_struct *work) +{ + struct pblk_rec_ctx *recovery = + container_of(work, struct pblk_rec_ctx, ws_rec); + struct pblk *pblk = recovery->pblk; + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_rq *rqd = recovery->rqd; + struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); + int max_secs = nvm_max_phys_sects(dev); + struct bio *bio; + unsigned int nr_rec_secs; + unsigned int pgs_read; + int ret; + + nr_rec_secs = bitmap_weight((unsigned long int *)&rqd->ppa_status, + max_secs); + + bio = bio_alloc(GFP_KERNEL, nr_rec_secs); + if (!bio) { + pr_err("pblk: not able to create recovery bio\n"); + return; + } + + bio->bi_iter.bi_sector = 0; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + rqd->bio = bio; + rqd->nr_ppas = nr_rec_secs; + + pgs_read = pblk_rb_read_to_bio_list(&pblk->rwb, bio, &recovery->failed, + nr_rec_secs); + if (pgs_read != nr_rec_secs) { + pr_err("pblk: could not read recovery entries\n"); + goto err; + } + + if (pblk_setup_w_rec_rq(pblk, rqd, c_ctx)) { + pr_err("pblk: could not setup recovery request\n"); + goto err; + } + +#ifdef CONFIG_NVM_DEBUG + atomic_long_add(nr_rec_secs, &pblk->recov_writes); +#endif + + ret = pblk_submit_io(pblk, rqd); + if (ret) { + pr_err("pblk: I/O submission failed: %d\n", ret); + goto err; + } + + mempool_free(recovery, pblk->rec_pool); + return; + +err: + bio_put(bio); + pblk_free_rqd(pblk, rqd, WRITE); +} + +int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, + struct pblk_rec_ctx *recovery, u64 *comp_bits, + unsigned int comp) +{ + struct nvm_tgt_dev *dev = pblk->dev; + int max_secs = nvm_max_phys_sects(dev); + struct nvm_rq *rec_rqd; + struct pblk_c_ctx *rec_ctx; + int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded; + + rec_rqd = pblk_alloc_rqd(pblk, WRITE); + if (IS_ERR(rec_rqd)) { + pr_err("pblk: could not create recovery req.\n"); + return -ENOMEM; + } + + rec_ctx = nvm_rq_to_pdu(rec_rqd); + + /* Copy completion bitmap, but exclude the first X completed entries */ + bitmap_shift_right((unsigned long int *)&rec_rqd->ppa_status, + (unsigned long int *)comp_bits, + comp, max_secs); + + /* Save the context for the entries that need to be re-written and + * update current context with the completed entries. 
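+	 * The first 'comp' entries remain accounted to the current context;
+	 * the remaining entries are re-queued through rec_ctx, starting at the
+	 * wrapped sentry position.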
+ */ + rec_ctx->sentry = pblk_rb_wrap_pos(&pblk->rwb, c_ctx->sentry + comp); + if (comp >= c_ctx->nr_valid) { + rec_ctx->nr_valid = 0; + rec_ctx->nr_padded = nr_entries - comp; + + c_ctx->nr_padded = comp - c_ctx->nr_valid; + } else { + rec_ctx->nr_valid = c_ctx->nr_valid - comp; + rec_ctx->nr_padded = c_ctx->nr_padded; + + c_ctx->nr_valid = comp; + c_ctx->nr_padded = 0; + } + + recovery->rqd = rec_rqd; + recovery->pblk = pblk; + + return 0; +} + +__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta) +{ + u32 crc; + + crc = pblk_calc_emeta_crc(pblk, emeta); + if (le32_to_cpu(emeta->crc) != crc) + return NULL; + + if (le32_to_cpu(emeta->header.identifier) != PBLK_MAGIC) + return NULL; + + return pblk_line_emeta_to_lbas(emeta); +} + +static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + struct line_emeta *emeta = line->emeta; + __le64 *lba_list; + int data_start; + int nr_data_lbas, nr_valid_lbas, nr_lbas = 0; + int i; + + lba_list = pblk_recov_get_lba_list(pblk, emeta); + if (!lba_list) + return 1; + + data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; + nr_data_lbas = lm->sec_per_line - lm->emeta_sec; + nr_valid_lbas = le64_to_cpu(emeta->nr_valid_lbas); + + for (i = data_start; i < nr_data_lbas && nr_lbas < nr_valid_lbas; i++) { + struct ppa_addr ppa; + int pos; + + ppa = addr_to_pblk_ppa(pblk, i, line->id); + pos = pblk_ppa_to_pos(geo, ppa); + + /* Do not update bad blocks */ + if (test_bit(pos, line->blk_bitmap)) + continue; + + if (le64_to_cpu(lba_list[i]) == ADDR_EMPTY) { + spin_lock(&line->lock); + if (test_and_set_bit(i, line->invalid_bitmap)) + WARN_ON_ONCE("pblk: rec. 
double invalidate:\n"); + else + line->vsc--; + spin_unlock(&line->lock); + + continue; + } + + pblk_update_map(pblk, le64_to_cpu(lba_list[i]), ppa); + nr_lbas++; + } + + if (nr_valid_lbas != nr_lbas) + pr_err("pblk: line %d - inconsistent lba list(%llu/%d)\n", + line->id, line->emeta->nr_valid_lbas, nr_lbas); + + line->left_msecs = 0; + + return 0; +} + +static int pblk_calc_sec_in_line(struct pblk *pblk, struct pblk_line *line) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); + + return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec - + nr_bb * geo->sec_per_blk; +} + +struct pblk_recov_alloc { + struct ppa_addr *ppa_list; + struct pblk_sec_meta *meta_list; + struct nvm_rq *rqd; + void *data; + dma_addr_t dma_ppa_list; + dma_addr_t dma_meta_list; +}; + +static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line, + struct pblk_recov_alloc p, u64 r_ptr) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct ppa_addr *ppa_list; + struct pblk_sec_meta *meta_list; + struct nvm_rq *rqd; + struct bio *bio; + void *data; + dma_addr_t dma_ppa_list, dma_meta_list; + u64 r_ptr_int; + int left_ppas; + int rq_ppas, rq_len; + int i, j; + int ret = 0; + DECLARE_COMPLETION_ONSTACK(wait); + + ppa_list = p.ppa_list; + meta_list = p.meta_list; + rqd = p.rqd; + data = p.data; + dma_ppa_list = p.dma_ppa_list; + dma_meta_list = p.dma_meta_list; + + left_ppas = line->cur_sec - r_ptr; + if (!left_ppas) + return 0; + + r_ptr_int = r_ptr; + +next_read_rq: + memset(rqd, 0, pblk_r_rq_size); + + rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); + if (!rq_ppas) + rq_ppas = pblk->min_write_pgs; + rq_len = rq_ppas * geo->sec_size; + + bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL); + if (IS_ERR(bio)) + return PTR_ERR(bio); + + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, REQ_OP_READ, 0); + + rqd->bio = bio; + rqd->opcode = NVM_OP_PREAD; + rqd->flags = pblk_set_read_mode(pblk); + rqd->meta_list = meta_list; + rqd->nr_ppas = rq_ppas; + rqd->ppa_list = ppa_list; + rqd->dma_ppa_list = dma_ppa_list; + rqd->dma_meta_list = dma_meta_list; + rqd->end_io = pblk_end_io_sync; + rqd->private = &wait; + + for (i = 0; i < rqd->nr_ppas; ) { + struct ppa_addr ppa; + int pos; + + ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id); + pos = pblk_dev_ppa_to_pos(geo, ppa); + + while (test_bit(pos, line->blk_bitmap)) { + r_ptr_int += pblk->min_write_pgs; + ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id); + pos = pblk_dev_ppa_to_pos(geo, ppa); + } + + for (j = 0; j < pblk->min_write_pgs; j++, i++, r_ptr_int++) + rqd->ppa_list[i] = + addr_to_gen_ppa(pblk, r_ptr_int, line->id); + } + + /* If read fails, more padding is needed */ + ret = pblk_submit_io(pblk, rqd); + if (ret) { + pr_err("pblk: I/O submission failed: %d\n", ret); + return ret; + } + + if (!wait_for_completion_io_timeout(&wait, + msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { + pr_err("pblk: L2P recovery read timed out\n"); + return -EINTR; + } + + reinit_completion(&wait); + + /* At this point, the read should not fail. If it does, it is a problem + * we cannot recover from here. Need FTL log. 
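+	 * Recovering from a failed read at this point would require a
+	 * persistent FTL log.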
+ */ + if (rqd->error) { + pr_err("pblk: L2P recovery failed (%d)\n", rqd->error); + return -EINTR; + } + + for (i = 0; i < rqd->nr_ppas; i++) { + u64 lba = le64_to_cpu(meta_list[i].lba); + + if (lba == ADDR_EMPTY || lba > pblk->rl.nr_secs) + continue; + + pblk_update_map(pblk, lba, rqd->ppa_list[i]); + } + + left_ppas -= rq_ppas; + if (left_ppas > 0) + goto next_read_rq; + + return 0; +} + +static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line, + struct pblk_recov_alloc p, int left_ppas) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct ppa_addr *ppa_list; + struct pblk_sec_meta *meta_list; + struct nvm_rq *rqd; + struct bio *bio; + void *data; + dma_addr_t dma_ppa_list, dma_meta_list; + __le64 *lba_list = pblk_line_emeta_to_lbas(line->emeta); + u64 w_ptr = line->cur_sec; + int left_line_ppas = line->left_msecs; + int rq_ppas, rq_len; + int i, j; + int ret = 0; + DECLARE_COMPLETION_ONSTACK(wait); + + ppa_list = p.ppa_list; + meta_list = p.meta_list; + rqd = p.rqd; + data = p.data; + dma_ppa_list = p.dma_ppa_list; + dma_meta_list = p.dma_meta_list; + +next_pad_rq: + rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); + if (!rq_ppas) + rq_ppas = pblk->min_write_pgs; + rq_len = rq_ppas * geo->sec_size; + + bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL); + if (IS_ERR(bio)) + return PTR_ERR(bio); + + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + memset(rqd, 0, pblk_r_rq_size); + + rqd->bio = bio; + rqd->opcode = NVM_OP_PWRITE; + rqd->flags = pblk_set_progr_mode(pblk, WRITE); + rqd->meta_list = meta_list; + rqd->nr_ppas = rq_ppas; + rqd->ppa_list = ppa_list; + rqd->dma_ppa_list = dma_ppa_list; + rqd->dma_meta_list = dma_meta_list; + rqd->end_io = pblk_end_io_sync; + rqd->private = &wait; + + for (i = 0; i < rqd->nr_ppas; ) { + struct ppa_addr ppa; + int pos; + + w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); + ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id); + pos = pblk_ppa_to_pos(geo, ppa); + + while (test_bit(pos, line->blk_bitmap)) { + w_ptr += pblk->min_write_pgs; + ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id); + pos = pblk_ppa_to_pos(geo, ppa); + } + + for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) { + struct ppa_addr dev_ppa; + + dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); + + pblk_map_invalidate(pblk, dev_ppa); + meta_list[i].lba = cpu_to_le64(ADDR_EMPTY); + lba_list[w_ptr] = cpu_to_le64(ADDR_EMPTY); + rqd->ppa_list[i] = dev_ppa; + } + } + + ret = pblk_submit_io(pblk, rqd); + if (ret) { + pr_err("pblk: I/O submission failed: %d\n", ret); + return ret; + } + + if (!wait_for_completion_io_timeout(&wait, + msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { + pr_err("pblk: L2P recovery write timed out\n"); + } + reinit_completion(&wait); + + left_line_ppas -= rq_ppas; + left_ppas -= rq_ppas; + if (left_ppas > 0 && left_line_ppas) + goto next_pad_rq; + + return 0; +} + +/* When this function is called, it means that not all upper pages have been + * written in a page that contains valid data. 
In order to recover this data, we + * first find the write pointer on the device, then we pad all necessary + * sectors, and finally attempt to read the valid data + */ +static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line, + struct pblk_recov_alloc p) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct ppa_addr *ppa_list; + struct pblk_sec_meta *meta_list; + struct nvm_rq *rqd; + struct bio *bio; + void *data; + dma_addr_t dma_ppa_list, dma_meta_list; + u64 w_ptr = 0, r_ptr; + int rq_ppas, rq_len; + int i, j; + int ret = 0; + int rec_round; + int left_ppas = pblk_calc_sec_in_line(pblk, line) - line->cur_sec; + DECLARE_COMPLETION_ONSTACK(wait); + + ppa_list = p.ppa_list; + meta_list = p.meta_list; + rqd = p.rqd; + data = p.data; + dma_ppa_list = p.dma_ppa_list; + dma_meta_list = p.dma_meta_list; + + /* we could recover up until the line write pointer */ + r_ptr = line->cur_sec; + rec_round = 0; + +next_rq: + memset(rqd, 0, pblk_r_rq_size); + + rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); + if (!rq_ppas) + rq_ppas = pblk->min_write_pgs; + rq_len = rq_ppas * geo->sec_size; + + bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL); + if (IS_ERR(bio)) + return PTR_ERR(bio); + + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, REQ_OP_READ, 0); + + rqd->bio = bio; + rqd->opcode = NVM_OP_PREAD; + rqd->flags = pblk_set_read_mode(pblk); + rqd->meta_list = meta_list; + rqd->nr_ppas = rq_ppas; + rqd->ppa_list = ppa_list; + rqd->dma_ppa_list = dma_ppa_list; + rqd->dma_meta_list = dma_meta_list; + rqd->end_io = pblk_end_io_sync; + rqd->private = &wait; + + for (i = 0; i < rqd->nr_ppas; ) { + struct ppa_addr ppa; + int pos; + + w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); + ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); + pos = pblk_dev_ppa_to_pos(geo, ppa); + + while (test_bit(pos, line->blk_bitmap)) { + w_ptr += pblk->min_write_pgs; + ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); + pos = pblk_dev_ppa_to_pos(geo, ppa); + } + + for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) + rqd->ppa_list[i] = + addr_to_gen_ppa(pblk, w_ptr, line->id); + } + + ret = pblk_submit_io(pblk, rqd); + if (ret) { + pr_err("pblk: I/O submission failed: %d\n", ret); + return ret; + } + + if (!wait_for_completion_io_timeout(&wait, + msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { + pr_err("pblk: L2P recovery read timed out\n"); + } + reinit_completion(&wait); + + /* This should not happen since the read failed during normal recovery, + * but the media works funny sometimes... 
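+	 * If the read does succeed here, its OOB lbas are still used to
+	 * update the L2P table, as in the normal recovery path.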
+ */ + if (!rec_round++ && !rqd->error) { + rec_round = 0; + for (i = 0; i < rqd->nr_ppas; i++, r_ptr++) { + u64 lba = le64_to_cpu(meta_list[i].lba); + + if (lba == ADDR_EMPTY || lba > pblk->rl.nr_secs) + continue; + + pblk_update_map(pblk, lba, rqd->ppa_list[i]); + } + } + + /* Reached the end of the written line */ + if (rqd->error == NVM_RSP_ERR_EMPTYPAGE) { + int pad_secs, nr_error_bits, bit; + int ret; + + bit = find_first_bit((void *)&rqd->ppa_status, rqd->nr_ppas); + nr_error_bits = rqd->nr_ppas - bit; + + /* Roll back failed sectors */ + line->cur_sec -= nr_error_bits; + line->left_msecs += nr_error_bits; + bitmap_clear(line->map_bitmap, line->cur_sec, nr_error_bits); + + pad_secs = pblk_pad_distance(pblk); + if (pad_secs > line->left_msecs) + pad_secs = line->left_msecs; + + ret = pblk_recov_pad_oob(pblk, line, p, pad_secs); + if (ret) + pr_err("pblk: OOB padding failed (err:%d)\n", ret); + + ret = pblk_recov_read_oob(pblk, line, p, r_ptr); + if (ret) + pr_err("pblk: OOB read failed (err:%d)\n", ret); + + line->left_ssecs = line->left_msecs; + left_ppas = 0; + } + + left_ppas -= rq_ppas; + if (left_ppas > 0) + goto next_rq; + + return ret; +} + +static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, + struct pblk_recov_alloc p, int *done) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct ppa_addr *ppa_list; + struct pblk_sec_meta *meta_list; + struct nvm_rq *rqd; + struct bio *bio; + void *data; + dma_addr_t dma_ppa_list, dma_meta_list; + u64 paddr; + int rq_ppas, rq_len; + int i, j; + int ret = 0; + int left_ppas = pblk_calc_sec_in_line(pblk, line); + DECLARE_COMPLETION_ONSTACK(wait); + + ppa_list = p.ppa_list; + meta_list = p.meta_list; + rqd = p.rqd; + data = p.data; + dma_ppa_list = p.dma_ppa_list; + dma_meta_list = p.dma_meta_list; + + *done = 1; + +next_rq: + memset(rqd, 0, pblk_r_rq_size); + + rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); + if (!rq_ppas) + rq_ppas = pblk->min_write_pgs; + rq_len = rq_ppas * geo->sec_size; + + bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL); + if (IS_ERR(bio)) + return PTR_ERR(bio); + + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, REQ_OP_READ, 0); + + rqd->bio = bio; + rqd->opcode = NVM_OP_PREAD; + rqd->flags = pblk_set_read_mode(pblk); + rqd->meta_list = meta_list; + rqd->nr_ppas = rq_ppas; + rqd->ppa_list = ppa_list; + rqd->dma_ppa_list = dma_ppa_list; + rqd->dma_meta_list = dma_meta_list; + rqd->end_io = pblk_end_io_sync; + rqd->private = &wait; + + for (i = 0; i < rqd->nr_ppas; ) { + struct ppa_addr ppa; + int pos; + + paddr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); + ppa = addr_to_gen_ppa(pblk, paddr, line->id); + pos = pblk_dev_ppa_to_pos(geo, ppa); + + while (test_bit(pos, line->blk_bitmap)) { + paddr += pblk->min_write_pgs; + ppa = addr_to_gen_ppa(pblk, paddr, line->id); + pos = pblk_dev_ppa_to_pos(geo, ppa); + } + + for (j = 0; j < pblk->min_write_pgs; j++, i++, paddr++) + rqd->ppa_list[i] = + addr_to_gen_ppa(pblk, paddr, line->id); + } + + ret = pblk_submit_io(pblk, rqd); + if (ret) { + pr_err("pblk: I/O submission failed: %d\n", ret); + bio_put(bio); + return ret; + } + + if (!wait_for_completion_io_timeout(&wait, + msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { + pr_err("pblk: L2P recovery read timed out\n"); + } + reinit_completion(&wait); + + /* Reached the end of the written line */ + if (rqd->error) { + int nr_error_bits, bit; + + bit = find_first_bit((void *)&rqd->ppa_status, rqd->nr_ppas); + nr_error_bits = rqd->nr_ppas - bit; + + 
/* Roll back failed sectors */ + line->cur_sec -= nr_error_bits; + line->left_msecs += nr_error_bits; + line->left_ssecs = line->left_msecs; + bitmap_clear(line->map_bitmap, line->cur_sec, nr_error_bits); + + left_ppas = 0; + rqd->nr_ppas = bit; + + if (rqd->error != NVM_RSP_ERR_EMPTYPAGE) + *done = 0; + } + + for (i = 0; i < rqd->nr_ppas; i++) { + u64 lba = le64_to_cpu(meta_list[i].lba); + + if (lba == ADDR_EMPTY || lba > pblk->rl.nr_secs) + continue; + + pblk_update_map(pblk, lba, rqd->ppa_list[i]); + } + + left_ppas -= rq_ppas; + if (left_ppas > 0) + goto next_rq; + + return ret; +} + +/* Scan line for lbas on out of bound area */ +static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct nvm_rq *rqd; + struct ppa_addr *ppa_list; + struct pblk_sec_meta *meta_list; + struct pblk_recov_alloc p; + void *data; + dma_addr_t dma_ppa_list, dma_meta_list; + int done, ret = 0; + + rqd = pblk_alloc_rqd(pblk, READ); + if (IS_ERR(rqd)) + return PTR_ERR(rqd); + + meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list); + if (!meta_list) { + ret = -ENOMEM; + goto free_rqd; + } + + ppa_list = (void *)(meta_list) + pblk_dma_meta_size; + dma_ppa_list = dma_meta_list + pblk_dma_meta_size; + + data = kcalloc(pblk->max_write_pgs, geo->sec_size, GFP_KERNEL); + if (!data) { + ret = -ENOMEM; + goto free_meta_list; + } + + p.ppa_list = ppa_list; + p.meta_list = meta_list; + p.rqd = rqd; + p.data = data; + p.dma_ppa_list = dma_ppa_list; + p.dma_meta_list = dma_meta_list; + + ret = pblk_recov_scan_oob(pblk, line, p, &done); + if (ret) { + pr_err("pblk: could not recover L2P from OOB\n"); + goto out; + } + + if (!done) { + ret = pblk_recov_scan_all_oob(pblk, line, p); + if (ret) { + pr_err("pblk: could not recover L2P from OOB\n"); + goto out; + } + } + + if (pblk_line_is_full(line)) + pblk_line_recov_close(pblk, line); + +out: + kfree(data); +free_meta_list: + nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); +free_rqd: + pblk_free_rqd(pblk, rqd, READ); + + return ret; +} + +/* Insert lines ordered by sequence number (seq_num) on list */ +static void pblk_recov_line_add_ordered(struct list_head *head, + struct pblk_line *line) +{ + struct pblk_line *t = NULL; + + list_for_each_entry(t, head, list) + if (t->seq_nr > line->seq_nr) + break; + + __list_add(&line->list, t->list.prev, &t->list); +} + +struct pblk_line *pblk_recov_l2p(struct pblk *pblk) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line *line, *tline, *data_line = NULL; + struct line_smeta *smeta; + struct line_emeta *emeta; + int found_lines = 0, recovered_lines = 0, open_lines = 0; + int is_next = 0; + int meta_line; + int i, valid_uuid = 0; + LIST_HEAD(recov_list); + + /* TODO: Implement FTL snapshot */ + + /* Scan recovery - takes place when FTL snapshot fails */ + spin_lock(&l_mg->free_lock); + meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES); + set_bit(meta_line, &l_mg->meta_bitmap); + smeta = l_mg->sline_meta[meta_line].meta; + emeta = l_mg->eline_meta[meta_line].meta; + spin_unlock(&l_mg->free_lock); + + /* Order data lines using their sequence number */ + for (i = 0; i < l_mg->nr_lines; i++) { + u32 crc; + + line = &pblk->lines[i]; + + memset(smeta, 0, lm->smeta_len); + line->smeta = smeta; + line->lun_bitmap = ((void *)(smeta)) + + sizeof(struct line_smeta); + + /* 
Lines that cannot be read are assumed as not written here */ + if (pblk_line_read_smeta(pblk, line)) + continue; + + crc = pblk_calc_smeta_crc(pblk, smeta); + if (le32_to_cpu(smeta->crc) != crc) + continue; + + if (le32_to_cpu(smeta->header.identifier) != PBLK_MAGIC) + continue; + + if (le16_to_cpu(smeta->header.version) != 1) { + pr_err("pblk: found incompatible line version %u\n", + smeta->header.version); + return ERR_PTR(-EINVAL); + } + + /* The first valid instance uuid is used for initialization */ + if (!valid_uuid) { + memcpy(pblk->instance_uuid, smeta->header.uuid, 16); + valid_uuid = 1; + } + + if (memcmp(pblk->instance_uuid, smeta->header.uuid, 16)) { + pr_debug("pblk: ignore line %u due to uuid mismatch\n", + i); + continue; + } + + /* Update line metadata */ + spin_lock(&line->lock); + line->id = le32_to_cpu(line->smeta->header.id); + line->type = le16_to_cpu(line->smeta->header.type); + line->seq_nr = le64_to_cpu(line->smeta->seq_nr); + spin_unlock(&line->lock); + + /* Update general metadata */ + spin_lock(&l_mg->free_lock); + if (line->seq_nr >= l_mg->d_seq_nr) + l_mg->d_seq_nr = line->seq_nr + 1; + l_mg->nr_free_lines--; + spin_unlock(&l_mg->free_lock); + + if (pblk_line_recov_alloc(pblk, line)) + goto out; + + pblk_recov_line_add_ordered(&recov_list, line); + found_lines++; + pr_debug("pblk: recovering data line %d, seq:%llu\n", + line->id, smeta->seq_nr); + } + + if (!found_lines) { + pblk_setup_uuid(pblk); + + spin_lock(&l_mg->free_lock); + WARN_ON_ONCE(!test_and_clear_bit(meta_line, + &l_mg->meta_bitmap)); + spin_unlock(&l_mg->free_lock); + + goto out; + } + + /* Verify closed blocks and recover this portion of L2P table*/ + list_for_each_entry_safe(line, tline, &recov_list, list) { + int off, nr_bb; + + recovered_lines++; + /* Calculate where emeta starts based on the line bb */ + off = lm->sec_per_line - lm->emeta_sec; + nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); + off -= nr_bb * geo->sec_per_pl; + + memset(emeta, 0, lm->emeta_len); + line->emeta = emeta; + line->emeta_ssec = off; + + if (pblk_line_read_emeta(pblk, line)) { + pblk_recov_l2p_from_oob(pblk, line); + goto next; + } + + if (pblk_recov_l2p_from_emeta(pblk, line)) + pblk_recov_l2p_from_oob(pblk, line); + +next: + if (pblk_line_is_full(line)) { + struct list_head *move_list; + + spin_lock(&line->lock); + line->state = PBLK_LINESTATE_CLOSED; + move_list = pblk_line_gc_list(pblk, line); + spin_unlock(&line->lock); + + spin_lock(&l_mg->gc_lock); + list_move_tail(&line->list, move_list); + spin_unlock(&l_mg->gc_lock); + + mempool_free(line->map_bitmap, pblk->line_meta_pool); + line->map_bitmap = NULL; + line->smeta = NULL; + line->emeta = NULL; + } else { + if (open_lines > 1) + pr_err("pblk: failed to recover L2P\n"); + + open_lines++; + line->meta_line = meta_line; + data_line = line; + } + } + + spin_lock(&l_mg->free_lock); + if (!open_lines) { + WARN_ON_ONCE(!test_and_clear_bit(meta_line, + &l_mg->meta_bitmap)); + pblk_line_replace_data(pblk); + } else { + /* Allocate next line for preparation */ + l_mg->data_next = pblk_line_get(pblk); + if (l_mg->data_next) { + l_mg->data_next->seq_nr = l_mg->d_seq_nr++; + l_mg->data_next->type = PBLK_LINETYPE_DATA; + is_next = 1; + } + } + spin_unlock(&l_mg->free_lock); + + if (is_next) { + pblk_line_erase(pblk, l_mg->data_next); + pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next); + } + +out: + if (found_lines != recovered_lines) + pr_err("pblk: failed to recover all found lines %d/%d\n", + found_lines, recovered_lines); + + return data_line; +} + +/* + 
* Pad until smeta can be read on current data line + */ +void pblk_recov_pad(struct pblk *pblk) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line *line; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct nvm_rq *rqd; + struct pblk_recov_alloc p; + struct ppa_addr *ppa_list; + struct pblk_sec_meta *meta_list; + void *data; + dma_addr_t dma_ppa_list, dma_meta_list; + + spin_lock(&l_mg->free_lock); + line = l_mg->data_line; + spin_unlock(&l_mg->free_lock); + + rqd = pblk_alloc_rqd(pblk, READ); + if (IS_ERR(rqd)) + return; + + meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list); + if (!meta_list) + goto free_rqd; + + ppa_list = (void *)(meta_list) + pblk_dma_meta_size; + dma_ppa_list = dma_meta_list + pblk_dma_meta_size; + + data = kcalloc(pblk->max_write_pgs, geo->sec_size, GFP_KERNEL); + if (!data) + goto free_meta_list; + + p.ppa_list = ppa_list; + p.meta_list = meta_list; + p.rqd = rqd; + p.data = data; + p.dma_ppa_list = dma_ppa_list; + p.dma_meta_list = dma_meta_list; + + if (pblk_recov_pad_oob(pblk, line, p, line->left_msecs)) { + pr_err("pblk: Tear down padding failed\n"); + goto free_data; + } + + pblk_line_close(pblk, line); + +free_data: + kfree(data); +free_meta_list: + nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); +free_rqd: + pblk_free_rqd(pblk, rqd, READ); +} diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c new file mode 100644 index 000000000000..4042162ec9bc --- /dev/null +++ b/drivers/lightnvm/pblk-rl.c @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez + * Matias Bjorling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * pblk-rl.c - pblk's rate limiter for user I/O + * + */ + +#include "pblk.h" + +static void pblk_rl_kick_u_timer(struct pblk_rl *rl) +{ + mod_timer(&rl->u_timer, jiffies + msecs_to_jiffies(5000)); +} + +int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries) +{ + int rb_user_cnt = atomic_read(&rl->rb_user_cnt); + + return (!(rb_user_cnt + nr_entries > rl->rb_user_max)); +} + +int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries) +{ + int rb_gc_cnt = atomic_read(&rl->rb_gc_cnt); + int rb_user_active; + + /* If there is no user I/O let GC take over space on the write buffer */ + rb_user_active = READ_ONCE(rl->rb_user_active); + return (!(rb_gc_cnt + nr_entries > rl->rb_gc_max && rb_user_active)); +} + +void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries) +{ + atomic_add(nr_entries, &rl->rb_user_cnt); + + /* Release user I/O state. 
Protect from GC */ + smp_store_release(&rl->rb_user_active, 1); + pblk_rl_kick_u_timer(rl); +} + +void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries) +{ + atomic_add(nr_entries, &rl->rb_gc_cnt); +} + +void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc) +{ + atomic_sub(nr_user, &rl->rb_user_cnt); + atomic_sub(nr_gc, &rl->rb_gc_cnt); +} + +unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl) +{ + return atomic_read(&rl->free_blocks); +} + +/* + * We check for (i) the number of free blocks in the current LUN and (ii) the + * total number of free blocks in the pblk instance. This is to even out the + * number of free blocks on each LUN when GC kicks in. + * + * Only the total number of free blocks is used to configure the rate limiter. + */ +static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max) +{ + unsigned long free_blocks = pblk_rl_nr_free_blks(rl); + + if (free_blocks >= rl->high) { + rl->rb_user_max = max - rl->rb_gc_rsv; + rl->rb_gc_max = rl->rb_gc_rsv; + rl->rb_state = PBLK_RL_HIGH; + } else if (free_blocks < rl->high) { + int shift = rl->high_pw - rl->rb_windows_pw; + int user_windows = free_blocks >> shift; + int user_max = user_windows << PBLK_MAX_REQ_ADDRS_PW; + int gc_max; + + rl->rb_user_max = user_max; + gc_max = max - rl->rb_user_max; + rl->rb_gc_max = max(gc_max, rl->rb_gc_rsv); + + if (free_blocks > rl->low) + rl->rb_state = PBLK_RL_MID; + else + rl->rb_state = PBLK_RL_LOW; + } + + return rl->rb_state; +} + +void pblk_rl_set_gc_rsc(struct pblk_rl *rl, int rsv) +{ + rl->rb_gc_rsv = rl->rb_gc_max = rsv; +} + +void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) +{ + struct pblk *pblk = container_of(rl, struct pblk, rl); + int ret; + + atomic_add(line->blk_in_line, &rl->free_blocks); + /* Rates will not change that often - no need to lock update */ + ret = pblk_rl_update_rates(rl, rl->rb_budget); + + if (ret == (PBLK_RL_MID | PBLK_RL_LOW)) + pblk_gc_should_start(pblk); + else + pblk_gc_should_stop(pblk); +} + +void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line) +{ + struct pblk *pblk = container_of(rl, struct pblk, rl); + int ret; + + atomic_sub(line->blk_in_line, &rl->free_blocks); + + /* Rates will not change that often - no need to lock update */ + ret = pblk_rl_update_rates(rl, rl->rb_budget); + if (ret == (PBLK_RL_MID | PBLK_RL_LOW)) + pblk_gc_should_start(pblk); + else + pblk_gc_should_stop(pblk); +} + +int pblk_rl_gc_thrs(struct pblk_rl *rl) +{ + return rl->high; +} + +int pblk_rl_sysfs_rate_show(struct pblk_rl *rl) +{ + return rl->rb_user_max; +} + +static void pblk_rl_u_timer(unsigned long data) +{ + struct pblk_rl *rl = (struct pblk_rl *)data; + + /* Release user I/O state. 
Protect from GC */ + smp_store_release(&rl->rb_user_active, 0); +} + +void pblk_rl_free(struct pblk_rl *rl) +{ + del_timer(&rl->u_timer); +} + +void pblk_rl_init(struct pblk_rl *rl, int budget) +{ + unsigned int rb_windows; + + rl->high = rl->total_blocks / PBLK_USER_HIGH_THRS; + rl->low = rl->total_blocks / PBLK_USER_LOW_THRS; + rl->high_pw = get_count_order(rl->high); + + /* This will always be a power-of-2 */ + rb_windows = budget / PBLK_MAX_REQ_ADDRS; + rl->rb_windows_pw = get_count_order(rb_windows) + 1; + + /* To start with, all buffer is available to user I/O writers */ + rl->rb_budget = budget; + rl->rb_user_max = budget; + atomic_set(&rl->rb_user_cnt, 0); + rl->rb_gc_max = 0; + rl->rb_state = PBLK_RL_HIGH; + atomic_set(&rl->rb_gc_cnt, 0); + + setup_timer(&rl->u_timer, pblk_rl_u_timer, (unsigned long)rl); + rl->rb_user_active = 0; +} diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c new file mode 100644 index 000000000000..f0af1d1ceeff --- /dev/null +++ b/drivers/lightnvm/pblk-sysfs.c @@ -0,0 +1,507 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez + * Matias Bjorling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Implementation of a physical block-device target for Open-channel SSDs. + * + * pblk-sysfs.c - pblk's sysfs + * + */ + +#include "pblk.h" + +static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_lun *rlun; + ssize_t sz = 0; + int i; + + for (i = 0; i < geo->nr_luns; i++) { + int active = 1; + + rlun = &pblk->luns[i]; + if (!down_trylock(&rlun->wr_sem)) { + active = 0; + up(&rlun->wr_sem); + } + sz += snprintf(page + sz, PAGE_SIZE - sz, + "pblk: pos:%d, ch:%d, lun:%d - %d\n", + i, + rlun->bppa.g.ch, + rlun->bppa.g.lun, + active); + } + + return sz; +} + +static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + int free_blocks, total_blocks; + int rb_user_max, rb_user_cnt; + int rb_gc_max, rb_gc_rsv, rb_gc_cnt, rb_budget, rb_state; + + free_blocks = atomic_read(&pblk->rl.free_blocks); + rb_user_max = pblk->rl.rb_user_max; + rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt); + rb_gc_max = pblk->rl.rb_gc_max; + rb_gc_rsv = pblk->rl.rb_gc_rsv; + rb_gc_cnt = atomic_read(&pblk->rl.rb_gc_cnt); + rb_budget = pblk->rl.rb_budget; + rb_state = pblk->rl.rb_state; + + total_blocks = geo->blks_per_lun * geo->nr_luns; + + return snprintf(page, PAGE_SIZE, + "u:%u/%u,gc:%u/%u/%u(%u/%u)(stop:<%u,full:>%u,free:%d/%d)-%d\n", + rb_user_cnt, + rb_user_max, + rb_gc_cnt, + rb_gc_max, + rb_gc_rsv, + rb_state, + rb_budget, + pblk->rl.low, + pblk->rl.high, + free_blocks, + total_blocks, + READ_ONCE(pblk->rl.rb_user_active)); +} + +static ssize_t pblk_sysfs_gc_state_show(struct pblk *pblk, char *page) +{ + int gc_enabled, gc_active; + + pblk_gc_sysfs_state_show(pblk, &gc_enabled, &gc_active); + return snprintf(page, PAGE_SIZE, "gc_enabled=%d, gc_active=%d\n", + gc_enabled, gc_active); +} + +static ssize_t pblk_sysfs_stats(struct pblk *pblk, char *page) +{ + 
ssize_t sz; + + sz = snprintf(page, PAGE_SIZE, + "read_failed=%lu, read_high_ecc=%lu, read_empty=%lu, read_failed_gc=%lu, write_failed=%lu, erase_failed=%lu\n", + atomic_long_read(&pblk->read_failed), + atomic_long_read(&pblk->read_high_ecc), + atomic_long_read(&pblk->read_empty), + atomic_long_read(&pblk->read_failed_gc), + atomic_long_read(&pblk->write_failed), + atomic_long_read(&pblk->erase_failed)); + + return sz; +} + +static ssize_t pblk_sysfs_write_buffer(struct pblk *pblk, char *page) +{ + return pblk_rb_sysfs(&pblk->rwb, page); +} + +static ssize_t pblk_sysfs_ppaf(struct pblk *pblk, char *page) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + ssize_t sz = 0; + + sz = snprintf(page, PAGE_SIZE - sz, + "g:(b:%d)blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n", + pblk->ppaf_bitsize, + pblk->ppaf.blk_offset, geo->ppaf.blk_len, + pblk->ppaf.pg_offset, geo->ppaf.pg_len, + pblk->ppaf.lun_offset, geo->ppaf.lun_len, + pblk->ppaf.ch_offset, geo->ppaf.ch_len, + pblk->ppaf.pln_offset, geo->ppaf.pln_len, + pblk->ppaf.sec_offset, geo->ppaf.sect_len); + + sz += snprintf(page + sz, PAGE_SIZE - sz, + "d:blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n", + geo->ppaf.blk_offset, geo->ppaf.blk_len, + geo->ppaf.pg_offset, geo->ppaf.pg_len, + geo->ppaf.lun_offset, geo->ppaf.lun_len, + geo->ppaf.ch_offset, geo->ppaf.ch_len, + geo->ppaf.pln_offset, geo->ppaf.pln_len, + geo->ppaf.sect_offset, geo->ppaf.sect_len); + + return sz; +} + +static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line *line; + ssize_t sz = 0; + int nr_free_lines; + int cur_data, cur_log; + int free_line_cnt = 0, closed_line_cnt = 0; + int d_line_cnt = 0, l_line_cnt = 0; + int gc_full = 0, gc_high = 0, gc_mid = 0, gc_low = 0, gc_empty = 0; + int free = 0, bad = 0, cor = 0; + int msecs = 0, ssecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0; + int map_weight = 0, meta_weight = 0; + + spin_lock(&l_mg->free_lock); + cur_data = (l_mg->data_line) ? l_mg->data_line->id : -1; + cur_log = (l_mg->log_line) ? 
l_mg->log_line->id : -1; + nr_free_lines = l_mg->nr_free_lines; + + list_for_each_entry(line, &l_mg->free_list, list) + free_line_cnt++; + spin_unlock(&l_mg->free_lock); + + spin_lock(&l_mg->gc_lock); + list_for_each_entry(line, &l_mg->gc_full_list, list) { + if (line->type == PBLK_LINETYPE_DATA) + d_line_cnt++; + else if (line->type == PBLK_LINETYPE_LOG) + l_line_cnt++; + closed_line_cnt++; + gc_full++; + } + + list_for_each_entry(line, &l_mg->gc_high_list, list) { + if (line->type == PBLK_LINETYPE_DATA) + d_line_cnt++; + else if (line->type == PBLK_LINETYPE_LOG) + l_line_cnt++; + closed_line_cnt++; + gc_high++; + } + + list_for_each_entry(line, &l_mg->gc_mid_list, list) { + if (line->type == PBLK_LINETYPE_DATA) + d_line_cnt++; + else if (line->type == PBLK_LINETYPE_LOG) + l_line_cnt++; + closed_line_cnt++; + gc_mid++; + } + + list_for_each_entry(line, &l_mg->gc_low_list, list) { + if (line->type == PBLK_LINETYPE_DATA) + d_line_cnt++; + else if (line->type == PBLK_LINETYPE_LOG) + l_line_cnt++; + closed_line_cnt++; + gc_low++; + } + + list_for_each_entry(line, &l_mg->gc_empty_list, list) { + if (line->type == PBLK_LINETYPE_DATA) + d_line_cnt++; + else if (line->type == PBLK_LINETYPE_LOG) + l_line_cnt++; + closed_line_cnt++; + gc_empty++; + } + + list_for_each_entry(line, &l_mg->free_list, list) + free++; + list_for_each_entry(line, &l_mg->bad_list, list) + bad++; + list_for_each_entry(line, &l_mg->corrupt_list, list) + cor++; + spin_unlock(&l_mg->gc_lock); + + spin_lock(&l_mg->free_lock); + if (l_mg->data_line) { + cur_sec = l_mg->data_line->cur_sec; + msecs = l_mg->data_line->left_msecs; + ssecs = l_mg->data_line->left_ssecs; + vsc = l_mg->data_line->vsc; + sec_in_line = l_mg->data_line->sec_in_line; + meta_weight = bitmap_weight(&l_mg->meta_bitmap, + PBLK_DATA_LINES); + map_weight = bitmap_weight(l_mg->data_line->map_bitmap, + lm->sec_per_line); + } + spin_unlock(&l_mg->free_lock); + + if (nr_free_lines != free_line_cnt) + pr_err("pblk: corrupted free line list\n"); + + sz = snprintf(page, PAGE_SIZE - sz, + "line: nluns:%d, nblks:%d, nsecs:%d\n", + geo->nr_luns, lm->blk_per_line, lm->sec_per_line); + + sz += snprintf(page + sz, PAGE_SIZE - sz, + "lines:d:%d,l:%d-f:%d(%d),b:%d,co:%d,c:%d(d:%d,l:%d)t:%d\n", + cur_data, cur_log, + free, nr_free_lines, bad, cor, + closed_line_cnt, + d_line_cnt, l_line_cnt, + l_mg->nr_lines); + + sz += snprintf(page + sz, PAGE_SIZE - sz, + "GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, queue:%d\n", + gc_full, gc_high, gc_mid, gc_low, gc_empty, + atomic_read(&pblk->gc.inflight_gc)); + + sz += snprintf(page + sz, PAGE_SIZE - sz, + "data (%d) cur:%d, left:%d/%d, vsc:%d, s:%d, map:%d/%d (%d)\n", + cur_data, cur_sec, msecs, ssecs, vsc, sec_in_line, + map_weight, lm->sec_per_line, meta_weight); + + return sz; +} + +static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + ssize_t sz = 0; + + sz = snprintf(page, PAGE_SIZE - sz, + "smeta - len:%d, secs:%d\n", + lm->smeta_len, lm->smeta_sec); + sz += snprintf(page + sz, PAGE_SIZE - sz, + "emeta - len:%d, sec:%d, bb_start:%d\n", + lm->emeta_len, lm->emeta_sec, + lm->emeta_bb); + sz += snprintf(page + sz, PAGE_SIZE - sz, + "bitmap lengths: sec:%d, blk:%d, lun:%d\n", + lm->sec_bitmap_len, + lm->blk_bitmap_len, + lm->lun_bitmap_len); + sz += snprintf(page + sz, PAGE_SIZE - sz, + "blk_line:%d, sec_line:%d, sec_blk:%d\n", + lm->blk_per_line, + lm->sec_per_line, + geo->sec_per_blk); + + return 
sz; +} + +#ifdef CONFIG_NVM_DEBUG +static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page) +{ + return snprintf(page, PAGE_SIZE, + "%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\n", + atomic_long_read(&pblk->inflight_writes), + atomic_long_read(&pblk->inflight_reads), + atomic_long_read(&pblk->req_writes), + atomic_long_read(&pblk->nr_flush), + atomic_long_read(&pblk->padded_writes), + atomic_long_read(&pblk->padded_wb), + atomic_long_read(&pblk->sub_writes), + atomic_long_read(&pblk->sync_writes), + atomic_long_read(&pblk->compl_writes), + atomic_long_read(&pblk->recov_writes), + atomic_long_read(&pblk->recov_gc_writes), + atomic_long_read(&pblk->recov_gc_reads), + atomic_long_read(&pblk->sync_reads)); +} +#endif + +static ssize_t pblk_sysfs_rate_store(struct pblk *pblk, const char *page, + size_t len) +{ + struct pblk_gc *gc = &pblk->gc; + size_t c_len; + int value; + + c_len = strcspn(page, "\n"); + if (c_len >= len) + return -EINVAL; + + if (kstrtouint(page, 0, &value)) + return -EINVAL; + + spin_lock(&gc->lock); + pblk_rl_set_gc_rsc(&pblk->rl, value); + spin_unlock(&gc->lock); + + return len; +} + +static ssize_t pblk_sysfs_gc_force(struct pblk *pblk, const char *page, + size_t len) +{ + size_t c_len; + int force; + + c_len = strcspn(page, "\n"); + if (c_len >= len) + return -EINVAL; + + if (kstrtouint(page, 0, &force)) + return -EINVAL; + + if (force < 0 || force > 1) + return -EINVAL; + + pblk_gc_sysfs_force(pblk, force); + + return len; +} + +static struct attribute sys_write_luns = { + .name = "write_luns", + .mode = 0444, +}; + +static struct attribute sys_rate_limiter_attr = { + .name = "rate_limiter", + .mode = 0444, +}; + +static struct attribute sys_gc_state = { + .name = "gc_state", + .mode = 0444, +}; + +static struct attribute sys_errors_attr = { + .name = "errors", + .mode = 0444, +}; + +static struct attribute sys_rb_attr = { + .name = "write_buffer", + .mode = 0444, +}; + +static struct attribute sys_stats_ppaf_attr = { + .name = "ppa_format", + .mode = 0444, +}; + +static struct attribute sys_lines_attr = { + .name = "lines", + .mode = 0444, +}; + +static struct attribute sys_lines_info_attr = { + .name = "lines_info", + .mode = 0444, +}; + +static struct attribute sys_gc_force = { + .name = "gc_force", + .mode = 0200, +}; + +static struct attribute sys_gc_rl_max = { + .name = "gc_rl_max", + .mode = 0200, +}; + +#ifdef CONFIG_NVM_DEBUG +static struct attribute sys_stats_debug_attr = { + .name = "stats", + .mode = 0444, +}; +#endif + +static struct attribute *pblk_attrs[] = { + &sys_write_luns, + &sys_rate_limiter_attr, + &sys_errors_attr, + &sys_gc_state, + &sys_gc_force, + &sys_gc_rl_max, + &sys_rb_attr, + &sys_stats_ppaf_attr, + &sys_lines_attr, + &sys_lines_info_attr, +#ifdef CONFIG_NVM_DEBUG + &sys_stats_debug_attr, +#endif + NULL, +}; + +static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct pblk *pblk = container_of(kobj, struct pblk, kobj); + + if (strcmp(attr->name, "rate_limiter") == 0) + return pblk_sysfs_rate_limiter(pblk, buf); + else if (strcmp(attr->name, "write_luns") == 0) + return pblk_sysfs_luns_show(pblk, buf); + else if (strcmp(attr->name, "gc_state") == 0) + return pblk_sysfs_gc_state_show(pblk, buf); + else if (strcmp(attr->name, "errors") == 0) + return pblk_sysfs_stats(pblk, buf); + else if (strcmp(attr->name, "write_buffer") == 0) + return pblk_sysfs_write_buffer(pblk, buf); + else if (strcmp(attr->name, "ppa_format") == 0) + return pblk_sysfs_ppaf(pblk, buf); + else 
if (strcmp(attr->name, "lines") == 0) + return pblk_sysfs_lines(pblk, buf); + else if (strcmp(attr->name, "lines_info") == 0) + return pblk_sysfs_lines_info(pblk, buf); +#ifdef CONFIG_NVM_DEBUG + else if (strcmp(attr->name, "stats") == 0) + return pblk_sysfs_stats_debug(pblk, buf); +#endif + return 0; +} + +static ssize_t pblk_sysfs_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct pblk *pblk = container_of(kobj, struct pblk, kobj); + + if (strcmp(attr->name, "gc_rl_max") == 0) + return pblk_sysfs_rate_store(pblk, buf, len); + else if (strcmp(attr->name, "gc_force") == 0) + return pblk_sysfs_gc_force(pblk, buf, len); + + return 0; +} + +static const struct sysfs_ops pblk_sysfs_ops = { + .show = pblk_sysfs_show, + .store = pblk_sysfs_store, +}; + +static struct kobj_type pblk_ktype = { + .sysfs_ops = &pblk_sysfs_ops, + .default_attrs = pblk_attrs, +}; + +int pblk_sysfs_init(struct gendisk *tdisk) +{ + struct pblk *pblk = tdisk->private_data; + struct device *parent_dev = disk_to_dev(pblk->disk); + int ret; + + ret = kobject_init_and_add(&pblk->kobj, &pblk_ktype, + kobject_get(&parent_dev->kobj), + "%s", "pblk"); + if (ret) { + pr_err("pblk: could not register %s/pblk\n", + tdisk->disk_name); + return ret; + } + + kobject_uevent(&pblk->kobj, KOBJ_ADD); + return 0; +} + +void pblk_sysfs_exit(struct gendisk *tdisk) +{ + struct pblk *pblk = tdisk->private_data; + + kobject_uevent(&pblk->kobj, KOBJ_REMOVE); + kobject_del(&pblk->kobj); + kobject_put(&pblk->kobj); +} diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c new file mode 100644 index 000000000000..ee57db993cd1 --- /dev/null +++ b/drivers/lightnvm/pblk-write.c @@ -0,0 +1,411 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez + * Matias Bjorling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * pblk-write.c - pblk's write path from write buffer to media + */ + +#include "pblk.h" + +static void pblk_sync_line(struct pblk *pblk, struct pblk_line *line) +{ +#ifdef CONFIG_NVM_DEBUG + atomic_long_inc(&pblk->sync_writes); +#endif + + /* Counter protected by rb sync lock */ + line->left_ssecs--; + if (!line->left_ssecs) + pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws); +} + +static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, + struct pblk_c_ctx *c_ctx) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct bio *original_bio; + unsigned long ret; + int i; + + for (i = 0; i < c_ctx->nr_valid; i++) { + struct pblk_w_ctx *w_ctx; + struct ppa_addr p; + struct pblk_line *line; + + w_ctx = pblk_rb_w_ctx(&pblk->rwb, c_ctx->sentry + i); + + p = rqd->ppa_list[i]; + line = &pblk->lines[pblk_dev_ppa_to_line(p)]; + pblk_sync_line(pblk, line); + + while ((original_bio = bio_list_pop(&w_ctx->bios))) + bio_endio(original_bio); + } + +#ifdef CONFIG_NVM_DEBUG + atomic_long_add(c_ctx->nr_valid, &pblk->compl_writes); +#endif + + ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid); + + if (rqd->meta_list) + nvm_dev_dma_free(dev->parent, rqd->meta_list, + rqd->dma_meta_list); + + bio_put(rqd->bio); + pblk_free_rqd(pblk, rqd, WRITE); + + return ret; +} + +static unsigned long pblk_end_queued_w_bio(struct pblk *pblk, + struct nvm_rq *rqd, + struct pblk_c_ctx *c_ctx) +{ + list_del(&c_ctx->list); + return pblk_end_w_bio(pblk, rqd, c_ctx); +} + +static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd, + struct pblk_c_ctx *c_ctx) +{ + struct pblk_c_ctx *c, *r; + unsigned long flags; + unsigned long pos; + +#ifdef CONFIG_NVM_DEBUG + atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes); +#endif + + pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap); + + pos = pblk_rb_sync_init(&pblk->rwb, &flags); + if (pos == c_ctx->sentry) { + pos = pblk_end_w_bio(pblk, rqd, c_ctx); + +retry: + list_for_each_entry_safe(c, r, &pblk->compl_list, list) { + rqd = nvm_rq_from_c_ctx(c); + if (c->sentry == pos) { + pos = pblk_end_queued_w_bio(pblk, rqd, c); + goto retry; + } + } + } else { + WARN_ON(nvm_rq_from_c_ctx(c_ctx) != rqd); + list_add_tail(&c_ctx->list, &pblk->compl_list); + } + pblk_rb_sync_end(&pblk->rwb, &flags); +} + +/* When a write fails, we are not sure whether the block has grown bad or a page + * range is more susceptible to write errors. If a high number of pages fail, we + * assume that the block is bad and we mark it accordingly. In all cases, we + * remap and resubmit the failed entries as fast as possible; if a flush is + * waiting on a completion, the whole stack would stall otherwise. 
+ */ +static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd) +{ + void *comp_bits = &rqd->ppa_status; + struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); + struct pblk_rec_ctx *recovery; + struct ppa_addr *ppa_list = rqd->ppa_list; + int nr_ppas = rqd->nr_ppas; + unsigned int c_entries; + int bit, ret; + + if (unlikely(nr_ppas == 1)) + ppa_list = &rqd->ppa_addr; + + recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC); + if (!recovery) { + pr_err("pblk: could not allocate recovery context\n"); + return; + } + INIT_LIST_HEAD(&recovery->failed); + + bit = -1; + while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) { + struct pblk_rb_entry *entry; + struct ppa_addr ppa; + + /* Logic error */ + if (bit > c_ctx->nr_valid) { + WARN_ON_ONCE("pblk: corrupted write request\n"); + goto out; + } + + ppa = ppa_list[bit]; + entry = pblk_rb_sync_scan_entry(&pblk->rwb, &ppa); + if (!entry) { + pr_err("pblk: could not scan entry on write failure\n"); + goto out; + } + + /* The list is filled first and emptied afterwards. No need for + * protecting it with a lock + */ + list_add_tail(&entry->index, &recovery->failed); + } + + c_entries = find_first_bit(comp_bits, nr_ppas); + ret = pblk_recov_setup_rq(pblk, c_ctx, recovery, comp_bits, c_entries); + if (ret) { + pr_err("pblk: could not recover from write failure\n"); + goto out; + } + + INIT_WORK(&recovery->ws_rec, pblk_submit_rec); + queue_work(pblk->kw_wq, &recovery->ws_rec); + +out: + pblk_complete_write(pblk, rqd, c_ctx); +} + +static void pblk_end_io_write(struct nvm_rq *rqd) +{ + struct pblk *pblk = rqd->private; + struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); + + if (rqd->error) { + pblk_log_write_err(pblk, rqd); + return pblk_end_w_fail(pblk, rqd); + } +#ifdef CONFIG_NVM_DEBUG + else + WARN_ONCE(rqd->bio->bi_error, "pblk: corrupted write error\n"); +#endif + + pblk_complete_write(pblk, rqd, c_ctx); +} + +static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd, + unsigned int nr_secs) +{ + struct nvm_tgt_dev *dev = pblk->dev; + + /* Setup write request */ + rqd->opcode = NVM_OP_PWRITE; + rqd->nr_ppas = nr_secs; + rqd->flags = pblk_set_progr_mode(pblk, WRITE); + rqd->private = pblk; + rqd->end_io = pblk_end_io_write; + + rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, + &rqd->dma_meta_list); + if (!rqd->meta_list) + return -ENOMEM; + + if (unlikely(nr_secs == 1)) + return 0; + + rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size; + rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size; + + return 0; +} + +static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd, + struct pblk_c_ctx *c_ctx) +{ + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line *e_line = pblk_line_get_data_next(pblk); + struct ppa_addr erase_ppa; + unsigned int valid = c_ctx->nr_valid; + unsigned int padded = c_ctx->nr_padded; + unsigned int nr_secs = valid + padded; + unsigned long *lun_bitmap; + int ret = 0; + + lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL); + if (!lun_bitmap) { + ret = -ENOMEM; + goto out; + } + c_ctx->lun_bitmap = lun_bitmap; + + ret = pblk_alloc_w_rq(pblk, rqd, nr_secs); + if (ret) { + kfree(lun_bitmap); + goto out; + } + + ppa_set_empty(&erase_ppa); + if (likely(!e_line || !e_line->left_eblks)) + pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, valid, 0); + else + pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, + valid, &erase_ppa); + +out: + if (unlikely(e_line && !ppa_empty(erase_ppa))) { + if (pblk_blk_erase_async(pblk, erase_ppa)) { + struct nvm_tgt_dev *dev = pblk->dev; + 
struct nvm_geo *geo = &dev->geo; + int bit; + + e_line->left_eblks++; + bit = erase_ppa.g.lun * geo->nr_chnls + erase_ppa.g.ch; + WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap)); + up(&pblk->erase_sem); + } + } + + return ret; +} + +int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, + struct pblk_c_ctx *c_ctx) +{ + struct pblk_line_meta *lm = &pblk->lm; + unsigned long *lun_bitmap; + int ret; + + lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL); + if (!lun_bitmap) + return -ENOMEM; + + c_ctx->lun_bitmap = lun_bitmap; + + ret = pblk_alloc_w_rq(pblk, rqd, rqd->nr_ppas); + if (ret) + return ret; + + pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0); + + rqd->ppa_status = (u64)0; + rqd->flags = pblk_set_progr_mode(pblk, WRITE); + + return ret; +} + +static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, + unsigned int secs_to_flush) +{ + int secs_to_sync; + + secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush); + +#ifdef CONFIG_NVM_DEBUG + if ((!secs_to_sync && secs_to_flush) + || (secs_to_sync < 0) + || (secs_to_sync > secs_avail && !secs_to_flush)) { + pr_err("pblk: bad sector calculation (a:%d,s:%d,f:%d)\n", + secs_avail, secs_to_sync, secs_to_flush); + } +#endif + + return secs_to_sync; +} + +static int pblk_submit_write(struct pblk *pblk) +{ + struct bio *bio; + struct nvm_rq *rqd; + struct pblk_c_ctx *c_ctx; + unsigned int pgs_read; + unsigned int secs_avail, secs_to_sync, secs_to_com; + unsigned int secs_to_flush; + unsigned long pos; + int err; + + /* If there are no sectors in the cache, flushes (bios without data) + * will be cleared on the cache threads + */ + secs_avail = pblk_rb_read_count(&pblk->rwb); + if (!secs_avail) + return 1; + + secs_to_flush = pblk_rb_sync_point_count(&pblk->rwb); + if (!secs_to_flush && secs_avail < pblk->min_write_pgs) + return 1; + + rqd = pblk_alloc_rqd(pblk, WRITE); + if (IS_ERR(rqd)) { + pr_err("pblk: cannot allocate write req.\n"); + return 1; + } + c_ctx = nvm_rq_to_pdu(rqd); + + bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs); + if (!bio) { + pr_err("pblk: cannot allocate write bio\n"); + goto fail_free_rqd; + } + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + rqd->bio = bio; + + secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush); + if (secs_to_sync > pblk->max_write_pgs) { + pr_err("pblk: bad buffer sync calculation\n"); + goto fail_put_bio; + } + + secs_to_com = (secs_to_sync > secs_avail) ? 
secs_avail : secs_to_sync; + pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); + + pgs_read = pblk_rb_read_to_bio(&pblk->rwb, bio, c_ctx, pos, + secs_to_sync, secs_avail); + if (!pgs_read) { + pr_err("pblk: corrupted write bio\n"); + goto fail_put_bio; + } + + if (c_ctx->nr_padded) + if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, c_ctx->nr_padded)) + goto fail_put_bio; + + /* Assign lbas to ppas and populate request structure */ + err = pblk_setup_w_rq(pblk, rqd, c_ctx); + if (err) { + pr_err("pblk: could not setup write request\n"); + goto fail_free_bio; + } + + err = pblk_submit_io(pblk, rqd); + if (err) { + pr_err("pblk: I/O submission failed: %d\n", err); + goto fail_free_bio; + } + +#ifdef CONFIG_NVM_DEBUG + atomic_long_add(secs_to_sync, &pblk->sub_writes); +#endif + + return 0; + +fail_free_bio: + if (c_ctx->nr_padded) + pblk_bio_free_pages(pblk, bio, secs_to_sync, c_ctx->nr_padded); +fail_put_bio: + bio_put(bio); +fail_free_rqd: + pblk_free_rqd(pblk, rqd, WRITE); + + return 1; +} + +int pblk_write_ts(void *data) +{ + struct pblk *pblk = data; + + while (!kthread_should_stop()) { + if (!pblk_submit_write(pblk)) + continue; + set_current_state(TASK_INTERRUPTIBLE); + io_schedule(); + } + + return 0; +} diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h new file mode 100644 index 000000000000..c82120ce3be5 --- /dev/null +++ b/drivers/lightnvm/pblk.h @@ -0,0 +1,1121 @@ +/* + * Copyright (C) 2015 IT University of Copenhagen (rrpc.h) + * Copyright (C) 2016 CNEX Labs + * Initial release: Matias Bjorling + * Write buffering: Javier Gonzalez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Implementation of a Physical Block-device target for Open-channel SSDs. + * + */ + +#ifndef PBLK_H_ +#define PBLK_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* Run only GC if less than 1/X blocks are free */ +#define GC_LIMIT_INVERSE 5 +#define GC_TIME_MSECS 1000 + +#define PBLK_SECTOR (512) +#define PBLK_EXPOSED_PAGE_SIZE (4096) +#define PBLK_MAX_REQ_ADDRS (64) +#define PBLK_MAX_REQ_ADDRS_PW (6) + +#define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16) + +#define PBLK_COMMAND_TIMEOUT_MS 30000 + +/* Max 512 LUNs per device */ +#define PBLK_MAX_LUNS_BITMAP (4) + +#define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR) + +#define pblk_for_each_lun(pblk, rlun, i) \ + for ((i) = 0, rlun = &(pblk)->luns[0]; \ + (i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)]) + +#define ERASE 2 /* READ = 0, WRITE = 1 */ + +enum { + /* IO Types */ + PBLK_IOTYPE_USER = 1 << 0, + PBLK_IOTYPE_GC = 1 << 1, + + /* Write buffer flags */ + PBLK_FLUSH_ENTRY = 1 << 2, + PBLK_WRITTEN_DATA = 1 << 3, + PBLK_SUBMITTED_ENTRY = 1 << 4, + PBLK_WRITABLE_ENTRY = 1 << 5, +}; + +enum { + PBLK_BLK_ST_OPEN = 0x1, + PBLK_BLK_ST_CLOSED = 0x2, +}; + +/* The number of GC lists and the rate-limiter states go together. This way the + * rate-limiter can dictate how much GC is needed based on resource utilization. 
+ */ +#define PBLK_NR_GC_LISTS 3 +#define PBLK_MAX_GC_JOBS 32 + +enum { + PBLK_RL_HIGH = 1, + PBLK_RL_MID = 2, + PBLK_RL_LOW = 3, +}; + +struct pblk_sec_meta { + u64 reserved; + __le64 lba; +}; + +#define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS) + +/* write completion context */ +struct pblk_c_ctx { + struct list_head list; /* Head for out-of-order completion */ + + unsigned long *lun_bitmap; /* Luns used on current request */ + unsigned int sentry; + unsigned int nr_valid; + unsigned int nr_padded; +}; + +/* Read context */ +struct pblk_r_ctx { + struct bio *orig_bio; +}; + +/* Recovery context */ +struct pblk_rec_ctx { + struct pblk *pblk; + struct nvm_rq *rqd; + struct list_head failed; + struct work_struct ws_rec; +}; + +/* Write context */ +struct pblk_w_ctx { + struct bio_list bios; /* Original bios - used for completion + * in REQ_FUA, REQ_FLUSH case + */ + sector_t lba; /* Logic addr. associated with entry */ + struct ppa_addr ppa; /* Physic addr. associated with entry */ + int flags; /* Write context flags */ +}; + +struct pblk_rb_entry { + struct ppa_addr cacheline; /* Cacheline for this entry */ + void *data; /* Pointer to data on this entry */ + struct pblk_w_ctx w_ctx; /* Context for this entry */ + struct list_head index; /* List head to enable indexes */ +}; + +#define EMPTY_ENTRY (~0U) + +struct pblk_rb_pages { + struct page *pages; + int order; + struct list_head list; +}; + +struct pblk_rb { + struct pblk_rb_entry *entries; /* Ring buffer entries */ + unsigned int mem; /* Write offset - points to next + * writable entry in memory + */ + unsigned int subm; /* Read offset - points to last entry + * that has been submitted to the media + * to be persisted + */ + unsigned int sync; /* Synced - backpointer that signals + * the last submitted entry that has + * been successfully persisted to media + */ + unsigned int sync_point; /* Sync point - last entry that must be + * flushed to the media. Used with + * REQ_FLUSH and REQ_FUA + */ + unsigned int l2p_update; /* l2p update point - next entry for + * which l2p mapping will be updated to + * contain a device ppa address (instead + * of a cacheline + */ + unsigned int nr_entries; /* Number of entries in write buffer - + * must be a power of two + */ + unsigned int seg_size; /* Size of the data segments being + * stored on each entry. Typically this + * will be 4KB + */ + + struct list_head pages; /* List of data pages */ + + spinlock_t w_lock; /* Write lock */ + spinlock_t s_lock; /* Sync lock */ + +#ifdef CONFIG_NVM_DEBUG + atomic_t inflight_sync_point; /* Not served REQ_FLUSH | REQ_FUA */ +#endif +}; + +#define PBLK_RECOVERY_SECTORS 16 + +struct pblk_lun { + struct ppa_addr bppa; + + u8 *bb_list; /* Bad block list for LUN. Only used on + * bring up. Bad blocks are managed + * within lines on run-time. 
+ */ + + struct semaphore wr_sem; +}; + +struct pblk_gc_rq { + struct pblk_line *line; + void *data; + u64 *lba_list; + int nr_secs; + int secs_to_gc; + struct list_head list; +}; + +struct pblk_gc { + int gc_active; + int gc_enabled; + int gc_forced; + int gc_jobs_active; + atomic_t inflight_gc; + + struct task_struct *gc_ts; + struct task_struct *gc_writer_ts; + struct workqueue_struct *gc_reader_wq; + struct timer_list gc_timer; + + int w_entries; + struct list_head w_list; + + spinlock_t lock; + spinlock_t w_lock; +}; + +struct pblk_rl { + unsigned int high; /* Upper threshold for rate limiter (free run - + * user I/O rate limiter + */ + unsigned int low; /* Lower threshold for rate limiter (user I/O + * rate limiter - stall) + */ + unsigned int high_pw; /* High rounded up as a power of 2 */ + +#define PBLK_USER_HIGH_THRS 2 /* Begin write limit at 50 percent + * available blks + */ +#define PBLK_USER_LOW_THRS 20 /* Aggressive GC at 5% available blocks */ + + int rb_windows_pw; /* Number of rate windows in the write buffer + * given as a power-of-2. This guarantees that + * when user I/O is being rate limited, there + * will be reserved enough space for the GC to + * place its payload. A window is of + * pblk->max_write_pgs size, which in NVMe is + * 64, i.e., 256kb. + */ + int rb_budget; /* Total number of entries available for I/O */ + int rb_user_max; /* Max buffer entries available for user I/O */ + atomic_t rb_user_cnt; /* User I/O buffer counter */ + int rb_gc_max; /* Max buffer entries available for GC I/O */ + int rb_gc_rsv; /* Reserved buffer entries for GC I/O */ + int rb_state; /* Rate-limiter current state */ + atomic_t rb_gc_cnt; /* GC I/O buffer counter */ + + int rb_user_active; + struct timer_list u_timer; + + unsigned long long nr_secs; + unsigned long total_blocks; + atomic_t free_blocks; +}; + +#define PBLK_LINE_NR_LUN_BITMAP 2 +#define PBLK_LINE_NR_SEC_BITMAP 2 +#define PBLK_LINE_EMPTY (~0U) + +enum { + /* Line Types */ + PBLK_LINETYPE_FREE = 0, + PBLK_LINETYPE_LOG = 1, + PBLK_LINETYPE_DATA = 2, + + /* Line state */ + PBLK_LINESTATE_FREE = 10, + PBLK_LINESTATE_OPEN = 11, + PBLK_LINESTATE_CLOSED = 12, + PBLK_LINESTATE_GC = 13, + PBLK_LINESTATE_BAD = 14, + PBLK_LINESTATE_CORRUPT = 15, + + /* GC group */ + PBLK_LINEGC_NONE = 20, + PBLK_LINEGC_EMPTY = 21, + PBLK_LINEGC_LOW = 22, + PBLK_LINEGC_MID = 23, + PBLK_LINEGC_HIGH = 24, + PBLK_LINEGC_FULL = 25, +}; + +#define PBLK_MAGIC 0x70626c6b /*pblk*/ + +struct line_header { + __le32 crc; + __le32 identifier; /* pblk identifier */ + __u8 uuid[16]; /* instance uuid */ + __le16 type; /* line type */ + __le16 version; /* type version */ + __le32 id; /* line id for current line */ +}; + +struct line_smeta { + struct line_header header; + + __le32 crc; /* Full structure including struct crc */ + /* Previous line metadata */ + __le32 prev_id; /* Line id for previous line */ + + /* Current line metadata */ + __le64 seq_nr; /* Sequence number for current line */ + + /* Active writers */ + __le32 window_wr_lun; /* Number of parallel LUNs to write */ + + __le32 rsvd[2]; +}; + +/* + * Metadata Layout: + * 1. struct pblk_emeta + * 2. nr_lbas u64 forming lba list + * 3. nr_lines (all) u32 valid sector count (vsc) (~0U: non-alloc line) + * 4. nr_luns bits (u64 format) forming line bad block bitmap + * + * 3. and 4. 
will be part of FTL log + */ +struct line_emeta { + struct line_header header; + + __le32 crc; /* Full structure including struct crc */ + + /* Previous line metadata */ + __le32 prev_id; /* Line id for prev line */ + + /* Current line metadata */ + __le64 seq_nr; /* Sequence number for current line */ + + /* Active writers */ + __le32 window_wr_lun; /* Number of parallel LUNs to write */ + + /* Bookkeeping for recovery */ + __le32 next_id; /* Line id for next line */ + __le64 nr_lbas; /* Number of lbas mapped in line */ + __le64 nr_valid_lbas; /* Number of valid lbas mapped in line */ +}; + +struct pblk_line { + struct pblk *pblk; + unsigned int id; /* Line number corresponds to the + * block line + */ + unsigned int seq_nr; /* Unique line sequence number */ + + int state; /* PBLK_LINESTATE_X */ + int type; /* PBLK_LINETYPE_X */ + int gc_group; /* PBLK_LINEGC_X */ + struct list_head list; /* Free, GC lists */ + + unsigned long *lun_bitmap; /* Bitmap for LUNs mapped in line */ + + struct line_smeta *smeta; /* Start metadata */ + struct line_emeta *emeta; /* End metadata */ + int meta_line; /* Metadata line id */ + u64 smeta_ssec; /* Sector where smeta starts */ + u64 emeta_ssec; /* Sector where emeta starts */ + + unsigned int sec_in_line; /* Number of usable secs in line */ + + unsigned int blk_in_line; /* Number of good blocks in line */ + unsigned long *blk_bitmap; /* Bitmap for valid/invalid blocks */ + unsigned long *erase_bitmap; /* Bitmap for erased blocks */ + + unsigned long *map_bitmap; /* Bitmap for mapped sectors in line */ + unsigned long *invalid_bitmap; /* Bitmap for invalid sectors in line */ + + int left_eblks; /* Blocks left for erasing */ + atomic_t left_seblks; /* Blocks left for sync erasing */ + + int left_msecs; /* Sectors left for mapping */ + int left_ssecs; /* Sectors left to sync */ + unsigned int cur_sec; /* Sector map pointer */ + unsigned int vsc; /* Valid sector count in line */ + + struct kref ref; /* Write buffer L2P references */ + + spinlock_t lock; /* Necessary for invalid_bitmap only */ +}; + +#define PBLK_DATA_LINES 2 + +enum{ + PBLK_KMALLOC_META = 1, + PBLK_VMALLOC_META = 2, +}; + +struct pblk_line_metadata { + void *meta; +}; + +struct pblk_line_mgmt { + int nr_lines; /* Total number of full lines */ + int nr_free_lines; /* Number of full lines in free list */ + + /* Free lists - use free_lock */ + struct list_head free_list; /* Full lines ready to use */ + struct list_head corrupt_list; /* Full lines corrupted */ + struct list_head bad_list; /* Full lines bad */ + + /* GC lists - use gc_lock */ + struct list_head *gc_lists[PBLK_NR_GC_LISTS]; + struct list_head gc_high_list; /* Full lines ready to GC, high isc */ + struct list_head gc_mid_list; /* Full lines ready to GC, mid isc */ + struct list_head gc_low_list; /* Full lines ready to GC, low isc */ + + struct list_head gc_full_list; /* Full lines ready to GC, no valid */ + struct list_head gc_empty_list; /* Full lines close, all valid */ + + struct pblk_line *log_line; /* Current FTL log line */ + struct pblk_line *data_line; /* Current data line */ + struct pblk_line *log_next; /* Next FTL log line */ + struct pblk_line *data_next; /* Next data line */ + + /* Metadata allocation type: VMALLOC | KMALLOC */ + int smeta_alloc_type; + int emeta_alloc_type; + + /* Pre-allocated metadata for data lines */ + struct pblk_line_metadata sline_meta[PBLK_DATA_LINES]; + struct pblk_line_metadata eline_meta[PBLK_DATA_LINES]; + unsigned long meta_bitmap; + + /* Helpers for fast bitmap calculations */ + unsigned 
long *bb_template; + unsigned long *bb_aux; + + unsigned long d_seq_nr; /* Data line unique sequence number */ + unsigned long l_seq_nr; /* Log line unique sequence number */ + + spinlock_t free_lock; + spinlock_t gc_lock; +}; + +struct pblk_line_meta { + unsigned int smeta_len; /* Total length for smeta */ + unsigned int smeta_sec; /* Sectors needed for smeta*/ + unsigned int emeta_len; /* Total length for emeta */ + unsigned int emeta_sec; /* Sectors needed for emeta*/ + unsigned int emeta_bb; /* Boundary for bb that affects emeta */ + unsigned int sec_bitmap_len; /* Length for sector bitmap in line */ + unsigned int blk_bitmap_len; /* Length for block bitmap in line */ + unsigned int lun_bitmap_len; /* Length for lun bitmap in line */ + + unsigned int blk_per_line; /* Number of blocks in a full line */ + unsigned int sec_per_line; /* Number of sectors in a line */ + unsigned int min_blk_line; /* Min. number of good blocks in line */ + + unsigned int mid_thrs; /* Threshold for GC mid list */ + unsigned int high_thrs; /* Threshold for GC high list */ +}; + +struct pblk_addr_format { + u64 ch_mask; + u64 lun_mask; + u64 pln_mask; + u64 blk_mask; + u64 pg_mask; + u64 sec_mask; + u8 ch_offset; + u8 lun_offset; + u8 pln_offset; + u8 blk_offset; + u8 pg_offset; + u8 sec_offset; +}; + +struct pblk { + struct nvm_tgt_dev *dev; + struct gendisk *disk; + + struct kobject kobj; + + struct pblk_lun *luns; + + struct pblk_line *lines; /* Line array */ + struct pblk_line_mgmt l_mg; /* Line management */ + struct pblk_line_meta lm; /* Line metadata */ + + int ppaf_bitsize; + struct pblk_addr_format ppaf; + + struct pblk_rb rwb; + + int min_write_pgs; /* Minimum amount of pages required by controller */ + int max_write_pgs; /* Maximum amount of pages supported by controller */ + int pgs_in_buffer; /* Number of pages that need to be held in buffer to + * guarantee successful reads. + */ + + sector_t capacity; /* Device capacity when bad blocks are subtracted */ + int over_pct; /* Percentage of device used for over-provisioning */ + + /* pblk provisioning values. Used by rate limiter */ + struct pblk_rl rl; + + struct semaphore erase_sem; + + unsigned char instance_uuid[16]; +#ifdef CONFIG_NVM_DEBUG + /* All debug counters apply to 4kb sector I/Os */ + atomic_long_t inflight_writes; /* Inflight writes (user and gc) */ + atomic_long_t padded_writes; /* Sectors padded due to flush/fua */ + atomic_long_t padded_wb; /* Sectors padded in write buffer */ + atomic_long_t nr_flush; /* Number of flush/fua I/O */ + atomic_long_t req_writes; /* Sectors stored on write buffer */ + atomic_long_t sub_writes; /* Sectors submitted from buffer */ + atomic_long_t sync_writes; /* Sectors synced to media */ + atomic_long_t compl_writes; /* Sectors completed in write bio */ + atomic_long_t inflight_reads; /* Inflight sector read requests */ + atomic_long_t sync_reads; /* Completed sector read requests */ + atomic_long_t recov_writes; /* Sectors submitted from recovery */ + atomic_long_t recov_gc_writes; /* Sectors submitted from write GC */ + atomic_long_t recov_gc_reads; /* Sectors submitted from read GC */ +#endif + + spinlock_t lock; + + atomic_long_t read_failed; + atomic_long_t read_empty; + atomic_long_t read_high_ecc; + atomic_long_t read_failed_gc; + atomic_long_t write_failed; + atomic_long_t erase_failed; + + struct task_struct *writer_ts; + + /* Simple translation map of logical addresses to physical addresses. 
+ * The logical addresses is known by the host system, while the physical + * addresses are used when writing to the disk block device. + */ + unsigned char *trans_map; + spinlock_t trans_lock; + + struct list_head compl_list; + + mempool_t *page_pool; + mempool_t *line_ws_pool; + mempool_t *rec_pool; + mempool_t *r_rq_pool; + mempool_t *w_rq_pool; + mempool_t *line_meta_pool; + + struct workqueue_struct *kw_wq; + struct timer_list wtimer; + + struct pblk_gc gc; +}; + +struct pblk_line_ws { + struct pblk *pblk; + struct pblk_line *line; + void *priv; + struct work_struct ws; +}; + +#define pblk_r_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_r_ctx)) +#define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx)) + +/* + * pblk ring buffer operations + */ +int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base, + unsigned int power_size, unsigned int power_seg_sz); +unsigned int pblk_rb_calculate_size(unsigned int nr_entries); +void *pblk_rb_entries_ref(struct pblk_rb *rb); +int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio, + unsigned int nr_entries, unsigned int *pos); +int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries, + unsigned int *pos); +void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, + struct pblk_w_ctx w_ctx, unsigned int pos); +void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, + struct pblk_w_ctx w_ctx, struct pblk_line *gc_line, + unsigned int pos); +struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos); + +void pblk_rb_sync_l2p(struct pblk_rb *rb); +unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct bio *bio, + struct pblk_c_ctx *c_ctx, + unsigned int pos, + unsigned int nr_entries, + unsigned int count); +unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio, + struct list_head *list, + unsigned int max); +int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, + u64 pos, int bio_iter); +unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries); + +unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags); +unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries); +struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb, + struct ppa_addr *ppa); +void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags); +unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb); + +unsigned int pblk_rb_read_count(struct pblk_rb *rb); +unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos); + +int pblk_rb_tear_down_check(struct pblk_rb *rb); +int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos); +void pblk_rb_data_free(struct pblk_rb *rb); +ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf); + +/* + * pblk core + */ +struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw); +int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, + struct pblk_c_ctx *c_ctx); +void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw); +void pblk_flush_writer(struct pblk *pblk); +struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba); +void pblk_discard(struct pblk *pblk, struct bio *bio); +void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd); +void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd); +int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd); +struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, + unsigned int nr_secs, unsigned int len, + gfp_t gfp_mask); +struct pblk_line *pblk_line_get(struct pblk *pblk); +struct 
pblk_line *pblk_line_get_first_data(struct pblk *pblk); +struct pblk_line *pblk_line_replace_data(struct pblk *pblk); +int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line); +void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line); +struct pblk_line *pblk_line_get_data(struct pblk *pblk); +struct pblk_line *pblk_line_get_data_next(struct pblk *pblk); +int pblk_line_erase(struct pblk *pblk, struct pblk_line *line); +int pblk_line_is_full(struct pblk_line *line); +void pblk_line_free(struct pblk *pblk, struct pblk_line *line); +void pblk_line_close_ws(struct work_struct *work); +void pblk_line_close(struct pblk *pblk, struct pblk_line *line); +void pblk_line_mark_bb(struct work_struct *work); +void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, + void (*work)(struct work_struct *)); +u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line); +int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line); +int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line); +int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa); +void pblk_line_put(struct kref *ref); +struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line); +u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); +int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, + unsigned long secs_to_flush); +void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, + unsigned long *lun_bitmap); +void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, + unsigned long *lun_bitmap); +void pblk_end_bio_sync(struct bio *bio); +void pblk_end_io_sync(struct nvm_rq *rqd); +int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, + int nr_pages); +void pblk_map_pad_invalidate(struct pblk *pblk, struct pblk_line *line, + u64 paddr); +void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, + int nr_pages); +void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa); +void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa); +void pblk_update_map_cache(struct pblk *pblk, sector_t lba, + struct ppa_addr ppa); +void pblk_update_map_dev(struct pblk *pblk, sector_t lba, + struct ppa_addr ppa, struct ppa_addr entry_line); +int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, + struct pblk_line *gc_line); +void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, + u64 *lba_list, int nr_secs); +void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, + sector_t blba, int nr_secs); + +/* + * pblk user I/O write path + */ +int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, + unsigned long flags); +int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list, + unsigned int nr_entries, unsigned int nr_rec_entries, + struct pblk_line *gc_line, unsigned long flags); + +/* + * pblk map + */ +void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, + unsigned int sentry, unsigned long *lun_bitmap, + unsigned int valid_secs, struct ppa_addr *erase_ppa); +void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, + unsigned long *lun_bitmap, unsigned int valid_secs, + unsigned int off); + +/* + * pblk write thread + */ +int pblk_write_ts(void *data); +void pblk_write_timer_fn(unsigned long data); +void pblk_write_should_kick(struct pblk *pblk); + +/* + * pblk read path + */ +int pblk_submit_read(struct pblk *pblk, struct bio *bio); +int 
pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, + unsigned int nr_secs, unsigned int *secs_to_gc, + struct pblk_line *line); +/* + * pblk recovery + */ +void pblk_submit_rec(struct work_struct *work); +struct pblk_line *pblk_recov_l2p(struct pblk *pblk); +void pblk_recov_pad(struct pblk *pblk); +__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta); +int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, + struct pblk_rec_ctx *recovery, u64 *comp_bits, + unsigned int comp); + +/* + * pblk gc + */ +#define PBLK_GC_TRIES 3 + +int pblk_gc_init(struct pblk *pblk); +void pblk_gc_exit(struct pblk *pblk); +void pblk_gc_should_start(struct pblk *pblk); +void pblk_gc_should_stop(struct pblk *pblk); +int pblk_gc_status(struct pblk *pblk); +void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, + int *gc_active); +void pblk_gc_sysfs_force(struct pblk *pblk, int force); + +/* + * pblk rate limiter + */ +void pblk_rl_init(struct pblk_rl *rl, int budget); +void pblk_rl_free(struct pblk_rl *rl); +int pblk_rl_gc_thrs(struct pblk_rl *rl); +unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); +int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries); +void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries); +int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries); +void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries); +void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); +void pblk_rl_set_gc_rsc(struct pblk_rl *rl, int rsv); +int pblk_rl_sysfs_rate_show(struct pblk_rl *rl); +void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); +void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line); + +/* + * pblk sysfs + */ +int pblk_sysfs_init(struct gendisk *tdisk); +void pblk_sysfs_exit(struct gendisk *tdisk); + +static inline void *pblk_malloc(size_t size, int type, gfp_t flags) +{ + if (type == PBLK_KMALLOC_META) + return kmalloc(size, flags); + return vmalloc(size); +} + +static inline void pblk_mfree(void *ptr, int type) +{ + if (type == PBLK_KMALLOC_META) + kfree(ptr); + else + vfree(ptr); +} + +static inline struct nvm_rq *nvm_rq_from_c_ctx(void *c_ctx) +{ + return c_ctx - sizeof(struct nvm_rq); +} + +static inline void *pblk_line_emeta_to_lbas(struct line_emeta *emeta) +{ + return (emeta) + 1; +} + +#define NVM_MEM_PAGE_WRITE (8) + +static inline int pblk_pad_distance(struct pblk *pblk) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + + return NVM_MEM_PAGE_WRITE * geo->nr_luns * geo->sec_per_pl; +} + +static inline int pblk_dev_ppa_to_line(struct ppa_addr p) +{ + return p.g.blk; +} + +static inline int pblk_tgt_ppa_to_line(struct ppa_addr p) +{ + return p.g.blk; +} + +static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p) +{ + return p.g.lun * geo->nr_chnls + p.g.ch; +} + +/* A block within a line corresponds to the lun */ +static inline int pblk_dev_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p) +{ + return p.g.lun * geo->nr_chnls + p.g.ch; +} + +static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32) +{ + struct ppa_addr ppa64; + + ppa64.ppa = 0; + + if (ppa32 == -1) { + ppa64.ppa = ADDR_EMPTY; + } else if (ppa32 & (1U << 31)) { + ppa64.c.line = ppa32 & ((~0U) >> 1); + ppa64.c.is_cached = 1; + } else { + ppa64.g.blk = (ppa32 & pblk->ppaf.blk_mask) >> + pblk->ppaf.blk_offset; + ppa64.g.pg = (ppa32 & pblk->ppaf.pg_mask) >> + pblk->ppaf.pg_offset; + ppa64.g.lun = (ppa32 & pblk->ppaf.lun_mask) >> + 
pblk->ppaf.lun_offset; + ppa64.g.ch = (ppa32 & pblk->ppaf.ch_mask) >> + pblk->ppaf.ch_offset; + ppa64.g.pl = (ppa32 & pblk->ppaf.pln_mask) >> + pblk->ppaf.pln_offset; + ppa64.g.sec = (ppa32 & pblk->ppaf.sec_mask) >> + pblk->ppaf.sec_offset; + } + + return ppa64; +} + +static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk, + sector_t lba) +{ + struct ppa_addr ppa; + + if (pblk->ppaf_bitsize < 32) { + u32 *map = (u32 *)pblk->trans_map; + + ppa = pblk_ppa32_to_ppa64(pblk, map[lba]); + } else { + struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map; + + ppa = map[lba]; + } + + return ppa; +} + +static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64) +{ + u32 ppa32 = 0; + + if (ppa64.ppa == ADDR_EMPTY) { + ppa32 = ~0U; + } else if (ppa64.c.is_cached) { + ppa32 |= ppa64.c.line; + ppa32 |= 1U << 31; + } else { + ppa32 |= ppa64.g.blk << pblk->ppaf.blk_offset; + ppa32 |= ppa64.g.pg << pblk->ppaf.pg_offset; + ppa32 |= ppa64.g.lun << pblk->ppaf.lun_offset; + ppa32 |= ppa64.g.ch << pblk->ppaf.ch_offset; + ppa32 |= ppa64.g.pl << pblk->ppaf.pln_offset; + ppa32 |= ppa64.g.sec << pblk->ppaf.sec_offset; + } + + return ppa32; +} + +static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba, + struct ppa_addr ppa) +{ + if (pblk->ppaf_bitsize < 32) { + u32 *map = (u32 *)pblk->trans_map; + + map[lba] = pblk_ppa64_to_ppa32(pblk, ppa); + } else { + u64 *map = (u64 *)pblk->trans_map; + + map[lba] = ppa.ppa; + } +} + +static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk, + struct ppa_addr p) +{ + u64 paddr; + + paddr = 0; + paddr |= (u64)p.g.pg << pblk->ppaf.pg_offset; + paddr |= (u64)p.g.lun << pblk->ppaf.lun_offset; + paddr |= (u64)p.g.ch << pblk->ppaf.ch_offset; + paddr |= (u64)p.g.pl << pblk->ppaf.pln_offset; + paddr |= (u64)p.g.sec << pblk->ppaf.sec_offset; + + return paddr; +} + +static inline int pblk_ppa_empty(struct ppa_addr ppa_addr) +{ + return (ppa_addr.ppa == ADDR_EMPTY); +} + +static inline void pblk_ppa_set_empty(struct ppa_addr *ppa_addr) +{ + ppa_addr->ppa = ADDR_EMPTY; +} + +static inline int pblk_addr_in_cache(struct ppa_addr ppa) +{ + return (ppa.ppa != ADDR_EMPTY && ppa.c.is_cached); +} + +static inline int pblk_addr_to_cacheline(struct ppa_addr ppa) +{ + return ppa.c.line; +} + +static inline struct ppa_addr pblk_cacheline_to_addr(int addr) +{ + struct ppa_addr p; + + p.c.line = addr; + p.c.is_cached = 1; + + return p; +} + +static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr, + u64 line_id) +{ + struct ppa_addr ppa; + + ppa.ppa = 0; + ppa.g.blk = line_id; + ppa.g.pg = (paddr & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset; + ppa.g.lun = (paddr & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset; + ppa.g.ch = (paddr & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset; + ppa.g.pl = (paddr & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset; + ppa.g.sec = (paddr & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset; + + return ppa; +} + +static inline struct ppa_addr addr_to_pblk_ppa(struct pblk *pblk, u64 paddr, + u64 line_id) +{ + struct ppa_addr ppa; + + ppa = addr_to_gen_ppa(pblk, paddr, line_id); + + return ppa; +} + +static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk, + struct line_smeta *smeta) +{ + u32 crc = ~(u32)0; + + crc = crc32_le(crc, (unsigned char *)smeta + sizeof(crc), + sizeof(struct line_header) - sizeof(crc)); + + return crc; +} + +static inline u32 pblk_calc_smeta_crc(struct pblk *pblk, + struct line_smeta *smeta) +{ + struct pblk_line_meta *lm = &pblk->lm; + u32 crc = ~(u32)0; + + crc = crc32_le(crc, 
(unsigned char *)smeta + + sizeof(struct line_header) + sizeof(crc), + lm->smeta_len - + sizeof(struct line_header) - sizeof(crc)); + + return crc; +} + +static inline u32 pblk_calc_emeta_crc(struct pblk *pblk, + struct line_emeta *emeta) +{ + struct pblk_line_meta *lm = &pblk->lm; + u32 crc = ~(u32)0; + + crc = crc32_le(crc, (unsigned char *)emeta + + sizeof(struct line_header) + sizeof(crc), + lm->emeta_len - + sizeof(struct line_header) - sizeof(crc)); + + return crc; +} + +static inline int pblk_set_progr_mode(struct pblk *pblk, int type) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + int flags; + + flags = geo->plane_mode >> 1; + + if (type == WRITE) + flags |= NVM_IO_SCRAMBLE_ENABLE; + + return flags; +} + +static inline int pblk_set_read_mode(struct pblk *pblk) +{ + return NVM_IO_SNGL_ACCESS | NVM_IO_SUSPEND | NVM_IO_SCRAMBLE_ENABLE; +} + +#ifdef CONFIG_NVM_DEBUG +static inline void print_ppa(struct ppa_addr *p, char *msg, int error) +{ + if (p->c.is_cached) { + pr_err("ppa: (%s: %x) cache line: %llu\n", + msg, error, (u64)p->c.line); + } else { + pr_err("ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n", + msg, error, + p->g.ch, p->g.lun, p->g.blk, + p->g.pg, p->g.pl, p->g.sec); + } +} + +static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd, + int error) +{ + int bit = -1; + + if (rqd->nr_ppas == 1) { + print_ppa(&rqd->ppa_addr, "rqd", error); + return; + } + + while ((bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas, + bit + 1)) < rqd->nr_ppas) { + print_ppa(&rqd->ppa_list[bit], "rqd", error); + } + + pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status); +} +#endif + +static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, + struct ppa_addr *ppas, int nr_ppas) +{ + struct nvm_geo *geo = &tgt_dev->geo; + struct ppa_addr *ppa; + int i; + + for (i = 0; i < nr_ppas; i++) { + ppa = &ppas[i]; + + if (!ppa->c.is_cached && + ppa->g.ch < geo->nr_chnls && + ppa->g.lun < geo->luns_per_chnl && + ppa->g.pl < geo->nr_planes && + ppa->g.blk < geo->blks_per_lun && + ppa->g.pg < geo->pgs_per_blk && + ppa->g.sec < geo->sec_per_pg) + continue; + +#ifdef CONFIG_NVM_DEBUG + print_ppa(ppa, "boundary", i); +#endif + return 1; + } + return 0; +} + +static inline int pblk_boundary_paddr_checks(struct pblk *pblk, u64 paddr) +{ + struct pblk_line_meta *lm = &pblk->lm; + + if (paddr > lm->sec_per_line) + return 1; + + return 0; +} + +static inline unsigned int pblk_get_bi_idx(struct bio *bio) +{ + return bio->bi_iter.bi_idx; +} + +static inline sector_t pblk_get_lba(struct bio *bio) +{ + return bio->bi_iter.bi_sector / NR_PHY_IN_LOG; +} + +static inline unsigned int pblk_get_secs(struct bio *bio) +{ + return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE; +} + +static inline sector_t pblk_get_sector(sector_t lba) +{ + return lba * NR_PHY_IN_LOG; +} + +static inline void pblk_setup_uuid(struct pblk *pblk) +{ + uuid_le uuid; + + uuid_le_gen(&uuid); + memcpy(pblk->instance_uuid, uuid.b, 16); +} +#endif /* PBLK_H_ */ -- cgit v1.2.3 From aee69d78dec0ffdf82e35d57c626e80dddc314d5 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 19 Apr 2017 08:29:02 -0600 Subject: block, bfq: introduce the BFQ-v0 I/O scheduler as an extra scheduler We tag as v0 the version of BFQ containing only BFQ's engine plus hierarchical support. BFQ's engine is introduced by this commit, while hierarchical support is added by next commit. 
We use the v0 tag to distinguish this minimal version of BFQ from the versions containing also the features and the improvements added by next commits. BFQ-v0 coincides with the version of BFQ submitted a few years ago [1], apart from the introduction of preemption, described below. BFQ is a proportional-share I/O scheduler, whose general structure, plus a lot of code, are borrowed from CFQ. - Each process doing I/O on a device is associated with a weight and a (bfq_)queue. - BFQ grants exclusive access to the device, for a while, to one queue (process) at a time, and implements this service model by associating every queue with a budget, measured in number of sectors. - After a queue is granted access to the device, the budget of the queue is decremented, on each request dispatch, by the size of the request. - The in-service queue is expired, i.e., its service is suspended, only if one of the following events occurs: 1) the queue finishes its budget, 2) the queue empties, 3) a "budget timeout" fires. - The budget timeout prevents processes doing random I/O from holding the device for too long and dramatically reducing throughput. - Actually, as in CFQ, a queue associated with a process issuing sync requests may not be expired immediately when it empties. In contrast, BFQ may idle the device for a short time interval, giving the process the chance to go on being served if it issues a new request in time. Device idling typically boosts the throughput on rotational devices, if processes do synchronous and sequential I/O. In addition, under BFQ, device idling is also instrumental in guaranteeing the desired throughput fraction to processes issuing sync requests (see [2] for details). - With respect to idling for service guarantees, if several processes are competing for the device at the same time, but all processes (and groups, after the following commit) have the same weight, then BFQ guarantees the expected throughput distribution without ever idling the device. Throughput is thus as high as possible in this common scenario. - Queues are scheduled according to a variant of WF2Q+, named B-WF2Q+, and implemented using an augmented rb-tree to preserve an O(log N) overall complexity. See [2] for more details. B-WF2Q+ is also ready for hierarchical scheduling. However, for a cleaner logical breakdown, the code that enables and completes hierarchical support is provided in the next commit, which focuses exactly on this feature. - B-WF2Q+ guarantees a tight deviation with respect to an ideal, perfectly fair, and smooth service. In particular, B-WF2Q+ guarantees that each queue receives a fraction of the device throughput proportional to its weight, even if the throughput fluctuates, and regardless of: the device parameters, the current workload and the budgets assigned to the queue. - The last, budget-independence, property (although probably counterintuitive in the first place) is definitely beneficial, for the following reasons: - First, with any proportional-share scheduler, the maximum deviation with respect to an ideal service is proportional to the maximum budget (slice) assigned to queues. As a consequence, BFQ can keep this deviation tight not only because of the accurate service of B-WF2Q+, but also because BFQ *does not* need to assign a larger budget to a queue to let the queue receive a higher fraction of the device throughput. 
- Second, BFQ is free to choose, for every process (queue), the budget that best fits the needs of the process, or best leverages the I/O pattern of the process. In particular, BFQ updates queue budgets with a simple feedback-loop algorithm that allows a high throughput to be achieved, while still providing tight latency guarantees to time-sensitive applications. When the in-service queue expires, this algorithm computes the next budget of the queue so as to: - Let large budgets be eventually assigned to the queues associated with I/O-bound applications performing sequential I/O: in fact, the longer these applications are served once got access to the device, the higher the throughput is. - Let small budgets be eventually assigned to the queues associated with time-sensitive applications (which typically perform sporadic and short I/O), because, the smaller the budget assigned to a queue waiting for service is, the sooner B-WF2Q+ will serve that queue (Subsec 3.3 in [2]). - Weights can be assigned to processes only indirectly, through I/O priorities, and according to the relation: weight = 10 * (IOPRIO_BE_NR - ioprio). The next patch provides, instead, a cgroups interface through which weights can be assigned explicitly. - If several processes are competing for the device at the same time, but all processes and groups have the same weight, then BFQ guarantees the expected throughput distribution without ever idling the device. It uses preemption instead. Throughput is then much higher in this common scenario. - ioprio classes are served in strict priority order, i.e., lower-priority queues are not served as long as there are higher-priority queues. Among queues in the same class, the bandwidth is distributed in proportion to the weight of each queue. A very thin extra bandwidth is however guaranteed to the Idle class, to prevent it from starving. - If the strict_guarantees parameter is set (default: unset), then BFQ - always performs idling when the in-service queue becomes empty; - forces the device to serve one I/O request at a time, by dispatching a new request only if there is no outstanding request. In the presence of differentiated weights or I/O-request sizes, both the above conditions are needed to guarantee that every queue receives its allotted share of the bandwidth (see Documentation/block/bfq-iosched.txt for more details). Setting strict_guarantees may evidently affect throughput. [1] https://lkml.org/lkml/2008/4/1/234 https://lkml.org/lkml/2008/11/11/148 [2] P. Valente and M. Andreolini, "Improving Application Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of the 5th Annual International Systems and Storage Conference (SYSTOR '12), June 2012. 
Slightly extended version: http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite- results.pdf Signed-off-by: Fabio Checconi Signed-off-by: Paolo Valente Signed-off-by: Arianna Avanzini Signed-off-by: Jens Axboe --- Documentation/block/00-INDEX | 2 + Documentation/block/bfq-iosched.txt | 517 +++++ block/Kconfig.iosched | 11 + block/Makefile | 1 + block/bfq-iosched.c | 4166 +++++++++++++++++++++++++++++++++++ 5 files changed, 4697 insertions(+) create mode 100644 Documentation/block/bfq-iosched.txt create mode 100644 block/bfq-iosched.c (limited to 'Documentation') diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX index e55103ace382..8d55b4bbb5e2 100644 --- a/Documentation/block/00-INDEX +++ b/Documentation/block/00-INDEX @@ -1,5 +1,7 @@ 00-INDEX - This file +bfq-iosched.txt + - BFQ IO scheduler and its tunables biodoc.txt - Notes on the Generic Block Layer Rewrite in Linux 2.5 biovecs.txt diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt new file mode 100644 index 000000000000..cbf85f6f1fd8 --- /dev/null +++ b/Documentation/block/bfq-iosched.txt @@ -0,0 +1,517 @@ +BFQ (Budget Fair Queueing) +========================== + +BFQ is a proportional-share I/O scheduler, with some extra +low-latency capabilities. In addition to cgroups support (blkio or io +controllers), BFQ's main features are: +- BFQ guarantees a high system and application responsiveness, and a + low latency for time-sensitive applications, such as audio or video + players; +- BFQ distributes bandwidth, and not just time, among processes or + groups (switching back to time distribution when needed to keep + throughput high). + +On average CPUs, the current version of BFQ can handle devices +performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a +reference, 30-50 KIOPS correspond to very high bandwidths with +sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and +to 120-200 MB/s with 4KB random I/O. BFQ has not yet been tested on +multi-queue devices. + +The table of contents follows. Impatient readers can jump straight to Section 3. + +CONTENTS + +1. When may BFQ be useful? + 1-1 Personal systems + 1-2 Server systems +2. How does BFQ work? +3. What are BFQ's tunables? +4. BFQ group scheduling + 4-1 Service guarantees provided + 4-2 Interface + +1. When may BFQ be useful? +========================== + +BFQ provides the following benefits on personal and server systems. + +1-1 Personal systems +-------------------- + +Low latency for interactive applications + +Regardless of the actual background workload, BFQ guarantees that, for +interactive tasks, the storage device is virtually as responsive as if +it were idle. For example, even if one or more of the following +background workloads are being executed: +- one or more large files are being read, written or copied, +- a tree of source files is being compiled, +- one or more virtual machines are performing I/O, +- a software update is in progress, +- indexing daemons are scanning filesystems and updating their + databases, +starting an application or loading a file from within an application +takes about the same time as if the storage device were idle. As a +comparison, with CFQ, NOOP or DEADLINE, and in the same conditions, +applications experience high latencies, or even become unresponsive +until the background workload terminates (also on SSDs). 
+ +Low latency for soft real-time applications + +Soft real-time applications, such as audio and video +players/streamers, also enjoy a low latency and a low drop rate, regardless +of the background I/O workload. As a consequence, these applications +suffer almost no glitches due to the background workload. + +Higher speed for code-development tasks + +If some additional workload happens to be executed in parallel, then +BFQ executes the I/O-related components of typical code-development +tasks (compilation, checkout, merge, ...) much more quickly than CFQ, +NOOP or DEADLINE. + +High throughput + +On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and +up to 150% higher throughput than DEADLINE and NOOP, with all the +sequential workloads considered in our tests. With random workloads, +and with all the workloads on flash-based devices, BFQ achieves, +instead, about the same throughput as the other schedulers. + +Strong fairness, bandwidth and delay guarantees + +BFQ distributes the device throughput, and not just the device time, +among I/O-bound applications in proportion to their weights, with any +workload and regardless of the device parameters. From these bandwidth +guarantees, it is possible to compute tight per-I/O-request delay +guarantees by a simple formula. If not configured for strict service +guarantees, BFQ switches to time-based resource sharing (only) for +applications that would otherwise cause a throughput loss. + +1-2 Server systems +------------------ + +Most benefits for server systems follow from the same service +properties as above. In particular, regardless of whether additional, +possibly heavy workloads are being served, BFQ guarantees: + +. audio and video-streaming with zero or very low jitter and drop + rate; + +. fast retrieval of WEB pages and embedded objects; + +. real-time recording of data in live-dumping applications (e.g., + packet logging); + +. responsiveness in local and remote access to a server. + + +2. How does BFQ work? +===================== + +BFQ is a proportional-share I/O scheduler, whose general structure, +plus a lot of code, are borrowed from CFQ. + +- Each process doing I/O on a device is associated with a weight and a + (bfq_)queue. + +- BFQ grants exclusive access to the device, for a while, to one queue + (process) at a time, and implements this service model by + associating every queue with a budget, measured in number of + sectors. + + - After a queue is granted access to the device, the budget of the + queue is decremented, on each request dispatch, by the size of the + request. + + - The in-service queue is expired, i.e., its service is suspended, + only if one of the following events occurs: 1) the queue finishes + its budget, 2) the queue empties, 3) a "budget timeout" fires. + + - The budget timeout prevents processes doing random I/O from + holding the device for too long and dramatically reducing + throughput. + + - Actually, as in CFQ, a queue associated with a process issuing + sync requests may not be expired immediately when it empties. In + contrast, BFQ may idle the device for a short time interval, + giving the process the chance to go on being served if it issues + a new request in time. Device idling typically boosts the + throughput on rotational devices, if processes do synchronous + and sequential I/O. 
In addition, under BFQ, device idling is + also instrumental in guaranteeing the desired throughput + fraction to processes issuing sync requests (see the description + of the slice_idle tunable in this document, or [1, 2], for more + details). + + - With respect to idling for service guarantees, if several + processes are competing for the device at the same time, but + all processes (and groups, after the following commit) have + the same weight, then BFQ guarantees the expected throughput + distribution without ever idling the device. Throughput is + thus as high as possible in this common scenario. + + - If low-latency mode is enabled (default configuration), BFQ + executes some special heuristics to detect interactive and soft + real-time applications (e.g., video or audio players/streamers), + and to reduce their latency. The most important action taken to + achieve this goal is to give to the queues associated with these + applications more than their fair share of the device + throughput. For brevity, we call just "weight-raising" the whole + sets of actions taken by BFQ to privilege these queues. In + particular, BFQ provides a milder form of weight-raising for + interactive applications, and a stronger form for soft real-time + applications. + + - BFQ automatically deactivates idling for queues born in a burst of + queue creations. In fact, these queues are usually associated with + the processes of applications and services that benefit mostly + from a high throughput. Examples are systemd during boot, or git + grep. + + - As CFQ, BFQ merges queues performing interleaved I/O, i.e., + performing random I/O that becomes mostly sequential if + merged. Differently from CFQ, BFQ achieves this goal with a more + reactive mechanism, called Early Queue Merge (EQM). EQM is so + responsive in detecting interleaved I/O (cooperating processes), + that it enables BFQ to achieve a high throughput, by queue + merging, even for queues for which CFQ needs a different + mechanism, preemption, to get a high throughput. As such EQM is a + unified mechanism to achieve a high throughput with interleaved + I/O. + + - Queues are scheduled according to a variant of WF2Q+, named + B-WF2Q+, and implemented using an augmented rb-tree to preserve an + O(log N) overall complexity. See [2] for more details. B-WF2Q+ is + also ready for hierarchical scheduling. However, for a cleaner + logical breakdown, the code that enables and completes + hierarchical support is provided in the next commit, which focuses + exactly on this feature. + + - B-WF2Q+ guarantees a tight deviation with respect to an ideal, + perfectly fair, and smooth service. In particular, B-WF2Q+ + guarantees that each queue receives a fraction of the device + throughput proportional to its weight, even if the throughput + fluctuates, and regardless of: the device parameters, the current + workload and the budgets assigned to the queue. + + - The last, budget-independence, property (although probably + counterintuitive in the first place) is definitely beneficial, for + the following reasons: + + - First, with any proportional-share scheduler, the maximum + deviation with respect to an ideal service is proportional to + the maximum budget (slice) assigned to queues. As a consequence, + BFQ can keep this deviation tight not only because of the + accurate service of B-WF2Q+, but also because BFQ *does not* + need to assign a larger budget to a queue to let the queue + receive a higher fraction of the device throughput. 
+ + - Second, BFQ is free to choose, for every process (queue), the + budget that best fits the needs of the process, or best + leverages the I/O pattern of the process. In particular, BFQ + updates queue budgets with a simple feedback-loop algorithm that + allows a high throughput to be achieved, while still providing + tight latency guarantees to time-sensitive applications. When + the in-service queue expires, this algorithm computes the next + budget of the queue so as to: + + - Let large budgets be eventually assigned to the queues + associated with I/O-bound applications performing sequential + I/O: in fact, the longer these applications are served once + got access to the device, the higher the throughput is. + + - Let small budgets be eventually assigned to the queues + associated with time-sensitive applications (which typically + perform sporadic and short I/O), because, the smaller the + budget assigned to a queue waiting for service is, the sooner + B-WF2Q+ will serve that queue (Subsec 3.3 in [2]). + +- If several processes are competing for the device at the same time, + but all processes and groups have the same weight, then BFQ + guarantees the expected throughput distribution without ever idling + the device. It uses preemption instead. Throughput is then much + higher in this common scenario. + +- ioprio classes are served in strict priority order, i.e., + lower-priority queues are not served as long as there are + higher-priority queues. Among queues in the same class, the + bandwidth is distributed in proportion to the weight of each + queue. A very thin extra bandwidth is however guaranteed to + the Idle class, to prevent it from starving. + + +3. What are BFQ's tunables? +========================== + +The tunables back_seek_max, back_seek_penalty, fifo_expire_async and +fifo_expire_sync below are the same as in CFQ. Their description is +just copied from that for CFQ. Some considerations in the description +of slice_idle are copied from CFQ too. + +per-process ioprio and weight +----------------------------- + +Unless the cgroups interface is used, weights can be assigned to +processes only indirectly, through I/O priorities, and according to +the relation: weight = (IOPRIO_BE_NR - ioprio) * 10. For example, with +IOPRIO_BE_NR equal to 8, the default ioprio 4 maps to weight 40, while +ioprio 0 (the highest priority) maps to weight 80. + +slice_idle +---------- + +This parameter specifies how long BFQ should idle for the next I/O +request, when certain sync BFQ queues become empty. By default +slice_idle is a non-zero value. Idling has a double purpose: boosting +throughput and making sure that the desired throughput distribution is +respected (see the description of how BFQ works, and, if needed, the +papers referred to there). + +As for throughput, idling can be very helpful on highly seeky media +like single spindle SATA/SAS disks where we can cut down on overall +number of seeks and see improved throughput. + +Setting slice_idle to 0 will remove all the idling on queues and one +should see an overall improved throughput on faster storage devices +like multiple SATA/SAS disks in hardware RAID configuration. + +So depending on storage and workload, it might be useful to set +slice_idle=0. In general for SATA/SAS disks and software RAID of +SATA/SAS disks keeping slice_idle enabled should be useful. For any +configurations where there are multiple spindles behind single LUN +(Host based hardware RAID controller or for storage arrays), setting +slice_idle=0 might end up in better throughput and acceptable +latencies. 
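+
+As an illustration only (the following snippet is not part of this
+patch or of the kernel sources), the tunable can be changed from user
+space through sysfs. The sketch assumes that the disk is sda, that BFQ
+is its active scheduler (so the file is exposed under queue/iosched/),
+and that it is run as root:
+
+	#include <stdio.h>
+	#include <stdlib.h>
+
+	int main(int argc, char **argv)
+	{
+		const char *path =
+			"/sys/block/sda/queue/iosched/slice_idle";
+		FILE *f = fopen(path, "w");
+
+		if (!f) {
+			perror(path);
+			return EXIT_FAILURE;
+		}
+		/* write the value given on the command line, or 0
+		 * (never idle) if none is given */
+		fprintf(f, "%s\n", argc > 1 ? argv[1] : "0");
+		fclose(f);
+		return EXIT_SUCCESS;
+	}
+
+Compiled and run as, e.g., "./set-slice-idle 0", this performs the same
+write that "echo 0 > /sys/block/sda/queue/iosched/slice_idle" would.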
+ +Idling is however necessary to have service guarantees enforced in +case of differentiated weights or differentiated I/O-request lengths. +To see why, suppose that a given BFQ queue A must get several I/O +requests served for each request served for another queue B. Idling +ensures that, if A makes a new I/O request slightly after becoming +empty, then no request of B is dispatched in the middle, and thus A +does not lose the possibility to get more than one request dispatched +before the next request of B is dispatched. Note that idling +guarantees the desired differentiated treatment of queues only in +terms of I/O-request dispatches. To guarantee that the actual service +order then corresponds to the dispatch order, the strict_guarantees +tunable must be set too. + +There is an important flipside for idling: apart from the above cases +where it is beneficial also for throughput, idling can severely impact +throughput. One important case is random workload. Because of this +issue, BFQ tends to avoid idling as much as possible, when it is not +beneficial also for throughput. As a consequence of this behavior, and +of further issues described for the strict_guarantees tunable, +short-term service guarantees may be occasionally violated. And, in +some cases, these guarantees may be more important than guaranteeing +maximum throughput. For example, in video playing/streaming, a very +low drop rate may be more important than maximum throughput. In these +cases, consider setting the strict_guarantees parameter. + +strict_guarantees +----------------- + +If this parameter is set (default: unset), then BFQ + +- always performs idling when the in-service queue becomes empty; + +- forces the device to serve one I/O request at a time, by dispatching a + new request only if there is no outstanding request. + +In the presence of differentiated weights or I/O-request sizes, both +the above conditions are needed to guarantee that every BFQ queue +receives its allotted share of the bandwidth. The first condition is +needed for the reasons explained in the description of the slice_idle +tunable. The second condition is needed because all modern storage +devices reorder internally-queued requests, which may trivially break +the service guarantees enforced by the I/O scheduler. + +Setting strict_guarantees may evidently affect throughput. + +back_seek_max +------------- + +This specifies, given in Kbytes, the maximum "distance" for backward seeking. +The distance is the amount of space from the current head location to the +sectors that are backward in terms of distance. + +This parameter allows the scheduler to anticipate requests in the "backward" +direction and consider them as being the "next" if they are within this +distance from the current head location. + +back_seek_penalty +----------------- + +This parameter is used to compute the cost of backward seeking. If the +backward distance of request is just 1/back_seek_penalty from a "front" +request, then the seeking cost of two requests is considered equivalent. + +So scheduler will not bias toward one or the other request (otherwise scheduler +will bias toward front request). Default value of back_seek_penalty is 2. + +fifo_expire_async +----------------- + +This parameter is used to set the timeout of asynchronous requests. Default +value of this is 248ms. + +fifo_expire_sync +---------------- + +This parameter is used to set the timeout of synchronous requests. Default +value of this is 124ms. 
To favor synchronous requests over asynchronous +ones, decrease this value relative to fifo_expire_async. + +low_latency +----------- + +This parameter is used to enable/disable BFQ's low latency mode. By +default, low latency mode is enabled. If enabled, interactive and soft +real-time applications are privileged and experience a lower latency, +as explained in more detail in the description of how BFQ works. + +timeout_sync +------------ + +Maximum amount of device time that can be given to a task (queue) once +it has been selected for service. On devices with costly seeks, +increasing this time usually increases maximum throughput. On the +opposite end, increasing this time coarsens the granularity of the +short-term bandwidth and latency guarantees, especially if the +following parameter is set to zero. + +max_budget +---------- + +Maximum amount of service, measured in sectors, that can be provided +to a BFQ queue once it is set in service (of course within the limits +of the above timeout). According to what is said in the description of +the algorithm, larger values increase the throughput in proportion to +the percentage of sequential I/O requests issued. The price of larger +values is that they coarsen the granularity of short-term bandwidth +and latency guarantees. + +The default value is 0, which enables auto-tuning: BFQ sets max_budget +to the maximum number of sectors that can be served during +timeout_sync, according to the estimated peak rate. + +weights +------- + +Read-only parameter, used to show the weights of the currently active +BFQ queues. + + +wr_ tunables +------------ + +BFQ exports a few parameters to control/tune the behavior of +low-latency heuristics. + +wr_coeff + +Factor by which the weight of a weight-raised queue is multiplied. If +the queue is deemed soft real-time, then the weight is further +multiplied by an additional, constant factor. + +wr_max_time + +Maximum duration of a weight-raising period for an interactive task +(ms). If set to zero (default value), then this value is computed +automatically, as a function of the peak rate of the device. In any +case, when the value of this parameter is read, it always reports the +current duration, regardless of whether it has been set manually or +computed automatically. + +wr_max_softrt_rate + +Maximum service rate below which a queue is deemed to be associated +with a soft real-time application, and is then weight-raised +accordingly (sectors/sec). + +wr_min_idle_time + +Minimum idle period after which interactive weight-raising may be +reactivated for a queue (in ms). + +wr_rt_max_time + +Maximum weight-raising duration for soft real-time queues (in ms). The +start time from which this duration is considered is automatically +moved forward if the queue is detected to be still soft real-time +before the current soft real-time weight-raising period finishes. + +wr_min_inter_arr_async + +Minimum period between I/O request arrivals after which weight-raising +may be reactivated for an already busy async queue (in ms). + + +4. Group scheduling with BFQ +============================ + +BFQ supports both cgroup-v1 and cgroup-v2 io controllers, namely blkio +and io. In particular, BFQ supports weight-based proportional +share. + +4-1 Service guarantees provided +------------------------------- + +With BFQ, proportional share means true proportional share of the +device bandwidth, according to group weights. 
For example, a group +with weight 200 gets twice the bandwidth, and not just twice the time, +of a group with weight 100. + +BFQ supports hierarchies (group trees) of any depth. Bandwidth is +distributed among groups and processes in the expected way: for each +group, the children of the group share the whole bandwidth of the +group in proportion to their weights. In particular, this implies +that, for each leaf group, every process of the group receives the +same share of the whole group bandwidth, unless the ioprio of the +process is modified. + +The resource-sharing guarantee for a group may partially or totally +switch from bandwidth to time, if providing bandwidth guarantees to +the group lowers the throughput too much. This switch occurs on a +per-process basis: if a process of a leaf group causes a throughput loss +when served in such a way as to receive its share of the bandwidth, then +BFQ switches back to just time-based proportional share for that +process. + +4-2 Interface +------------- + +To get proportional sharing of bandwidth with BFQ for a given device, +BFQ must of course be the active scheduler for that device. + +Within each group directory, the names of the files associated with +BFQ-specific cgroup parameters and stats begin with the "bfq." +prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for +BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group +parameter to set the weight of a group with BFQ is blkio.bfq.weight +or io.bfq.weight. + +Parameters to set +----------------- + +For each group, there is only the following parameter to set. + +weight (namely blkio.bfq.weight or io.bfq.weight): the weight of the +group inside its parent. Available values: 1..10000 (default 100). The +linear mapping between ioprio and weights, described at the beginning +of the tunables section, is still valid, but all weights higher than +IOPRIO_BE_NR*10 are mapped to ioprio 0. + + +[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O + Scheduler", Proceedings of the First Workshop on Mobile System + Technologies (MST-2015), May 2015. + http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf + +[2] P. Valente and M. Andreolini, "Improving Application + Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of + the 5th Annual International Systems and Storage Conference + (SYSTOR '12), June 2012. + Slightly extended version: + http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite- + results.pdf diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 916e69c68fa4..6fc36027b70e 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -78,6 +78,17 @@ config MQ_IOSCHED_KYBER synchronous writes, it will self-tune queue depths to achieve that goal. +config IOSCHED_BFQ + tristate "BFQ I/O scheduler" + default n + ---help--- + BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth + of the device among all processes according to their weights, + regardless of the device parameters and with any workload. It + also guarantees a low latency to interactive and soft + real-time applications. 
Details in + Documentation/block/bfq-iosched.txt + endmenu endif diff --git a/block/Makefile b/block/Makefile index 6146d2eaaeaa..4c1d68cb49dd 100644 --- a/block/Makefile +++ b/block/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o +obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c new file mode 100644 index 000000000000..c4e7d8db796a --- /dev/null +++ b/block/bfq-iosched.c @@ -0,0 +1,4166 @@ +/* + * Budget Fair Queueing (BFQ) I/O scheduler. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2010 Paolo Valente + * Arianna Avanzini + * + * Copyright (C) 2017 Paolo Valente + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BFQ is a proportional-share I/O scheduler, with some extra + * low-latency capabilities. BFQ also supports full hierarchical + * scheduling through cgroups. Next paragraphs provide an introduction + * on BFQ inner workings. Details on BFQ benefits, usage and + * limitations can be found in Documentation/block/bfq-iosched.txt. + * + * BFQ is a proportional-share storage-I/O scheduling algorithm based + * on the slice-by-slice service scheme of CFQ. But BFQ assigns + * budgets, measured in number of sectors, to processes instead of + * time slices. The device is not granted to the in-service process + * for a given time slice, but until it has exhausted its assigned + * budget. This change from the time to the service domain enables BFQ + * to distribute the device throughput among processes as desired, + * without any distortion due to throughput fluctuations, or to device + * internal queueing. BFQ uses an ad hoc internal scheduler, called + * B-WF2Q+, to schedule processes according to their budgets. More + * precisely, BFQ schedules queues associated with processes. Each + * process/queue is assigned a user-configurable weight, and B-WF2Q+ + * guarantees that each queue receives a fraction of the throughput + * proportional to its weight. Thanks to the accurate policy of + * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound + * processes issuing sequential requests (to boost the throughput), + * and yet guarantee a low latency to interactive and soft real-time + * applications. + * + * In particular, to provide these low-latency guarantees, BFQ + * explicitly privileges the I/O of two classes of time-sensitive + * applications: interactive and soft real-time. This feature enables + * BFQ to provide applications in these classes with a very low + * latency. 
Finally, BFQ also features additional heuristics for + * preserving both a low latency and a high throughput on NCQ-capable, + * rotational or flash-based devices, and to get the job done quickly + * for applications consisting in many I/O-bound processes. + * + * BFQ is described in [1], where also a reference to the initial, more + * theoretical paper on BFQ can be found. The interested reader can find + * in the latter paper full details on the main algorithm, as well as + * formulas of the guarantees and formal proofs of all the properties. + * With respect to the version of BFQ presented in these papers, this + * implementation adds a few more heuristics, such as the one that + * guarantees a low latency to soft real-time applications, and a + * hierarchical extension based on H-WF2Q+. + * + * B-WF2Q+ is based on WF2Q+, which is described in [2], together with + * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+ + * with O(log N) complexity derives from the one introduced with EEVDF + * in [3]. + * + * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O + * Scheduler", Proceedings of the First Workshop on Mobile System + * Technologies (MST-2015), May 2015. + * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf + * + * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing + * Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689, + * Oct 1997. + * + * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz + * + * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline + * First: A Flexible and Accurate Mechanism for Proportional Share + * Resource Allocation", technical report. + * + * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-tag.h" +#include "blk-mq-sched.h" +#include +#include +#include + +#define BFQ_IOPRIO_CLASSES 3 +#define BFQ_CL_IDLE_TIMEOUT (HZ/5) + +#define BFQ_MIN_WEIGHT 1 +#define BFQ_MAX_WEIGHT 1000 +#define BFQ_WEIGHT_CONVERSION_COEFF 10 + +#define BFQ_DEFAULT_QUEUE_IOPRIO 4 + +#define BFQ_DEFAULT_GRP_WEIGHT 10 +#define BFQ_DEFAULT_GRP_IOPRIO 0 +#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE + +struct bfq_entity; + +/** + * struct bfq_service_tree - per ioprio_class service tree. + * + * Each service tree represents a B-WF2Q+ scheduler on its own. Each + * ioprio_class has its own independent scheduler, and so its own + * bfq_service_tree. All the fields are protected by the queue lock + * of the containing bfqd. + */ +struct bfq_service_tree { + /* tree for active entities (i.e., those backlogged) */ + struct rb_root active; + /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ + struct rb_root idle; + + /* idle entity with minimum F_i */ + struct bfq_entity *first_idle; + /* idle entity with maximum F_i */ + struct bfq_entity *last_idle; + + /* scheduler virtual time */ + u64 vtime; + /* scheduler weight sum; active and idle entities contribute to it */ + unsigned long wsum; +}; + +/** + * struct bfq_sched_data - multi-class scheduler. + * + * bfq_sched_data is the basic scheduler queue. It supports three + * ioprio_classes, and can be used either as a toplevel queue or as + * an intermediate queue on a hierarchical setup. + * @next_in_service points to the active entity of the sched_data + * service trees that will be scheduled next. 
+ * + * The supported ioprio_classes are the same as in CFQ, in descending + * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. + * Requests from higher priority queues are served before all the + * requests from lower priority queues; among requests of the same + * queue requests are served according to B-WF2Q+. + * All the fields are protected by the queue lock of the containing bfqd. + */ +struct bfq_sched_data { + /* entity in service */ + struct bfq_entity *in_service_entity; + /* head-of-the-line entity in the scheduler */ + struct bfq_entity *next_in_service; + /* array of service trees, one per ioprio_class */ + struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; +}; + +/** + * struct bfq_entity - schedulable entity. + * + * A bfq_entity is used to represent a bfq_queue (leaf node in the upper + * level scheduler). Each entity belongs to the sched_data of the parent + * group hierarchy. Non-leaf entities have also their own sched_data, + * stored in @my_sched_data. + * + * Each entity stores independently its priority values; this would + * allow different weights on different devices, but this + * functionality is not exported to userspace by now. Priorities and + * weights are updated lazily, first storing the new values into the + * new_* fields, then setting the @prio_changed flag. As soon as + * there is a transition in the entity state that allows the priority + * update to take place the effective and the requested priority + * values are synchronized. + * + * The weight value is calculated from the ioprio to export the same + * interface as CFQ. When dealing with ``well-behaved'' queues (i.e., + * queues that do not spend too much time to consume their budget + * and have true sequential behavior, and when there are no external + * factors breaking anticipation) the relative weights at each level + * of the hierarchy should be guaranteed. All the fields are + * protected by the queue lock of the containing bfqd. + */ +struct bfq_entity { + /* service_tree member */ + struct rb_node rb_node; + + /* + * flag, true if the entity is on a tree (either the active or + * the idle one of its service_tree). + */ + int on_st; + + /* B-WF2Q+ start and finish timestamps [sectors/weight] */ + u64 start, finish; + + /* tree the entity is enqueued into; %NULL if not on a tree */ + struct rb_root *tree; + + /* + * minimum start time of the (active) subtree rooted at this + * entity; used for O(log N) lookups into active trees + */ + u64 min_start; + + /* amount of service received during the last service slot */ + int service; + + /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ + int budget; + + /* weight of the queue */ + int weight; + /* next weight if a change is in progress */ + int new_weight; + + /* original weight, used to implement weight boosting */ + int orig_weight; + + /* parent entity, for hierarchical scheduling */ + struct bfq_entity *parent; + + /* + * For non-leaf nodes in the hierarchy, the associated + * scheduler queue, %NULL on leaf nodes. + */ + struct bfq_sched_data *my_sched_data; + /* the scheduler queue this entity belongs to */ + struct bfq_sched_data *sched_data; + + /* flag, set to request a weight, ioprio or ioprio_class change */ + int prio_changed; +}; + +/** + * struct bfq_ttime - per process thinktime stats. 
+ */ +struct bfq_ttime { + /* completion time of the last request */ + u64 last_end_request; + + /* total process thinktime */ + u64 ttime_total; + /* number of thinktime samples */ + unsigned long ttime_samples; + /* average process thinktime */ + u64 ttime_mean; +}; + +/** + * struct bfq_queue - leaf schedulable entity. + * + * A bfq_queue is a leaf request queue; it can be associated with an + * io_context or more, if it is async. + */ +struct bfq_queue { + /* reference counter */ + int ref; + /* parent bfq_data */ + struct bfq_data *bfqd; + + /* current ioprio and ioprio class */ + unsigned short ioprio, ioprio_class; + /* next ioprio and ioprio class if a change is in progress */ + unsigned short new_ioprio, new_ioprio_class; + + /* sorted list of pending requests */ + struct rb_root sort_list; + /* if fifo isn't expired, next request to serve */ + struct request *next_rq; + /* number of sync and async requests queued */ + int queued[2]; + /* number of requests currently allocated */ + int allocated; + /* number of pending metadata requests */ + int meta_pending; + /* fifo list of requests in sort_list */ + struct list_head fifo; + + /* entity representing this queue in the scheduler */ + struct bfq_entity entity; + + /* maximum budget allowed from the feedback mechanism */ + int max_budget; + /* budget expiration (in jiffies) */ + unsigned long budget_timeout; + + /* number of requests on the dispatch list or inside driver */ + int dispatched; + + /* status flags */ + unsigned long flags; + + /* node for active/idle bfqq list inside parent bfqd */ + struct list_head bfqq_list; + + /* associated @bfq_ttime struct */ + struct bfq_ttime ttime; + + /* bit vector: a 1 for each seeky requests in history */ + u32 seek_history; + /* position of the last request enqueued */ + sector_t last_request_pos; + + /* Number of consecutive pairs of request completion and + * arrival, such that the queue becomes idle after the + * completion, but the next request arrives within an idle + * time slice; used only if the queue's IO_bound flag has been + * cleared. + */ + unsigned int requests_within_timer; + + /* pid of the process owning the queue, used for logging purposes */ + pid_t pid; +}; + +/** + * struct bfq_io_cq - per (request_queue, io_context) structure. + */ +struct bfq_io_cq { + /* associated io_cq structure */ + struct io_cq icq; /* must be the first member */ + /* array of two process queues, the sync and the async */ + struct bfq_queue *bfqq[2]; + /* per (request_queue, blkcg) ioprio */ + int ioprio; +}; + +/** + * struct bfq_data - per-device data structure. + * + * All the fields are protected by @lock. + */ +struct bfq_data { + /* device request queue */ + struct request_queue *queue; + /* dispatch queue */ + struct list_head dispatch; + + /* root @bfq_sched_data for the device */ + struct bfq_sched_data sched_data; + + /* + * Number of bfq_queues containing requests (including the + * queue in service, even if it is idling). + */ + int busy_queues; + /* number of queued requests */ + int queued; + /* number of requests dispatched and waiting for completion */ + int rq_in_driver; + + /* + * Maximum number of requests in driver in the last + * @hw_tag_samples completed requests. 
+ */ + int max_rq_in_driver; + /* number of samples used to calculate hw_tag */ + int hw_tag_samples; + /* flag set to one if the driver is showing a queueing behavior */ + int hw_tag; + + /* number of budgets assigned */ + int budgets_assigned; + + /* + * Timer set when idling (waiting) for the next request from + * the queue in service. + */ + struct hrtimer idle_slice_timer; + + /* bfq_queue in service */ + struct bfq_queue *in_service_queue; + /* bfq_io_cq (bic) associated with the @in_service_queue */ + struct bfq_io_cq *in_service_bic; + + /* on-disk position of the last served request */ + sector_t last_position; + + /* beginning of the last budget */ + ktime_t last_budget_start; + /* beginning of the last idle slice */ + ktime_t last_idling_start; + /* number of samples used to calculate @peak_rate */ + int peak_rate_samples; + /* + * Peak read/write rate, observed during the service of a + * budget [BFQ_RATE_SHIFT * sectors/usec]. The value is + * left-shifted by BFQ_RATE_SHIFT to increase precision in + * fixed-point calculations. + */ + u64 peak_rate; + /* maximum budget allotted to a bfq_queue before rescheduling */ + int bfq_max_budget; + + /* list of all the bfq_queues active on the device */ + struct list_head active_list; + /* list of all the bfq_queues idle on the device */ + struct list_head idle_list; + + /* + * Timeout for async/sync requests; when it fires, requests + * are served in fifo order. + */ + u64 bfq_fifo_expire[2]; + /* weight of backward seeks wrt forward ones */ + unsigned int bfq_back_penalty; + /* maximum allowed backward seek */ + unsigned int bfq_back_max; + /* maximum idling time */ + u32 bfq_slice_idle; + /* last time CLASS_IDLE was served */ + u64 bfq_class_idle_last_service; + + /* user-configured max budget value (0 for auto-tuning) */ + int bfq_user_max_budget; + /* + * Timeout for bfq_queues to consume their budget; used to + * prevent seeky queues from imposing long latencies to + * sequential or quasi-sequential ones (this also implies that + * seeky queues cannot receive guarantees in the service + * domain; after a timeout they are charged for the time they + * have been in service, to preserve fairness among them, but + * without service-domain guarantees). + */ + unsigned int bfq_timeout; + + /* + * Number of consecutive requests that must be issued within + * the idle time slice to set again idling to a queue which + * was marked as non-I/O-bound (see the definition of the + * IO_bound flag for further details). + */ + unsigned int bfq_requests_within_timer; + + /* + * Force device idling whenever needed to provide accurate + * service guarantees, without caring about throughput + * issues. CAVEAT: this may even increase latencies, in case + * of useless idling for processes that did stop doing I/O. + */ + bool strict_guarantees; + + /* fallback dummy bfqq for extreme OOM conditions */ + struct bfq_queue oom_bfqq; + + spinlock_t lock; + + /* + * bic associated with the task issuing current bio for + * merging. This and the next field are used as a support to + * be able to perform the bic lookup, needed by bio-merge + * functions, before the scheduler lock is taken, and thus + * avoid taking the request-queue lock while the scheduler + * lock is being held. 
+ */ + struct bfq_io_cq *bio_bic; + /* bfqq associated with the task issuing current bio for merging */ + struct bfq_queue *bio_bfqq; +}; + +enum bfqq_state_flags { + BFQQF_busy = 0, /* has requests or is in service */ + BFQQF_wait_request, /* waiting for a request */ + BFQQF_non_blocking_wait_rq, /* + * waiting for a request + * without idling the device + */ + BFQQF_fifo_expire, /* FIFO checked in this slice */ + BFQQF_idle_window, /* slice idling enabled */ + BFQQF_sync, /* synchronous queue */ + BFQQF_budget_new, /* no completion with this budget */ + BFQQF_IO_bound, /* + * bfqq has timed-out at least once + * having consumed at most 2/10 of + * its budget + */ +}; + +#define BFQ_BFQQ_FNS(name) \ +static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ +{ \ + __set_bit(BFQQF_##name, &(bfqq)->flags); \ +} \ +static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ +{ \ + __clear_bit(BFQQF_##name, &(bfqq)->flags); \ +} \ +static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ +{ \ + return test_bit(BFQQF_##name, &(bfqq)->flags); \ +} + +BFQ_BFQQ_FNS(busy); +BFQ_BFQQ_FNS(wait_request); +BFQ_BFQQ_FNS(non_blocking_wait_rq); +BFQ_BFQQ_FNS(fifo_expire); +BFQ_BFQQ_FNS(idle_window); +BFQ_BFQQ_FNS(sync); +BFQ_BFQQ_FNS(budget_new); +BFQ_BFQQ_FNS(IO_bound); +#undef BFQ_BFQQ_FNS + +/* Logging facilities. */ +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) + +#define bfq_log(bfqd, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) + +/* Expiration reasons. */ +enum bfqq_expiration { + BFQQE_TOO_IDLE = 0, /* + * queue has been idling for + * too long + */ + BFQQE_BUDGET_TIMEOUT, /* budget took too long to be used */ + BFQQE_BUDGET_EXHAUSTED, /* budget consumed */ + BFQQE_NO_MORE_REQUESTS, /* the queue has no more requests */ + BFQQE_PREEMPTED /* preemption in progress */ +}; + +static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); + +static struct bfq_service_tree * +bfq_entity_service_tree(struct bfq_entity *entity) +{ + struct bfq_sched_data *sched_data = entity->sched_data; + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + unsigned int idx = bfqq ? bfqq->ioprio_class - 1 : + BFQ_DEFAULT_GRP_CLASS - 1; + + return sched_data->service_tree + idx; +} + +static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) +{ + return bic->bfqq[is_sync]; +} + +static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, + bool is_sync) +{ + bic->bfqq[is_sync] = bfqq; +} + +static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) +{ + return bic->icq.q->elevator->elevator_data; +} + +static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); +static void bfq_put_queue(struct bfq_queue *bfqq); +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bio *bio, bool is_sync, + struct bfq_io_cq *bic); +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); + +/* + * Array of async queues for all the processes, one queue + * per ioprio value per ioprio_class. + */ +struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; +/* Async queue for the idle class (ioprio is ignored) */ +struct bfq_queue *async_idle_bfqq; + +/* Expiration time of sync (0) and async (1) requests, in ns. */ +static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; + +/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */ +static const int bfq_back_max = 16 * 1024; + +/* Penalty of a backwards seek, in number of sectors. 
*/ +static const int bfq_back_penalty = 2; + +/* Idling period duration, in ns. */ +static u64 bfq_slice_idle = NSEC_PER_SEC / 125; + +/* Minimum number of assigned budgets for which stats are safe to compute. */ +static const int bfq_stats_min_budgets = 194; + +/* Default maximum budget values, in sectors and number of requests. */ +static const int bfq_default_max_budget = 16 * 1024; + +/* Default timeout values, in jiffies, approximating CFQ defaults. */ +static const int bfq_timeout = HZ / 8; + +static struct kmem_cache *bfq_pool; + +/* Below this threshold (in ms), we consider thinktime immediate. */ +#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) + +/* hw_tag detection: parallel requests threshold and min samples needed. */ +#define BFQ_HW_QUEUE_THRESHOLD 4 +#define BFQ_HW_QUEUE_SAMPLES 32 + +#define BFQQ_SEEK_THR (sector_t)(8 * 100) +#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) +#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) +#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) + +/* Budget feedback step. */ +#define BFQ_BUDGET_STEP 128 + +/* Min samples used for peak rate estimation (for autotuning). */ +#define BFQ_PEAK_RATE_SAMPLES 32 + +/* Shift used for peak rate fixed precision calculations. */ +#define BFQ_RATE_SHIFT 16 + +#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ + { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) + +#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) +#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) + +/** + * icq_to_bic - convert iocontext queue structure to bfq_io_cq. + * @icq: the iocontext queue. + */ +static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) +{ + /* bic->icq is the first member, %NULL will convert to %NULL */ + return container_of(icq, struct bfq_io_cq, icq); +} + +/** + * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. + * @bfqd: the lookup key. + * @ioc: the io_context of the process doing I/O. + * @q: the request queue. + */ +static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, + struct io_context *ioc, + struct request_queue *q) +{ + if (ioc) { + unsigned long flags; + struct bfq_io_cq *icq; + + spin_lock_irqsave(q->queue_lock, flags); + icq = icq_to_bic(ioc_lookup_icq(ioc, q)); + spin_unlock_irqrestore(q->queue_lock, flags); + + return icq; + } + + return NULL; +} + +/* + * Next two macros are just fake loops for the moment. They will + * become true loops in the cgroups-enabled variant of the code. Such + * a variant, in its turn, will be introduced by next commit. + */ +#define for_each_entity(entity) \ + for (; entity ; entity = NULL) + +#define for_each_entity_safe(entity, parent) \ + for (parent = NULL; entity ; entity = parent) + +static int bfq_update_next_in_service(struct bfq_sched_data *sd) +{ + return 0; +} + +static void bfq_check_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *entity) +{ +} + +static void bfq_update_budget(struct bfq_entity *next_in_service) +{ +} + +/* + * Shift for timestamp calculations. This actually limits the maximum + * service allowed in one timestamp delta (small shift values increase it), + * the maximum total weight that can be used for the queues in the system + * (big shift values increase it), and the period of virtual time + * wraparounds. + */ +#define WFQ_SERVICE_SHIFT 22 + +/** + * bfq_gt - compare two timestamps. + * @a: first ts. + * @b: second ts. + * + * Return @a > @b, dealing with wrapping correctly. 
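+ * (The difference a - b is computed in unsigned 64-bit arithmetic and
+ * then interpreted as a signed value, so the comparison stays correct
+ * across timestamp wraparound as long as the two timestamps are less
+ * than 2^63 apart.)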
+ */ +static int bfq_gt(u64 a, u64 b) +{ + return (s64)(a - b) > 0; +} + +static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = NULL; + + if (!entity->my_sched_data) + bfqq = container_of(entity, struct bfq_queue, entity); + + return bfqq; +} + + +/** + * bfq_delta - map service into the virtual time domain. + * @service: amount of service. + * @weight: scale factor (weight of an entity or weight sum). + */ +static u64 bfq_delta(unsigned long service, unsigned long weight) +{ + u64 d = (u64)service << WFQ_SERVICE_SHIFT; + + do_div(d, weight); + return d; +} + +/** + * bfq_calc_finish - assign the finish time to an entity. + * @entity: the entity to act upon. + * @service: the service to be charged to the entity. + */ +static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + entity->finish = entity->start + + bfq_delta(service, entity->weight); + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_finish: serv %lu, w %d", + service, entity->weight); + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_finish: start %llu, finish %llu, delta %llu", + entity->start, entity->finish, + bfq_delta(service, entity->weight)); + } +} + +/** + * bfq_entity_of - get an entity from a node. + * @node: the node field of the entity. + * + * Convert a node pointer to the relative entity. This is used only + * to simplify the logic of some functions and not as the generic + * conversion mechanism because, e.g., in the tree walking functions, + * the check for a %NULL value would be redundant. + */ +static struct bfq_entity *bfq_entity_of(struct rb_node *node) +{ + struct bfq_entity *entity = NULL; + + if (node) + entity = rb_entry(node, struct bfq_entity, rb_node); + + return entity; +} + +/** + * bfq_extract - remove an entity from a tree. + * @root: the tree root. + * @entity: the entity to remove. + */ +static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) +{ + entity->tree = NULL; + rb_erase(&entity->rb_node, root); +} + +/** + * bfq_idle_extract - extract an entity from the idle tree. + * @st: the service tree of the owning @entity. + * @entity: the entity being removed. + */ +static void bfq_idle_extract(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *next; + + if (entity == st->first_idle) { + next = rb_next(&entity->rb_node); + st->first_idle = bfq_entity_of(next); + } + + if (entity == st->last_idle) { + next = rb_prev(&entity->rb_node); + st->last_idle = bfq_entity_of(next); + } + + bfq_extract(&st->idle, entity); + + if (bfqq) + list_del(&bfqq->bfqq_list); +} + +/** + * bfq_insert - generic tree insertion. + * @root: tree root. + * @entity: entity to insert. + * + * This is used for the idle and the active tree, since they are both + * ordered by finish time. + */ +static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) +{ + struct bfq_entity *entry; + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + + while (*node) { + parent = *node; + entry = rb_entry(parent, struct bfq_entity, rb_node); + + if (bfq_gt(entry->finish, entity->finish)) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + + rb_link_node(&entity->rb_node, parent, node); + rb_insert_color(&entity->rb_node, root); + + entity->tree = root; +} + +/** + * bfq_update_min - update the min_start field of a entity. + * @entity: the entity to update. 
+ * @node: one of its children. + * + * This function is called when @entity may store an invalid value for + * min_start due to updates to the active tree. The function assumes + * that the subtree rooted at @node (which may be its left or its right + * child) has a valid min_start value. + */ +static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) +{ + struct bfq_entity *child; + + if (node) { + child = rb_entry(node, struct bfq_entity, rb_node); + if (bfq_gt(entity->min_start, child->min_start)) + entity->min_start = child->min_start; + } +} + +/** + * bfq_update_active_node - recalculate min_start. + * @node: the node to update. + * + * @node may have changed position or one of its children may have moved, + * this function updates its min_start value. The left and right subtrees + * are assumed to hold a correct min_start value. + */ +static void bfq_update_active_node(struct rb_node *node) +{ + struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); + + entity->min_start = entity->start; + bfq_update_min(entity, node->rb_right); + bfq_update_min(entity, node->rb_left); +} + +/** + * bfq_update_active_tree - update min_start for the whole active tree. + * @node: the starting node. + * + * @node must be the deepest modified node after an update. This function + * updates its min_start using the values held by its children, assuming + * that they did not change, and then updates all the nodes that may have + * changed in the path to the root. The only nodes that may have changed + * are the ones in the path or their siblings. + */ +static void bfq_update_active_tree(struct rb_node *node) +{ + struct rb_node *parent; + +up: + bfq_update_active_node(node); + + parent = rb_parent(node); + if (!parent) + return; + + if (node == parent->rb_left && parent->rb_right) + bfq_update_active_node(parent->rb_right); + else if (parent->rb_left) + bfq_update_active_node(parent->rb_left); + + node = parent; + goto up; +} + +/** + * bfq_active_insert - insert an entity in the active tree of its + * group/device. + * @st: the service tree of the entity. + * @entity: the entity being inserted. + * + * The active tree is ordered by finish time, but an extra key is kept + * per each node, containing the minimum value for the start times of + * its children (and the node itself), so it's possible to search for + * the eligible node with the lowest finish time in logarithmic time. + */ +static void bfq_active_insert(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node = &entity->rb_node; + + bfq_insert(&st->active, entity); + + if (node->rb_left) + node = node->rb_left; + else if (node->rb_right) + node = node->rb_right; + + bfq_update_active_tree(node); + + if (bfqq) + list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); +} + +/** + * bfq_ioprio_to_weight - calc a weight from an ioprio. + * @ioprio: the ioprio value to convert. + */ +static unsigned short bfq_ioprio_to_weight(int ioprio) +{ + return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; +} + +/** + * bfq_weight_to_ioprio - calc an ioprio from a weight. + * @weight: the weight value to convert. + * + * To preserve as much as possible the old only-ioprio user interface, + * 0 is used as an escape ioprio value for weights (numerically) equal or + * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. 
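+ *
+ * For instance, assuming IOPRIO_BE_NR == 8 and
+ * BFQ_WEIGHT_CONVERSION_COEFF == 10, bfq_ioprio_to_weight() maps
+ * ioprio 0 (highest priority) to weight 80 and ioprio 7 to weight 10,
+ * while here any weight of 80 or more maps back to the escape value 0.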
+ */ +static unsigned short bfq_weight_to_ioprio(int weight) +{ + return max_t(int, 0, + IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight); +} + +static void bfq_get_entity(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + if (bfqq) { + bfqq->ref++; + bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", + bfqq, bfqq->ref); + } +} + +/** + * bfq_find_deepest - find the deepest node that an extraction can modify. + * @node: the node being removed. + * + * Do the first step of an extraction in an rb tree, looking for the + * node that will replace @node, and returning the deepest node that + * the following modifications to the tree can touch. If @node is the + * last node in the tree return %NULL. + */ +static struct rb_node *bfq_find_deepest(struct rb_node *node) +{ + struct rb_node *deepest; + + if (!node->rb_right && !node->rb_left) + deepest = rb_parent(node); + else if (!node->rb_right) + deepest = node->rb_left; + else if (!node->rb_left) + deepest = node->rb_right; + else { + deepest = rb_next(node); + if (deepest->rb_right) + deepest = deepest->rb_right; + else if (rb_parent(deepest) != node) + deepest = rb_parent(deepest); + } + + return deepest; +} + +/** + * bfq_active_extract - remove an entity from the active tree. + * @st: the service_tree containing the tree. + * @entity: the entity being removed. + */ +static void bfq_active_extract(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node; + + node = bfq_find_deepest(&entity->rb_node); + bfq_extract(&st->active, entity); + + if (node) + bfq_update_active_tree(node); + + if (bfqq) + list_del(&bfqq->bfqq_list); +} + +/** + * bfq_idle_insert - insert an entity into the idle tree. + * @st: the service tree containing the tree. + * @entity: the entity to insert. + */ +static void bfq_idle_insert(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct bfq_entity *first_idle = st->first_idle; + struct bfq_entity *last_idle = st->last_idle; + + if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) + st->first_idle = entity; + if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) + st->last_idle = entity; + + bfq_insert(&st->idle, entity); + + if (bfqq) + list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); +} + +/** + * bfq_forget_entity - do not consider entity any longer for scheduling + * @st: the service tree. + * @entity: the entity being removed. + * @is_in_service: true if entity is currently the in-service entity. + * + * Forget everything about @entity. In addition, if entity represents + * a queue, and the latter is not in service, then release the service + * reference to the queue (the one taken through bfq_get_entity). In + * fact, in this case, there is really no more service reference to + * the queue, as the latter is also outside any service tree. If, + * instead, the queue is in service, then __bfq_bfqd_reset_in_service + * will take care of putting the reference when the queue finally + * stops being served. + */ +static void bfq_forget_entity(struct bfq_service_tree *st, + struct bfq_entity *entity, + bool is_in_service) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + entity->on_st = 0; + st->wsum -= entity->weight; + if (bfqq && !is_in_service) + bfq_put_queue(bfqq); +} + +/** + * bfq_put_idle_entity - release the idle tree ref of an entity. + * @st: service tree for the entity. 
+ * @entity: the entity being released. + */ +static void bfq_put_idle_entity(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + bfq_idle_extract(st, entity); + bfq_forget_entity(st, entity, + entity == entity->sched_data->in_service_entity); +} + +/** + * bfq_forget_idle - update the idle tree if necessary. + * @st: the service tree to act upon. + * + * To preserve the global O(log N) complexity we only remove one entry here; + * as the idle tree will not grow indefinitely this can be done safely. + */ +static void bfq_forget_idle(struct bfq_service_tree *st) +{ + struct bfq_entity *first_idle = st->first_idle; + struct bfq_entity *last_idle = st->last_idle; + + if (RB_EMPTY_ROOT(&st->active) && last_idle && + !bfq_gt(last_idle->finish, st->vtime)) { + /* + * Forget the whole idle tree, increasing the vtime past + * the last finish time of idle entities. + */ + st->vtime = last_idle->finish; + } + + if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) + bfq_put_idle_entity(st, first_idle); +} + +static struct bfq_service_tree * +__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, + struct bfq_entity *entity) +{ + struct bfq_service_tree *new_st = old_st; + + if (entity->prio_changed) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + unsigned short prev_weight, new_weight; + struct bfq_data *bfqd = NULL; + + if (bfqq) + bfqd = bfqq->bfqd; + + old_st->wsum -= entity->weight; + + if (entity->new_weight != entity->orig_weight) { + if (entity->new_weight < BFQ_MIN_WEIGHT || + entity->new_weight > BFQ_MAX_WEIGHT) { + pr_crit("update_weight_prio: new_weight %d\n", + entity->new_weight); + if (entity->new_weight < BFQ_MIN_WEIGHT) + entity->new_weight = BFQ_MIN_WEIGHT; + else + entity->new_weight = BFQ_MAX_WEIGHT; + } + entity->orig_weight = entity->new_weight; + if (bfqq) + bfqq->ioprio = + bfq_weight_to_ioprio(entity->orig_weight); + } + + if (bfqq) + bfqq->ioprio_class = bfqq->new_ioprio_class; + entity->prio_changed = 0; + + /* + * NOTE: here we may be changing the weight too early, + * this will cause unfairness. The correct approach + * would have required additional complexity to defer + * weight changes to the proper time instants (i.e., + * when entity->finish <= old_st->vtime). + */ + new_st = bfq_entity_service_tree(entity); + + prev_weight = entity->weight; + new_weight = entity->orig_weight; + entity->weight = new_weight; + + new_st->wsum += entity->weight; + + if (new_st != old_st) + entity->start = new_st->vtime; + } + + return new_st; +} + +/** + * bfq_bfqq_served - update the scheduler status after selection for + * service. + * @bfqq: the queue being served. + * @served: bytes to transfer. + * + * NOTE: this can be optimized, as the timestamps of upper level entities + * are synchronized every time a new bfqq is selected for service. By now, + * we keep it to better check consistency. + */ +static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) +{ + struct bfq_entity *entity = &bfqq->entity; + struct bfq_service_tree *st; + + for_each_entity(entity) { + st = bfq_entity_service_tree(entity); + + entity->service += served; + + st->vtime += bfq_delta(served, st->wsum); + bfq_forget_idle(st); + } + bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); +} + +/** + * bfq_bfqq_charge_full_budget - set the service to the entity budget. + * @bfqq: the queue that needs a service update. 
+ * + * When it's not possible to be fair in the service domain, because + * a queue is not consuming its budget fast enough (the meaning of + * fast depends on the timeout parameter), we charge it a full + * budget. In this way we should obtain a sort of time-domain + * fairness among all the seeky/slow queues. + */ +static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); + + bfq_bfqq_served(bfqq, entity->budget - entity->service); +} + +/** + * __bfq_activate_entity - activate an entity. + * @entity: the entity being activated. + * @non_blocking_wait_rq: true if this entity was waiting for a request + * + * Called whenever an entity is activated, i.e., it is not active and one + * of its children receives a new request, or has to be reactivated due to + * budget exhaustion. It uses the current budget of the entity (and the + * service received if @entity is active) of the queue to calculate its + * timestamps. + */ +static void __bfq_activate_entity(struct bfq_entity *entity, + bool non_blocking_wait_rq) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + bool backshifted = false; + + if (entity == sd->in_service_entity) { + /* + * If we are requeueing the current entity we have + * to take care of not charging to it service it has + * not received. + */ + bfq_calc_finish(entity, entity->service); + entity->start = entity->finish; + sd->in_service_entity = NULL; + } else if (entity->tree == &st->active) { + /* + * Requeueing an entity due to a change of some + * next_in_service entity below it. We reuse the + * old start time. + */ + bfq_active_extract(st, entity); + } else { + unsigned long long min_vstart; + + /* See comments on bfq_fqq_update_budg_for_activation */ + if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { + backshifted = true; + min_vstart = entity->finish; + } else + min_vstart = st->vtime; + + if (entity->tree == &st->idle) { + /* + * Must be on the idle tree, bfq_idle_extract() will + * check for that. + */ + bfq_idle_extract(st, entity); + entity->start = bfq_gt(min_vstart, entity->finish) ? + min_vstart : entity->finish; + } else { + /* + * The finish time of the entity may be invalid, and + * it is in the past for sure, otherwise the queue + * would have been on the idle tree. + */ + entity->start = min_vstart; + st->wsum += entity->weight; + /* + * entity is about to be inserted into a service tree, + * and then set in service: get a reference to make + * sure entity does not disappear until it is no + * longer in service or scheduled for service. + */ + bfq_get_entity(entity); + + entity->on_st = 1; + } + } + + st = __bfq_entity_update_weight_prio(st, entity); + bfq_calc_finish(entity, entity->budget); + + /* + * If some queues enjoy backshifting for a while, then their + * (virtual) finish timestamps may happen to become lower and + * lower than the system virtual time. In particular, if + * these queues often happen to be idle for short time + * periods, and during such time periods other queues with + * higher timestamps happen to be busy, then the backshifted + * timestamps of the former queues can become much lower than + * the system virtual time. 
In fact, to serve the queues with + * higher timestamps while the ones with lower timestamps are + * idle, the system virtual time may be pushed-up to much + * higher values than the finish timestamps of the idle + * queues. As a consequence, the finish timestamps of all new + * or newly activated queues may end up being much larger than + * those of lucky queues with backshifted timestamps. The + * latter queues may then monopolize the device for a lot of + * time. This would simply break service guarantees. + * + * To reduce this problem, push up a little bit the + * backshifted timestamps of the queue associated with this + * entity (only a queue can happen to have the backshifted + * flag set): just enough to let the finish timestamp of the + * queue be equal to the current value of the system virtual + * time. This may introduce a little unfairness among queues + * with backshifted timestamps, but it does not break + * worst-case fairness guarantees. + */ + if (backshifted && bfq_gt(st->vtime, entity->finish)) { + unsigned long delta = st->vtime - entity->finish; + + entity->start += delta; + entity->finish += delta; + } + + bfq_active_insert(st, entity); +} + +/** + * bfq_activate_entity - activate an entity and its ancestors if necessary. + * @entity: the entity to activate. + * @non_blocking_wait_rq: true if this entity was waiting for a request + * + * Activate @entity and all the entities on the path from it to the root. + */ +static void bfq_activate_entity(struct bfq_entity *entity, + bool non_blocking_wait_rq) +{ + struct bfq_sched_data *sd; + + for_each_entity(entity) { + __bfq_activate_entity(entity, non_blocking_wait_rq); + + sd = entity->sched_data; + if (!bfq_update_next_in_service(sd)) + /* + * No need to propagate the activation to the + * upper entities, as they will be updated when + * the in-service entity is rescheduled. + */ + break; + } +} + +/** + * __bfq_deactivate_entity - deactivate an entity from its service tree. + * @entity: the entity to deactivate. + * @requeue: if false, the entity will not be put into the idle tree. + * + * Deactivate an entity, independently from its previous state. If the + * entity was not on a service tree just return, otherwise if it is on + * any scheduler tree, extract it from that tree, and if necessary + * and if the caller did not specify @requeue, put it on the idle tree. + * + * Return %1 if the caller should update the entity hierarchy, i.e., + * if the entity was in service or if it was the next_in_service for + * its sched_data; return %0 otherwise. + */ +static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + int is_in_service = entity == sd->in_service_entity; + int ret = 0; + + if (!entity->on_st) + return 0; + + if (is_in_service) { + bfq_calc_finish(entity, entity->service); + sd->in_service_entity = NULL; + } else if (entity->tree == &st->active) + bfq_active_extract(st, entity); + else if (entity->tree == &st->idle) + bfq_idle_extract(st, entity); + + if (is_in_service || sd->next_in_service == entity) + ret = bfq_update_next_in_service(sd); + + if (!requeue || !bfq_gt(entity->finish, st->vtime)) + bfq_forget_entity(st, entity, is_in_service); + else + bfq_idle_insert(st, entity); + + return ret; +} + +/** + * bfq_deactivate_entity - deactivate an entity. + * @entity: the entity to deactivate. 
+ * @requeue: true if the entity can be put on the idle tree + */ +static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +{ + struct bfq_sched_data *sd; + struct bfq_entity *parent = NULL; + + for_each_entity_safe(entity, parent) { + sd = entity->sched_data; + + if (!__bfq_deactivate_entity(entity, requeue)) + /* + * The parent entity is still backlogged, and + * we don't need to update it as it is still + * in service. + */ + break; + + if (sd->next_in_service) + /* + * The parent entity is still backlogged and + * the budgets on the path towards the root + * need to be updated. + */ + goto update; + + /* + * If we get here, then the parent is no more backlogged and + * we want to propagate the deactivation upwards. + */ + requeue = 1; + } + + return; + +update: + entity = parent; + for_each_entity(entity) { + __bfq_activate_entity(entity, false); + + sd = entity->sched_data; + if (!bfq_update_next_in_service(sd)) + break; + } +} + +/** + * bfq_update_vtime - update vtime if necessary. + * @st: the service tree to act upon. + * + * If necessary update the service tree vtime to have at least one + * eligible entity, skipping to its start time. Assumes that the + * active tree of the device is not empty. + * + * NOTE: this hierarchical implementation updates vtimes quite often, + * we may end up with reactivated processes getting timestamps after a + * vtime skip done because we needed a ->first_active entity on some + * intermediate node. + */ +static void bfq_update_vtime(struct bfq_service_tree *st) +{ + struct bfq_entity *entry; + struct rb_node *node = st->active.rb_node; + + entry = rb_entry(node, struct bfq_entity, rb_node); + if (bfq_gt(entry->min_start, st->vtime)) { + st->vtime = entry->min_start; + bfq_forget_idle(st); + } +} + +/** + * bfq_first_active_entity - find the eligible entity with + * the smallest finish time + * @st: the service tree to select from. + * + * This function searches the first schedulable entity, starting from the + * root of the tree and going on the left every time on this side there is + * a subtree with at least one eligible (start >= vtime) entity. The path on + * the right is followed only if a) the left subtree contains no eligible + * entities and b) no eligible entity has been found yet. + */ +static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) +{ + struct bfq_entity *entry, *first = NULL; + struct rb_node *node = st->active.rb_node; + + while (node) { + entry = rb_entry(node, struct bfq_entity, rb_node); +left: + if (!bfq_gt(entry->start, st->vtime)) + first = entry; + + if (node->rb_left) { + entry = rb_entry(node->rb_left, + struct bfq_entity, rb_node); + if (!bfq_gt(entry->min_start, st->vtime)) { + node = node->rb_left; + goto left; + } + } + if (first) + break; + node = node->rb_right; + } + + return first; +} + +/** + * __bfq_lookup_next_entity - return the first eligible entity in @st. + * @st: the service tree. + * + * Update the virtual time in @st and return the first eligible entity + * it contains. + */ +static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, + bool force) +{ + struct bfq_entity *entity, *new_next_in_service = NULL; + + if (RB_EMPTY_ROOT(&st->active)) + return NULL; + + bfq_update_vtime(st); + entity = bfq_first_active_entity(st); + + /* + * If the chosen entity does not match with the sched_data's + * next_in_service and we are forcedly serving the IDLE priority + * class tree, bubble up budget update. 
+ */ + if (unlikely(force && entity != entity->sched_data->next_in_service)) { + new_next_in_service = entity; + for_each_entity(new_next_in_service) + bfq_update_budget(new_next_in_service); + } + + return entity; +} + +/** + * bfq_lookup_next_entity - return the first eligible entity in @sd. + * @sd: the sched_data. + * @extract: if true the returned entity will be also extracted from @sd. + * + * NOTE: since we cache the next_in_service entity at each level of the + * hierarchy, the complexity of the lookup can be decreased with + * absolutely no effort just returning the cached next_in_service value; + * we prefer to do full lookups to test the consistency of the data + * structures. + */ +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + int extract, + struct bfq_data *bfqd) +{ + struct bfq_service_tree *st = sd->service_tree; + struct bfq_entity *entity; + int i = 0; + + /* + * Choose from idle class, if needed to guarantee a minimum + * bandwidth to this class. This should also mitigate + * priority-inversion problems in case a low priority task is + * holding file system resources. + */ + if (bfqd && + jiffies - bfqd->bfq_class_idle_last_service > + BFQ_CL_IDLE_TIMEOUT) { + entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, + true); + if (entity) { + i = BFQ_IOPRIO_CLASSES - 1; + bfqd->bfq_class_idle_last_service = jiffies; + sd->next_in_service = entity; + } + } + for (; i < BFQ_IOPRIO_CLASSES; i++) { + entity = __bfq_lookup_next_entity(st + i, false); + if (entity) { + if (extract) { + bfq_check_next_in_service(sd, entity); + bfq_active_extract(st + i, entity); + sd->in_service_entity = entity; + sd->next_in_service = NULL; + } + break; + } + } + + return entity; +} + +static bool next_queue_may_preempt(struct bfq_data *bfqd) +{ + struct bfq_sched_data *sd = &bfqd->sched_data; + + return sd->next_in_service != sd->in_service_entity; +} + + +/* + * Get next queue for service. + */ +static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) +{ + struct bfq_entity *entity = NULL; + struct bfq_sched_data *sd; + struct bfq_queue *bfqq; + + if (bfqd->busy_queues == 0) + return NULL; + + sd = &bfqd->sched_data; + for (; sd ; sd = entity->my_sched_data) { + entity = bfq_lookup_next_entity(sd, 1, bfqd); + entity->service = 0; + } + + bfqq = bfq_entity_to_bfqq(entity); + + return bfqq; +} + +static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) +{ + struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; + struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; + + if (bfqd->in_service_bic) { + put_io_context(bfqd->in_service_bic->icq.ioc); + bfqd->in_service_bic = NULL; + } + + bfq_clear_bfqq_wait_request(in_serv_bfqq); + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); + bfqd->in_service_queue = NULL; + + /* + * in_serv_entity is no longer in service, so, if it is in no + * service tree either, then release the service reference to + * the queue it represents (taken with bfq_get_entity). 
+ */ + if (!in_serv_entity->on_st) + bfq_put_queue(in_serv_bfqq); +} + +static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int requeue) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_deactivate_entity(entity, requeue); +} + +static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_activate_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq)); + bfq_clear_bfqq_non_blocking_wait_rq(bfqq); +} + +/* + * Called when the bfqq no longer has requests pending, remove it from + * the service tree. + */ +static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int requeue) +{ + bfq_log_bfqq(bfqd, bfqq, "del from busy"); + + bfq_clear_bfqq_busy(bfqq); + + bfqd->busy_queues--; + + bfq_deactivate_bfqq(bfqd, bfqq, requeue); +} + +/* + * Called when an inactive queue receives a new request. + */ +static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqd, bfqq, "add to busy"); + + bfq_activate_bfqq(bfqd, bfqq); + + bfq_mark_bfqq_busy(bfqq); + bfqd->busy_queues++; +} + +static void bfq_init_entity(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + entity->weight = entity->new_weight; + entity->orig_weight = entity->new_weight; + + bfqq->ioprio = bfqq->new_ioprio; + bfqq->ioprio_class = bfqq->new_ioprio_class; + + entity->sched_data = &bfqq->bfqd->sched_data; +} + +#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) +#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) + +#define bfq_sample_valid(samples) ((samples) > 80) + +/* + * Scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing. + */ +static void bfq_schedule_dispatch(struct bfq_data *bfqd) +{ + if (bfqd->queued != 0) { + bfq_log(bfqd, "schedule dispatch"); + blk_mq_run_hw_queues(bfqd->queue, true); + } +} + +/* + * Lifted from AS - choose which of rq1 and rq2 that is best served now. + * We choose the request that is closesr to the head right now. Distance + * behind the head is penalized and only allowed to a certain extent. + */ +static struct request *bfq_choose_req(struct bfq_data *bfqd, + struct request *rq1, + struct request *rq2, + sector_t last) +{ + sector_t s1, s2, d1 = 0, d2 = 0; + unsigned long back_max; +#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ +#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ + unsigned int wrap = 0; /* bit mask: requests behind the disk head? */ + + if (!rq1 || rq1 == rq2) + return rq2; + if (!rq2) + return rq1; + + if (rq_is_sync(rq1) && !rq_is_sync(rq2)) + return rq1; + else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) + return rq2; + if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) + return rq1; + else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) + return rq2; + + s1 = blk_rq_pos(rq1); + s2 = blk_rq_pos(rq2); + + /* + * By definition, 1KiB is 2 sectors. + */ + back_max = bfqd->bfq_back_max * 2; + + /* + * Strict one way elevator _except_ in the case where we allow + * short backward seeks which are biased as twice the cost of a + * similar forward seek. 
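+ *
+ * For instance, with a back penalty of 2, a request 1000 sectors
+ * behind the head (and within back_max) gets d == 2000, so it loses
+ * to a forward request 1500 sectors ahead of the head but wins
+ * against one 3000 sectors ahead.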
+ */ + if (s1 >= last) + d1 = s1 - last; + else if (s1 + back_max >= last) + d1 = (last - s1) * bfqd->bfq_back_penalty; + else + wrap |= BFQ_RQ1_WRAP; + + if (s2 >= last) + d2 = s2 - last; + else if (s2 + back_max >= last) + d2 = (last - s2) * bfqd->bfq_back_penalty; + else + wrap |= BFQ_RQ2_WRAP; + + /* Found required data */ + + /* + * By doing switch() on the bit mask "wrap" we avoid having to + * check two variables for all permutations: --> faster! + */ + switch (wrap) { + case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ + if (d1 < d2) + return rq1; + else if (d2 < d1) + return rq2; + + if (s1 >= s2) + return rq1; + else + return rq2; + + case BFQ_RQ2_WRAP: + return rq1; + case BFQ_RQ1_WRAP: + return rq2; + case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */ + default: + /* + * Since both rqs are wrapped, + * start with the one that's further behind head + * (--> only *one* back seek required), + * since back seek takes more time than forward. + */ + if (s1 <= s2) + return rq1; + else + return rq2; + } +} + +/* + * Return expired entry, or NULL to just start from scratch in rbtree. + */ +static struct request *bfq_check_fifo(struct bfq_queue *bfqq, + struct request *last) +{ + struct request *rq; + + if (bfq_bfqq_fifo_expire(bfqq)) + return NULL; + + bfq_mark_bfqq_fifo_expire(bfqq); + + rq = rq_entry_fifo(bfqq->fifo.next); + + if (rq == last || ktime_get_ns() < rq->fifo_time) + return NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); + return rq; +} + +static struct request *bfq_find_next_rq(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct request *last) +{ + struct rb_node *rbnext = rb_next(&last->rb_node); + struct rb_node *rbprev = rb_prev(&last->rb_node); + struct request *next, *prev = NULL; + + /* Follow expired path, else get first next available. */ + next = bfq_check_fifo(bfqq, last); + if (next) + return next; + + if (rbprev) + prev = rb_entry_rq(rbprev); + + if (rbnext) + next = rb_entry_rq(rbnext); + else { + rbnext = rb_first(&bfqq->sort_list); + if (rbnext && rbnext != &last->rb_node) + next = rb_entry_rq(rbnext); + } + + return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); +} + +static unsigned long bfq_serv_to_charge(struct request *rq, + struct bfq_queue *bfqq) +{ + return blk_rq_sectors(rq); +} + +/** + * bfq_updated_next_req - update the queue after a new next_rq selection. + * @bfqd: the device data the queue belongs to. + * @bfqq: the queue to update. + * + * If the first request of a queue changes we make sure that the queue + * has enough budget to serve at least its first request (if the + * request has grown). We do this because if the queue has not enough + * budget for its first request, it has to go through two dispatch + * rounds to actually get it dispatched. + */ +static void bfq_updated_next_req(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + struct request *next_rq = bfqq->next_rq; + unsigned long new_budget; + + if (!next_rq) + return; + + if (bfqq == bfqd->in_service_queue) + /* + * In order not to break guarantees, budgets cannot be + * changed after an entity has been selected. 
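+ * (The entity's finish timestamp was computed from that budget by
+ * bfq_calc_finish(), so changing the budget now would leave the two
+ * out of sync.)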
+ */ + return; + + new_budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + if (entity->budget != new_budget) { + entity->budget = new_budget; + bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", + new_budget); + bfq_activate_bfqq(bfqd, bfqq); + } +} + +static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + return entity->budget - entity->service; +} + +/* + * If enough samples have been computed, return the current max budget + * stored in bfqd, which is dynamically updated according to the + * estimated disk peak rate; otherwise return the default max budget + */ +static int bfq_max_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < bfq_stats_min_budgets) + return bfq_default_max_budget; + else + return bfqd->bfq_max_budget; +} + +/* + * Return min budget, which is a fraction of the current or default + * max budget (trying with 1/32) + */ +static int bfq_min_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < bfq_stats_min_budgets) + return bfq_default_max_budget / 32; + else + return bfqd->bfq_max_budget / 32; +} + +static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool compensate, + enum bfqq_expiration reason); + +/* + * The next function, invoked after the input queue bfqq switches from + * idle to busy, updates the budget of bfqq. The function also tells + * whether the in-service queue should be expired, by returning + * true. The purpose of expiring the in-service queue is to give bfqq + * the chance to possibly preempt the in-service queue, and the reason + * for preempting the in-service queue is to achieve the following + * goal: guarantee to bfqq its reserved bandwidth even if bfqq has + * expired because it has remained idle. + * + * In particular, bfqq may have expired for one of the following two + * reasons: + * + * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling + * and did not make it to issue a new request before its last + * request was served; + * + * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue + * a new request before the expiration of the idling-time. + * + * Even if bfqq has expired for one of the above reasons, the process + * associated with the queue may be however issuing requests greedily, + * and thus be sensitive to the bandwidth it receives (bfqq may have + * remained idle for other reasons: CPU high load, bfqq not enjoying + * idling, I/O throttling somewhere in the path from the process to + * the I/O scheduler, ...). But if, after every expiration for one of + * the above two reasons, bfqq has to wait for the service of at least + * one full budget of another queue before being served again, then + * bfqq is likely to get a much lower bandwidth or resource time than + * its reserved ones. To address this issue, two countermeasures need + * to be taken. + * + * First, the budget and the timestamps of bfqq need to be updated in + * a special way on bfqq reactivation: they need to be updated as if + * bfqq did not remain idle and did not expire. In fact, if they are + * computed as if bfqq expired and remained idle until reactivation, + * then the process associated with bfqq is treated as if, instead of + * being greedy, it stopped issuing requests when bfqq remained idle, + * and restarts issuing requests only on this reactivation. In other + * words, the scheduler does not help the process recover the "service + * hole" between bfqq expiration and reactivation. 
As a consequence, + * the process receives a lower bandwidth than its reserved one. In + * contrast, to recover this hole, the budget must be updated as if + * bfqq was not expired at all before this reactivation, i.e., it must + * be set to the value of the remaining budget when bfqq was + * expired. Along the same line, timestamps need to be assigned the + * value they had the last time bfqq was selected for service, i.e., + * before last expiration. Thus timestamps need to be back-shifted + * with respect to their normal computation (see [1] for more details + * on this tricky aspect). + * + * Secondly, to allow the process to recover the hole, the in-service + * queue must be expired too, to give bfqq the chance to preempt it + * immediately. In fact, if bfqq has to wait for a full budget of the + * in-service queue to be completed, then it may become impossible to + * let the process recover the hole, even if the back-shifted + * timestamps of bfqq are lower than those of the in-service queue. If + * this happens for most or all of the holes, then the process may not + * receive its reserved bandwidth. In this respect, it is worth noting + * that, being the service of outstanding requests unpreemptible, a + * little fraction of the holes may however be unrecoverable, thereby + * causing a little loss of bandwidth. + * + * The last important point is detecting whether bfqq does need this + * bandwidth recovery. In this respect, the next function deems the + * process associated with bfqq greedy, and thus allows it to recover + * the hole, if: 1) the process is waiting for the arrival of a new + * request (which implies that bfqq expired for one of the above two + * reasons), and 2) such a request has arrived soon. The first + * condition is controlled through the flag non_blocking_wait_rq, + * while the second through the flag arrived_in_time. If both + * conditions hold, then the function computes the budget in the + * above-described special way, and signals that the in-service queue + * should be expired. Timestamp back-shifting is done later in + * __bfq_activate_entity. + */ +static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool arrived_in_time) +{ + struct bfq_entity *entity = &bfqq->entity; + + if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { + /* + * We do not clear the flag non_blocking_wait_rq here, as + * the latter is used in bfq_activate_bfqq to signal + * that timestamps need to be back-shifted (and is + * cleared right after). + */ + + /* + * In next assignment we rely on that either + * entity->service or entity->budget are not updated + * on expiration if bfqq is empty (see + * __bfq_bfqq_recalc_budget). Thus both quantities + * remain unchanged after such an expiration, and the + * following statement therefore assigns to + * entity->budget the remaining budget on such an + * expiration. For clarity, entity->service is not + * updated on expiration in any case, and, in normal + * operation, is reset only when bfqq is selected for + * service (see bfq_get_next_queue). 
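+ *
+ * For instance, if bfqq was expired with a budget of 16384
+ * sectors after receiving 4096 sectors of service, the
+ * assignment below reactivates it with the remaining 12288
+ * sectors, further capped to bfqq->max_budget.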
+ */ + entity->budget = min_t(unsigned long, + bfq_bfqq_budget_left(bfqq), + bfqq->max_budget); + + return true; + } + + entity->budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(bfqq->next_rq, bfqq)); + bfq_clear_bfqq_non_blocking_wait_rq(bfqq); + return false; +} + +static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct request *rq) +{ + bool bfqq_wants_to_preempt, + /* + * See the comments on + * bfq_bfqq_update_budg_for_activation for + * details on the usage of the next variable. + */ + arrived_in_time = ktime_get_ns() <= + bfqq->ttime.last_end_request + + bfqd->bfq_slice_idle * 3; + + /* + * Update budget and check whether bfqq may want to preempt + * the in-service queue. + */ + bfqq_wants_to_preempt = + bfq_bfqq_update_budg_for_activation(bfqd, bfqq, + arrived_in_time); + + if (!bfq_bfqq_IO_bound(bfqq)) { + if (arrived_in_time) { + bfqq->requests_within_timer++; + if (bfqq->requests_within_timer >= + bfqd->bfq_requests_within_timer) + bfq_mark_bfqq_IO_bound(bfqq); + } else + bfqq->requests_within_timer = 0; + } + + bfq_add_bfqq_busy(bfqd, bfqq); + + /* + * Expire in-service queue only if preemption may be needed + * for guarantees. In this respect, the function + * next_queue_may_preempt just checks a simple, necessary + * condition, and not a sufficient condition based on + * timestamps. In fact, for the latter condition to be + * evaluated, timestamps would need first to be updated, and + * this operation is quite costly (see the comments on the + * function bfq_bfqq_update_budg_for_activation). + */ + if (bfqd->in_service_queue && bfqq_wants_to_preempt && + next_queue_may_preempt(bfqd)) + bfq_bfqq_expire(bfqd, bfqd->in_service_queue, + false, BFQQE_PREEMPTED); +} + +static void bfq_add_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + struct request *next_rq, *prev; + + bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); + bfqq->queued[rq_is_sync(rq)]++; + bfqd->queued++; + + elv_rb_add(&bfqq->sort_list, rq); + + /* + * Check if this request is a better next-serve candidate. + */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); + bfqq->next_rq = next_rq; + + if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... 
*/ + bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, rq); + else if (prev != bfqq->next_rq) + bfq_updated_next_req(bfqd, bfqq); +} + +static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, + struct bio *bio, + struct request_queue *q) +{ + struct bfq_queue *bfqq = bfqd->bio_bfqq; + + + if (bfqq) + return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); + + return NULL; +} + +#if 0 /* Still not clear if we can do without next two functions */ +static void bfq_activate_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + + bfqd->rq_in_driver++; + bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); + bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", + (unsigned long long)bfqd->last_position); +} + +static void bfq_deactivate_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + + bfqd->rq_in_driver--; +} +#endif + +static void bfq_remove_request(struct request_queue *q, + struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + const int sync = rq_is_sync(rq); + + if (bfqq->next_rq == rq) { + bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); + bfq_updated_next_req(bfqd, bfqq); + } + + if (rq->queuelist.prev != &rq->queuelist) + list_del_init(&rq->queuelist); + bfqq->queued[sync]--; + bfqd->queued--; + elv_rb_del(&bfqq->sort_list, rq); + + elv_rqhash_del(q, rq); + if (q->last_merge == rq) + q->last_merge = NULL; + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + bfqq->next_rq = NULL; + + if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { + bfq_del_bfqq_busy(bfqd, bfqq, 1); + /* + * bfqq emptied. In normal operation, when + * bfqq is empty, bfqq->entity.service and + * bfqq->entity.budget must contain, + * respectively, the service received and the + * budget used last time bfqq emptied. These + * facts do not hold in this case, as at least + * this last removal occurred while bfqq is + * not in service. To avoid inconsistencies, + * reset both bfqq->entity.service and + * bfqq->entity.budget, if bfqq has still a + * process that may issue I/O requests to it. + */ + bfqq->entity.budget = bfqq->entity.service = 0; + } + } + + if (rq->cmd_flags & REQ_META) + bfqq->meta_pending--; +} + +static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) +{ + struct request_queue *q = hctx->queue; + struct bfq_data *bfqd = q->elevator->elevator_data; + struct request *free = NULL; + /* + * bfq_bic_lookup grabs the queue_lock: invoke it now and + * store its return value for later use, to avoid nesting + * queue_lock inside the bfqd->lock. We assume that the bic + * returned by bfq_bic_lookup does not go away before + * bfqd->lock is taken. 
+ */ + struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q); + bool ret; + + spin_lock_irq(&bfqd->lock); + + if (bic) + bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); + else + bfqd->bio_bfqq = NULL; + bfqd->bio_bic = bic; + + ret = blk_mq_sched_try_merge(q, bio, &free); + + if (free) + blk_mq_free_request(free); + spin_unlock_irq(&bfqd->lock); + + return ret; +} + +static int bfq_request_merge(struct request_queue *q, struct request **req, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct request *__rq; + + __rq = bfq_find_rq_fmerge(bfqd, bio, q); + if (__rq && elv_bio_merge_ok(__rq, bio)) { + *req = __rq; + return ELEVATOR_FRONT_MERGE; + } + + return ELEVATOR_NO_MERGE; +} + +static void bfq_request_merged(struct request_queue *q, struct request *req, + enum elv_merge type) +{ + if (type == ELEVATOR_FRONT_MERGE && + rb_prev(&req->rb_node) && + blk_rq_pos(req) < + blk_rq_pos(container_of(rb_prev(&req->rb_node), + struct request, rb_node))) { + struct bfq_queue *bfqq = RQ_BFQQ(req); + struct bfq_data *bfqd = bfqq->bfqd; + struct request *prev, *next_rq; + + /* Reposition request in its sort_list */ + elv_rb_del(&bfqq->sort_list, req); + elv_rb_add(&bfqq->sort_list, req); + + /* Choose next request to be served for bfqq */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, + bfqd->last_position); + bfqq->next_rq = next_rq; + /* + * If next_rq changes, update the queue's budget to fit + * the new request. + */ + if (prev != bfqq->next_rq) + bfq_updated_next_req(bfqd, bfqq); + } +} + +static void bfq_requests_merged(struct request_queue *q, struct request *rq, + struct request *next) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); + + if (!RB_EMPTY_NODE(&rq->rb_node)) + return; + spin_lock_irq(&bfqq->bfqd->lock); + + /* + * If next and rq belong to the same bfq_queue and next is older + * than rq, then reposition rq in the fifo (by substituting next + * with rq). Otherwise, if next and rq belong to different + * bfq_queues, never reposition rq: in fact, we would have to + * reposition it with respect to next's position in its own fifo, + * which would most certainly be too expensive with respect to + * the benefits. + */ + if (bfqq == next_bfqq && + !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && + next->fifo_time < rq->fifo_time) { + list_del_init(&rq->queuelist); + list_replace_init(&next->queuelist, &rq->queuelist); + rq->fifo_time = next->fifo_time; + } + + if (bfqq->next_rq == next) + bfqq->next_rq = rq; + + bfq_remove_request(q, next); + + spin_unlock_irq(&bfqq->bfqd->lock); +} + +static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + bool is_sync = op_is_sync(bio->bi_opf); + struct bfq_queue *bfqq = bfqd->bio_bfqq; + + /* + * Disallow merge of a sync bio into an async request. + */ + if (is_sync && !rq_is_sync(rq)) + return false; + + /* + * Lookup the bfqq that this bio will be queued with. Allow + * merge only if rq is queued there. 
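+ * (bfqd->bio_bfqq was stashed by bfq_bio_merge() above, right
+ * before the merge machinery was invoked.)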
+ */ + if (!bfqq) + return false; + + return bfqq == RQ_BFQQ(rq); +} + +static void __bfq_set_in_service_queue(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + if (bfqq) { + bfq_mark_bfqq_budget_new(bfqq); + bfq_clear_bfqq_fifo_expire(bfqq); + + bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8; + + bfq_log_bfqq(bfqd, bfqq, + "set_in_service_queue, cur-budget = %d", + bfqq->entity.budget); + } + + bfqd->in_service_queue = bfqq; +} + +/* + * Get and set a new queue for service. + */ +static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); + + __bfq_set_in_service_queue(bfqd, bfqq); + return bfqq; +} + +/* + * bfq_default_budget - return the default budget for @bfqq on @bfqd. + * @bfqd: the device descriptor. + * @bfqq: the queue to consider. + * + * We use 3/4 of the @bfqd maximum budget as the default value + * for the max_budget field of the queues. This lets the feedback + * mechanism to start from some middle ground, then the behavior + * of the process will drive the heuristics towards high values, if + * it behaves as a greedy sequential reader, or towards small values + * if it shows a more intermittent behavior. + */ +static unsigned long bfq_default_budget(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + unsigned long budget; + + /* + * When we need an estimate of the peak rate we need to avoid + * to give budgets that are too short due to previous + * measurements. So, in the first 10 assignments use a + * ``safe'' budget value. For such first assignment the value + * of bfqd->budgets_assigned happens to be lower than 194. + * See __bfq_set_in_service_queue for the formula by which + * this field is computed. + */ + if (bfqd->budgets_assigned < 194 && bfqd->bfq_user_max_budget == 0) + budget = bfq_default_max_budget; + else + budget = bfqd->bfq_max_budget; + + return budget - budget / 4; +} + +static void bfq_arm_slice_timer(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfqd->in_service_queue; + struct bfq_io_cq *bic; + u32 sl; + + /* Processes have exited, don't wait. */ + bic = bfqd->in_service_bic; + if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) + return; + + bfq_mark_bfqq_wait_request(bfqq); + + /* + * We don't want to idle for seeks, but we do want to allow + * fair distribution of slice time for a process doing back-to-back + * seeks. So allow a little bit of time for him to submit a new rq. + */ + sl = bfqd->bfq_slice_idle; + /* + * Grant only minimum idle time if the queue is seeky. + */ + if (BFQQ_SEEKY(bfqq)) + sl = min_t(u64, sl, BFQ_MIN_TT); + + bfqd->last_idling_start = ktime_get(); + hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), + HRTIMER_MODE_REL); +} + +/* + * Set the maximum time for the in-service queue to consume its + * budget. This prevents seeky processes from lowering the disk + * throughput (always guaranteed with a time slice scheme as in CFQ). + */ +static void bfq_set_budget_timeout(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfqd->in_service_queue; + unsigned int timeout_coeff = bfqq->entity.weight / + bfqq->entity.orig_weight; + + bfqd->last_budget_start = ktime_get(); + + bfq_clear_bfqq_budget_new(bfqq); + bfqq->budget_timeout = jiffies + + bfqd->bfq_timeout * timeout_coeff; + + bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", + jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); +} + +/* + * Remove request from internal lists. 
+ */ +static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + /* + * For consistency, the next instruction should have been + * executed after removing the request from the queue and + * dispatching it. We execute instead this instruction before + * bfq_remove_request() (and hence introduce a temporary + * inconsistency), for efficiency. In fact, should this + * dispatch occur for a non in-service bfqq, this anticipated + * increment prevents two counters related to bfqq->dispatched + * from risking to be, first, uselessly decremented, and then + * incremented again when the (new) value of bfqq->dispatched + * happens to be taken into account. + */ + bfqq->dispatched++; + + bfq_remove_request(q, rq); +} + +static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + __bfq_bfqd_reset_in_service(bfqd); + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) + bfq_del_bfqq_busy(bfqd, bfqq, 1); + else + bfq_activate_bfqq(bfqd, bfqq); +} + +/** + * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. + * @bfqd: device data. + * @bfqq: queue to update. + * @reason: reason for expiration. + * + * Handle the feedback on @bfqq budget at queue expiration. + * See the body for detailed comments. + */ +static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + enum bfqq_expiration reason) +{ + struct request *next_rq; + int budget, min_budget; + + budget = bfqq->max_budget; + min_budget = bfq_min_budget(bfqd); + + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", + bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", + budget, bfq_min_budget(bfqd)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", + bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); + + if (bfq_bfqq_sync(bfqq)) { + switch (reason) { + /* + * Caveat: in all the following cases we trade latency + * for throughput. + */ + case BFQQE_TOO_IDLE: + if (budget > min_budget + BFQ_BUDGET_STEP) + budget -= BFQ_BUDGET_STEP; + else + budget = min_budget; + break; + case BFQQE_BUDGET_TIMEOUT: + budget = bfq_default_budget(bfqd, bfqq); + break; + case BFQQE_BUDGET_EXHAUSTED: + /* + * The process still has backlog, and did not + * let either the budget timeout or the disk + * idling timeout expire. Hence it is not + * seeky, has a short thinktime and may be + * happy with a higher budget too. So + * definitely increase the budget of this good + * candidate to boost the disk throughput. + */ + budget = min(budget + 8 * BFQ_BUDGET_STEP, + bfqd->bfq_max_budget); + break; + case BFQQE_NO_MORE_REQUESTS: + /* + * For queues that expire for this reason, it + * is particularly important to keep the + * budget close to the actual service they + * need. Doing so reduces the timestamp + * misalignment problem described in the + * comments in the body of + * __bfq_activate_entity. In fact, suppose + * that a queue systematically expires for + * BFQQE_NO_MORE_REQUESTS and presents a + * new request in time to enjoy timestamp + * back-shifting. The larger the budget of the + * queue is with respect to the service the + * queue actually requests in each service + * slot, the more times the queue can be + * reactivated with the same virtual finish + * time. 
It follows that, even if this finish + * time is pushed to the system virtual time + * to reduce the consequent timestamp + * misalignment, the queue unjustly enjoys for + * many re-activations a lower finish time + * than all newly activated queues. + * + * The service needed by bfqq is measured + * quite precisely by bfqq->entity.service. + * Since bfqq does not enjoy device idling, + * bfqq->entity.service is equal to the number + * of sectors that the process associated with + * bfqq requested to read/write before waiting + * for request completions, or blocking for + * other reasons. + */ + budget = max_t(int, bfqq->entity.service, min_budget); + break; + default: + return; + } + } else { + /* + * Async queues get always the maximum possible + * budget, as for them we do not care about latency + * (in addition, their ability to dispatch is limited + * by the charging factor). + */ + budget = bfqd->bfq_max_budget; + } + + bfqq->max_budget = budget; + + if (bfqd->budgets_assigned >= bfq_stats_min_budgets && + !bfqd->bfq_user_max_budget) + bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); + + /* + * If there is still backlog, then assign a new budget, making + * sure that it is large enough for the next request. Since + * the finish time of bfqq must be kept in sync with the + * budget, be sure to call __bfq_bfqq_expire() *after* this + * update. + * + * If there is no backlog, then no need to update the budget; + * it will be updated on the arrival of a new request. + */ + next_rq = bfqq->next_rq; + if (next_rq) + bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + + bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", + next_rq ? blk_rq_sectors(next_rq) : 0, + bfqq->entity.budget); +} + +static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) +{ + unsigned long max_budget; + + /* + * The max_budget calculated when autotuning is equal to the + * amount of sectors transferred in timeout at the estimated + * peak rate. To get this value, peak_rate is, first, + * multiplied by 1000, because timeout is measured in ms, + * while peak_rate is measured in sectors/usecs. Then the + * result of this multiplication is right-shifted by + * BFQ_RATE_SHIFT, because peak_rate is equal to the value of + * the peak rate left-shifted by BFQ_RATE_SHIFT. + */ + max_budget = (unsigned long)(peak_rate * 1000 * + timeout >> BFQ_RATE_SHIFT); + + return max_budget; +} + +/* + * In addition to updating the peak rate, checks whether the process + * is "slow", and returns 1 if so. This slow flag is used, in addition + * to the budget timeout, to reduce the amount of service provided to + * seeky processes, and hence reduce their chances to lower the + * throughput. See the code for more details. + */ +static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool compensate) +{ + u64 bw, usecs, expected, timeout; + ktime_t delta; + int update = 0; + + if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) + return false; + + if (compensate) + delta = bfqd->last_idling_start; + else + delta = ktime_get(); + delta = ktime_sub(delta, bfqd->last_budget_start); + usecs = ktime_to_us(delta); + + /* don't use too short time intervals */ + if (usecs < 1000) + return false; + + /* + * Calculate the bandwidth for the last slice. We use a 64 bit + * value to store the peak rate, in sectors per usec in fixed + * point math. We do so to have enough precision in the estimate + * and to avoid overflows. 
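+ *
+ * For instance, with BFQ_RATE_SHIFT == 16, a queue that received
+ * 8192 sectors of service over a 2000 us slice yields
+ * bw == (8192 << 16) / 2000 == 268435, i.e. about 4 sectors/us
+ * in fixed point.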
+ */ + bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; + do_div(bw, (unsigned long)usecs); + + timeout = jiffies_to_msecs(bfqd->bfq_timeout); + + /* + * Use only long (> 20ms) intervals to filter out spikes for + * the peak rate estimation. + */ + if (usecs > 20000) { + if (bw > bfqd->peak_rate) { + bfqd->peak_rate = bw; + update = 1; + bfq_log(bfqd, "new peak_rate=%llu", bw); + } + + update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; + + if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) + bfqd->peak_rate_samples++; + + if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && + update && bfqd->bfq_user_max_budget == 0) { + bfqd->bfq_max_budget = + bfq_calc_max_budget(bfqd->peak_rate, + timeout); + bfq_log(bfqd, "new max_budget=%d", + bfqd->bfq_max_budget); + } + } + + /* + * A process is considered ``slow'' (i.e., seeky, so that we + * cannot treat it fairly in the service domain, as it would + * slow down too much the other processes) if, when a slice + * ends for whatever reason, it has received service at a + * rate that would not be high enough to complete the budget + * before the budget timeout expiration. + */ + expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; + + /* + * Caveat: processes doing IO in the slower disk zones will + * tend to be slow(er) even if not seeky. And the estimated + * peak rate will actually be an average over the disk + * surface. Hence, to not be too harsh with unlucky processes, + * we keep a budget/3 margin of safety before declaring a + * process slow. + */ + return expected > (4 * bfqq->entity.budget) / 3; +} + +/* + * Return the farthest past time instant according to jiffies + * macros. + */ +static unsigned long bfq_smallest_from_now(void) +{ + return jiffies - MAX_JIFFY_OFFSET; +} + +/** + * bfq_bfqq_expire - expire a queue. + * @bfqd: device owning the queue. + * @bfqq: the queue to expire. + * @compensate: if true, compensate for the time spent idling. + * @reason: the reason causing the expiration. + * + * + * If the process associated with the queue is slow (i.e., seeky), or + * in case of budget timeout, or, finally, if it is async, we + * artificially charge it an entire budget (independently of the + * actual service it received). As a consequence, the queue will get + * higher timestamps than the correct ones upon reactivation, and + * hence it will be rescheduled as if it had received more service + * than what it actually received. In the end, this class of processes + * will receive less service in proportion to how slowly they consume + * their budgets (and hence how seriously they tend to lower the + * throughput). + * + * In contrast, when a queue expires because it has been idling for + * too much or because it exhausted its budget, we do not touch the + * amount of service it has received. Hence when the queue will be + * reactivated and its timestamps updated, the latter will be in sync + * with the actual service received by the queue until expiration. + * + * Charging a full budget to the first type of queues and the exact + * service to the others has the effect of using the WF2Q+ policy to + * schedule the former on a timeslice basis, without violating the + * service domain guarantees of the latter. + */ +static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool compensate, + enum bfqq_expiration reason) +{ + bool slow; + int ref; + + /* + * Update device peak rate for autotuning and check whether the + * process is slow (see bfq_update_peak_rate). 
+ */ + slow = bfq_update_peak_rate(bfqd, bfqq, compensate); + + /* + * As above explained, 'punish' slow (i.e., seeky), timed-out + * and async queues, to favor sequential sync workloads. + */ + if (slow || reason == BFQQE_BUDGET_TIMEOUT) + bfq_bfqq_charge_full_budget(bfqq); + + if (reason == BFQQE_TOO_IDLE && + bfqq->entity.service <= 2 * bfqq->entity.budget / 10) + bfq_clear_bfqq_IO_bound(bfqq); + + bfq_log_bfqq(bfqd, bfqq, + "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, + slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); + + /* + * Increase, decrease or leave budget unchanged according to + * reason. + */ + __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); + ref = bfqq->ref; + __bfq_bfqq_expire(bfqd, bfqq); + + /* mark bfqq as waiting a request only if a bic still points to it */ + if (ref > 1 && !bfq_bfqq_busy(bfqq) && + reason != BFQQE_BUDGET_TIMEOUT && + reason != BFQQE_BUDGET_EXHAUSTED) + bfq_mark_bfqq_non_blocking_wait_rq(bfqq); +} + +/* + * Budget timeout is not implemented through a dedicated timer, but + * just checked on request arrivals and completions, as well as on + * idle timer expirations. + */ +static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) +{ + if (bfq_bfqq_budget_new(bfqq) || + time_is_after_jiffies(bfqq->budget_timeout)) + return false; + return true; +} + +/* + * If we expire a queue that is actively waiting (i.e., with the + * device idled) for the arrival of a new request, then we may incur + * the timestamp misalignment problem described in the body of the + * function __bfq_activate_entity. Hence we return true only if this + * condition does not hold, or if the queue is slow enough to deserve + * only to be kicked off for preserving a high throughput. + */ +static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, + "may_budget_timeout: wait_request %d left %d timeout %d", + bfq_bfqq_wait_request(bfqq), + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, + bfq_bfqq_budget_timeout(bfqq)); + + return (!bfq_bfqq_wait_request(bfqq) || + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) + && + bfq_bfqq_budget_timeout(bfqq); +} + +/* + * For a queue that becomes empty, device idling is allowed only if + * this function returns true for the queue. And this function returns + * true only if idling is beneficial for throughput. + */ +static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; + bool idling_boosts_thr; + + if (bfqd->strict_guarantees) + return true; + + /* + * The value of the next variable is computed considering that + * idling is usually beneficial for the throughput if: + * (a) the device is not NCQ-capable, or + * (b) regardless of the presence of NCQ, the request pattern + * for bfqq is I/O-bound (possible throughput losses + * caused by granting idling to seeky queues are mitigated + * by the fact that, in all scenarios where boosting + * throughput is the best thing to do, i.e., in all + * symmetric scenarios, only a minimal idle time is + * allowed to seeky queues). + */ + idling_boosts_thr = !bfqd->hw_tag || bfq_bfqq_IO_bound(bfqq); + + /* + * We have now the components we need to compute the return + * value of the function, which is true only if both the + * following conditions hold: + * 1) bfqq is sync, because idling make sense only for sync queues; + * 2) idling boosts the throughput. 
+ */ + return bfq_bfqq_sync(bfqq) && idling_boosts_thr; +} + +/* + * If the in-service queue is empty but the function bfq_bfqq_may_idle + * returns true, then: + * 1) the queue must remain in service and cannot be expired, and + * 2) the device must be idled to wait for the possible arrival of a new + * request for the queue. + * See the comments on the function bfq_bfqq_may_idle for the reasons + * why performing device idling is the best choice to boost the throughput + * and preserve service guarantees when bfq_bfqq_may_idle itself + * returns true. + */ +static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; + + return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && + bfq_bfqq_may_idle(bfqq); +} + +/* + * Select a queue for service. If we have a current queue in service, + * check whether to continue servicing it, or retrieve and set a new one. + */ +static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq; + struct request *next_rq; + enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT; + + bfqq = bfqd->in_service_queue; + if (!bfqq) + goto new_queue; + + bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); + + if (bfq_may_expire_for_budg_timeout(bfqq) && + !bfq_bfqq_wait_request(bfqq) && + !bfq_bfqq_must_idle(bfqq)) + goto expire; + +check_queue: + /* + * This loop is rarely executed more than once. Even when it + * happens, it is much more convenient to re-execute this loop + * than to return NULL and trigger a new dispatch to get a + * request served. + */ + next_rq = bfqq->next_rq; + /* + * If bfqq has requests queued and it has enough budget left to + * serve them, keep the queue, otherwise expire it. + */ + if (next_rq) { + if (bfq_serv_to_charge(next_rq, bfqq) > + bfq_bfqq_budget_left(bfqq)) { + /* + * Expire the queue for budget exhaustion, + * which makes sure that the next budget is + * enough to serve the next request, even if + * it comes from the fifo expired path. + */ + reason = BFQQE_BUDGET_EXHAUSTED; + goto expire; + } else { + /* + * The idle timer may be pending because we may + * not disable disk idling even when a new request + * arrives. + */ + if (bfq_bfqq_wait_request(bfqq)) { + /* + * If we get here: 1) at least a new request + * has arrived but we have not disabled the + * timer because the request was too small, + * 2) then the block layer has unplugged + * the device, causing the dispatch to be + * invoked. + * + * Since the device is unplugged, now the + * requests are probably large enough to + * provide a reasonable throughput. + * So we disable idling. + */ + bfq_clear_bfqq_wait_request(bfqq); + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); + } + goto keep_queue; + } + } + + /* + * No requests pending. However, if the in-service queue is idling + * for a new request, or has requests waiting for a completion and + * may idle after their completion, then keep it anyway. 
+ */ + if (bfq_bfqq_wait_request(bfqq) || + (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { + bfqq = NULL; + goto keep_queue; + } + + reason = BFQQE_NO_MORE_REQUESTS; +expire: + bfq_bfqq_expire(bfqd, bfqq, false, reason); +new_queue: + bfqq = bfq_set_in_service_queue(bfqd); + if (bfqq) { + bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); + goto check_queue; + } +keep_queue: + if (bfqq) + bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); + else + bfq_log(bfqd, "select_queue: no queue returned"); + + return bfqq; +} + +/* + * Dispatch next request from bfqq. + */ +static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct request *rq = bfqq->next_rq; + unsigned long service_to_charge; + + service_to_charge = bfq_serv_to_charge(rq, bfqq); + + bfq_bfqq_served(bfqq, service_to_charge); + + bfq_dispatch_remove(bfqd->queue, rq); + + if (!bfqd->in_service_bic) { + atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); + bfqd->in_service_bic = RQ_BIC(rq); + } + + /* + * Expire bfqq, pretending that its budget expired, if bfqq + * belongs to CLASS_IDLE and other queues are waiting for + * service. + */ + if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) + goto expire; + + return rq; + +expire: + bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); + return rq; +} + +static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) +{ + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + + /* + * Avoiding lock: a race on bfqd->busy_queues should cause at + * most a call to dispatch for nothing + */ + return !list_empty_careful(&bfqd->dispatch) || + bfqd->busy_queues > 0; +} + +static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) +{ + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + struct request *rq = NULL; + struct bfq_queue *bfqq = NULL; + + if (!list_empty(&bfqd->dispatch)) { + rq = list_first_entry(&bfqd->dispatch, struct request, + queuelist); + list_del_init(&rq->queuelist); + + bfqq = RQ_BFQQ(rq); + + if (bfqq) { + /* + * Increment counters here, because this + * dispatch does not follow the standard + * dispatch flow (where counters are + * incremented) + */ + bfqq->dispatched++; + + goto inc_in_driver_start_rq; + } + + /* + * We exploit the put_rq_private hook to decrement + * rq_in_driver, but put_rq_private will not be + * invoked on this request. So, to avoid unbalance, + * just start this request, without incrementing + * rq_in_driver. As a negative consequence, + * rq_in_driver is deceptively lower than it should be + * while this request is in service. This may cause + * bfq_schedule_dispatch to be invoked uselessly. + * + * As for implementing an exact solution, the + * put_request hook, if defined, is probably invoked + * also on this request. So, by exploiting this hook, + * we could 1) increment rq_in_driver here, and 2) + * decrement it in put_request. Such a solution would + * let the value of the counter be always accurate, + * but it would entail using an extra interface + * function. This cost seems higher than the benefit, + * being the frequency of non-elevator-private + * requests very low. + */ + goto start_rq; + } + + bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); + + if (bfqd->busy_queues == 0) + goto exit; + + /* + * Force device to serve one request at a time if + * strict_guarantees is true. 
Forcing this service scheme is + * currently the ONLY way to guarantee that the request + * service order enforced by the scheduler is respected by a + * queueing device. Otherwise the device is free even to make + * some unlucky request wait for as long as the device + * wishes. + * + * Of course, serving one request at at time may cause loss of + * throughput. + */ + if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) + goto exit; + + bfqq = bfq_select_queue(bfqd); + if (!bfqq) + goto exit; + + rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq); + + if (rq) { +inc_in_driver_start_rq: + bfqd->rq_in_driver++; +start_rq: + rq->rq_flags |= RQF_STARTED; + } +exit: + return rq; +} + +static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) +{ + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + struct request *rq; + + spin_lock_irq(&bfqd->lock); + rq = __bfq_dispatch_request(hctx); + spin_unlock_irq(&bfqd->lock); + + return rq; +} + +/* + * Task holds one reference to the queue, dropped when task exits. Each rq + * in-flight on this queue also holds a reference, dropped when rq is freed. + * + * Scheduler lock must be held here. Recall not to use bfqq after calling + * this function on it. + */ +static void bfq_put_queue(struct bfq_queue *bfqq) +{ + if (bfqq->bfqd) + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", + bfqq, bfqq->ref); + + bfqq->ref--; + if (bfqq->ref) + return; + + kmem_cache_free(bfq_pool, bfqq); +} + +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + if (bfqq == bfqd->in_service_queue) { + __bfq_bfqq_expire(bfqd, bfqq); + bfq_schedule_dispatch(bfqd); + } + + bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); + + bfq_put_queue(bfqq); /* release process reference */ +} + +static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) +{ + struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); + struct bfq_data *bfqd; + + if (bfqq) + bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ + + if (bfqq && bfqd) { + unsigned long flags; + + spin_lock_irqsave(&bfqd->lock, flags); + bfq_exit_bfqq(bfqd, bfqq); + bic_set_bfqq(bic, NULL, is_sync); + spin_unlock_irq(&bfqd->lock); + } +} + +static void bfq_exit_icq(struct io_cq *icq) +{ + struct bfq_io_cq *bic = icq_to_bic(icq); + + bfq_exit_icq_bfqq(bic, true); + bfq_exit_icq_bfqq(bic, false); +} + +/* + * Update the entity prio values; note that the new values will not + * be used until the next (re)activation. + */ +static void +bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) +{ + struct task_struct *tsk = current; + int ioprio_class; + struct bfq_data *bfqd = bfqq->bfqd; + + if (!bfqd) + return; + + ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); + switch (ioprio_class) { + default: + dev_err(bfqq->bfqd->queue->backing_dev_info->dev, + "bfq: bad prio class %d\n", ioprio_class); + case IOPRIO_CLASS_NONE: + /* + * No prio set, inherit CPU scheduling settings. 
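+ *
+ * For instance, with the usual nice-to-ioprio mapping
+ * (ioprio = (nice + 20) / 5), a task at nice 0 would inherit
+ * ioprio 4 and a task at nice 19 would inherit ioprio 7; this
+ * is only an illustration of what task_nice_ioprio() returns,
+ * not new behavior introduced here.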
+ */ + bfqq->new_ioprio = task_nice_ioprio(tsk); + bfqq->new_ioprio_class = task_nice_ioclass(tsk); + break; + case IOPRIO_CLASS_RT: + bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->new_ioprio_class = IOPRIO_CLASS_RT; + break; + case IOPRIO_CLASS_BE: + bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->new_ioprio_class = IOPRIO_CLASS_BE; + break; + case IOPRIO_CLASS_IDLE: + bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; + bfqq->new_ioprio = 7; + bfq_clear_bfqq_idle_window(bfqq); + break; + } + + if (bfqq->new_ioprio >= IOPRIO_BE_NR) { + pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", + bfqq->new_ioprio); + bfqq->new_ioprio = IOPRIO_BE_NR; + } + + bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); + bfqq->entity.prio_changed = 1; +} + +static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) +{ + struct bfq_data *bfqd = bic_to_bfqd(bic); + struct bfq_queue *bfqq; + int ioprio = bic->icq.ioc->ioprio; + + /* + * This condition may trigger on a newly created bic, be sure to + * drop the lock before returning. + */ + if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) + return; + + bic->ioprio = ioprio; + + bfqq = bic_to_bfqq(bic, false); + if (bfqq) { + /* release process reference on this queue */ + bfq_put_queue(bfqq); + bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); + bic_set_bfqq(bic, bfqq, false); + } + + bfqq = bic_to_bfqq(bic, true); + if (bfqq) + bfq_set_next_ioprio_data(bfqq, bic); +} + +static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_io_cq *bic, pid_t pid, int is_sync) +{ + RB_CLEAR_NODE(&bfqq->entity.rb_node); + INIT_LIST_HEAD(&bfqq->fifo); + + bfqq->ref = 0; + bfqq->bfqd = bfqd; + + if (bic) + bfq_set_next_ioprio_data(bfqq, bic); + + if (is_sync) { + if (!bfq_class_idle(bfqq)) + bfq_mark_bfqq_idle_window(bfqq); + bfq_mark_bfqq_sync(bfqq); + } else + bfq_clear_bfqq_sync(bfqq); + + /* set end request to minus infinity from now */ + bfqq->ttime.last_end_request = ktime_get_ns() + 1; + + bfq_mark_bfqq_IO_bound(bfqq); + + bfqq->pid = pid; + + /* Tentative initial value to trade off between thr and lat */ + bfqq->max_budget = bfq_default_budget(bfqd, bfqq); + bfqq->budget_timeout = bfq_smallest_from_now(); + bfqq->pid = pid; + + /* first request is almost certainly seeky */ + bfqq->seek_history = 1; +} + +static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, + int ioprio_class, int ioprio) +{ + switch (ioprio_class) { + case IOPRIO_CLASS_RT: + return &async_bfqq[0][ioprio]; + case IOPRIO_CLASS_NONE: + ioprio = IOPRIO_NORM; + /* fall through */ + case IOPRIO_CLASS_BE: + return &async_bfqq[1][ioprio]; + case IOPRIO_CLASS_IDLE: + return &async_idle_bfqq; + default: + return NULL; + } +} + +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bio *bio, bool is_sync, + struct bfq_io_cq *bic) +{ + const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); + struct bfq_queue **async_bfqq = NULL; + struct bfq_queue *bfqq; + + rcu_read_lock(); + + if (!is_sync) { + async_bfqq = bfq_async_queue_prio(bfqd, ioprio_class, + ioprio); + bfqq = *async_bfqq; + if (bfqq) + goto out; + } + + bfqq = kmem_cache_alloc_node(bfq_pool, + GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, + bfqd->queue->node); + + if (bfqq) { + bfq_init_bfqq(bfqd, bfqq, bic, current->pid, + is_sync); + bfq_init_entity(&bfqq->entity); + bfq_log_bfqq(bfqd, bfqq, "allocated"); + } else { + bfqq = &bfqd->oom_bfqq; + bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); + 
goto out; + } + + /* + * Pin the queue now that it's allocated, scheduler exit will + * prune it. + */ + if (async_bfqq) { + bfqq->ref++; + bfq_log_bfqq(bfqd, bfqq, + "get_queue, bfqq not in async: %p, %d", + bfqq, bfqq->ref); + *async_bfqq = bfqq; + } + +out: + bfqq->ref++; /* get a process reference to this queue */ + bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); + rcu_read_unlock(); + return bfqq; +} + +static void bfq_update_io_thinktime(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_ttime *ttime = &bfqq->ttime; + u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; + + elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle); + + ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8; + ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); + ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, + ttime->ttime_samples); +} + +static void +bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct request *rq) +{ + sector_t sdist = 0; + + if (bfqq->last_request_pos) { + if (bfqq->last_request_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - bfqq->last_request_pos; + else + sdist = bfqq->last_request_pos - blk_rq_pos(rq); + } + + bfqq->seek_history <<= 1; + bfqq->seek_history |= sdist > BFQQ_SEEK_THR && + (!blk_queue_nonrot(bfqd->queue) || + blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); +} + +/* + * Disable idle window if the process thinks too long or seeks so much that + * it doesn't matter. + */ +static void bfq_update_idle_window(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_io_cq *bic) +{ + int enable_idle; + + /* Don't idle for async or idle io prio class. */ + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) + return; + + enable_idle = bfq_bfqq_idle_window(bfqq); + + if (atomic_read(&bic->icq.ioc->active_ref) == 0 || + bfqd->bfq_slice_idle == 0 || + (bfqd->hw_tag && BFQQ_SEEKY(bfqq))) + enable_idle = 0; + else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) { + if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle) + enable_idle = 0; + else + enable_idle = 1; + } + bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", + enable_idle); + + if (enable_idle) + bfq_mark_bfqq_idle_window(bfqq); + else + bfq_clear_bfqq_idle_window(bfqq); +} + +/* + * Called when a new fs request (rq) is added to bfqq. Check if there's + * something we should do about it. + */ +static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct request *rq) +{ + struct bfq_io_cq *bic = RQ_BIC(rq); + + if (rq->cmd_flags & REQ_META) + bfqq->meta_pending++; + + bfq_update_io_thinktime(bfqd, bfqq); + bfq_update_io_seektime(bfqd, bfqq, rq); + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || + !BFQQ_SEEKY(bfqq)) + bfq_update_idle_window(bfqd, bfqq, bic); + + bfq_log_bfqq(bfqd, bfqq, + "rq_enqueued: idle_window=%d (seeky %d)", + bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); + + bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); + + if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { + bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && + blk_rq_sectors(rq) < 32; + bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); + + /* + * There is just this request queued: if the request + * is small and the queue is not to be expired, then + * just exit. + * + * In this way, if the device is being idled to wait + * for a new request from the in-service queue, we + * avoid unplugging the device and committing the + * device to serve just a small request. 
On the + * contrary, we wait for the block layer to decide + * when to unplug the device: hopefully, new requests + * will be merged to this one quickly, then the device + * will be unplugged and larger requests will be + * dispatched. + */ + if (small_req && !budget_timeout) + return; + + /* + * A large enough request arrived, or the queue is to + * be expired: in both cases disk idling is to be + * stopped, so clear wait_request flag and reset + * timer. + */ + bfq_clear_bfqq_wait_request(bfqq); + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); + + /* + * The queue is not empty, because a new request just + * arrived. Hence we can safely expire the queue, in + * case of budget timeout, without risking that the + * timestamps of the queue are not updated correctly. + * See [1] for more details. + */ + if (budget_timeout) + bfq_bfqq_expire(bfqd, bfqq, false, + BFQQE_BUDGET_TIMEOUT); + } +} + +static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + bfq_add_request(rq); + + rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; + list_add_tail(&rq->queuelist, &bfqq->fifo); + + bfq_rq_enqueued(bfqd, bfqq, rq); +} + +static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool at_head) +{ + struct request_queue *q = hctx->queue; + struct bfq_data *bfqd = q->elevator->elevator_data; + + spin_lock_irq(&bfqd->lock); + if (blk_mq_sched_try_insert_merge(q, rq)) { + spin_unlock_irq(&bfqd->lock); + return; + } + + spin_unlock_irq(&bfqd->lock); + + blk_mq_sched_request_inserted(rq); + + spin_lock_irq(&bfqd->lock); + if (at_head || blk_rq_is_passthrough(rq)) { + if (at_head) + list_add(&rq->queuelist, &bfqd->dispatch); + else + list_add_tail(&rq->queuelist, &bfqd->dispatch); + } else { + __bfq_insert_request(bfqd, rq); + + if (rq_mergeable(rq)) { + elv_rqhash_add(q, rq); + if (!q->last_merge) + q->last_merge = rq; + } + } + + spin_unlock_irq(&bfqd->lock); +} + +static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, + struct list_head *list, bool at_head) +{ + while (!list_empty(list)) { + struct request *rq; + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); + bfq_insert_request(hctx, rq, at_head); + } +} + +static void bfq_update_hw_tag(struct bfq_data *bfqd) +{ + bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, + bfqd->rq_in_driver); + + if (bfqd->hw_tag == 1) + return; + + /* + * This sample is valid if the number of outstanding requests + * is large enough to allow a queueing behavior. Note that the + * sum is not exact, as it's not taking into account deactivated + * requests. + */ + if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) + return; + + if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) + return; + + bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; + bfqd->max_rq_in_driver = 0; + bfqd->hw_tag_samples = 0; +} + +static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) +{ + bfq_update_hw_tag(bfqd); + + bfqd->rq_in_driver--; + bfqq->dispatched--; + + bfqq->ttime.last_end_request = ktime_get_ns(); + + /* + * If this is the in-service queue, check if it needs to be expired, + * or if we want to idle in case it has no pending requests. 
+ */ + if (bfqd->in_service_queue == bfqq) { + if (bfq_bfqq_budget_new(bfqq)) + bfq_set_budget_timeout(bfqd); + + if (bfq_bfqq_must_idle(bfqq)) { + bfq_arm_slice_timer(bfqd); + return; + } else if (bfq_may_expire_for_budg_timeout(bfqq)) + bfq_bfqq_expire(bfqd, bfqq, false, + BFQQE_BUDGET_TIMEOUT); + else if (RB_EMPTY_ROOT(&bfqq->sort_list) && + (bfqq->dispatched == 0 || + !bfq_bfqq_may_idle(bfqq))) + bfq_bfqq_expire(bfqd, bfqq, false, + BFQQE_NO_MORE_REQUESTS); + } +} + +static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) +{ + bfqq->allocated--; + + bfq_put_queue(bfqq); +} + +static void bfq_put_rq_private(struct request_queue *q, struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + + + if (likely(rq->rq_flags & RQF_STARTED)) { + unsigned long flags; + + spin_lock_irqsave(&bfqd->lock, flags); + + bfq_completed_request(bfqq, bfqd); + bfq_put_rq_priv_body(bfqq); + + spin_unlock_irqrestore(&bfqd->lock, flags); + } else { + /* + * Request rq may be still/already in the scheduler, + * in which case we need to remove it. And we cannot + * defer such a check and removal, to avoid + * inconsistencies in the time interval from the end + * of this function to the start of the deferred work. + * This situation seems to occur only in process + * context, as a consequence of a merge. In the + * current version of the code, this implies that the + * lock is held. + */ + + if (!RB_EMPTY_NODE(&rq->rb_node)) + bfq_remove_request(q, rq); + bfq_put_rq_priv_body(bfqq); + } + + rq->elv.priv[0] = NULL; + rq->elv.priv[1] = NULL; +} + +/* + * Allocate bfq data structures associated with this request. + */ +static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); + const int is_sync = rq_is_sync(rq); + struct bfq_queue *bfqq; + + spin_lock_irq(&bfqd->lock); + + bfq_check_ioprio_change(bic, bio); + + if (!bic) + goto queue_fail; + + bfqq = bic_to_bfqq(bic, is_sync); + if (!bfqq || bfqq == &bfqd->oom_bfqq) { + if (bfqq) + bfq_put_queue(bfqq); + bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); + bic_set_bfqq(bic, bfqq, is_sync); + } + + bfqq->allocated++; + bfqq->ref++; + bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", + rq, bfqq, bfqq->ref); + + rq->elv.priv[0] = bic; + rq->elv.priv[1] = bfqq; + + spin_unlock_irq(&bfqd->lock); + + return 0; + +queue_fail: + spin_unlock_irq(&bfqd->lock); + + return 1; +} + +static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; + enum bfqq_expiration reason; + unsigned long flags; + + spin_lock_irqsave(&bfqd->lock, flags); + bfq_clear_bfqq_wait_request(bfqq); + + if (bfqq != bfqd->in_service_queue) { + spin_unlock_irqrestore(&bfqd->lock, flags); + return; + } + + if (bfq_bfqq_budget_timeout(bfqq)) + /* + * Also here the queue can be safely expired + * for budget timeout without wasting + * guarantees + */ + reason = BFQQE_BUDGET_TIMEOUT; + else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) + /* + * The queue may not be empty upon timer expiration, + * because we may not disable the timer when the + * first request of the in-service queue arrives + * during disk idling. 
+ */ + reason = BFQQE_TOO_IDLE; + else + goto schedule_dispatch; + + bfq_bfqq_expire(bfqd, bfqq, true, reason); + +schedule_dispatch: + spin_unlock_irqrestore(&bfqd->lock, flags); + bfq_schedule_dispatch(bfqd); +} + +/* + * Handler of the expiration of the timer running if the in-service queue + * is idling inside its time slice. + */ +static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) +{ + struct bfq_data *bfqd = container_of(timer, struct bfq_data, + idle_slice_timer); + struct bfq_queue *bfqq = bfqd->in_service_queue; + + /* + * Theoretical race here: the in-service queue can be NULL or + * different from the queue that was idling if a new request + * arrives for the current queue and there is a full dispatch + * cycle that changes the in-service queue. This can hardly + * happen, but in the worst case we just expire a queue too + * early. + */ + if (bfqq) + bfq_idle_slice_timer_body(bfqq); + + return HRTIMER_NORESTART; +} + +static void __bfq_put_async_bfqq(struct bfq_data *bfqd, + struct bfq_queue **bfqq_ptr) +{ + struct bfq_queue *bfqq = *bfqq_ptr; + + bfq_log(bfqd, "put_async_bfqq: %p", bfqq); + if (bfqq) { + bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", + bfqq, bfqq->ref); + bfq_put_queue(bfqq); + *bfqq_ptr = NULL; + } +} + +/* + * Release the extra reference of the async queues as the device + * goes away. + */ +static void bfq_put_async_queues(struct bfq_data *bfqd) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + __bfq_put_async_bfqq(bfqd, &async_bfqq[i][j]); + + __bfq_put_async_bfqq(bfqd, &async_idle_bfqq); +} + +static void bfq_exit_queue(struct elevator_queue *e) +{ + struct bfq_data *bfqd = e->elevator_data; + struct bfq_queue *bfqq, *n; + + hrtimer_cancel(&bfqd->idle_slice_timer); + + spin_lock_irq(&bfqd->lock); + list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) + bfq_deactivate_bfqq(bfqd, bfqq, false); + bfq_put_async_queues(bfqd); + spin_unlock_irq(&bfqd->lock); + + hrtimer_cancel(&bfqd->idle_slice_timer); + + kfree(bfqd); +} + +static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) +{ + struct bfq_data *bfqd; + struct elevator_queue *eq; + int i; + + eq = elevator_alloc(q, e); + if (!eq) + return -ENOMEM; + + bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); + if (!bfqd) { + kobject_put(&eq->kobj); + return -ENOMEM; + } + eq->elevator_data = bfqd; + + /* + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. + * Grab a permanent reference to it, so that the normal code flow + * will not attempt to free it. + */ + bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); + bfqd->oom_bfqq.ref++; + bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; + bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; + bfqd->oom_bfqq.entity.new_weight = + bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); + /* + * Trigger weight initialization, according to ioprio, at the + * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio + * class won't be changed any more. 
+ */ + bfqd->oom_bfqq.entity.prio_changed = 1; + + bfqd->queue = q; + + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + bfqd->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + + hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + bfqd->idle_slice_timer.function = bfq_idle_slice_timer; + + INIT_LIST_HEAD(&bfqd->active_list); + INIT_LIST_HEAD(&bfqd->idle_list); + + bfqd->hw_tag = -1; + + bfqd->bfq_max_budget = bfq_default_max_budget; + + bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; + bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; + bfqd->bfq_back_max = bfq_back_max; + bfqd->bfq_back_penalty = bfq_back_penalty; + bfqd->bfq_slice_idle = bfq_slice_idle; + bfqd->bfq_class_idle_last_service = 0; + bfqd->bfq_timeout = bfq_timeout; + + bfqd->bfq_requests_within_timer = 120; + + spin_lock_init(&bfqd->lock); + INIT_LIST_HEAD(&bfqd->dispatch); + + q->elevator = eq; + + return 0; +} + +static void bfq_slab_kill(void) +{ + kmem_cache_destroy(bfq_pool); +} + +static int __init bfq_slab_setup(void) +{ + bfq_pool = KMEM_CACHE(bfq_queue, 0); + if (!bfq_pool) + return -ENOMEM; + return 0; +} + +static ssize_t bfq_var_show(unsigned int var, char *page) +{ + return sprintf(page, "%u\n", var); +} + +static ssize_t bfq_var_store(unsigned long *var, const char *page, + size_t count) +{ + unsigned long new_val; + int ret = kstrtoul(page, 10, &new_val); + + if (ret == 0) + *var = new_val; + + return count; +} + +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + u64 __data = __VAR; \ + if (__CONV == 1) \ + __data = jiffies_to_msecs(__data); \ + else if (__CONV == 2) \ + __data = div_u64(__data, NSEC_PER_MSEC); \ + return bfq_var_show(__data, (page)); \ +} +SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); +SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); +SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); +SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); +SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); +SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); +SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); +SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); +#undef SHOW_FUNCTION + +#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + u64 __data = __VAR; \ + __data = div_u64(__data, NSEC_PER_USEC); \ + return bfq_var_show(__data, (page)); \ +} +USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); +#undef USEC_SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ +static ssize_t \ +__FUNC(struct elevator_queue *e, const char *page, size_t count) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned long uninitialized_var(__data); \ + int ret = bfq_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + if (__CONV == 1) \ + *(__PTR) = msecs_to_jiffies(__data); \ + else if (__CONV == 2) \ + *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ + else \ + *(__PTR) = __data; \ + return ret; \ +} +STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, + INT_MAX, 2); +STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, + INT_MAX, 2); +STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 
0); +STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, + INT_MAX, 0); +STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); +#undef STORE_FUNCTION + +#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned long uninitialized_var(__data); \ + int ret = bfq_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + *(__PTR) = (u64)__data * NSEC_PER_USEC; \ + return ret; \ +} +USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, + UINT_MAX); +#undef USEC_STORE_FUNCTION + +static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) +{ + u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout); + + if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) + return bfq_calc_max_budget(bfqd->peak_rate, timeout); + else + return bfq_default_max_budget; +} + +static ssize_t bfq_max_budget_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data == 0) + bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + else { + if (__data > INT_MAX) + __data = INT_MAX; + bfqd->bfq_max_budget = __data; + } + + bfqd->bfq_user_max_budget = __data; + + return ret; +} + +/* + * Leaving this name to preserve name compatibility with cfq + * parameters, but this timeout is used for both sync and async. + */ +static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data < 1) + __data = 1; + else if (__data > INT_MAX) + __data = INT_MAX; + + bfqd->bfq_timeout = msecs_to_jiffies(__data); + if (bfqd->bfq_user_max_budget == 0) + bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + + return ret; +} + +static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data > 1) + __data = 1; + if (!bfqd->strict_guarantees && __data == 1 + && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) + bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; + + bfqd->strict_guarantees = __data; + + return ret; +} + +#define BFQ_ATTR(name) \ + __ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store) + +static struct elv_fs_entry bfq_attrs[] = { + BFQ_ATTR(fifo_expire_sync), + BFQ_ATTR(fifo_expire_async), + BFQ_ATTR(back_seek_max), + BFQ_ATTR(back_seek_penalty), + BFQ_ATTR(slice_idle), + BFQ_ATTR(slice_idle_us), + BFQ_ATTR(max_budget), + BFQ_ATTR(timeout_sync), + BFQ_ATTR(strict_guarantees), + __ATTR_NULL +}; + +static struct elevator_type iosched_bfq_mq = { + .ops.mq = { + .get_rq_priv = bfq_get_rq_private, + .put_rq_priv = bfq_put_rq_private, + .exit_icq = bfq_exit_icq, + .insert_requests = bfq_insert_requests, + .dispatch_request = bfq_dispatch_request, + .next_request = elv_rb_latter_request, + .former_request = elv_rb_former_request, + .allow_merge = bfq_allow_bio_merge, + .bio_merge = bfq_bio_merge, + .request_merge = bfq_request_merge, + .requests_merged = bfq_requests_merged, + .request_merged = bfq_request_merged, + .has_work = 
bfq_has_work, + .init_sched = bfq_init_queue, + .exit_sched = bfq_exit_queue, + }, + + .uses_mq = true, + .icq_size = sizeof(struct bfq_io_cq), + .icq_align = __alignof__(struct bfq_io_cq), + .elevator_attrs = bfq_attrs, + .elevator_name = "bfq", + .elevator_owner = THIS_MODULE, +}; + +static int __init bfq_init(void) +{ + int ret; + + ret = -ENOMEM; + if (bfq_slab_setup()) + goto err_pol_unreg; + + ret = elv_register(&iosched_bfq_mq); + if (ret) + goto err_pol_unreg; + + return 0; + +err_pol_unreg: + return ret; +} + +static void __exit bfq_exit(void) +{ + elv_unregister(&iosched_bfq_mq); + bfq_slab_kill(); +} + +module_init(bfq_init); +module_exit(bfq_exit); + +MODULE_AUTHOR("Paolo Valente"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler"); -- cgit v1.2.3 From e21b7a0b988772e82e7147e1c659a5afe2ae003c Mon Sep 17 00:00:00 2001 From: Arianna Avanzini Date: Wed, 12 Apr 2017 18:23:08 +0200 Subject: block, bfq: add full hierarchical scheduling and cgroups support Add complete support for full hierarchical scheduling, with a cgroups interface. Full hierarchical scheduling is implemented through the 'entity' abstraction: both bfq_queues, i.e., the internal BFQ queues associated with processes, and groups are represented in general by entities. Given the bfq_queues associated with the processes belonging to a given group, the entities representing these queues are sons of the entity representing the group. At higher levels, if a group, say G, contains other groups, then the entity representing G is the parent entity of the entities representing the groups in G. Hierarchical scheduling is performed as follows: if the timestamps of a leaf entity (i.e., of a bfq_queue) change, and such a change lets the entity become the next-to-serve entity for its parent entity, then the timestamps of the parent entity are recomputed as a function of the budget of its new next-to-serve leaf entity. If the parent entity belongs, in its turn, to a group, and its new timestamps let it become the next-to-serve for its parent entity, then the timestamps of the latter parent entity are recomputed as well, and so on. When a new bfq_queue must be set in service, the reverse path is followed: the next-to-serve highest-level entity is chosen, then its next-to-serve child entity, and so on, until the next-to-serve leaf entity is reached, and the bfq_queue that this entity represents is set in service. Writeback is accounted for on a per-group basis, i.e., for each group, the async I/O requests of the processes of the group are enqueued in a distinct bfq_queue, and the entity associated with this queue is a child of the entity associated with the group. Weights can be assigned explicitly to groups and processes through the cgroups interface, differently from what happens, for single processes, if the cgroups interface is not used (as explained in the description of the previous patch). In particular, since each node has a full scheduler, each group can be assigned its own weight. 
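To make the propagation described above more concrete, the following is a
minimal, self-contained userspace sketch (illustrative only: toy_entity,
toy_propagate and the numbers are made-up names for this example, not BFQ
code). It shows how a change in a leaf entity's virtual finish time can walk
up the parent chain, updating the next-to-serve choice at each level, and how
the walk stops as soon as a level's choice is unaffected:

#include <stdio.h>

struct toy_entity {
	const char *name;
	struct toy_entity *parent;	/* NULL for the root group */
	unsigned long long finish;	/* virtual finish time */
	struct toy_entity *next_in_service;
};

/* Recompute each ancestor's choice when a child's finish time changes. */
static void toy_propagate(struct toy_entity *child)
{
	struct toy_entity *parent;

	for (parent = child->parent; parent; parent = parent->parent) {
		if (parent->next_in_service &&
		    parent->next_in_service->finish <= child->finish)
			break;	/* no change at this level, stop climbing */
		parent->next_in_service = child;
		/* simplification: inherit the child's finish time */
		parent->finish = child->finish;
		child = parent;
	}
}

int main(void)
{
	struct toy_entity root = { "root",  NULL,  0,   NULL };
	struct toy_entity grp  = { "group", &root, 0,   NULL };
	struct toy_entity q1   = { "bfqq1", &grp,  300, NULL };
	struct toy_entity q2   = { "bfqq2", &grp,  200, NULL };

	toy_propagate(&q1);
	toy_propagate(&q2);	/* earlier finish time, wins at each level */

	printf("group serves %s, root serves %s\n",
	       grp.next_in_service->name, root.next_in_service->name);
	return 0;
}

In BFQ itself the update at each level also recomputes the parent's
timestamps from the budget of its new next-to-serve child, but the early
stop mimicked by the break above is the same idea.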
Signed-off-by: Fabio Checconi Signed-off-by: Paolo Valente Signed-off-by: Arianna Avanzini Signed-off-by: Jens Axboe --- Documentation/block/bfq-iosched.txt | 17 +- block/Kconfig.iosched | 10 + block/bfq-iosched.c | 2568 ++++++++++++++++++++++++++++++----- include/linux/blkdev.h | 2 +- 4 files changed, 2213 insertions(+), 384 deletions(-) (limited to 'Documentation') diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index cbf85f6f1fd8..461b27fce979 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -253,9 +253,14 @@ of slice_idle are copied from CFQ too. per-process ioprio and weight ----------------------------- -Unless the cgroups interface is used, weights can be assigned to -processes only indirectly, through I/O priorities, and according to -the relation: weight = (IOPRIO_BE_NR - ioprio) * 10. +Unless the cgroups interface is used (see "4. BFQ group scheduling"), +weights can be assigned to processes only indirectly, through I/O +priorities, and according to the relation: +weight = (IOPRIO_BE_NR - ioprio) * 10. + +Beware that, if low-latency is set, then BFQ automatically raises the +weight of the queues associated with interactive and soft real-time +applications. Unset this tunable if you need/want to control weights. slice_idle ---------- @@ -450,9 +455,9 @@ may be reactivated for an already busy async queue (in ms). 4. Group scheduling with BFQ ============================ -BFQ supports both cgroup-v1 and cgroup-v2 io controllers, namely blkio -and io. In particular, BFQ supports weight-based proportional -share. +BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely +blkio and io. In particular, BFQ supports weight-based proportional +share. To activate cgroups support, set BFQ_GROUP_IOSCHED. 4-1 Service guarantees provided ------------------------------- diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 6fc36027b70e..fd2cefa47d35 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -40,6 +40,7 @@ config CFQ_GROUP_IOSCHED Enable group IO scheduling in CFQ. choice + prompt "Default I/O scheduler" default DEFAULT_CFQ help @@ -89,6 +90,15 @@ config IOSCHED_BFQ real-time applications. Details in Documentation/block/bfq-iosched.txt +config BFQ_GROUP_IOSCHED + bool "BFQ hierarchical scheduling support" + depends on IOSCHED_BFQ && BLK_CGROUP + default n + ---help--- + + Enable hierarchical scheduling in BFQ, using the blkio + (cgroups-v1) or io (cgroups-v2) controller. + endmenu endif diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c4e7d8db796a..af1740a1d453 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -90,6 +90,7 @@ #include #include #include +#include #include #include #include @@ -114,7 +115,7 @@ #define BFQ_DEFAULT_QUEUE_IOPRIO 4 -#define BFQ_DEFAULT_GRP_WEIGHT 10 +#define BFQ_WEIGHT_LEGACY_DFL 100 #define BFQ_DEFAULT_GRP_IOPRIO 0 #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE @@ -149,10 +150,11 @@ struct bfq_service_tree { * struct bfq_sched_data - multi-class scheduler. * * bfq_sched_data is the basic scheduler queue. It supports three - * ioprio_classes, and can be used either as a toplevel queue or as - * an intermediate queue on a hierarchical setup. - * @next_in_service points to the active entity of the sched_data - * service trees that will be scheduled next. + * ioprio_classes, and can be used either as a toplevel queue or as an + * intermediate queue on a hierarchical setup. 
@next_in_service + * points to the active entity of the sched_data service trees that + * will be scheduled next. It is used to reduce the number of steps + * needed for each hierarchical-schedule update. * * The supported ioprio_classes are the same as in CFQ, in descending * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. @@ -164,19 +166,23 @@ struct bfq_service_tree { struct bfq_sched_data { /* entity in service */ struct bfq_entity *in_service_entity; - /* head-of-the-line entity in the scheduler */ + /* head-of-line entity (see comments above) */ struct bfq_entity *next_in_service; /* array of service trees, one per ioprio_class */ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; + /* last time CLASS_IDLE was served */ + unsigned long bfq_class_idle_last_service; + }; /** * struct bfq_entity - schedulable entity. * - * A bfq_entity is used to represent a bfq_queue (leaf node in the upper - * level scheduler). Each entity belongs to the sched_data of the parent - * group hierarchy. Non-leaf entities have also their own sched_data, - * stored in @my_sched_data. + * A bfq_entity is used to represent either a bfq_queue (leaf node in the + * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each + * entity belongs to the sched_data of the parent group in the cgroup + * hierarchy. Non-leaf entities have also their own sched_data, stored + * in @my_sched_data. * * Each entity stores independently its priority values; this would * allow different weights on different devices, but this @@ -187,23 +193,24 @@ struct bfq_sched_data { * update to take place the effective and the requested priority * values are synchronized. * - * The weight value is calculated from the ioprio to export the same - * interface as CFQ. When dealing with ``well-behaved'' queues (i.e., - * queues that do not spend too much time to consume their budget - * and have true sequential behavior, and when there are no external - * factors breaking anticipation) the relative weights at each level - * of the hierarchy should be guaranteed. All the fields are - * protected by the queue lock of the containing bfqd. + * Unless cgroups are used, the weight value is calculated from the + * ioprio to export the same interface as CFQ. When dealing with + * ``well-behaved'' queues (i.e., queues that do not spend too much + * time to consume their budget and have true sequential behavior, and + * when there are no external factors breaking anticipation) the + * relative weights at each level of the cgroups hierarchy should be + * guaranteed. All the fields are protected by the queue lock of the + * containing bfqd. */ struct bfq_entity { /* service_tree member */ struct rb_node rb_node; /* - * flag, true if the entity is on a tree (either the active or - * the idle one of its service_tree). + * Flag, true if the entity is on a tree (either the active or + * the idle one of its service_tree) or is in service. */ - int on_st; + bool on_st; /* B-WF2Q+ start and finish timestamps [sectors/weight] */ u64 start, finish; @@ -246,6 +253,8 @@ struct bfq_entity { int prio_changed; }; +struct bfq_group; + /** * struct bfq_ttime - per process thinktime stats. */ @@ -265,7 +274,11 @@ struct bfq_ttime { * struct bfq_queue - leaf schedulable entity. * * A bfq_queue is a leaf request queue; it can be associated with an - * io_context or more, if it is async. + * io_context or more, if it is async. 
@cgroup holds a reference to + * the cgroup, to be sure that it does not disappear while a bfqq + * still references it (mostly to avoid races between request issuing + * and task migration followed by cgroup destruction). All the fields + * are protected by the queue lock of the containing bfqd. */ struct bfq_queue { /* reference counter */ @@ -338,6 +351,9 @@ struct bfq_io_cq { struct bfq_queue *bfqq[2]; /* per (request_queue, blkcg) ioprio */ int ioprio; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + uint64_t blkcg_serial_nr; /* the current blkcg serial */ +#endif }; /** @@ -351,8 +367,8 @@ struct bfq_data { /* dispatch queue */ struct list_head dispatch; - /* root @bfq_sched_data for the device */ - struct bfq_sched_data sched_data; + /* root bfq_group for the device */ + struct bfq_group *root_group; /* * Number of bfq_queues containing requests (including the @@ -423,8 +439,6 @@ struct bfq_data { unsigned int bfq_back_max; /* maximum idling time */ u32 bfq_slice_idle; - /* last time CLASS_IDLE was served */ - u64 bfq_class_idle_last_service; /* user-configured max budget value (0 for auto-tuning) */ int bfq_user_max_budget; @@ -516,8 +530,35 @@ BFQ_BFQQ_FNS(IO_bound); #undef BFQ_BFQQ_FNS /* Logging facilities. */ -#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) +#ifdef CONFIG_BFQ_GROUP_IOSCHED +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); +static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ + __pbuf, ##args); \ +} while (0) + +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ +} while (0) + +#else /* CONFIG_BFQ_GROUP_IOSCHED */ + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ + ##args) +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) + +#endif /* CONFIG_BFQ_GROUP_IOSCHED */ #define bfq_log(bfqd, fmt, args...) \ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) @@ -534,15 +575,120 @@ enum bfqq_expiration { BFQQE_PREEMPTED /* preemption in progress */ }; +struct bfqg_stats { +#ifdef CONFIG_BFQ_GROUP_IOSCHED + /* number of ios merged */ + struct blkg_rwstat merged; + /* total time spent on device in ns, may not be accurate w/ queueing */ + struct blkg_rwstat service_time; + /* total time spent waiting in scheduler queue in ns */ + struct blkg_rwstat wait_time; + /* number of IOs queued up */ + struct blkg_rwstat queued; + /* total disk time and nr sectors dispatched by this group */ + struct blkg_stat time; + /* sum of number of ios queued across all samples */ + struct blkg_stat avg_queue_size_sum; + /* count of samples taken for average */ + struct blkg_stat avg_queue_size_samples; + /* how many times this group has been removed from service tree */ + struct blkg_stat dequeue; + /* total time spent waiting for it to be assigned a timeslice. 
*/ + struct blkg_stat group_wait_time; + /* time spent idling for this blkcg_gq */ + struct blkg_stat idle_time; + /* total time with empty current active q with other requests queued */ + struct blkg_stat empty_time; + /* fields after this shouldn't be cleared on stat reset */ + uint64_t start_group_wait_time; + uint64_t start_idle_time; + uint64_t start_empty_time; + uint16_t flags; +#endif /* CONFIG_BFQ_GROUP_IOSCHED */ +}; + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + +/* + * struct bfq_group_data - per-blkcg storage for the blkio subsystem. + * + * @ps: @blkcg_policy_storage that this structure inherits + * @weight: weight of the bfq_group + */ +struct bfq_group_data { + /* must be the first member */ + struct blkcg_policy_data pd; + + unsigned short weight; +}; + +/** + * struct bfq_group - per (device, cgroup) data structure. + * @entity: schedulable entity to insert into the parent group sched_data. + * @sched_data: own sched_data, to contain child entities (they may be + * both bfq_queues and bfq_groups). + * @bfqd: the bfq_data for the device this group acts upon. + * @async_bfqq: array of async queues for all the tasks belonging to + * the group, one queue per ioprio value per ioprio_class, + * except for the idle class that has only one queue. + * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). + * @my_entity: pointer to @entity, %NULL for the toplevel group; used + * to avoid too many special cases during group creation/ + * migration. + * @stats: stats for this bfqg. + * + * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup + * there is a set of bfq_groups, each one collecting the lower-level + * entities belonging to the group that are acting on the same device. + * + * Locking works as follows: + * o @bfqd is protected by the queue lock, RCU is used to access it + * from the readers. + * o All the other fields are protected by the @bfqd queue lock. + */ +struct bfq_group { + /* must be the first member */ + struct blkg_policy_data pd; + + struct bfq_entity entity; + struct bfq_sched_data sched_data; + + void *bfqd; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; + + struct bfq_entity *my_entity; + + struct bfqg_stats stats; +}; + +#else +struct bfq_group { + struct bfq_sched_data sched_data; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; + + struct rb_root rq_pos_tree; +}; +#endif + static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); +static unsigned int bfq_class_idx(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + return bfqq ? bfqq->ioprio_class - 1 : + BFQ_DEFAULT_GRP_CLASS - 1; +} + static struct bfq_service_tree * bfq_entity_service_tree(struct bfq_entity *entity) { struct bfq_sched_data *sched_data = entity->sched_data; - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - unsigned int idx = bfqq ? bfqq->ioprio_class - 1 : - BFQ_DEFAULT_GRP_CLASS - 1; + unsigned int idx = bfq_class_idx(entity); return sched_data->service_tree + idx; } @@ -568,16 +714,9 @@ static void bfq_put_queue(struct bfq_queue *bfqq); static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bio *bio, bool is_sync, struct bfq_io_cq *bic); +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); -/* - * Array of async queues for all the processes, one queue - * per ioprio value per ioprio_class. 
- */ -struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -/* Async queue for the idle class (ioprio is ignored) */ -struct bfq_queue *async_idle_bfqq; - /* Expiration time of sync (0) and async (1) requests, in ns. */ static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; @@ -663,30 +802,222 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, } /* - * Next two macros are just fake loops for the moment. They will - * become true loops in the cgroups-enabled variant of the code. Such - * a variant, in its turn, will be introduced by next commit. + * Scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing. + */ +static void bfq_schedule_dispatch(struct bfq_data *bfqd) +{ + if (bfqd->queued != 0) { + bfq_log(bfqd, "schedule dispatch"); + blk_mq_run_hw_queues(bfqd->queue, true); + } +} + +/** + * bfq_gt - compare two timestamps. + * @a: first ts. + * @b: second ts. + * + * Return @a > @b, dealing with wrapping correctly. + */ +static int bfq_gt(u64 a, u64 b) +{ + return (s64)(a - b) > 0; +} + +static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) +{ + struct rb_node *node = tree->rb_node; + + return rb_entry(node, struct bfq_entity, rb_node); +} + +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd); + +static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); + +/** + * bfq_update_next_in_service - update sd->next_in_service + * @sd: sched_data for which to perform the update. + * @new_entity: if not NULL, pointer to the entity whose activation, + * requeueing or repositionig triggered the invocation of + * this function. + * + * This function is called to update sd->next_in_service, which, in + * its turn, may change as a consequence of the insertion or + * extraction of an entity into/from one of the active trees of + * sd. These insertions/extractions occur as a consequence of + * activations/deactivations of entities, with some activations being + * 'true' activations, and other activations being requeueings (i.e., + * implementing the second, requeueing phase of the mechanism used to + * reposition an entity in its active tree; see comments on + * __bfq_activate_entity and __bfq_requeue_entity for details). In + * both the last two activation sub-cases, new_entity points to the + * just activated or requeued entity. + * + * Returns true if sd->next_in_service changes in such a way that + * entity->parent may become the next_in_service for its parent + * entity. */ +static bool bfq_update_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *new_entity) +{ + struct bfq_entity *next_in_service = sd->next_in_service; + bool parent_sched_may_change = false; + + /* + * If this update is triggered by the activation, requeueing + * or repositiong of an entity that does not coincide with + * sd->next_in_service, then a full lookup in the active tree + * can be avoided. In fact, it is enough to check whether the + * just-modified entity has a higher priority than + * sd->next_in_service, or, even if it has the same priority + * as sd->next_in_service, is eligible and has a lower virtual + * finish time than sd->next_in_service. If this compound + * condition holds, then the new entity becomes the new + * next_in_service. Otherwise no change is needed. + */ + if (new_entity && new_entity != sd->next_in_service) { + /* + * Flag used to decide whether to replace + * sd->next_in_service with new_entity. 
Tentatively + * set to true, and left as true if + * sd->next_in_service is NULL. + */ + bool replace_next = true; + + /* + * If there is already a next_in_service candidate + * entity, then compare class priorities or timestamps + * to decide whether to replace sd->service_tree with + * new_entity. + */ + if (next_in_service) { + unsigned int new_entity_class_idx = + bfq_class_idx(new_entity); + struct bfq_service_tree *st = + sd->service_tree + new_entity_class_idx; + + /* + * For efficiency, evaluate the most likely + * sub-condition first. + */ + replace_next = + (new_entity_class_idx == + bfq_class_idx(next_in_service) + && + !bfq_gt(new_entity->start, st->vtime) + && + bfq_gt(next_in_service->finish, + new_entity->finish)) + || + new_entity_class_idx < + bfq_class_idx(next_in_service); + } + + if (replace_next) + next_in_service = new_entity; + } else /* invoked because of a deactivation: lookup needed */ + next_in_service = bfq_lookup_next_entity(sd); + + if (next_in_service) { + parent_sched_may_change = !sd->next_in_service || + bfq_update_parent_budget(next_in_service); + } + + sd->next_in_service = next_in_service; + + if (!next_in_service) + return parent_sched_may_change; + + return parent_sched_may_change; +} + +#ifdef CONFIG_BFQ_GROUP_IOSCHED +/* both next loops stop at one of the child entities of the root group */ #define for_each_entity(entity) \ - for (; entity ; entity = NULL) + for (; entity ; entity = entity->parent) +/* + * For each iteration, compute parent in advance, so as to be safe if + * entity is deallocated during the iteration. Such a deallocation may + * happen as a consequence of a bfq_put_queue that frees the bfq_queue + * containing entity. + */ #define for_each_entity_safe(entity, parent) \ - for (parent = NULL; entity ; entity = parent) + for (; entity && ({ parent = entity->parent; 1; }); entity = parent) -static int bfq_update_next_in_service(struct bfq_sched_data *sd) +/* + * Returns true if this budget changes may let next_in_service->parent + * become the next_in_service entity for its parent entity. + */ +static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) { - return 0; + struct bfq_entity *bfqg_entity; + struct bfq_group *bfqg; + struct bfq_sched_data *group_sd; + bool ret = false; + + group_sd = next_in_service->sched_data; + + bfqg = container_of(group_sd, struct bfq_group, sched_data); + /* + * bfq_group's my_entity field is not NULL only if the group + * is not the root group. We must not touch the root entity + * as it must never become an in-service entity. + */ + bfqg_entity = bfqg->my_entity; + if (bfqg_entity) { + if (bfqg_entity->budget > next_in_service->budget) + ret = true; + bfqg_entity->budget = next_in_service->budget; + } + + return ret; +} + +/* + * This function tells whether entity stops being a candidate for next + * service, according to the following logic. + * + * This function is invoked for an entity that is about to be set in + * service. If such an entity is a queue, then the entity is no longer + * a candidate for next service (i.e, a candidate entity to serve + * after the in-service entity is expired). The function then returns + * true. + */ +static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) +{ + if (bfq_entity_to_bfqq(entity)) + return true; + + return false; } -static void bfq_check_next_in_service(struct bfq_sched_data *sd, - struct bfq_entity *entity) +#else /* CONFIG_BFQ_GROUP_IOSCHED */ +/* + * Next two macros are fake loops when cgroups support is not + * enabled. 
I fact, in such a case, there is only one level to go up + * (to reach the root group). + */ +#define for_each_entity(entity) \ + for (; entity ; entity = NULL) + +#define for_each_entity_safe(entity, parent) \ + for (parent = NULL; entity ; entity = parent) + +static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) { + return false; } -static void bfq_update_budget(struct bfq_entity *next_in_service) +static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) { + return true; } +#endif /* CONFIG_BFQ_GROUP_IOSCHED */ + /* * Shift for timestamp calculations. This actually limits the maximum * service allowed in one timestamp delta (small shift values increase it), @@ -696,18 +1027,6 @@ static void bfq_update_budget(struct bfq_entity *next_in_service) */ #define WFQ_SERVICE_SHIFT 22 -/** - * bfq_gt - compare two timestamps. - * @a: first ts. - * @b: second ts. - * - * Return @a > @b, dealing with wrapping correctly. - */ -static int bfq_gt(u64 a, u64 b) -{ - return (s64)(a - b) > 0; -} - static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) { struct bfq_queue *bfqq = NULL; @@ -926,6 +1245,11 @@ static void bfq_active_insert(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct rb_node *node = &entity->rb_node; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + struct bfq_sched_data *sd = NULL; + struct bfq_group *bfqg = NULL; + struct bfq_data *bfqd = NULL; +#endif bfq_insert(&st->active, entity); @@ -936,6 +1260,11 @@ static void bfq_active_insert(struct bfq_service_tree *st, bfq_update_active_tree(node); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + sd = entity->sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + bfqd = (struct bfq_data *)bfqg->bfqd; +#endif if (bfqq) list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); } @@ -1014,6 +1343,11 @@ static void bfq_active_extract(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct rb_node *node; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + struct bfq_sched_data *sd = NULL; + struct bfq_group *bfqg = NULL; + struct bfq_data *bfqd = NULL; +#endif node = bfq_find_deepest(&entity->rb_node); bfq_extract(&st->active, entity); @@ -1021,6 +1355,11 @@ static void bfq_active_extract(struct bfq_service_tree *st, if (node) bfq_update_active_tree(node); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + sd = entity->sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + bfqd = (struct bfq_data *)bfqg->bfqd; +#endif if (bfqq) list_del(&bfqq->bfqq_list); } @@ -1069,7 +1408,7 @@ static void bfq_forget_entity(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - entity->on_st = 0; + entity->on_st = false; st->wsum -= entity->weight; if (bfqq && !is_in_service) bfq_put_queue(bfqq); @@ -1115,7 +1454,7 @@ static void bfq_forget_idle(struct bfq_service_tree *st) static struct bfq_service_tree * __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - struct bfq_entity *entity) + struct bfq_entity *entity) { struct bfq_service_tree *new_st = old_st; @@ -1123,9 +1462,20 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); unsigned short prev_weight, new_weight; struct bfq_data *bfqd = NULL; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + struct bfq_sched_data *sd; + struct bfq_group *bfqg; +#endif if (bfqq) bfqd = bfqq->bfqd; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + sd = entity->my_sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + bfqd = (struct 
bfq_data *)bfqg->bfqd; + } +#endif old_st->wsum -= entity->weight; @@ -1171,6 +1521,9 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, return new_st; } +static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); + /** * bfq_bfqq_served - update the scheduler status after selection for * service. @@ -1194,6 +1547,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) st->vtime += bfq_delta(served, st->wsum); bfq_forget_idle(st); } + bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); } @@ -1216,78 +1570,10 @@ static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) bfq_bfqq_served(bfqq, entity->budget - entity->service); } -/** - * __bfq_activate_entity - activate an entity. - * @entity: the entity being activated. - * @non_blocking_wait_rq: true if this entity was waiting for a request - * - * Called whenever an entity is activated, i.e., it is not active and one - * of its children receives a new request, or has to be reactivated due to - * budget exhaustion. It uses the current budget of the entity (and the - * service received if @entity is active) of the queue to calculate its - * timestamps. - */ -static void __bfq_activate_entity(struct bfq_entity *entity, - bool non_blocking_wait_rq) +static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, + struct bfq_service_tree *st, + bool backshifted) { - struct bfq_sched_data *sd = entity->sched_data; - struct bfq_service_tree *st = bfq_entity_service_tree(entity); - bool backshifted = false; - - if (entity == sd->in_service_entity) { - /* - * If we are requeueing the current entity we have - * to take care of not charging to it service it has - * not received. - */ - bfq_calc_finish(entity, entity->service); - entity->start = entity->finish; - sd->in_service_entity = NULL; - } else if (entity->tree == &st->active) { - /* - * Requeueing an entity due to a change of some - * next_in_service entity below it. We reuse the - * old start time. - */ - bfq_active_extract(st, entity); - } else { - unsigned long long min_vstart; - - /* See comments on bfq_fqq_update_budg_for_activation */ - if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { - backshifted = true; - min_vstart = entity->finish; - } else - min_vstart = st->vtime; - - if (entity->tree == &st->idle) { - /* - * Must be on the idle tree, bfq_idle_extract() will - * check for that. - */ - bfq_idle_extract(st, entity); - entity->start = bfq_gt(min_vstart, entity->finish) ? - min_vstart : entity->finish; - } else { - /* - * The finish time of the entity may be invalid, and - * it is in the past for sure, otherwise the queue - * would have been on the idle tree. - */ - entity->start = min_vstart; - st->wsum += entity->weight; - /* - * entity is about to be inserted into a service tree, - * and then set in service: get a reference to make - * sure entity does not disappear until it is no - * longer in service or scheduled for service. - */ - bfq_get_entity(entity); - - entity->on_st = 1; - } - } - st = __bfq_entity_update_weight_prio(st, entity); bfq_calc_finish(entity, entity->budget); @@ -1329,27 +1615,185 @@ static void __bfq_activate_entity(struct bfq_entity *entity, } /** - * bfq_activate_entity - activate an entity and its ancestors if necessary. + * __bfq_activate_entity - handle activation of entity. + * @entity: the entity being activated. 
+ * @non_blocking_wait_rq: true if entity was waiting for a request + * + * Called for a 'true' activation, i.e., if entity is not active and + * one of its children receives a new request. + * + * Basically, this function updates the timestamps of entity and + * inserts entity into its active tree, ater possible extracting it + * from its idle tree. + */ +static void __bfq_activate_entity(struct bfq_entity *entity, + bool non_blocking_wait_rq) +{ + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + bool backshifted = false; + unsigned long long min_vstart; + + /* See comments on bfq_fqq_update_budg_for_activation */ + if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { + backshifted = true; + min_vstart = entity->finish; + } else + min_vstart = st->vtime; + + if (entity->tree == &st->idle) { + /* + * Must be on the idle tree, bfq_idle_extract() will + * check for that. + */ + bfq_idle_extract(st, entity); + entity->start = bfq_gt(min_vstart, entity->finish) ? + min_vstart : entity->finish; + } else { + /* + * The finish time of the entity may be invalid, and + * it is in the past for sure, otherwise the queue + * would have been on the idle tree. + */ + entity->start = min_vstart; + st->wsum += entity->weight; + /* + * entity is about to be inserted into a service tree, + * and then set in service: get a reference to make + * sure entity does not disappear until it is no + * longer in service or scheduled for service. + */ + bfq_get_entity(entity); + + entity->on_st = true; + } + + bfq_update_fin_time_enqueue(entity, st, backshifted); +} + +/** + * __bfq_requeue_entity - handle requeueing or repositioning of an entity. + * @entity: the entity being requeued or repositioned. + * + * Requeueing is needed if this entity stops being served, which + * happens if a leaf descendant entity has expired. On the other hand, + * repositioning is needed if the next_inservice_entity for the child + * entity has changed. See the comments inside the function for + * details. + * + * Basically, this function: 1) removes entity from its active tree if + * present there, 2) updates the timestamps of entity and 3) inserts + * entity back into its active tree (in the new, right position for + * the new values of the timestamps). + */ +static void __bfq_requeue_entity(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + + if (entity == sd->in_service_entity) { + /* + * We are requeueing the current in-service entity, + * which may have to be done for one of the following + * reasons: + * - entity represents the in-service queue, and the + * in-service queue is being requeued after an + * expiration; + * - entity represents a group, and its budget has + * changed because one of its child entities has + * just been either activated or requeued for some + * reason; the timestamps of the entity need then to + * be updated, and the entity needs to be enqueued + * or repositioned accordingly. + * + * In particular, before requeueing, the start time of + * the entity must be moved forward to account for the + * service that the entity has received while in + * service. This is done by the next instructions. The + * finish time will then be updated according to this + * new value of the start time, and to the budget of + * the entity. 
+ */ + bfq_calc_finish(entity, entity->service); + entity->start = entity->finish; + /* + * In addition, if the entity had more than one child + * when set in service, then was not extracted from + * the active tree. This implies that the position of + * the entity in the active tree may need to be + * changed now, because we have just updated the start + * time of the entity, and we will update its finish + * time in a moment (the requeueing is then, more + * precisely, a repositioning in this case). To + * implement this repositioning, we: 1) dequeue the + * entity here, 2) update the finish time and + * requeue the entity according to the new + * timestamps below. + */ + if (entity->tree) + bfq_active_extract(st, entity); + } else { /* The entity is already active, and not in service */ + /* + * In this case, this function gets called only if the + * next_in_service entity below this entity has + * changed, and this change has caused the budget of + * this entity to change, which, finally implies that + * the finish time of this entity must be + * updated. Such an update may cause the scheduling, + * i.e., the position in the active tree, of this + * entity to change. We handle this change by: 1) + * dequeueing the entity here, 2) updating the finish + * time and requeueing the entity according to the new + * timestamps below. This is the same approach as the + * non-extracted-entity sub-case above. + */ + bfq_active_extract(st, entity); + } + + bfq_update_fin_time_enqueue(entity, st, false); +} + +static void __bfq_activate_requeue_entity(struct bfq_entity *entity, + struct bfq_sched_data *sd, + bool non_blocking_wait_rq) +{ + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + + if (sd->in_service_entity == entity || entity->tree == &st->active) + /* + * in service or already queued on the active tree, + * requeue or reposition + */ + __bfq_requeue_entity(entity); + else + /* + * Not in service and not queued on its active tree: + * the activity is idle and this is a true activation. + */ + __bfq_activate_entity(entity, non_blocking_wait_rq); +} + + +/** + * bfq_activate_entity - activate or requeue an entity representing a bfq_queue, + * and activate, requeue or reposition all ancestors + * for which such an update becomes necessary. * @entity: the entity to activate. * @non_blocking_wait_rq: true if this entity was waiting for a request - * - * Activate @entity and all the entities on the path from it to the root. + * @requeue: true if this is a requeue, which implies that bfqq is + * being expired; thus ALL its ancestors stop being served and must + * therefore be requeued */ -static void bfq_activate_entity(struct bfq_entity *entity, - bool non_blocking_wait_rq) +static void bfq_activate_requeue_entity(struct bfq_entity *entity, + bool non_blocking_wait_rq, + bool requeue) { struct bfq_sched_data *sd; for_each_entity(entity) { - __bfq_activate_entity(entity, non_blocking_wait_rq); - sd = entity->sched_data; - if (!bfq_update_next_in_service(sd)) - /* - * No need to propagate the activation to the - * upper entities, as they will be updated when - * the in-service entity is rescheduled. - */ + __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); + + if (!bfq_update_next_in_service(sd, entity) && !requeue) break; } } @@ -1357,52 +1801,48 @@ static void bfq_activate_entity(struct bfq_entity *entity, /** * __bfq_deactivate_entity - deactivate an entity from its service tree. * @entity: the entity to deactivate. 
- * @requeue: if false, the entity will not be put into the idle tree. + * @ins_into_idle_tree: if false, the entity will not be put into the + * idle tree. * - * Deactivate an entity, independently from its previous state. If the - * entity was not on a service tree just return, otherwise if it is on - * any scheduler tree, extract it from that tree, and if necessary - * and if the caller did not specify @requeue, put it on the idle tree. - * - * Return %1 if the caller should update the entity hierarchy, i.e., - * if the entity was in service or if it was the next_in_service for - * its sched_data; return %0 otherwise. + * Deactivates an entity, independently from its previous state. Must + * be invoked only if entity is on a service tree. Extracts the entity + * from that tree, and if necessary and allowed, puts it on the idle + * tree. */ -static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +static bool __bfq_deactivate_entity(struct bfq_entity *entity, + bool ins_into_idle_tree) { struct bfq_sched_data *sd = entity->sched_data; struct bfq_service_tree *st = bfq_entity_service_tree(entity); int is_in_service = entity == sd->in_service_entity; - int ret = 0; - if (!entity->on_st) - return 0; + if (!entity->on_st) /* entity never activated, or already inactive */ + return false; - if (is_in_service) { + if (is_in_service) bfq_calc_finish(entity, entity->service); - sd->in_service_entity = NULL; - } else if (entity->tree == &st->active) + + if (entity->tree == &st->active) bfq_active_extract(st, entity); - else if (entity->tree == &st->idle) + else if (!is_in_service && entity->tree == &st->idle) bfq_idle_extract(st, entity); - if (is_in_service || sd->next_in_service == entity) - ret = bfq_update_next_in_service(sd); - - if (!requeue || !bfq_gt(entity->finish, st->vtime)) + if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime)) bfq_forget_entity(st, entity, is_in_service); else bfq_idle_insert(st, entity); - return ret; + return true; } /** - * bfq_deactivate_entity - deactivate an entity. + * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. * @entity: the entity to deactivate. - * @requeue: true if the entity can be put on the idle tree + * @ins_into_idle_tree: true if the entity can be put on the idle tree */ -static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +static void bfq_deactivate_entity(struct bfq_entity *entity, + bool ins_into_idle_tree, + bool expiration) { struct bfq_sched_data *sd; struct bfq_entity *parent = NULL; @@ -1410,63 +1850,102 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) for_each_entity_safe(entity, parent) { sd = entity->sched_data; - if (!__bfq_deactivate_entity(entity, requeue)) + if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) { /* - * The parent entity is still backlogged, and - * we don't need to update it as it is still - * in service. + * entity is not in any tree any more, so + * this deactivation is a no-op, and there is + * nothing to change for upper-level entities + * (in case of expiration, this can never + * happen). */ - break; + return; + } + + if (sd->next_in_service == entity) + /* + * entity was the next_in_service entity, + * then, since entity has just been + * deactivated, a new one must be found. + */ + bfq_update_next_in_service(sd, NULL); if (sd->next_in_service) /* - * The parent entity is still backlogged and - * the budgets on the path towards the root - * need to be updated. 
+ * The parent entity is still backlogged, + * because next_in_service is not NULL. So, no + * further upwards deactivation must be + * performed. Yet, next_in_service has + * changed. Then the schedule does need to be + * updated upwards. */ - goto update; + break; /* - * If we get here, then the parent is no more backlogged and - * we want to propagate the deactivation upwards. + * If we get here, then the parent is no more + * backlogged and we need to propagate the + * deactivation upwards. Thus let the loop go on. */ - requeue = 1; - } - return; + /* + * Also let parent be queued into the idle tree on + * deactivation, to preserve service guarantees, and + * assuming that who invoked this function does not + * need parent entities too to be removed completely. + */ + ins_into_idle_tree = true; + } -update: + /* + * If the deactivation loop is fully executed, then there are + * no more entities to touch and next loop is not executed at + * all. Otherwise, requeue remaining entities if they are + * about to stop receiving service, or reposition them if this + * is not the case. + */ entity = parent; for_each_entity(entity) { - __bfq_activate_entity(entity, false); + /* + * Invoke __bfq_requeue_entity on entity, even if + * already active, to requeue/reposition it in the + * active tree (because sd->next_in_service has + * changed) + */ + __bfq_requeue_entity(entity); sd = entity->sched_data; - if (!bfq_update_next_in_service(sd)) + if (!bfq_update_next_in_service(sd, entity) && + !expiration) + /* + * next_in_service unchanged or not causing + * any change in entity->parent->sd, and no + * requeueing needed for expiration: stop + * here. + */ break; } } /** - * bfq_update_vtime - update vtime if necessary. + * bfq_calc_vtime_jump - compute the value to which the vtime should jump, + * if needed, to have at least one entity eligible. * @st: the service tree to act upon. * - * If necessary update the service tree vtime to have at least one - * eligible entity, skipping to its start time. Assumes that the - * active tree of the device is not empty. - * - * NOTE: this hierarchical implementation updates vtimes quite often, - * we may end up with reactivated processes getting timestamps after a - * vtime skip done because we needed a ->first_active entity on some - * intermediate node. + * Assumes that st is not empty. */ -static void bfq_update_vtime(struct bfq_service_tree *st) +static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) { - struct bfq_entity *entry; - struct rb_node *node = st->active.rb_node; + struct bfq_entity *root_entity = bfq_root_active_entity(&st->active); + + if (bfq_gt(root_entity->min_start, st->vtime)) + return root_entity->min_start; + + return st->vtime; +} - entry = rb_entry(node, struct bfq_entity, rb_node); - if (bfq_gt(entry->min_start, st->vtime)) { - st->vtime = entry->min_start; +static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value) +{ + if (new_value > st->vtime) { + st->vtime = new_value; bfq_forget_idle(st); } } @@ -1475,6 +1954,7 @@ static void bfq_update_vtime(struct bfq_service_tree *st) * bfq_first_active_entity - find the eligible entity with * the smallest finish time * @st: the service tree to select from. 
+ * @vtime: the system virtual to use as a reference for eligibility * * This function searches the first schedulable entity, starting from the * root of the tree and going on the left every time on this side there is @@ -1482,7 +1962,8 @@ static void bfq_update_vtime(struct bfq_service_tree *st) * the right is followed only if a) the left subtree contains no eligible * entities and b) no eligible entity has been found yet. */ -static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) +static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, + u64 vtime) { struct bfq_entity *entry, *first = NULL; struct rb_node *node = st->active.rb_node; @@ -1490,13 +1971,13 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) while (node) { entry = rb_entry(node, struct bfq_entity, rb_node); left: - if (!bfq_gt(entry->start, st->vtime)) + if (!bfq_gt(entry->start, vtime)) first = entry; if (node->rb_left) { entry = rb_entry(node->rb_left, struct bfq_entity, rb_node); - if (!bfq_gt(entry->min_start, st->vtime)) { + if (!bfq_gt(entry->min_start, vtime)) { node = node->rb_left; goto left; } @@ -1506,222 +1987,1429 @@ left: node = node->rb_right; } - return first; + return first; +} + +/** + * __bfq_lookup_next_entity - return the first eligible entity in @st. + * @st: the service tree. + * + * If there is no in-service entity for the sched_data st belongs to, + * then return the entity that will be set in service if: + * 1) the parent entity this st belongs to is set in service; + * 2) no entity belonging to such parent entity undergoes a state change + * that would influence the timestamps of the entity (e.g., becomes idle, + * becomes backlogged, changes its budget, ...). + * + * In this first case, update the virtual time in @st too (see the + * comments on this update inside the function). + * + * In constrast, if there is an in-service entity, then return the + * entity that would be set in service if not only the above + * conditions, but also the next one held true: the currently + * in-service entity, on expiration, + * 1) gets a finish time equal to the current one, or + * 2) is not eligible any more, or + * 3) is idle. + */ +static struct bfq_entity * +__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) +{ + struct bfq_entity *entity; + u64 new_vtime; + + if (RB_EMPTY_ROOT(&st->active)) + return NULL; + + /* + * Get the value of the system virtual time for which at + * least one entity is eligible. + */ + new_vtime = bfq_calc_vtime_jump(st); + + /* + * If there is no in-service entity for the sched_data this + * active tree belongs to, then push the system virtual time + * up to the value that guarantees that at least one entity is + * eligible. If, instead, there is an in-service entity, then + * do not make any such update, because there is already an + * eligible entity, namely the in-service one (even if the + * entity is not on st, because it was extracted when set in + * service). + */ + if (!in_service) + bfq_update_vtime(st, new_vtime); + + entity = bfq_first_active_entity(st, new_vtime); + + return entity; +} + +/** + * bfq_lookup_next_entity - return the first eligible entity in @sd. + * @sd: the sched_data. + * + * This function is invoked when there has been a change in the trees + * for sd, and we need know what is the new next entity after this + * change. 
+ */ +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) +{ + struct bfq_service_tree *st = sd->service_tree; + struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); + struct bfq_entity *entity = NULL; + int class_idx = 0; + + /* + * Choose from idle class, if needed to guarantee a minimum + * bandwidth to this class (and if there is some active entity + * in idle class). This should also mitigate + * priority-inversion problems in case a low priority task is + * holding file system resources. + */ + if (time_is_before_jiffies(sd->bfq_class_idle_last_service + + BFQ_CL_IDLE_TIMEOUT)) { + if (!RB_EMPTY_ROOT(&idle_class_st->active)) + class_idx = BFQ_IOPRIO_CLASSES - 1; + /* About to be served if backlogged, or not yet backlogged */ + sd->bfq_class_idle_last_service = jiffies; + } + + /* + * Find the next entity to serve for the highest-priority + * class, unless the idle class needs to be served. + */ + for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { + entity = __bfq_lookup_next_entity(st + class_idx, + sd->in_service_entity); + + if (entity) + break; + } + + if (!entity) + return NULL; + + return entity; +} + +static bool next_queue_may_preempt(struct bfq_data *bfqd) +{ + struct bfq_sched_data *sd = &bfqd->root_group->sched_data; + + return sd->next_in_service != sd->in_service_entity; +} + +/* + * Get next queue for service. + */ +static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) +{ + struct bfq_entity *entity = NULL; + struct bfq_sched_data *sd; + struct bfq_queue *bfqq; + + if (bfqd->busy_queues == 0) + return NULL; + + /* + * Traverse the path from the root to the leaf entity to + * serve. Set in service all the entities visited along the + * way. + */ + sd = &bfqd->root_group->sched_data; + for (; sd ; sd = entity->my_sched_data) { + /* + * WARNING. We are about to set the in-service entity + * to sd->next_in_service, i.e., to the (cached) value + * returned by bfq_lookup_next_entity(sd) the last + * time it was invoked, i.e., the last time when the + * service order in sd changed as a consequence of the + * activation or deactivation of an entity. In this + * respect, if we execute bfq_lookup_next_entity(sd) + * in this very moment, it may, although with low + * probability, yield a different entity than that + * pointed to by sd->next_in_service. This rare event + * happens in case there was no CLASS_IDLE entity to + * serve for sd when bfq_lookup_next_entity(sd) was + * invoked for the last time, while there is now one + * such entity. + * + * If the above event happens, then the scheduling of + * such entity in CLASS_IDLE is postponed until the + * service of the sd->next_in_service entity + * finishes. In fact, when the latter is expired, + * bfq_lookup_next_entity(sd) gets called again, + * exactly to update sd->next_in_service. + */ + + /* Make next_in_service entity become in_service_entity */ + entity = sd->next_in_service; + sd->in_service_entity = entity; + + /* + * Reset the accumulator of the amount of service that + * the entity is about to receive. + */ + entity->service = 0; + + /* + * If entity is no longer a candidate for next + * service, then we extract it from its active tree, + * for the following reason. To further boost the + * throughput in some special case, BFQ needs to know + * which is the next candidate entity to serve, while + * there is already an entity in service. 
In this + * respect, to make it easy to compute/update the next + * candidate entity to serve after the current + * candidate has been set in service, there is a case + * where it is necessary to extract the current + * candidate from its service tree. Such a case is + * when the entity just set in service cannot be also + * a candidate for next service. Details about when + * this conditions holds are reported in the comments + * on the function bfq_no_longer_next_in_service() + * invoked below. + */ + if (bfq_no_longer_next_in_service(entity)) + bfq_active_extract(bfq_entity_service_tree(entity), + entity); + + /* + * For the same reason why we may have just extracted + * entity from its active tree, we may need to update + * next_in_service for the sched_data of entity too, + * regardless of whether entity has been extracted. + * In fact, even if entity has not been extracted, a + * descendant entity may get extracted. Such an event + * would cause a change in next_in_service for the + * level of the descendant entity, and thus possibly + * back to upper levels. + * + * We cannot perform the resulting needed update + * before the end of this loop, because, to know which + * is the correct next-to-serve candidate entity for + * each level, we need first to find the leaf entity + * to set in service. In fact, only after we know + * which is the next-to-serve leaf entity, we can + * discover whether the parent entity of the leaf + * entity becomes the next-to-serve, and so on. + */ + + } + + bfqq = bfq_entity_to_bfqq(entity); + + /* + * We can finally update all next-to-serve entities along the + * path from the leaf entity just set in service to the root. + */ + for_each_entity(entity) { + struct bfq_sched_data *sd = entity->sched_data; + + if (!bfq_update_next_in_service(sd, NULL)) + break; + } + + return bfqq; +} + +static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) +{ + struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; + struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; + struct bfq_entity *entity = in_serv_entity; + + if (bfqd->in_service_bic) { + put_io_context(bfqd->in_service_bic->icq.ioc); + bfqd->in_service_bic = NULL; + } + + bfq_clear_bfqq_wait_request(in_serv_bfqq); + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); + bfqd->in_service_queue = NULL; + + /* + * When this function is called, all in-service entities have + * been properly deactivated or requeued, so we can safely + * execute the final step: reset in_service_entity along the + * path from entity to the root. + */ + for_each_entity(entity) + entity->sched_data->in_service_entity = NULL; + + /* + * in_serv_entity is no longer in service, so, if it is in no + * service tree either, then release the service reference to + * the queue it represents (taken with bfq_get_entity). 
+ */ + if (!in_serv_entity->on_st) + bfq_put_queue(in_serv_bfqq); +} + +static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool ins_into_idle_tree, bool expiration) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_deactivate_entity(entity, ins_into_idle_tree, expiration); +} + +static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), + false); + bfq_clear_bfqq_non_blocking_wait_rq(bfqq); +} + +static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_activate_requeue_entity(entity, false, + bfqq == bfqd->in_service_queue); +} + +static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); + +/* + * Called when the bfqq no longer has requests pending, remove it from + * the service tree. As a special case, it can be invoked during an + * expiration. + */ +static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool expiration) +{ + bfq_log_bfqq(bfqd, bfqq, "del from busy"); + + bfq_clear_bfqq_busy(bfqq); + + bfqd->busy_queues--; + + bfqg_stats_update_dequeue(bfqq_group(bfqq)); + + bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); +} + +/* + * Called when an inactive queue receives a new request. + */ +static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqd, bfqq, "add to busy"); + + bfq_activate_bfqq(bfqd, bfqq); + + bfq_mark_bfqq_busy(bfqq); + bfqd->busy_queues++; +} + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + +/* bfqg stats flags */ +enum bfqg_stats_flags { + BFQG_stats_waiting = 0, + BFQG_stats_idling, + BFQG_stats_empty, +}; + +#define BFQG_FLAG_FNS(name) \ +static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ +{ \ + stats->flags |= (1 << BFQG_stats_##name); \ +} \ +static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ +{ \ + stats->flags &= ~(1 << BFQG_stats_##name); \ +} \ +static int bfqg_stats_##name(struct bfqg_stats *stats) \ +{ \ + return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ +} \ + +BFQG_FLAG_FNS(waiting) +BFQG_FLAG_FNS(idling) +BFQG_FLAG_FNS(empty) +#undef BFQG_FLAG_FNS + +/* This should be called with the queue_lock held. */ +static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) +{ + unsigned long long now; + + if (!bfqg_stats_waiting(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_group_wait_time)) + blkg_stat_add(&stats->group_wait_time, + now - stats->start_group_wait_time); + bfqg_stats_clear_waiting(stats); +} + +/* This should be called with the queue_lock held. */ +static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, + struct bfq_group *curr_bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + if (bfqg_stats_waiting(stats)) + return; + if (bfqg == curr_bfqg) + return; + stats->start_group_wait_time = sched_clock(); + bfqg_stats_mark_waiting(stats); +} + +/* This should be called with the queue_lock held. 
*/ +static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) +{ + unsigned long long now; + + if (!bfqg_stats_empty(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_empty_time)) + blkg_stat_add(&stats->empty_time, + now - stats->start_empty_time); + bfqg_stats_clear_empty(stats); +} + +static void bfqg_stats_update_dequeue(struct bfq_group *bfqg) +{ + blkg_stat_add(&bfqg->stats.dequeue, 1); +} + +static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + if (blkg_rwstat_total(&stats->queued)) + return; + + /* + * group is already marked empty. This can happen if bfqq got new + * request in parent group and moved to this group while being added + * to service tree. Just ignore the event and move on. + */ + if (bfqg_stats_empty(stats)) + return; + + stats->start_empty_time = sched_clock(); + bfqg_stats_mark_empty(stats); +} + +static void bfqg_stats_update_idle_time(struct bfq_group *bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + if (bfqg_stats_idling(stats)) { + unsigned long long now = sched_clock(); + + if (time_after64(now, stats->start_idle_time)) + blkg_stat_add(&stats->idle_time, + now - stats->start_idle_time); + bfqg_stats_clear_idling(stats); + } +} + +static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + stats->start_idle_time = sched_clock(); + bfqg_stats_mark_idling(stats); +} + +static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + blkg_stat_add(&stats->avg_queue_size_sum, + blkg_rwstat_total(&stats->queued)); + blkg_stat_add(&stats->avg_queue_size_samples, 1); + bfqg_stats_update_group_wait_time(stats); +} + +/* + * blk-cgroup policy-related handlers + * The following functions help in converting between blk-cgroup + * internal structures and BFQ-specific structures. + */ + +static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct bfq_group, pd) : NULL; +} + +static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) +{ + return pd_to_blkg(&bfqg->pd); +} + +static struct blkcg_policy blkcg_policy_bfq; + +static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) +{ + return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq)); +} + +/* + * bfq_group handlers + * The following functions help in navigating the bfq_group hierarchy + * by allowing to find the parent of a bfq_group or the bfq_group + * associated to a bfq_queue. + */ + +static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) +{ + struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; + + return pblkg ? blkg_to_bfqg(pblkg) : NULL; +} + +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) +{ + struct bfq_entity *group_entity = bfqq->entity.parent; + + return group_entity ? container_of(group_entity, struct bfq_group, + entity) : + bfqq->bfqd->root_group; +} + +/* + * The following two functions handle get and put of a bfq_group by + * wrapping the related blk-cgroup hooks. 
+ */ + +static void bfqg_get(struct bfq_group *bfqg) +{ + return blkg_get(bfqg_to_blkg(bfqg)); +} + +static void bfqg_put(struct bfq_group *bfqg) +{ + return blkg_put(bfqg_to_blkg(bfqg)); +} + +static void bfqg_stats_update_io_add(struct bfq_group *bfqg, + struct bfq_queue *bfqq, + unsigned int op) +{ + blkg_rwstat_add(&bfqg->stats.queued, op, 1); + bfqg_stats_end_empty_time(&bfqg->stats); + if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) + bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); +} + +static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) +{ + blkg_rwstat_add(&bfqg->stats.queued, op, -1); +} + +static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) +{ + blkg_rwstat_add(&bfqg->stats.merged, op, 1); +} + +static void bfqg_stats_update_completion(struct bfq_group *bfqg, + uint64_t start_time, uint64_t io_start_time, + unsigned int op) +{ + struct bfqg_stats *stats = &bfqg->stats; + unsigned long long now = sched_clock(); + + if (time_after64(now, io_start_time)) + blkg_rwstat_add(&stats->service_time, op, + now - io_start_time); + if (time_after64(io_start_time, start_time)) + blkg_rwstat_add(&stats->wait_time, op, + io_start_time - start_time); +} + +/* @stats = 0 */ +static void bfqg_stats_reset(struct bfqg_stats *stats) +{ + /* queued stats shouldn't be cleared */ + blkg_rwstat_reset(&stats->merged); + blkg_rwstat_reset(&stats->service_time); + blkg_rwstat_reset(&stats->wait_time); + blkg_stat_reset(&stats->time); + blkg_stat_reset(&stats->avg_queue_size_sum); + blkg_stat_reset(&stats->avg_queue_size_samples); + blkg_stat_reset(&stats->dequeue); + blkg_stat_reset(&stats->group_wait_time); + blkg_stat_reset(&stats->idle_time); + blkg_stat_reset(&stats->empty_time); +} + +/* @to += @from */ +static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) +{ + if (!to || !from) + return; + + /* queued stats shouldn't be cleared */ + blkg_rwstat_add_aux(&to->merged, &from->merged); + blkg_rwstat_add_aux(&to->service_time, &from->service_time); + blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); + blkg_stat_add_aux(&from->time, &from->time); + blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); + blkg_stat_add_aux(&to->avg_queue_size_samples, + &from->avg_queue_size_samples); + blkg_stat_add_aux(&to->dequeue, &from->dequeue); + blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); + blkg_stat_add_aux(&to->idle_time, &from->idle_time); + blkg_stat_add_aux(&to->empty_time, &from->empty_time); +} + +/* + * Transfer @bfqg's stats to its parent's aux counts so that the ancestors' + * recursive stats can still account for the amount used by this bfqg after + * it's gone. 
+ */ +static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) +{ + struct bfq_group *parent; + + if (!bfqg) /* root_group */ + return; + + parent = bfqg_parent(bfqg); + + lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); + + if (unlikely(!parent)) + return; + + bfqg_stats_add_aux(&parent->stats, &bfqg->stats); + bfqg_stats_reset(&bfqg->stats); +} + +static void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + entity->weight = entity->new_weight; + entity->orig_weight = entity->new_weight; + if (bfqq) { + bfqq->ioprio = bfqq->new_ioprio; + bfqq->ioprio_class = bfqq->new_ioprio_class; + bfqg_get(bfqg); + } + entity->parent = bfqg->my_entity; /* NULL for root group */ + entity->sched_data = &bfqg->sched_data; +} + +static void bfqg_stats_exit(struct bfqg_stats *stats) +{ + blkg_rwstat_exit(&stats->merged); + blkg_rwstat_exit(&stats->service_time); + blkg_rwstat_exit(&stats->wait_time); + blkg_rwstat_exit(&stats->queued); + blkg_stat_exit(&stats->time); + blkg_stat_exit(&stats->avg_queue_size_sum); + blkg_stat_exit(&stats->avg_queue_size_samples); + blkg_stat_exit(&stats->dequeue); + blkg_stat_exit(&stats->group_wait_time); + blkg_stat_exit(&stats->idle_time); + blkg_stat_exit(&stats->empty_time); +} + +static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) +{ + if (blkg_rwstat_init(&stats->merged, gfp) || + blkg_rwstat_init(&stats->service_time, gfp) || + blkg_rwstat_init(&stats->wait_time, gfp) || + blkg_rwstat_init(&stats->queued, gfp) || + blkg_stat_init(&stats->time, gfp) || + blkg_stat_init(&stats->avg_queue_size_sum, gfp) || + blkg_stat_init(&stats->avg_queue_size_samples, gfp) || + blkg_stat_init(&stats->dequeue, gfp) || + blkg_stat_init(&stats->group_wait_time, gfp) || + blkg_stat_init(&stats->idle_time, gfp) || + blkg_stat_init(&stats->empty_time, gfp)) { + bfqg_stats_exit(stats); + return -ENOMEM; + } + + return 0; +} + +static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) +{ + return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; +} + +static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) +{ + return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); +} + +static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) +{ + struct bfq_group_data *bgd; + + bgd = kzalloc(sizeof(*bgd), gfp); + if (!bgd) + return NULL; + return &bgd->pd; +} + +static void bfq_cpd_init(struct blkcg_policy_data *cpd) +{ + struct bfq_group_data *d = cpd_to_bfqgd(cpd); + + d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? 
+ CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; +} + +static void bfq_cpd_free(struct blkcg_policy_data *cpd) +{ + kfree(cpd_to_bfqgd(cpd)); +} + +static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) +{ + struct bfq_group *bfqg; + + bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); + if (!bfqg) + return NULL; + + if (bfqg_stats_init(&bfqg->stats, gfp)) { + kfree(bfqg); + return NULL; + } + + return &bfqg->pd; +} + +static void bfq_pd_init(struct blkg_policy_data *pd) +{ + struct blkcg_gq *blkg = pd_to_blkg(pd); + struct bfq_group *bfqg = blkg_to_bfqg(blkg); + struct bfq_data *bfqd = blkg->q->elevator->elevator_data; + struct bfq_entity *entity = &bfqg->entity; + struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); + + entity->orig_weight = entity->weight = entity->new_weight = d->weight; + entity->my_sched_data = &bfqg->sched_data; + bfqg->my_entity = entity; /* + * the root_group's will be set to NULL + * in bfq_init_queue() + */ + bfqg->bfqd = bfqd; +} + +static void bfq_pd_free(struct blkg_policy_data *pd) +{ + struct bfq_group *bfqg = pd_to_bfqg(pd); + + bfqg_stats_exit(&bfqg->stats); + return kfree(bfqg); +} + +static void bfq_pd_reset_stats(struct blkg_policy_data *pd) +{ + struct bfq_group *bfqg = pd_to_bfqg(pd); + + bfqg_stats_reset(&bfqg->stats); +} + +static void bfq_group_set_parent(struct bfq_group *bfqg, + struct bfq_group *parent) +{ + struct bfq_entity *entity; + + entity = &bfqg->entity; + entity->parent = parent->my_entity; + entity->sched_data = &parent->sched_data; +} + +static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, + struct blkcg *blkcg) +{ + struct blkcg_gq *blkg; + + blkg = blkg_lookup(blkcg, bfqd->queue); + if (likely(blkg)) + return blkg_to_bfqg(blkg); + return NULL; +} + +static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, + struct blkcg *blkcg) +{ + struct bfq_group *bfqg, *parent; + struct bfq_entity *entity; + + bfqg = bfq_lookup_bfqg(bfqd, blkcg); + + if (unlikely(!bfqg)) + return NULL; + + /* + * Update chain of bfq_groups as we might be handling a leaf group + * which, along with some of its relatives, has not been hooked yet + * to the private hierarchy of BFQ. + */ + entity = &bfqg->entity; + for_each_entity(entity) { + bfqg = container_of(entity, struct bfq_group, entity); + if (bfqg != bfqd->root_group) { + parent = bfqg_parent(bfqg); + if (!parent) + parent = bfqd->root_group; + bfq_group_set_parent(bfqg, parent); + } + } + + return bfqg; +} + +static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool compensate, + enum bfqq_expiration reason); + +/** + * bfq_bfqq_move - migrate @bfqq to @bfqg. + * @bfqd: queue descriptor. + * @bfqq: the queue to move. + * @bfqg: the group to move to. + * + * Move @bfqq to @bfqg, deactivating it from its old group and reactivating + * it on the new one. Avoid putting the entity on the old group idle tree. + * + * Must be called under the queue lock; the cgroup owning @bfqg must + * not disappear (by now this just means that we are called under + * rcu_read_lock()). + */ +static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_group *bfqg) +{ + struct bfq_entity *entity = &bfqq->entity; + + /* If bfqq is empty, then bfq_bfqq_expire also invokes + * bfq_del_bfqq_busy, thereby removing bfqq and its entity + * from data structures related to current group. Otherwise we + * need to remove bfqq explicitly with bfq_deactivate_bfqq, as + * we do below. 
+ */ + if (bfqq == bfqd->in_service_queue) + bfq_bfqq_expire(bfqd, bfqd->in_service_queue, + false, BFQQE_PREEMPTED); + + if (bfq_bfqq_busy(bfqq)) + bfq_deactivate_bfqq(bfqd, bfqq, false, false); + else if (entity->on_st) + bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); + bfqg_put(bfqq_group(bfqq)); + + /* + * Here we use a reference to bfqg. We don't need a refcounter + * as the cgroup reference will not be dropped, so that its + * destroy() callback will not be invoked. + */ + entity->parent = bfqg->my_entity; + entity->sched_data = &bfqg->sched_data; + bfqg_get(bfqg); + + if (bfq_bfqq_busy(bfqq)) + bfq_activate_bfqq(bfqd, bfqq); + + if (!bfqd->in_service_queue && !bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); +} + +/** + * __bfq_bic_change_cgroup - move @bic to @cgroup. + * @bfqd: the queue descriptor. + * @bic: the bic to move. + * @blkcg: the blk-cgroup to move to. + * + * Move bic to blkcg, assuming that bfqd->queue is locked; the caller + * has to make sure that the reference to cgroup is valid across the call. + * + * NOTE: an alternative approach might have been to store the current + * cgroup in bfqq and getting a reference to it, reducing the lookup + * time here, at the price of slightly more complex code. + */ +static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct blkcg *blkcg) +{ + struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); + struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); + struct bfq_group *bfqg; + struct bfq_entity *entity; + + bfqg = bfq_find_set_group(bfqd, blkcg); + + if (unlikely(!bfqg)) + bfqg = bfqd->root_group; + + if (async_bfqq) { + entity = &async_bfqq->entity; + + if (entity->sched_data != &bfqg->sched_data) { + bic_set_bfqq(bic, NULL, 0); + bfq_log_bfqq(bfqd, async_bfqq, + "bic_change_group: %p %d", + async_bfqq, + async_bfqq->ref); + bfq_put_queue(async_bfqq); + } + } + + if (sync_bfqq) { + entity = &sync_bfqq->entity; + if (entity->sched_data != &bfqg->sched_data) + bfq_bfqq_move(bfqd, sync_bfqq, bfqg); + } + + return bfqg; +} + +static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) +{ + struct bfq_data *bfqd = bic_to_bfqd(bic); + struct bfq_group *bfqg = NULL; + uint64_t serial_nr; + + rcu_read_lock(); + serial_nr = bio_blkcg(bio)->css.serial_nr; + + /* + * Check whether blkcg has changed. The condition may trigger + * spuriously on a newly created cic but there's no harm. + */ + if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) + goto out; + + bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); + bic->blkcg_serial_nr = serial_nr; +out: + rcu_read_unlock(); +} + +/** + * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. + * @st: the service tree being flushed. + */ +static void bfq_flush_idle_tree(struct bfq_service_tree *st) +{ + struct bfq_entity *entity = st->first_idle; + + for (; entity ; entity = st->first_idle) + __bfq_deactivate_entity(entity, false); +} + +/** + * bfq_reparent_leaf_entity - move leaf entity to the root_group. + * @bfqd: the device data structure with the root group. + * @entity: the entity to move. + */ +static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); } /** - * __bfq_lookup_next_entity - return the first eligible entity in @st. - * @st: the service tree. + * bfq_reparent_active_entities - move to the root group all active + * entities. 
+ * @bfqd: the device data structure with the root group. + * @bfqg: the group to move from. + * @st: the service tree with the entities. * - * Update the virtual time in @st and return the first eligible entity - * it contains. + * Needs queue_lock to be taken and reference to be valid over the call. */ -static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, - bool force) +static void bfq_reparent_active_entities(struct bfq_data *bfqd, + struct bfq_group *bfqg, + struct bfq_service_tree *st) { - struct bfq_entity *entity, *new_next_in_service = NULL; - - if (RB_EMPTY_ROOT(&st->active)) - return NULL; + struct rb_root *active = &st->active; + struct bfq_entity *entity = NULL; - bfq_update_vtime(st); - entity = bfq_first_active_entity(st); + if (!RB_EMPTY_ROOT(&st->active)) + entity = bfq_entity_of(rb_first(active)); - /* - * If the chosen entity does not match with the sched_data's - * next_in_service and we are forcedly serving the IDLE priority - * class tree, bubble up budget update. - */ - if (unlikely(force && entity != entity->sched_data->next_in_service)) { - new_next_in_service = entity; - for_each_entity(new_next_in_service) - bfq_update_budget(new_next_in_service); - } + for (; entity ; entity = bfq_entity_of(rb_first(active))) + bfq_reparent_leaf_entity(bfqd, entity); - return entity; + if (bfqg->sched_data.in_service_entity) + bfq_reparent_leaf_entity(bfqd, + bfqg->sched_data.in_service_entity); } /** - * bfq_lookup_next_entity - return the first eligible entity in @sd. - * @sd: the sched_data. - * @extract: if true the returned entity will be also extracted from @sd. + * bfq_pd_offline - deactivate the entity associated with @pd, + * and reparent its children entities. + * @pd: descriptor of the policy going offline. * - * NOTE: since we cache the next_in_service entity at each level of the - * hierarchy, the complexity of the lookup can be decreased with - * absolutely no effort just returning the cached next_in_service value; - * we prefer to do full lookups to test the consistency of the data - * structures. + * blkio already grabs the queue_lock for us, so no need to use + * RCU-based magic */ -static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, - int extract, - struct bfq_data *bfqd) +static void bfq_pd_offline(struct blkg_policy_data *pd) { - struct bfq_service_tree *st = sd->service_tree; - struct bfq_entity *entity; - int i = 0; + struct bfq_service_tree *st; + struct bfq_group *bfqg = pd_to_bfqg(pd); + struct bfq_data *bfqd = bfqg->bfqd; + struct bfq_entity *entity = bfqg->my_entity; + unsigned long flags; + int i; + if (!entity) /* root group */ + return; + + spin_lock_irqsave(&bfqd->lock, flags); /* - * Choose from idle class, if needed to guarantee a minimum - * bandwidth to this class. This should also mitigate - * priority-inversion problems in case a low priority task is - * holding file system resources. + * Empty all service_trees belonging to this group before + * deactivating the group itself. 
*/ - if (bfqd && - jiffies - bfqd->bfq_class_idle_last_service > - BFQ_CL_IDLE_TIMEOUT) { - entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, - true); - if (entity) { - i = BFQ_IOPRIO_CLASSES - 1; - bfqd->bfq_class_idle_last_service = jiffies; - sd->next_in_service = entity; - } - } - for (; i < BFQ_IOPRIO_CLASSES; i++) { - entity = __bfq_lookup_next_entity(st + i, false); - if (entity) { - if (extract) { - bfq_check_next_in_service(sd, entity); - bfq_active_extract(st + i, entity); - sd->in_service_entity = entity; - sd->next_in_service = NULL; - } - break; - } + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { + st = bfqg->sched_data.service_tree + i; + + /* + * The idle tree may still contain bfq_queues belonging + * to exited task because they never migrated to a different + * cgroup from the one being destroyed now. No one else + * can access them so it's safe to act without any lock. + */ + bfq_flush_idle_tree(st); + + /* + * It may happen that some queues are still active + * (busy) upon group destruction (if the corresponding + * processes have been forced to terminate). We move + * all the leaf entities corresponding to these queues + * to the root_group. + * Also, it may happen that the group has an entity + * in service, which is disconnected from the active + * tree: it must be moved, too. + * There is no need to put the sync queues, as the + * scheduler has taken no reference. + */ + bfq_reparent_active_entities(bfqd, bfqg, st); } - return entity; + __bfq_deactivate_entity(entity, false); + bfq_put_async_queues(bfqd, bfqg); + + spin_unlock_irqrestore(&bfqd->lock, flags); + /* + * @blkg is going offline and will be ignored by + * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so + * that they don't get lost. If IOs complete after this point, the + * stats for them will be lost. Oh well... + */ + bfqg_stats_xfer_dead(bfqg); } -static bool next_queue_may_preempt(struct bfq_data *bfqd) +static int bfq_io_show_weight(struct seq_file *sf, void *v) { - struct bfq_sched_data *sd = &bfqd->sched_data; + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); + unsigned int val = 0; - return sd->next_in_service != sd->in_service_entity; -} + if (bfqgd) + val = bfqgd->weight; + seq_printf(sf, "%u\n", val); -/* - * Get next queue for service. - */ -static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) + return 0; +} + +static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, + struct cftype *cftype, + u64 val) { - struct bfq_entity *entity = NULL; - struct bfq_sched_data *sd; - struct bfq_queue *bfqq; + struct blkcg *blkcg = css_to_blkcg(css); + struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); + struct blkcg_gq *blkg; + int ret = -ERANGE; - if (bfqd->busy_queues == 0) - return NULL; + if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) + return ret; - sd = &bfqd->sched_data; - for (; sd ; sd = entity->my_sched_data) { - entity = bfq_lookup_next_entity(sd, 1, bfqd); - entity->service = 0; + ret = 0; + spin_lock_irq(&blkcg->lock); + bfqgd->weight = (unsigned short)val; + hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + struct bfq_group *bfqg = blkg_to_bfqg(blkg); + + if (!bfqg) + continue; + /* + * Setting the prio_changed flag of the entity + * to 1 with new_weight == weight would re-set + * the value of the weight to its ioprio mapping. + * Set the flag only if necessary. 
+ */ + if ((unsigned short)val != bfqg->entity.new_weight) { + bfqg->entity.new_weight = (unsigned short)val; + /* + * Make sure that the above new value has been + * stored in bfqg->entity.new_weight before + * setting the prio_changed flag. In fact, + * this flag may be read asynchronously (in + * critical sections protected by a different + * lock than that held here), and finding this + * flag set may cause the execution of the code + * for updating parameters whose value may + * depend also on bfqg->entity.new_weight (in + * __bfq_entity_update_weight_prio). + * This barrier makes sure that the new value + * of bfqg->entity.new_weight is correctly + * seen in that code. + */ + smp_wmb(); + bfqg->entity.prio_changed = 1; + } } + spin_unlock_irq(&blkcg->lock); - bfqq = bfq_entity_to_bfqq(entity); + return ret; +} - return bfqq; +static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + u64 weight; + /* First unsigned long found in the file is used */ + int ret = kstrtoull(strim(buf), 0, &weight); + + if (ret) + return ret; + + return bfq_io_set_weight_legacy(of_css(of), NULL, weight); } -static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) +static int bfqg_print_stat(struct seq_file *sf, void *v) { - struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; - struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, + &blkcg_policy_bfq, seq_cft(sf)->private, false); + return 0; +} - if (bfqd->in_service_bic) { - put_io_context(bfqd->in_service_bic->icq.ioc); - bfqd->in_service_bic = NULL; - } +static int bfqg_print_rwstat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, + &blkcg_policy_bfq, seq_cft(sf)->private, true); + return 0; +} - bfq_clear_bfqq_wait_request(in_serv_bfqq); - hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqd->in_service_queue = NULL; +static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_bfq, off); + return __blkg_prfill_u64(sf, pd, sum); +} - /* - * in_serv_entity is no longer in service, so, if it is in no - * service tree either, then release the service reference to - * the queue it represents (taken with bfq_get_entity). 
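
The smp_wmb() pairing discussed a few lines above (store entity->new_weight first, then raise prio_changed) is the classic publish-then-flag ordering idiom. A minimal userspace model of it, using C11 fences as a stand-in for the kernel barrier; the struct and function names here are illustrative only, not BFQ's:

#include <stdatomic.h>
#include <stdio.h>

struct ent {
        unsigned int new_weight;
        atomic_int prio_changed;
};

/* Writer side: publish the new weight, then raise the flag. The release
 * fence ensures that anyone who observes prio_changed == 1 also observes
 * the updated new_weight (the role smp_wmb() plays above). */
static void publish_weight(struct ent *e, unsigned int w)
{
        e->new_weight = w;
        atomic_thread_fence(memory_order_release);
        atomic_store_explicit(&e->prio_changed, 1, memory_order_relaxed);
}

/* Reader side: test the flag first, pair with an acquire fence, and only
 * then consume new_weight. */
static void maybe_apply(struct ent *e)
{
        if (!atomic_load_explicit(&e->prio_changed, memory_order_relaxed))
                return;
        atomic_thread_fence(memory_order_acquire);
        printf("applying weight %u\n", e->new_weight);
        atomic_store_explicit(&e->prio_changed, 0, memory_order_relaxed);
}

int main(void)
{
        struct ent e = { .new_weight = 100 };

        publish_weight(&e, 300);
        maybe_apply(&e);
        return 0;
}
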
- */ - if (!in_serv_entity->on_st) - bfq_put_queue(in_serv_bfqq); +static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_bfq, + off); + return __blkg_prfill_rwstat(sf, pd, &sum); } -static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - int requeue) +static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) { - struct bfq_entity *entity = &bfqq->entity; + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_stat_recursive, &blkcg_policy_bfq, + seq_cft(sf)->private, false); + return 0; +} - bfq_deactivate_entity(entity, requeue); +static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, + seq_cft(sf)->private, true); + return 0; } -static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, + int off) { - struct bfq_entity *entity = &bfqq->entity; + u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); - bfq_activate_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq)); - bfq_clear_bfqq_non_blocking_wait_rq(bfqq); + return __blkg_prfill_u64(sf, pd, sum >> 9); } -/* - * Called when the bfqq no longer has requests pending, remove it from - * the service tree. - */ -static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, - int requeue) +static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) { - bfq_log_bfqq(bfqd, bfqq, "del from busy"); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); + return 0; +} - bfq_clear_bfqq_busy(bfqq); +static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); - bfqd->busy_queues--; + return __blkg_prfill_u64(sf, pd, sum >> 9); +} - bfq_deactivate_bfqq(bfqd, bfqq, requeue); +static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, + false); + return 0; } -/* - * Called when an inactive queue receives a new request. 
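
The >> 9 in the *_sectors helpers above is the usual bytes-to-512-byte-sectors conversion; a trivial standalone check of the arithmetic:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t bytes = 1048576;       /* 1 MiB of serviced I/O */

        /* 1048576 / 512 = 2048 sectors */
        printf("%llu sectors\n", (unsigned long long)(bytes >> 9));
        return 0;
}
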
- */ -static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) +static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, + struct blkg_policy_data *pd, int off) { - bfq_log_bfqq(bfqd, bfqq, "add to busy"); + struct bfq_group *bfqg = pd_to_bfqg(pd); + u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); + u64 v = 0; - bfq_activate_bfqq(bfqd, bfqq); + if (samples) { + v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); + v = div64_u64(v, samples); + } + __blkg_prfill_u64(sf, pd, v); + return 0; +} - bfq_mark_bfqq_busy(bfqq); - bfqd->busy_queues++; +/* print avg_queue_size */ +static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, + 0, false); + return 0; +} + +static struct bfq_group * +bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) +{ + int ret; + + ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); + if (ret) + return NULL; + + return blkg_to_bfqg(bfqd->queue->root_blkg); } -static void bfq_init_entity(struct bfq_entity *entity) +static struct cftype bfq_blkcg_legacy_files[] = { + { + .name = "bfq.weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = bfq_io_show_weight, + .write_u64 = bfq_io_set_weight_legacy, + }, + + /* statistics, covers only the tasks in the bfqg */ + { + .name = "bfq.time", + .private = offsetof(struct bfq_group, stats.time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.sectors", + .seq_show = bfqg_print_stat_sectors, + }, + { + .name = "bfq.io_service_bytes", + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_bytes, + }, + { + .name = "bfq.io_serviced", + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_ios, + }, + { + .name = "bfq.io_service_time", + .private = offsetof(struct bfq_group, stats.service_time), + .seq_show = bfqg_print_rwstat, + }, + { + .name = "bfq.io_wait_time", + .private = offsetof(struct bfq_group, stats.wait_time), + .seq_show = bfqg_print_rwstat, + }, + { + .name = "bfq.io_merged", + .private = offsetof(struct bfq_group, stats.merged), + .seq_show = bfqg_print_rwstat, + }, + { + .name = "bfq.io_queued", + .private = offsetof(struct bfq_group, stats.queued), + .seq_show = bfqg_print_rwstat, + }, + + /* the same statictics which cover the bfqg and its descendants */ + { + .name = "bfq.time_recursive", + .private = offsetof(struct bfq_group, stats.time), + .seq_show = bfqg_print_stat_recursive, + }, + { + .name = "bfq.sectors_recursive", + .seq_show = bfqg_print_stat_sectors_recursive, + }, + { + .name = "bfq.io_service_bytes_recursive", + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_bytes_recursive, + }, + { + .name = "bfq.io_serviced_recursive", + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_ios_recursive, + }, + { + .name = "bfq.io_service_time_recursive", + .private = offsetof(struct bfq_group, stats.service_time), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.io_wait_time_recursive", + .private = offsetof(struct bfq_group, stats.wait_time), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.io_merged_recursive", + .private = offsetof(struct bfq_group, stats.merged), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.io_queued_recursive", + .private = offsetof(struct bfq_group, stats.queued), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.avg_queue_size", + .seq_show = 
bfqg_print_avg_queue_size, + }, + { + .name = "bfq.group_wait_time", + .private = offsetof(struct bfq_group, stats.group_wait_time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.idle_time", + .private = offsetof(struct bfq_group, stats.idle_time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.empty_time", + .private = offsetof(struct bfq_group, stats.empty_time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.dequeue", + .private = offsetof(struct bfq_group, stats.dequeue), + .seq_show = bfqg_print_stat, + }, + { } /* terminate */ +}; + +static struct cftype bfq_blkg_files[] = { + { + .name = "bfq.weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = bfq_io_show_weight, + .write = bfq_io_set_weight, + }, + {} /* terminate */ +}; + +#else /* CONFIG_BFQ_GROUP_IOSCHED */ + +static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, + struct bfq_queue *bfqq, unsigned int op) { } +static inline void +bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } +static inline void +bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } +static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, + uint64_t start_time, uint64_t io_start_time, + unsigned int op) { } +static inline void +bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, + struct bfq_group *curr_bfqg) { } +static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } +static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } +static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } + +static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_group *bfqg) {} + +static void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); entity->weight = entity->new_weight; entity->orig_weight = entity->new_weight; + if (bfqq) { + bfqq->ioprio = bfqq->new_ioprio; + bfqq->ioprio_class = bfqq->new_ioprio_class; + } + entity->sched_data = &bfqg->sched_data; +} + +static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} + +static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, + struct blkcg *blkcg) +{ + return bfqd->root_group; +} + +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) +{ + return bfqq->bfqd->root_group; +} - bfqq->ioprio = bfqq->new_ioprio; - bfqq->ioprio_class = bfqq->new_ioprio_class; +static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, + int node) +{ + struct bfq_group *bfqg; + int i; + + bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); + if (!bfqg) + return NULL; - entity->sched_data = &bfqq->bfqd->sched_data; + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + + return bfqg; } +#endif /* CONFIG_BFQ_GROUP_IOSCHED */ #define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) #define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) #define bfq_sample_valid(samples) ((samples) > 80) -/* - * Scheduler run of queue, if there are requests pending and no one in the - * driver that will restart queueing. 
- */ -static void bfq_schedule_dispatch(struct bfq_data *bfqd) -{ - if (bfqd->queued != 0) { - bfq_log(bfqd, "schedule dispatch"); - blk_mq_run_hw_queues(bfqd->queue, true); - } -} - /* * Lifted from AS - choose which of rq1 and rq2 that is best served now. * We choose the request that is closesr to the head right now. Distance @@ -1905,7 +3593,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, entity->budget = new_budget; bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget); - bfq_activate_bfqq(bfqd, bfqq); + bfq_requeue_bfqq(bfqd, bfqq); } } @@ -2076,6 +3764,8 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfqq->ttime.last_end_request + bfqd->bfq_slice_idle * 3; + bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); + /* * Update budget and check whether bfqq may want to preempt * the in-service queue. @@ -2195,7 +3885,7 @@ static void bfq_remove_request(struct request_queue *q, bfqq->next_rq = NULL; if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { - bfq_del_bfqq_busy(bfqd, bfqq, 1); + bfq_del_bfqq_busy(bfqd, bfqq, false); /* * bfqq emptied. In normal operation, when * bfqq is empty, bfqq->entity.service and @@ -2215,6 +3905,8 @@ static void bfq_remove_request(struct request_queue *q, if (rq->cmd_flags & REQ_META) bfqq->meta_pending--; + + bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); } static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) @@ -2300,7 +3992,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); if (!RB_EMPTY_NODE(&rq->rb_node)) - return; + goto end; spin_lock_irq(&bfqq->bfqd->lock); /* @@ -2326,6 +4018,8 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, bfq_remove_request(q, next); spin_unlock_irq(&bfqq->bfqd->lock); +end: + bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); } static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, @@ -2355,6 +4049,7 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, struct bfq_queue *bfqq) { if (bfqq) { + bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); bfq_mark_bfqq_budget_new(bfqq); bfq_clear_bfqq_fifo_expire(bfqq); @@ -2441,6 +4136,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) bfqd->last_idling_start = ktime_get(); hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), HRTIMER_MODE_REL); + bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); } /* @@ -2490,12 +4186,17 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - __bfq_bfqd_reset_in_service(bfqd); - if (RB_EMPTY_ROOT(&bfqq->sort_list)) - bfq_del_bfqq_busy(bfqd, bfqq, 1); + bfq_del_bfqq_busy(bfqd, bfqq, true); else - bfq_activate_bfqq(bfqd, bfqq); + bfq_requeue_bfqq(bfqd, bfqq); + + /* + * All in-service entities must have been properly deactivated + * or requeued before executing the next function, which + * resets all in-service entites as no more in service. 
+ */ + __bfq_bfqd_reset_in_service(bfqd); } /** @@ -2972,6 +4673,7 @@ check_queue: */ bfq_clear_bfqq_wait_request(bfqq); hrtimer_try_to_cancel(&bfqd->idle_slice_timer); + bfqg_stats_update_idle_time(bfqq_group(bfqq)); } goto keep_queue; } @@ -3159,6 +4861,10 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) */ static void bfq_put_queue(struct bfq_queue *bfqq) { +#ifdef CONFIG_BFQ_GROUP_IOSCHED + struct bfq_group *bfqg = bfqq_group(bfqq); +#endif + if (bfqq->bfqd) bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); @@ -3167,7 +4873,12 @@ static void bfq_put_queue(struct bfq_queue *bfqq) if (bfqq->ref) return; + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); + kmem_cache_free(bfq_pool, bfqq); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_put(bfqg); +#endif } static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) @@ -3323,18 +5034,19 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, } static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, + struct bfq_group *bfqg, int ioprio_class, int ioprio) { switch (ioprio_class) { case IOPRIO_CLASS_RT: - return &async_bfqq[0][ioprio]; + return &bfqg->async_bfqq[0][ioprio]; case IOPRIO_CLASS_NONE: ioprio = IOPRIO_NORM; /* fall through */ case IOPRIO_CLASS_BE: - return &async_bfqq[1][ioprio]; + return &bfqg->async_bfqq[1][ioprio]; case IOPRIO_CLASS_IDLE: - return &async_idle_bfqq; + return &bfqg->async_idle_bfqq; default: return NULL; } @@ -3348,11 +5060,18 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); struct bfq_queue **async_bfqq = NULL; struct bfq_queue *bfqq; + struct bfq_group *bfqg; rcu_read_lock(); + bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); + if (!bfqg) { + bfqq = &bfqd->oom_bfqq; + goto out; + } + if (!is_sync) { - async_bfqq = bfq_async_queue_prio(bfqd, ioprio_class, + async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ioprio); bfqq = *async_bfqq; if (bfqq) @@ -3366,7 +5085,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, if (bfqq) { bfq_init_bfqq(bfqd, bfqq, bic, current->pid, is_sync); - bfq_init_entity(&bfqq->entity); + bfq_init_entity(&bfqq->entity, bfqg); bfq_log_bfqq(bfqd, bfqq, "allocated"); } else { bfqq = &bfqd->oom_bfqq; @@ -3379,9 +5098,14 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, * prune it. */ if (async_bfqq) { - bfqq->ref++; - bfq_log_bfqq(bfqd, bfqq, - "get_queue, bfqq not in async: %p, %d", + bfqq->ref++; /* + * Extra group reference, w.r.t. sync + * queue. This extra reference is removed + * only if bfqq->bfqg disappears, to + * guarantee that this queue is not freed + * until its group goes away. 
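
A minimal reference-counting model of the ownership just described (a bfqq pins its bfqg, and the group's async-queue cache holds an extra reference on the bfqq, so the queue is only freed once the group itself lets go); this is a standalone illustration with made-up names, not the kernel code:

#include <stdio.h>
#include <stdlib.h>

struct bgroup { int ref; };
struct bqueue { int ref; struct bgroup *grp; };

static void group_put(struct bgroup *g)
{
        if (--g->ref == 0) {
                printf("group freed\n");
                free(g);
        }
}

static void queue_put(struct bqueue *q)
{
        struct bgroup *g = q->grp;

        if (--q->ref)
                return;
        printf("queue freed\n");
        free(q);
        group_put(g);           /* drop the reference the queue held on its group */
}

int main(void)
{
        struct bgroup *g = malloc(sizeof(*g));
        struct bqueue *q = malloc(sizeof(*q));

        g->ref = 1;             /* cgroup side keeps the group alive */
        q->grp = g;
        g->ref++;               /* the queue pins its group */
        q->ref = 1;             /* process reference */
        q->ref++;               /* extra reference held via the group's async cache */

        queue_put(q);           /* process goes away: queue survives */
        queue_put(q);           /* group cache drops it: queue and its group ref go */
        group_put(g);           /* cgroup side finally releases the group */
        return 0;
}
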
+ */ + bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", bfqq, bfqq->ref); *async_bfqq = bfqq; } @@ -3516,6 +5240,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ bfq_clear_bfqq_wait_request(bfqq); hrtimer_try_to_cancel(&bfqd->idle_slice_timer); + bfqg_stats_update_idle_time(bfqq_group(bfqq)); /* * The queue is not empty, because a new request just @@ -3657,6 +5382,11 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd = bfqq->bfqd; + if (rq->rq_flags & RQF_STARTED) + bfqg_stats_update_completion(bfqq_group(bfqq), + rq_start_time_ns(rq), + rq_io_start_time_ns(rq), + rq->cmd_flags); if (likely(rq->rq_flags & RQF_STARTED)) { unsigned long flags; @@ -3707,6 +5437,8 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, if (!bic) goto queue_fail; + bfq_bic_update_cgroup(bic, bio); + bfqq = bic_to_bfqq(bic, is_sync); if (!bfqq || bfqq == &bfqd->oom_bfqq) { if (bfqq) @@ -3803,6 +5535,8 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, bfq_log(bfqd, "put_async_bfqq: %p", bfqq); if (bfqq) { + bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); + bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", bfqq, bfqq->ref); bfq_put_queue(bfqq); @@ -3811,18 +5545,20 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, } /* - * Release the extra reference of the async queues as the device - * goes away. + * Release all the bfqg references to its async queues. If we are + * deallocating the group these queues may still contain requests, so + * we reparent them to the root cgroup (i.e., the only one that will + * exist for sure until all the requests on a device are gone). */ -static void bfq_put_async_queues(struct bfq_data *bfqd) +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) { int i, j; for (i = 0; i < 2; i++) for (j = 0; j < IOPRIO_BE_NR; j++) - __bfq_put_async_bfqq(bfqd, &async_bfqq[i][j]); + __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); - __bfq_put_async_bfqq(bfqd, &async_idle_bfqq); + __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); } static void bfq_exit_queue(struct elevator_queue *e) @@ -3834,20 +5570,42 @@ static void bfq_exit_queue(struct elevator_queue *e) spin_lock_irq(&bfqd->lock); list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) - bfq_deactivate_bfqq(bfqd, bfqq, false); - bfq_put_async_queues(bfqd); + bfq_deactivate_bfqq(bfqd, bfqq, false, false); spin_unlock_irq(&bfqd->lock); hrtimer_cancel(&bfqd->idle_slice_timer); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); +#else + spin_lock_irq(&bfqd->lock); + bfq_put_async_queues(bfqd, bfqd->root_group); + kfree(bfqd->root_group); + spin_unlock_irq(&bfqd->lock); +#endif + kfree(bfqd); } +static void bfq_init_root_group(struct bfq_group *root_group, + struct bfq_data *bfqd) +{ + int i; + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + root_group->entity.parent = NULL; + root_group->my_entity = NULL; + root_group->bfqd = bfqd; +#endif + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + root_group->sched_data.bfq_class_idle_last_service = jiffies; +} + static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) { struct bfq_data *bfqd; struct elevator_queue *eq; - int i; eq = elevator_alloc(q, e); if (!eq) @@ -3860,6 +5618,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) } eq->elevator_data 
= bfqd; + spin_lock_irq(q->queue_lock); + q->elevator = eq; + spin_unlock_irq(q->queue_lock); + /* * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. * Grab a permanent reference to it, so that the normal code flow @@ -3880,8 +5642,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->queue = q; - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) - bfqd->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + INIT_LIST_HEAD(&bfqd->dispatch); hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -3899,17 +5660,40 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->bfq_back_max = bfq_back_max; bfqd->bfq_back_penalty = bfq_back_penalty; bfqd->bfq_slice_idle = bfq_slice_idle; - bfqd->bfq_class_idle_last_service = 0; bfqd->bfq_timeout = bfq_timeout; bfqd->bfq_requests_within_timer = 120; spin_lock_init(&bfqd->lock); - INIT_LIST_HEAD(&bfqd->dispatch); - q->elevator = eq; + /* + * The invocation of the next bfq_create_group_hierarchy + * function is the head of a chain of function calls + * (bfq_create_group_hierarchy->blkcg_activate_policy-> + * blk_mq_freeze_queue) that may lead to the invocation of the + * has_work hook function. For this reason, + * bfq_create_group_hierarchy is invoked only after all + * scheduler data has been initialized, apart from the fields + * that can be initialized only after invoking + * bfq_create_group_hierarchy. This, in particular, enables + * has_work to correctly return false. Of course, to avoid + * other inconsistencies, the blk-mq stack must then refrain + * from invoking further scheduler hooks before this init + * function is finished. + */ + bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); + if (!bfqd->root_group) + goto out_free; + bfq_init_root_group(bfqd->root_group, bfqd); + bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); + return 0; + +out_free: + kfree(bfqd); + kobject_put(&eq->kobj); + return -ENOMEM; } static void bfq_slab_kill(void) @@ -4134,10 +5918,34 @@ static struct elevator_type iosched_bfq_mq = { .elevator_owner = THIS_MODULE, }; +#ifdef CONFIG_BFQ_GROUP_IOSCHED +static struct blkcg_policy blkcg_policy_bfq = { + .dfl_cftypes = bfq_blkg_files, + .legacy_cftypes = bfq_blkcg_legacy_files, + + .cpd_alloc_fn = bfq_cpd_alloc, + .cpd_init_fn = bfq_cpd_init, + .cpd_bind_fn = bfq_cpd_init, + .cpd_free_fn = bfq_cpd_free, + + .pd_alloc_fn = bfq_pd_alloc, + .pd_init_fn = bfq_pd_init, + .pd_offline_fn = bfq_pd_offline, + .pd_free_fn = bfq_pd_free, + .pd_reset_stats_fn = bfq_pd_reset_stats, +}; +#endif + static int __init bfq_init(void) { int ret; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + ret = blkcg_policy_register(&blkcg_policy_bfq); + if (ret) + return ret; +#endif + ret = -ENOMEM; if (bfq_slab_setup()) goto err_pol_unreg; @@ -4149,12 +5957,18 @@ static int __init bfq_init(void) return 0; err_pol_unreg: +#ifdef CONFIG_BFQ_GROUP_IOSCHED + blkcg_policy_unregister(&blkcg_policy_bfq); +#endif return ret; } static void __exit bfq_exit(void) { elv_unregister(&iosched_bfq_mq); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + blkcg_policy_unregister(&blkcg_policy_bfq); +#endif bfq_slab_kill(); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ec993573e0a8..fe9c512cc6fa 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -50,7 +50,7 @@ struct blk_stat_callback; * Maximum number of blkcg policies allowed to be registered concurrently. * Defined here to simplify include dependency. 
*/ -#define BLKCG_MAX_POLS 2 +#define BLKCG_MAX_POLS 3 typedef void (rq_end_io_fn)(struct request *, int); -- cgit v1.2.3 From 44e44a1b329ed37a98bc41ab21fb6897d5a922ac Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 12 Apr 2017 18:23:12 +0200 Subject: block, bfq: improve responsiveness This patch introduces a simple heuristic to load applications quickly, and to perform the I/O requested by interactive applications just as quickly. To this purpose, both a newly-created queue and a queue associated with an interactive application (we explain in a moment how BFQ decides whether the associated application is interactive), receive the following two special treatments: 1) The weight of the queue is raised. 2) The queue unconditionally enjoys device idling when it empties; in fact, if the requests of a queue are sync, then performing device idling for the queue is a necessary condition to guarantee that the queue receives a fraction of the throughput proportional to its weight (see [1] for details). For brevity, we call just weight-raising the combination of these two preferential treatments. For a newly-created queue, weight-raising starts immediately and lasts for a time interval that: 1) depends on the device speed and type (rotational or non-rotational), and 2) is equal to the time needed to load (start up) a large-size application on that device, with cold caches and with no additional workload. Finally, as for guaranteeing a fast execution to interactive, I/O-related tasks (such as opening a file), consider that any interactive application blocks and waits for user input both after starting up and after executing some task. After a while, the user may trigger new operations, after which the application stops again, and so on. Accordingly, the low-latency heuristic weight-raises again a queue in case it becomes backlogged after being idle for a sufficiently long (configurable) time. The weight-raising then lasts for the same time as for a just-created queue. According to our experiments, the combination of this low-latency heuristic and of the improvements described in the previous patch allows BFQ to guarantee a high application responsiveness. [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O Scheduler", Proceedings of the First Workshop on Mobile System Technologies (MST-2015), May 2015. http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf Signed-off-by: Paolo Valente Signed-off-by: Arianna Avanzini Signed-off-by: Jens Axboe --- Documentation/block/bfq-iosched.txt | 9 + block/bfq-iosched.c | 740 ++++++++++++++++++++++++++++++++---- 2 files changed, 675 insertions(+), 74 deletions(-) (limited to 'Documentation') diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index 461b27fce979..1b87df6cd476 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -375,6 +375,11 @@ default, low latency mode is enabled. If enabled, interactive and soft real-time applications are privileged and experience a lower latency, as explained in more detail in the description of how BFQ works. +DO NOT enable this mode if you need full control on bandwidth +distribution. In fact, if it is enabled, then BFQ automatically +increases the bandwidth share of privileged applications, as the main +means to guarantee a lower latency to them. 
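
For example, a setup that wants strictly weight-based bandwidth control could clear the tunable before applying its own weights. A small sketch of doing so from a program; the device name in the sysfs path is just an example and must be adapted:

#include <stdio.h>

int main(void)
{
        /* Substitute the actual device for "sda". */
        FILE *f = fopen("/sys/block/sda/queue/iosched/low_latency", "w");

        if (!f) {
                perror("open low_latency");
                return 1;
        }
        fputs("0\n", f);        /* 0 disables the latency heuristics */
        fclose(f);
        return 0;
}
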
+ timeout_sync ------------ @@ -507,6 +512,10 @@ linear mapping between ioprio and weights, described at the beginning of the tunable section, is still valid, but all weights higher than IOPRIO_BE_NR*10 are mapped to ioprio 0. +Recall that, if low-latency is set, then BFQ automatically raises the +weight of the queues associated with interactive and soft real-time +applications. Unset this tunable if you need/want to control weights. + [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O Scheduler", Proceedings of the First Workshop on Mobile System diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index dce273b91015..1a32c8341ab0 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -339,6 +339,17 @@ struct bfq_queue { /* pid of the process owning the queue, used for logging purposes */ pid_t pid; + + /* current maximum weight-raising time for this queue */ + unsigned long wr_cur_max_time; + /* + * Start time of the current weight-raising period if + * the @bfq-queue is being weight-raised, otherwise + * finish time of the last weight-raising period. + */ + unsigned long last_wr_start_finish; + /* factor by which the weight of this queue is multiplied */ + unsigned int wr_coeff; }; /** @@ -356,6 +367,11 @@ struct bfq_io_cq { #endif }; +enum bfq_device_speed { + BFQ_BFQD_FAST, + BFQ_BFQD_SLOW, +}; + /** * struct bfq_data - per-device data structure. * @@ -487,6 +503,34 @@ struct bfq_data { */ bool strict_guarantees; + /* if set to true, low-latency heuristics are enabled */ + bool low_latency; + /* + * Maximum factor by which the weight of a weight-raised queue + * is multiplied. + */ + unsigned int bfq_wr_coeff; + /* maximum duration of a weight-raising period (jiffies) */ + unsigned int bfq_wr_max_time; + /* + * Minimum idle period after which weight-raising may be + * reactivated for a queue (in jiffies). + */ + unsigned int bfq_wr_min_idle_time; + /* + * Minimum period between request arrivals after which + * weight-raising may be reactivated for an already busy async + * queue (in jiffies). + */ + unsigned long bfq_wr_min_inter_arr_async; + /* + * Cached value of the product R*T, used for computing the + * maximum duration of weight raising automatically. 
+ */ + u64 RT_prod; + /* device-speed class for the low-latency heuristic */ + enum bfq_device_speed device_speed; + /* fallback dummy bfqq for extreme OOM conditions */ struct bfq_queue oom_bfqq; @@ -515,7 +559,6 @@ enum bfqq_state_flags { BFQQF_fifo_expire, /* FIFO checked in this slice */ BFQQF_idle_window, /* slice idling enabled */ BFQQF_sync, /* synchronous queue */ - BFQQF_budget_new, /* no completion with this budget */ BFQQF_IO_bound, /* * bfqq has timed-out at least once * having consumed at most 2/10 of @@ -543,7 +586,6 @@ BFQ_BFQQ_FNS(non_blocking_wait_rq); BFQ_BFQQ_FNS(fifo_expire); BFQ_BFQQ_FNS(idle_window); BFQ_BFQQ_FNS(sync); -BFQ_BFQQ_FNS(budget_new); BFQ_BFQQ_FNS(IO_bound); #undef BFQ_BFQQ_FNS @@ -637,7 +679,7 @@ struct bfq_group_data { /* must be the first member */ struct blkcg_policy_data pd; - unsigned short weight; + unsigned int weight; }; /** @@ -732,6 +774,8 @@ static void bfq_put_queue(struct bfq_queue *bfqq); static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bio *bio, bool is_sync, struct bfq_io_cq *bic); +static void bfq_end_wr_async_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg); static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); @@ -787,6 +831,56 @@ static struct kmem_cache *bfq_pool; /* Shift used for peak rate fixed precision calculations. */ #define BFQ_RATE_SHIFT 16 +/* + * By default, BFQ computes the duration of the weight raising for + * interactive applications automatically, using the following formula: + * duration = (R / r) * T, where r is the peak rate of the device, and + * R and T are two reference parameters. + * In particular, R is the peak rate of the reference device (see below), + * and T is a reference time: given the systems that are likely to be + * installed on the reference device according to its speed class, T is + * about the maximum time needed, under BFQ and while reading two files in + * parallel, to load typical large applications on these systems. + * In practice, the slower/faster the device at hand is, the more/less it + * takes to load applications with respect to the reference device. + * Accordingly, the longer/shorter BFQ grants weight raising to interactive + * applications. + * + * BFQ uses four different reference pairs (R, T), depending on: + * . whether the device is rotational or non-rotational; + * . whether the device is slow, such as old or portable HDDs, as well as + * SD cards, or fast, such as newer HDDs and SSDs. + * + * The device's speed class is dynamically (re)detected in + * bfq_update_peak_rate() every time the estimated peak rate is updated. + * + * In the following definitions, R_slow[0]/R_fast[0] and + * T_slow[0]/T_fast[0] are the reference values for a slow/fast + * rotational device, whereas R_slow[1]/R_fast[1] and + * T_slow[1]/T_fast[1] are the reference values for a slow/fast + * non-rotational device. Finally, device_speed_thresh are the + * thresholds used to switch between speed classes. The reference + * rates are not the actual peak rates of the devices used as a + * reference, but slightly lower values. The reason for using these + * slightly lower values is that the peak-rate estimator tends to + * yield slightly lower values than the actual peak rate (it can yield + * the actual peak rate only if there is only one process doing I/O, + * and the process does sequential I/O). 
+ * + * Both the reference peak rates and the thresholds are measured in + * sectors/usec, left-shifted by BFQ_RATE_SHIFT. + */ +static int R_slow[2] = {1000, 10700}; +static int R_fast[2] = {14000, 33000}; +/* + * To improve readability, a conversion function is used to initialize the + * following arrays, which entails that they can be initialized only in a + * function. + */ +static int T_slow[2]; +static int T_fast[2]; +static int device_speed_thresh[2]; + #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) @@ -1486,7 +1580,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, if (entity->prio_changed) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - unsigned short prev_weight, new_weight; + unsigned int prev_weight, new_weight; struct bfq_data *bfqd = NULL; #ifdef CONFIG_BFQ_GROUP_IOSCHED struct bfq_sched_data *sd; @@ -1535,7 +1629,8 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, new_st = bfq_entity_service_tree(entity); prev_weight = entity->weight; - new_weight = entity->orig_weight; + new_weight = entity->orig_weight * + (bfqq ? bfqq->wr_coeff : 1); entity->weight = new_weight; new_st->wsum += entity->weight; @@ -1630,6 +1725,8 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, struct bfq_service_tree *st, bool backshifted) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + st = __bfq_entity_update_weight_prio(st, entity); bfq_calc_finish(entity, entity->budget); @@ -1659,10 +1756,19 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, * time. This may introduce a little unfairness among queues * with backshifted timestamps, but it does not break * worst-case fairness guarantees. + * + * As a special case, if bfqq is weight-raised, push up + * timestamps much less, to keep very low the probability that + * this push up causes the backshifted finish timestamps of + * weight-raised queues to become higher than the backshifted + * finish timestamps of non weight-raised queues. 
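
A quick numeric illustration of this scaled push-up (standalone; the coefficient value below is only an example, not a value taken from the code):

#include <stdio.h>

int main(void)
{
        unsigned long vtime = 10000, finish = 9400;     /* backshifted queue */
        unsigned long delta = vtime - finish;           /* 600 */
        unsigned int wr_coeff = 30;                     /* example raising factor */

        printf("push-up for a weight-raised queue: %lu\n", delta / wr_coeff);
        printf("push-up for a normal queue:        %lu\n", delta);
        return 0;
}
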
*/ if (backshifted && bfq_gt(st->vtime, entity->finish)) { unsigned long delta = st->vtime - entity->finish; + if (bfqq) + delta /= bfqq->wr_coeff; + entity->start += delta; entity->finish += delta; } @@ -3070,6 +3176,18 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) bfqg_stats_xfer_dead(bfqg); } +static void bfq_end_wr_async(struct bfq_data *bfqd) +{ + struct blkcg_gq *blkg; + + list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { + struct bfq_group *bfqg = blkg_to_bfqg(blkg); + + bfq_end_wr_async_queues(bfqd, bfqg); + } + bfq_end_wr_async_queues(bfqd, bfqd->root_group); +} + static int bfq_io_show_weight(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); @@ -3433,6 +3551,11 @@ static void bfq_init_entity(struct bfq_entity *entity, static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} +static void bfq_end_wr_async(struct bfq_data *bfqd) +{ + bfq_end_wr_async_queues(bfqd, bfqd->root_group); +} + static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, struct blkcg *blkcg) { @@ -3613,7 +3736,7 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd, static unsigned long bfq_serv_to_charge(struct request *rq, struct bfq_queue *bfqq) { - if (bfq_bfqq_sync(bfqq)) + if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) return blk_rq_sectors(rq); return blk_rq_sectors(rq) * bfq_async_charge_factor; @@ -3700,12 +3823,12 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, * whether the in-service queue should be expired, by returning * true. The purpose of expiring the in-service queue is to give bfqq * the chance to possibly preempt the in-service queue, and the reason - * for preempting the in-service queue is to achieve the following - * goal: guarantee to bfqq its reserved bandwidth even if bfqq has - * expired because it has remained idle. + * for preempting the in-service queue is to achieve one of the two + * goals below. * - * In particular, bfqq may have expired for one of the following two - * reasons: + * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has + * expired because it has remained idle. In particular, bfqq may have + * expired for one of the following two reasons: * * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling * and did not make it to issue a new request before its last @@ -3769,10 +3892,36 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, * above-described special way, and signals that the in-service queue * should be expired. Timestamp back-shifting is done later in * __bfq_activate_entity. + * + * 2. Reduce latency. Even if timestamps are not backshifted to let + * the process associated with bfqq recover a service hole, bfqq may + * however happen to have, after being (re)activated, a lower finish + * timestamp than the in-service queue. That is, the next budget of + * bfqq may have to be completed before the one of the in-service + * queue. If this is the case, then preempting the in-service queue + * allows this goal to be achieved, apart from the unpreemptible, + * outstanding requests mentioned above. + * + * Unfortunately, regardless of which of the above two goals one wants + * to achieve, service trees need first to be updated to know whether + * the in-service queue must be preempted. To have service trees + * correctly updated, the in-service queue must be expired and + * rescheduled, and bfqq must be scheduled too. 
This is one of the + * most costly operations (in future versions, the scheduling + * mechanism may be re-designed in such a way to make it possible to + * know whether preemption is needed without needing to update service + * trees). In addition, queue preemptions almost always cause random + * I/O, and thus loss of throughput. Because of these facts, the next + * function adopts the following simple scheme to avoid both costly + * operations and too frequent preemptions: it requests the expiration + * of the in-service queue (unconditionally) only for queues that need + * to recover a hole, or that either are weight-raised or deserve to + * be weight-raised. */ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool arrived_in_time) + bool arrived_in_time, + bool wr_or_deserves_wr) { struct bfq_entity *entity = &bfqq->entity; @@ -3807,14 +3956,85 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, entity->budget = max_t(unsigned long, bfqq->max_budget, bfq_serv_to_charge(bfqq->next_rq, bfqq)); bfq_clear_bfqq_non_blocking_wait_rq(bfqq); - return false; + return wr_or_deserves_wr; +} + +static unsigned int bfq_wr_duration(struct bfq_data *bfqd) +{ + u64 dur; + + if (bfqd->bfq_wr_max_time > 0) + return bfqd->bfq_wr_max_time; + + dur = bfqd->RT_prod; + do_div(dur, bfqd->peak_rate); + + /* + * Limit duration between 3 and 13 seconds. Tests show that + * higher values than 13 seconds often yield the opposite of + * the desired result, i.e., worsen responsiveness by letting + * non-interactive and non-soft-real-time applications + * preserve weight raising for a too long time interval. + * + * On the other end, lower values than 3 seconds make it + * difficult for most interactive tasks to complete their jobs + * before weight-raising finishes. + */ + if (dur > msecs_to_jiffies(13000)) + dur = msecs_to_jiffies(13000); + else if (dur < msecs_to_jiffies(3000)) + dur = msecs_to_jiffies(3000); + + return dur; +} + +static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + unsigned int old_wr_coeff, + bool wr_or_deserves_wr, + bool interactive) +{ + if (old_wr_coeff == 1 && wr_or_deserves_wr) { + /* start a weight-raising period */ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + /* update wr duration */ + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + + /* + * If needed, further reduce budget to make sure it is + * close to bfqq's backlog, so as to reduce the + * scheduling-error component due to a too large + * budget. Do not care about throughput consequences, + * but only about latency. Finally, do not assign a + * too small budget either, to avoid increasing + * latency by causing too frequent expirations. 
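
As a rough numerical illustration of the duration = (R / r) * T rule implemented by bfq_wr_duration() above, together with the 3-13 s clamp: the sketch below is a standalone model, where the R*T value is an arbitrary placeholder expressed so that dividing by the peak rate yields milliseconds; only the clamp bounds come from the code.

#include <stdint.h>
#include <stdio.h>

/* Clamp bounds taken from the comment above: 3 and 13 seconds. */
#define WR_MIN_MS  3000
#define WR_MAX_MS 13000

/* rt_prod_ms models R*T (reference rate times reference time), peak_rate
 * the estimated device peak rate in the same rate unit as R; the result
 * is the weight-raising duration in milliseconds. */
static uint64_t wr_duration_ms(uint64_t rt_prod_ms, uint64_t peak_rate)
{
        uint64_t dur = rt_prod_ms / peak_rate;

        if (dur > WR_MAX_MS)
                dur = WR_MAX_MS;
        else if (dur < WR_MIN_MS)
                dur = WR_MIN_MS;
        return dur;
}

int main(void)
{
        /* Slower device -> longer weight raising; fast device hits the clamp. */
        printf("%llu ms\n", (unsigned long long)wr_duration_ms(80000000ULL, 10000)); /* 8000 */
        printf("%llu ms\n", (unsigned long long)wr_duration_ms(80000000ULL, 40000)); /* 3000 (clamped) */
        return 0;
}
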
+ */ + bfqq->entity.budget = min_t(unsigned long, + bfqq->entity.budget, + 2 * bfq_min_budget(bfqd)); + } else if (old_wr_coeff > 1) { + /* update wr duration */ + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + } +} + +static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + return bfqq->dispatched == 0 && + time_is_before_jiffies( + bfqq->budget_timeout + + bfqd->bfq_wr_min_idle_time); } static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct request *rq) + int old_wr_coeff, + struct request *rq, + bool *interactive) { - bool bfqq_wants_to_preempt, + bool wr_or_deserves_wr, bfqq_wants_to_preempt, + idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), /* * See the comments on * bfq_bfqq_update_budg_for_activation for @@ -3827,12 +4047,23 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); /* - * Update budget and check whether bfqq may want to preempt - * the in-service queue. + * bfqq deserves to be weight-raised if: + * - it is sync, + * - it has been idle for enough time. + */ + *interactive = idle_for_long_time; + wr_or_deserves_wr = bfqd->low_latency && + (bfqq->wr_coeff > 1 || + (bfq_bfqq_sync(bfqq) && *interactive)); + + /* + * Using the last flag, update budget and check whether bfqq + * may want to preempt the in-service queue. */ bfqq_wants_to_preempt = bfq_bfqq_update_budg_for_activation(bfqd, bfqq, - arrived_in_time); + arrived_in_time, + wr_or_deserves_wr); if (!bfq_bfqq_IO_bound(bfqq)) { if (arrived_in_time) { @@ -3844,6 +4075,16 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfqq->requests_within_timer = 0; } + if (bfqd->low_latency) { + bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, + old_wr_coeff, + wr_or_deserves_wr, + *interactive); + + if (old_wr_coeff != bfqq->wr_coeff) + bfqq->entity.prio_changed = 1; + } + bfq_add_bfqq_busy(bfqd, bfqq); /* @@ -3857,6 +4098,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, * function bfq_bfqq_update_budg_for_activation). */ if (bfqd->in_service_queue && bfqq_wants_to_preempt && + bfqd->in_service_queue->wr_coeff == 1 && next_queue_may_preempt(bfqd)) bfq_bfqq_expire(bfqd, bfqd->in_service_queue, false, BFQQE_PREEMPTED); @@ -3867,6 +4109,8 @@ static void bfq_add_request(struct request *rq) struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd = bfqq->bfqd; struct request *next_rq, *prev; + unsigned int old_wr_coeff = bfqq->wr_coeff; + bool interactive = false; bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); bfqq->queued[rq_is_sync(rq)]++; @@ -3882,9 +4126,45 @@ static void bfq_add_request(struct request *rq) bfqq->next_rq = next_rq; if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */ - bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, rq); - else if (prev != bfqq->next_rq) - bfq_updated_next_req(bfqd, bfqq); + bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, + rq, &interactive); + else { + if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && + time_is_before_jiffies( + bfqq->last_wr_start_finish + + bfqd->bfq_wr_min_inter_arr_async)) { + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + + bfqq->entity.prio_changed = 1; + } + if (prev != bfqq->next_rq) + bfq_updated_next_req(bfqd, bfqq); + } + + /* + * Assign jiffies to last_wr_start_finish in the following + * cases: + * + * . 
if bfqq is not going to be weight-raised, because, for + * non weight-raised queues, last_wr_start_finish stores the + * arrival time of the last request; as of now, this piece + * of information is used only for deciding whether to + * weight-raise async queues + * + * . if bfqq is not weight-raised, because, if bfqq is now + * switching to weight-raised, then last_wr_start_finish + * stores the time when weight-raising starts + * + * . if bfqq is interactive, because, regardless of whether + * bfqq is currently weight-raised, the weight-raising + * period must start or restart (this case is considered + * separately because it is not detected by the above + * conditions, if bfqq is already weight-raised) + */ + if (bfqd->low_latency && + (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) + bfqq->last_wr_start_finish = jiffies; } static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, @@ -4087,6 +4367,46 @@ end: bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); } +/* Must be called with bfqq != NULL */ +static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) +{ + bfqq->wr_coeff = 1; + bfqq->wr_cur_max_time = 0; + /* + * Trigger a weight change on the next invocation of + * __bfq_entity_update_weight_prio. + */ + bfqq->entity.prio_changed = 1; +} + +static void bfq_end_wr_async_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + if (bfqg->async_bfqq[i][j]) + bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); + if (bfqg->async_idle_bfqq) + bfq_bfqq_end_wr(bfqg->async_idle_bfqq); +} + +static void bfq_end_wr(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq; + + spin_lock_irq(&bfqd->lock); + + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) + bfq_bfqq_end_wr(bfqq); + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) + bfq_bfqq_end_wr(bfqq); + bfq_end_wr_async(bfqd); + + spin_unlock_irq(&bfqd->lock); +} + static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, struct bio *bio) { @@ -4110,16 +4430,32 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, return bfqq == RQ_BFQQ(rq); } +/* + * Set the maximum time for the in-service queue to consume its + * budget. This prevents seeky processes from lowering the throughput. + * In practice, a time-slice service scheme is used with seeky + * processes. + */ +static void bfq_set_budget_timeout(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + bfqd->last_budget_start = ktime_get(); + + bfqq->budget_timeout = jiffies + + bfqd->bfq_timeout * + (bfqq->entity.weight / bfqq->entity.orig_weight); +} + static void __bfq_set_in_service_queue(struct bfq_data *bfqd, struct bfq_queue *bfqq) { if (bfqq) { bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); - bfq_mark_bfqq_budget_new(bfqq); bfq_clear_bfqq_fifo_expire(bfqq); bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8; + bfq_set_budget_timeout(bfqd, bfqq); bfq_log_bfqq(bfqd, bfqq, "set_in_service_queue, cur-budget = %d", bfqq->entity.budget); @@ -4159,9 +4495,13 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) */ sl = bfqd->bfq_slice_idle; /* - * Grant only minimum idle time if the queue is seeky. + * Unless the queue is being weight-raised, grant only minimum + * idle time if the queue is seeky. A long idling is preserved + * for a weight-raised queue, because it is needed for + * guaranteeing to the queue its reserved share of the + * throughput. 
*/ - if (BFQQ_SEEKY(bfqq)) + if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1) sl = min_t(u64, sl, BFQ_MIN_TT); bfqd->last_idling_start = ktime_get(); @@ -4170,27 +4510,6 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); } -/* - * Set the maximum time for the in-service queue to consume its - * budget. This prevents seeky processes from lowering the disk - * throughput (always guaranteed with a time slice scheme as in CFQ). - */ -static void bfq_set_budget_timeout(struct bfq_data *bfqd) -{ - struct bfq_queue *bfqq = bfqd->in_service_queue; - unsigned int timeout_coeff = bfqq->entity.weight / - bfqq->entity.orig_weight; - - bfqd->last_budget_start = ktime_get(); - - bfq_clear_bfqq_budget_new(bfqq); - bfqq->budget_timeout = jiffies + - bfqd->bfq_timeout * timeout_coeff; - - bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", - jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); -} - /* * In autotuning mode, max_budget is dynamically recomputed as the * amount of sectors transferred in timeout at the estimated peak @@ -4204,6 +4523,42 @@ static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; } +/* + * Update parameters related to throughput and responsiveness, as a + * function of the estimated peak rate. See comments on + * bfq_calc_max_budget(), and on T_slow and T_fast arrays. + */ +static void update_thr_responsiveness_params(struct bfq_data *bfqd) +{ + int dev_type = blk_queue_nonrot(bfqd->queue); + + if (bfqd->bfq_user_max_budget == 0) + bfqd->bfq_max_budget = + bfq_calc_max_budget(bfqd); + + if (bfqd->device_speed == BFQ_BFQD_FAST && + bfqd->peak_rate < device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_SLOW; + bfqd->RT_prod = R_slow[dev_type] * + T_slow[dev_type]; + } else if (bfqd->device_speed == BFQ_BFQD_SLOW && + bfqd->peak_rate > device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_FAST; + bfqd->RT_prod = R_fast[dev_type] * + T_fast[dev_type]; + } + + bfq_log(bfqd, +"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec", + dev_type == 0 ? "ROT" : "NONROT", + bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW", + bfqd->device_speed == BFQ_BFQD_FAST ? + (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT : + (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT, + (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>> + BFQ_RATE_SHIFT); +} + static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) { @@ -4315,9 +4670,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) rate /= divisor; /* smoothing constant alpha = 1/divisor */ bfqd->peak_rate += rate; - if (bfqd->bfq_user_max_budget == 0) - bfqd->bfq_max_budget = - bfq_calc_max_budget(bfqd); + update_thr_responsiveness_params(bfqd); reset_computation: bfq_reset_rate_computation(bfqd, rq); @@ -4439,9 +4792,18 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - if (RB_EMPTY_ROOT(&bfqq->sort_list)) + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + if (bfqq->dispatched == 0) + /* + * Overloading budget_timeout field to store + * the time at which the queue remains with no + * backlog and no outstanding request; used by + * the weight-raising mechanism. 
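
The overloaded timestamp above feeds the interactivity test introduced earlier (bfq_bfqq_idle_for_long_time()): a queue that becomes backlogged again only after staying completely idle for at least bfq_wr_min_idle_time is treated as interactive. A standalone model of that test, with times in milliseconds and illustrative names:

#include <stdbool.h>
#include <stdio.h>

/* now: current time; ts_empty: instant the queue last became completely
 * idle (no backlog, no outstanding requests); min_idle: minimum idle
 * period required; dispatched: requests still in flight. */
static bool idle_for_long_time(unsigned long now, unsigned long ts_empty,
                               unsigned long min_idle, int dispatched)
{
        return dispatched == 0 && now >= ts_empty + min_idle;
}

int main(void)
{
        /* Queue went idle at t = 1000 ms, minimum idle period 2000 ms. */
        printf("%d\n", idle_for_long_time(2500, 1000, 2000, 0)); /* 0: too soon */
        printf("%d\n", idle_for_long_time(3200, 1000, 2000, 0)); /* 1: long enough */
        return 0;
}
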
+ */ + bfqq->budget_timeout = jiffies; + bfq_del_bfqq_busy(bfqd, bfqq, true); - else + } else bfq_requeue_bfqq(bfqd, bfqq); /* @@ -4468,9 +4830,18 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, struct request *next_rq; int budget, min_budget; - budget = bfqq->max_budget; min_budget = bfq_min_budget(bfqd); + if (bfqq->wr_coeff == 1) + budget = bfqq->max_budget; + else /* + * Use a constant, low budget for weight-raised queues, + * to help achieve a low latency. Keep it slightly higher + * than the minimum possible budget, to cause a little + * bit fewer expirations. + */ + budget = 2 * min_budget; + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", @@ -4478,7 +4849,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); - if (bfq_bfqq_sync(bfqq)) { + if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { switch (reason) { /* * Caveat: in all the following cases we trade latency @@ -4577,7 +4948,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, default: return; } - } else { + } else if (!bfq_bfqq_sync(bfqq)) { /* * Async queues get always the maximum possible * budget, as for them we do not care about latency @@ -4766,15 +5137,19 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, * bandwidth, and not time, distribution with little unlucky * or quasi-sequential processes. */ - if (slow || - (reason == BFQQE_BUDGET_TIMEOUT && - bfq_bfqq_budget_left(bfqq) >= entity->budget / 3)) + if (bfqq->wr_coeff == 1 && + (slow || + (reason == BFQQE_BUDGET_TIMEOUT && + bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) bfq_bfqq_charge_time(bfqd, bfqq, delta); if (reason == BFQQE_TOO_IDLE && entity->service <= 2 * entity->budget / 10) bfq_clear_bfqq_IO_bound(bfqq); + if (bfqd->low_latency && bfqq->wr_coeff == 1) + bfqq->last_wr_start_finish = jiffies; + bfq_log_bfqq(bfqd, bfqq, "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); @@ -4801,10 +5176,7 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, */ static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) { - if (bfq_bfqq_budget_new(bfqq) || - time_is_after_jiffies(bfqq->budget_timeout)) - return false; - return true; + return time_is_before_eq_jiffies(bfqq->budget_timeout); } /* @@ -4831,19 +5203,40 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) /* * For a queue that becomes empty, device idling is allowed only if - * this function returns true for the queue. And this function returns - * true only if idling is beneficial for throughput. + * this function returns true for the queue. As a consequence, since + * device idling plays a critical role in both throughput boosting and + * service guarantees, the return value of this function plays a + * critical role in both these aspects as well. + * + * In a nutshell, this function returns true only if idling is + * beneficial for throughput or, even if detrimental for throughput, + * idling is however necessary to preserve service guarantees (low + * latency, desired throughput distribution, ...). 
+ * NCQ-capable devices, this function tries to return false, so as to
+ * help keep the drives' internal queues full, whenever this helps the
+ * device boost the throughput without causing any service-guarantee
+ * issue.
+ *
+ * In more detail, the return value of this function is obtained by,
+ * first, computing a number of boolean variables that take into
+ * account throughput and service-guarantee issues, and, then,
+ * combining these variables in a logical expression. Most of the
+ * issues taken into account are not trivial. We discuss these issues
+ * individually while introducing the variables.
  */
 static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 {
 	struct bfq_data *bfqd = bfqq->bfqd;
-	bool idling_boosts_thr;
+	bool idling_boosts_thr, asymmetric_scenario;
 
 	if (bfqd->strict_guarantees)
 		return true;
 
 	/*
-	 * The value of the next variable is computed considering that
+	 * The next variable takes into account the cases where idling
+	 * boosts the throughput.
+	 *
+	 * The value of the variable is computed considering that
 	 * idling is usually beneficial for the throughput if:
 	 * (a) the device is not NCQ-capable, or
 	 * (b) regardless of the presence of NCQ, the request pattern
@@ -4857,13 +5250,80 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 	idling_boosts_thr = !bfqd->hw_tag || bfq_bfqq_IO_bound(bfqq);
 
 	/*
-	 * We have now the components we need to compute the return
-	 * value of the function, which is true only if both the
-	 * following conditions hold:
+	 * There is then a case where idling must be performed not for
+	 * throughput concerns, but to preserve service guarantees. To
+	 * introduce it, we can note that allowing the drive to
+	 * enqueue more than one request at a time, and hence
+	 * delegating de facto final scheduling decisions to the
+	 * drive's internal scheduler, causes loss of control on the
+	 * actual request service order. In particular, the critical
+	 * situation is when requests from different processes happen
+	 * to be present, at the same time, in the internal queue(s)
+	 * of the drive. In such a situation, the drive, by deciding
+	 * the service order of the internally-queued requests, does
+	 * determine also the actual throughput distribution among
+	 * these processes. But the drive typically has no notion or
+	 * concern about per-process throughput distribution, and
+	 * makes its decisions only on a per-request basis. Therefore,
+	 * the service distribution enforced by the drive's internal
+	 * scheduler is likely to coincide with the desired
+	 * device-throughput distribution only in a completely
+	 * symmetric scenario where: (i) each of these processes must
+	 * get the same throughput as the others; (ii) all these
+	 * processes have the same I/O pattern (either sequential or
+	 * random). In fact, in such a scenario, the drive will tend
+	 * to treat the requests of each of these processes in about
+	 * the same way as the requests of the others, and thus to
+	 * provide each of these processes with about the same
+	 * throughput (which is exactly the desired throughput
+	 * distribution). In contrast, in any asymmetric scenario,
+	 * device idling is certainly needed to guarantee that bfqq
+	 * receives its assigned fraction of the device throughput
+	 * (see [1] for details).
+	 *
+	 * As for sub-condition (i), actually we check only whether
+	 * bfqq is being weight-raised. In fact, if bfqq is not being
+	 * weight-raised, we have that:
+	 * - if the process associated with bfqq is not I/O-bound, then
+	 *   it is neither latency- nor throughput-critical; therefore
+	 *   idling is not needed for bfqq;
+	 * - if the process associated with bfqq is I/O-bound, then
+	 *   idling is already granted with bfqq (see the comments on
+	 *   idling_boosts_thr).
+	 *
+	 * We do not check sub-condition (ii) at all, i.e., the next
+	 * variable is true if and only if bfqq is being
+	 * weight-raised. We do not need to control sub-condition (ii)
+	 * for the following reason:
+	 * - if bfqq is being weight-raised, then idling is already
+	 *   guaranteed to bfqq by sub-condition (i);
+	 * - if bfqq is not being weight-raised, then idling is
+	 *   already guaranteed to bfqq (only) if it matters, i.e., if
+	 *   bfqq is associated with a currently I/O-bound process (see
+	 *   the above comment on sub-condition (i)).
+	 *
+	 * As a side note, it is worth considering that the above
+	 * device-idling countermeasures may however fail in the
+	 * following unlucky scenario: if idling is (correctly)
+	 * disabled in a time period during which the symmetry
+	 * sub-condition holds, and hence the device is allowed to
+	 * enqueue many requests, but at some later point in time some
+	 * sub-condition ceases to hold, then it may become impossible
+	 * to let requests be served in the desired order until all
+	 * the requests already queued in the device have been served.
+	 */
+	asymmetric_scenario = bfqq->wr_coeff > 1;
+
+	/*
+	 * We have now all the components we need to compute the return
+	 * value of the function, which is true only if both the following
+	 * conditions hold:
 	 * 1) bfqq is sync, because idling makes sense only for sync queues;
-	 * 2) idling boosts the throughput.
+	 * 2) idling either boosts the throughput (without issues), or
+	 *    is necessary to preserve service guarantees.
 	 */
-	return bfq_bfqq_sync(bfqq) && idling_boosts_thr;
+	return bfq_bfqq_sync(bfqq) &&
+		(idling_boosts_thr || asymmetric_scenario);
 }
 
 /*
@@ -4986,6 +5446,43 @@ keep_queue:
 	return bfqq;
 }
 
+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
+		bfq_log_bfqq(bfqd, bfqq,
+			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",
+			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
+			jiffies_to_msecs(bfqq->wr_cur_max_time),
+			bfqq->wr_coeff,
+			bfqq->entity.weight, bfqq->entity.orig_weight);
+
+		if (entity->prio_changed)
+			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
+
+		/*
+		 * If too much time has elapsed from the beginning of
+		 * this weight-raising period, then end weight
+		 * raising.
+		 */
+		if (time_is_before_jiffies(bfqq->last_wr_start_finish +
+					   bfqq->wr_cur_max_time)) {
+			bfqq->last_wr_start_finish = jiffies;
+			bfq_log_bfqq(bfqd, bfqq,
+				     "wrais ending at %lu, rais_max_time %u",
+				     bfqq->last_wr_start_finish,
+				     jiffies_to_msecs(bfqq->wr_cur_max_time));
+			bfq_bfqq_end_wr(bfqq);
+		}
+	}
+	/* Update weight both if it must be raised and if it must be lowered */
+	if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
+		__bfq_entity_update_weight_prio(
+			bfq_entity_service_tree(entity),
+			entity);
+}
+
 /*
  * Dispatch next request from bfqq.
  */
@@ -5001,6 +5498,19 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
 
 	bfq_dispatch_remove(bfqd->queue, rq);
 
+	/*
+	 * If weight raising has to terminate for bfqq, then the next
+	 * function causes an immediate update of bfqq's weight,
+	 * without waiting for the next activation. As a consequence, on
+	 * expiration, bfqq will be timestamped as if it had never been
+	 * weight-raised during this service slot, even if it has
+	 * received part or even most of the service as a
+	 * weight-raised queue. This inflates bfqq's timestamps, which
+	 * is beneficial, as bfqq is then more willing to leave the
+	 * device immediately to possible other weight-raised queues.
+	 */
+	bfq_update_wr_data(bfqd, bfqq);
+
 	if (!bfqd->in_service_bic) {
 		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
 		bfqd->in_service_bic = RQ_BIC(rq);
@@ -5306,6 +5816,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
 	bfqq->budget_timeout = bfq_smallest_from_now();
 
+	bfqq->wr_coeff = 1;
+	bfqq->last_wr_start_finish = bfq_smallest_from_now();
+
 	/* first request is almost certainly seeky */
 	bfqq->seek_history = 1;
 }
@@ -5440,7 +5953,8 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
 	    (bfqd->hw_tag && BFQQ_SEEKY(bfqq)))
 		enable_idle = 0;
 	else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
-		if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)
+		if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle &&
+		    bfqq->wr_coeff == 1)
 			enable_idle = 0;
 		else
 			enable_idle = 1;
@@ -5618,6 +6132,16 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 	bfqd->rq_in_driver--;
 	bfqq->dispatched--;
 
+	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
+		/*
+		 * Set budget_timeout (which we overload to store the
+		 * time at which the queue remains with no backlog and
+		 * no outstanding request; used by the weight-raising
+		 * mechanism).
+		 */
+		bfqq->budget_timeout = jiffies;
+	}
+
 	now_ns = ktime_get_ns();
 
 	bfqq->ttime.last_end_request = now_ns;
@@ -5655,10 +6179,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 	 * or if we want to idle in case it has no pending requests.
 	 */
 	if (bfqd->in_service_queue == bfqq) {
-		if (bfq_bfqq_budget_new(bfqq))
-			bfq_set_budget_timeout(bfqd);
-
-		if (bfq_bfqq_must_idle(bfqq)) {
+		if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
 			bfq_arm_slice_timer(bfqd);
 			return;
 		} else if (bfq_may_expire_for_budg_timeout(bfqq))
@@ -5966,6 +6487,26 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 
 	bfqd->bfq_requests_within_timer = 120;
 
+	bfqd->low_latency = true;
+
+	/*
+	 * Trade-off between responsiveness and fairness.
+	 */
+	bfqd->bfq_wr_coeff = 30;
+	bfqd->bfq_wr_max_time = 0;
+	bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
+	bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
+
+	/*
+	 * Begin by assuming, optimistically, that the device is a
+	 * high-speed one, and that its peak rate is equal to 2/3 of
+	 * the highest reference rate.
+	 */
+	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
+			T_fast[blk_queue_nonrot(bfqd->queue)];
+	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
+	bfqd->device_speed = BFQ_BFQD_FAST;
+
 	spin_lock_init(&bfqd->lock);
 
 	/*
@@ -6047,6 +6588,7 @@ SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
 SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
 SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
 SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
 #undef SHOW_FUNCTION
 
 #define USEC_SHOW_FUNCTION(__FUNC, __VAR)				\
@@ -6167,6 +6709,22 @@ static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
 	return ret;
 }
 
+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
+				     const char *page, size_t count)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	unsigned long uninitialized_var(__data);
+	int ret = bfq_var_store(&__data, (page), count);
+
+	if (__data > 1)
+		__data = 1;
+	if (__data == 0 && bfqd->low_latency != 0)
+		bfq_end_wr(bfqd);
+	bfqd->low_latency = __data;
+
+	return ret;
+}
+
 #define BFQ_ATTR(name) \
 	__ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)
 
@@ -6180,6 +6738,7 @@ static struct elv_fs_entry bfq_attrs[] = {
 	BFQ_ATTR(max_budget),
 	BFQ_ATTR(timeout_sync),
 	BFQ_ATTR(strict_guarantees),
+	BFQ_ATTR(low_latency),
 	__ATTR_NULL
 };
 
@@ -6242,6 +6801,39 @@ static int __init bfq_init(void)
 	if (bfq_slab_setup())
 		goto err_pol_unreg;
 
+	/*
+	 * Times to load large popular applications for the typical
+	 * systems installed on the reference devices (see the
+	 * comments before the definitions of the next two
+	 * arrays). Actually, we use slightly slower values, as the
+	 * estimated peak rate tends to be smaller than the actual
+	 * peak rate. The reason for this last fact is that estimates
+	 * are computed over much shorter time intervals than the long
+	 * intervals typically used for benchmarking. Why? First, to
+	 * adapt more quickly to variations. Second, because an I/O
+	 * scheduler cannot rely on a peak-rate-evaluation workload to
+	 * be run for a long time.
+	 */
+	T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
+	T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
+	T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
+	T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */
+
+	/*
+	 * Thresholds that determine the switch between speed classes
+	 * (see the comments before the definition of the array
+	 * device_speed_thresh). These thresholds are biased towards
+	 * transitions to the fast class. This is safer than the
+	 * opposite bias. In fact, a wrong transition to the slow
+	 * class results in short weight-raising periods, because the
+	 * speed of the device then tends to be higher than the
+	 * reference peak rate. On the opposite end, a wrong
+	 * transition to the fast class tends to increase
+	 * weight-raising periods, because of the opposite reason.
+	 */
+	device_speed_thresh[0] = (4 * R_slow[0]) / 3;
+	device_speed_thresh[1] = (4 * R_slow[1]) / 3;
+
 	ret = elv_register(&iosched_bfq_mq);
 	if (ret)
 		goto err_pol_unreg;
-- cgit v1.2.3
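
For readers following the hunks above, the FAST/SLOW device classification added by this patch can be illustrated with a minimal user-space sketch. This is not kernel code: the rate values and the names R_SLOW_DEMO, THRESH_DEMO, classify() are invented for illustration; only the comparison against a threshold equal to 4/3 of the slow reference rate mirrors update_thr_responsiveness_params() and the device_speed_thresh setup in bfq_init().

/*
 * Illustrative user-space model (not kernel code) of the FAST/SLOW
 * switching performed by update_thr_responsiveness_params() above.
 * R_SLOW_DEMO and the sample rates are made-up numbers.
 */
#include <stdio.h>

#define R_SLOW_DEMO	1000ULL			/* hypothetical slow reference rate */
#define THRESH_DEMO	((4 * R_SLOW_DEMO) / 3)	/* same bias as device_speed_thresh */

enum speed_class { SLOW, FAST };

static enum speed_class classify(enum speed_class cur, unsigned long long peak_rate)
{
	if (cur == FAST && peak_rate < THRESH_DEMO)
		return SLOW;	/* estimated rate fell below the biased threshold */
	if (cur == SLOW && peak_rate > THRESH_DEMO)
		return FAST;	/* estimated rate climbed back above it */
	return cur;		/* otherwise keep the current class */
}

int main(void)
{
	unsigned long long samples[] = { 9000, 1200, 900, 1100, 1400, 9500 };
	enum speed_class cls = FAST;	/* start optimistic, as bfq_init_queue() does */
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		cls = classify(cls, samples[i]);
		printf("peak_rate=%llu -> %s\n", samples[i],
		       cls == FAST ? "FAST" : "SLOW");
	}
	return 0;
}

The bias towards the fast class matters because, as the comment added to bfq_init() explains, a spurious switch to the slow class would shorten weight-raising periods, while the opposite mistake only lengthens them.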
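The weight-raising bookkeeping added in bfq_update_wr_data() likewise reduces to a timestamp comparison. The toy model below uses invented tick values and hypothetical names (demo_queue, wr_period_elapsed); it is only a sketch of the idea that a queue loses its boosted coefficient once last_wr_start_finish + wr_cur_max_time has elapsed. On a system running this code, the whole mechanism can also be switched off by writing 0 to the new low_latency attribute (under /sys/block/<dev>/queue/iosched/ when BFQ is the active scheduler), which the store function above handles by calling bfq_end_wr().

/*
 * Toy model (made-up tick values, not kernel code) of the check in
 * bfq_update_wr_data(): weight raising ends once the current time
 * passes last_wr_start_finish + wr_cur_max_time.
 */
#include <stdbool.h>
#include <stdio.h>

struct demo_queue {
	unsigned int wr_coeff;			/* 30 while raised, 1 otherwise */
	unsigned long last_wr_start_finish;	/* when raising started */
	unsigned long wr_cur_max_time;		/* how long it may last */
};

/* crude stand-in for time_is_before_jiffies(last_wr_start_finish + wr_cur_max_time) */
static bool wr_period_elapsed(const struct demo_queue *q, unsigned long now)
{
	return now > q->last_wr_start_finish + q->wr_cur_max_time;
}

int main(void)
{
	struct demo_queue q = {
		.wr_coeff = 30,
		.last_wr_start_finish = 100,
		.wr_cur_max_time = 50,
	};
	unsigned long now;

	for (now = 100; now <= 200; now += 25) {
		if (q.wr_coeff > 1 && wr_period_elapsed(&q, now)) {
			q.wr_coeff = 1;			/* end weight raising */
			q.last_wr_start_finish = now;	/* remember when it ended */
		}
		printf("now=%lu wr_coeff=%u\n", now, q.wr_coeff);
	}
	return 0;
}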