Diffstat (limited to 'drivers/block')
38 files changed, 1974 insertions, 1697 deletions
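Several of the DRBD hunks below drop the driver's open-coded per-cpu disk statistics in favour of the block layer's generic_start_io_acct()/generic_end_io_acct() helpers. A minimal sketch of that pattern follows; the my_dev structure and the two wrapper functions are hypothetical stand-ins for a driver's own types, while generic_start_io_acct(), generic_end_io_acct(), bio_data_dir() and bio_sectors() are the kernel APIs the patch switches to.

	#include <linux/bio.h>
	#include <linux/genhd.h>

	/* Hypothetical driver state; only the gendisk pointer matters here. */
	struct my_dev {
		struct gendisk *disk;
	};

	/* Account the start of a bio against part0, as drbd_req.c now does
	 * via generic_start_io_acct() instead of open-coded part_stat calls. */
	static void my_start_acct(struct my_dev *dev, struct bio *bio)
	{
		generic_start_io_acct(bio_data_dir(bio), bio_sectors(bio),
				      &dev->disk->part0);
	}

	/* Account completion; start_jiffies is the jiffies value captured at
	 * submission time (DRBD keeps it in req->start_jif). */
	static void my_end_acct(struct my_dev *dev, struct bio *bio,
				unsigned long start_jiffies)
	{
		generic_end_io_acct(bio_data_dir(bio), &dev->disk->part0,
				    start_jiffies);
	}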
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 758da2287d9a..5fd50a284168 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1864,7 +1864,6 @@ static int __exit amiga_floppy_remove(struct platform_device *pdev) static struct platform_driver amiga_floppy_driver = { .driver = { .name = "amiga-floppy", - .owner = THIS_MODULE, }, }; diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index dd73e1ff1759..46c282fff104 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -395,7 +395,7 @@ aoeblk_gdalloc(void *vp) WARN_ON(d->flags & DEVFL_TKILL); WARN_ON(d->gd); WARN_ON(d->flags & DEVFL_UP); - blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS); + blk_queue_max_hw_sectors(q, 1024); q->backing_dev_info.name = "aoe"; q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE; d->bufpool = mp; diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index d26a3fa63688..1318e3217cb0 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -158,14 +158,14 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, if (bio_add_page(bio, device->md_io.page, size, 0) != size) goto out; bio->bi_private = device; - bio->bi_end_io = drbd_md_io_complete; + bio->bi_end_io = drbd_md_endio; bio->bi_rw = rw; if (!(rw & WRITE) && device->state.disk == D_DISKLESS && device->ldev == NULL) /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */ ; else if (!get_ldev_if_state(device, D_ATTACHING)) { - /* Corresponding put_ldev in drbd_md_io_complete() */ + /* Corresponding put_ldev in drbd_md_endio() */ drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); err = -ENODEV; goto out; @@ -827,8 +827,7 @@ static int update_sync_bits(struct drbd_device *device, * */ int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, - enum update_sync_bits_mode mode, - const char *file, const unsigned int line) + enum update_sync_bits_mode mode) { /* Is called from worker and receiver context _only_ */ unsigned long sbnr, ebnr, lbnr; diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 426c97aef900..434c77dcc99e 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -941,7 +941,7 @@ static void drbd_bm_aio_ctx_destroy(struct kref *kref) } /* bv_page may be a copy, or may be the original */ -static void bm_async_io_complete(struct bio *bio, int error) +static void drbd_bm_endio(struct bio *bio, int error) { struct drbd_bm_aio_ctx *ctx = bio->bi_private; struct drbd_device *device = ctx->device; @@ -1027,7 +1027,7 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho * according to api. Do we want to assert that? */ bio_add_page(bio, page, len, 0); bio->bi_private = ctx; - bio->bi_end_io = bm_async_io_complete; + bio->bi_end_io = drbd_bm_endio; if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) { bio->bi_rw |= rw; @@ -1125,7 +1125,7 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned } /* - * We initialize ctx->in_flight to one to make sure bm_async_io_complete + * We initialize ctx->in_flight to one to make sure drbd_bm_endio * will not set ctx->done early, and decrement / test it here. If there * are still some bios in flight, we need to wait for them here. 
* If all IO is done already (or nothing had been submitted), there is diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c index 5c20b18540b8..9a950022ff88 100644 --- a/drivers/block/drbd/drbd_debugfs.c +++ b/drivers/block/drbd/drbd_debugfs.c @@ -419,7 +419,7 @@ static int in_flight_summary_show(struct seq_file *m, void *pos) return 0; } -/* simple_positive(file->f_dentry) respectively debugfs_positive(), +/* simple_positive(file->f_path.dentry) respectively debugfs_positive(), * but neither is "reachable" from here. * So we have our own inline version of it above. :-( */ static inline int debugfs_positive(struct dentry *dentry) @@ -437,14 +437,14 @@ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, vo /* Are we still linked, * or has debugfs_remove() already been called? */ - parent = file->f_dentry->d_parent; + parent = file->f_path.dentry->d_parent; /* not sure if this can happen: */ if (!parent || !parent->d_inode) goto out; /* serialize with d_delete() */ mutex_lock(&parent->d_inode->i_mutex); /* Make sure the object is still alive */ - if (debugfs_positive(file->f_dentry) + if (debugfs_positive(file->f_path.dentry) && kref_get_unless_zero(kref)) ret = 0; mutex_unlock(&parent->d_inode->i_mutex); @@ -695,7 +695,7 @@ static void resync_dump_detail(struct seq_file *m, struct lc_element *e) { struct bm_extent *bme = lc_entry(e, struct bm_extent, lce); - seq_printf(m, "%5d %s %s %s\n", bme->rs_left, + seq_printf(m, "%5d %s %s %s", bme->rs_left, test_bit(BME_NO_WRITES, &bme->flags) ? "NO_WRITES" : "---------", test_bit(BME_LOCKED, &bme->flags) ? "LOCKED" : "------", test_bit(BME_PRIORITY, &bme->flags) ? "PRIORITY" : "--------" diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 1a000016ccdf..b905e9888b88 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -61,8 +61,6 @@ # define __must_hold(x) #endif -#define __no_warn(lock, stmt) do { __acquire(lock); stmt; __release(lock); } while (0) - /* module parameter, defined in drbd_main.c */ extern unsigned int minor_count; extern bool disable_sendpage; @@ -1456,7 +1454,6 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t); /* drbd_nl.c */ -extern int drbd_msg_put_info(struct sk_buff *skb, const char *info); extern void drbd_suspend_io(struct drbd_device *device); extern void drbd_resume_io(struct drbd_device *device); extern char *ppsize(char *buf, unsigned long long size); @@ -1483,7 +1480,7 @@ extern int drbd_khelper(struct drbd_device *device, char *cmd); /* drbd_worker.c */ /* bi_end_io handlers */ -extern void drbd_md_io_complete(struct bio *bio, int error); +extern void drbd_md_endio(struct bio *bio, int error); extern void drbd_peer_request_endio(struct bio *bio, int error); extern void drbd_request_endio(struct bio *bio, int error); extern int drbd_worker(struct drbd_thread *thi); @@ -1560,52 +1557,31 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); -/* Yes, there is kernel_setsockopt, but only since 2.6.18. - * So we have our own copy of it here. 
*/ -static inline int drbd_setsockopt(struct socket *sock, int level, int optname, - char *optval, int optlen) -{ - mm_segment_t oldfs = get_fs(); - char __user *uoptval; - int err; - - uoptval = (char __user __force *)optval; - - set_fs(KERNEL_DS); - if (level == SOL_SOCKET) - err = sock_setsockopt(sock, level, optname, uoptval, optlen); - else - err = sock->ops->setsockopt(sock, level, optname, uoptval, - optlen); - set_fs(oldfs); - return err; -} - static inline void drbd_tcp_cork(struct socket *sock) { int val = 1; - (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, + (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, (char*)&val, sizeof(val)); } static inline void drbd_tcp_uncork(struct socket *sock) { int val = 0; - (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, + (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, (char*)&val, sizeof(val)); } static inline void drbd_tcp_nodelay(struct socket *sock) { int val = 1; - (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (void) kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char*)&val, sizeof(val)); } static inline void drbd_tcp_quickack(struct socket *sock) { int val = 2; - (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK, + (void) kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, (char*)&val, sizeof(val)); } @@ -1664,14 +1640,13 @@ extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long stil enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC }; extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, - enum update_sync_bits_mode mode, - const char *file, const unsigned int line); + enum update_sync_bits_mode mode); #define drbd_set_in_sync(device, sector, size) \ - __drbd_change_sync(device, sector, size, SET_IN_SYNC, __FILE__, __LINE__) + __drbd_change_sync(device, sector, size, SET_IN_SYNC) #define drbd_set_out_of_sync(device, sector, size) \ - __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC, __FILE__, __LINE__) + __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC) #define drbd_rs_failed_io(device, sector, size) \ - __drbd_change_sync(device, sector, size, RECORD_RS_FAILED, __FILE__, __LINE__) + __drbd_change_sync(device, sector, size, RECORD_RS_FAILED) extern void drbd_al_shrink(struct drbd_device *device); extern int drbd_initialize_al(struct drbd_device *, void *); @@ -2100,16 +2075,19 @@ static inline bool is_sync_state(enum drbd_conns connection_state) /** * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev - * @M: DRBD device. + * @_device: DRBD device. + * @_min_state: Minimum device state required for success. * * You have to call put_ldev() when finished working with device->ldev. */ -#define get_ldev(M) __cond_lock(local, _get_ldev_if_state(M,D_INCONSISTENT)) -#define get_ldev_if_state(M,MINS) __cond_lock(local, _get_ldev_if_state(M,MINS)) +#define get_ldev_if_state(_device, _min_state) \ + (_get_ldev_if_state((_device), (_min_state)) ? 
\ + ({ __acquire(x); true; }) : false) +#define get_ldev(_device) get_ldev_if_state(_device, D_INCONSISTENT) static inline void put_ldev(struct drbd_device *device) { - enum drbd_disk_state ds = device->state.disk; + enum drbd_disk_state disk_state = device->state.disk; /* We must check the state *before* the atomic_dec becomes visible, * or we have a theoretical race where someone hitting zero, * while state still D_FAILED, will then see D_DISKLESS in the @@ -2122,10 +2100,10 @@ static inline void put_ldev(struct drbd_device *device) __release(local); D_ASSERT(device, i >= 0); if (i == 0) { - if (ds == D_DISKLESS) + if (disk_state == D_DISKLESS) /* even internal references gone, safe to destroy */ drbd_device_post_work(device, DESTROY_DISK); - if (ds == D_FAILED) + if (disk_state == D_FAILED) /* all application IO references gone. */ if (!test_and_set_bit(GOING_DISKLESS, &device->flags)) drbd_device_post_work(device, GO_DISKLESS); diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c index 89c497c630b4..51b25ad85251 100644 --- a/drivers/block/drbd/drbd_interval.c +++ b/drivers/block/drbd/drbd_interval.c @@ -37,40 +37,8 @@ compute_subtree_last(struct drbd_interval *node) return max; } -static void augment_propagate(struct rb_node *rb, struct rb_node *stop) -{ - while (rb != stop) { - struct drbd_interval *node = rb_entry(rb, struct drbd_interval, rb); - sector_t subtree_last = compute_subtree_last(node); - if (node->end == subtree_last) - break; - node->end = subtree_last; - rb = rb_parent(&node->rb); - } -} - -static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new) -{ - struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb); - struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb); - - new->end = old->end; -} - -static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) -{ - struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb); - struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb); - - new->end = old->end; - old->end = compute_subtree_last(old); -} - -static const struct rb_augment_callbacks augment_callbacks = { - augment_propagate, - augment_copy, - augment_rotate, -}; +RB_DECLARE_CALLBACKS(static, augment_callbacks, struct drbd_interval, rb, + sector_t, end, compute_subtree_last); /** * drbd_insert_interval - insert a new interval into a tree @@ -79,6 +47,7 @@ bool drbd_insert_interval(struct rb_root *root, struct drbd_interval *this) { struct rb_node **new = &root->rb_node, *parent = NULL; + sector_t this_end = this->sector + (this->size >> 9); BUG_ON(!IS_ALIGNED(this->size, 512)); @@ -87,6 +56,8 @@ drbd_insert_interval(struct rb_root *root, struct drbd_interval *this) rb_entry(*new, struct drbd_interval, rb); parent = *new; + if (here->end < this_end) + here->end = this_end; if (this->sector < here->sector) new = &(*new)->rb_left; else if (this->sector > here->sector) @@ -99,6 +70,7 @@ drbd_insert_interval(struct rb_root *root, struct drbd_interval *this) return false; } + this->end = this_end; rb_link_node(&this->rb, parent, new); rb_insert_augmented(&this->rb, root, &augment_callbacks); return true; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 9b465bb68487..1fc83427199c 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1622,13 +1622,13 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * struct drbd_socket *sock; struct p_data *p; unsigned int 
dp_flags = 0; - int dgs; + int digest_size; int err; sock = &peer_device->connection->data; p = drbd_prepare_command(peer_device, sock); - dgs = peer_device->connection->integrity_tfm ? - crypto_hash_digestsize(peer_device->connection->integrity_tfm) : 0; + digest_size = peer_device->connection->integrity_tfm ? + crypto_hash_digestsize(peer_device->connection->integrity_tfm) : 0; if (!p) return -EIO; @@ -1659,9 +1659,9 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * /* our digest is still only over the payload. * TRIM does not carry any payload. */ - if (dgs) + if (digest_size) drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1); - err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size); + err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + digest_size, NULL, req->i.size); if (!err) { /* For protocol A, we have to memcpy the payload into * socket buffers, as we may complete right away @@ -1674,23 +1674,23 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * * out ok after sending on this side, but does not fit on the * receiving side, we sure have detected corruption elsewhere. */ - if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || dgs) + if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || digest_size) err = _drbd_send_bio(peer_device, req->master_bio); else err = _drbd_send_zc_bio(peer_device, req->master_bio); /* double check digest, sometimes buffers have been modified in flight. */ - if (dgs > 0 && dgs <= 64) { + if (digest_size > 0 && digest_size <= 64) { /* 64 byte, 512 bit, is the largest digest size * currently supported in kernel crypto. */ unsigned char digest[64]; drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest); - if (memcmp(p + 1, digest, dgs)) { + if (memcmp(p + 1, digest, digest_size)) { drbd_warn(device, "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n", (unsigned long long)req->i.sector, req->i.size); } - } /* else if (dgs > 64) { + } /* else if (digest_size > 64) { ... Be noisy about digest too large ... } */ } @@ -1711,13 +1711,13 @@ int drbd_send_block(struct drbd_peer_device *peer_device, enum drbd_packet cmd, struct drbd_socket *sock; struct p_data *p; int err; - int dgs; + int digest_size; sock = &peer_device->connection->data; p = drbd_prepare_command(peer_device, sock); - dgs = peer_device->connection->integrity_tfm ? - crypto_hash_digestsize(peer_device->connection->integrity_tfm) : 0; + digest_size = peer_device->connection->integrity_tfm ? 
+ crypto_hash_digestsize(peer_device->connection->integrity_tfm) : 0; if (!p) return -EIO; @@ -1725,9 +1725,9 @@ int drbd_send_block(struct drbd_peer_device *peer_device, enum drbd_packet cmd, p->block_id = peer_req->block_id; p->seq_num = 0; /* unused */ p->dp_flags = 0; - if (dgs) + if (digest_size) drbd_csum_ee(peer_device->connection->integrity_tfm, peer_req, p + 1); - err = __send_command(peer_device->connection, device->vnr, sock, cmd, sizeof(*p) + dgs, NULL, peer_req->i.size); + err = __send_command(peer_device->connection, device->vnr, sock, cmd, sizeof(*p) + digest_size, NULL, peer_req->i.size); if (!err) err = _drbd_send_zc_ee(peer_device, peer_req); mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ @@ -2532,10 +2532,6 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) return -ENOMEM; - /* - retcode = ERR_NOMEM; - drbd_msg_put_info("unable to allocate cpumask"); - */ /* silently ignore cpu mask on UP kernel */ if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) { @@ -2731,7 +2727,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig device = minor_to_device(minor); if (device) - return ERR_MINOR_EXISTS; + return ERR_MINOR_OR_VOLUME_EXISTS; /* GFP_KERNEL, we are outside of all write-out paths */ device = kzalloc(sizeof(struct drbd_device), GFP_KERNEL); @@ -2793,20 +2789,16 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig id = idr_alloc(&drbd_devices, device, minor, minor + 1, GFP_KERNEL); if (id < 0) { - if (id == -ENOSPC) { - err = ERR_MINOR_EXISTS; - drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already"); - } + if (id == -ENOSPC) + err = ERR_MINOR_OR_VOLUME_EXISTS; goto out_no_minor_idr; } kref_get(&device->kref); id = idr_alloc(&resource->devices, device, vnr, vnr + 1, GFP_KERNEL); if (id < 0) { - if (id == -ENOSPC) { - err = ERR_MINOR_EXISTS; - drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already"); - } + if (id == -ENOSPC) + err = ERR_MINOR_OR_VOLUME_EXISTS; goto out_idr_remove_minor; } kref_get(&device->kref); @@ -2825,10 +2817,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig id = idr_alloc(&connection->peer_devices, peer_device, vnr, vnr + 1, GFP_KERNEL); if (id < 0) { - if (id == -ENOSPC) { + if (id == -ENOSPC) err = ERR_INVALID_REQUEST; - drbd_msg_put_info(adm_ctx->reply_skb, "requested volume exists already"); - } goto out_idr_remove_from_resource; } kref_get(&connection->kref); @@ -2836,7 +2826,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig if (init_submitter(device)) { err = ERR_NOMEM; - drbd_msg_put_info(adm_ctx->reply_skb, "unable to create submit workqueue"); goto out_idr_remove_vol; } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 1cd47df44bda..74df8cfad414 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -92,7 +92,7 @@ static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only * reason it could fail was no space in skb, and there are 4k available. 
*/ -int drbd_msg_put_info(struct sk_buff *skb, const char *info) +static int drbd_msg_put_info(struct sk_buff *skb, const char *info) { struct nlattr *nla; int err = -EMSGSIZE; @@ -588,7 +588,7 @@ drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int for val.i = 0; val.role = new_role; while (try++ < max_tries) { - rv = _drbd_request_state(device, mask, val, CS_WAIT_COMPLETE); + rv = _drbd_request_state_holding_state_mutex(device, mask, val, CS_WAIT_COMPLETE); /* in case we first succeeded to outdate, * but now suddenly could establish a connection */ @@ -2052,7 +2052,7 @@ check_net_options(struct drbd_connection *connection, struct net_conf *new_net_c rv = _check_net_options(connection, rcu_dereference(connection->net_conf), new_net_conf); rcu_read_unlock(); - /* connection->volumes protected by genl_lock() here */ + /* connection->peer_devices protected by genl_lock() here */ idr_for_each_entry(&connection->peer_devices, peer_device, i) { struct drbd_device *device = peer_device->device; if (!device->bitmap) { @@ -3483,7 +3483,7 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) * that first_peer_device(device)->connection and device->vnr match the request. */ if (adm_ctx.device) { if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) - retcode = ERR_MINOR_EXISTS; + retcode = ERR_MINOR_OR_VOLUME_EXISTS; /* else: still NO_ERROR */ goto out; } @@ -3530,6 +3530,27 @@ out: return 0; } +static int adm_del_resource(struct drbd_resource *resource) +{ + struct drbd_connection *connection; + + for_each_connection(connection, resource) { + if (connection->cstate > C_STANDALONE) + return ERR_NET_CONFIGURED; + } + if (!idr_is_empty(&resource->devices)) + return ERR_RES_IN_USE; + + list_del_rcu(&resource->resources); + /* Make sure all threads have actually stopped: state handling only + * does drbd_thread_stop_nowait(). */ + list_for_each_entry(connection, &resource->connections, connections) + drbd_thread_stop(&connection->worker); + synchronize_rcu(); + drbd_free_resource(resource); + return NO_ERROR; +} + int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; @@ -3575,14 +3596,6 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) } } - /* If we reach this, all volumes (of this connection) are Secondary, - * Disconnected, Diskless, aka Unconfigured. Make sure all threads have - * actually stopped, state handling only does drbd_thread_stop_nowait(). 
*/ - for_each_connection(connection, resource) - drbd_thread_stop(&connection->worker); - - /* Now, nothing can fail anymore */ - /* delete volumes */ idr_for_each_entry(&resource->devices, device, i) { retcode = adm_del_minor(device); @@ -3593,10 +3606,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) } } - list_del_rcu(&resource->resources); - synchronize_rcu(); - drbd_free_resource(resource); - retcode = NO_ERROR; + retcode = adm_del_resource(resource); out: mutex_unlock(&resource->adm_mutex); finish: @@ -3608,7 +3618,6 @@ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; struct drbd_resource *resource; - struct drbd_connection *connection; enum drbd_ret_code retcode; retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); @@ -3616,27 +3625,10 @@ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) return retcode; if (retcode != NO_ERROR) goto finish; - resource = adm_ctx.resource; - mutex_lock(&resource->adm_mutex); - for_each_connection(connection, resource) { - if (connection->cstate > C_STANDALONE) { - retcode = ERR_NET_CONFIGURED; - goto out; - } - } - if (!idr_is_empty(&resource->devices)) { - retcode = ERR_RES_IN_USE; - goto out; - } - list_del_rcu(&resource->resources); - for_each_connection(connection, resource) - drbd_thread_stop(&connection->worker); - synchronize_rcu(); - drbd_free_resource(resource); - retcode = NO_ERROR; -out: + mutex_lock(&resource->adm_mutex); + retcode = adm_del_resource(resource); mutex_unlock(&resource->adm_mutex); finish: drbd_adm_finish(&adm_ctx, info, retcode); diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 06e6147c7601..3b10fa6cb039 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -142,10 +142,12 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se (unsigned long) Bit2KB(rs_left >> 10), (unsigned long) Bit2KB(rs_total >> 10)); else - seq_printf(seq, "(%lu/%lu)K\n\t", + seq_printf(seq, "(%lu/%lu)K", (unsigned long) Bit2KB(rs_left), (unsigned long) Bit2KB(rs_total)); + seq_printf(seq, "\n\t"); + /* see drivers/md/md.c * We do not want to overflow, so the order of operands and * the * 100 / 100 trick are important. 
We do a +1 to be diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 9342b8da73ab..d169b4a79267 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1371,9 +1371,9 @@ int drbd_submit_peer_request(struct drbd_device *device, struct bio *bio; struct page *page = peer_req->pages; sector_t sector = peer_req->i.sector; - unsigned ds = peer_req->i.size; + unsigned data_size = peer_req->i.size; unsigned n_bios = 0; - unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; + unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; int err = -ENOMEM; if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) { @@ -1388,7 +1388,7 @@ int drbd_submit_peer_request(struct drbd_device *device, list_add_tail(&peer_req->w.list, &device->active_ee); spin_unlock_irq(&device->resource->req_lock); if (blkdev_issue_zeroout(device->ldev->backing_bdev, - sector, ds >> 9, GFP_NOIO)) + sector, data_size >> 9, GFP_NOIO)) peer_req->flags |= EE_WAS_ERROR; drbd_endio_write_sec_final(peer_req); return 0; @@ -1426,12 +1426,12 @@ next_bio: ++n_bios; if (rw & REQ_DISCARD) { - bio->bi_iter.bi_size = ds; + bio->bi_iter.bi_size = data_size; goto submit; } page_chain_for_each(page) { - unsigned len = min_t(unsigned, ds, PAGE_SIZE); + unsigned len = min_t(unsigned, data_size, PAGE_SIZE); if (!bio_add_page(bio, page, len, 0)) { /* A single page must always be possible! * But in case it fails anyways, @@ -1446,11 +1446,11 @@ next_bio: } goto next_bio; } - ds -= len; + data_size -= len; sector += len >> 9; --nr_pages; } - D_ASSERT(device, ds == 0); + D_ASSERT(device, data_size == 0); submit: D_ASSERT(device, page == NULL); @@ -1591,24 +1591,24 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, const sector_t capacity = drbd_get_capacity(device->this_bdev); struct drbd_peer_request *peer_req; struct page *page; - int dgs, ds, err; - unsigned int data_size = pi->size; + int digest_size, err; + unsigned int data_size = pi->size, ds; void *dig_in = peer_device->connection->int_dig_in; void *dig_vv = peer_device->connection->int_dig_vv; unsigned long *data; struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL; - dgs = 0; + digest_size = 0; if (!trim && peer_device->connection->peer_integrity_tfm) { - dgs = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm); + digest_size = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm); /* * FIXME: Receive the incoming digest into the receive buffer * here, together with its struct p_data? 
*/ - err = drbd_recv_all_warn(peer_device->connection, dig_in, dgs); + err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size); if (err) return NULL; - data_size -= dgs; + data_size -= digest_size; } if (trim) { @@ -1661,16 +1661,16 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, ds -= len; } - if (dgs) { + if (digest_size) { drbd_csum_ee(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv); - if (memcmp(dig_in, dig_vv, dgs)) { + if (memcmp(dig_in, dig_vv, digest_size)) { drbd_err(device, "Digest integrity check FAILED: %llus +%u\n", (unsigned long long)sector, data_size); drbd_free_peer_req(device, peer_req); return NULL; } } - device->recv_cnt += data_size>>9; + device->recv_cnt += data_size >> 9; return peer_req; } @@ -1708,17 +1708,17 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req struct bio_vec bvec; struct bvec_iter iter; struct bio *bio; - int dgs, err, expect; + int digest_size, err, expect; void *dig_in = peer_device->connection->int_dig_in; void *dig_vv = peer_device->connection->int_dig_vv; - dgs = 0; + digest_size = 0; if (peer_device->connection->peer_integrity_tfm) { - dgs = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm); - err = drbd_recv_all_warn(peer_device->connection, dig_in, dgs); + digest_size = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm); + err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size); if (err) return err; - data_size -= dgs; + data_size -= digest_size; } /* optimistically update recv_cnt. if receiving fails below, @@ -1738,9 +1738,9 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req data_size -= expect; } - if (dgs) { + if (digest_size) { drbd_csum_bio(peer_device->connection->peer_integrity_tfm, bio, dig_vv); - if (memcmp(dig_in, dig_vv, dgs)) { + if (memcmp(dig_in, dig_vv, digest_size)) { drbd_err(peer_device, "Digest integrity check FAILED. 
Broken NICs?\n"); return -EINVAL; } @@ -2482,7 +2482,7 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) atomic_read(&device->rs_sect_ev); if (atomic_read(&device->ap_actlog_cnt) - || !device->rs_last_events || curr_events - device->rs_last_events > 64) { + || curr_events - device->rs_last_events > 64) { unsigned long rs_left; int i; @@ -5561,6 +5561,7 @@ int drbd_asender(struct drbd_thread *thi) * rv < expected: "woken" by signal during receive * rv == 0 : "connection shut down by peer" */ +received_more: if (likely(rv > 0)) { received += rv; buf += rv; @@ -5636,6 +5637,11 @@ int drbd_asender(struct drbd_thread *thi) expect = header_size; cmd = NULL; } + if (test_bit(SEND_PING, &connection->flags)) + continue; + rv = drbd_recv_short(connection->meta.socket, buf, expect-received, MSG_DONTWAIT); + if (rv > 0) + goto received_more; } if (0) { diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index c67717d572d1..34f2f0ba409b 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -36,29 +36,15 @@ static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, /* Update disk stats at start of I/O request */ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request *req) { - const int rw = bio_data_dir(req->master_bio); - int cpu; - cpu = part_stat_lock(); - part_round_stats(cpu, &device->vdisk->part0); - part_stat_inc(cpu, &device->vdisk->part0, ios[rw]); - part_stat_add(cpu, &device->vdisk->part0, sectors[rw], req->i.size >> 9); - (void) cpu; /* The macro invocations above want the cpu argument, I do not like - the compiler warning about cpu only assigned but never used... */ - part_inc_in_flight(&device->vdisk->part0, rw); - part_stat_unlock(); + generic_start_io_acct(bio_data_dir(req->master_bio), req->i.size >> 9, + &device->vdisk->part0); } /* Update disk stats when completing request upwards */ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req) { - int rw = bio_data_dir(req->master_bio); - unsigned long duration = jiffies - req->start_jif; - int cpu; - cpu = part_stat_lock(); - part_stat_add(cpu, &device->vdisk->part0, ticks[rw], duration); - part_round_stats(cpu, &device->vdisk->part0); - part_dec_in_flight(&device->vdisk->part0, rw); - part_stat_unlock(); + generic_end_io_acct(bio_data_dir(req->master_bio), + &device->vdisk->part0, req->start_jif); } static struct drbd_request *drbd_req_new(struct drbd_device *device, @@ -1545,6 +1531,7 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue; if (b->merge_bvec_fn) { + bvm->bi_bdev = device->ldev->backing_bdev; backing_limit = b->merge_bvec_fn(b, bvm, bvec); limit = min(limit, backing_limit); } @@ -1628,7 +1615,7 @@ void request_timer_fn(unsigned long data) time_after(now, req_peer->pre_send_jif + ent) && !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) { drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n"); - _drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); + _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD); } if (dt && oldest_submit_jif != now && time_after(now, oldest_submit_jif + dt) && @@ -1645,6 +1632,6 @@ void request_timer_fn(unsigned long data) ? oldest_submit_jif + dt : now + et; nt = time_before(ent, dt) ? 
ent : dt; out: - spin_unlock_irq(&connection->resource->req_lock); + spin_unlock_irq(&device->resource->req_lock); mod_timer(&device->request_timer, nt); } diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index c35c0f001bb7..2d7dd269b6a8 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -136,50 +136,50 @@ enum drbd_role conn_highest_peer(struct drbd_connection *connection) enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection) { - enum drbd_disk_state ds = D_DISKLESS; + enum drbd_disk_state disk_state = D_DISKLESS; struct drbd_peer_device *peer_device; int vnr; rcu_read_lock(); idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { struct drbd_device *device = peer_device->device; - ds = max_t(enum drbd_disk_state, ds, device->state.disk); + disk_state = max_t(enum drbd_disk_state, disk_state, device->state.disk); } rcu_read_unlock(); - return ds; + return disk_state; } enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection) { - enum drbd_disk_state ds = D_MASK; + enum drbd_disk_state disk_state = D_MASK; struct drbd_peer_device *peer_device; int vnr; rcu_read_lock(); idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { struct drbd_device *device = peer_device->device; - ds = min_t(enum drbd_disk_state, ds, device->state.disk); + disk_state = min_t(enum drbd_disk_state, disk_state, device->state.disk); } rcu_read_unlock(); - return ds; + return disk_state; } enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection) { - enum drbd_disk_state ds = D_DISKLESS; + enum drbd_disk_state disk_state = D_DISKLESS; struct drbd_peer_device *peer_device; int vnr; rcu_read_lock(); idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { struct drbd_device *device = peer_device->device; - ds = max_t(enum drbd_disk_state, ds, device->state.pdsk); + disk_state = max_t(enum drbd_disk_state, disk_state, device->state.pdsk); } rcu_read_unlock(); - return ds; + return disk_state; } enum drbd_conns conn_lowest_conn(struct drbd_connection *connection) @@ -215,6 +215,18 @@ static bool no_peer_wf_report_params(struct drbd_connection *connection) return rv; } +static void wake_up_all_devices(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + wake_up(&peer_device->device->state_wait); + rcu_read_unlock(); + +} + /** * cl_wide_st_chg() - true if the state change is a cluster wide one @@ -410,6 +422,22 @@ _drbd_request_state(struct drbd_device *device, union drbd_state mask, return rv; } +enum drbd_state_rv +_drbd_request_state_holding_state_mutex(struct drbd_device *device, union drbd_state mask, + union drbd_state val, enum chg_state_flags f) +{ + enum drbd_state_rv rv; + + BUG_ON(f & CS_SERIALIZE); + + wait_event_cmd(device->state_wait, + (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE, + mutex_unlock(device->state_mutex), + mutex_lock(device->state_mutex)); + + return rv; +} + static void print_st(struct drbd_device *device, const char *name, union drbd_state ns) { drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n", @@ -629,14 +657,11 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_c if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) rv = SS_IN_TRANSIENT_STATE; - /* if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) - rv = SS_IN_TRANSIENT_STATE; */ - /* 
While establishing a connection only allow cstate to change. - Delay/refuse role changes, detach attach etc... */ + Delay/refuse role changes, detach attach etc... (they do not touch cstate) */ if (test_bit(STATE_SENT, &connection->flags) && - !(os.conn == C_WF_REPORT_PARAMS || - (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION))) + !((ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION) || + (ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS))) rv = SS_IN_TRANSIENT_STATE; if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) @@ -1032,8 +1057,10 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, /* Wake up role changes, that were delayed because of connection establishing */ if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS && - no_peer_wf_report_params(connection)) + no_peer_wf_report_params(connection)) { clear_bit(STATE_SENT, &connection->flags); + wake_up_all_devices(connection); + } wake_up(&device->misc_wait); wake_up(&device->state_wait); @@ -1072,7 +1099,6 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, set_ov_position(device, ns.conn); device->rs_start = now; - device->rs_last_events = 0; device->rs_last_sect_ev = 0; device->ov_last_oos_size = 0; device->ov_last_oos_start = 0; diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h index cc41605ba21c..7f53c40823cd 100644 --- a/drivers/block/drbd/drbd_state.h +++ b/drivers/block/drbd/drbd_state.h @@ -117,6 +117,11 @@ extern enum drbd_state_rv _drbd_request_state(struct drbd_device *, union drbd_state, union drbd_state, enum chg_state_flags); + +extern enum drbd_state_rv +_drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state, + union drbd_state, enum chg_state_flags); + extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state, enum chg_state_flags, struct completion *done); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 50776b362828..d0fae55d871d 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -43,10 +43,10 @@ static int make_ov_request(struct drbd_device *, int); static int make_resync_request(struct drbd_device *, int); /* endio handlers: - * drbd_md_io_complete (defined here) + * drbd_md_endio (defined here) * drbd_request_endio (defined here) * drbd_peer_request_endio (defined here) - * bm_async_io_complete (defined in drbd_bitmap.c) + * drbd_bm_endio (defined in drbd_bitmap.c) * * For all these callbacks, note the following: * The callbacks will be called in irq context by the IDE drivers, @@ -65,7 +65,7 @@ rwlock_t global_state_lock; /* used for synchronous meta data and bitmap IO * submitted by drbd_md_sync_page_io() */ -void drbd_md_io_complete(struct bio *bio, int error) +void drbd_md_endio(struct bio *bio, int error) { struct drbd_device *device; @@ -1592,11 +1592,15 @@ void drbd_resync_after_changed(struct drbd_device *device) void drbd_rs_controller_reset(struct drbd_device *device) { + struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk; struct fifo_buffer *plan; atomic_set(&device->rs_sect_in, 0); atomic_set(&device->rs_sect_ev, 0); device->rs_in_flight = 0; + device->rs_last_events = + (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]); /* Updating the RCU protected object in place is necessary since this function gets called from atomic context. 
@@ -1743,7 +1747,6 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) device->rs_failed = 0; device->rs_paused = 0; device->rs_same_csum = 0; - device->rs_last_events = 0; device->rs_last_sect_ev = 0; device->rs_total = tw; device->rs_start = now; @@ -1853,9 +1856,12 @@ static void drbd_ldev_destroy(struct drbd_device *device) device->resync = NULL; lc_destroy(device->act_log); device->act_log = NULL; - __no_warn(local, - drbd_free_ldev(device->ldev); - device->ldev = NULL;); + + __acquire(local); + drbd_free_ldev(device->ldev); + device->ldev = NULL; + __release(local); + clear_bit(GOING_DISKLESS, &device->flags); wake_up(&device->misc_wait); } @@ -1928,19 +1934,18 @@ void __update_timing_details( ++(*cb_nr); } -#define WORK_PENDING(work_bit, todo) (todo & (1UL << work_bit)) static void do_device_work(struct drbd_device *device, const unsigned long todo) { - if (WORK_PENDING(MD_SYNC, todo)) + if (test_bit(MD_SYNC, &todo)) do_md_sync(device); - if (WORK_PENDING(RS_DONE, todo) || - WORK_PENDING(RS_PROGRESS, todo)) - update_on_disk_bitmap(device, WORK_PENDING(RS_DONE, todo)); - if (WORK_PENDING(GO_DISKLESS, todo)) + if (test_bit(RS_DONE, &todo) || + test_bit(RS_PROGRESS, &todo)) + update_on_disk_bitmap(device, test_bit(RS_DONE, &todo)); + if (test_bit(GO_DISKLESS, &todo)) go_diskless(device); - if (WORK_PENDING(DESTROY_DISK, todo)) + if (test_bit(DESTROY_DISK, &todo)) drbd_ldev_destroy(device); - if (WORK_PENDING(RS_START, todo)) + if (test_bit(RS_START, &todo)) do_start_resync(device); } @@ -1992,22 +1997,13 @@ static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head * return !list_empty(work_list); } -static bool dequeue_work_item(struct drbd_work_queue *queue, struct list_head *work_list) -{ - spin_lock_irq(&queue->q_lock); - if (!list_empty(&queue->q)) - list_move(queue->q.next, work_list); - spin_unlock_irq(&queue->q_lock); - return !list_empty(work_list); -} - static void wait_for_work(struct drbd_connection *connection, struct list_head *work_list) { DEFINE_WAIT(wait); struct net_conf *nc; int uncork, cork; - dequeue_work_item(&connection->sender_work, work_list); + dequeue_work_batch(&connection->sender_work, work_list); if (!list_empty(work_list)) return; @@ -2033,8 +2029,6 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head * prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE); spin_lock_irq(&connection->resource->req_lock); spin_lock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */ - /* dequeue single item only, - * we still use drbd_queue_work_front() in some places */ if (!list_empty(&connection->sender_work.q)) list_splice_tail_init(&connection->sender_work.q, work_list); spin_unlock(&connection->sender_work.q_lock); /* FIXME get rid of this one? 
*/ @@ -2121,7 +2115,7 @@ int drbd_worker(struct drbd_thread *thi) if (get_t_state(thi) != RUNNING) break; - while (!list_empty(&work_list)) { + if (!list_empty(&work_list)) { w = list_first_entry(&work_list, struct drbd_work, list); list_del_init(&w->list); update_worker_timing_details(connection, w->cb); @@ -2137,13 +2131,13 @@ int drbd_worker(struct drbd_thread *thi) update_worker_timing_details(connection, do_unqueued_work); do_unqueued_work(connection); } - while (!list_empty(&work_list)) { + if (!list_empty(&work_list)) { w = list_first_entry(&work_list, struct drbd_work, list); list_del_init(&w->list); update_worker_timing_details(connection, w->cb); w->cb(w, 1); - } - dequeue_work_batch(&connection->sender_work, &work_list); + } else + dequeue_work_batch(&connection->sender_work, &work_list); } while (!list_empty(&work_list) || test_bit(DEVICE_WORK_PENDING, &connection->flags)); rcu_read_lock(); diff --git a/drivers/block/hd.c b/drivers/block/hd.c index 8a290c08262f..3abb121825bc 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -694,16 +694,6 @@ static const struct block_device_operations hd_fops = { .getgeo = hd_getgeo, }; -/* - * This is the hard disk IRQ description. The IRQF_DISABLED in sa_flags - * means we run the IRQ-handler with interrupts disabled: this is bad for - * interrupt latency, but anything else has led to problems on some - * machines. - * - * We enable interrupts in some of the routines after making sure it's - * safe. - */ - static int __init hd_init(void) { int drive; @@ -761,7 +751,7 @@ static int __init hd_init(void) p->cyl, p->head, p->sect); } - if (request_irq(HD_IRQ, hd_interrupt, IRQF_DISABLED, "hd", NULL)) { + if (request_irq(HD_IRQ, hd_interrupt, 0, "hd", NULL)) { printk("hd: unable to get IRQ%d for the hard disk driver\n", HD_IRQ); goto out1; diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index e352cac707e8..145ce2aa2e78 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -1082,7 +1082,6 @@ static struct platform_driver mg_disk_driver = { .remove = mg_remove, .driver = { .name = MG_DEV_NAME, - .owner = THIS_MODULE, .pm = &mg_pm, } }; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 5c8e7fe07745..3bd7ca9853a8 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -247,7 +247,7 @@ static void mtip_async_complete(struct mtip_port *port, if (unlikely(cmd->unaligned)) up(&port->cmd_slot_unal); - blk_mq_end_io(rq, status ? -EIO : 0); + blk_mq_end_request(rq, status ? -EIO : 0); } /* @@ -3739,7 +3739,7 @@ static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq) int err; err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq)); - blk_mq_end_io(rq, err); + blk_mq_end_request(rq, err); return 0; } @@ -3775,13 +3775,17 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, return false; } -static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) +static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { + struct request *rq = bd->rq; int ret; if (unlikely(mtip_check_unal_depth(hctx, rq))) return BLK_MQ_RQ_QUEUE_BUSY; + blk_mq_start_request(rq); + ret = mtip_submit_request(hctx, rq); if (likely(!ret)) return BLK_MQ_RQ_QUEUE_OK; @@ -3951,6 +3955,7 @@ skip_create_disk: /* Set device limits. 
*/ set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags); + clear_bit(QUEUE_FLAG_ADD_RANDOM, &dd->queue->queue_flags); blk_queue_max_segments(dd->queue, MTIP_MAX_SG); blk_queue_physical_block_size(dd->queue, 4096); blk_queue_max_hw_sectors(dd->queue, 0xffff); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index fb31b8ee4372..4bc2a5cb9935 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -847,6 +847,7 @@ static int __init nbd_init(void) * Tell the block layer that we are not a rotational device */ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue); + queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue); disk->queue->limits.discard_granularity = 512; disk->queue->limits.max_discard_sectors = UINT_MAX; disk->queue->limits.discard_zeroes_data = 0; diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 00d469c7f9f7..ae9f615382f6 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -78,7 +78,33 @@ module_param(home_node, int, S_IRUGO); MODULE_PARM_DESC(home_node, "Home node for the device"); static int queue_mode = NULL_Q_MQ; -module_param(queue_mode, int, S_IRUGO); + +static int null_param_store_val(const char *str, int *val, int min, int max) +{ + int ret, new_val; + + ret = kstrtoint(str, 10, &new_val); + if (ret) + return -EINVAL; + + if (new_val < min || new_val > max) + return -EINVAL; + + *val = new_val; + return 0; +} + +static int null_set_queue_mode(const char *str, const struct kernel_param *kp) +{ + return null_param_store_val(str, &queue_mode, NULL_Q_BIO, NULL_Q_MQ); +} + +static struct kernel_param_ops null_queue_mode_param_ops = { + .set = null_set_queue_mode, + .get = param_get_int, +}; + +device_param_cb(queue_mode, &null_queue_mode_param_ops, &queue_mode, S_IRUGO); MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); static int gb = 250; @@ -94,7 +120,19 @@ module_param(nr_devices, int, S_IRUGO); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); static int irqmode = NULL_IRQ_SOFTIRQ; -module_param(irqmode, int, S_IRUGO); + +static int null_set_irqmode(const char *str, const struct kernel_param *kp) +{ + return null_param_store_val(str, &irqmode, NULL_IRQ_NONE, + NULL_IRQ_TIMER); +} + +static struct kernel_param_ops null_irqmode_param_ops = { + .set = null_set_irqmode, + .get = param_get_int, +}; + +device_param_cb(irqmode, &null_irqmode_param_ops, &irqmode, S_IRUGO); MODULE_PARM_DESC(irqmode, "IRQ completion handler. 
0-none, 1-softirq, 2-timer"); static int completion_nsec = 10000; @@ -177,7 +215,7 @@ static void end_cmd(struct nullb_cmd *cmd) { switch (queue_mode) { case NULL_Q_MQ: - blk_mq_end_io(cmd->rq, 0); + blk_mq_end_request(cmd->rq, 0); return; case NULL_Q_RQ: INIT_LIST_HEAD(&cmd->rq->queuelist); @@ -313,13 +351,16 @@ static void null_request_fn(struct request_queue *q) } } -static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) +static int null_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); + struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); - cmd->rq = rq; + cmd->rq = bd->rq; cmd->nq = hctx->driver_data; + blk_mq_start_request(bd->rq); + null_handle_cmd(cmd); return BLK_MQ_RQ_QUEUE_OK; } @@ -447,14 +488,10 @@ static int init_driver_queues(struct nullb *nullb) ret = setup_commands(nq); if (ret) - goto err_queue; + return ret; nullb->nr_queues++; } - return 0; -err_queue: - cleanup_queues(nullb); - return ret; } static int null_add_dev(void) @@ -504,7 +541,9 @@ static int null_add_dev(void) goto out_cleanup_queues; } blk_queue_make_request(nullb->q, null_queue_bio); - init_driver_queues(nullb); + rv = init_driver_queues(nullb); + if (rv) + goto out_cleanup_blk_queue; } else { nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node); if (!nullb->q) { @@ -513,11 +552,14 @@ static int null_add_dev(void) } blk_queue_prep_rq(nullb->q, null_rq_prep_fn); blk_queue_softirq_done(nullb->q, null_softirq_done_fn); - init_driver_queues(nullb); + rv = init_driver_queues(nullb); + if (rv) + goto out_cleanup_blk_queue; } nullb->q->queuedata = nullb; queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q); + queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, nullb->q); disk = nullb->disk = alloc_disk_node(1, home_node); if (!disk) { diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 02351e217165..b1d5d8797315 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -13,9 +13,9 @@ */ #include <linux/nvme.h> -#include <linux/bio.h> #include <linux/bitops.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/cpu.h> #include <linux/delay.h> #include <linux/errno.h> @@ -33,7 +33,6 @@ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/pci.h> -#include <linux/percpu.h> #include <linux/poison.h> #include <linux/ptrace.h> #include <linux/sched.h> @@ -42,12 +41,12 @@ #include <scsi/sg.h> #include <asm-generic/io-64-nonatomic-lo-hi.h> -#include <trace/events/block.h> - #define NVME_Q_DEPTH 1024 +#define NVME_AQ_DEPTH 64 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define ADMIN_TIMEOUT (admin_timeout * HZ) +#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) #define IOD_TIMEOUT (retry_time * HZ) static unsigned char admin_timeout = 60; @@ -62,6 +61,10 @@ static unsigned char retry_time = 30; module_param(retry_time, byte, 0644); MODULE_PARM_DESC(retry_time, "time in seconds to retry failed I/O"); +static unsigned char shutdown_timeout = 5; +module_param(shutdown_timeout, byte, 0644); +MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); + static int nvme_major; module_param(nvme_major, int, 0); @@ -76,10 +79,12 @@ static wait_queue_head_t nvme_kthread_wait; static struct notifier_block nvme_nb; static void nvme_reset_failed_dev(struct work_struct *ws); +static int nvme_process_cq(struct nvme_queue *nvmeq); struct async_cmd_info { 
struct kthread_work work; struct kthread_worker *worker; + struct request *req; u32 result; int status; void *ctx; @@ -90,7 +95,7 @@ struct async_cmd_info { * commands and one for I/O commands). */ struct nvme_queue { - struct rcu_head r_head; + struct llist_node node; struct device *q_dmadev; struct nvme_dev *dev; char irqname[24]; /* nvme4294967295-65535\0 */ @@ -99,10 +104,6 @@ struct nvme_queue { volatile struct nvme_completion *cqes; dma_addr_t sq_dma_addr; dma_addr_t cq_dma_addr; - wait_queue_head_t sq_full; - wait_queue_t sq_cong_wait; - struct bio_list sq_cong; - struct list_head iod_bio; u32 __iomem *q_db; u16 q_depth; u16 cq_vector; @@ -112,10 +113,8 @@ struct nvme_queue { u16 qid; u8 cq_phase; u8 cqe_seen; - u8 q_suspended; - cpumask_var_t cpu_mask; struct async_cmd_info cmdinfo; - unsigned long cmdid_data[]; + struct blk_mq_hw_ctx *hctx; }; /* @@ -143,62 +142,79 @@ typedef void (*nvme_completion_fn)(struct nvme_queue *, void *, struct nvme_cmd_info { nvme_completion_fn fn; void *ctx; - unsigned long timeout; int aborted; + struct nvme_queue *nvmeq; }; -static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) +static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) { - return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)]; + struct nvme_dev *dev = data; + struct nvme_queue *nvmeq = dev->queues[0]; + + WARN_ON(nvmeq->hctx); + nvmeq->hctx = hctx; + hctx->driver_data = nvmeq; + return 0; } -static unsigned nvme_queue_extra(int depth) +static int nvme_admin_init_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) { - return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info)); + struct nvme_dev *dev = data; + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = dev->queues[0]; + + BUG_ON(!nvmeq); + cmd->nvmeq = nvmeq; + return 0; } -/** - * alloc_cmdid() - Allocate a Command ID - * @nvmeq: The queue that will be used for this command - * @ctx: A pointer that will be passed to the handler - * @handler: The function to call on completion - * - * Allocate a Command ID for a queue. The data passed in will - * be passed to the completion handler. This is implemented by using - * the bottom two bits of the ctx pointer to store the handler ID. - * Passing in a pointer that's not 4-byte aligned will cause a BUG. - * We can change this if it becomes a problem. - * - * May be called with local interrupts disabled and the q_lock held, - * or with interrupts enabled and no locks held. - */ -static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, - nvme_completion_fn handler, unsigned timeout) +static void nvme_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { - int depth = nvmeq->q_depth - 1; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - int cmdid; + struct nvme_queue *nvmeq = hctx->driver_data; - do { - cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth); - if (cmdid >= depth) - return -EBUSY; - } while (test_and_set_bit(cmdid, nvmeq->cmdid_data)); + nvmeq->hctx = NULL; +} + +static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_dev *dev = data; + struct nvme_queue *nvmeq = dev->queues[ + (hctx_idx % dev->queue_count) + 1]; + + if (!nvmeq->hctx) + nvmeq->hctx = hctx; - info[cmdid].fn = handler; - info[cmdid].ctx = ctx; - info[cmdid].timeout = jiffies + timeout; - info[cmdid].aborted = 0; - return cmdid; + /* nvmeq queues are shared between namespaces. 
We assume here that + * blk-mq map the tags so they match up with the nvme queue tags. */ + WARN_ON(nvmeq->hctx->tags != hctx->tags); + + hctx->driver_data = nvmeq; + return 0; } -static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, - nvme_completion_fn handler, unsigned timeout) +static int nvme_init_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) { - int cmdid; - wait_event_killable(nvmeq->sq_full, - (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0); - return (cmdid < 0) ? -EINTR : cmdid; + struct nvme_dev *dev = data; + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; + + BUG_ON(!nvmeq); + cmd->nvmeq = nvmeq; + return 0; +} + +static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx, + nvme_completion_fn handler) +{ + cmd->fn = handler; + cmd->ctx = ctx; + cmd->aborted = 0; } /* Special values must be less than 0x1000 */ @@ -206,17 +222,12 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, #define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) #define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) -#define CMD_CTX_ABORT (0x318 + CMD_CTX_BASE) static void special_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { if (ctx == CMD_CTX_CANCELLED) return; - if (ctx == CMD_CTX_ABORT) { - ++nvmeq->dev->abort_limit; - return; - } if (ctx == CMD_CTX_COMPLETED) { dev_warn(nvmeq->q_dmadev, "completed id %d twice on queue %d\n", @@ -229,99 +240,89 @@ static void special_completion(struct nvme_queue *nvmeq, void *ctx, cqe->command_id, le16_to_cpup(&cqe->sq_id)); return; } - dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); } -static void async_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct async_cmd_info *cmdinfo = ctx; - cmdinfo->result = le32_to_cpup(&cqe->result); - cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; - queue_kthread_work(cmdinfo->worker, &cmdinfo->work); -} - -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. 
- */ -static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid, - nvme_completion_fn *fn) +static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn) { void *ctx; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - if (cmdid >= nvmeq->q_depth || !info[cmdid].fn) { - if (fn) - *fn = special_completion; - return CMD_CTX_INVALID; - } if (fn) - *fn = info[cmdid].fn; - ctx = info[cmdid].ctx; - info[cmdid].fn = special_completion; - info[cmdid].ctx = CMD_CTX_COMPLETED; - clear_bit(cmdid, nvmeq->cmdid_data); - wake_up(&nvmeq->sq_full); + *fn = cmd->fn; + ctx = cmd->ctx; + cmd->fn = special_completion; + cmd->ctx = CMD_CTX_CANCELLED; return ctx; } -static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, - nvme_completion_fn *fn) +static void async_req_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) { - void *ctx; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - if (fn) - *fn = info[cmdid].fn; - ctx = info[cmdid].ctx; - info[cmdid].fn = special_completion; - info[cmdid].ctx = CMD_CTX_CANCELLED; - return ctx; -} + struct request *req = ctx; -static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid) -{ - return rcu_dereference_raw(dev->queues[qid]); + u32 result = le32_to_cpup(&cqe->result); + u16 status = le16_to_cpup(&cqe->status) >> 1; + + if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) + ++nvmeq->dev->event_limit; + if (status == NVME_SC_SUCCESS) + dev_warn(nvmeq->q_dmadev, + "async event result %08x\n", result); + + blk_mq_free_hctx_request(nvmeq->hctx, req); } -static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU) +static void abort_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) { - struct nvme_queue *nvmeq; - unsigned queue_id = get_cpu_var(*dev->io_queue); + struct request *req = ctx; - rcu_read_lock(); - nvmeq = rcu_dereference(dev->queues[queue_id]); - if (nvmeq) - return nvmeq; + u16 status = le16_to_cpup(&cqe->status) >> 1; + u32 result = le32_to_cpup(&cqe->result); - rcu_read_unlock(); - put_cpu_var(*dev->io_queue); - return NULL; + blk_mq_free_hctx_request(nvmeq->hctx, req); + + dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); + ++nvmeq->dev->abort_limit; } -static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) +static void async_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) { - rcu_read_unlock(); - put_cpu_var(nvmeq->dev->io_queue); + struct async_cmd_info *cmdinfo = ctx; + cmdinfo->result = le32_to_cpup(&cqe->result); + cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; + queue_kthread_work(cmdinfo->worker, &cmdinfo->work); + blk_mq_free_hctx_request(nvmeq->hctx, cmdinfo->req); } -static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx) - __acquires(RCU) +static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq, + unsigned int tag) { - struct nvme_queue *nvmeq; + struct blk_mq_hw_ctx *hctx = nvmeq->hctx; + struct request *req = blk_mq_tag_to_rq(hctx->tags, tag); - rcu_read_lock(); - nvmeq = rcu_dereference(dev->queues[q_idx]); - if (nvmeq) - return nvmeq; - - rcu_read_unlock(); - return NULL; + return blk_mq_rq_to_pdu(req); } -static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) +/* + * Called with local interrupts disabled and the q_lock held. May not sleep. 
+ */ +static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag, + nvme_completion_fn *fn) { - rcu_read_unlock(); + struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag); + void *ctx; + if (tag >= nvmeq->q_depth) { + *fn = special_completion; + return CMD_CTX_INVALID; + } + if (fn) + *fn = cmd->fn; + ctx = cmd->ctx; + cmd->fn = special_completion; + cmd->ctx = CMD_CTX_COMPLETED; + return ctx; } /** @@ -331,26 +332,29 @@ static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) * * Safe to use from interrupt context */ -static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +static int __nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) { - unsigned long flags; - u16 tail; - spin_lock_irqsave(&nvmeq->q_lock, flags); - if (nvmeq->q_suspended) { - spin_unlock_irqrestore(&nvmeq->q_lock, flags); - return -EBUSY; - } - tail = nvmeq->sq_tail; + u16 tail = nvmeq->sq_tail; + memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); if (++tail == nvmeq->q_depth) tail = 0; writel(tail, nvmeq->q_db); nvmeq->sq_tail = tail; - spin_unlock_irqrestore(&nvmeq->q_lock, flags); return 0; } +static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +{ + unsigned long flags; + int ret; + spin_lock_irqsave(&nvmeq->q_lock, flags); + ret = __nvme_submit_cmd(nvmeq, cmd); + spin_unlock_irqrestore(&nvmeq->q_lock, flags); + return ret; +} + static __le64 **iod_list(struct nvme_iod *iod) { return ((void *)iod) + iod->offset; @@ -361,17 +365,17 @@ static __le64 **iod_list(struct nvme_iod *iod) * as it only leads to a small amount of wasted memory for the lifetime of * the I/O. */ -static int nvme_npages(unsigned size) +static int nvme_npages(unsigned size, struct nvme_dev *dev) { - unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE); - return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); + unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); + return DIV_ROUND_UP(8 * nprps, dev->page_size - 8); } static struct nvme_iod * -nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) +nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp) { struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + - sizeof(__le64 *) * nvme_npages(nbytes) + + sizeof(__le64 *) * nvme_npages(nbytes, dev) + sizeof(struct scatterlist) * nseg, gfp); if (iod) { @@ -380,7 +384,6 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) iod->length = nbytes; iod->nents = 0; iod->first_dma = 0ULL; - iod->start_time = jiffies; } return iod; @@ -388,7 +391,7 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) { - const int last_prp = PAGE_SIZE / 8 - 1; + const int last_prp = dev->page_size / 8 - 1; int i; __le64 **list = iod_list(iod); dma_addr_t prp_dma = iod->first_dma; @@ -404,65 +407,49 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) kfree(iod); } -static void nvme_start_io_acct(struct bio *bio) -{ - struct gendisk *disk = bio->bi_bdev->bd_disk; - if (blk_queue_io_stat(disk->queue)) { - const int rw = bio_data_dir(bio); - int cpu = part_stat_lock(); - part_round_stats(cpu, &disk->part0); - part_stat_inc(cpu, &disk->part0, ios[rw]); - part_stat_add(cpu, &disk->part0, sectors[rw], - bio_sectors(bio)); - part_inc_in_flight(&disk->part0, rw); - part_stat_unlock(); - } -} - -static void nvme_end_io_acct(struct bio *bio, unsigned long start_time) +static int nvme_error_status(u16 status) { - struct gendisk *disk = bio->bi_bdev->bd_disk; - if 
(blk_queue_io_stat(disk->queue)) { - const int rw = bio_data_dir(bio); - unsigned long duration = jiffies - start_time; - int cpu = part_stat_lock(); - part_stat_add(cpu, &disk->part0, ticks[rw], duration); - part_round_stats(cpu, &disk->part0); - part_dec_in_flight(&disk->part0, rw); - part_stat_unlock(); + switch (status & 0x7ff) { + case NVME_SC_SUCCESS: + return 0; + case NVME_SC_CAP_EXCEEDED: + return -ENOSPC; + default: + return -EIO; } } -static void bio_completion(struct nvme_queue *nvmeq, void *ctx, +static void req_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct nvme_iod *iod = ctx; - struct bio *bio = iod->private; + struct request *req = iod->private; + struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); + u16 status = le16_to_cpup(&cqe->status) >> 1; - int error = 0; if (unlikely(status)) { - if (!(status & NVME_SC_DNR || - bio->bi_rw & REQ_FAILFAST_MASK) && - (jiffies - iod->start_time) < IOD_TIMEOUT) { - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - list_add_tail(&iod->node, &nvmeq->iod_bio); - wake_up(&nvmeq->sq_full); + if (!(status & NVME_SC_DNR || blk_noretry_request(req)) + && (jiffies - req->start_time) < req->timeout) { + blk_mq_requeue_request(req); + blk_mq_kick_requeue_list(req->q); return; } - error = -EIO; - } - if (iod->nents) { - dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents, - bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - nvme_end_io_acct(bio, iod->start_time); - } + req->errors = nvme_error_status(status); + } else + req->errors = 0; + + if (cmd_rq->aborted) + dev_warn(&nvmeq->dev->pci_dev->dev, + "completing aborted command with status:%04x\n", + status); + + if (iod->nents) + dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents, + rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); nvme_free_iod(nvmeq->dev, iod); - trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio, error); - bio_endio(bio, error); + blk_mq_complete_request(req); } /* length is in bytes. gfp flags indicates whether we may sleep. 
*/ @@ -479,26 +466,27 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, __le64 **list = iod_list(iod); dma_addr_t prp_dma; int nprps, i; + u32 page_size = dev->page_size; - length -= (PAGE_SIZE - offset); + length -= (page_size - offset); if (length <= 0) return total_len; - dma_len -= (PAGE_SIZE - offset); + dma_len -= (page_size - offset); if (dma_len) { - dma_addr += (PAGE_SIZE - offset); + dma_addr += (page_size - offset); } else { sg = sg_next(sg); dma_addr = sg_dma_address(sg); dma_len = sg_dma_len(sg); } - if (length <= PAGE_SIZE) { + if (length <= page_size) { iod->first_dma = dma_addr; return total_len; } - nprps = DIV_ROUND_UP(length, PAGE_SIZE); + nprps = DIV_ROUND_UP(length, page_size); if (nprps <= (256 / 8)) { pool = dev->prp_small_pool; iod->npages = 0; @@ -511,13 +499,13 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, if (!prp_list) { iod->first_dma = dma_addr; iod->npages = -1; - return (total_len - length) + PAGE_SIZE; + return (total_len - length) + page_size; } list[0] = prp_list; iod->first_dma = prp_dma; i = 0; for (;;) { - if (i == PAGE_SIZE / 8) { + if (i == page_size >> 3) { __le64 *old_prp_list = prp_list; prp_list = dma_pool_alloc(pool, gfp, &prp_dma); if (!prp_list) @@ -528,9 +516,9 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, i = 1; } prp_list[i++] = cpu_to_le64(dma_addr); - dma_len -= PAGE_SIZE; - dma_addr += PAGE_SIZE; - length -= PAGE_SIZE; + dma_len -= page_size; + dma_addr += page_size; + length -= page_size; if (length <= 0) break; if (dma_len > 0) @@ -544,88 +532,25 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, return total_len; } -static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, - int len) -{ - struct bio *split = bio_split(bio, len >> 9, GFP_ATOMIC, NULL); - if (!split) - return -ENOMEM; - - trace_block_split(bdev_get_queue(bio->bi_bdev), bio, - split->bi_iter.bi_sector); - bio_chain(split, bio); - - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, split); - bio_list_add(&nvmeq->sq_cong, bio); - wake_up(&nvmeq->sq_full); - - return 0; -} - -/* NVMe scatterlists require no holes in the virtual address */ -#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ - (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) - -static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, - struct bio *bio, enum dma_data_direction dma_dir, int psegs) -{ - struct bio_vec bvec, bvprv; - struct bvec_iter iter; - struct scatterlist *sg = NULL; - int length = 0, nsegs = 0, split_len = bio->bi_iter.bi_size; - int first = 1; - - if (nvmeq->dev->stripe_size) - split_len = nvmeq->dev->stripe_size - - ((bio->bi_iter.bi_sector << 9) & - (nvmeq->dev->stripe_size - 1)); - - sg_init_table(iod->sg, psegs); - bio_for_each_segment(bvec, bio, iter) { - if (!first && BIOVEC_PHYS_MERGEABLE(&bvprv, &bvec)) { - sg->length += bvec.bv_len; - } else { - if (!first && BIOVEC_NOT_VIRT_MERGEABLE(&bvprv, &bvec)) - return nvme_split_and_submit(bio, nvmeq, - length); - - sg = sg ? 
sg + 1 : iod->sg; - sg_set_page(sg, bvec.bv_page, - bvec.bv_len, bvec.bv_offset); - nsegs++; - } - - if (split_len - length < bvec.bv_len) - return nvme_split_and_submit(bio, nvmeq, split_len); - length += bvec.bv_len; - bvprv = bvec; - first = 0; - } - iod->nents = nsegs; - sg_mark_end(sg); - if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0) - return -ENOMEM; - - BUG_ON(length != bio->bi_iter.bi_size); - return length; -} - -static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct bio *bio, struct nvme_iod *iod, int cmdid) +/* + * We reuse the small pool to allocate the 16-byte range here as it is not + * worth having a special pool for these or additional cases to handle freeing + * the iod. + */ +static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct request *req, struct nvme_iod *iod) { struct nvme_dsm_range *range = (struct nvme_dsm_range *)iod_list(iod)[0]; struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; range->cattr = cpu_to_le32(0); - range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift); - range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); + range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift); + range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); memset(cmnd, 0, sizeof(*cmnd)); cmnd->dsm.opcode = nvme_cmd_dsm; - cmnd->dsm.command_id = cmdid; + cmnd->dsm.command_id = req->tag; cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); cmnd->dsm.nr = 0; @@ -634,11 +559,9 @@ static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; writel(nvmeq->sq_tail, nvmeq->q_db); - - return 0; } -static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, +static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, int cmdid) { struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; @@ -651,49 +574,34 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; writel(nvmeq->sq_tail, nvmeq->q_db); - - return 0; } -static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) +static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, + struct nvme_ns *ns) { - struct bio *bio = iod->private; - struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; + struct request *req = iod->private; struct nvme_command *cmnd; - int cmdid; - u16 control; - u32 dsmgmt; - - cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT); - if (unlikely(cmdid < 0)) - return cmdid; + u16 control = 0; + u32 dsmgmt = 0; - if (bio->bi_rw & REQ_DISCARD) - return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid); - if (bio->bi_rw & REQ_FLUSH) - return nvme_submit_flush(nvmeq, ns, cmdid); - - control = 0; - if (bio->bi_rw & REQ_FUA) + if (req->cmd_flags & REQ_FUA) control |= NVME_RW_FUA; - if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD)) + if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) control |= NVME_RW_LR; - dsmgmt = 0; - if (bio->bi_rw & REQ_RAHEAD) + if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; memset(cmnd, 0, sizeof(*cmnd)); - cmnd->rw.opcode = bio_data_dir(bio) ? nvme_cmd_write : nvme_cmd_read; - cmnd->rw.command_id = cmdid; + cmnd->rw.opcode = (rq_data_dir(req) ? 
nvme_cmd_write : nvme_cmd_read); + cmnd->rw.command_id = req->tag; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); - cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); - cmnd->rw.length = - cpu_to_le16((bio->bi_iter.bi_size >> ns->lba_shift) - 1); + cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); @@ -704,45 +612,26 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) return 0; } -static int nvme_split_flush_data(struct nvme_queue *nvmeq, struct bio *bio) -{ - struct bio *split = bio_clone(bio, GFP_ATOMIC); - if (!split) - return -ENOMEM; - - split->bi_iter.bi_size = 0; - split->bi_phys_segments = 0; - bio->bi_rw &= ~REQ_FLUSH; - bio_chain(split, bio); - - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, split); - bio_list_add(&nvmeq->sq_cong, bio); - wake_up_process(nvme_thread); - - return 0; -} - -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. - */ -static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct bio *bio) +static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_queue *nvmeq = hctx->driver_data; + struct request *req = bd->rq; + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); struct nvme_iod *iod; - int psegs = bio_phys_segments(ns->queue, bio); - int result; + int psegs = req->nr_phys_segments; + enum dma_data_direction dma_dir; + unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) : + sizeof(struct nvme_dsm_range); - if ((bio->bi_rw & REQ_FLUSH) && psegs) - return nvme_split_flush_data(nvmeq, bio); - - iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC); + iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC); if (!iod) - return -ENOMEM; + return BLK_MQ_RQ_QUEUE_BUSY; - iod->private = bio; - if (bio->bi_rw & REQ_DISCARD) { + iod->private = req; + + if (req->cmd_flags & REQ_DISCARD) { void *range; /* * We reuse the small pool to allocate the 16-byte range here @@ -752,35 +641,50 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, &iod->first_dma); - if (!range) { - result = -ENOMEM; - goto free_iod; - } + if (!range) + goto retry_cmd; iod_list(iod)[0] = (__le64 *)range; iod->npages = 0; } else if (psegs) { - result = nvme_map_bio(nvmeq, iod, bio, - bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, - psegs); - if (result <= 0) - goto free_iod; - if (nvme_setup_prps(nvmeq->dev, iod, result, GFP_ATOMIC) != - result) { - result = -ENOMEM; - goto free_iod; + dma_dir = rq_data_dir(req) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE; + + sg_init_table(iod->sg, psegs); + iod->nents = blk_rq_map_sg(req->q, req, iod->sg); + if (!iod->nents) + goto error_cmd; + + if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir)) + goto retry_cmd; + + if (blk_rq_bytes(req) != + nvme_setup_prps(nvmeq->dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) { + dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, + iod->nents, dma_dir); + goto retry_cmd; } - nvme_start_io_acct(bio); } - if (unlikely(nvme_submit_iod(nvmeq, iod))) { - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - list_add_tail(&iod->node, &nvmeq->iod_bio); - } - return 0; - free_iod: + blk_mq_start_request(req); + + nvme_set_info(cmd, iod, req_completion); + spin_lock_irq(&nvmeq->q_lock); + if (req->cmd_flags & REQ_DISCARD) + nvme_submit_discard(nvmeq, ns, req, iod); + else if (req->cmd_flags & REQ_FLUSH) + nvme_submit_flush(nvmeq, ns, req->tag); + else + nvme_submit_iod(nvmeq, iod, ns); + + nvme_process_cq(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); + return BLK_MQ_RQ_QUEUE_OK; + + error_cmd: nvme_free_iod(nvmeq->dev, iod); - return result; + return BLK_MQ_RQ_QUEUE_ERROR; + retry_cmd: + nvme_free_iod(nvmeq->dev, iod); + return BLK_MQ_RQ_QUEUE_BUSY; } static int nvme_process_cq(struct nvme_queue *nvmeq) @@ -801,8 +705,7 @@ static int nvme_process_cq(struct nvme_queue *nvmeq) head = 0; phase = !phase; } - - ctx = free_cmdid(nvmeq, cqe.command_id, &fn); + ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn); fn(nvmeq, ctx, &cqe); } @@ -823,29 +726,13 @@ static int nvme_process_cq(struct nvme_queue *nvmeq) return 1; } -static void nvme_make_request(struct request_queue *q, struct bio *bio) +/* Admin queue isn't initialized as a request queue. If at some point this + * happens anyway, make sure to notify the user */ +static int nvme_admin_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct nvme_ns *ns = q->queuedata; - struct nvme_queue *nvmeq = get_nvmeq(ns->dev); - int result = -EBUSY; - - if (!nvmeq) { - bio_endio(bio, -EIO); - return; - } - - spin_lock_irq(&nvmeq->q_lock); - if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong)) - result = nvme_submit_bio_queue(nvmeq, ns, bio); - if (unlikely(result)) { - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, bio); - } - - nvme_process_cq(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); - put_nvmeq(nvmeq); + WARN_ON_ONCE(1); + return BLK_MQ_RQ_QUEUE_ERROR; } static irqreturn_t nvme_irq(int irq, void *data) @@ -869,10 +756,11 @@ static irqreturn_t nvme_irq_check(int irq, void *data) return IRQ_WAKE_THREAD; } -static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) +static void nvme_abort_cmd_info(struct nvme_queue *nvmeq, struct nvme_cmd_info * + cmd_info) { spin_lock_irq(&nvmeq->q_lock); - cancel_cmdid(nvmeq, cmdid, NULL); + cancel_cmd_info(cmd_info, NULL); spin_unlock_irq(&nvmeq->q_lock); } @@ -895,47 +783,40 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, * Returns 0 on success. 
If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code */ -static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx, - struct nvme_command *cmd, +static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd, u32 *result, unsigned timeout) { - int cmdid, ret; + int ret; struct sync_cmd_info cmdinfo; - struct nvme_queue *nvmeq; - - nvmeq = lock_nvmeq(dev, q_idx); - if (!nvmeq) - return -ENODEV; + struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = cmd_rq->nvmeq; cmdinfo.task = current; cmdinfo.status = -EINTR; - cmdid = alloc_cmdid(nvmeq, &cmdinfo, sync_completion, timeout); - if (cmdid < 0) { - unlock_nvmeq(nvmeq); - return cmdid; - } - cmd->common.command_id = cmdid; + cmd->common.command_id = req->tag; + + nvme_set_info(cmd_rq, &cmdinfo, sync_completion); set_current_state(TASK_KILLABLE); ret = nvme_submit_cmd(nvmeq, cmd); if (ret) { - free_cmdid(nvmeq, cmdid, NULL); - unlock_nvmeq(nvmeq); + nvme_finish_cmd(nvmeq, req->tag, NULL); set_current_state(TASK_RUNNING); - return ret; } - unlock_nvmeq(nvmeq); - schedule_timeout(timeout); - - if (cmdinfo.status == -EINTR) { - nvmeq = lock_nvmeq(dev, q_idx); - if (nvmeq) { - nvme_abort_command(nvmeq, cmdid); - unlock_nvmeq(nvmeq); - } + ret = schedule_timeout(timeout); + + /* + * Ensure that sync_completion has either run, or that it will + * never run. + */ + nvme_abort_cmd_info(nvmeq, blk_mq_rq_to_pdu(req)); + + /* + * We never got the completion + */ + if (cmdinfo.status == -EINTR) return -EINTR; - } if (result) *result = cmdinfo.result; @@ -943,59 +824,99 @@ static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx, return cmdinfo.status; } -static int nvme_submit_async_cmd(struct nvme_queue *nvmeq, +static int nvme_submit_async_admin_req(struct nvme_dev *dev) +{ + struct nvme_queue *nvmeq = dev->queues[0]; + struct nvme_command c; + struct nvme_cmd_info *cmd_info; + struct request *req; + + req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, false); + if (IS_ERR(req)) + return PTR_ERR(req); + + cmd_info = blk_mq_rq_to_pdu(req); + nvme_set_info(cmd_info, req, async_req_completion); + + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_async_event; + c.common.command_id = req->tag; + + return __nvme_submit_cmd(nvmeq, &c); +} + +static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, struct nvme_command *cmd, struct async_cmd_info *cmdinfo, unsigned timeout) { - int cmdid; + struct nvme_queue *nvmeq = dev->queues[0]; + struct request *req; + struct nvme_cmd_info *cmd_rq; + + req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); + if (IS_ERR(req)) + return PTR_ERR(req); - cmdid = alloc_cmdid_killable(nvmeq, cmdinfo, async_completion, timeout); - if (cmdid < 0) - return cmdid; + req->timeout = timeout; + cmd_rq = blk_mq_rq_to_pdu(req); + cmdinfo->req = req; + nvme_set_info(cmd_rq, cmdinfo, async_completion); cmdinfo->status = -EINTR; - cmd->common.command_id = cmdid; + + cmd->common.command_id = req->tag; + return nvme_submit_cmd(nvmeq, cmd); } -int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, - u32 *result) +static int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, + u32 *result, unsigned timeout) { - return nvme_submit_sync_cmd(dev, 0, cmd, result, ADMIN_TIMEOUT); + int res; + struct request *req; + + req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); + if (IS_ERR(req)) + return PTR_ERR(req); + res = nvme_submit_sync_cmd(req, cmd, 
result, timeout); + blk_mq_free_request(req); + return res; } -int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd, +int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, u32 *result) { - return nvme_submit_sync_cmd(dev, smp_processor_id() + 1, cmd, result, - NVME_IO_TIMEOUT); + return __nvme_submit_admin_cmd(dev, cmd, result, ADMIN_TIMEOUT); } -static int nvme_submit_admin_cmd_async(struct nvme_dev *dev, - struct nvme_command *cmd, struct async_cmd_info *cmdinfo) +int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_ns *ns, + struct nvme_command *cmd, u32 *result) { - return nvme_submit_async_cmd(raw_nvmeq(dev, 0), cmd, cmdinfo, - ADMIN_TIMEOUT); + int res; + struct request *req; + + req = blk_mq_alloc_request(ns->queue, WRITE, (GFP_KERNEL|__GFP_WAIT), + false); + if (IS_ERR(req)) + return PTR_ERR(req); + res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT); + blk_mq_free_request(req); + return res; } static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) { - int status; struct nvme_command c; memset(&c, 0, sizeof(c)); c.delete_queue.opcode = opcode; c.delete_queue.qid = cpu_to_le16(id); - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; + return nvme_submit_admin_cmd(dev, &c, NULL); } static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, struct nvme_queue *nvmeq) { - int status; struct nvme_command c; int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; @@ -1007,16 +928,12 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, c.create_cq.cq_flags = cpu_to_le16(flags); c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; + return nvme_submit_admin_cmd(dev, &c, NULL); } static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, struct nvme_queue *nvmeq) { - int status; struct nvme_command c; int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; @@ -1028,10 +945,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, c.create_sq.sq_flags = cpu_to_le16(flags); c.create_sq.cqid = cpu_to_le16(qid); - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; + return nvme_submit_admin_cmd(dev, &c, NULL); } static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) @@ -1087,28 +1001,27 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, } /** - * nvme_abort_cmd - Attempt aborting a command - * @cmdid: Command id of a timed out IO - * @queue: The queue with timed out IO + * nvme_abort_req - Attempt aborting a request * * Schedule controller reset if the command was already aborted once before and * still hasn't been returned to the driver, or if this is the admin queue. 
*/ -static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq) +static void nvme_abort_req(struct request *req) { - int a_cmdid; - struct nvme_command cmd; + struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = cmd_rq->nvmeq; struct nvme_dev *dev = nvmeq->dev; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - struct nvme_queue *adminq; + struct request *abort_req; + struct nvme_cmd_info *abort_cmd; + struct nvme_command cmd; - if (!nvmeq->qid || info[cmdid].aborted) { + if (!nvmeq->qid || cmd_rq->aborted) { if (work_busy(&dev->reset_work)) return; list_del_init(&dev->node); dev_warn(&dev->pci_dev->dev, - "I/O %d QID %d timeout, reset controller\n", cmdid, - nvmeq->qid); + "I/O %d QID %d timeout, reset controller\n", + req->tag, nvmeq->qid); dev->reset_workfn = nvme_reset_failed_dev; queue_work(nvme_workq, &dev->reset_work); return; @@ -1117,120 +1030,110 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq) if (!dev->abort_limit) return; - adminq = rcu_dereference(dev->queues[0]); - a_cmdid = alloc_cmdid(adminq, CMD_CTX_ABORT, special_completion, - ADMIN_TIMEOUT); - if (a_cmdid < 0) + abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, + false); + if (IS_ERR(abort_req)) return; + abort_cmd = blk_mq_rq_to_pdu(abort_req); + nvme_set_info(abort_cmd, abort_req, abort_completion); + memset(&cmd, 0, sizeof(cmd)); cmd.abort.opcode = nvme_admin_abort_cmd; - cmd.abort.cid = cmdid; + cmd.abort.cid = req->tag; cmd.abort.sqid = cpu_to_le16(nvmeq->qid); - cmd.abort.command_id = a_cmdid; + cmd.abort.command_id = abort_req->tag; --dev->abort_limit; - info[cmdid].aborted = 1; - info[cmdid].timeout = jiffies + ADMIN_TIMEOUT; + cmd_rq->aborted = 1; - dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid, + dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag, nvmeq->qid); - nvme_submit_cmd(adminq, &cmd); + if (nvme_submit_cmd(dev->queues[0], &cmd) < 0) { + dev_warn(nvmeq->q_dmadev, + "Could not abort I/O %d QID %d", + req->tag, nvmeq->qid); + blk_mq_free_request(abort_req); + } } -/** - * nvme_cancel_ios - Cancel outstanding I/Os - * @queue: The queue to cancel I/Os on - * @timeout: True to only cancel I/Os which have timed out - */ -static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) +static void nvme_cancel_queue_ios(struct blk_mq_hw_ctx *hctx, + struct request *req, void *data, bool reserved) { - int depth = nvmeq->q_depth - 1; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - unsigned long now = jiffies; - int cmdid; + struct nvme_queue *nvmeq = data; + void *ctx; + nvme_completion_fn fn; + struct nvme_cmd_info *cmd; + static struct nvme_completion cqe = { + .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1), + }; - for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { - void *ctx; - nvme_completion_fn fn; - static struct nvme_completion cqe = { - .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1), - }; + cmd = blk_mq_rq_to_pdu(req); - if (timeout && !time_after(now, info[cmdid].timeout)) - continue; - if (info[cmdid].ctx == CMD_CTX_CANCELLED) - continue; - if (timeout && nvmeq->dev->initialized) { - nvme_abort_cmd(cmdid, nvmeq); - continue; - } - dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid, - nvmeq->qid); - ctx = cancel_cmdid(nvmeq, cmdid, &fn); - fn(nvmeq, ctx, &cqe); - } + if (cmd->ctx == CMD_CTX_CANCELLED) + return; + + dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", + req->tag, nvmeq->qid); + ctx = cancel_cmd_info(cmd, &fn); + fn(nvmeq, ctx, &cqe); } -static void 
nvme_free_queue(struct rcu_head *r) +static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) { - struct nvme_queue *nvmeq = container_of(r, struct nvme_queue, r_head); + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = cmd->nvmeq; - spin_lock_irq(&nvmeq->q_lock); - while (bio_list_peek(&nvmeq->sq_cong)) { - struct bio *bio = bio_list_pop(&nvmeq->sq_cong); - bio_endio(bio, -EIO); - } - while (!list_empty(&nvmeq->iod_bio)) { - static struct nvme_completion cqe = { - .status = cpu_to_le16( - (NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1), - }; - struct nvme_iod *iod = list_first_entry(&nvmeq->iod_bio, - struct nvme_iod, - node); - list_del(&iod->node); - bio_completion(nvmeq, iod, &cqe); - } - spin_unlock_irq(&nvmeq->q_lock); + dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag, + nvmeq->qid); + if (nvmeq->dev->initialized) + nvme_abort_req(req); + + /* + * The aborted req will be completed on receiving the abort req. + * We enable the timer again. If hit twice, it'll cause a device reset, + * as the device then is in a faulty state. + */ + return BLK_EH_RESET_TIMER; +} +static void nvme_free_queue(struct nvme_queue *nvmeq) +{ dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), nvmeq->sq_cmds, nvmeq->sq_dma_addr); - if (nvmeq->qid) - free_cpumask_var(nvmeq->cpu_mask); kfree(nvmeq); } static void nvme_free_queues(struct nvme_dev *dev, int lowest) { + LLIST_HEAD(q_list); + struct nvme_queue *nvmeq, *next; + struct llist_node *entry; int i; for (i = dev->queue_count - 1; i >= lowest; i--) { - struct nvme_queue *nvmeq = raw_nvmeq(dev, i); - rcu_assign_pointer(dev->queues[i], NULL); - call_rcu(&nvmeq->r_head, nvme_free_queue); + struct nvme_queue *nvmeq = dev->queues[i]; + llist_add(&nvmeq->node, &q_list); dev->queue_count--; + dev->queues[i] = NULL; } + synchronize_rcu(); + entry = llist_del_all(&q_list); + llist_for_each_entry_safe(nvmeq, next, entry, node) + nvme_free_queue(nvmeq); } /** * nvme_suspend_queue - put queue into suspended state * @nvmeq - queue to suspend - * - * Returns 1 if already suspended, 0 otherwise. 
*/ static int nvme_suspend_queue(struct nvme_queue *nvmeq) { int vector = nvmeq->dev->entry[nvmeq->cq_vector].vector; spin_lock_irq(&nvmeq->q_lock); - if (nvmeq->q_suspended) { - spin_unlock_irq(&nvmeq->q_lock); - return 1; - } - nvmeq->q_suspended = 1; nvmeq->dev->online_queues--; spin_unlock_irq(&nvmeq->q_lock); @@ -1242,15 +1145,18 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) static void nvme_clear_queue(struct nvme_queue *nvmeq) { + struct blk_mq_hw_ctx *hctx = nvmeq->hctx; + spin_lock_irq(&nvmeq->q_lock); nvme_process_cq(nvmeq); - nvme_cancel_ios(nvmeq, false); + if (hctx && hctx->tags) + blk_mq_tag_busy_iter(hctx, nvme_cancel_queue_ios, nvmeq); spin_unlock_irq(&nvmeq->q_lock); } static void nvme_disable_queue(struct nvme_dev *dev, int qid) { - struct nvme_queue *nvmeq = raw_nvmeq(dev, qid); + struct nvme_queue *nvmeq = dev->queues[qid]; if (!nvmeq) return; @@ -1270,25 +1176,20 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth, int vector) { struct device *dmadev = &dev->pci_dev->dev; - unsigned extra = nvme_queue_extra(depth); - struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); + struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL); if (!nvmeq) return NULL; - nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth), - &nvmeq->cq_dma_addr, GFP_KERNEL); + nvmeq->cqes = dma_zalloc_coherent(dmadev, CQ_SIZE(depth), + &nvmeq->cq_dma_addr, GFP_KERNEL); if (!nvmeq->cqes) goto free_nvmeq; - memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth)); nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth), &nvmeq->sq_dma_addr, GFP_KERNEL); if (!nvmeq->sq_cmds) goto free_cqdma; - if (qid && !zalloc_cpumask_var(&nvmeq->cpu_mask, GFP_KERNEL)) - goto free_sqdma; - nvmeq->q_dmadev = dmadev; nvmeq->dev = dev; snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", @@ -1296,23 +1197,15 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, spin_lock_init(&nvmeq->q_lock); nvmeq->cq_head = 0; nvmeq->cq_phase = 1; - init_waitqueue_head(&nvmeq->sq_full); - init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); - bio_list_init(&nvmeq->sq_cong); - INIT_LIST_HEAD(&nvmeq->iod_bio); nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; nvmeq->q_depth = depth; nvmeq->cq_vector = vector; nvmeq->qid = qid; - nvmeq->q_suspended = 1; dev->queue_count++; - rcu_assign_pointer(dev->queues[qid], nvmeq); + dev->queues[qid] = nvmeq; return nvmeq; - free_sqdma: - dma_free_coherent(dmadev, SQ_SIZE(depth), (void *)nvmeq->sq_cmds, - nvmeq->sq_dma_addr); free_cqdma: dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); @@ -1335,17 +1228,15 @@ static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) { struct nvme_dev *dev = nvmeq->dev; - unsigned extra = nvme_queue_extra(nvmeq->q_depth); + spin_lock_irq(&nvmeq->q_lock); nvmeq->sq_tail = 0; nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; - memset(nvmeq->cmdid_data, 0, extra); memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); - nvme_cancel_ios(nvmeq, false); - nvmeq->q_suspended = 0; dev->online_queues++; + spin_unlock_irq(&nvmeq->q_lock); } static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) @@ -1365,10 +1256,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) if (result < 0) goto release_sq; - spin_lock_irq(&nvmeq->q_lock); nvme_init_queue(nvmeq, qid); - spin_unlock_irq(&nvmeq->q_lock); - 
return result; release_sq: @@ -1408,27 +1296,32 @@ static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) */ static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap) { - u32 cc = readl(&dev->bar->cc); + dev->ctrl_config &= ~NVME_CC_SHN_MASK; + dev->ctrl_config &= ~NVME_CC_ENABLE; + writel(dev->ctrl_config, &dev->bar->cc); - if (cc & NVME_CC_ENABLE) - writel(cc & ~NVME_CC_ENABLE, &dev->bar->cc); return nvme_wait_ready(dev, cap, false); } static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap) { + dev->ctrl_config &= ~NVME_CC_SHN_MASK; + dev->ctrl_config |= NVME_CC_ENABLE; + writel(dev->ctrl_config, &dev->bar->cc); + return nvme_wait_ready(dev, cap, true); } static int nvme_shutdown_ctrl(struct nvme_dev *dev) { unsigned long timeout; - u32 cc; - cc = (readl(&dev->bar->cc) & ~NVME_CC_SHN_MASK) | NVME_CC_SHN_NORMAL; - writel(cc, &dev->bar->cc); + dev->ctrl_config &= ~NVME_CC_SHN_MASK; + dev->ctrl_config |= NVME_CC_SHN_NORMAL; - timeout = 2 * HZ + jiffies; + writel(dev->ctrl_config, &dev->bar->cc); + + timeout = SHUTDOWN_TIMEOUT + jiffies; while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) != NVME_CSTS_SHST_CMPLT) { msleep(100); @@ -1444,20 +1337,86 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev) return 0; } +static struct blk_mq_ops nvme_mq_admin_ops = { + .queue_rq = nvme_admin_queue_rq, + .map_queue = blk_mq_map_queue, + .init_hctx = nvme_admin_init_hctx, + .exit_hctx = nvme_exit_hctx, + .init_request = nvme_admin_init_request, + .timeout = nvme_timeout, +}; + +static struct blk_mq_ops nvme_mq_ops = { + .queue_rq = nvme_queue_rq, + .map_queue = blk_mq_map_queue, + .init_hctx = nvme_init_hctx, + .exit_hctx = nvme_exit_hctx, + .init_request = nvme_init_request, + .timeout = nvme_timeout, +}; + +static int nvme_alloc_admin_tags(struct nvme_dev *dev) +{ + if (!dev->admin_q) { + dev->admin_tagset.ops = &nvme_mq_admin_ops; + dev->admin_tagset.nr_hw_queues = 1; + dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; + dev->admin_tagset.timeout = ADMIN_TIMEOUT; + dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev); + dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info); + dev->admin_tagset.driver_data = dev; + + if (blk_mq_alloc_tag_set(&dev->admin_tagset)) + return -ENOMEM; + + dev->admin_q = blk_mq_init_queue(&dev->admin_tagset); + if (!dev->admin_q) { + blk_mq_free_tag_set(&dev->admin_tagset); + return -ENOMEM; + } + } + + return 0; +} + +static void nvme_free_admin_tags(struct nvme_dev *dev) +{ + if (dev->admin_q) + blk_mq_free_tag_set(&dev->admin_tagset); +} + static int nvme_configure_admin_queue(struct nvme_dev *dev) { int result; u32 aqa; u64 cap = readq(&dev->bar->cap); struct nvme_queue *nvmeq; + unsigned page_shift = PAGE_SHIFT; + unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12; + unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12; + + if (page_shift < dev_page_min) { + dev_err(&dev->pci_dev->dev, + "Minimum device page size (%u) too large for " + "host (%u)\n", 1 << dev_page_min, + 1 << page_shift); + return -ENODEV; + } + if (page_shift > dev_page_max) { + dev_info(&dev->pci_dev->dev, + "Device maximum page size (%u) smaller than " + "host (%u); enabling work-around\n", + 1 << dev_page_max, 1 << page_shift); + page_shift = dev_page_max; + } result = nvme_disable_ctrl(dev, cap); if (result < 0) return result; - nvmeq = raw_nvmeq(dev, 0); + nvmeq = dev->queues[0]; if (!nvmeq) { - nvmeq = nvme_alloc_queue(dev, 0, 64, 0); + nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, 0); if (!nvmeq) return -ENOMEM; } @@ -1465,27 +1424,35 @@ static int 
nvme_configure_admin_queue(struct nvme_dev *dev) aqa = nvmeq->q_depth - 1; aqa |= aqa << 16; - dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; - dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; + dev->page_size = 1 << page_shift; + + dev->ctrl_config = NVME_CC_CSS_NVM; + dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; writel(aqa, &dev->bar->aqa); writeq(nvmeq->sq_dma_addr, &dev->bar->asq); writeq(nvmeq->cq_dma_addr, &dev->bar->acq); - writel(dev->ctrl_config, &dev->bar->cc); result = nvme_enable_ctrl(dev, cap); if (result) - return result; + goto free_nvmeq; + + result = nvme_alloc_admin_tags(dev); + if (result) + goto free_nvmeq; result = queue_request_irq(dev, nvmeq, nvmeq->irqname); if (result) - return result; + goto free_tags; - spin_lock_irq(&nvmeq->q_lock); - nvme_init_queue(nvmeq, 0); - spin_unlock_irq(&nvmeq->q_lock); + return result; + + free_tags: + nvme_free_admin_tags(dev); + free_nvmeq: + nvme_free_queues(dev, 0); return result; } @@ -1516,7 +1483,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, } err = -ENOMEM; - iod = nvme_alloc_iod(count, length, GFP_KERNEL); + iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL); if (!iod) goto put_pages; @@ -1644,7 +1611,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) if (length != (io.nblocks + 1) << ns->lba_shift) status = -ENOMEM; else - status = nvme_submit_io_cmd(dev, &c, NULL); + status = nvme_submit_io_cmd(dev, ns, &c, NULL); if (meta_len) { if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) { @@ -1676,10 +1643,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return status; } -static int nvme_user_admin_cmd(struct nvme_dev *dev, - struct nvme_admin_cmd __user *ucmd) +static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, + struct nvme_passthru_cmd __user *ucmd) { - struct nvme_admin_cmd cmd; + struct nvme_passthru_cmd cmd; struct nvme_command c; int status, length; struct nvme_iod *uninitialized_var(iod); @@ -1716,10 +1683,23 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, timeout = cmd.timeout_ms ? 
msecs_to_jiffies(cmd.timeout_ms) : ADMIN_TIMEOUT; + if (length != cmd.data_len) status = -ENOMEM; - else - status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout); + else if (ns) { + struct request *req; + + req = blk_mq_alloc_request(ns->queue, WRITE, + (GFP_KERNEL|__GFP_WAIT), false); + if (IS_ERR(req)) + status = PTR_ERR(req); + else { + status = nvme_submit_sync_cmd(req, &c, &cmd.result, + timeout); + blk_mq_free_request(req); + } + } else + status = __nvme_submit_admin_cmd(dev, &c, &cmd.result, timeout); if (cmd.data_len) { nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); @@ -1743,7 +1723,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, force_successful_syscall_return(); return ns->ns_id; case NVME_IOCTL_ADMIN_CMD: - return nvme_user_admin_cmd(ns->dev, (void __user *)arg); + return nvme_user_cmd(ns->dev, NULL, (void __user *)arg); + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(ns->dev, ns, (void __user *)arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); case SG_GET_VERSION_NUM: @@ -1759,11 +1741,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - struct nvme_ns *ns = bdev->bd_disk->private_data; - switch (cmd) { case SG_IO: - return nvme_sg_io32(ns, arg); + return -ENOIOCTLCMD; } return nvme_ioctl(bdev, mode, cmd, arg); } @@ -1773,11 +1753,18 @@ static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, static int nvme_open(struct block_device *bdev, fmode_t mode) { - struct nvme_ns *ns = bdev->bd_disk->private_data; - struct nvme_dev *dev = ns->dev; + int ret = 0; + struct nvme_ns *ns; - kref_get(&dev->kref); - return 0; + spin_lock(&dev_list_lock); + ns = bdev->bd_disk->private_data; + if (!ns) + ret = -ENXIO; + else if (!kref_get_unless_zero(&ns->dev->kref)) + ret = -ENXIO; + spin_unlock(&dev_list_lock); + + return ret; } static void nvme_free_dev(struct kref *kref); @@ -1799,6 +1786,35 @@ static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo) return 0; } +static int nvme_revalidate_disk(struct gendisk *disk) +{ + struct nvme_ns *ns = disk->private_data; + struct nvme_dev *dev = ns->dev; + struct nvme_id_ns *id; + dma_addr_t dma_addr; + int lbaf; + + id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr, + GFP_KERNEL); + if (!id) { + dev_warn(&dev->pci_dev->dev, "%s: Memory alocation failure\n", + __func__); + return 0; + } + + if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) + goto free; + + lbaf = id->flbas & 0xf; + ns->lba_shift = id->lbaf[lbaf].ds; + + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); + set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + free: + dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr); + return 0; +} + static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, @@ -1806,43 +1822,9 @@ static const struct block_device_operations nvme_fops = { .open = nvme_open, .release = nvme_release, .getgeo = nvme_getgeo, + .revalidate_disk= nvme_revalidate_disk, }; -static void nvme_resubmit_iods(struct nvme_queue *nvmeq) -{ - struct nvme_iod *iod, *next; - - list_for_each_entry_safe(iod, next, &nvmeq->iod_bio, node) { - if (unlikely(nvme_submit_iod(nvmeq, iod))) - break; - list_del(&iod->node); - if (bio_list_empty(&nvmeq->sq_cong) && - list_empty(&nvmeq->iod_bio)) - remove_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - } -} - 
-static void nvme_resubmit_bios(struct nvme_queue *nvmeq) -{ - while (bio_list_peek(&nvmeq->sq_cong)) { - struct bio *bio = bio_list_pop(&nvmeq->sq_cong); - struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; - - if (bio_list_empty(&nvmeq->sq_cong) && - list_empty(&nvmeq->iod_bio)) - remove_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - if (nvme_submit_bio_queue(nvmeq, ns, bio)) { - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - bio_list_add_head(&nvmeq->sq_cong, bio); - break; - } - } -} - static int nvme_kthread(void *data) { struct nvme_dev *dev, *next; @@ -1858,28 +1840,26 @@ static int nvme_kthread(void *data) continue; list_del_init(&dev->node); dev_warn(&dev->pci_dev->dev, - "Failed status, reset controller\n"); + "Failed status: %x, reset controller\n", + readl(&dev->bar->csts)); dev->reset_workfn = nvme_reset_failed_dev; queue_work(nvme_workq, &dev->reset_work); continue; } - rcu_read_lock(); for (i = 0; i < dev->queue_count; i++) { - struct nvme_queue *nvmeq = - rcu_dereference(dev->queues[i]); + struct nvme_queue *nvmeq = dev->queues[i]; if (!nvmeq) continue; spin_lock_irq(&nvmeq->q_lock); - if (nvmeq->q_suspended) - goto unlock; nvme_process_cq(nvmeq); - nvme_cancel_ios(nvmeq, true); - nvme_resubmit_bios(nvmeq); - nvme_resubmit_iods(nvmeq); - unlock: + + while ((i == 0) && (dev->event_limit > 0)) { + if (nvme_submit_async_admin_req(dev)) + break; + dev->event_limit--; + } spin_unlock_irq(&nvmeq->q_lock); } - rcu_read_unlock(); } spin_unlock(&dev_list_lock); schedule_timeout(round_jiffies_relative(HZ)); @@ -1902,27 +1882,28 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, { struct nvme_ns *ns; struct gendisk *disk; + int node = dev_to_node(&dev->pci_dev->dev); int lbaf; if (rt->attributes & NVME_LBART_ATTRIB_HIDE) return NULL; - ns = kzalloc(sizeof(*ns), GFP_KERNEL); + ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) return NULL; - ns->queue = blk_alloc_queue(GFP_KERNEL); - if (!ns->queue) + ns->queue = blk_mq_init_queue(&dev->tagset); + if (IS_ERR(ns->queue)) goto out_free_ns; - ns->queue->queue_flags = QUEUE_FLAG_DEFAULT; queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); - blk_queue_make_request(ns->queue, nvme_make_request); + queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, ns->queue); ns->dev = dev; ns->queue->queuedata = ns; - disk = alloc_disk(0); + disk = alloc_disk_node(0, node); if (!disk) goto out_free_queue; + ns->ns_id = nsid; ns->disk = disk; lbaf = id->flbas & 0xf; @@ -1931,6 +1912,8 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); if (dev->max_hw_sectors) blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); + if (dev->stripe_size) + blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9); if (dev->vwc & NVME_CTRL_VWC_PRESENT) blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); @@ -1956,143 +1939,19 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, return NULL; } -static int nvme_find_closest_node(int node) -{ - int n, val, min_val = INT_MAX, best_node = node; - - for_each_online_node(n) { - if (n == node) - continue; - val = node_distance(node, n); - if (val < min_val) { - min_val = val; - best_node = n; - } - } - return best_node; -} - -static void nvme_set_queue_cpus(cpumask_t *qmask, struct nvme_queue *nvmeq, - int count) -{ - int cpu; - for_each_cpu(cpu, qmask) { - if 
(cpumask_weight(nvmeq->cpu_mask) >= count) - break; - if (!cpumask_test_and_set_cpu(cpu, nvmeq->cpu_mask)) - *per_cpu_ptr(nvmeq->dev->io_queue, cpu) = nvmeq->qid; - } -} - -static void nvme_add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus, - const cpumask_t *new_mask, struct nvme_queue *nvmeq, int cpus_per_queue) -{ - int next_cpu; - for_each_cpu(next_cpu, new_mask) { - cpumask_or(mask, mask, get_cpu_mask(next_cpu)); - cpumask_or(mask, mask, topology_thread_cpumask(next_cpu)); - cpumask_and(mask, mask, unassigned_cpus); - nvme_set_queue_cpus(mask, nvmeq, cpus_per_queue); - } -} - static void nvme_create_io_queues(struct nvme_dev *dev) { - unsigned i, max; + unsigned i; - max = min(dev->max_qid, num_online_cpus()); - for (i = dev->queue_count; i <= max; i++) + for (i = dev->queue_count; i <= dev->max_qid; i++) if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1)) break; - max = min(dev->queue_count - 1, num_online_cpus()); - for (i = dev->online_queues; i <= max; i++) - if (nvme_create_queue(raw_nvmeq(dev, i), i)) + for (i = dev->online_queues; i <= dev->queue_count - 1; i++) + if (nvme_create_queue(dev->queues[i], i)) break; } -/* - * If there are fewer queues than online cpus, this will try to optimally - * assign a queue to multiple cpus by grouping cpus that are "close" together: - * thread siblings, core, socket, closest node, then whatever else is - * available. - */ -static void nvme_assign_io_queues(struct nvme_dev *dev) -{ - unsigned cpu, cpus_per_queue, queues, remainder, i; - cpumask_var_t unassigned_cpus; - - nvme_create_io_queues(dev); - - queues = min(dev->online_queues - 1, num_online_cpus()); - if (!queues) - return; - - cpus_per_queue = num_online_cpus() / queues; - remainder = queues - (num_online_cpus() - queues * cpus_per_queue); - - if (!alloc_cpumask_var(&unassigned_cpus, GFP_KERNEL)) - return; - - cpumask_copy(unassigned_cpus, cpu_online_mask); - cpu = cpumask_first(unassigned_cpus); - for (i = 1; i <= queues; i++) { - struct nvme_queue *nvmeq = lock_nvmeq(dev, i); - cpumask_t mask; - - cpumask_clear(nvmeq->cpu_mask); - if (!cpumask_weight(unassigned_cpus)) { - unlock_nvmeq(nvmeq); - break; - } - - mask = *get_cpu_mask(cpu); - nvme_set_queue_cpus(&mask, nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - topology_thread_cpumask(cpu), - nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - topology_core_cpumask(cpu), - nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - cpumask_of_node(cpu_to_node(cpu)), - nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - cpumask_of_node( - nvme_find_closest_node( - cpu_to_node(cpu))), - nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - unassigned_cpus, - nvmeq, cpus_per_queue); - - WARN(cpumask_weight(nvmeq->cpu_mask) != cpus_per_queue, - "nvme%d qid:%d mis-matched queue-to-cpu assignment\n", - dev->instance, i); - - irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, - nvmeq->cpu_mask); - cpumask_andnot(unassigned_cpus, unassigned_cpus, - nvmeq->cpu_mask); - cpu = cpumask_next(cpu, unassigned_cpus); - if (remainder && !--remainder) - cpus_per_queue++; - unlock_nvmeq(nvmeq); - } - WARN(cpumask_weight(unassigned_cpus), "nvme%d unassigned online cpus\n", - dev->instance); - i = 0; - cpumask_andnot(unassigned_cpus, cpu_possible_mask, 
cpu_online_mask); - for_each_cpu(cpu, unassigned_cpus) - *per_cpu_ptr(dev->io_queue, cpu) = (i++ % queues) + 1; - free_cpumask_var(unassigned_cpus); -} - static int set_queue_count(struct nvme_dev *dev, int count) { int status; @@ -2106,7 +1965,7 @@ static int set_queue_count(struct nvme_dev *dev, int count) if (status > 0) { dev_err(&dev->pci_dev->dev, "Could not set queue count (%d)\n", status); - return -EBUSY; + return 0; } return min(result & 0xffff, result >> 16) + 1; } @@ -2116,39 +1975,15 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); } -static void nvme_cpu_workfn(struct work_struct *work) -{ - struct nvme_dev *dev = container_of(work, struct nvme_dev, cpu_work); - if (dev->initialized) - nvme_assign_io_queues(dev); -} - -static int nvme_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - struct nvme_dev *dev; - - switch (action) { - case CPU_ONLINE: - case CPU_DEAD: - spin_lock(&dev_list_lock); - list_for_each_entry(dev, &dev_list, node) - schedule_work(&dev->cpu_work); - spin_unlock(&dev_list_lock); - break; - } - return NOTIFY_OK; -} - static int nvme_setup_io_queues(struct nvme_dev *dev) { - struct nvme_queue *adminq = raw_nvmeq(dev, 0); + struct nvme_queue *adminq = dev->queues[0]; struct pci_dev *pdev = dev->pci_dev; int result, i, vecs, nr_io_queues, size; nr_io_queues = num_possible_cpus(); result = set_queue_count(dev, nr_io_queues); - if (result < 0) + if (result <= 0) return result; if (result < nr_io_queues) nr_io_queues = result; @@ -2171,6 +2006,13 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) /* Deregister the admin queue's interrupt */ free_irq(dev->entry[0].vector, adminq); + /* + * If we enable msix early due to not intx, disable it again before + * setting up the full range we need. 
+ */ + if (!pdev->irq) + pci_disable_msix(pdev); + for (i = 0; i < nr_io_queues; i++) dev->entry[i].entry = i; vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues); @@ -2194,14 +2036,12 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) dev->max_qid = nr_io_queues; result = queue_request_irq(dev, adminq, adminq->irqname); - if (result) { - adminq->q_suspended = 1; + if (result) goto free_queues; - } /* Free previously allocated queues that are no longer usable */ nvme_free_queues(dev, nr_io_queues + 1); - nvme_assign_io_queues(dev); + nvme_create_io_queues(dev); return 0; @@ -2244,14 +2084,37 @@ static int nvme_dev_add(struct nvme_dev *dev) dev->oncs = le16_to_cpup(&ctrl->oncs); dev->abort_limit = ctrl->acl + 1; dev->vwc = ctrl->vwc; + dev->event_limit = min(ctrl->aerl + 1, 8); memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); if (ctrl->mdts) dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); if ((pdev->vendor == PCI_VENDOR_ID_INTEL) && - (pdev->device == 0x0953) && ctrl->vs[3]) + (pdev->device == 0x0953) && ctrl->vs[3]) { + unsigned int max_hw_sectors; + dev->stripe_size = 1 << (ctrl->vs[3] + shift); + max_hw_sectors = dev->stripe_size >> (shift - 9); + if (dev->max_hw_sectors) { + dev->max_hw_sectors = min(max_hw_sectors, + dev->max_hw_sectors); + } else + dev->max_hw_sectors = max_hw_sectors; + } + + dev->tagset.ops = &nvme_mq_ops; + dev->tagset.nr_hw_queues = dev->online_queues - 1; + dev->tagset.timeout = NVME_IO_TIMEOUT; + dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev); + dev->tagset.queue_depth = + min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; + dev->tagset.cmd_size = sizeof(struct nvme_cmd_info); + dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; + dev->tagset.driver_data = dev; + + if (blk_mq_alloc_tag_set(&dev->tagset)) + goto out; id_ns = mem; for (i = 1; i <= nn; i++) { @@ -2292,6 +2155,9 @@ static int nvme_dev_map(struct nvme_dev *dev) dev->entry[0].vector = pdev->irq; pci_set_master(pdev); bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (!bars) + goto disable_pci; + if (pci_request_selected_regions(pdev, bars, "nvme")) goto disable_pci; @@ -2302,10 +2168,22 @@ static int nvme_dev_map(struct nvme_dev *dev) dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); if (!dev->bar) goto disable; + if (readl(&dev->bar->csts) == -1) { result = -ENODEV; goto unmap; } + + /* + * Some devices don't advertse INTx interrupts, pre-enable a single + * MSIX vec for setup. We'll adjust this later. 
+ */ + if (!pdev->irq) { + result = pci_enable_msix(pdev, dev->entry, 1); + if (result < 0) + goto unmap; + } + cap = readq(&dev->bar->cap); dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); dev->db_stride = 1 << NVME_CAP_STRIDE(cap); @@ -2401,7 +2279,8 @@ static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode, c.delete_queue.qid = cpu_to_le16(nvmeq->qid); init_kthread_work(&nvmeq->cmdinfo.work, fn); - return nvme_submit_admin_cmd_async(nvmeq->dev, &c, &nvmeq->cmdinfo); + return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo, + ADMIN_TIMEOUT); } static void nvme_del_cq_work_handler(struct kthread_work *work) @@ -2464,7 +2343,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev) atomic_set(&dq.refcount, 0); dq.worker = &worker; for (i = dev->queue_count - 1; i > 0; i--) { - struct nvme_queue *nvmeq = raw_nvmeq(dev, i); + struct nvme_queue *nvmeq = dev->queues[i]; if (nvme_suspend_queue(nvmeq)) continue; @@ -2500,13 +2379,16 @@ static void nvme_dev_list_remove(struct nvme_dev *dev) static void nvme_dev_shutdown(struct nvme_dev *dev) { int i; + u32 csts = -1; dev->initialized = 0; nvme_dev_list_remove(dev); - if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) { + if (dev->bar) + csts = readl(&dev->bar->csts); + if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { for (i = dev->queue_count - 1; i >= 0; i--) { - struct nvme_queue *nvmeq = raw_nvmeq(dev, i); + struct nvme_queue *nvmeq = dev->queues[i]; nvme_suspend_queue(nvmeq); nvme_clear_queue(nvmeq); } @@ -2518,6 +2400,12 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) nvme_dev_unmap(dev); } +static void nvme_dev_remove_admin(struct nvme_dev *dev) +{ + if (dev->admin_q && !blk_queue_dying(dev->admin_q)) + blk_cleanup_queue(dev->admin_q); +} + static void nvme_dev_remove(struct nvme_dev *dev) { struct nvme_ns *ns; @@ -2589,6 +2477,11 @@ static void nvme_free_namespaces(struct nvme_dev *dev) list_for_each_entry_safe(ns, next, &dev->namespaces, list) { list_del(&ns->list); + + spin_lock(&dev_list_lock); + ns->disk->private_data = NULL; + spin_unlock(&dev_list_lock); + put_disk(ns->disk); kfree(ns); } @@ -2598,8 +2491,10 @@ static void nvme_free_dev(struct kref *kref) { struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); + pci_dev_put(dev->pci_dev); nvme_free_namespaces(dev); - free_percpu(dev->io_queue); + nvme_release_instance(dev); + blk_mq_free_tag_set(&dev->tagset); kfree(dev->queues); kfree(dev->entry); kfree(dev); @@ -2624,9 +2519,16 @@ static int nvme_dev_release(struct inode *inode, struct file *f) static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) { struct nvme_dev *dev = f->private_data; + struct nvme_ns *ns; + switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_admin_cmd(dev, (void __user *)arg); + return nvme_user_cmd(dev, NULL, (void __user *)arg); + case NVME_IOCTL_IO_CMD: + if (list_empty(&dev->namespaces)) + return -ENOTTY; + ns = list_first_entry(&dev->namespaces, struct nvme_ns, list); + return nvme_user_cmd(dev, ns, (void __user *)arg); default: return -ENOTTY; } @@ -2640,6 +2542,22 @@ static const struct file_operations nvme_dev_fops = { .compat_ioctl = nvme_dev_ioctl, }; +static void nvme_set_irq_hints(struct nvme_dev *dev) +{ + struct nvme_queue *nvmeq; + int i; + + for (i = 0; i < dev->online_queues; i++) { + nvmeq = dev->queues[i]; + + if (!nvmeq->hctx) + continue; + + irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, + nvmeq->hctx->cpumask); + } +} + static int nvme_dev_start(struct nvme_dev 
*dev) { int result; @@ -2663,7 +2581,7 @@ static int nvme_dev_start(struct nvme_dev *dev) if (start_thread) { nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); - wake_up(&nvme_kthread_wait); + wake_up_all(&nvme_kthread_wait); } else wait_event_killable(nvme_kthread_wait, nvme_thread); @@ -2672,10 +2590,14 @@ static int nvme_dev_start(struct nvme_dev *dev) goto disable; } + nvme_init_queue(dev->queues[0], 0); + result = nvme_setup_io_queues(dev); - if (result && result != -EBUSY) + if (result) goto disable; + nvme_set_irq_hints(dev); + return result; disable: @@ -2692,7 +2614,7 @@ static int nvme_remove_dead_ctrl(void *arg) struct pci_dev *pdev = dev->pci_dev; if (pci_get_drvdata(pdev)) - pci_stop_and_remove_bus_device(pdev); + pci_stop_and_remove_bus_device_locked(pdev); kref_put(&dev->kref, nvme_free_dev); return 0; } @@ -2701,8 +2623,8 @@ static void nvme_remove_disks(struct work_struct *ws) { struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); - nvme_dev_remove(dev); nvme_free_queues(dev, 1); + nvme_dev_remove(dev); } static int nvme_dev_resume(struct nvme_dev *dev) @@ -2710,9 +2632,9 @@ static int nvme_dev_resume(struct nvme_dev *dev) int ret; ret = nvme_dev_start(dev); - if (ret && ret != -EBUSY) + if (ret) return ret; - if (ret == -EBUSY) { + if (dev->online_queues < 2) { spin_lock(&dev_list_lock); dev->reset_workfn = nvme_remove_disks; queue_work(nvme_workq, &dev->reset_work); @@ -2726,7 +2648,7 @@ static void nvme_dev_reset(struct nvme_dev *dev) { nvme_dev_shutdown(dev); if (nvme_dev_resume(dev)) { - dev_err(&dev->pci_dev->dev, "Device failed to resume\n"); + dev_warn(&dev->pci_dev->dev, "Device failed to resume\n"); kref_get(&dev->kref); if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", dev->instance))) { @@ -2751,33 +2673,33 @@ static void nvme_reset_workfn(struct work_struct *work) static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - int result = -ENOMEM; + int node, result = -ENOMEM; struct nvme_dev *dev; - dev = kzalloc(sizeof(*dev), GFP_KERNEL); + node = dev_to_node(&pdev->dev); + if (node == NUMA_NO_NODE) + set_dev_node(&pdev->dev, 0); + + dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) return -ENOMEM; - dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry), - GFP_KERNEL); + dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry), + GFP_KERNEL, node); if (!dev->entry) goto free; - dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *), - GFP_KERNEL); + dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *), + GFP_KERNEL, node); if (!dev->queues) goto free; - dev->io_queue = alloc_percpu(unsigned short); - if (!dev->io_queue) - goto free; INIT_LIST_HEAD(&dev->namespaces); dev->reset_workfn = nvme_reset_failed_dev; INIT_WORK(&dev->reset_work, nvme_reset_workfn); - INIT_WORK(&dev->cpu_work, nvme_cpu_workfn); - dev->pci_dev = pdev; + dev->pci_dev = pci_dev_get(pdev); pci_set_drvdata(pdev, dev); result = nvme_set_instance(dev); if (result) - goto free; + goto put_pci; result = nvme_setup_prp_pools(dev); if (result) @@ -2785,17 +2707,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) kref_init(&dev->kref); result = nvme_dev_start(dev); - if (result) { - if (result == -EBUSY) - goto create_cdev; + if (result) goto release_pools; - } - result = nvme_dev_add(dev); + if (dev->online_queues > 1) + result = nvme_dev_add(dev); if (result) goto shutdown; - create_cdev: scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance); 
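/*
 * nvme_probe() above now places its per-device allocations on the PCI
 * device's NUMA node. A minimal sketch of that pattern for any PCI probe
 * path (the function name is illustrative):
 */
#include <linux/pci.h>
#include <linux/slab.h>

static void *alloc_on_device_node(struct pci_dev *pdev, size_t size)
{
	int node = dev_to_node(&pdev->dev);

	/* Fall back to node 0 when firmware reported no proximity domain. */
	if (node == NUMA_NO_NODE)
		set_dev_node(&pdev->dev, 0);

	/* Keep hot per-device state local to the adapter's memory node. */
	return kzalloc_node(size, GFP_KERNEL, dev_to_node(&pdev->dev));
}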
dev->miscdev.minor = MISC_DYNAMIC_MINOR; dev->miscdev.parent = &pdev->dev; @@ -2805,11 +2724,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto remove; + nvme_set_irq_hints(dev); + dev->initialized = 1; return 0; remove: nvme_dev_remove(dev); + nvme_dev_remove_admin(dev); nvme_free_namespaces(dev); shutdown: nvme_dev_shutdown(dev); @@ -2818,8 +2740,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) nvme_release_prp_pools(dev); release: nvme_release_instance(dev); + put_pci: + pci_dev_put(dev->pci_dev); free: - free_percpu(dev->io_queue); kfree(dev->queues); kfree(dev->entry); kfree(dev); @@ -2828,12 +2751,12 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) { - struct nvme_dev *dev = pci_get_drvdata(pdev); + struct nvme_dev *dev = pci_get_drvdata(pdev); - if (prepare) - nvme_dev_shutdown(dev); - else - nvme_dev_resume(dev); + if (prepare) + nvme_dev_shutdown(dev); + else + nvme_dev_resume(dev); } static void nvme_shutdown(struct pci_dev *pdev) @@ -2852,13 +2775,12 @@ static void nvme_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); flush_work(&dev->reset_work); - flush_work(&dev->cpu_work); misc_deregister(&dev->miscdev); nvme_dev_remove(dev); nvme_dev_shutdown(dev); + nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); - rcu_barrier(); - nvme_release_instance(dev); + nvme_free_admin_tags(dev); nvme_release_prp_pools(dev); kref_put(&dev->kref, nvme_free_dev); } @@ -2941,18 +2863,11 @@ static int __init nvme_init(void) else if (result > 0) nvme_major = result; - nvme_nb.notifier_call = &nvme_cpu_notify; - result = register_hotcpu_notifier(&nvme_nb); - if (result) - goto unregister_blkdev; - result = pci_register_driver(&nvme_driver); if (result) - goto unregister_hotcpu; + goto unregister_blkdev; return 0; - unregister_hotcpu: - unregister_hotcpu_notifier(&nvme_nb); unregister_blkdev: unregister_blkdev(nvme_major, "nvme"); kill_workq: @@ -2972,6 +2887,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>"); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.9"); +MODULE_VERSION("1.0"); module_init(nvme_init); module_exit(nvme_exit); diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index a4cd6d691c63..5e78568026c3 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -329,7 +329,7 @@ INQUIRY_EVPD_BIT_MASK) ? 1 : 0) (GET_U32_FROM_CDB(cdb, READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET)) #define IS_READ_CAP_16(cdb) \ -((cdb[0] == SERVICE_ACTION_IN && cdb[1] == SAI_READ_CAPACITY_16) ? 1 : 0) +((cdb[0] == SERVICE_ACTION_IN_16 && cdb[1] == SAI_READ_CAPACITY_16) ? 1 : 0) /* Request Sense Helper Macros */ #define GET_REQUEST_SENSE_ALLOC_LENGTH(cdb) \ @@ -2105,7 +2105,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, nvme_offset += unit_num_blocks; - nvme_sc = nvme_submit_io_cmd(dev, &c, NULL); + nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL); if (nvme_sc != NVME_SC_SUCCESS) { nvme_unmap_user_pages(dev, (is_write) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE, @@ -2658,7 +2658,7 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.common.opcode = nvme_cmd_flush; c.common.nsid = cpu_to_le32(ns->ns_id); - nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL); + nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL); res = nvme_trans_status_code(hdr, nvme_sc); if (res) goto out; @@ -2686,7 +2686,7 @@ static int nvme_trans_synchronize_cache(struct nvme_ns *ns, c.common.opcode = nvme_cmd_flush; c.common.nsid = cpu_to_le32(ns->ns_id); - nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL); + nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL); res = nvme_trans_status_code(hdr, nvme_sc); if (res) @@ -2894,7 +2894,7 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.dsm.nr = cpu_to_le32(ndesc - 1); c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - nvme_sc = nvme_submit_io_cmd(dev, &c, NULL); + nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL); res = nvme_trans_status_code(hdr, nvme_sc); dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range), @@ -2915,6 +2915,14 @@ static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr) if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; + /* + * Prime the hdr with good status for scsi commands that don't require + * an nvme command for translation. + */ + retcode = nvme_trans_status_code(hdr, NVME_SC_SUCCESS); + if (retcode) + return retcode; + opcode = cmd[0]; switch (opcode) { @@ -2947,7 +2955,7 @@ static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr) case READ_CAPACITY: retcode = nvme_trans_read_capacity(ns, hdr, cmd); break; - case SERVICE_ACTION_IN: + case SERVICE_ACTION_IN_16: if (IS_READ_CAP_16(cmd)) retcode = nvme_trans_read_capacity(ns, hdr, cmd); else @@ -3016,152 +3024,6 @@ int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr) return retcode; } -#ifdef CONFIG_COMPAT -typedef struct sg_io_hdr32 { - compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */ - compat_int_t dxfer_direction; /* [i] data transfer direction */ - unsigned char cmd_len; /* [i] SCSI command length ( <= 16 bytes) */ - unsigned char mx_sb_len; /* [i] max length to write to sbp */ - unsigned short iovec_count; /* [i] 0 implies no scatter gather */ - compat_uint_t dxfer_len; /* [i] byte count of data transfer */ - compat_uint_t dxferp; /* [i], [*io] points to data transfer memory - or scatter gather list */ - compat_uptr_t cmdp; /* [i], [*i] points to command to perform */ - compat_uptr_t sbp; /* [i], [*o] points to sense_buffer memory */ - compat_uint_t timeout; /* [i] MAX_UINT->no timeout (unit: millisec) */ - compat_uint_t flags; /* [i] 0 -> default, see SG_FLAG... 
*/ - compat_int_t pack_id; /* [i->o] unused internally (normally) */ - compat_uptr_t usr_ptr; /* [i->o] unused internally */ - unsigned char status; /* [o] scsi status */ - unsigned char masked_status; /* [o] shifted, masked scsi status */ - unsigned char msg_status; /* [o] messaging level data (optional) */ - unsigned char sb_len_wr; /* [o] byte count actually written to sbp */ - unsigned short host_status; /* [o] errors from host adapter */ - unsigned short driver_status; /* [o] errors from software driver */ - compat_int_t resid; /* [o] dxfer_len - actual_transferred */ - compat_uint_t duration; /* [o] time taken by cmd (unit: millisec) */ - compat_uint_t info; /* [o] auxiliary information */ -} sg_io_hdr32_t; /* 64 bytes long (on sparc32) */ - -typedef struct sg_iovec32 { - compat_uint_t iov_base; - compat_uint_t iov_len; -} sg_iovec32_t; - -static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iovec_count) -{ - sg_iovec_t __user *iov = (sg_iovec_t __user *) (sgio + 1); - sg_iovec32_t __user *iov32 = dxferp; - int i; - - for (i = 0; i < iovec_count; i++) { - u32 base, len; - - if (get_user(base, &iov32[i].iov_base) || - get_user(len, &iov32[i].iov_len) || - put_user(compat_ptr(base), &iov[i].iov_base) || - put_user(len, &iov[i].iov_len)) - return -EFAULT; - } - - if (put_user(iov, &sgio->dxferp)) - return -EFAULT; - return 0; -} - -int nvme_sg_io32(struct nvme_ns *ns, unsigned long arg) -{ - sg_io_hdr32_t __user *sgio32 = (sg_io_hdr32_t __user *)arg; - sg_io_hdr_t __user *sgio; - u16 iovec_count; - u32 data; - void __user *dxferp; - int err; - int interface_id; - - if (get_user(interface_id, &sgio32->interface_id)) - return -EFAULT; - if (interface_id != 'S') - return -EINVAL; - - if (get_user(iovec_count, &sgio32->iovec_count)) - return -EFAULT; - - { - void __user *top = compat_alloc_user_space(0); - void __user *new = compat_alloc_user_space(sizeof(sg_io_hdr_t) + - (iovec_count * sizeof(sg_iovec_t))); - if (new > top) - return -EINVAL; - - sgio = new; - } - - /* Ok, now construct. 
*/ - if (copy_in_user(&sgio->interface_id, &sgio32->interface_id, - (2 * sizeof(int)) + - (2 * sizeof(unsigned char)) + - (1 * sizeof(unsigned short)) + - (1 * sizeof(unsigned int)))) - return -EFAULT; - - if (get_user(data, &sgio32->dxferp)) - return -EFAULT; - dxferp = compat_ptr(data); - if (iovec_count) { - if (sg_build_iovec(sgio, dxferp, iovec_count)) - return -EFAULT; - } else { - if (put_user(dxferp, &sgio->dxferp)) - return -EFAULT; - } - - { - unsigned char __user *cmdp; - unsigned char __user *sbp; - - if (get_user(data, &sgio32->cmdp)) - return -EFAULT; - cmdp = compat_ptr(data); - - if (get_user(data, &sgio32->sbp)) - return -EFAULT; - sbp = compat_ptr(data); - - if (put_user(cmdp, &sgio->cmdp) || - put_user(sbp, &sgio->sbp)) - return -EFAULT; - } - - if (copy_in_user(&sgio->timeout, &sgio32->timeout, - 3 * sizeof(int))) - return -EFAULT; - - if (get_user(data, &sgio32->usr_ptr)) - return -EFAULT; - if (put_user(compat_ptr(data), &sgio->usr_ptr)) - return -EFAULT; - - err = nvme_sg_io(ns, sgio); - if (err >= 0) { - void __user *datap; - - if (copy_in_user(&sgio32->pack_id, &sgio->pack_id, - sizeof(int)) || - get_user(datap, &sgio->usr_ptr) || - put_user((u32)(unsigned long)datap, - &sgio32->usr_ptr) || - copy_in_user(&sgio32->status, &sgio->status, - (4 * sizeof(unsigned char)) + - (2 * sizeof(unsigned short)) + - (3 * sizeof(int)))) - err = -EFAULT; - } - - return err; -} -#endif - int nvme_sg_get_version_num(int __user *ip) { return put_user(sg_version_num, ip); diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 719cb1bc1640..3b7c9f1be484 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -69,8 +69,8 @@ nice This parameter controls the driver's use of idle CPU time, at the expense of some speed. - If this driver is built into the kernel, you can use kernel - the following command line parameters, with the same values + If this driver is built into the kernel, you can use the + following kernel command line parameters, with the same values as the corresponding module parameters listed above: pcd.drive0 diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index fea7e76a00de..d48715b287e6 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -722,6 +722,8 @@ static int pd_special_command(struct pd_unit *disk, int err = 0; rq = blk_get_request(disk->gd->queue, READ, __GFP_WAIT); + if (IS_ERR(rq)) + return PTR_ERR(rq); rq->cmd_type = REQ_TYPE_SPECIAL; rq->special = func; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 758ac442c5b5..09e628dafd9d 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -704,6 +704,8 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? 
WRITE : READ, __GFP_WAIT); + if (IS_ERR(rq)) + return PTR_ERR(rq); blk_rq_set_block_pc(rq); if (cgc->buflen) { diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4b97baf8afa3..3ec85dfce124 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -210,6 +210,12 @@ enum obj_request_type { OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES }; +enum obj_operation_type { + OBJ_OP_WRITE, + OBJ_OP_READ, + OBJ_OP_DISCARD, +}; + enum obj_req_flags { OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ @@ -276,6 +282,7 @@ enum img_req_flags { IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ + IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */ }; struct rbd_img_request { @@ -335,7 +342,6 @@ struct rbd_device { struct list_head rq_queue; /* incoming rq queue */ spinlock_t lock; /* queue, flags, open_count */ - struct workqueue_struct *rq_wq; struct work_struct rq_work; struct rbd_image_header header; @@ -395,6 +401,8 @@ static struct kmem_cache *rbd_segment_name_cache; static int rbd_major; static DEFINE_IDA(rbd_dev_id_ida); +static struct workqueue_struct *rbd_wq; + /* * Default to false for now, as single-major requires >= 0.75 version of * userspace rbd utility. @@ -785,6 +793,20 @@ static int parse_rbd_opts_token(char *c, void *private) return 0; } +static char* obj_op_name(enum obj_operation_type op_type) +{ + switch (op_type) { + case OBJ_OP_READ: + return "read"; + case OBJ_OP_WRITE: + return "write"; + case OBJ_OP_DISCARD: + return "discard"; + default: + return "???"; + } +} + /* * Get a ceph client with specific addr and configuration, if one does * not exist create it. Either way, ceph_opts is consumed by this @@ -1600,6 +1622,21 @@ static bool img_request_write_test(struct rbd_img_request *img_request) return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; } +/* + * Set the discard flag when the img_request is an discard request + */ +static void img_request_discard_set(struct rbd_img_request *img_request) +{ + set_bit(IMG_REQ_DISCARD, &img_request->flags); + smp_mb(); +} + +static bool img_request_discard_test(struct rbd_img_request *img_request) +{ + smp_mb(); + return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0; +} + static void img_request_child_set(struct rbd_img_request *img_request) { set_bit(IMG_REQ_CHILD, &img_request->flags); @@ -1636,6 +1673,17 @@ static bool img_request_layered_test(struct rbd_img_request *img_request) return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; } +static enum obj_operation_type +rbd_img_request_op_type(struct rbd_img_request *img_request) +{ + if (img_request_write_test(img_request)) + return OBJ_OP_WRITE; + else if (img_request_discard_test(img_request)) + return OBJ_OP_DISCARD; + else + return OBJ_OP_READ; +} + static void rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) { @@ -1722,6 +1770,21 @@ static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) obj_request_done_set(obj_request); } +static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request) +{ + dout("%s: obj %p result %d %llu\n", __func__, obj_request, + obj_request->result, obj_request->length); + /* + * There is no such thing as a successful short discard. Set + * it to our originally-requested length. 
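/*
 * The pd.c and pktcdvd.c hunks above exist because blk_get_request() now
 * reports failure as an ERR_PTR() rather than NULL. A minimal sketch of the
 * calling convention, modelled on pd_special_command(); the function name is
 * illustrative and the queue's request handler is assumed to understand
 * REQ_TYPE_SPECIAL:
 */
#include <linux/blkdev.h>

static int issue_special_request(struct request_queue *q, struct gendisk *disk)
{
	struct request *rq;
	int err;

	rq = blk_get_request(q, READ, __GFP_WAIT);
	if (IS_ERR(rq))				/* not a NULL check any more */
		return PTR_ERR(rq);

	rq->cmd_type = REQ_TYPE_SPECIAL;
	err = blk_execute_rq(q, disk, rq, 0);	/* insert and wait */
	blk_put_request(rq);
	return err;
}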
+ */ + obj_request->xferred = obj_request->length; + /* discarding a non-existent object is not a problem */ + if (obj_request->result == -ENOENT) + obj_request->result = 0; + obj_request_done_set(obj_request); +} + /* * For a simple stat call there's nothing to do. We'll do more if * this is part of a write sequence for a layered image. @@ -1773,6 +1836,11 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, case CEPH_OSD_OP_STAT: rbd_osd_stat_callback(obj_request); break; + case CEPH_OSD_OP_DELETE: + case CEPH_OSD_OP_TRUNCATE: + case CEPH_OSD_OP_ZERO: + rbd_osd_discard_callback(obj_request); + break; case CEPH_OSD_OP_CALL: case CEPH_OSD_OP_NOTIFY_ACK: case CEPH_OSD_OP_WATCH: @@ -1823,7 +1891,7 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) */ static struct ceph_osd_request *rbd_osd_req_create( struct rbd_device *rbd_dev, - bool write_request, + enum obj_operation_type op_type, unsigned int num_ops, struct rbd_obj_request *obj_request) { @@ -1831,16 +1899,18 @@ static struct ceph_osd_request *rbd_osd_req_create( struct ceph_osd_client *osdc; struct ceph_osd_request *osd_req; - if (obj_request_img_data_test(obj_request)) { + if (obj_request_img_data_test(obj_request) && + (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) { struct rbd_img_request *img_request = obj_request->img_request; - - rbd_assert(write_request == - img_request_write_test(img_request)); - if (write_request) - snapc = img_request->snapc; + if (op_type == OBJ_OP_WRITE) { + rbd_assert(img_request_write_test(img_request)); + } else { + rbd_assert(img_request_discard_test(img_request)); + } + snapc = img_request->snapc; } - rbd_assert(num_ops == 1 || (write_request && num_ops == 2)); + rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2)); /* Allocate and initialize the request, for the num_ops ops */ @@ -1850,7 +1920,7 @@ static struct ceph_osd_request *rbd_osd_req_create( if (!osd_req) return NULL; /* ENOMEM */ - if (write_request) + if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; else osd_req->r_flags = CEPH_OSD_FLAG_READ; @@ -1865,9 +1935,10 @@ static struct ceph_osd_request *rbd_osd_req_create( } /* - * Create a copyup osd request based on the information in the - * object request supplied. A copyup request has three osd ops, - * a copyup method call, a hint op, and a write op. + * Create a copyup osd request based on the information in the object + * request supplied. A copyup request has two or three osd ops, a + * copyup method call, potentially a hint op, and a write or truncate + * or zero op. 
*/ static struct ceph_osd_request * rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) @@ -1877,18 +1948,24 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) struct rbd_device *rbd_dev; struct ceph_osd_client *osdc; struct ceph_osd_request *osd_req; + int num_osd_ops = 3; rbd_assert(obj_request_img_data_test(obj_request)); img_request = obj_request->img_request; rbd_assert(img_request); - rbd_assert(img_request_write_test(img_request)); + rbd_assert(img_request_write_test(img_request) || + img_request_discard_test(img_request)); - /* Allocate and initialize the request, for the three ops */ + if (img_request_discard_test(img_request)) + num_osd_ops = 2; + + /* Allocate and initialize the request, for all the ops */ snapc = img_request->snapc; rbd_dev = img_request->rbd_dev; osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, 3, false, GFP_ATOMIC); + osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, + false, GFP_ATOMIC); if (!osd_req) return NULL; /* ENOMEM */ @@ -2057,7 +2134,8 @@ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) static struct rbd_img_request *rbd_img_request_create( struct rbd_device *rbd_dev, u64 offset, u64 length, - bool write_request) + enum obj_operation_type op_type, + struct ceph_snap_context *snapc) { struct rbd_img_request *img_request; @@ -2065,20 +2143,17 @@ static struct rbd_img_request *rbd_img_request_create( if (!img_request) return NULL; - if (write_request) { - down_read(&rbd_dev->header_rwsem); - ceph_get_snap_context(rbd_dev->header.snapc); - up_read(&rbd_dev->header_rwsem); - } - img_request->rq = NULL; img_request->rbd_dev = rbd_dev; img_request->offset = offset; img_request->length = length; img_request->flags = 0; - if (write_request) { + if (op_type == OBJ_OP_DISCARD) { + img_request_discard_set(img_request); + img_request->snapc = snapc; + } else if (op_type == OBJ_OP_WRITE) { img_request_write_set(img_request); - img_request->snapc = rbd_dev->header.snapc; + img_request->snapc = snapc; } else { img_request->snap_id = rbd_dev->spec->snap_id; } @@ -2093,8 +2168,7 @@ static struct rbd_img_request *rbd_img_request_create( kref_init(&img_request->kref); dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, - write_request ? 
"write" : "read", offset, length, - img_request); + obj_op_name(op_type), offset, length, img_request); return img_request; } @@ -2118,7 +2192,8 @@ static void rbd_img_request_destroy(struct kref *kref) rbd_dev_parent_put(img_request->rbd_dev); } - if (img_request_write_test(img_request)) + if (img_request_write_test(img_request) || + img_request_discard_test(img_request)) ceph_put_snap_context(img_request->snapc); kmem_cache_free(rbd_img_request_cache, img_request); @@ -2134,8 +2209,8 @@ static struct rbd_img_request *rbd_parent_request_create( rbd_assert(obj_request->img_request); rbd_dev = obj_request->img_request->rbd_dev; - parent_request = rbd_img_request_create(rbd_dev->parent, - img_offset, length, false); + parent_request = rbd_img_request_create(rbd_dev->parent, img_offset, + length, OBJ_OP_READ, NULL); if (!parent_request) return NULL; @@ -2176,11 +2251,18 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) result = obj_request->result; if (result) { struct rbd_device *rbd_dev = img_request->rbd_dev; + enum obj_operation_type op_type; + + if (img_request_discard_test(img_request)) + op_type = OBJ_OP_DISCARD; + else if (img_request_write_test(img_request)) + op_type = OBJ_OP_WRITE; + else + op_type = OBJ_OP_READ; rbd_warn(rbd_dev, "%s %llx at %llx (%llx)", - img_request_write_test(img_request) ? "write" : "read", - obj_request->length, obj_request->img_offset, - obj_request->offset); + obj_op_name(op_type), obj_request->length, + obj_request->img_offset, obj_request->offset); rbd_warn(rbd_dev, " result %d xferred %x", result, xferred); if (!img_request->result) @@ -2245,6 +2327,71 @@ out: } /* + * Add individual osd ops to the given ceph_osd_request and prepare + * them for submission. num_ops is the current number of + * osd operations already to the object request. 
+ */ +static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, + struct ceph_osd_request *osd_request, + enum obj_operation_type op_type, + unsigned int num_ops) +{ + struct rbd_img_request *img_request = obj_request->img_request; + struct rbd_device *rbd_dev = img_request->rbd_dev; + u64 object_size = rbd_obj_bytes(&rbd_dev->header); + u64 offset = obj_request->offset; + u64 length = obj_request->length; + u64 img_end; + u16 opcode; + + if (op_type == OBJ_OP_DISCARD) { + if (!offset && length == object_size && + (!img_request_layered_test(img_request) || + !obj_request_overlaps_parent(obj_request))) { + opcode = CEPH_OSD_OP_DELETE; + } else if ((offset + length == object_size)) { + opcode = CEPH_OSD_OP_TRUNCATE; + } else { + down_read(&rbd_dev->header_rwsem); + img_end = rbd_dev->header.image_size; + up_read(&rbd_dev->header_rwsem); + + if (obj_request->img_offset + length == img_end) + opcode = CEPH_OSD_OP_TRUNCATE; + else + opcode = CEPH_OSD_OP_ZERO; + } + } else if (op_type == OBJ_OP_WRITE) { + opcode = CEPH_OSD_OP_WRITE; + osd_req_op_alloc_hint_init(osd_request, num_ops, + object_size, object_size); + num_ops++; + } else { + opcode = CEPH_OSD_OP_READ; + } + + if (opcode == CEPH_OSD_OP_DELETE) + osd_req_op_init(osd_request, num_ops, opcode); + else + osd_req_op_extent_init(osd_request, num_ops, opcode, + offset, length, 0, 0); + + if (obj_request->type == OBJ_REQUEST_BIO) + osd_req_op_extent_osd_data_bio(osd_request, num_ops, + obj_request->bio_list, length); + else if (obj_request->type == OBJ_REQUEST_PAGES) + osd_req_op_extent_osd_data_pages(osd_request, num_ops, + obj_request->pages, length, + offset & ~PAGE_MASK, false, false); + + /* Discards are also writes */ + if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) + rbd_osd_req_format_write(obj_request); + else + rbd_osd_req_format_read(obj_request); +} + +/* * Split up an image request into one or more object requests, each * to a different object. The "type" parameter indicates whether * "data_desc" is the pointer to the head of a list of bio @@ -2259,28 +2406,26 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, struct rbd_device *rbd_dev = img_request->rbd_dev; struct rbd_obj_request *obj_request = NULL; struct rbd_obj_request *next_obj_request; - bool write_request = img_request_write_test(img_request); struct bio *bio_list = NULL; unsigned int bio_offset = 0; struct page **pages = NULL; + enum obj_operation_type op_type; u64 img_offset; u64 resid; - u16 opcode; dout("%s: img %p type %d data_desc %p\n", __func__, img_request, (int)type, data_desc); - opcode = write_request ? 
CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ; img_offset = img_request->offset; resid = img_request->length; rbd_assert(resid > 0); + op_type = rbd_img_request_op_type(img_request); if (type == OBJ_REQUEST_BIO) { bio_list = data_desc; rbd_assert(img_offset == bio_list->bi_iter.bi_sector << SECTOR_SHIFT); - } else { - rbd_assert(type == OBJ_REQUEST_PAGES); + } else if (type == OBJ_REQUEST_PAGES) { pages = data_desc; } @@ -2289,7 +2434,6 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, const char *object_name; u64 offset; u64 length; - unsigned int which = 0; object_name = rbd_segment_name(rbd_dev, img_offset); if (!object_name) @@ -2321,7 +2465,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, GFP_ATOMIC); if (!obj_request->bio_list) goto out_unwind; - } else { + } else if (type == OBJ_REQUEST_PAGES) { unsigned int page_count; obj_request->pages = pages; @@ -2332,38 +2476,19 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, pages += page_count; } - osd_req = rbd_osd_req_create(rbd_dev, write_request, - (write_request ? 2 : 1), - obj_request); + osd_req = rbd_osd_req_create(rbd_dev, op_type, + (op_type == OBJ_OP_WRITE) ? 2 : 1, + obj_request); if (!osd_req) goto out_unwind; + obj_request->osd_req = osd_req; obj_request->callback = rbd_img_obj_callback; - rbd_img_request_get(img_request); - - if (write_request) { - osd_req_op_alloc_hint_init(osd_req, which, - rbd_obj_bytes(&rbd_dev->header), - rbd_obj_bytes(&rbd_dev->header)); - which++; - } - - osd_req_op_extent_init(osd_req, which, opcode, offset, length, - 0, 0); - if (type == OBJ_REQUEST_BIO) - osd_req_op_extent_osd_data_bio(osd_req, which, - obj_request->bio_list, length); - else - osd_req_op_extent_osd_data_pages(osd_req, which, - obj_request->pages, length, - offset & ~PAGE_MASK, false, false); + obj_request->img_offset = img_offset; - if (write_request) - rbd_osd_req_format_write(obj_request); - else - rbd_osd_req_format_read(obj_request); + rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0); - obj_request->img_offset = img_offset; + rbd_img_request_get(img_request); img_offset += length; resid -= length; @@ -2386,7 +2511,8 @@ rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request) struct page **pages; u32 page_count; - rbd_assert(obj_request->type == OBJ_REQUEST_BIO); + rbd_assert(obj_request->type == OBJ_REQUEST_BIO || + obj_request->type == OBJ_REQUEST_NODATA); rbd_assert(obj_request_img_data_test(obj_request)); img_request = obj_request->img_request; rbd_assert(img_request); @@ -2424,11 +2550,10 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) struct ceph_osd_client *osdc; struct rbd_device *rbd_dev; struct page **pages; + enum obj_operation_type op_type; u32 page_count; int img_result; u64 parent_length; - u64 offset; - u64 length; rbd_assert(img_request_child_test(img_request)); @@ -2492,26 +2617,10 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, false, false); - /* Then the hint op */ - - osd_req_op_alloc_hint_init(osd_req, 1, rbd_obj_bytes(&rbd_dev->header), - rbd_obj_bytes(&rbd_dev->header)); + /* Add the other op(s) */ - /* And the original write request op */ - - offset = orig_request->offset; - length = orig_request->length; - osd_req_op_extent_init(osd_req, 2, CEPH_OSD_OP_WRITE, - offset, length, 0, 0); - if (orig_request->type == OBJ_REQUEST_BIO) - osd_req_op_extent_osd_data_bio(osd_req, 2, - orig_request->bio_list, 
length); - else - osd_req_op_extent_osd_data_pages(osd_req, 2, - orig_request->pages, length, - offset & ~PAGE_MASK, false, false); - - rbd_osd_req_format_write(orig_request); + op_type = rbd_img_request_op_type(orig_request->img_request); + rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1); /* All set, send it off. */ @@ -2728,7 +2837,7 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) rbd_assert(obj_request->img_request); rbd_dev = obj_request->img_request->rbd_dev; - stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, + stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, stat_request); if (!stat_request->osd_req) goto out; @@ -2748,11 +2857,10 @@ out: return ret; } -static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) +static bool img_obj_request_simple(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request; struct rbd_device *rbd_dev; - bool known; rbd_assert(obj_request_img_data_test(obj_request)); @@ -2760,22 +2868,44 @@ static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) rbd_assert(img_request); rbd_dev = img_request->rbd_dev; + /* Reads */ + if (!img_request_write_test(img_request) && + !img_request_discard_test(img_request)) + return true; + + /* Non-layered writes */ + if (!img_request_layered_test(img_request)) + return true; + /* - * Only writes to layered images need special handling. - * Reads and non-layered writes are simple object requests. - * Layered writes that start beyond the end of the overlap - * with the parent have no parent data, so they too are - * simple object requests. Finally, if the target object is - * known to already exist, its parent data has already been - * copied, so a write to the object can also be handled as a - * simple object request. + * Layered writes outside of the parent overlap range don't + * share any data with the parent. */ - if (!img_request_write_test(img_request) || - !img_request_layered_test(img_request) || - !obj_request_overlaps_parent(obj_request) || - ((known = obj_request_known_test(obj_request)) && - obj_request_exists_test(obj_request))) { + if (!obj_request_overlaps_parent(obj_request)) + return true; + /* + * Entire-object layered writes - we will overwrite whatever + * parent data there is anyway. + */ + if (!obj_request->offset && + obj_request->length == rbd_obj_bytes(&rbd_dev->header)) + return true; + + /* + * If the object is known to already exist, its parent data has + * already been copied. + */ + if (obj_request_known_test(obj_request) && + obj_request_exists_test(obj_request)) + return true; + + return false; +} + +static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) +{ + if (img_obj_request_simple(obj_request)) { struct rbd_device *rbd_dev; struct ceph_osd_client *osdc; @@ -2791,7 +2921,7 @@ static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) * start by reading the data for the full target object from * the parent so we can use it for a copyup to the target. */ - if (known) + if (obj_request_known_test(obj_request)) return rbd_img_obj_parent_read_full(obj_request); /* We don't know whether the target exists. Go find out. 
*/ @@ -2932,7 +3062,7 @@ static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id) return -ENOMEM; ret = -ENOMEM; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, + obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, obj_request); if (!obj_request->osd_req) goto out; @@ -2995,7 +3125,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper( if (!obj_request) return ERR_PTR(-ENOMEM); - obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, + obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1, obj_request); if (!obj_request->osd_req) { ret = -ENOMEM; @@ -3133,7 +3263,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, obj_request->pages = pages; obj_request->page_count = page_count; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, + obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, obj_request); if (!obj_request->osd_req) goto out; @@ -3183,11 +3313,20 @@ out: static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) { struct rbd_img_request *img_request; + struct ceph_snap_context *snapc = NULL; u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; u64 length = blk_rq_bytes(rq); - bool wr = rq_data_dir(rq) == WRITE; + enum obj_operation_type op_type; + u64 mapping_size; int result; + if (rq->cmd_flags & REQ_DISCARD) + op_type = OBJ_OP_DISCARD; + else if (rq->cmd_flags & REQ_WRITE) + op_type = OBJ_OP_WRITE; + else + op_type = OBJ_OP_READ; + /* Ignore/skip any zero-length requests */ if (!length) { @@ -3196,9 +3335,9 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) goto err_rq; } - /* Disallow writes to a read-only device */ + /* Only reads are allowed to a read-only device */ - if (wr) { + if (op_type != OBJ_OP_READ) { if (rbd_dev->mapping.read_only) { result = -EROFS; goto err_rq; @@ -3226,21 +3365,35 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) goto err_rq; /* Shouldn't happen */ } - if (offset + length > rbd_dev->mapping.size) { + down_read(&rbd_dev->header_rwsem); + mapping_size = rbd_dev->mapping.size; + if (op_type != OBJ_OP_READ) { + snapc = rbd_dev->header.snapc; + ceph_get_snap_context(snapc); + } + up_read(&rbd_dev->header_rwsem); + + if (offset + length > mapping_size) { rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, - length, rbd_dev->mapping.size); + length, mapping_size); result = -EIO; goto err_rq; } - img_request = rbd_img_request_create(rbd_dev, offset, length, wr); + img_request = rbd_img_request_create(rbd_dev, offset, length, op_type, + snapc); if (!img_request) { result = -ENOMEM; goto err_rq; } img_request->rq = rq; - result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, rq->bio); + if (op_type == OBJ_OP_DISCARD) + result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, + NULL); + else + result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, + rq->bio); if (result) goto err_img_request; @@ -3255,7 +3408,8 @@ err_img_request: err_rq: if (result) rbd_warn(rbd_dev, "%s %llx at %llx result %d", - wr ? 
"write" : "read", length, offset, result); + obj_op_name(op_type), length, offset, result); + ceph_put_snap_context(snapc); blk_end_request_all(rq, result); } @@ -3302,7 +3456,7 @@ static void rbd_request_fn(struct request_queue *q) } if (queued) - queue_work(rbd_dev->rq_wq, &rbd_dev->rq_work); + queue_work(rbd_wq, &rbd_dev->rq_work); } /* @@ -3382,7 +3536,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, page_count = (u32) calc_pages_for(offset, length); pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); if (IS_ERR(pages)) - ret = PTR_ERR(pages); + return PTR_ERR(pages); ret = -ENOMEM; obj_request = rbd_obj_request_create(object_name, offset, length, @@ -3393,7 +3547,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, obj_request->pages = pages; obj_request->page_count = page_count; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, + obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, obj_request); if (!obj_request->osd_req) goto out; @@ -3610,6 +3764,13 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) blk_queue_io_min(q, segment_size); blk_queue_io_opt(q, segment_size); + /* enable the discard support */ + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + q->limits.discard_granularity = segment_size; + q->limits.discard_alignment = segment_size; + q->limits.max_discard_sectors = segment_size / SECTOR_SIZE; + q->limits.discard_zeroes_data = 1; + blk_queue_merge_bvec(q, rbd_merge_bvec); disk->queue = q; @@ -4924,7 +5085,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) ret = image_id ? 0 : -ENOMEM; if (!ret) rbd_dev->image_format = 1; - } else if (ret > sizeof (__le32)) { + } else if (ret >= 0) { void *p = response; image_id = ceph_extract_encoded_string(&p, p + ret, @@ -4932,8 +5093,6 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) ret = PTR_ERR_OR_ZERO(image_id); if (!ret) rbd_dev->image_format = 2; - } else { - ret = -EINVAL; } if (!ret) { @@ -5087,15 +5246,9 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); - rbd_dev->rq_wq = alloc_workqueue("%s", 0, 0, rbd_dev->disk->disk_name); - if (!rbd_dev->rq_wq) { - ret = -ENOMEM; - goto err_out_mapping; - } - ret = rbd_bus_add_dev(rbd_dev); if (ret) - goto err_out_workqueue; + goto err_out_mapping; /* Everything's ready. Announce the disk to the world. */ @@ -5107,9 +5260,6 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) return ret; -err_out_workqueue: - destroy_workqueue(rbd_dev->rq_wq); - rbd_dev->rq_wq = NULL; err_out_mapping: rbd_dev_mapping_clear(rbd_dev); err_out_disk: @@ -5356,7 +5506,6 @@ static void rbd_dev_device_release(struct device *dev) { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - destroy_workqueue(rbd_dev->rq_wq); rbd_free_disk(rbd_dev); clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); rbd_dev_mapping_clear(rbd_dev); @@ -5560,11 +5709,21 @@ static int __init rbd_init(void) if (rc) return rc; + /* + * The number of active work items is limited by the number of + * rbd devices, so leave @max_active at default. 
+ */ + rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0); + if (!rbd_wq) { + rc = -ENOMEM; + goto err_out_slab; + } + if (single_major) { rbd_major = register_blkdev(0, RBD_DRV_NAME); if (rbd_major < 0) { rc = rbd_major; - goto err_out_slab; + goto err_out_wq; } } @@ -5582,6 +5741,8 @@ static int __init rbd_init(void) err_out_blkdev: if (single_major) unregister_blkdev(rbd_major, RBD_DRV_NAME); +err_out_wq: + destroy_workqueue(rbd_wq); err_out_slab: rbd_slab_exit(); return rc; @@ -5593,6 +5754,7 @@ static void __exit rbd_exit(void) rbd_sysfs_cleanup(); if (single_major) unregister_blkdev(rbd_major, RBD_DRV_NAME); + destroy_workqueue(rbd_wq); rbd_slab_exit(); } diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index 820b4009d5f7..d8b2488aaade 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c @@ -62,12 +62,6 @@ static DEFINE_SPINLOCK(rsxx_ida_lock); /* --------------------Debugfs Setup ------------------- */ -struct rsxx_cram { - u32 f_pos; - u32 offset; - void *i_private; -}; - static int rsxx_attr_pci_regs_show(struct seq_file *m, void *p) { struct rsxx_cardinfo *card = m->private; @@ -184,93 +178,50 @@ static int rsxx_attr_pci_regs_open(struct inode *inode, struct file *file) static ssize_t rsxx_cram_read(struct file *fp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct rsxx_cram *info = fp->private_data; - struct rsxx_cardinfo *card = info->i_private; + struct rsxx_cardinfo *card = file_inode(fp)->i_private; char *buf; - int st; + ssize_t st; - buf = kzalloc(sizeof(*buf) * cnt, GFP_KERNEL); + buf = kzalloc(cnt, GFP_KERNEL); if (!buf) return -ENOMEM; - info->f_pos = (u32)*ppos + info->offset; - - st = rsxx_creg_read(card, CREG_ADD_CRAM + info->f_pos, cnt, buf, 1); - if (st) - return st; - - st = copy_to_user(ubuf, buf, cnt); + st = rsxx_creg_read(card, CREG_ADD_CRAM + (u32)*ppos, cnt, buf, 1); + if (!st) + st = copy_to_user(ubuf, buf, cnt); + kfree(buf); if (st) return st; - - info->offset += cnt; - - kfree(buf); - + *ppos += cnt; return cnt; } static ssize_t rsxx_cram_write(struct file *fp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct rsxx_cram *info = fp->private_data; - struct rsxx_cardinfo *card = info->i_private; + struct rsxx_cardinfo *card = file_inode(fp)->i_private; char *buf; - int st; + ssize_t st; - buf = kzalloc(sizeof(*buf) * cnt, GFP_KERNEL); + buf = kzalloc(cnt, GFP_KERNEL); if (!buf) return -ENOMEM; st = copy_from_user(buf, ubuf, cnt); + if (!st) + st = rsxx_creg_write(card, CREG_ADD_CRAM + (u32)*ppos, cnt, + buf, 1); + kfree(buf); if (st) return st; - - info->f_pos = (u32)*ppos + info->offset; - - st = rsxx_creg_write(card, CREG_ADD_CRAM + info->f_pos, cnt, buf, 1); - if (st) - return st; - - info->offset += cnt; - - kfree(buf); - + *ppos += cnt; return cnt; } -static int rsxx_cram_open(struct inode *inode, struct file *file) -{ - struct rsxx_cram *info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return -ENOMEM; - - info->i_private = inode->i_private; - info->f_pos = file->f_pos; - file->private_data = info; - - return 0; -} - -static int rsxx_cram_release(struct inode *inode, struct file *file) -{ - struct rsxx_cram *info = file->private_data; - - if (!info) - return 0; - - kfree(info); - file->private_data = NULL; - - return 0; -} - static const struct file_operations debugfs_cram_fops = { .owner = THIS_MODULE, - .open = rsxx_cram_open, .read = rsxx_cram_read, .write = rsxx_cram_write, - .release = rsxx_cram_release, }; static const struct file_operations debugfs_stats_fops = { @@ -886,7 
+837,7 @@ static int rsxx_pci_probe(struct pci_dev *dev, "Failed to enable MSI\n"); } - st = request_irq(dev->irq, rsxx_isr, IRQF_DISABLED | IRQF_SHARED, + st = request_irq(dev->irq, rsxx_isr, IRQF_SHARED, DRIVER_NAME, card); if (st) { dev_err(CARD_TO_DEV(card), diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index 2839d37e5af7..ac8c62cb4875 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -112,37 +112,16 @@ static const struct block_device_operations rsxx_fops = { static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio) { - struct hd_struct *part0 = &card->gendisk->part0; - int rw = bio_data_dir(bio); - int cpu; - - cpu = part_stat_lock(); - - part_round_stats(cpu, part0); - part_inc_in_flight(part0, rw); - - part_stat_unlock(); + generic_start_io_acct(bio_data_dir(bio), bio_sectors(bio), + &card->gendisk->part0); } static void disk_stats_complete(struct rsxx_cardinfo *card, struct bio *bio, unsigned long start_time) { - struct hd_struct *part0 = &card->gendisk->part0; - unsigned long duration = jiffies - start_time; - int rw = bio_data_dir(bio); - int cpu; - - cpu = part_stat_lock(); - - part_stat_add(cpu, part0, sectors[rw], bio_sectors(bio)); - part_stat_inc(cpu, part0, ios[rw]); - part_stat_add(cpu, part0, ticks[rw], duration); - - part_round_stats(cpu, part0); - part_dec_in_flight(part0, rw); - - part_stat_unlock(); + generic_end_io_acct(bio_data_dir(bio), &card->gendisk->part0, + start_time); } static void bio_dma_done_cb(struct rsxx_cardinfo *card, @@ -307,6 +286,7 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card) blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, card->queue); + queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, card->queue); if (rsxx_discard_supported(card)) { queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, card->queue); blk_queue_max_discard_sectors(card->queue, diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 8fcdcfb4b472..1e46eb2305c0 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -4426,6 +4426,7 @@ static int skd_cons_disk(struct skd_device *skdev) q->limits.discard_zeroes_data = 1; queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); + queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); spin_lock_irqsave(&skdev->lock, flags); pr_debug("%s:%s:%d stopping %s queue\n", diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 5814deb6963d..4b911ed96ea3 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -9,6 +9,7 @@ #include <linux/blkdev.h> #include <linux/hdreg.h> #include <linux/genhd.h> +#include <linux/cdrom.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> @@ -22,8 +23,8 @@ #define DRV_MODULE_NAME "sunvdc" #define PFX DRV_MODULE_NAME ": " -#define DRV_MODULE_VERSION "1.0" -#define DRV_MODULE_RELDATE "June 25, 2007" +#define DRV_MODULE_VERSION "1.2" +#define DRV_MODULE_RELDATE "November 24, 2014" static char version[] = DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; @@ -32,13 +33,15 @@ MODULE_DESCRIPTION("Sun LDOM virtual disk client driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_MODULE_VERSION); -#define VDC_TX_RING_SIZE 256 +#define VDC_TX_RING_SIZE 512 #define WAITING_FOR_LINK_UP 0x01 #define WAITING_FOR_TX_SPACE 0x02 #define WAITING_FOR_GEN_CMD 0x04 #define WAITING_FOR_ANY -1 +static struct workqueue_struct *sunvdc_wq; + struct vdc_req_entry { struct request *req; }; @@ 
-59,19 +62,25 @@ struct vdc_port { u64 max_xfer_size; u32 vdisk_block_size; + u64 ldc_timeout; + struct timer_list ldc_reset_timer; + struct work_struct ldc_reset_work; + /* The server fills these in for us in the disk attribute * ACK packet. */ u64 operations; u32 vdisk_size; u8 vdisk_type; + u8 vdisk_mtype; char disk_name[32]; - - struct vio_disk_geom geom; - struct vio_disk_vtoc label; }; +static void vdc_ldc_reset(struct vdc_port *port); +static void vdc_ldc_reset_work(struct work_struct *work); +static void vdc_ldc_reset_timer(unsigned long _arg); + static inline struct vdc_port *to_vdc_port(struct vio_driver_state *vio) { return container_of(vio, struct vdc_port, vio); @@ -79,9 +88,16 @@ static inline struct vdc_port *to_vdc_port(struct vio_driver_state *vio) /* Ordered from largest major to lowest */ static struct vio_version vdc_versions[] = { + { .major = 1, .minor = 1 }, { .major = 1, .minor = 0 }, }; +static inline int vdc_version_supported(struct vdc_port *port, + u16 major, u16 minor) +{ + return port->vio.ver.major == major && port->vio.ver.minor >= minor; +} + #define VDCBLK_NAME "vdisk" static int vdc_major; #define PARTITION_SHIFT 3 @@ -94,20 +110,71 @@ static inline u32 vdc_tx_dring_avail(struct vio_dring_state *dr) static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo) { struct gendisk *disk = bdev->bd_disk; - struct vdc_port *port = disk->private_data; + sector_t nsect = get_capacity(disk); + sector_t cylinders = nsect; - geo->heads = (u8) port->geom.num_hd; - geo->sectors = (u8) port->geom.num_sec; - geo->cylinders = port->geom.num_cyl; + geo->heads = 0xff; + geo->sectors = 0x3f; + sector_div(cylinders, geo->heads * geo->sectors); + geo->cylinders = cylinders; + if ((sector_t)(geo->cylinders + 1) * geo->heads * geo->sectors < nsect) + geo->cylinders = 0xffff; return 0; } +/* Add ioctl/CDROM_GET_CAPABILITY to support cdrom_id in udev + * when vdisk_mtype is VD_MEDIA_TYPE_CD or VD_MEDIA_TYPE_DVD. + * Needed to be able to install inside an ldom from an iso image. + */ +static int vdc_ioctl(struct block_device *bdev, fmode_t mode, + unsigned command, unsigned long argument) +{ + int i; + struct gendisk *disk; + + switch (command) { + case CDROMMULTISESSION: + pr_debug(PFX "Multisession CDs not supported\n"); + for (i = 0; i < sizeof(struct cdrom_multisession); i++) + if (put_user(0, (char __user *)(argument + i))) + return -EFAULT; + return 0; + + case CDROM_GET_CAPABILITY: + disk = bdev->bd_disk; + + if (bdev->bd_disk && (disk->flags & GENHD_FL_CD)) + return 0; + return -EINVAL; + + default: + pr_debug(PFX "ioctl %08x not supported\n", command); + return -EINVAL; + } +} + static const struct block_device_operations vdc_fops = { .owner = THIS_MODULE, .getgeo = vdc_getgeo, + .ioctl = vdc_ioctl, }; +static void vdc_blk_queue_start(struct vdc_port *port) +{ + struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + + /* restart blk queue when ring is half emptied. also called after + * handshake completes, so check for initial handshake before we've + * allocated a disk. 
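/*
 * The vdc_ioctl() hunk above answers CDROM_GET_CAPABILITY so that udev's
 * cdrom_id can recognize a virtual CD/DVD exported by the LDOM disk service.
 * A minimal user-space sketch of the probe this enables (the function name
 * and device path are illustrative):
 */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/cdrom.h>

static int is_virtual_cdrom(const char *path)	/* e.g. "/dev/vdiska" */
{
	int fd, ret;

	fd = open(path, O_RDONLY | O_NONBLOCK);
	if (fd < 0)
		return 0;
	/* vdc_ioctl() returns 0 only when the disk carries GENHD_FL_CD. */
	ret = ioctl(fd, CDROM_GET_CAPABILITY, 0);
	close(fd);
	return ret >= 0;
}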
+ */ + if (port->disk && blk_queue_stopped(port->disk->queue) && + vdc_tx_dring_avail(dr) * 100 / VDC_TX_RING_SIZE >= 50) { + blk_start_queue(port->disk->queue); + } + +} + static void vdc_finish(struct vio_driver_state *vio, int err, int waiting_for) { if (vio->cmp && @@ -121,7 +188,11 @@ static void vdc_finish(struct vio_driver_state *vio, int err, int waiting_for) static void vdc_handshake_complete(struct vio_driver_state *vio) { + struct vdc_port *port = to_vdc_port(vio); + + del_timer(&port->ldc_reset_timer); vdc_finish(vio, 0, WAITING_FOR_LINK_UP); + vdc_blk_queue_start(port); } static int vdc_handle_unknown(struct vdc_port *port, void *arg) @@ -165,9 +236,9 @@ static int vdc_handle_attr(struct vio_driver_state *vio, void *arg) struct vio_disk_attr_info *pkt = arg; viodbg(HS, "GOT ATTR stype[0x%x] ops[%llx] disk_size[%llu] disk_type[%x] " - "xfer_mode[0x%x] blksz[%u] max_xfer[%llu]\n", + "mtype[0x%x] xfer_mode[0x%x] blksz[%u] max_xfer[%llu]\n", pkt->tag.stype, pkt->operations, - pkt->vdisk_size, pkt->vdisk_type, + pkt->vdisk_size, pkt->vdisk_type, pkt->vdisk_mtype, pkt->xfer_mode, pkt->vdisk_block_size, pkt->max_xfer_size); @@ -192,8 +263,11 @@ static int vdc_handle_attr(struct vio_driver_state *vio, void *arg) } port->operations = pkt->operations; - port->vdisk_size = pkt->vdisk_size; port->vdisk_type = pkt->vdisk_type; + if (vdc_version_supported(port, 1, 1)) { + port->vdisk_size = pkt->vdisk_size; + port->vdisk_mtype = pkt->vdisk_mtype; + } if (pkt->max_xfer_size < port->max_xfer_size) port->max_xfer_size = pkt->max_xfer_size; port->vdisk_block_size = pkt->vdisk_block_size; @@ -224,7 +298,7 @@ static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr, ldc_unmap(port->vio.lp, desc->cookies, desc->ncookies); desc->hdr.state = VIO_DESC_FREE; - dr->cons = (index + 1) & (VDC_TX_RING_SIZE - 1); + dr->cons = vio_dring_next(dr, index); req = rqe->req; if (req == NULL) { @@ -236,8 +310,7 @@ static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr, __blk_end_request(req, (desc->status ? 
-EIO : 0), desc->size); - if (blk_queue_stopped(port->disk->queue)) - blk_start_queue(port->disk->queue); + vdc_blk_queue_start(port); } static int vdc_ack(struct vdc_port *port, void *msgbuf) @@ -270,17 +343,20 @@ static void vdc_event(void *arg, int event) spin_lock_irqsave(&vio->lock, flags); - if (unlikely(event == LDC_EVENT_RESET || - event == LDC_EVENT_UP)) { + if (unlikely(event == LDC_EVENT_RESET)) { vio_link_state_change(vio, event); - spin_unlock_irqrestore(&vio->lock, flags); - return; + queue_work(sunvdc_wq, &port->ldc_reset_work); + goto out; + } + + if (unlikely(event == LDC_EVENT_UP)) { + vio_link_state_change(vio, event); + goto out; } if (unlikely(event != LDC_EVENT_DATA_READY)) { - printk(KERN_WARNING PFX "Unexpected LDC event %d\n", event); - spin_unlock_irqrestore(&vio->lock, flags); - return; + pr_warn(PFX "Unexpected LDC event %d\n", event); + goto out; } err = 0; @@ -324,6 +400,7 @@ static void vdc_event(void *arg, int event) } if (err < 0) vdc_finish(&port->vio, err, WAITING_FOR_ANY); +out: spin_unlock_irqrestore(&vio->lock, flags); } @@ -356,6 +433,8 @@ static int __vdc_tx_trigger(struct vdc_port *port) delay = 128; } while (err == -EAGAIN); + if (err == -ENOTCONN) + vdc_ldc_reset(port); return err; } @@ -388,12 +467,6 @@ static int __send_request(struct request *req) for (i = 0; i < nsg; i++) len += sg[i].length; - if (unlikely(vdc_tx_dring_avail(dr) < 1)) { - blk_stop_queue(port->disk->queue); - err = -ENOMEM; - goto out; - } - desc = vio_dring_cur(dr); err = ldc_map_sg(port->vio.lp, sg, nsg, @@ -431,23 +504,34 @@ static int __send_request(struct request *req) printk(KERN_ERR PFX "vdc_tx_trigger() failure, err=%d\n", err); } else { port->req_id++; - dr->prod = (dr->prod + 1) & (VDC_TX_RING_SIZE - 1); + dr->prod = vio_dring_next(dr, dr->prod); } -out: return err; } -static void do_vdc_request(struct request_queue *q) +static void do_vdc_request(struct request_queue *rq) { - while (1) { - struct request *req = blk_fetch_request(q); + struct request *req; - if (!req) - break; + while ((req = blk_peek_request(rq)) != NULL) { + struct vdc_port *port; + struct vio_dring_state *dr; + + port = req->rq_disk->private_data; + dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + if (unlikely(vdc_tx_dring_avail(dr) < 1)) + goto wait; + + blk_start_request(req); - if (__send_request(req) < 0) - __blk_end_request_all(req, -EIO); + if (__send_request(req) < 0) { + blk_requeue_request(rq, req); +wait: + /* Avoid pointless unplugs. 
*/ + blk_stop_queue(rq); + break; + } } } @@ -574,7 +658,7 @@ static int generic_request(struct vdc_port *port, u8 op, void *buf, int len) err = __vdc_tx_trigger(port); if (err >= 0) { port->req_id++; - dr->prod = (dr->prod + 1) & (VDC_TX_RING_SIZE - 1); + dr->prod = vio_dring_next(dr, dr->prod); spin_unlock_irqrestore(&port->vio.lock, flags); wait_for_completion(&comp.com); @@ -638,12 +722,9 @@ static void vdc_free_tx_ring(struct vdc_port *port) } } -static int probe_disk(struct vdc_port *port) +static int vdc_port_up(struct vdc_port *port) { struct vio_completion comp; - struct request_queue *q; - struct gendisk *g; - int err; init_completion(&comp.com); comp.err = 0; @@ -651,29 +732,48 @@ static int probe_disk(struct vdc_port *port) port->vio.cmp = ∁ vio_port_up(&port->vio); - wait_for_completion(&comp.com); - if (comp.err) - return comp.err; + return comp.err; +} - err = generic_request(port, VD_OP_GET_VTOC, - &port->label, sizeof(port->label)); - if (err < 0) { - printk(KERN_ERR PFX "VD_OP_GET_VTOC returns error %d\n", err); - return err; - } +static void vdc_port_down(struct vdc_port *port) +{ + ldc_disconnect(port->vio.lp); + ldc_unbind(port->vio.lp); + vdc_free_tx_ring(port); + vio_ldc_free(&port->vio); +} - err = generic_request(port, VD_OP_GET_DISKGEOM, - &port->geom, sizeof(port->geom)); - if (err < 0) { - printk(KERN_ERR PFX "VD_OP_GET_DISKGEOM returns " - "error %d\n", err); +static int probe_disk(struct vdc_port *port) +{ + struct request_queue *q; + struct gendisk *g; + int err; + + err = vdc_port_up(port); + if (err) return err; - } - port->vdisk_size = ((u64)port->geom.num_cyl * - (u64)port->geom.num_hd * - (u64)port->geom.num_sec); + if (vdc_version_supported(port, 1, 1)) { + /* vdisk_size should be set during the handshake, if it wasn't + * then the underlying disk is reserved by another system + */ + if (port->vdisk_size == -1) + return -ENODEV; + } else { + struct vio_disk_geom geom; + + err = generic_request(port, VD_OP_GET_DISKGEOM, + &geom, sizeof(geom)); + if (err < 0) { + printk(KERN_ERR PFX "VD_OP_GET_DISKGEOM returns " + "error %d\n", err); + return err; + } + port->vdisk_size = ((u64)geom.num_cyl * + (u64)geom.num_hd * + (u64)geom.num_sec); + } q = blk_init_queue(do_vdc_request, &port->vio.lock); if (!q) { @@ -691,6 +791,10 @@ static int probe_disk(struct vdc_port *port) port->disk = g; + /* Each segment in a request is up to an aligned page in size. 
*/ + blk_queue_segment_boundary(q, PAGE_SIZE - 1); + blk_queue_max_segment_size(q, PAGE_SIZE); + blk_queue_max_segments(q, port->ring_cookies); blk_queue_max_hw_sectors(q, port->max_xfer_size); g->major = vdc_major; @@ -704,9 +808,32 @@ static int probe_disk(struct vdc_port *port) set_capacity(g, port->vdisk_size); - printk(KERN_INFO PFX "%s: %u sectors (%u MB)\n", + if (vdc_version_supported(port, 1, 1)) { + switch (port->vdisk_mtype) { + case VD_MEDIA_TYPE_CD: + pr_info(PFX "Virtual CDROM %s\n", port->disk_name); + g->flags |= GENHD_FL_CD; + g->flags |= GENHD_FL_REMOVABLE; + set_disk_ro(g, 1); + break; + + case VD_MEDIA_TYPE_DVD: + pr_info(PFX "Virtual DVD %s\n", port->disk_name); + g->flags |= GENHD_FL_CD; + g->flags |= GENHD_FL_REMOVABLE; + set_disk_ro(g, 1); + break; + + case VD_MEDIA_TYPE_FIXED: + pr_info(PFX "Virtual Hard disk %s\n", port->disk_name); + break; + } + } + + pr_info(PFX "%s: %u sectors (%u MB) protocol %d.%d\n", g->disk_name, - port->vdisk_size, (port->vdisk_size >> (20 - 9))); + port->vdisk_size, (port->vdisk_size >> (20 - 9)), + port->vio.ver.major, port->vio.ver.minor); add_disk(g); @@ -738,6 +865,7 @@ static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id) struct mdesc_handle *hp; struct vdc_port *port; int err; + const u64 *ldc_timeout; print_version(); @@ -765,6 +893,17 @@ static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id) else snprintf(port->disk_name, sizeof(port->disk_name), VDCBLK_NAME "%c", 'a' + ((int)vdev->dev_no % 26)); + port->vdisk_size = -1; + + /* Actual wall time may be double due to do_generic_file_read() doing + * a readahead I/O first, and once that fails it will try to read a + * single page. + */ + ldc_timeout = mdesc_get_property(hp, vdev->mp, "vdc-timeout", NULL); + port->ldc_timeout = ldc_timeout ? 
*ldc_timeout : 0; + setup_timer(&port->ldc_reset_timer, vdc_ldc_reset_timer, + (unsigned long)port); + INIT_WORK(&port->ldc_reset_work, vdc_ldc_reset_work); err = vio_driver_init(&port->vio, vdev, VDEV_DISK, vdc_versions, ARRAY_SIZE(vdc_versions), @@ -814,8 +953,21 @@ static int vdc_port_remove(struct vio_dev *vdev) struct vdc_port *port = dev_get_drvdata(&vdev->dev); if (port) { + unsigned long flags; + + spin_lock_irqsave(&port->vio.lock, flags); + blk_stop_queue(port->disk->queue); + spin_unlock_irqrestore(&port->vio.lock, flags); + + flush_work(&port->ldc_reset_work); + del_timer_sync(&port->ldc_reset_timer); del_timer_sync(&port->vio.timer); + del_gendisk(port->disk); + blk_cleanup_queue(port->disk->queue); + put_disk(port->disk); + port->disk = NULL; + vdc_free_tx_ring(port); vio_ldc_free(&port->vio); @@ -826,6 +978,102 @@ static int vdc_port_remove(struct vio_dev *vdev) return 0; } +static void vdc_requeue_inflight(struct vdc_port *port) +{ + struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + u32 idx; + + for (idx = dr->cons; idx != dr->prod; idx = vio_dring_next(dr, idx)) { + struct vio_disk_desc *desc = vio_dring_entry(dr, idx); + struct vdc_req_entry *rqe = &port->rq_arr[idx]; + struct request *req; + + ldc_unmap(port->vio.lp, desc->cookies, desc->ncookies); + desc->hdr.state = VIO_DESC_FREE; + dr->cons = vio_dring_next(dr, idx); + + req = rqe->req; + if (req == NULL) { + vdc_end_special(port, desc); + continue; + } + + rqe->req = NULL; + blk_requeue_request(port->disk->queue, req); + } +} + +static void vdc_queue_drain(struct vdc_port *port) +{ + struct request *req; + + while ((req = blk_fetch_request(port->disk->queue)) != NULL) + __blk_end_request_all(req, -EIO); +} + +static void vdc_ldc_reset_timer(unsigned long _arg) +{ + struct vdc_port *port = (struct vdc_port *) _arg; + struct vio_driver_state *vio = &port->vio; + unsigned long flags; + + spin_lock_irqsave(&vio->lock, flags); + if (!(port->vio.hs_state & VIO_HS_COMPLETE)) { + pr_warn(PFX "%s ldc down %llu seconds, draining queue\n", + port->disk_name, port->ldc_timeout); + vdc_queue_drain(port); + vdc_blk_queue_start(port); + } + spin_unlock_irqrestore(&vio->lock, flags); +} + +static void vdc_ldc_reset_work(struct work_struct *work) +{ + struct vdc_port *port; + struct vio_driver_state *vio; + unsigned long flags; + + port = container_of(work, struct vdc_port, ldc_reset_work); + vio = &port->vio; + + spin_lock_irqsave(&vio->lock, flags); + vdc_ldc_reset(port); + spin_unlock_irqrestore(&vio->lock, flags); +} + +static void vdc_ldc_reset(struct vdc_port *port) +{ + int err; + + assert_spin_locked(&port->vio.lock); + + pr_warn(PFX "%s ldc link reset\n", port->disk_name); + blk_stop_queue(port->disk->queue); + vdc_requeue_inflight(port); + vdc_port_down(port); + + err = vio_ldc_alloc(&port->vio, &vdc_ldc_cfg, port); + if (err) { + pr_err(PFX "%s vio_ldc_alloc:%d\n", port->disk_name, err); + return; + } + + err = vdc_alloc_tx_ring(port); + if (err) { + pr_err(PFX "%s vio_alloc_tx_ring:%d\n", port->disk_name, err); + goto err_free_ldc; + } + + if (port->ldc_timeout) + mod_timer(&port->ldc_reset_timer, + round_jiffies(jiffies + HZ * port->ldc_timeout)); + mod_timer(&port->vio.timer, round_jiffies(jiffies + HZ)); + return; + +err_free_ldc: + vio_ldc_free(&port->vio); +} + static const struct vio_device_id vdc_port_match[] = { { .type = "vdc-port", @@ -845,9 +1093,13 @@ static int __init vdc_init(void) { int err; + sunvdc_wq = alloc_workqueue("sunvdc", 0, 0); + if (!sunvdc_wq) + return -ENOMEM; + err = 
register_blkdev(0, VDCBLK_NAME); if (err < 0) - goto out_err; + goto out_free_wq; vdc_major = err; @@ -861,7 +1113,8 @@ out_unregister_blkdev: unregister_blkdev(vdc_major, VDCBLK_NAME); vdc_major = 0; -out_err: +out_free_wq: + destroy_workqueue(sunvdc_wq); return err; } @@ -869,6 +1122,7 @@ static void __exit vdc_exit(void) { vio_unregister_driver(&vdc_port_driver); unregister_blkdev(vdc_major, VDCBLK_NAME); + destroy_workqueue(sunvdc_wq); } module_init(vdc_init); diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 6b44bbe528b7..b5afd495d482 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -971,7 +971,6 @@ static struct platform_driver swim_driver = { .remove = swim_remove, .driver = { .name = CARDNAME, - .owner = THIS_MODULE, }, }; diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index d5e2d12b9d9e..5d552857de41 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -568,7 +568,7 @@ static struct carm_request *carm_get_special(struct carm_host *host) return NULL; rq = blk_get_request(host->oob_q, WRITE /* bogus */, GFP_KERNEL); - if (!rq) { + if (IS_ERR(rq)) { spin_lock_irqsave(&host->lock, flags); carm_put_request(host, crq); spin_unlock_irqrestore(&host->lock, flags); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 0a581400de0f..7ef7c098708f 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -41,12 +41,6 @@ struct virtio_blk /* Process context for config space updates */ struct work_struct config_work; - /* Lock for config space updates */ - struct mutex config_lock; - - /* enable config space updates */ - bool config_enable; - /* What host tells us, plus 2 for header & tailer. */ unsigned int sg_elems; @@ -86,7 +80,7 @@ static int __virtblk_add_req(struct virtqueue *vq, { struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6]; unsigned int num_out = 0, num_in = 0; - int type = vbr->out_hdr.type & ~VIRTIO_BLK_T_OUT; + __virtio32 type = vbr->out_hdr.type & ~cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT); sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); sgs[num_out++] = &hdr; @@ -97,19 +91,19 @@ static int __virtblk_add_req(struct virtqueue *vq, * block, and before the normal inhdr we put the sense data and the * inhdr with additional status information. 
*/ - if (type == VIRTIO_BLK_T_SCSI_CMD) { + if (type == cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_SCSI_CMD)) { sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len); sgs[num_out++] = &cmd; } if (have_data) { - if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT) + if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT)) sgs[num_out++] = data_sg; else sgs[num_out + num_in++] = data_sg; } - if (type == VIRTIO_BLK_T_SCSI_CMD) { + if (type == cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_SCSI_CMD)) { sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE); sgs[num_out + num_in++] = &sense; sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr)); @@ -125,17 +119,18 @@ static int __virtblk_add_req(struct virtqueue *vq, static inline void virtblk_request_done(struct request *req) { struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); + struct virtio_blk *vblk = req->q->queuedata; int error = virtblk_result(vbr); if (req->cmd_type == REQ_TYPE_BLOCK_PC) { - req->resid_len = vbr->in_hdr.residual; - req->sense_len = vbr->in_hdr.sense_len; - req->errors = vbr->in_hdr.errors; + req->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual); + req->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len); + req->errors = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors); } else if (req->cmd_type == REQ_TYPE_SPECIAL) { req->errors = (error != 0); } - blk_mq_end_io(req, error); + blk_mq_end_request(req, error); } static void virtblk_done(struct virtqueue *vq) @@ -164,14 +159,15 @@ static void virtblk_done(struct virtqueue *vq) spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); } -static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) +static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { struct virtio_blk *vblk = hctx->queue->queuedata; + struct request *req = bd->rq; struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); unsigned long flags; unsigned int num; int qid = hctx->queue_num; - const bool last = (req->cmd_flags & REQ_END) != 0; int err; bool notify = false; @@ -179,25 +175,25 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) vbr->req = req; if (req->cmd_flags & REQ_FLUSH) { - vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; + vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_FLUSH); vbr->out_hdr.sector = 0; - vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); + vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); } else { switch (req->cmd_type) { case REQ_TYPE_FS: vbr->out_hdr.type = 0; - vbr->out_hdr.sector = blk_rq_pos(vbr->req); - vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); + vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, blk_rq_pos(vbr->req)); + vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); break; case REQ_TYPE_BLOCK_PC: - vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD; + vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_SCSI_CMD); vbr->out_hdr.sector = 0; - vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); + vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); break; case REQ_TYPE_SPECIAL: - vbr->out_hdr.type = VIRTIO_BLK_T_GET_ID; + vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID); vbr->out_hdr.sector = 0; - vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); + vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); break; default: /* We don't put anything else in the queue. 
*/ @@ -205,12 +201,14 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) } } + blk_mq_start_request(req); + num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg); if (num) { if (rq_data_dir(vbr->req) == WRITE) - vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; + vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT); else - vbr->out_hdr.type |= VIRTIO_BLK_T_IN; + vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN); } spin_lock_irqsave(&vblk->vqs[qid].lock, flags); @@ -226,7 +224,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) return BLK_MQ_RQ_QUEUE_ERROR; } - if (last && virtqueue_kick_prepare(vblk->vqs[qid].vq)) + if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq)) notify = true; spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); @@ -335,7 +333,8 @@ static ssize_t virtblk_serial_show(struct device *dev, return err; } -DEVICE_ATTR(serial, S_IRUGO, virtblk_serial_show, NULL); + +static DEVICE_ATTR(serial, S_IRUGO, virtblk_serial_show, NULL); static void virtblk_config_changed_work(struct work_struct *work) { @@ -347,10 +346,6 @@ static void virtblk_config_changed_work(struct work_struct *work) char *envp[] = { "RESIZE=1", NULL }; u64 capacity, size; - mutex_lock(&vblk->config_lock); - if (!vblk->config_enable) - goto done; - /* Host must always specify the capacity. */ virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity); @@ -374,8 +369,6 @@ static void virtblk_config_changed_work(struct work_struct *work) set_capacity(vblk->disk, capacity); revalidate_disk(vblk->disk); kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp); -done: - mutex_unlock(&vblk->config_lock); } static void virtblk_config_changed(struct virtio_device *vdev) @@ -486,7 +479,8 @@ static int virtblk_get_cache_mode(struct virtio_device *vdev) struct virtio_blk_config, wce, &writeback); if (err) - writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE); + writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE) || + virtio_has_feature(vdev, VIRTIO_F_VERSION_1); return writeback; } @@ -606,10 +600,8 @@ static int virtblk_probe(struct virtio_device *vdev) vblk->vdev = vdev; vblk->sg_elems = sg_elems; - mutex_init(&vblk->config_lock); INIT_WORK(&vblk->config_work, virtblk_config_changed_work); - vblk->config_enable = true; err = init_vq(vblk); if (err) @@ -733,6 +725,8 @@ static int virtblk_probe(struct virtio_device *vdev) if (!err && opt_io_size) blk_queue_io_opt(q, blk_size * opt_io_size); + virtio_device_ready(vdev); + add_disk(vblk->disk); err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial); if (err) @@ -771,10 +765,8 @@ static void virtblk_remove(struct virtio_device *vdev) int index = vblk->index; int refc; - /* Prevent config work handler from accessing the device. */ - mutex_lock(&vblk->config_lock); - vblk->config_enable = false; - mutex_unlock(&vblk->config_lock); + /* Make sure no work handler is accessing the device. */ + flush_work(&vblk->config_work); del_gendisk(vblk->disk); blk_cleanup_queue(vblk->disk->queue); @@ -784,8 +776,6 @@ static void virtblk_remove(struct virtio_device *vdev) /* Stop all the virtqueues. 
*/ vdev->config->reset(vdev); - flush_work(&vblk->config_work); - refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount); put_disk(vblk->disk); vdev->config->del_vqs(vdev); @@ -805,11 +795,7 @@ static int virtblk_freeze(struct virtio_device *vdev) /* Ensure we don't receive any more interrupts */ vdev->config->reset(vdev); - /* Prevent config work handler from accessing the device. */ - mutex_lock(&vblk->config_lock); - vblk->config_enable = false; - mutex_unlock(&vblk->config_lock); - + /* Make sure no work handler is accessing the device. */ flush_work(&vblk->config_work); blk_mq_stop_hw_queues(vblk->disk->queue); @@ -823,12 +809,14 @@ static int virtblk_restore(struct virtio_device *vdev) struct virtio_blk *vblk = vdev->priv; int ret; - vblk->config_enable = true; ret = init_vq(vdev->priv); - if (!ret) - blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); + if (ret) + return ret; + + virtio_device_ready(vdev); - return ret; + blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); + return 0; } #endif @@ -837,25 +825,34 @@ static const struct virtio_device_id id_table[] = { { 0 }, }; -static unsigned int features[] = { +static unsigned int features_legacy[] = { VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE, VIRTIO_BLK_F_MQ, +} +; +static unsigned int features[] = { + VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, + VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, + VIRTIO_BLK_F_TOPOLOGY, + VIRTIO_BLK_F_MQ, }; static struct virtio_driver virtio_blk = { - .feature_table = features, - .feature_table_size = ARRAY_SIZE(features), - .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, - .id_table = id_table, - .probe = virtblk_probe, - .remove = virtblk_remove, - .config_changed = virtblk_config_changed, + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .feature_table_legacy = features_legacy, + .feature_table_size_legacy = ARRAY_SIZE(features_legacy), + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = virtblk_probe, + .remove = virtblk_remove, + .config_changed = virtblk_config_changed, #ifdef CONFIG_PM_SLEEP - .freeze = virtblk_freeze, - .restore = virtblk_restore, + .freeze = virtblk_freeze, + .restore = virtblk_restore, #endif }; @@ -887,8 +884,8 @@ out_destroy_workqueue: static void __exit fini(void) { - unregister_blkdev(major, "virtblk"); unregister_virtio_driver(&virtio_blk); + unregister_blkdev(major, "virtblk"); destroy_workqueue(virtblk_wq); } module_init(init); diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 64c60edcdfbc..63fc7f06a014 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -763,6 +763,7 @@ again: BUG_ON(new_map_idx >= segs_to_map); if (unlikely(map[new_map_idx].status != 0)) { pr_debug(DRV_PFX "invalid buffer -- could not remap it\n"); + put_free_pages(blkif, &pages[seg_idx]->page, 1); pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE; ret |= 1; goto next; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 3a8b810b4980..630a489e757d 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -270,6 +270,9 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif) blkif->blk_rings.common.sring = NULL; } + /* Remove all persistent grants and the cache of 
ballooned pages. */ + xen_blkbk_free_caches(blkif); + return 0; } @@ -281,9 +284,6 @@ static void xen_blkif_free(struct xen_blkif *blkif) xen_blkif_disconnect(blkif); xen_vbd_free(&blkif->vbd); - /* Remove all persistent grants and the cache of ballooned pages. */ - xen_blkbk_free_caches(blkif); - /* Make sure everything is drained before shutting down */ BUG_ON(blkif->persistent_gnt_c != 0); BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0); @@ -907,22 +907,17 @@ static int connect_ring(struct backend_info *be) return 0; } - -/* ** Driver Registration ** */ - - static const struct xenbus_device_id xen_blkbk_ids[] = { { "vbd" }, { "" } }; - -static DEFINE_XENBUS_DRIVER(xen_blkbk, , +static struct xenbus_driver xen_blkbk_driver = { + .ids = xen_blkbk_ids, .probe = xen_blkbk_probe, .remove = xen_blkbk_remove, .otherend_changed = frontend_changed -); - +}; int xen_blkif_xenbus_init(void) { diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 5deb235bd18f..2236c6f31608 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -126,7 +126,6 @@ struct blkfront_info unsigned int persistent_gnts_c; unsigned long shadow_free; unsigned int feature_flush; - unsigned int flush_op; unsigned int feature_discard:1; unsigned int feature_secdiscard:1; unsigned int discard_granularity; @@ -479,7 +478,19 @@ static int blkif_queue_request(struct request *req) * way. (It's also a FLUSH+FUA, since it is * guaranteed ordered WRT previous writes.) */ - ring_req->operation = info->flush_op; + switch (info->feature_flush & + ((REQ_FLUSH|REQ_FUA))) { + case REQ_FLUSH|REQ_FUA: + ring_req->operation = + BLKIF_OP_WRITE_BARRIER; + break; + case REQ_FLUSH: + ring_req->operation = + BLKIF_OP_FLUSH_DISKCACHE; + break; + default: + ring_req->operation = 0; + } } ring_req->u.rw.nr_segments = nseg; } @@ -582,6 +593,16 @@ static inline void flush_requests(struct blkfront_info *info) notify_remote_via_irq(info->irq); } +static inline bool blkif_request_flush_invalid(struct request *req, + struct blkfront_info *info) +{ + return ((req->cmd_type != REQ_TYPE_FS) || + ((req->cmd_flags & REQ_FLUSH) && + !(info->feature_flush & REQ_FLUSH)) || + ((req->cmd_flags & REQ_FUA) && + !(info->feature_flush & REQ_FUA))); +} + /* * do_blkif_request * read a block; request is in a request queue @@ -604,10 +625,8 @@ static void do_blkif_request(struct request_queue *rq) blk_start_request(req); - if ((req->cmd_type != REQ_TYPE_FS) || - ((req->cmd_flags & (REQ_FLUSH | REQ_FUA)) && - !info->flush_op)) { - __blk_end_request_all(req, -EIO); + if (blkif_request_flush_invalid(req, info)) { + __blk_end_request_all(req, -EOPNOTSUPP); continue; } @@ -677,20 +696,26 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, return 0; } +static const char *flush_info(unsigned int feature_flush) +{ + switch (feature_flush & ((REQ_FLUSH | REQ_FUA))) { + case REQ_FLUSH|REQ_FUA: + return "barrier: enabled;"; + case REQ_FLUSH: + return "flush diskcache: enabled;"; + default: + return "barrier or flush: disabled;"; + } +} static void xlvbd_flush(struct blkfront_info *info) { blk_queue_flush(info->rq, info->feature_flush); - printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n", - info->gd->disk_name, - info->flush_op == BLKIF_OP_WRITE_BARRIER ? - "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? - "flush diskcache" : "barrier or flush"), - info->feature_flush ? "enabled;" : "disabled;", - "persistent grants:", - info->feature_persistent ? 
"enabled;" : "disabled;", - "indirect descriptors:", - info->max_indirect_segments ? "enabled;" : "disabled;"); + pr_info("blkfront: %s: %s %s %s %s %s\n", + info->gd->disk_name, flush_info(info->feature_flush), + "persistent grants:", info->feature_persistent ? + "enabled;" : "disabled;", "indirect descriptors:", + info->max_indirect_segments ? "enabled;" : "disabled;"); } static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) @@ -1182,7 +1207,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) if (error == -EOPNOTSUPP) error = 0; info->feature_flush = 0; - info->flush_op = 0; xlvbd_flush(info); } /* fall through */ @@ -1802,7 +1826,6 @@ static void blkfront_connect(struct blkfront_info *info) physical_sector_size = sector_size; info->feature_flush = 0; - info->flush_op = 0; err = xenbus_gather(XBT_NIL, info->xbdev->otherend, "feature-barrier", "%d", &barrier, @@ -1815,10 +1838,8 @@ static void blkfront_connect(struct blkfront_info *info) * * If there are barriers, then we use flush. */ - if (!err && barrier) { + if (!err && barrier) info->feature_flush = REQ_FLUSH | REQ_FUA; - info->flush_op = BLKIF_OP_WRITE_BARRIER; - } /* * And if there is "feature-flush-cache" use that above * barriers. @@ -1827,10 +1848,8 @@ static void blkfront_connect(struct blkfront_info *info) "feature-flush-cache", "%d", &flush, NULL); - if (!err && flush) { + if (!err && flush) info->feature_flush = REQ_FLUSH; - info->flush_op = BLKIF_OP_FLUSH_DISKCACHE; - } err = xenbus_gather(XBT_NIL, info->xbdev->otherend, "feature-discard", "%d", &discard, @@ -2055,13 +2074,14 @@ static const struct xenbus_device_id blkfront_ids[] = { { "" } }; -static DEFINE_XENBUS_DRIVER(blkfront, , +static struct xenbus_driver blkfront_driver = { + .ids = blkfront_ids, .probe = blkfront_probe, .remove = blkfront_remove, .resume = blkfront_resume, .otherend_changed = blkback_changed, .is_ready = blkfront_is_ready, -); +}; static int __init xlblk_init(void) { diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index d00831c3d731..bd8bda386e02 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -44,15 +44,14 @@ static const char *default_compressor = "lzo"; static unsigned int num_devices = 1; #define ZRAM_ATTR_RO(name) \ -static ssize_t zram_attr_##name##_show(struct device *d, \ +static ssize_t name##_show(struct device *d, \ struct device_attribute *attr, char *b) \ { \ struct zram *zram = dev_to_zram(d); \ return scnprintf(b, PAGE_SIZE, "%llu\n", \ (u64)atomic64_read(&zram->stats.name)); \ } \ -static struct device_attribute dev_attr_##name = \ - __ATTR(name, S_IRUGO, zram_attr_##name##_show, NULL); +static DEVICE_ATTR_RO(name); static inline int init_done(struct zram *zram) { @@ -99,14 +98,15 @@ static ssize_t mem_used_total_show(struct device *dev, { u64 val = 0; struct zram *zram = dev_to_zram(dev); - struct zram_meta *meta = zram->meta; down_read(&zram->init_lock); - if (init_done(zram)) - val = zs_get_total_size_bytes(meta->mem_pool); + if (init_done(zram)) { + struct zram_meta *meta = zram->meta; + val = zs_get_total_pages(meta->mem_pool); + } up_read(&zram->init_lock); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); } static ssize_t max_comp_streams_show(struct device *dev, @@ -122,6 +122,73 @@ static ssize_t max_comp_streams_show(struct device *dev, return scnprintf(buf, PAGE_SIZE, "%d\n", val); } +static ssize_t mem_limit_show(struct device *dev, + struct device_attribute 
*attr, char *buf) +{ + u64 val; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + val = zram->limit_pages; + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); +} + +static ssize_t mem_limit_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + u64 limit; + char *tmp; + struct zram *zram = dev_to_zram(dev); + + limit = memparse(buf, &tmp); + if (buf == tmp) /* no chars parsed, invalid input */ + return -EINVAL; + + down_write(&zram->init_lock); + zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT; + up_write(&zram->init_lock); + + return len; +} + +static ssize_t mem_used_max_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u64 val = 0; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + if (init_done(zram)) + val = atomic_long_read(&zram->stats.max_used_pages); + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); +} + +static ssize_t mem_used_max_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + int err; + unsigned long val; + struct zram *zram = dev_to_zram(dev); + + err = kstrtoul(buf, 10, &val); + if (err || val != 0) + return -EINVAL; + + down_read(&zram->init_lock); + if (init_done(zram)) { + struct zram_meta *meta = zram->meta; + atomic_long_set(&zram->stats.max_used_pages, + zs_get_total_pages(meta->mem_pool)); + } + up_read(&zram->init_lock); + + return len; +} + static ssize_t max_comp_streams_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -219,19 +286,18 @@ static inline int is_partial_io(struct bio_vec *bvec) /* * Check if request is within bounds and aligned on zram logical blocks. 
*/ -static inline int valid_io_request(struct zram *zram, struct bio *bio) +static inline int valid_io_request(struct zram *zram, + sector_t start, unsigned int size) { - u64 start, end, bound; + u64 end, bound; /* unaligned request */ - if (unlikely(bio->bi_iter.bi_sector & - (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) + if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) return 0; - if (unlikely(bio->bi_iter.bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) + if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) return 0; - start = bio->bi_iter.bi_sector; - end = start + (bio->bi_iter.bi_size >> SECTOR_SHIFT); + end = start + (size >> SECTOR_SHIFT); bound = zram->disksize >> SECTOR_SHIFT; /* out of range range */ if (unlikely(start >= bound || end > bound || start > end)) @@ -385,7 +451,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) } static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, - u32 index, int offset, struct bio *bio) + u32 index, int offset) { int ret; struct page *page; @@ -434,6 +500,21 @@ out_cleanup: return ret; } +static inline void update_used_max(struct zram *zram, + const unsigned long pages) +{ + int old_max, cur_max; + + old_max = atomic_long_read(&zram->stats.max_used_pages); + + do { + cur_max = old_max; + if (pages > cur_max) + old_max = atomic_long_cmpxchg( + &zram->stats.max_used_pages, cur_max, pages); + } while (old_max != cur_max); +} + static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, int offset) { @@ -445,6 +526,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, struct zram_meta *meta = zram->meta; struct zcomp_strm *zstrm; bool locked = false; + unsigned long alloced_pages; page = bvec->bv_page; if (is_partial_io(bvec)) { @@ -476,7 +558,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, } if (page_zero_filled(uncmem)) { - kunmap_atomic(user_mem); + if (user_mem) + kunmap_atomic(user_mem); /* Free memory associated with this sector now. 
*/ bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); @@ -513,6 +596,16 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, ret = -ENOMEM; goto out; } + + alloced_pages = zs_get_total_pages(meta->mem_pool); + if (zram->limit_pages && alloced_pages > zram->limit_pages) { + zs_free(meta->mem_pool, handle); + ret = -ENOMEM; + goto out; + } + + update_used_max(zram, alloced_pages); + cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) { @@ -550,14 +643,13 @@ out: } static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, struct bio *bio) + int offset, int rw) { int ret; - int rw = bio_data_dir(bio); if (rw == READ) { atomic64_inc(&zram->stats.num_reads); - ret = zram_bvec_read(zram, bvec, index, offset, bio); + ret = zram_bvec_read(zram, bvec, index, offset); } else { atomic64_inc(&zram->stats.num_writes); ret = zram_bvec_write(zram, bvec, index, offset); @@ -606,6 +698,7 @@ static void zram_bio_discard(struct zram *zram, u32 index, bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + atomic64_inc(&zram->stats.notify_free); index++; n -= PAGE_SIZE; } @@ -617,6 +710,9 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) struct zram_meta *meta; down_write(&zram->init_lock); + + zram->limit_pages = 0; + if (!init_done(zram)) { up_write(&zram->init_lock); return; @@ -754,7 +850,7 @@ out: static void __zram_make_request(struct zram *zram, struct bio *bio) { - int offset; + int offset, rw; u32 index; struct bio_vec bvec; struct bvec_iter iter; @@ -769,6 +865,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) return; } + rw = bio_data_dir(bio); bio_for_each_segment(bvec, bio, iter) { int max_transfer_size = PAGE_SIZE - offset; @@ -783,15 +880,15 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) bv.bv_len = max_transfer_size; bv.bv_offset = bvec.bv_offset; - if (zram_bvec_rw(zram, &bv, index, offset, bio) < 0) + if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0) goto out; bv.bv_len = bvec.bv_len - max_transfer_size; bv.bv_offset += max_transfer_size; - if (zram_bvec_rw(zram, &bv, index + 1, 0, bio) < 0) + if (zram_bvec_rw(zram, &bv, index + 1, 0, rw) < 0) goto out; } else - if (zram_bvec_rw(zram, &bvec, index, offset, bio) < 0) + if (zram_bvec_rw(zram, &bvec, index, offset, rw) < 0) goto out; update_position(&index, &offset, &bvec); @@ -816,7 +913,8 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) if (unlikely(!init_done(zram))) goto error; - if (!valid_io_request(zram, bio)) { + if (!valid_io_request(zram, bio->bi_iter.bi_sector, + bio->bi_iter.bi_size)) { atomic64_inc(&zram->stats.invalid_io); goto error; } @@ -846,21 +944,64 @@ static void zram_slot_free_notify(struct block_device *bdev, atomic64_inc(&zram->stats.notify_free); } +static int zram_rw_page(struct block_device *bdev, sector_t sector, + struct page *page, int rw) +{ + int offset, err; + u32 index; + struct zram *zram; + struct bio_vec bv; + + zram = bdev->bd_disk->private_data; + if (!valid_io_request(zram, sector, PAGE_SIZE)) { + atomic64_inc(&zram->stats.invalid_io); + return -EINVAL; + } + + down_read(&zram->init_lock); + if (unlikely(!init_done(zram))) { + err = -EIO; + goto out_unlock; + } + + index = sector >> SECTORS_PER_PAGE_SHIFT; + offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT; + + 
bv.bv_page = page; + bv.bv_len = PAGE_SIZE; + bv.bv_offset = 0; + + err = zram_bvec_rw(zram, &bv, index, offset, rw); +out_unlock: + up_read(&zram->init_lock); + /* + * If I/O fails, just return error(ie, non-zero) without + * calling page_endio. + * It causes resubmit the I/O with bio request by upper functions + * of rw_page(e.g., swap_readpage, __swap_writepage) and + * bio->bi_end_io does things to handle the error + * (e.g., SetPageError, set_page_dirty and extra works). + */ + if (err == 0) + page_endio(page, rw, 0); + return err; +} + static const struct block_device_operations zram_devops = { .swap_slot_free_notify = zram_slot_free_notify, + .rw_page = zram_rw_page, .owner = THIS_MODULE }; -static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR, - disksize_show, disksize_store); -static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); -static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); -static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); -static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); -static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, - max_comp_streams_show, max_comp_streams_store); -static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR, - comp_algorithm_show, comp_algorithm_store); +static DEVICE_ATTR_RW(disksize); +static DEVICE_ATTR_RO(initstate); +static DEVICE_ATTR_WO(reset); +static DEVICE_ATTR_RO(orig_data_size); +static DEVICE_ATTR_RO(mem_used_total); +static DEVICE_ATTR_RW(mem_limit); +static DEVICE_ATTR_RW(mem_used_max); +static DEVICE_ATTR_RW(max_comp_streams); +static DEVICE_ATTR_RW(comp_algorithm); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); @@ -885,6 +1026,8 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_orig_data_size.attr, &dev_attr_compr_data_size.attr, &dev_attr_mem_used_total.attr, + &dev_attr_mem_limit.attr, + &dev_attr_mem_used_max.attr, &dev_attr_max_comp_streams.attr, &dev_attr_comp_algorithm.attr, NULL, @@ -929,6 +1072,7 @@ static int create_device(struct zram *zram, int device_id) set_capacity(zram->disk, 0); /* zram devices sort of resembles non-rotational disks */ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); + queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue); /* * To ensure that we always get PAGE_SIZE aligned * and n*PAGE_SIZED sized I/O requests. diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index e0f725c87cc6..b05a816b09ac 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -66,8 +66,8 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; /* Flags for zram pages (table[page_no].value) */ enum zram_pageflags { /* Page consists entirely of zeros */ - ZRAM_ZERO = ZRAM_FLAG_SHIFT + 1, - ZRAM_ACCESS, /* page in now accessed */ + ZRAM_ZERO = ZRAM_FLAG_SHIFT, + ZRAM_ACCESS, /* page is now accessed */ __NR_ZRAM_PAGEFLAGS, }; @@ -90,6 +90,7 @@ struct zram_stats { atomic64_t notify_free; /* no. of swap slot free notifications */ atomic64_t zero_pages; /* no. of zero filled pages */ atomic64_t pages_stored; /* no. of pages currently stored */ + atomic_long_t max_used_pages; /* no. of maximum pages stored */ }; struct zram_meta { @@ -112,6 +113,11 @@ struct zram { u64 disksize; /* bytes */ int max_comp_streams; struct zram_stats stats; + /* + * the number of pages zram can consume for storing compressed data + */ + unsigned long limit_pages; + char compressor[10]; }; #endif |
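
The zram hunk above adds update_used_max(), which raises the stats.max_used_pages high-water mark without holding a lock: it retries an atomic compare-and-exchange until either the stored maximum is already large enough or the exchange succeeds. A minimal user-space sketch of the same pattern, using C11 <stdatomic.h> rather than the kernel's atomic_long_* helpers (the names here are illustrative, not taken from the patch):

	#include <stdatomic.h>
	#include <stdio.h>

	/* High-water mark, playing the role of zram->stats.max_used_pages. */
	static atomic_long max_used_pages = 0;

	/*
	 * Raise the recorded maximum to 'pages' if it is larger.  When the
	 * compare-and-exchange fails because another thread stored a new
	 * value first, 'cur' is refreshed and the comparison is redone, so
	 * the stored value only ever grows.
	 */
	static void update_used_max(long pages)
	{
		long cur = atomic_load(&max_used_pages);

		while (pages > cur) {
			if (atomic_compare_exchange_weak(&max_used_pages,
							 &cur, pages))
				break;
		}
	}

	int main(void)
	{
		update_used_max(10);
		update_used_max(7);	/* smaller value, maximum stays at 10 */
		update_used_max(42);
		printf("max used pages: %ld\n", (long)atomic_load(&max_used_pages));
		return 0;
	}

The kernel version in the diff uses atomic_long_cmpxchg() and compares the returned old value by hand, but the effect is the same: writers on different CPUs can race to record a new peak and the larger value always wins, without serializing the fast path in zram_bvec_write().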