diff options
author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2013-11-14 17:38:05 -0800 |
---|---|---|
committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2013-11-14 17:38:05 -0800 |
commit | 42249094f79422fbf5ed4b54eeb48ff096809b8f (patch) | |
tree | 91e6850c8c7e8cc284cf8bb6363f8662f84011f4 /fs/btrfs | |
parent | 936816161978ca716a56c5e553c68f25972b1e3a (diff) | |
parent | 2c027b7c48a888ab173ba45babb4525e278375d9 (diff) | |
download | lwn-42249094f79422fbf5ed4b54eeb48ff096809b8f.tar.gz lwn-42249094f79422fbf5ed4b54eeb48ff096809b8f.zip |
Merge branch 'next' into for-linus
Merge first round of changes for 3.13 merge window.
Diffstat (limited to 'fs/btrfs')
48 files changed, 6714 insertions, 4254 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 2b3b83296977..398cbd517be2 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -72,3 +72,12 @@ config BTRFS_DEBUG performance, or export extra information via sysfs. If unsure, say N. + +config BTRFS_ASSERT + bool "Btrfs assert support" + depends on BTRFS_FS + help + Enable run-time assertion checking. This will result in panics if + any of the assertions trip. This is meant for btrfs developers only. + + If unsure, say N. diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 3932224f99e9..a91a6a355cc5 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,7 +8,10 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o + reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ + uuid-tree.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o + +btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 58b7d14b08ee..08cc08f037a6 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -107,7 +107,8 @@ static void check_idle_worker(struct btrfs_worker_thread *worker) worker->idle = 1; /* the list may be empty if the worker is just starting */ - if (!list_empty(&worker->worker_list)) { + if (!list_empty(&worker->worker_list) && + !worker->workers->stopping) { list_move(&worker->worker_list, &worker->workers->idle_list); } @@ -127,7 +128,8 @@ static void check_busy_worker(struct btrfs_worker_thread *worker) spin_lock_irqsave(&worker->workers->lock, flags); worker->idle = 0; - if (!list_empty(&worker->worker_list)) { + if (!list_empty(&worker->worker_list) && + !worker->workers->stopping) { list_move_tail(&worker->worker_list, &worker->workers->worker_list); } @@ -412,6 +414,7 @@ void btrfs_stop_workers(struct btrfs_workers *workers) int can_stop; spin_lock_irq(&workers->lock); + workers->stopping = 1; list_splice_init(&workers->idle_list, &workers->worker_list); while (!list_empty(&workers->worker_list)) { cur = workers->worker_list.next; @@ -455,6 +458,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, workers->ordered = 0; workers->atomic_start_pending = 0; workers->atomic_worker_start = async_helper; + workers->stopping = 0; } /* @@ -480,15 +484,19 @@ static int __btrfs_start_workers(struct btrfs_workers *workers) atomic_set(&worker->num_pending, 0); atomic_set(&worker->refs, 1); worker->workers = workers; - worker->task = kthread_run(worker_loop, worker, - "btrfs-%s-%d", workers->name, - workers->num_workers + 1); + worker->task = kthread_create(worker_loop, worker, + "btrfs-%s-%d", workers->name, + workers->num_workers + 1); if (IS_ERR(worker->task)) { ret = PTR_ERR(worker->task); - kfree(worker); goto fail; } + spin_lock_irq(&workers->lock); + if (workers->stopping) { + spin_unlock_irq(&workers->lock); + goto fail_kthread; + } list_add_tail(&worker->worker_list, &workers->idle_list); worker->idle = 1; workers->num_workers++; @@ -496,8 +504,13 @@ static int __btrfs_start_workers(struct btrfs_workers *workers) WARN_ON(workers->num_workers_starting < 0); spin_unlock_irq(&workers->lock); + wake_up_process(worker->task); return 0; + +fail_kthread: + kthread_stop(worker->task); fail: + kfree(worker); spin_lock_irq(&workers->lock); workers->num_workers_starting--; spin_unlock_irq(&workers->lock); diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 063698b90ce2..1f26792683ed 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -107,6 +107,8 @@ struct btrfs_workers { /* extra name for this worker, used for current->name */ char *name; + + int stopping; }; void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 290e347b6db3..0552a599b28f 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -36,16 +36,23 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, u64 extent_item_pos, struct extent_inode_elem **eie) { - u64 data_offset; - u64 data_len; + u64 offset = 0; struct extent_inode_elem *e; - data_offset = btrfs_file_extent_offset(eb, fi); - data_len = btrfs_file_extent_num_bytes(eb, fi); + if (!btrfs_file_extent_compression(eb, fi) && + !btrfs_file_extent_encryption(eb, fi) && + !btrfs_file_extent_other_encoding(eb, fi)) { + u64 data_offset; + u64 data_len; - if (extent_item_pos < data_offset || - extent_item_pos >= data_offset + data_len) - return 1; + data_offset = btrfs_file_extent_offset(eb, fi); + data_len = btrfs_file_extent_num_bytes(eb, fi); + + if (extent_item_pos < data_offset || + extent_item_pos >= data_offset + data_len) + return 1; + offset = extent_item_pos - data_offset; + } e = kmalloc(sizeof(*e), GFP_NOFS); if (!e) @@ -53,7 +60,7 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, e->next = *eie; e->inum = key->objectid; - e->offset = key->offset + (extent_item_pos - data_offset); + e->offset = key->offset + offset; *eie = e; return 0; @@ -112,6 +119,26 @@ struct __prelim_ref { u64 wanted_disk_byte; }; +static struct kmem_cache *btrfs_prelim_ref_cache; + +int __init btrfs_prelim_ref_init(void) +{ + btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref", + sizeof(struct __prelim_ref), + 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_prelim_ref_cache) + return -ENOMEM; + return 0; +} + +void btrfs_prelim_ref_exit(void) +{ + if (btrfs_prelim_ref_cache) + kmem_cache_destroy(btrfs_prelim_ref_cache); +} + /* * the rules for all callers of this function are: * - obtaining the parent is the goal @@ -153,12 +180,12 @@ struct __prelim_ref { static int __add_prelim_ref(struct list_head *head, u64 root_id, struct btrfs_key *key, int level, - u64 parent, u64 wanted_disk_byte, int count) + u64 parent, u64 wanted_disk_byte, int count, + gfp_t gfp_mask) { struct __prelim_ref *ref; - /* in case we're adding delayed refs, we're holding the refs spinlock */ - ref = kmalloc(sizeof(*ref), GFP_ATOMIC); + ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask); if (!ref) return -ENOMEM; @@ -189,7 +216,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *eb; struct btrfs_key key; struct btrfs_file_extent_item *fi; - struct extent_inode_elem *eie = NULL; + struct extent_inode_elem *eie = NULL, *old = NULL; u64 disk_byte; if (level != 0) { @@ -223,6 +250,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (disk_byte == wanted_disk_byte) { eie = NULL; + old = NULL; if (extent_item_pos) { ret = check_extent_in_eb(&key, eb, fi, *extent_item_pos, @@ -230,18 +258,20 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (ret < 0) break; } - if (!ret) { - ret = ulist_add(parents, eb->start, - (uintptr_t)eie, GFP_NOFS); - if (ret < 0) - break; - if (!extent_item_pos) { - ret = btrfs_next_old_leaf(root, path, - time_seq); - continue; - } + if (ret > 0) + goto next; + ret = ulist_add_merge(parents, eb->start, + (uintptr_t)eie, + (u64 *)&old, GFP_NOFS); + if (ret < 0) + break; + if (!ret && extent_item_pos) { + while (old->next) + old = old->next; + old->next = eie; } } +next: ret = btrfs_next_old_item(root, path, time_seq); } @@ -255,13 +285,11 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, * to a logical address */ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, - int search_commit_root, - u64 time_seq, - struct __prelim_ref *ref, - struct ulist *parents, - const u64 *extent_item_pos) + struct btrfs_path *path, u64 time_seq, + struct __prelim_ref *ref, + struct ulist *parents, + const u64 *extent_item_pos) { - struct btrfs_path *path; struct btrfs_root *root; struct btrfs_key root_key; struct extent_buffer *eb; @@ -269,11 +297,6 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, int root_level; int level = ref->level; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->search_commit_root = !!search_commit_root; - root_key.objectid = ref->root_id; root_key.type = BTRFS_ROOT_ITEM_KEY; root_key.offset = (u64)-1; @@ -292,10 +315,9 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); pr_debug("search slot in root %llu (level %d, ref count %d) returned " "%d for key (%llu %u %llu)\n", - (unsigned long long)ref->root_id, level, ref->count, ret, - (unsigned long long)ref->key_for_search.objectid, - ref->key_for_search.type, - (unsigned long long)ref->key_for_search.offset); + ref->root_id, level, ref->count, ret, + ref->key_for_search.objectid, ref->key_for_search.type, + ref->key_for_search.offset); if (ret < 0) goto out; @@ -314,7 +336,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, time_seq, ref->wanted_disk_byte, extent_item_pos); out: - btrfs_free_path(path); + path->lowest_level = 0; + btrfs_release_path(path); return ret; } @@ -322,7 +345,7 @@ out: * resolve all indirect backrefs from the list */ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, - int search_commit_root, u64 time_seq, + struct btrfs_path *path, u64 time_seq, struct list_head *head, const u64 *extent_item_pos) { @@ -349,9 +372,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, continue; if (ref->count == 0) continue; - err = __resolve_indirect_ref(fs_info, search_commit_root, - time_seq, ref, parents, - extent_item_pos); + err = __resolve_indirect_ref(fs_info, path, time_seq, ref, + parents, extent_item_pos); if (err == -ENOMEM) goto out; if (err) @@ -362,11 +384,12 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, node = ulist_next(parents, &uiter); ref->parent = node ? node->val : 0; ref->inode_list = node ? - (struct extent_inode_elem *)(uintptr_t)node->aux : 0; + (struct extent_inode_elem *)(uintptr_t)node->aux : NULL; /* additional parents require new refs being added here */ while ((node = ulist_next(parents, &uiter))) { - new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS); + new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache, + GFP_NOFS); if (!new_ref) { ret = -ENOMEM; goto out; @@ -490,7 +513,7 @@ static void __merge_refs(struct list_head *head, int mode) ref1->count += ref2->count; list_del(&ref2->list); - kfree(ref2); + kmem_cache_free(btrfs_prelim_ref_cache, ref2); } } @@ -545,7 +568,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, ref = btrfs_delayed_node_to_tree_ref(node); ret = __add_prelim_ref(prefs, ref->root, &op_key, ref->level + 1, 0, node->bytenr, - node->ref_mod * sgn); + node->ref_mod * sgn, GFP_ATOMIC); break; } case BTRFS_SHARED_BLOCK_REF_KEY: { @@ -555,7 +578,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, ret = __add_prelim_ref(prefs, ref->root, NULL, ref->level + 1, ref->parent, node->bytenr, - node->ref_mod * sgn); + node->ref_mod * sgn, GFP_ATOMIC); break; } case BTRFS_EXTENT_DATA_REF_KEY: { @@ -567,7 +590,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, key.offset = ref->offset; ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0, node->bytenr, - node->ref_mod * sgn); + node->ref_mod * sgn, GFP_ATOMIC); break; } case BTRFS_SHARED_DATA_REF_KEY: { @@ -580,7 +603,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, key.offset = ref->offset; ret = __add_prelim_ref(prefs, ref->root, &key, 0, ref->parent, node->bytenr, - node->ref_mod * sgn); + node->ref_mod * sgn, GFP_ATOMIC); break; } default: @@ -604,6 +627,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, int slot; struct extent_buffer *leaf; struct btrfs_key key; + struct btrfs_key found_key; unsigned long ptr; unsigned long end; struct btrfs_extent_item *ei; @@ -621,17 +645,21 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); + btrfs_item_key_to_cpu(leaf, &found_key, slot); ptr = (unsigned long)(ei + 1); end = (unsigned long)ei + item_size; - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if (found_key.type == BTRFS_EXTENT_ITEM_KEY && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)ptr; *info_level = btrfs_tree_block_level(leaf, info); ptr += sizeof(struct btrfs_tree_block_info); BUG_ON(ptr > end); + } else if (found_key.type == BTRFS_METADATA_ITEM_KEY) { + *info_level = found_key.offset; } else { BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); } @@ -649,7 +677,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, case BTRFS_SHARED_BLOCK_REF_KEY: ret = __add_prelim_ref(prefs, 0, NULL, *info_level + 1, offset, - bytenr, 1); + bytenr, 1, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { struct btrfs_shared_data_ref *sdref; @@ -658,13 +686,13 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, sdref = (struct btrfs_shared_data_ref *)(iref + 1); count = btrfs_shared_data_ref_count(leaf, sdref); ret = __add_prelim_ref(prefs, 0, NULL, 0, offset, - bytenr, count); + bytenr, count, GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: ret = __add_prelim_ref(prefs, offset, NULL, *info_level + 1, 0, - bytenr, 1); + bytenr, 1, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_extent_data_ref *dref; @@ -679,7 +707,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, key.offset = btrfs_extent_data_ref_offset(leaf, dref); root = btrfs_extent_data_ref_root(leaf, dref); ret = __add_prelim_ref(prefs, root, &key, 0, 0, - bytenr, count); + bytenr, count, GFP_NOFS); break; } default: @@ -730,7 +758,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, case BTRFS_SHARED_BLOCK_REF_KEY: ret = __add_prelim_ref(prefs, 0, NULL, info_level + 1, key.offset, - bytenr, 1); + bytenr, 1, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { struct btrfs_shared_data_ref *sdref; @@ -740,13 +768,13 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, struct btrfs_shared_data_ref); count = btrfs_shared_data_ref_count(leaf, sdref); ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset, - bytenr, count); + bytenr, count, GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: ret = __add_prelim_ref(prefs, key.offset, NULL, info_level + 1, 0, - bytenr, 1); + bytenr, 1, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_extent_data_ref *dref; @@ -762,7 +790,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, key.offset = btrfs_extent_data_ref_offset(leaf, dref); root = btrfs_extent_data_ref_root(leaf, dref); ret = __add_prelim_ref(prefs, root, &key, 0, 0, - bytenr, count); + bytenr, count, GFP_NOFS); break; } default: @@ -795,7 +823,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head; int info_level = 0; int ret; - int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT); struct list_head prefs_delayed; struct list_head prefs; struct __prelim_ref *ref; @@ -804,13 +831,17 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&prefs_delayed); key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = (u64)-1; + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->search_commit_root = !!search_commit_root; + if (!trans) + path->search_commit_root = 1; /* * grab both a lock on the path and a lock on the delayed ref head. @@ -825,7 +856,7 @@ again: goto out; BUG_ON(ret == 0); - if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) { + if (trans) { /* * look if there are updates for this ref queued and lock the * head @@ -869,7 +900,8 @@ again: slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid == bytenr && - key.type == BTRFS_EXTENT_ITEM_KEY) { + (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY)) { ret = __add_inline_refs(fs_info, path, bytenr, &info_level, &prefs); if (ret) @@ -890,8 +922,8 @@ again: __merge_refs(&prefs, 1); - ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq, - &prefs, extent_item_pos); + ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs, + extent_item_pos); if (ret) goto out; @@ -899,7 +931,6 @@ again: while (!list_empty(&prefs)) { ref = list_first_entry(&prefs, struct __prelim_ref, list); - list_del(&ref->list); WARN_ON(ref->count < 0); if (ref->count && ref->root_id && ref->parent == 0) { /* no parent == root of tree */ @@ -923,8 +954,10 @@ again: } ret = find_extent_in_eb(eb, bytenr, *extent_item_pos, &eie); - ref->inode_list = eie; free_extent_buffer(eb); + if (ret < 0) + goto out; + ref->inode_list = eie; } ret = ulist_add_merge(refs, ref->parent, (uintptr_t)ref->inode_list, @@ -942,7 +975,8 @@ again: eie->next = ref->inode_list; } } - kfree(ref); + list_del(&ref->list); + kmem_cache_free(btrfs_prelim_ref_cache, ref); } out: @@ -950,13 +984,13 @@ out: while (!list_empty(&prefs)) { ref = list_first_entry(&prefs, struct __prelim_ref, list); list_del(&ref->list); - kfree(ref); + kmem_cache_free(btrfs_prelim_ref_cache, ref); } while (!list_empty(&prefs_delayed)) { ref = list_first_entry(&prefs_delayed, struct __prelim_ref, list); list_del(&ref->list); - kfree(ref); + kmem_cache_free(btrfs_prelim_ref_cache, ref); } return ret; @@ -1283,12 +1317,16 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, { int ret; u64 flags; + u64 size = 0; u32 item_size; struct extent_buffer *eb; struct btrfs_extent_item *ei; struct btrfs_key key; - key.type = BTRFS_EXTENT_ITEM_KEY; + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; key.objectid = logical; key.offset = (u64)-1; @@ -1301,11 +1339,16 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, return ret; btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); - if (found_key->type != BTRFS_EXTENT_ITEM_KEY || + if (found_key->type == BTRFS_METADATA_ITEM_KEY) + size = fs_info->extent_root->leafsize; + else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) + size = found_key->offset; + + if ((found_key->type != BTRFS_EXTENT_ITEM_KEY && + found_key->type != BTRFS_METADATA_ITEM_KEY) || found_key->objectid > logical || - found_key->objectid + found_key->offset <= logical) { - pr_debug("logical %llu is not within any extent\n", - (unsigned long long)logical); + found_key->objectid + size <= logical) { + pr_debug("logical %llu is not within any extent\n", logical); return -ENOENT; } @@ -1318,11 +1361,8 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, pr_debug("logical %llu is at position %llu within the extent (%llu " "EXTENT_ITEM %llu) flags %#llx size %u\n", - (unsigned long long)logical, - (unsigned long long)(logical - found_key->objectid), - (unsigned long long)found_key->objectid, - (unsigned long long)found_key->offset, - (unsigned long long)flags, item_size); + logical, logical - found_key->objectid, found_key->objectid, + found_key->offset, flags, item_size); WARN_ON(!flags_ret); if (flags_ret) { @@ -1459,7 +1499,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, iterate_extent_inodes_t *iterate, void *ctx) { int ret; - struct btrfs_trans_handle *trans; + struct btrfs_trans_handle *trans = NULL; struct ulist *refs = NULL; struct ulist *roots = NULL; struct ulist_node *ref_node = NULL; @@ -1471,9 +1511,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, pr_debug("resolving all inodes for extent %llu\n", extent_item_objectid); - if (search_commit_root) { - trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT; - } else { + if (!search_commit_root) { trans = btrfs_join_transaction(fs_info->extent_root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -1496,7 +1534,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, while (!ret && (root_node = ulist_next(roots, &root_uiter))) { pr_debug("root %llu references leaf %llu, data list " "%#llx\n", root_node->val, ref_node->val, - (long long)ref_node->aux); + ref_node->aux); ret = iterate_leaf_refs((struct extent_inode_elem *) (uintptr_t)ref_node->aux, root_node->val, @@ -1588,9 +1626,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, name_len = btrfs_inode_ref_name_len(eb, iref); /* path must be released before calling iterate()! */ pr_debug("following ref at offset %u for inode %llu in " - "tree %llu\n", cur, - (unsigned long long)found_key.objectid, - (unsigned long long)fs_root->objectid); + "tree %llu\n", cur, found_key.objectid, + fs_root->objectid); ret = iterate(parent, name_len, (unsigned long)(iref + 1), eb, ctx); if (ret) diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 0f446d7ca2c0..a910b27a8ad9 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -23,8 +23,6 @@ #include "ulist.h" #include "extent_io.h" -#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0) - struct inode_fs_paths { struct btrfs_path *btrfs_path; struct btrfs_root *fs_root; @@ -74,4 +72,6 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, struct btrfs_inode_extref **ret_extref, u64 *found_off); +int __init btrfs_prelim_ref_init(void); +void btrfs_prelim_ref_exit(void); #endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 08b286b2a2c5..71f074e1870b 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -213,11 +213,35 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode) static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) { if (BTRFS_I(inode)->logged_trans == generation && - BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit) + BTRFS_I(inode)->last_sub_trans <= + BTRFS_I(inode)->last_log_commit && + BTRFS_I(inode)->last_sub_trans <= + BTRFS_I(inode)->root->last_log_commit) return 1; return 0; } +struct btrfs_dio_private { + struct inode *inode; + u64 logical_offset; + u64 disk_bytenr; + u64 bytes; + void *private; + + /* number of bios pending for this dio */ + atomic_t pending_bios; + + /* IO errors */ + int errors; + + /* orig_bio is our btrfs_io_bio */ + struct bio *orig_bio; + + /* dio_bio came from fs/direct-io.c */ + struct bio *dio_bio; + u8 csum[0]; +}; + /* * Disable DIO read nolock optimization, so new dio readers will be forced * to grab i_mutex. It is used to avoid the endless truncate due to diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 1431a6965017..1c47be187240 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -701,15 +701,13 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, next_bytenr = btrfs_super_root(selected_super); if (state->print_mask & BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "root@%llu\n", - (unsigned long long)next_bytenr); + printk(KERN_INFO "root@%llu\n", next_bytenr); break; case 1: next_bytenr = btrfs_super_chunk_root(selected_super); if (state->print_mask & BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "chunk@%llu\n", - (unsigned long long)next_bytenr); + printk(KERN_INFO "chunk@%llu\n", next_bytenr); break; case 2: next_bytenr = btrfs_super_log_root(selected_super); @@ -717,8 +715,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, continue; if (state->print_mask & BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "log@%llu\n", - (unsigned long long)next_bytenr); + printk(KERN_INFO "log@%llu\n", next_bytenr); break; } @@ -727,7 +724,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, num_copies); + next_bytenr, num_copies); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { struct btrfsic_block *next_block; @@ -742,8 +739,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, printk(KERN_INFO "btrfsic:" " btrfsic_map_block(root @%llu," " mirror %d) failed!\n", - (unsigned long long)next_bytenr, - mirror_num); + next_bytenr, mirror_num); kfree(selected_super); return -1; } @@ -767,7 +763,6 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, if (ret < (int)PAGE_CACHE_SIZE) { printk(KERN_INFO "btrfsic: read @logical %llu failed!\n", - (unsigned long long) tmp_next_block_ctx.start); btrfsic_release_block_ctx(&tmp_next_block_ctx); kfree(selected_super); @@ -813,7 +808,7 @@ static int btrfsic_process_superblock_dev_mirror( (bh->b_data + (dev_bytenr & 4095)); if (btrfs_super_bytenr(super_tmp) != dev_bytenr || - super_tmp->magic != cpu_to_le64(BTRFS_MAGIC) || + btrfs_super_magic(super_tmp) != BTRFS_MAGIC || memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || btrfs_super_nodesize(super_tmp) != state->metablock_size || btrfs_super_leafsize(super_tmp) != state->metablock_size || @@ -847,10 +842,8 @@ static int btrfsic_process_superblock_dev_mirror( printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)" " @%llu (%s/%llu/%d)\n", superblock_bdev, - rcu_str_deref(device->name), - (unsigned long long)dev_bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, + rcu_str_deref(device->name), dev_bytenr, + dev_state->name, dev_bytenr, superblock_mirror_num); list_add(&superblock_tmp->all_blocks_node, &state->all_blocks_list); @@ -880,20 +873,20 @@ static int btrfsic_process_superblock_dev_mirror( tmp_disk_key.offset = 0; switch (pass) { case 0: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_ROOT_TREE_OBJECTID); additional_string = "initial root "; next_bytenr = btrfs_super_root(super_tmp); break; case 1: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_CHUNK_TREE_OBJECTID); additional_string = "initial chunk "; next_bytenr = btrfs_super_chunk_root(super_tmp); break; case 2: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_TREE_LOG_OBJECTID); additional_string = "initial log "; next_bytenr = btrfs_super_log_root(super_tmp); if (0 == next_bytenr) @@ -906,7 +899,7 @@ static int btrfsic_process_superblock_dev_mirror( next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, num_copies); + next_bytenr, num_copies); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { struct btrfsic_block *next_block; struct btrfsic_block_data_ctx tmp_next_block_ctx; @@ -918,8 +911,7 @@ static int btrfsic_process_superblock_dev_mirror( mirror_num)) { printk(KERN_INFO "btrfsic: btrfsic_map_block(" "bytenr @%llu, mirror %d) failed!\n", - (unsigned long long)next_bytenr, - mirror_num); + next_bytenr, mirror_num); brelse(bh); return -1; } @@ -1003,19 +995,17 @@ continue_with_new_stack_frame: (struct btrfs_leaf *)sf->hdr; if (-1 == sf->i) { - sf->nr = le32_to_cpu(leafhdr->header.nritems); + sf->nr = btrfs_stack_header_nritems(&leafhdr->header); if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "leaf %llu items %d generation %llu" " owner %llu\n", - (unsigned long long) - sf->block_ctx->start, - sf->nr, - (unsigned long long) - le64_to_cpu(leafhdr->header.generation), - (unsigned long long) - le64_to_cpu(leafhdr->header.owner)); + sf->block_ctx->start, sf->nr, + btrfs_stack_header_generation( + &leafhdr->header), + btrfs_stack_header_owner( + &leafhdr->header)); } continue_with_current_leaf_stack_frame: @@ -1047,10 +1037,10 @@ leaf_item_out_of_bounce_error: &disk_item, disk_item_offset, sizeof(struct btrfs_item)); - item_offset = le32_to_cpu(disk_item.offset); - item_size = le32_to_cpu(disk_item.size); + item_offset = btrfs_stack_item_offset(&disk_item); + item_size = btrfs_stack_item_offset(&disk_item); disk_key = &disk_item.key; - type = disk_key->type; + type = btrfs_disk_key_type(disk_key); if (BTRFS_ROOT_ITEM_KEY == type) { struct btrfs_root_item root_item; @@ -1066,7 +1056,7 @@ leaf_item_out_of_bounce_error: sf->block_ctx, &root_item, root_item_offset, item_size); - next_bytenr = le64_to_cpu(root_item.bytenr); + next_bytenr = btrfs_root_bytenr(&root_item); sf->error = btrfsic_create_link_to_next_block( @@ -1081,8 +1071,8 @@ leaf_item_out_of_bounce_error: &sf->num_copies, &sf->mirror_num, disk_key, - le64_to_cpu(root_item. - generation)); + btrfs_root_generation( + &root_item)); if (sf->error) goto one_stack_frame_backwards; @@ -1130,18 +1120,17 @@ leaf_item_out_of_bounce_error: struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr; if (-1 == sf->i) { - sf->nr = le32_to_cpu(nodehdr->header.nritems); + sf->nr = btrfs_stack_header_nritems(&nodehdr->header); if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "node %llu level %d items %d" " generation %llu owner %llu\n", - (unsigned long long) sf->block_ctx->start, nodehdr->header.level, sf->nr, - (unsigned long long) - le64_to_cpu(nodehdr->header.generation), - (unsigned long long) - le64_to_cpu(nodehdr->header.owner)); + btrfs_stack_header_generation( + &nodehdr->header), + btrfs_stack_header_owner( + &nodehdr->header)); } continue_with_current_node_stack_frame: @@ -1168,7 +1157,7 @@ continue_with_current_node_stack_frame: btrfsic_read_from_block_data( sf->block_ctx, &key_ptr, key_ptr_offset, sizeof(struct btrfs_key_ptr)); - next_bytenr = le64_to_cpu(key_ptr.blockptr); + next_bytenr = btrfs_stack_key_blockptr(&key_ptr); sf->error = btrfsic_create_link_to_next_block( state, @@ -1182,7 +1171,7 @@ continue_with_current_node_stack_frame: &sf->num_copies, &sf->mirror_num, &key_ptr.key, - le64_to_cpu(key_ptr.generation)); + btrfs_stack_key_generation(&key_ptr)); if (sf->error) goto one_stack_frame_backwards; @@ -1247,8 +1236,7 @@ static void btrfsic_read_from_block_data( unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT; WARN_ON(offset + len > block_ctx->len); - offset_in_page = (start_offset + offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + offset_in_page = (start_offset + offset) & (PAGE_CACHE_SIZE - 1); while (len > 0) { cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page)); @@ -1290,7 +1278,7 @@ static int btrfsic_create_link_to_next_block( next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, *num_copiesp); + next_bytenr, *num_copiesp); *mirror_nump = 1; } @@ -1307,7 +1295,7 @@ static int btrfsic_create_link_to_next_block( if (ret) { printk(KERN_INFO "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", - (unsigned long long)next_bytenr, *mirror_nump); + next_bytenr, *mirror_nump); btrfsic_release_block_ctx(next_block_ctx); *next_blockp = NULL; return -1; @@ -1335,20 +1323,16 @@ static int btrfsic_create_link_to_next_block( "Referenced block @%llu (%s/%llu/%d)" " found in hash table, %c," " bytenr mismatch (!= stored %llu).\n", - (unsigned long long)next_bytenr, - next_block_ctx->dev->name, - (unsigned long long)next_block_ctx->dev_bytenr, - *mirror_nump, + next_bytenr, next_block_ctx->dev->name, + next_block_ctx->dev_bytenr, *mirror_nump, btrfsic_get_block_type(state, next_block), - (unsigned long long)next_block->logical_bytenr); + next_block->logical_bytenr); } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "Referenced block @%llu (%s/%llu/%d)" " found in hash table, %c.\n", - (unsigned long long)next_bytenr, - next_block_ctx->dev->name, - (unsigned long long)next_block_ctx->dev_bytenr, - *mirror_nump, + next_bytenr, next_block_ctx->dev->name, + next_block_ctx->dev_bytenr, *mirror_nump, btrfsic_get_block_type(state, next_block)); next_block->logical_bytenr = next_bytenr; @@ -1400,7 +1384,7 @@ static int btrfsic_create_link_to_next_block( if (ret < (int)next_block_ctx->len) { printk(KERN_INFO "btrfsic: read block @logical %llu failed!\n", - (unsigned long long)next_bytenr); + next_bytenr); btrfsic_release_block_ctx(next_block_ctx); *next_blockp = NULL; return -1; @@ -1444,12 +1428,12 @@ static int btrfsic_handle_extent_data( file_extent_item_offset, offsetof(struct btrfs_file_extent_item, disk_num_bytes)); if (BTRFS_FILE_EXTENT_REG != file_extent_item.type || - ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) { + btrfs_stack_file_extent_disk_bytenr(&file_extent_item) == 0) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n", file_extent_item.type, - (unsigned long long) - le64_to_cpu(file_extent_item.disk_bytenr)); + btrfs_stack_file_extent_disk_bytenr( + &file_extent_item)); return 0; } @@ -1463,20 +1447,19 @@ static int btrfsic_handle_extent_data( btrfsic_read_from_block_data(block_ctx, &file_extent_item, file_extent_item_offset, sizeof(struct btrfs_file_extent_item)); - next_bytenr = le64_to_cpu(file_extent_item.disk_bytenr) + - le64_to_cpu(file_extent_item.offset); - generation = le64_to_cpu(file_extent_item.generation); - num_bytes = le64_to_cpu(file_extent_item.num_bytes); - generation = le64_to_cpu(file_extent_item.generation); + next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item) + + btrfs_stack_file_extent_offset(&file_extent_item); + generation = btrfs_stack_file_extent_generation(&file_extent_item); + num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); + generation = btrfs_stack_file_extent_generation(&file_extent_item); if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu," " offset = %llu, num_bytes = %llu\n", file_extent_item.type, - (unsigned long long) - le64_to_cpu(file_extent_item.disk_bytenr), - (unsigned long long)le64_to_cpu(file_extent_item.offset), - (unsigned long long)num_bytes); + btrfs_stack_file_extent_disk_bytenr(&file_extent_item), + btrfs_stack_file_extent_offset(&file_extent_item), + num_bytes); while (num_bytes > 0) { u32 chunk_len; int num_copies; @@ -1492,7 +1475,7 @@ static int btrfsic_handle_extent_data( next_bytenr, state->datablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, num_copies); + next_bytenr, num_copies); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { struct btrfsic_block_data_ctx next_block_ctx; struct btrfsic_block *next_block; @@ -1504,8 +1487,7 @@ static int btrfsic_handle_extent_data( if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) printk(KERN_INFO "\tdisk_bytenr = %llu, num_bytes %u\n", - (unsigned long long)next_bytenr, - chunk_len); + next_bytenr, chunk_len); ret = btrfsic_map_block(state, next_bytenr, chunk_len, &next_block_ctx, mirror_num); @@ -1513,8 +1495,7 @@ static int btrfsic_handle_extent_data( printk(KERN_INFO "btrfsic: btrfsic_map_block(@%llu," " mirror=%d) failed!\n", - (unsigned long long)next_bytenr, - mirror_num); + next_bytenr, mirror_num); return -1; } @@ -1543,12 +1524,10 @@ static int btrfsic_handle_extent_data( " found in hash table, D," " bytenr mismatch" " (!= stored %llu).\n", - (unsigned long long)next_bytenr, + next_bytenr, next_block_ctx.dev->name, - (unsigned long long) next_block_ctx.dev_bytenr, mirror_num, - (unsigned long long) next_block->logical_bytenr); } next_block->logical_bytenr = next_bytenr; @@ -1675,7 +1654,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) { printk(KERN_INFO "btrfsic: read_block() with unaligned bytenr %llu\n", - (unsigned long long)block_ctx->dev_bytenr); + block_ctx->dev_bytenr); return -1; } @@ -1772,10 +1751,8 @@ static void btrfsic_dump_database(struct btrfsic_state *state) printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n", btrfsic_get_block_type(state, b_all), - (unsigned long long)b_all->logical_bytenr, - b_all->dev_state->name, - (unsigned long long)b_all->dev_bytenr, - b_all->mirror_num); + b_all->logical_bytenr, b_all->dev_state->name, + b_all->dev_bytenr, b_all->mirror_num); list_for_each(elem_ref_to, &b_all->ref_to_list) { const struct btrfsic_block_link *const l = @@ -1787,16 +1764,13 @@ static void btrfsic_dump_database(struct btrfsic_state *state) " refers %u* to" " %c @%llu (%s/%llu/%d)\n", btrfsic_get_block_type(state, b_all), - (unsigned long long)b_all->logical_bytenr, - b_all->dev_state->name, - (unsigned long long)b_all->dev_bytenr, - b_all->mirror_num, + b_all->logical_bytenr, b_all->dev_state->name, + b_all->dev_bytenr, b_all->mirror_num, l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) l->block_ref_to->logical_bytenr, l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); } @@ -1810,16 +1784,12 @@ static void btrfsic_dump_database(struct btrfsic_state *state) " is ref %u* from" " %c @%llu (%s/%llu/%d)\n", btrfsic_get_block_type(state, b_all), - (unsigned long long)b_all->logical_bytenr, - b_all->dev_state->name, - (unsigned long long)b_all->dev_bytenr, - b_all->mirror_num, + b_all->logical_bytenr, b_all->dev_state->name, + b_all->dev_bytenr, b_all->mirror_num, l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_from), - (unsigned long long) l->block_ref_from->logical_bytenr, l->block_ref_from->dev_state->name, - (unsigned long long) l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num); } @@ -1896,8 +1866,8 @@ again: struct list_head *tmp_ref_to; if (block->is_superblock) { - bytenr = le64_to_cpu(((struct btrfs_super_block *) - mapped_datav[0])->bytenr); + bytenr = btrfs_super_bytenr((struct btrfs_super_block *) + mapped_datav[0]); if (num_pages * PAGE_CACHE_SIZE < BTRFS_SUPER_INFO_SIZE) { printk(KERN_INFO @@ -1923,8 +1893,9 @@ again: return; } processed_len = state->metablock_size; - bytenr = le64_to_cpu(((struct btrfs_header *) - mapped_datav[0])->bytenr); + bytenr = btrfs_stack_header_bytenr( + (struct btrfs_header *) + mapped_datav[0]); btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, dev_bytenr); @@ -1935,12 +1906,9 @@ again: " found in hash table, %c," " bytenr mismatch" " (!= stored %llu).\n", - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, + bytenr, dev_state->name, dev_bytenr, block->mirror_num, btrfsic_get_block_type(state, block), - (unsigned long long) block->logical_bytenr); block->logical_bytenr = bytenr; } else if (state->print_mask & @@ -1948,9 +1916,7 @@ again: printk(KERN_INFO "Written block @%llu (%s/%llu/%d)" " found in hash table, %c.\n", - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, + bytenr, dev_state->name, dev_bytenr, block->mirror_num, btrfsic_get_block_type(state, block)); } else { @@ -1966,9 +1932,7 @@ again: printk(KERN_INFO "Written block @%llu (%s/%llu/%d)" " found in hash table, %c.\n", - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, + bytenr, dev_state->name, dev_bytenr, block->mirror_num, btrfsic_get_block_type(state, block)); } @@ -1985,21 +1949,14 @@ again: " new(gen=%llu)," " which is referenced by most recent superblock" " (superblockgen=%llu)!\n", - btrfsic_get_block_type(state, block), - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - block->mirror_num, - (unsigned long long)block->generation, - (unsigned long long) - le64_to_cpu(block->disk_key.objectid), + btrfsic_get_block_type(state, block), bytenr, + dev_state->name, dev_bytenr, block->mirror_num, + block->generation, + btrfs_disk_key_objectid(&block->disk_key), block->disk_key.type, - (unsigned long long) - le64_to_cpu(block->disk_key.offset), - (unsigned long long) - le64_to_cpu(((struct btrfs_header *) - mapped_datav[0])->generation), - (unsigned long long) + btrfs_disk_key_offset(&block->disk_key), + btrfs_stack_header_generation( + (struct btrfs_header *) mapped_datav[0]), state->max_superblock_generation); btrfsic_dump_tree(state); } @@ -2008,15 +1965,12 @@ again: printk(KERN_INFO "btrfs: attempt to overwrite %c-block" " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu," " which is not yet iodone!\n", - btrfsic_get_block_type(state, block), - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - block->mirror_num, - (unsigned long long)block->generation, - (unsigned long long) - le64_to_cpu(((struct btrfs_header *) - mapped_datav[0])->generation)); + btrfsic_get_block_type(state, block), bytenr, + dev_state->name, dev_bytenr, block->mirror_num, + block->generation, + btrfs_stack_header_generation( + (struct btrfs_header *) + mapped_datav[0])); /* it would not be safe to go on */ btrfsic_dump_tree(state); goto continue_loop; @@ -2056,7 +2010,7 @@ again: if (ret) { printk(KERN_INFO "btrfsic: btrfsic_map_block(root @%llu)" - " failed!\n", (unsigned long long)bytenr); + " failed!\n", bytenr); goto continue_loop; } block_ctx.datav = mapped_datav; @@ -2140,7 +2094,7 @@ again: printk(KERN_INFO "btrfsic: btrfsic_process_metablock" "(root @%llu) failed!\n", - (unsigned long long)dev_bytenr); + dev_bytenr); } else { block->is_metadata = 0; block->mirror_num = 0; /* unknown */ @@ -2168,8 +2122,7 @@ again: if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "Written block (%s/%llu/?)" " !found in hash table, D.\n", - dev_state->name, - (unsigned long long)dev_bytenr); + dev_state->name, dev_bytenr); if (!state->include_extent_data) { /* ignore that written D block */ goto continue_loop; @@ -2184,17 +2137,16 @@ again: block_ctx.pagev = NULL; } else { processed_len = state->metablock_size; - bytenr = le64_to_cpu(((struct btrfs_header *) - mapped_datav[0])->bytenr); + bytenr = btrfs_stack_header_bytenr( + (struct btrfs_header *) + mapped_datav[0]); btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, dev_bytenr); if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "Written block @%llu (%s/%llu/?)" " !found in hash table, M.\n", - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr); + bytenr, dev_state->name, dev_bytenr); ret = btrfsic_map_block(state, bytenr, processed_len, &block_ctx, 0); @@ -2202,7 +2154,7 @@ again: printk(KERN_INFO "btrfsic: btrfsic_map_block(root @%llu)" " failed!\n", - (unsigned long long)dev_bytenr); + dev_bytenr); goto continue_loop; } } @@ -2267,10 +2219,8 @@ again: printk(KERN_INFO "New written %c-block @%llu (%s/%llu/%d)\n", is_metadata ? 'M' : 'D', - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num); + block->logical_bytenr, block->dev_state->name, + block->dev_bytenr, block->mirror_num); list_add(&block->all_blocks_node, &state->all_blocks_list); btrfsic_block_hashtable_add(block, &state->block_hashtable); @@ -2281,7 +2231,7 @@ again: printk(KERN_INFO "btrfsic: process_metablock(root @%llu)" " failed!\n", - (unsigned long long)dev_bytenr); + dev_bytenr); } btrfsic_release_block_ctx(&block_ctx); } @@ -2319,10 +2269,8 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", bio_error_status, btrfsic_get_block_type(dev_state->state, block), - (unsigned long long)block->logical_bytenr, - dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num); + block->logical_bytenr, dev_state->name, + block->dev_bytenr, block->mirror_num); next_block = block->next_in_same_bio; block->iodone_w_error = iodone_w_error; if (block->submit_bio_bh_rw & REQ_FLUSH) { @@ -2332,7 +2280,6 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) printk(KERN_INFO "bio_end_io() new %s flush_gen=%llu\n", dev_state->name, - (unsigned long long) dev_state->last_flush_gen); } if (block->submit_bio_bh_rw & REQ_FUA) @@ -2358,10 +2305,8 @@ static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n", iodone_w_error, btrfsic_get_block_type(dev_state->state, block), - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num); + block->logical_bytenr, block->dev_state->name, + block->dev_bytenr, block->mirror_num); block->iodone_w_error = iodone_w_error; if (block->submit_bio_bh_rw & REQ_FLUSH) { @@ -2370,8 +2315,7 @@ static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) printk(KERN_INFO "bh_end_io() new %s flush_gen=%llu\n", - dev_state->name, - (unsigned long long)dev_state->last_flush_gen); + dev_state->name, dev_state->last_flush_gen); } if (block->submit_bio_bh_rw & REQ_FUA) block->flush_gen = 0; /* FUA completed means block is on disk */ @@ -2396,26 +2340,20 @@ static int btrfsic_process_written_superblock( printk(KERN_INFO "btrfsic: superblock @%llu (%s/%llu/%d)" " with old gen %llu <= %llu\n", - (unsigned long long)superblock->logical_bytenr, + superblock->logical_bytenr, superblock->dev_state->name, - (unsigned long long)superblock->dev_bytenr, - superblock->mirror_num, - (unsigned long long) + superblock->dev_bytenr, superblock->mirror_num, btrfs_super_generation(super_hdr), - (unsigned long long) state->max_superblock_generation); } else { if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) printk(KERN_INFO "btrfsic: got new superblock @%llu (%s/%llu/%d)" " with new gen %llu > %llu\n", - (unsigned long long)superblock->logical_bytenr, + superblock->logical_bytenr, superblock->dev_state->name, - (unsigned long long)superblock->dev_bytenr, - superblock->mirror_num, - (unsigned long long) + superblock->dev_bytenr, superblock->mirror_num, btrfs_super_generation(super_hdr), - (unsigned long long) state->max_superblock_generation); state->max_superblock_generation = @@ -2432,43 +2370,41 @@ static int btrfsic_process_written_superblock( int num_copies; int mirror_num; const char *additional_string = NULL; - struct btrfs_disk_key tmp_disk_key; + struct btrfs_disk_key tmp_disk_key = {0}; - tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; - tmp_disk_key.offset = 0; + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_ROOT_ITEM_KEY); + btrfs_set_disk_key_objectid(&tmp_disk_key, 0); switch (pass) { case 0: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_ROOT_TREE_OBJECTID); additional_string = "root "; next_bytenr = btrfs_super_root(super_hdr); if (state->print_mask & BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "root@%llu\n", - (unsigned long long)next_bytenr); + printk(KERN_INFO "root@%llu\n", next_bytenr); break; case 1: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_CHUNK_TREE_OBJECTID); additional_string = "chunk "; next_bytenr = btrfs_super_chunk_root(super_hdr); if (state->print_mask & BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "chunk@%llu\n", - (unsigned long long)next_bytenr); + printk(KERN_INFO "chunk@%llu\n", next_bytenr); break; case 2: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_TREE_LOG_OBJECTID); additional_string = "log "; next_bytenr = btrfs_super_log_root(super_hdr); if (0 == next_bytenr) continue; if (state->print_mask & BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "log@%llu\n", - (unsigned long long)next_bytenr); + printk(KERN_INFO "log@%llu\n", next_bytenr); break; } @@ -2477,7 +2413,7 @@ static int btrfsic_process_written_superblock( next_bytenr, BTRFS_SUPER_INFO_SIZE); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, num_copies); + next_bytenr, num_copies); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { int was_created; @@ -2493,8 +2429,7 @@ static int btrfsic_process_written_superblock( printk(KERN_INFO "btrfsic: btrfsic_map_block(@%llu," " mirror=%d) failed!\n", - (unsigned long long)next_bytenr, - mirror_num); + next_bytenr, mirror_num); return -1; } @@ -2579,26 +2514,22 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, " %u* refers to %c @%llu (%s/%llu/%d)\n", recursion_level, btrfsic_get_block_type(state, block), - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num, + block->logical_bytenr, block->dev_state->name, + block->dev_bytenr, block->mirror_num, l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) l->block_ref_to->logical_bytenr, l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); if (l->block_ref_to->never_written) { printk(KERN_INFO "btrfs: attempt to write superblock" " which references block %c @%llu (%s/%llu/%d)" " which is never written!\n", btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) l->block_ref_to->logical_bytenr, l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); ret = -1; } else if (!l->block_ref_to->is_iodone) { @@ -2606,10 +2537,9 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, " which references block %c @%llu (%s/%llu/%d)" " which is not yet iodone!\n", btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) l->block_ref_to->logical_bytenr, l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); ret = -1; } else if (l->block_ref_to->iodone_w_error) { @@ -2617,10 +2547,9 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, " which references block %c @%llu (%s/%llu/%d)" " which has write error!\n", btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) l->block_ref_to->logical_bytenr, l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); ret = -1; } else if (l->parent_generation != @@ -2634,13 +2563,12 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, " with generation %llu !=" " parent generation %llu!\n", btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) l->block_ref_to->logical_bytenr, l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num, - (unsigned long long)l->block_ref_to->generation, - (unsigned long long)l->parent_generation); + l->block_ref_to->generation, + l->parent_generation); ret = -1; } else if (l->block_ref_to->flush_gen > l->block_ref_to->dev_state->last_flush_gen) { @@ -2650,13 +2578,10 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, " (block flush_gen=%llu," " dev->flush_gen=%llu)!\n", btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) l->block_ref_to->logical_bytenr, l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num, - (unsigned long long)block->flush_gen, - (unsigned long long) + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num, block->flush_gen, l->block_ref_to->dev_state->last_flush_gen); ret = -1; } else if (-1 == btrfsic_check_all_ref_blocks(state, @@ -2701,16 +2626,12 @@ static int btrfsic_is_block_ref_by_superblock( " is ref %u* from %c @%llu (%s/%llu/%d)\n", recursion_level, btrfsic_get_block_type(state, block), - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num, + block->logical_bytenr, block->dev_state->name, + block->dev_bytenr, block->mirror_num, l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_from), - (unsigned long long) l->block_ref_from->logical_bytenr, l->block_ref_from->dev_state->name, - (unsigned long long) l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num); if (l->block_ref_from->is_superblock && @@ -2737,14 +2658,12 @@ static void btrfsic_print_add_link(const struct btrfsic_state *state, " to %c @%llu (%s/%llu/%d).\n", l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_from), - (unsigned long long)l->block_ref_from->logical_bytenr, + l->block_ref_from->logical_bytenr, l->block_ref_from->dev_state->name, - (unsigned long long)l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num, + l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long)l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); } @@ -2756,14 +2675,12 @@ static void btrfsic_print_rem_link(const struct btrfsic_state *state, " to %c @%llu (%s/%llu/%d).\n", l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_from), - (unsigned long long)l->block_ref_from->logical_bytenr, + l->block_ref_from->logical_bytenr, l->block_ref_from->dev_state->name, - (unsigned long long)l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num, + l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long)l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); } @@ -2807,10 +2724,8 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, */ indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)", btrfsic_get_block_type(state, block), - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num); + block->logical_bytenr, block->dev_state->name, + block->dev_bytenr, block->mirror_num); if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { printk("[...]\n"); return; @@ -2943,10 +2858,8 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add( "New %s%c-block @%llu (%s/%llu/%d)\n", additional_string, btrfsic_get_block_type(state, block), - (unsigned long long)block->logical_bytenr, - dev_state->name, - (unsigned long long)block->dev_bytenr, - mirror_num); + block->logical_bytenr, dev_state->name, + block->dev_bytenr, mirror_num); list_add(&block->all_blocks_node, &state->all_blocks_list); btrfsic_block_hashtable_add(block, &state->block_hashtable); if (NULL != was_created) @@ -2980,7 +2893,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, printk(KERN_INFO "btrfsic:" " btrfsic_map_block(logical @%llu," " mirror %d) failed!\n", - (unsigned long long)bytenr, mirror_num); + bytenr, mirror_num); continue; } @@ -2997,8 +2910,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio," " buffer->log_bytenr=%llu, submit_bio(bdev=%s," " phys_bytenr=%llu)!\n", - (unsigned long long)bytenr, dev_state->name, - (unsigned long long)dev_bytenr); + bytenr, dev_state->name, dev_bytenr); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { ret = btrfsic_map_block(state, bytenr, state->metablock_size, @@ -3008,10 +2920,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, printk(KERN_INFO "Read logical bytenr @%llu maps to" " (%s/%llu/%d)\n", - (unsigned long long)bytenr, - block_ctx.dev->name, - (unsigned long long)block_ctx.dev_bytenr, - mirror_num); + bytenr, block_ctx.dev->name, + block_ctx.dev_bytenr, mirror_num); } WARN_ON(1); } @@ -3048,12 +2958,10 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh) if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO - "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu)," - " size=%lu, data=%p, bdev=%p)\n", - rw, (unsigned long)bh->b_blocknr, - (unsigned long long)dev_bytenr, - (unsigned long)bh->b_size, bh->b_data, - bh->b_bdev); + "submit_bh(rw=0x%x, blocknr=%llu (bytenr %llu)," + " size=%zu, data=%p, bdev=%p)\n", + rw, (unsigned long long)bh->b_blocknr, + dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev); btrfsic_process_written_block(dev_state, dev_bytenr, &bh->b_data, 1, NULL, NULL, bh, rw); @@ -3118,9 +3026,9 @@ void btrfsic_submit_bio(int rw, struct bio *bio) BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO "submit_bio(rw=0x%x, bi_vcnt=%u," - " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n", - rw, bio->bi_vcnt, (unsigned long)bio->bi_sector, - (unsigned long long)dev_bytenr, + " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", + rw, bio->bi_vcnt, + (unsigned long long)bio->bi_sector, dev_bytenr, bio->bi_bdev); mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt, @@ -3213,19 +3121,19 @@ int btrfsic_mount(struct btrfs_root *root, if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) { printk(KERN_INFO "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", - root->nodesize, (unsigned long)PAGE_CACHE_SIZE); + root->nodesize, PAGE_CACHE_SIZE); return -1; } if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) { printk(KERN_INFO "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", - root->leafsize, (unsigned long)PAGE_CACHE_SIZE); + root->leafsize, PAGE_CACHE_SIZE); return -1; } if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) { printk(KERN_INFO "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", - root->sectorsize, (unsigned long)PAGE_CACHE_SIZE); + root->sectorsize, PAGE_CACHE_SIZE); return -1; } state = kzalloc(sizeof(*state), GFP_NOFS); @@ -3369,10 +3277,8 @@ void btrfsic_unmount(struct btrfs_root *root, " @%llu (%s/%llu/%d) on umount which is" " not yet iodone!\n", btrfsic_get_block_type(state, b_all), - (unsigned long long)b_all->logical_bytenr, - b_all->dev_state->name, - (unsigned long long)b_all->dev_bytenr, - b_all->mirror_num); + b_all->logical_bytenr, b_all->dev_state->name, + b_all->dev_bytenr, b_all->mirror_num); } mutex_unlock(&btrfsic_mutex); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b189bd1e7a3e..6aad98cb343f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -132,9 +132,8 @@ static int check_compressed_csum(struct inode *inode, printk(KERN_INFO "btrfs csum failed ino %llu " "extent %llu csum %u " "wanted %u mirror %d\n", - (unsigned long long)btrfs_ino(inode), - (unsigned long long)disk_start, - csum, *cb_sum, cb->mirror_num); + btrfs_ino(inode), disk_start, csum, *cb_sum, + cb->mirror_num); ret = -EIO; goto fail; } @@ -639,7 +638,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, faili = nr_pages - 1; cb->nr_pages = nr_pages; - add_ra_bio_pages(inode, em_start + em_len, cb); + /* In the parent-locked case, we only locked the range we are + * interested in. In all other cases, we can opportunistically + * cache decompressed data that goes beyond the requested range. */ + if (!(bio_flags & EXTENT_BIO_PARENT_LOCKED)) + add_ra_bio_pages(inode, em_start + em_len, cb); /* include any pages we added in add_ra-bio_pages */ uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 02fae7f7e42c..61b5bcd57b7e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -274,8 +274,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, new_root_objectid); - write_extent_buffer(cow, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(cow), + write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow), BTRFS_FSID_SIZE); WARN_ON(btrfs_header_generation(buf) > trans->transid); @@ -484,8 +483,27 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) struct rb_node **new; struct rb_node *parent = NULL; struct tree_mod_elem *cur; + int ret = 0; - BUG_ON(!tm || !tm->seq); + BUG_ON(!tm); + + tree_mod_log_write_lock(fs_info); + if (list_empty(&fs_info->tree_mod_seq_list)) { + tree_mod_log_write_unlock(fs_info); + /* + * Ok we no longer care about logging modifications, free up tm + * and return 0. Any callers shouldn't be using tm after + * calling tree_mod_log_insert, but if they do we can just + * change this to return a special error code to let the callers + * do their own thing. + */ + kfree(tm); + return 0; + } + + spin_lock(&fs_info->tree_mod_seq_lock); + tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info); + spin_unlock(&fs_info->tree_mod_seq_lock); tm_root = &fs_info->tree_mod_log; new = &tm_root->rb_node; @@ -501,14 +519,17 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) else if (cur->seq > tm->seq) new = &((*new)->rb_right); else { + ret = -EEXIST; kfree(tm); - return -EEXIST; + goto out; } } rb_link_node(&tm->node, parent, new); rb_insert_color(&tm->node, tm_root); - return 0; +out: + tree_mod_log_write_unlock(fs_info); + return ret; } /* @@ -524,57 +545,19 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, return 1; if (eb && btrfs_header_level(eb) == 0) return 1; - - tree_mod_log_write_lock(fs_info); - if (list_empty(&fs_info->tree_mod_seq_list)) { - /* - * someone emptied the list while we were waiting for the lock. - * we must not add to the list when no blocker exists. - */ - tree_mod_log_write_unlock(fs_info); - return 1; - } - return 0; } -/* - * This allocates memory and gets a tree modification sequence number. - * - * Returns <0 on error. - * Returns >0 (the added sequence number) on success. - */ -static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, - struct tree_mod_elem **tm_ret) -{ - struct tree_mod_elem *tm; - - /* - * once we switch from spin locks to something different, we should - * honor the flags parameter here. - */ - tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC); - if (!tm) - return -ENOMEM; - - spin_lock(&fs_info->tree_mod_seq_lock); - tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info); - spin_unlock(&fs_info->tree_mod_seq_lock); - - return tm->seq; -} - static inline int __tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int slot, enum mod_log_op op, gfp_t flags) { - int ret; struct tree_mod_elem *tm; - ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret < 0) - return ret; + tm = kzalloc(sizeof(*tm), flags); + if (!tm) + return -ENOMEM; tm->index = eb->start >> PAGE_CACHE_SHIFT; if (op != MOD_LOG_KEY_ADD) { @@ -589,34 +572,14 @@ __tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, } static noinline int -tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) +tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) { - int ret; - if (tree_mod_dont_log(fs_info, eb)) return 0; - ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags); - - tree_mod_log_write_unlock(fs_info); - return ret; -} - -static noinline int -tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, - int slot, enum mod_log_op op) -{ - return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS); -} - -static noinline int -tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, - enum mod_log_op op) -{ - return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS); + return __tree_mod_log_insert_key(fs_info, eb, slot, op, flags); } static noinline int @@ -637,14 +600,14 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, * buffer, i.e. dst_slot < src_slot. */ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { - ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot, - MOD_LOG_KEY_REMOVE_WHILE_MOVING); + ret = __tree_mod_log_insert_key(fs_info, eb, i + dst_slot, + MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); BUG_ON(ret < 0); } - ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret < 0) - goto out; + tm = kzalloc(sizeof(*tm), flags); + if (!tm) + return -ENOMEM; tm->index = eb->start >> PAGE_CACHE_SHIFT; tm->slot = src_slot; @@ -652,10 +615,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, tm->move.nr_items = nr_items; tm->op = MOD_LOG_MOVE_KEYS; - ret = __tree_mod_log_insert(fs_info, tm); -out: - tree_mod_log_write_unlock(fs_info); - return ret; + return __tree_mod_log_insert(fs_info, tm); } static inline void @@ -670,8 +630,8 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) nritems = btrfs_header_nritems(eb); for (i = nritems - 1; i >= 0; i--) { - ret = tree_mod_log_insert_key_locked(fs_info, eb, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING); + ret = __tree_mod_log_insert_key(fs_info, eb, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); BUG_ON(ret < 0); } } @@ -683,7 +643,6 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, int log_removal) { struct tree_mod_elem *tm; - int ret; if (tree_mod_dont_log(fs_info, NULL)) return 0; @@ -691,9 +650,9 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (log_removal) __tree_mod_log_free_eb(fs_info, old_root); - ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret < 0) - goto out; + tm = kzalloc(sizeof(*tm), flags); + if (!tm) + return -ENOMEM; tm->index = new_root->start >> PAGE_CACHE_SHIFT; tm->old_root.logical = old_root->start; @@ -701,10 +660,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, tm->generation = btrfs_header_generation(old_root); tm->op = MOD_LOG_ROOT_REPLACE; - ret = __tree_mod_log_insert(fs_info, tm); -out: - tree_mod_log_write_unlock(fs_info); - return ret; + return __tree_mod_log_insert(fs_info, tm); } static struct tree_mod_elem * @@ -784,23 +740,20 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, if (tree_mod_dont_log(fs_info, NULL)) return; - if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) { - tree_mod_log_write_unlock(fs_info); + if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) return; - } for (i = 0; i < nr_items; i++) { - ret = tree_mod_log_insert_key_locked(fs_info, src, + ret = __tree_mod_log_insert_key(fs_info, src, i + src_offset, - MOD_LOG_KEY_REMOVE); + MOD_LOG_KEY_REMOVE, GFP_NOFS); BUG_ON(ret < 0); - ret = tree_mod_log_insert_key_locked(fs_info, dst, + ret = __tree_mod_log_insert_key(fs_info, dst, i + dst_offset, - MOD_LOG_KEY_ADD); + MOD_LOG_KEY_ADD, + GFP_NOFS); BUG_ON(ret < 0); } - - tree_mod_log_write_unlock(fs_info); } static inline void @@ -819,9 +772,9 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, { int ret; - ret = tree_mod_log_insert_key_mask(fs_info, eb, slot, - MOD_LOG_KEY_REPLACE, - atomic ? GFP_ATOMIC : GFP_NOFS); + ret = __tree_mod_log_insert_key(fs_info, eb, slot, + MOD_LOG_KEY_REPLACE, + atomic ? GFP_ATOMIC : GFP_NOFS); BUG_ON(ret < 0); } @@ -830,10 +783,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { if (tree_mod_dont_log(fs_info, eb)) return; - __tree_mod_log_free_eb(fs_info, eb); - - tree_mod_log_write_unlock(fs_info); } static noinline void @@ -1046,8 +996,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, root->root_key.objectid); - write_extent_buffer(cow, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(cow), + write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow), BTRFS_FSID_SIZE); ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); @@ -1056,8 +1005,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return ret; } - if (root->ref_cows) - btrfs_reloc_cow_block(trans, root, buf, cow); + if (root->ref_cows) { + ret = btrfs_reloc_cow_block(trans, root, buf, cow); + if (ret) + return ret; + } if (buf == root->node) { WARN_ON(parent && parent != buf); @@ -1083,13 +1035,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, WARN_ON(trans->transid != btrfs_header_generation(parent)); tree_mod_log_insert_key(root->fs_info, parent, parent_slot, - MOD_LOG_KEY_REPLACE); + MOD_LOG_KEY_REPLACE, GFP_NOFS); btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); - tree_mod_log_free_eb(root->fs_info, buf); + if (last_ref) + tree_mod_log_free_eb(root->fs_info, buf); btrfs_free_tree_block(trans, root, buf, parent_start, last_ref); } @@ -1115,7 +1068,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, int looped = 0; if (!time_seq) - return 0; + return NULL; /* * the very last operation that's logged for a root is the replacement @@ -1126,7 +1079,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, tm = tree_mod_log_search_oldest(fs_info, root_logical, time_seq); if (!looped && !tm) - return 0; + return NULL; /* * if there are no tree operation for the oldest root, we simply * return it. this should only happen if that (old) root is at @@ -1161,8 +1114,8 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, * time_seq). */ static void -__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, - struct tree_mod_elem *first_tm) +__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, + u64 time_seq, struct tree_mod_elem *first_tm) { u32 n; struct rb_node *next; @@ -1172,6 +1125,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, unsigned long p_size = sizeof(struct btrfs_key_ptr); n = btrfs_header_nritems(eb); + tree_mod_log_read_lock(fs_info); while (tm && tm->seq >= time_seq) { /* * all the operations are recorded with the operator used for @@ -1226,6 +1180,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, if (tm->index != first_tm->index) break; } + tree_mod_log_read_unlock(fs_info); btrfs_set_header_nritems(eb, n); } @@ -1237,8 +1192,8 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, * is freed (its refcount is decremented). */ static struct extent_buffer * -tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, - u64 time_seq) +tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, + struct extent_buffer *eb, u64 time_seq) { struct extent_buffer *eb_rewin; struct tree_mod_elem *tm; @@ -1253,11 +1208,18 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, if (!tm) return eb; + btrfs_set_path_blocking(path); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { BUG_ON(tm->slot != 0); eb_rewin = alloc_dummy_extent_buffer(eb->start, fs_info->tree_root->nodesize); - BUG_ON(!eb_rewin); + if (!eb_rewin) { + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + return NULL; + } btrfs_set_header_bytenr(eb_rewin, eb->start); btrfs_set_header_backref_rev(eb_rewin, btrfs_header_backref_rev(eb)); @@ -1265,16 +1227,20 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); } else { eb_rewin = btrfs_clone_extent_buffer(eb); - BUG_ON(!eb_rewin); + if (!eb_rewin) { + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + return NULL; + } } - extent_buffer_get(eb_rewin); - btrfs_tree_read_unlock(eb); + btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK); + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); extent_buffer_get(eb_rewin); btrfs_tree_read_lock(eb_rewin); - __tree_mod_log_rewind(eb_rewin, time_seq, tm); + __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); WARN_ON(btrfs_header_nritems(eb_rewin) > BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root)); @@ -1333,8 +1299,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq) free_extent_buffer(eb_root); eb = alloc_dummy_extent_buffer(logical, root->nodesize); } else { + btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK); eb = btrfs_clone_extent_buffer(eb_root); - btrfs_tree_read_unlock(eb_root); + btrfs_tree_read_unlock_blocking(eb_root); free_extent_buffer(eb_root); } @@ -1350,7 +1317,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) btrfs_set_header_generation(eb, old_generation); } if (tm) - __tree_mod_log_rewind(eb, time_seq, tm); + __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm); else WARN_ON(btrfs_header_level(eb) != 0); WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root)); @@ -1417,14 +1384,12 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, if (trans->transaction != root->fs_info->running_transaction) WARN(1, KERN_CRIT "trans %llu running %llu\n", - (unsigned long long)trans->transid, - (unsigned long long) + trans->transid, root->fs_info->running_transaction->transid); if (trans->transid != root->fs_info->generation) WARN(1, KERN_CRIT "trans %llu running %llu\n", - (unsigned long long)trans->transid, - (unsigned long long)root->fs_info->generation); + trans->transid, root->fs_info->generation); if (!should_cow_block(trans, root, buf)) { *cow_ret = buf; @@ -2178,12 +2143,8 @@ static void reada_for_search(struct btrfs_root *root, } } -/* - * returns -EAGAIN if it had to drop the path, or zero if everything was in - * cache - */ -static noinline int reada_for_balance(struct btrfs_root *root, - struct btrfs_path *path, int level) +static noinline void reada_for_balance(struct btrfs_root *root, + struct btrfs_path *path, int level) { int slot; int nritems; @@ -2192,12 +2153,11 @@ static noinline int reada_for_balance(struct btrfs_root *root, u64 gen; u64 block1 = 0; u64 block2 = 0; - int ret = 0; int blocksize; parent = path->nodes[level + 1]; if (!parent) - return 0; + return; nritems = btrfs_header_nritems(parent); slot = path->slots[level + 1]; @@ -2224,28 +2184,11 @@ static noinline int reada_for_balance(struct btrfs_root *root, block2 = 0; free_extent_buffer(eb); } - if (block1 || block2) { - ret = -EAGAIN; - - /* release the whole path */ - btrfs_release_path(path); - - /* read the blocks */ - if (block1) - readahead_tree_block(root, block1, blocksize, 0); - if (block2) - readahead_tree_block(root, block2, blocksize, 0); - if (block1) { - eb = read_tree_block(root, block1, blocksize, 0); - free_extent_buffer(eb); - } - if (block2) { - eb = read_tree_block(root, block2, blocksize, 0); - free_extent_buffer(eb); - } - } - return ret; + if (block1) + readahead_tree_block(root, block1, blocksize, 0); + if (block2) + readahead_tree_block(root, block2, blocksize, 0); } @@ -2359,35 +2302,28 @@ read_block_for_search(struct btrfs_trans_handle *trans, tmp = btrfs_find_tree_block(root, blocknr, blocksize); if (tmp) { /* first we do an atomic uptodate check */ - if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) { - if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { - /* - * we found an up to date block without - * sleeping, return - * right away - */ - *eb_ret = tmp; - return 0; - } - /* the pages were up to date, but we failed - * the generation number check. Do a full - * read for the generation number that is correct. - * We must do this without dropping locks so - * we can trust our generation number - */ - free_extent_buffer(tmp); - btrfs_set_path_blocking(p); + if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { + *eb_ret = tmp; + return 0; + } - /* now we're allowed to do a blocking uptodate check */ - tmp = read_tree_block(root, blocknr, blocksize, gen); - if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) { - *eb_ret = tmp; - return 0; - } - free_extent_buffer(tmp); - btrfs_release_path(p); - return -EIO; + /* the pages were up to date, but we failed + * the generation number check. Do a full + * read for the generation number that is correct. + * We must do this without dropping locks so + * we can trust our generation number + */ + btrfs_set_path_blocking(p); + + /* now we're allowed to do a blocking uptodate check */ + ret = btrfs_read_buffer(tmp, gen); + if (!ret) { + *eb_ret = tmp; + return 0; } + free_extent_buffer(tmp); + btrfs_release_path(p); + return -EIO; } /* @@ -2448,11 +2384,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, goto again; } - sret = reada_for_balance(root, p, level); - if (sret) - goto again; - btrfs_set_path_blocking(p); + reada_for_balance(root, p, level); sret = split_node(trans, root, p, level); btrfs_clear_path_blocking(p, NULL, 0); @@ -2472,11 +2405,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, goto again; } - sret = reada_for_balance(root, p, level); - if (sret) - goto again; - btrfs_set_path_blocking(p); + reada_for_balance(root, p, level); sret = balance_level(trans, root, p, level); btrfs_clear_path_blocking(p, NULL, 0); @@ -2499,6 +2429,40 @@ done: return ret; } +static void key_search_validate(struct extent_buffer *b, + struct btrfs_key *key, + int level) +{ +#ifdef CONFIG_BTRFS_ASSERT + struct btrfs_disk_key disk_key; + + btrfs_cpu_key_to_disk(&disk_key, key); + + if (level == 0) + ASSERT(!memcmp_extent_buffer(b, &disk_key, + offsetof(struct btrfs_leaf, items[0].key), + sizeof(disk_key))); + else + ASSERT(!memcmp_extent_buffer(b, &disk_key, + offsetof(struct btrfs_node, ptrs[0].key), + sizeof(disk_key))); +#endif +} + +static int key_search(struct extent_buffer *b, struct btrfs_key *key, + int level, int *prev_cmp, int *slot) +{ + if (*prev_cmp != 0) { + *prev_cmp = bin_search(b, key, level, slot); + return *prev_cmp; + } + + key_search_validate(b, key, level); + *slot = 0; + + return 0; +} + /* * look for key in the tree. path is filled in with nodes along the way * if key is found, we return zero and you can find the item in the leaf @@ -2527,6 +2491,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root int write_lock_level = 0; u8 lowest_level = 0; int min_write_lock_level; + int prev_cmp; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); @@ -2557,6 +2522,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root min_write_lock_level = write_lock_level; again: + prev_cmp = -1; /* * we try very hard to do read locks on the root */ @@ -2657,7 +2623,7 @@ cow_done: if (!cow) btrfs_unlock_up_safe(p, level + 1); - ret = bin_search(b, key, level, &slot); + ret = key_search(b, key, level, &prev_cmp, &slot); if (level != 0) { int dec = 0; @@ -2792,6 +2758,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, int level; int lowest_unlock = 1; u8 lowest_level = 0; + int prev_cmp; lowest_level = p->lowest_level; WARN_ON(p->nodes[0] != NULL); @@ -2802,6 +2769,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, } again: + prev_cmp = -1; b = get_old_root(root, time_seq); level = btrfs_header_level(b); p->locks[level] = BTRFS_READ_LOCK; @@ -2819,7 +2787,7 @@ again: */ btrfs_unlock_up_safe(p, level + 1); - ret = bin_search(b, key, level, &slot); + ret = key_search(b, key, level, &prev_cmp, &slot); if (level != 0) { int dec = 0; @@ -2853,7 +2821,11 @@ again: btrfs_clear_path_blocking(p, b, BTRFS_READ_LOCK); } - b = tree_mod_log_rewind(root->fs_info, b, time_seq); + b = tree_mod_log_rewind(root->fs_info, p, b, time_seq); + if (!b) { + ret = -ENOMEM; + goto done; + } p->locks[level] = BTRFS_READ_LOCK; p->nodes[level] = b; } else { @@ -3143,7 +3115,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, */ static noinline int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int log_removal) + struct btrfs_path *path, int level) { u64 lower_gen; struct extent_buffer *lower; @@ -3176,13 +3148,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(c, root->root_key.objectid); - write_extent_buffer(c, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(c), + write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(c), BTRFS_FSID_SIZE); write_extent_buffer(c, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(c), - BTRFS_UUID_SIZE); + btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE); btrfs_set_node_key(c, &lower_key, 0); btrfs_set_node_blockptr(c, 0, lower->start); @@ -3194,7 +3164,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); old = root->node; - tree_mod_log_set_root_pointer(root, c, log_removal); + tree_mod_log_set_root_pointer(root, c, 0); rcu_assign_pointer(root->node, c); /* the super has an extra ref to root->node */ @@ -3241,7 +3211,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans, } if (level) { ret = tree_mod_log_insert_key(root->fs_info, lower, slot, - MOD_LOG_KEY_ADD); + MOD_LOG_KEY_ADD, GFP_NOFS); BUG_ON(ret < 0); } btrfs_set_node_key(lower, key, slot); @@ -3278,14 +3248,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans, /* * trying to split the root, lets make a new one * - * tree mod log: We pass 0 as log_removal parameter to + * tree mod log: We don't log_removal old root in * insert_new_root, because that root buffer will be kept as a * normal node. We are going to log removal of half of the * elements below with tree_mod_log_eb_copy. We're holding a * tree lock on the buffer, which is why we cannot race with * other tree_mod_log users. */ - ret = insert_new_root(trans, root, path, level + 1, 0); + ret = insert_new_root(trans, root, path, level + 1); if (ret) return ret; } else { @@ -3317,10 +3287,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(split, root->root_key.objectid); write_extent_buffer(split, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(split), - BTRFS_FSID_SIZE); + btrfs_header_fsid(split), BTRFS_FSID_SIZE); write_extent_buffer(split, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(split), + btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); @@ -3986,7 +3955,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, return -EOVERFLOW; /* first try to make some room by pushing left and right */ - if (data_size) { + if (data_size && path->nodes[1]) { wret = push_leaf_right(trans, root, path, data_size, data_size, 0, 0); if (wret < 0) @@ -4005,7 +3974,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, } if (!path->nodes[1]) { - ret = insert_new_root(trans, root, path, 1, 1); + ret = insert_new_root(trans, root, path, 1); if (ret) return ret; } @@ -4073,11 +4042,10 @@ again: btrfs_set_header_owner(right, root->root_key.objectid); btrfs_set_header_level(right, 0); write_extent_buffer(right, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(right), - BTRFS_FSID_SIZE); + btrfs_header_fsid(right), BTRFS_FSID_SIZE); write_extent_buffer(right, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(right), + btrfs_header_chunk_tree_uuid(right), BTRFS_UUID_SIZE); if (split == 0) { @@ -4430,7 +4398,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, } /* - * make the item pointed to by the path bigger, data_size is the new size. + * make the item pointed to by the path bigger, data_size is the added size. */ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, u32 data_size) @@ -4675,7 +4643,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, (nritems - slot - 1)); } else if (level) { ret = tree_mod_log_insert_key(root->fs_info, parent, slot, - MOD_LOG_KEY_REMOVE); + MOD_LOG_KEY_REMOVE, GFP_NOFS); BUG_ON(ret < 0); } @@ -4847,7 +4815,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, * This may release the path, and so you may lose any locks held at the * time you call it. */ -int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) { struct btrfs_key key; struct btrfs_disk_key found_key; @@ -5362,19 +5330,20 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; advance_right = ADVANCE; } else { + enum btrfs_compare_tree_result cmp; + WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); ret = tree_compare_item(left_root, left_path, right_path, tmp_buf); - if (ret) { - WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); - ret = changed_cb(left_root, right_root, - left_path, right_path, - &left_key, - BTRFS_COMPARE_TREE_CHANGED, - ctx); - if (ret < 0) - goto out; - } + if (ret) + cmp = BTRFS_COMPARE_TREE_CHANGED; + else + cmp = BTRFS_COMPARE_TREE_SAME; + ret = changed_cb(left_root, right_root, + left_path, right_path, + &left_key, cmp, ctx); + if (ret < 0) + goto out; advance_left = ADVANCE; advance_right = ADVANCE; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d6dd49b51ba8..0506f40ede83 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -23,6 +23,7 @@ #include <linux/highmem.h> #include <linux/fs.h> #include <linux/rwsem.h> +#include <linux/semaphore.h> #include <linux/completion.h> #include <linux/backing-dev.h> #include <linux/wait.h> @@ -91,6 +92,9 @@ struct btrfs_ordered_sum; /* holds quota configuration and tracking */ #define BTRFS_QUOTA_TREE_OBJECTID 8ULL +/* for storing items that use the BTRFS_UUID_KEY* types */ +#define BTRFS_UUID_TREE_OBJECTID 9ULL + /* for storing balance parameters in the root tree */ #define BTRFS_BALANCE_OBJECTID -4ULL @@ -142,7 +146,7 @@ struct btrfs_ordered_sum; #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 -#define BTRFS_DEV_REPLACE_DEVID 0 +#define BTRFS_DEV_REPLACE_DEVID 0ULL /* * the max metadata block size. This limit is somewhat artificial, @@ -478,9 +482,10 @@ struct btrfs_super_block { char label[BTRFS_LABEL_SIZE]; __le64 cache_generation; + __le64 uuid_tree_generation; /* future expansion */ - __le64 reserved[31]; + __le64 reserved[30]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; } __attribute__ ((__packed__)); @@ -961,8 +966,8 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) #define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) #define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) -#define BTRFS_BLOCK_GROUP_RAID5 (1 << 7) -#define BTRFS_BLOCK_GROUP_RAID6 (1 << 8) +#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) +#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) #define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE enum btrfs_raid_types { @@ -1102,13 +1107,16 @@ struct btrfs_space_info { account */ /* - * we bump reservation progress every time we decrement - * bytes_reserved. This way people waiting for reservations - * know something good has happened and they can check - * for progress. The number here isn't to be trusted, it - * just shows reclaim activity + * bytes_pinned is kept in line with what is actually pinned, as in + * we've called update_block_group and dropped the bytes_used counter + * and increased the bytes_pinned counter. However this means that + * bytes_pinned does not reflect the bytes that will be pinned once the + * delayed refs are flushed, so this counter is inc'ed everytime we call + * btrfs_free_extent so it is a realtime count of what will be freed + * once the transaction is committed. It will be zero'ed everytime the + * transaction commits. */ - unsigned long reservation_progress; + struct percpu_counter total_bytes_pinned; unsigned int full:1; /* indicates that we cannot allocate any more chunks for this space */ @@ -1176,6 +1184,7 @@ enum btrfs_caching_type { BTRFS_CACHE_STARTED = 1, BTRFS_CACHE_FAST = 2, BTRFS_CACHE_FINISHED = 3, + BTRFS_CACHE_ERROR = 4, }; enum btrfs_disk_cache_state { @@ -1290,6 +1299,7 @@ struct btrfs_fs_info { struct btrfs_root *fs_root; struct btrfs_root *csum_root; struct btrfs_root *quota_root; + struct btrfs_root *uuid_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -1338,6 +1348,7 @@ struct btrfs_fs_info { u64 last_trans_log_full_commit; unsigned long mount_opt; unsigned long compress_type:4; + int commit_interval; /* * It is a suggestive number, the read side is safe even it gets a * wrong number because we will write out the data into a regular @@ -1399,6 +1410,13 @@ struct btrfs_fs_info { * before jumping into the main commit. */ struct mutex ordered_operations_mutex; + + /* + * Same as ordered_operations_mutex except this is for ordered extents + * and not the operations. + */ + struct mutex ordered_extent_flush_mutex; + struct rw_semaphore extent_commit_sem; struct rw_semaphore cleanup_work_sem; @@ -1437,25 +1455,22 @@ struct btrfs_fs_info { atomic_t open_ioctl_trans; /* - * this is used by the balancing code to wait for all the pending - * ordered extents + * this is used to protect the following list -- ordered_roots. */ - spinlock_t ordered_extent_lock; + spinlock_t ordered_root_lock; /* - * all of the data=ordered extents pending writeback + * all fs/file tree roots in which there are data=ordered extents + * pending writeback are added into this list. + * * these can span multiple transactions and basically include * every dirty data page that isn't from nodatacow */ - struct list_head ordered_extents; + struct list_head ordered_roots; - spinlock_t delalloc_lock; - /* - * all of the inodes that have delalloc bytes. It is possible for - * this list to be empty even when there is still dirty data=ordered - * extents waiting to finish IO. - */ - struct list_head delalloc_inodes; + spinlock_t delalloc_root_lock; + /* all fs/file tree roots that have delalloc inodes. */ + struct list_head delalloc_roots; /* * there is a pool of worker threads for checksumming during writes @@ -1498,8 +1513,6 @@ struct btrfs_fs_info { int do_barriers; int closing; int log_root_recovering; - int enospc_unlink; - int trans_no_join; u64 total_pinned; @@ -1594,6 +1607,12 @@ struct btrfs_fs_info { struct rb_root qgroup_tree; spinlock_t qgroup_lock; + /* + * used to avoid frequently calling ulist_alloc()/ulist_free() + * when doing qgroup accounting, it must be protected by qgroup_lock. + */ + struct ulist *qgroup_ulist; + /* protect user change for quota operations */ struct mutex qgroup_ioctl_lock; @@ -1607,6 +1626,8 @@ struct btrfs_fs_info { struct mutex qgroup_rescan_lock; /* protects the progress item */ struct btrfs_key qgroup_rescan_progress; struct btrfs_workers qgroup_rescan_workers; + struct completion qgroup_rescan_completion; + struct btrfs_work qgroup_rescan_work; /* filesystem state */ unsigned long fs_state; @@ -1626,6 +1647,9 @@ struct btrfs_fs_info { struct btrfs_dev_replace dev_replace; atomic_t mutually_exclusive_operation_running; + + struct semaphore uuid_tree_rescan_sem; + unsigned int update_uuid_tree_gen:1; }; /* @@ -1739,6 +1763,31 @@ struct btrfs_root { int force_cow; spinlock_t root_item_lock; + atomic_t refs; + + spinlock_t delalloc_lock; + /* + * all of the inodes that have delalloc bytes. It is possible for + * this list to be empty even when there is still dirty data=ordered + * extents waiting to finish IO. + */ + struct list_head delalloc_inodes; + struct list_head delalloc_root; + u64 nr_delalloc_inodes; + /* + * this is used by the balancing code to wait for all the pending + * ordered extents + */ + spinlock_t ordered_extent_lock; + + /* + * all of the data=ordered extents pending writeback + * these can span multiple transactions and basically include + * every dirty data page that isn't from nodatacow + */ + struct list_head ordered_extents; + struct list_head ordered_root; + u64 nr_ordered_extents; }; struct btrfs_ioctl_defrag_range_args { @@ -1894,6 +1943,19 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_DEV_REPLACE_KEY 250 /* + * Stores items that allow to quickly map UUIDs to something else. + * These items are part of the filesystem UUID tree. + * The key is built like this: + * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits). + */ +#if BTRFS_UUID_SIZE != 16 +#error "UUID items require BTRFS_UUID_SIZE == 16!" +#endif +#define BTRFS_UUID_KEY_SUBVOL 251 /* for UUIDs assigned to subvols */ +#define BTRFS_UUID_KEY_RECEIVED_SUBVOL 252 /* for UUIDs assigned to + * received subvols */ + +/* * string items are for debugging. They just store a short string of * data in the FS */ @@ -1927,6 +1989,9 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) +#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) + +#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -2090,14 +2155,14 @@ BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, generation, 64); -static inline char *btrfs_device_uuid(struct btrfs_dev_item *d) +static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d) { - return (char *)d + offsetof(struct btrfs_dev_item, uuid); + return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid); } -static inline char *btrfs_device_fsid(struct btrfs_dev_item *d) +static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d) { - return (char *)d + offsetof(struct btrfs_dev_item, fsid); + return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid); } BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); @@ -2200,6 +2265,23 @@ BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, + sequence, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, + transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, + nbytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, + block_group, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); static inline struct btrfs_timespec * btrfs_inode_atime(struct btrfs_inode_item *inode_item) @@ -2227,6 +2309,8 @@ btrfs_inode_ctime(struct btrfs_inode_item *inode_item) BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); /* struct btrfs_dev_extent */ BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, @@ -2237,10 +2321,10 @@ BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, chunk_offset, 64); BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); -static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) +static inline unsigned long btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) { unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid); - return (u8 *)((unsigned long)dev + ptr); + return (unsigned long)dev + ptr; } BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); @@ -2308,6 +2392,10 @@ BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32); /* struct btrfs_node */ BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, + blockptr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr, + generation, 64); static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr) { @@ -2364,6 +2452,8 @@ static inline void btrfs_set_node_key(struct extent_buffer *eb, /* struct btrfs_item */ BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); static inline unsigned long btrfs_item_nr_offset(int nr) { @@ -2426,6 +2516,13 @@ BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, + data_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, + name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, + transid, 64); static inline void btrfs_dir_item_key(struct extent_buffer *eb, struct btrfs_dir_item *item, @@ -2528,6 +2625,12 @@ BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); +BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, + nritems, 32); +BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag) { @@ -2563,16 +2666,14 @@ static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, btrfs_set_header_flags(eb, flags); } -static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) +static inline unsigned long btrfs_header_fsid(struct extent_buffer *eb) { - unsigned long ptr = offsetof(struct btrfs_header, fsid); - return (u8 *)ptr; + return offsetof(struct btrfs_header, fsid); } -static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb) +static inline unsigned long btrfs_header_chunk_tree_uuid(struct extent_buffer *eb) { - unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid); - return (u8 *)ptr; + return offsetof(struct btrfs_header, chunk_tree_uuid); } static inline int btrfs_is_leaf(struct extent_buffer *eb) @@ -2790,6 +2891,9 @@ BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, csum_type, 16); BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, cache_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); +BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, + uuid_tree_generation, 64); static inline int btrfs_super_csum_size(struct btrfs_super_block *s) { @@ -2807,6 +2911,14 @@ static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) /* struct btrfs_file_extent_item */ BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, + struct btrfs_file_extent_item, disk_bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, + struct btrfs_file_extent_item, offset, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, + struct btrfs_file_extent_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, + struct btrfs_file_extent_item, num_bytes, 64); static inline unsigned long btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) @@ -3014,7 +3126,7 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, unsigned num_items) { return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * - 3 * num_items; + 2 * num_items; } /* @@ -3028,6 +3140,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, num_items; } +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); @@ -3039,6 +3153,8 @@ int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num, int reserved); int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, u64 bytenr, u64 num_bytes); +int btrfs_exclude_logged_extents(struct btrfs_root *root, + struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr); @@ -3063,11 +3179,9 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, u64 offset, struct btrfs_key *ins); -int btrfs_reserve_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 min_alloc_size, - u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data); +int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, + u64 min_alloc_size, u64 empty_size, u64 hint_byte, + struct btrfs_key *ins, int is_data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref, int for_cow); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3131,7 +3245,7 @@ void btrfs_orphan_release_metadata(struct inode *inode); int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, - u64 *qgroup_reserved); + u64 *qgroup_reserved, bool use_global_rsv); void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, u64 qgroup_reserved); @@ -3155,6 +3269,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes); +int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *dest, u64 num_bytes, + int min_factor); void btrfs_block_rsv_release(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); @@ -3198,6 +3315,7 @@ enum btrfs_compare_tree_result { BTRFS_COMPARE_TREE_NEW, BTRFS_COMPARE_TREE_DELETED, BTRFS_COMPARE_TREE_CHANGED, + BTRFS_COMPARE_TREE_SAME, }; typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root, struct btrfs_root *right_root, @@ -3311,6 +3429,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) smp_mb(); return fs_info->closing; } + +/* + * If we remount the fs to be R/O or umount the fs, the cleaner needn't do + * anything except sleeping. This function is used to check the status of + * the fs. + */ +static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root) +{ + return (root->fs_info->sb->s_flags & MS_RDONLY || + btrfs_fs_closing(root->fs_info)); +} + static inline void free_fs_info(struct btrfs_fs_info *fs_info) { kfree(fs_info->balance_ctl); @@ -3321,6 +3451,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->dev_root); kfree(fs_info->csum_root); kfree(fs_info->quota_root); + kfree(fs_info->uuid_root); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); kfree(fs_info); @@ -3355,11 +3486,9 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_root_item *item); -void btrfs_read_root_item(struct extent_buffer *eb, int slot, - struct btrfs_root_item *item); -int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct - btrfs_root_item *item, struct btrfs_key *key); -int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); +int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key, + struct btrfs_path *path, struct btrfs_root_item *root_item, + struct btrfs_key *root_key); int btrfs_find_orphan_roots(struct btrfs_root *tree_root); void btrfs_set_root_node(struct btrfs_root_item *item, struct extent_buffer *node); @@ -3367,6 +3496,17 @@ void btrfs_check_and_init_root_item(struct btrfs_root_item *item); void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root); +/* uuid-tree.c */ +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, + struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid); +int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, + struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid); +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, + int (*check_func)(struct btrfs_fs_info *, u8 *, u8, + u64)); + /* dir-item.c */ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, const char *name, int name_len); @@ -3450,12 +3590,14 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, struct btrfs_inode_extref **extref_ret); /* file-item.c */ +struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u32 *dst); int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u64 logical_offset); + struct btrfs_dio_private *dip, struct bio *bio, + u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, @@ -3493,6 +3635,9 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create); +noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes); /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ #if defined(ClearPageFsMisc) && !defined(ClearPageChecked) @@ -3530,6 +3675,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); +int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info, + int delay_iput); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, @@ -3578,11 +3725,15 @@ extern const struct dentry_operations btrfs_dentry_operations; long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); void btrfs_update_iflags(struct inode *inode); void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); +int btrfs_is_empty_uuid(u8 *uuid); int btrfs_defrag_file(struct inode *inode, struct file *file, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_pages); void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, + struct btrfs_ioctl_balance_args *bargs); + /* file.c */ int btrfs_auto_defrag_init(void); @@ -3655,6 +3806,22 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) #define btrfs_debug(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) +#ifdef CONFIG_BTRFS_ASSERT + +static inline void assfail(char *expr, char *file, int line) +{ + printk(KERN_ERR "BTRFS assertion failed: %s, file: %s, line: %d", + expr, file, line); + BUG(); +} + +#define ASSERT(expr) \ + (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) +#else +#define ASSERT(expr) ((void)0) +#endif + +#define btrfs_assert() __printf(5, 6) void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); @@ -3763,9 +3930,9 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_recover_relocation(struct btrfs_root *root); int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); -void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf, - struct extent_buffer *cow); +int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow); void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve); @@ -3814,6 +3981,8 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, int btrfs_quota_disable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); +void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst); int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index f26f38ccd194..cbd9523ad09c 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -21,6 +21,7 @@ #include "delayed-inode.h" #include "disk-io.h" #include "transaction.h" +#include "ctree.h" #define BTRFS_DELAYED_WRITEBACK 512 #define BTRFS_DELAYED_BACKGROUND 128 @@ -535,20 +536,6 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item( return next; } -static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root, - u64 root_id) -{ - struct btrfs_key root_key; - - if (root->objectid == root_id) - return root; - - root_key.objectid = root_id; - root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = (u64)-1; - return btrfs_read_fs_root_no_name(root->fs_info, &root_key); -} - static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_delayed_item *item) @@ -1467,10 +1454,10 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, dir_item = (struct btrfs_dir_item *)delayed_item->data; dir_item->location = *disk_key; - dir_item->transid = cpu_to_le64(trans->transid); - dir_item->data_len = 0; - dir_item->name_len = cpu_to_le16(name_len); - dir_item->type = type; + btrfs_set_stack_dir_transid(dir_item, trans->transid); + btrfs_set_stack_dir_data_len(dir_item, 0); + btrfs_set_stack_dir_name_len(dir_item, name_len); + btrfs_set_stack_dir_type(dir_item, type); memcpy((char *)(dir_item + 1), name, name_len); ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); @@ -1484,13 +1471,11 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); if (unlikely(ret)) { - printk(KERN_ERR "err add delayed dir index item(name: %s) into " - "the insertion tree of the delayed node" + printk(KERN_ERR "err add delayed dir index item(name: %.*s) " + "into the insertion tree of the delayed node" "(root id: %llu, inode id: %llu, errno: %d)\n", - name, - (unsigned long long)delayed_node->root->objectid, - (unsigned long long)delayed_node->inode_id, - ret); + name_len, name, delayed_node->root->objectid, + delayed_node->inode_id, ret); BUG(); } mutex_unlock(&delayed_node->mutex); @@ -1561,9 +1546,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, printk(KERN_ERR "err add delayed dir index item(index: %llu) " "into the deletion tree of the delayed node" "(root id: %llu, inode id: %llu, errno: %d)\n", - (unsigned long long)index, - (unsigned long long)node->root->objectid, - (unsigned long long)node->inode_id, + index, node->root->objectid, node->inode_id, ret); BUG(); } @@ -1681,8 +1664,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree * */ -int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, - filldir_t filldir, +int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, struct list_head *ins_list) { struct btrfs_dir_item *di; @@ -1704,22 +1686,22 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, list_for_each_entry_safe(curr, next, ins_list, readdir_list) { list_del(&curr->readdir_list); - if (curr->key.offset < filp->f_pos) { + if (curr->key.offset < ctx->pos) { if (atomic_dec_and_test(&curr->refs)) kfree(curr); continue; } - filp->f_pos = curr->key.offset; + ctx->pos = curr->key.offset; di = (struct btrfs_dir_item *)curr->data; name = (char *)(di + 1); - name_len = le16_to_cpu(di->name_len); + name_len = btrfs_stack_dir_name_len(di); d_type = btrfs_filetype_table[di->type]; btrfs_disk_key_to_cpu(&location, &di->location); - over = filldir(dirent, name, name_len, curr->key.offset, + over = !dir_emit(ctx, name, name_len, location.objectid, d_type); if (atomic_dec_and_test(&curr->refs)) @@ -1731,27 +1713,6 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, return 0; } -BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, - sequence, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, - transid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, - nbytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, - block_group, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); - -BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); -BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); - static void fill_stack_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item, struct inode *inode) diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 1d5c5f7abe3e..a4b38f934d14 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -139,8 +139,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list, struct list_head *del_list); int btrfs_should_delete_dir_index(struct list_head *del_list, u64 index); -int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, - filldir_t filldir, +int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, struct list_head *ins_list); /* for init */ diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index c219463fb1fd..e4d467be2dd4 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -241,7 +241,7 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, return 0; } -static void inline drop_delayed_ref(struct btrfs_trans_handle *trans, +static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_node *ref) { @@ -600,7 +600,7 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&head_ref->cluster); mutex_init(&head_ref->mutex); - trace_btrfs_delayed_ref_head(ref, head_ref, action); + trace_add_delayed_ref_head(ref, head_ref, action); existing = tree_insert(&delayed_refs->root, &ref->rb_node); @@ -661,7 +661,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->type = BTRFS_TREE_BLOCK_REF_KEY; full_ref->level = level; - trace_btrfs_delayed_tree_ref(ref, full_ref, action); + trace_add_delayed_tree_ref(ref, full_ref, action); existing = tree_insert(&delayed_refs->root, &ref->rb_node); @@ -722,7 +722,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, full_ref->objectid = owner; full_ref->offset = offset; - trace_btrfs_delayed_data_ref(ref, full_ref, action); + trace_add_delayed_data_ref(ref, full_ref, action); existing = tree_insert(&delayed_refs->root, &ref->rb_node); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 65241f32d3f8..9efb94e95858 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -148,13 +148,13 @@ no_valid_dev_replace_entry_found: !btrfs_test_opt(dev_root, DEGRADED)) { ret = -EIO; pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n", - (unsigned long long)src_devid); + src_devid); } if (!dev_replace->tgtdev && !btrfs_test_opt(dev_root, DEGRADED)) { ret = -EIO; pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n", - (unsigned long long)BTRFS_DEV_REPLACE_DEVID); + BTRFS_DEV_REPLACE_DEVID); } if (dev_replace->tgtdev) { if (dev_replace->srcdev) { @@ -400,7 +400,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; btrfs_dev_replace_unlock(dev_replace); - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(root->fs_info); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -470,12 +470,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_inodes(root, 0); + ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(root->fs_info); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -535,10 +535,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); btrfs_rm_dev_replace_srcdev(fs_info, src_device); - if (src_device->bdev) { - /* zero out the old super */ - btrfs_scratch_superblock(src_device); - } + /* * this is again a consistent state where no dev_replace procedure * is running, the target device is part of the filesystem, the @@ -747,7 +744,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) WARN_ON(atomic_xchg( &fs_info->mutually_exclusive_operation_running, 1)); task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); - return PTR_RET(task); + return PTR_ERR_OR_ZERO(task); } static int btrfs_dev_replace_kthread(void *data) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e7b3cb5286a5..4ae17ed13b32 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -31,6 +31,7 @@ #include <linux/migrate.h> #include <linux/ratelimit.h> #include <linux/uuid.h> +#include <linux/semaphore.h> #include <asm/unaligned.h> #include "compat.h" #include "ctree.h" @@ -156,6 +157,7 @@ static struct btrfs_lockdep_keyset { { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" }, { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" }, { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" }, + { .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" }, { .id = 0, .name_stem = "tree" }, }; @@ -302,9 +304,8 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, printk_ratelimited(KERN_INFO "btrfs: %s checksum verify " "failed on %llu wanted %X found %X " "level %d\n", - root->fs_info->sb->s_id, - (unsigned long long)buf->start, val, found, - btrfs_header_level(buf)); + root->fs_info->sb->s_id, buf->start, + val, found, btrfs_header_level(buf)); if (result != (char *)&inline_result) kfree(result); return 1; @@ -345,9 +346,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, } printk_ratelimited("parent transid verify failed on %llu wanted %llu " "found %llu\n", - (unsigned long long)eb->start, - (unsigned long long)parent_transid, - (unsigned long long)btrfs_header_generation(eb)); + eb->start, parent_transid, btrfs_header_generation(eb)); ret = 1; clear_extent_buffer_uptodate(eb); out: @@ -497,8 +496,7 @@ static int check_tree_block_fsid(struct btrfs_root *root, u8 fsid[BTRFS_UUID_SIZE]; int ret = 1; - read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb), - BTRFS_FSID_SIZE); + read_extent_buffer(eb, fsid, btrfs_header_fsid(eb), BTRFS_FSID_SIZE); while (fs_devices) { if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { ret = 0; @@ -512,8 +510,7 @@ static int check_tree_block_fsid(struct btrfs_root *root, #define CORRUPT(reason, eb, root, slot) \ printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \ "root=%llu, slot=%d\n", reason, \ - (unsigned long long)btrfs_header_bytenr(eb), \ - (unsigned long long)root->objectid, slot) + btrfs_header_bytenr(eb), root->objectid, slot) static noinline int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) @@ -576,8 +573,9 @@ static noinline int check_leaf(struct btrfs_root *root, return 0; } -static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, - struct extent_state *state, int mirror) +static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, + u64 phy_offset, struct page *page, + u64 start, u64 end, int mirror) { struct extent_io_tree *tree; u64 found_start; @@ -612,14 +610,13 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, if (found_start != eb->start) { printk_ratelimited(KERN_INFO "btrfs bad tree block start " "%llu %llu\n", - (unsigned long long)found_start, - (unsigned long long)eb->start); + found_start, eb->start); ret = -EIO; goto err; } if (check_tree_block_fsid(root, eb)) { printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n", - (unsigned long long)eb->start); + eb->start); ret = -EIO; goto err; } @@ -1013,7 +1010,8 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) return try_release_extent_buffer(page); } -static void btree_invalidatepage(struct page *page, unsigned long offset) +static void btree_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct extent_io_tree *tree; tree = &BTRFS_I(page->mapping->host)->io_tree; @@ -1147,6 +1145,10 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, return NULL; ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + if (ret) { + free_extent_buffer(buf); + return NULL; + } return buf; } @@ -1191,6 +1193,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->objectid = objectid; root->last_trans = 0; root->highest_objectid = 0; + root->nr_delalloc_inodes = 0; + root->nr_ordered_extents = 0; root->name = NULL; root->inode_tree = RB_ROOT; INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); @@ -1199,10 +1203,16 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->root_list); + INIT_LIST_HEAD(&root->delalloc_inodes); + INIT_LIST_HEAD(&root->delalloc_root); + INIT_LIST_HEAD(&root->ordered_extents); + INIT_LIST_HEAD(&root->ordered_root); INIT_LIST_HEAD(&root->logged_list[0]); INIT_LIST_HEAD(&root->logged_list[1]); spin_lock_init(&root->orphan_lock); spin_lock_init(&root->inode_lock); + spin_lock_init(&root->delalloc_lock); + spin_lock_init(&root->ordered_extent_lock); spin_lock_init(&root->accounting_lock); spin_lock_init(&root->log_extents_lock[0]); spin_lock_init(&root->log_extents_lock[1]); @@ -1216,6 +1226,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); atomic_set(&root->orphan_inodes, 0); + atomic_set(&root->refs, 1); root->log_transid = 0; root->last_log_commit = 0; extent_io_tree_init(&root->dirty_log_pages, @@ -1234,39 +1245,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, spin_lock_init(&root->root_item_lock); } -static int __must_check find_and_setup_root(struct btrfs_root *tree_root, - struct btrfs_fs_info *fs_info, - u64 objectid, - struct btrfs_root *root) -{ - int ret; - u32 blocksize; - u64 generation; - - __setup_root(tree_root->nodesize, tree_root->leafsize, - tree_root->sectorsize, tree_root->stripesize, - root, fs_info, objectid); - ret = btrfs_find_last_root(tree_root, objectid, - &root->root_item, &root->root_key); - if (ret > 0) - return -ENOENT; - else if (ret < 0) - return ret; - - generation = btrfs_root_generation(&root->root_item); - blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); - root->commit_root = NULL; - root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), - blocksize, generation); - if (!root->node || !btrfs_buffer_uptodate(root->node, generation, 0)) { - free_extent_buffer(root->node); - root->node = NULL; - return -EIO; - } - root->commit_root = btrfs_root_node(root); - return 0; -} - static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); @@ -1314,11 +1292,10 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_set_header_owner(leaf, objectid); root->node = leaf; - write_extent_buffer(leaf, fs_info->fsid, - (unsigned long)btrfs_header_fsid(leaf), + write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(leaf), BTRFS_FSID_SIZE); write_extent_buffer(leaf, fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(leaf), + btrfs_header_chunk_tree_uuid(leaf), BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); @@ -1402,8 +1379,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->node = leaf; write_extent_buffer(root->node, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(root->node), - BTRFS_FSID_SIZE); + btrfs_header_fsid(root->node), BTRFS_FSID_SIZE); btrfs_mark_buffer_dirty(root->node); btrfs_tree_unlock(root->node); return root; @@ -1436,11 +1412,11 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, log_root->root_key.offset = root->root_key.objectid; inode_item = &log_root->root_item.inode; - inode_item->generation = cpu_to_le64(1); - inode_item->size = cpu_to_le64(3); - inode_item->nlink = cpu_to_le32(1); - inode_item->nbytes = cpu_to_le64(root->leafsize); - inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + btrfs_set_stack_inode_generation(inode_item, 1); + btrfs_set_stack_inode_size(inode_item, 3); + btrfs_set_stack_inode_nlink(inode_item, 1); + btrfs_set_stack_inode_nbytes(inode_item, root->leafsize); + btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); btrfs_set_root_node(&log_root->root_item, log_root->node); @@ -1451,70 +1427,73 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, return 0; } -struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, - struct btrfs_key *location) +static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, + struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_path *path; - struct extent_buffer *l; u64 generation; u32 blocksize; - int ret = 0; - int slot; + int ret; - root = btrfs_alloc_root(fs_info); - if (!root) + path = btrfs_alloc_path(); + if (!path) return ERR_PTR(-ENOMEM); - if (location->offset == (u64)-1) { - ret = find_and_setup_root(tree_root, fs_info, - location->objectid, root); - if (ret) { - kfree(root); - return ERR_PTR(ret); - } - goto out; + + root = btrfs_alloc_root(fs_info); + if (!root) { + ret = -ENOMEM; + goto alloc_fail; } __setup_root(tree_root->nodesize, tree_root->leafsize, tree_root->sectorsize, tree_root->stripesize, - root, fs_info, location->objectid); + root, fs_info, key->objectid); - path = btrfs_alloc_path(); - if (!path) { - kfree(root); - return ERR_PTR(-ENOMEM); - } - ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); - if (ret == 0) { - l = path->nodes[0]; - slot = path->slots[0]; - btrfs_read_root_item(l, slot, &root->root_item); - memcpy(&root->root_key, location, sizeof(*location)); - } - btrfs_free_path(path); + ret = btrfs_find_root(tree_root, key, path, + &root->root_item, &root->root_key); if (ret) { - kfree(root); if (ret > 0) ret = -ENOENT; - return ERR_PTR(ret); + goto find_fail; } generation = btrfs_root_generation(&root->root_item); blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); - if (!root->node || !extent_buffer_uptodate(root->node)) { - ret = (!root->node) ? -ENOMEM : -EIO; - - free_extent_buffer(root->node); - kfree(root); - return ERR_PTR(ret); + if (!root->node) { + ret = -ENOMEM; + goto find_fail; + } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { + ret = -EIO; + goto read_fail; } - root->commit_root = btrfs_root_node(root); out: - if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { + btrfs_free_path(path); + return root; + +read_fail: + free_extent_buffer(root->node); +find_fail: + kfree(root); +alloc_fail: + root = ERR_PTR(ret); + goto out; +} + +struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, + struct btrfs_key *location) +{ + struct btrfs_root *root; + + root = btrfs_read_tree_root(tree_root, location); + if (IS_ERR(root)) + return root; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { root->ref_cows = 1; btrfs_check_and_init_root_item(&root->root_item); } @@ -1522,6 +1501,66 @@ out: return root; } +int btrfs_init_fs_root(struct btrfs_root *root) +{ + int ret; + + root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); + root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), + GFP_NOFS); + if (!root->free_ino_pinned || !root->free_ino_ctl) { + ret = -ENOMEM; + goto fail; + } + + btrfs_init_free_ino_ctl(root); + mutex_init(&root->fs_commit_mutex); + spin_lock_init(&root->cache_lock); + init_waitqueue_head(&root->cache_wait); + + ret = get_anon_bdev(&root->anon_dev); + if (ret) + goto fail; + return 0; +fail: + kfree(root->free_ino_ctl); + kfree(root->free_ino_pinned); + return ret; +} + +static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id) +{ + struct btrfs_root *root; + + spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)root_id); + spin_unlock(&fs_info->fs_roots_radix_lock); + return root; +} + +int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root) +{ + int ret; + + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + return ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); + if (ret == 0) + root->in_radix = 1; + spin_unlock(&fs_info->fs_roots_radix_lock); + radix_tree_preload_end(); + + return ret; +} + struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, struct btrfs_key *location) { @@ -1541,59 +1580,37 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID) return fs_info->quota_root ? fs_info->quota_root : ERR_PTR(-ENOENT); + if (location->objectid == BTRFS_UUID_TREE_OBJECTID) + return fs_info->uuid_root ? fs_info->uuid_root : + ERR_PTR(-ENOENT); again: - spin_lock(&fs_info->fs_roots_radix_lock); - root = radix_tree_lookup(&fs_info->fs_roots_radix, - (unsigned long)location->objectid); - spin_unlock(&fs_info->fs_roots_radix_lock); - if (root) + root = btrfs_lookup_fs_root(fs_info, location->objectid); + if (root) { + if (btrfs_root_refs(&root->root_item) == 0) + return ERR_PTR(-ENOENT); return root; + } - root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); + root = btrfs_read_fs_root(fs_info->tree_root, location); if (IS_ERR(root)) return root; - root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); - root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), - GFP_NOFS); - if (!root->free_ino_pinned || !root->free_ino_ctl) { - ret = -ENOMEM; + if (btrfs_root_refs(&root->root_item) == 0) { + ret = -ENOENT; goto fail; } - btrfs_init_free_ino_ctl(root); - mutex_init(&root->fs_commit_mutex); - spin_lock_init(&root->cache_lock); - init_waitqueue_head(&root->cache_wait); - - ret = get_anon_bdev(&root->anon_dev); + ret = btrfs_init_fs_root(root); if (ret) goto fail; - if (btrfs_root_refs(&root->root_item) == 0) { - ret = -ENOENT; - goto fail; - } - ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); if (ret < 0) goto fail; if (ret == 0) root->orphan_item_inserted = 1; - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); - if (ret) - goto fail; - - spin_lock(&fs_info->fs_roots_radix_lock); - ret = radix_tree_insert(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - root); - if (ret == 0) - root->in_radix = 1; - - spin_unlock(&fs_info->fs_roots_radix_lock); - radix_tree_preload_end(); + ret = btrfs_insert_fs_root(fs_info, root); if (ret) { if (ret == -EEXIST) { free_fs_root(root); @@ -1601,10 +1618,6 @@ again: } goto fail; } - - ret = btrfs_find_dead_roots(fs_info->tree_root, - root->root_key.objectid); - WARN_ON(ret); return root; fail: free_fs_root(root); @@ -1676,21 +1689,37 @@ static void end_workqueue_fn(struct btrfs_work *work) static int cleaner_kthread(void *arg) { struct btrfs_root *root = arg; + int again; do { - int again = 0; - - if (!(root->fs_info->sb->s_flags & MS_RDONLY) && - down_read_trylock(&root->fs_info->sb->s_umount)) { - if (mutex_trylock(&root->fs_info->cleaner_mutex)) { - btrfs_run_delayed_iputs(root); - again = btrfs_clean_one_deleted_snapshot(root); - mutex_unlock(&root->fs_info->cleaner_mutex); - } - btrfs_run_defrag_inodes(root->fs_info); - up_read(&root->fs_info->sb->s_umount); + again = 0; + + /* Make the cleaner go to sleep early. */ + if (btrfs_need_cleaner_sleep(root)) + goto sleep; + + if (!mutex_trylock(&root->fs_info->cleaner_mutex)) + goto sleep; + + /* + * Avoid the problem that we change the status of the fs + * during the above check and trylock. + */ + if (btrfs_need_cleaner_sleep(root)) { + mutex_unlock(&root->fs_info->cleaner_mutex); + goto sleep; } + btrfs_run_delayed_iputs(root); + again = btrfs_clean_one_deleted_snapshot(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + + /* + * The defragger has dealt with the R/O remount and umount, + * needn't do anything special here. + */ + btrfs_run_defrag_inodes(root->fs_info); +sleep: if (!try_to_freeze() && !again) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop()) @@ -1713,7 +1742,7 @@ static int transaction_kthread(void *arg) do { cannot_commit = false; - delay = HZ * 30; + delay = HZ * root->fs_info->commit_interval; mutex_lock(&root->fs_info->transaction_kthread_mutex); spin_lock(&root->fs_info->trans_lock); @@ -1724,8 +1753,9 @@ static int transaction_kthread(void *arg) } now = get_seconds(); - if (!cur->blocked && - (now < cur->start_time || now - cur->start_time < 30)) { + if (cur->state < TRANS_STATE_BLOCKED && + (now < cur->start_time || + now - cur->start_time < root->fs_info->commit_interval)) { spin_unlock(&root->fs_info->trans_lock); delay = HZ * 5; goto sleep; @@ -2014,6 +2044,12 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) info->quota_root->node = NULL; info->quota_root->commit_root = NULL; } + if (info->uuid_root) { + free_extent_buffer(info->uuid_root->node); + free_extent_buffer(info->uuid_root->commit_root); + info->uuid_root->node = NULL; + info->uuid_root->commit_root = NULL; + } if (chunk_root) { free_extent_buffer(info->chunk_root->node); free_extent_buffer(info->chunk_root->commit_root); @@ -2034,11 +2070,11 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) list_del(&gang[0]->root_list); if (gang[0]->in_radix) { - btrfs_free_fs_root(fs_info, gang[0]); + btrfs_drop_and_free_fs_root(fs_info, gang[0]); } else { free_extent_buffer(gang[0]->node); free_extent_buffer(gang[0]->commit_root); - kfree(gang[0]); + btrfs_put_fs_root(gang[0]); } } @@ -2049,7 +2085,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) if (!ret) break; for (i = 0; i < ret; i++) - btrfs_free_fs_root(fs_info, gang[i]); + btrfs_drop_and_free_fs_root(fs_info, gang[i]); } } @@ -2074,21 +2110,18 @@ int open_ctree(struct super_block *sb, struct btrfs_root *chunk_root; struct btrfs_root *dev_root; struct btrfs_root *quota_root; + struct btrfs_root *uuid_root; struct btrfs_root *log_tree_root; int ret; int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; + bool create_uuid_tree; + bool check_uuid_tree; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); - extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info); - csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); - dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); - quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info); - - if (!tree_root || !extent_root || !csum_root || - !chunk_root || !dev_root || !quota_root) { + if (!tree_root || !chunk_root) { err = -ENOMEM; goto fail; } @@ -2131,9 +2164,9 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); - INIT_LIST_HEAD(&fs_info->delalloc_inodes); + INIT_LIST_HEAD(&fs_info->delalloc_roots); INIT_LIST_HEAD(&fs_info->caching_block_groups); - spin_lock_init(&fs_info->delalloc_lock); + spin_lock_init(&fs_info->delalloc_root_lock); spin_lock_init(&fs_info->trans_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); @@ -2169,9 +2202,9 @@ int open_ctree(struct super_block *sb, fs_info->max_inline = 8192 * 1024; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; - fs_info->trans_no_join = 0; fs_info->free_chunk_space = 0; fs_info->tree_mod_log = RB_ROOT; + fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); @@ -2180,8 +2213,8 @@ int open_ctree(struct super_block *sb, fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); - INIT_LIST_HEAD(&fs_info->ordered_extents); - spin_lock_init(&fs_info->ordered_extent_lock); + INIT_LIST_HEAD(&fs_info->ordered_roots); + spin_lock_init(&fs_info->ordered_root_lock); fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), GFP_NOFS); if (!fs_info->delayed_root) { @@ -2253,6 +2286,7 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->ordered_operations_mutex); + mutex_init(&fs_info->ordered_extent_flush_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); @@ -2261,6 +2295,7 @@ int open_ctree(struct super_block *sb, init_rwsem(&fs_info->extent_commit_sem); init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); + sema_init(&fs_info->uuid_tree_rescan_sem, 1); fs_info->dev_replace.lock_owner = 0; atomic_set(&fs_info->dev_replace.nesting_level, 0); mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); @@ -2274,6 +2309,7 @@ int open_ctree(struct super_block *sb, fs_info->qgroup_seq = 1; fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; + fs_info->qgroup_ulist = NULL; mutex_init(&fs_info->qgroup_rescan_lock); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); @@ -2365,7 +2401,7 @@ int open_ctree(struct super_block *sb, if (features) { printk(KERN_ERR "BTRFS: couldn't mount because of " "unsupported optional features (%Lx).\n", - (unsigned long long)features); + features); err = -EINVAL; goto fail_alloc; } @@ -2435,7 +2471,7 @@ int open_ctree(struct super_block *sb, if (!(sb->s_flags & MS_RDONLY) && features) { printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " "unsupported option features (%Lx).\n", - (unsigned long long)features); + features); err = -EINVAL; goto fail_alloc; } @@ -2448,20 +2484,17 @@ int open_ctree(struct super_block *sb, &fs_info->generic_worker); btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", - fs_info->thread_pool_size, - &fs_info->generic_worker); + fs_info->thread_pool_size, NULL); btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", - fs_info->thread_pool_size, - &fs_info->generic_worker); + fs_info->thread_pool_size, NULL); btrfs_init_workers(&fs_info->submit_workers, "submit", min_t(u64, fs_devices->num_devices, - fs_info->thread_pool_size), - &fs_info->generic_worker); + fs_info->thread_pool_size), NULL); btrfs_init_workers(&fs_info->caching_workers, "cache", - 2, &fs_info->generic_worker); + fs_info->thread_pool_size, NULL); /* a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the @@ -2557,7 +2590,7 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); - if (disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) { + if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); goto fail_sb_buffer; } @@ -2597,8 +2630,7 @@ int open_ctree(struct super_block *sb, chunk_root->commit_root = btrfs_root_node(chunk_root); read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), - BTRFS_UUID_SIZE); + btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE); ret = btrfs_read_chunk_tree(chunk_root); if (ret) { @@ -2638,33 +2670,60 @@ retry_root_backup: btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); - ret = find_and_setup_root(tree_root, fs_info, - BTRFS_EXTENT_TREE_OBJECTID, extent_root); - if (ret) + location.objectid = BTRFS_EXTENT_TREE_OBJECTID; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = 0; + + extent_root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(extent_root)) { + ret = PTR_ERR(extent_root); goto recovery_tree_root; + } extent_root->track_dirty = 1; + fs_info->extent_root = extent_root; - ret = find_and_setup_root(tree_root, fs_info, - BTRFS_DEV_TREE_OBJECTID, dev_root); - if (ret) + location.objectid = BTRFS_DEV_TREE_OBJECTID; + dev_root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(dev_root)) { + ret = PTR_ERR(dev_root); goto recovery_tree_root; + } dev_root->track_dirty = 1; + fs_info->dev_root = dev_root; + btrfs_init_devices_late(fs_info); - ret = find_and_setup_root(tree_root, fs_info, - BTRFS_CSUM_TREE_OBJECTID, csum_root); - if (ret) + location.objectid = BTRFS_CSUM_TREE_OBJECTID; + csum_root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(csum_root)) { + ret = PTR_ERR(csum_root); goto recovery_tree_root; + } csum_root->track_dirty = 1; + fs_info->csum_root = csum_root; - ret = find_and_setup_root(tree_root, fs_info, - BTRFS_QUOTA_TREE_OBJECTID, quota_root); - if (ret) { - kfree(quota_root); - quota_root = fs_info->quota_root = NULL; - } else { + location.objectid = BTRFS_QUOTA_TREE_OBJECTID; + quota_root = btrfs_read_tree_root(tree_root, &location); + if (!IS_ERR(quota_root)) { quota_root->track_dirty = 1; fs_info->quota_enabled = 1; fs_info->pending_quota_state = 1; + fs_info->quota_root = quota_root; + } + + location.objectid = BTRFS_UUID_TREE_OBJECTID; + uuid_root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(uuid_root)) { + ret = PTR_ERR(uuid_root); + if (ret != -ENOENT) + goto recovery_tree_root; + create_uuid_tree = true; + check_uuid_tree = false; + } else { + uuid_root->track_dirty = 1; + fs_info->uuid_root = uuid_root; + create_uuid_tree = false; + check_uuid_tree = + generation != btrfs_super_uuid_tree_generation(disk_super); } fs_info->generation = generation; @@ -2817,11 +2876,9 @@ retry_root_backup: location.objectid = BTRFS_FS_TREE_OBJECTID; location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = (u64)-1; + location.offset = 0; fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); - if (!fs_info->fs_root) - goto fail_qgroup; if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); goto fail_qgroup; @@ -2853,14 +2910,39 @@ retry_root_backup: return ret; } + btrfs_qgroup_rescan_resume(fs_info); + + if (create_uuid_tree) { + pr_info("btrfs: creating UUID tree\n"); + ret = btrfs_create_uuid_tree(fs_info); + if (ret) { + pr_warn("btrfs: failed to create the UUID tree %d\n", + ret); + close_ctree(tree_root); + return ret; + } + } else if (check_uuid_tree || + btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) { + pr_info("btrfs: checking UUID tree\n"); + ret = btrfs_check_uuid_tree(fs_info); + if (ret) { + pr_warn("btrfs: failed to check the UUID tree %d\n", + ret); + close_ctree(tree_root); + return ret; + } + } else { + fs_info->update_uuid_tree_gen = 1; + } + return 0; fail_qgroup: btrfs_free_qgroup_config(fs_info); fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); - del_fs_roots(fs_info); btrfs_cleanup_transaction(fs_info->tree_root); + del_fs_roots(fs_info); fail_cleaner: kthread_stop(fs_info->cleaner_kthread); @@ -2954,15 +3036,17 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) */ for (i = 0; i < 1; i++) { bytenr = btrfs_sb_offset(i); - if (bytenr + 4096 >= i_size_read(bdev->bd_inode)) + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + i_size_read(bdev->bd_inode)) break; - bh = __bread(bdev, bytenr / 4096, 4096); + bh = __bread(bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); if (!bh) continue; super = (struct btrfs_super_block *)bh->b_data; if (btrfs_super_bytenr(super) != bytenr || - super->magic != cpu_to_le64(BTRFS_MAGIC)) { + btrfs_super_magic(super) != BTRFS_MAGIC) { brelse(bh); continue; } @@ -3258,7 +3342,7 @@ int btrfs_calc_num_tolerated_disk_barrier_failures( BTRFS_BLOCK_GROUP_RAID10)) { num_tolerated_disk_barrier_failures = 1; } else if (flags & - BTRFS_BLOCK_GROUP_RAID5) { + BTRFS_BLOCK_GROUP_RAID6) { num_tolerated_disk_barrier_failures = 2; } } @@ -3282,7 +3366,6 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) int total_errors = 0; u64 flags; - max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1; do_barriers = !btrfs_test_opt(root, NOBARRIER); backup_super_roots(root->fs_info); @@ -3291,6 +3374,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_lock(&root->fs_info->fs_devices->device_list_mutex); head = &root->fs_info->fs_devices->devices; + max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1; if (do_barriers) { ret = barrier_all_devices(root->fs_info); @@ -3332,9 +3416,12 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) if (total_errors > max_errors) { printk(KERN_ERR "btrfs: %d errors while writing supers\n", total_errors); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - /* This shouldn't happen. FUA is masked off if unsupported */ - BUG(); + /* FUA is masked off if unsupported and can't be the reason */ + btrfs_error(root->fs_info, -EIO, + "%d errors while writing supers", total_errors); + return -EIO; } total_errors = 0; @@ -3366,7 +3453,9 @@ int write_ctree_super(struct btrfs_trans_handle *trans, return ret; } -void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) +/* Drop a fs root from the radix tree and free it. */ +void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root) { spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, @@ -3390,6 +3479,8 @@ static void free_fs_root(struct btrfs_root *root) { iput(root->cache_inode); WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + btrfs_free_block_rsv(root, root->orphan_block_rsv); + root->orphan_block_rsv = NULL; if (root->anon_dev) free_anon_bdev(root->anon_dev); free_extent_buffer(root->node); @@ -3397,7 +3488,12 @@ static void free_fs_root(struct btrfs_root *root) kfree(root->free_ino_ctl); kfree(root->free_ino_pinned); kfree(root->name); - kfree(root); + btrfs_put_fs_root(root); +} + +void btrfs_free_fs_root(struct btrfs_root *root) +{ + free_fs_root(root); } int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) @@ -3474,6 +3570,11 @@ int close_ctree(struct btrfs_root *root) fs_info->closing = 1; smp_mb(); + /* wait for the uuid_scan task to finish */ + down(&fs_info->uuid_tree_rescan_sem); + /* avoid complains from lockdep et al., set sem back to initial state */ + up(&fs_info->uuid_tree_rescan_sem); + /* pause restriper - we want to resume on mount */ btrfs_pause_balance(fs_info); @@ -3512,15 +3613,15 @@ int close_ctree(struct btrfs_root *root) percpu_counter_sum(&fs_info->delalloc_bytes)); } - free_root_pointers(fs_info, 1); - btrfs_free_block_groups(fs_info); + btrfs_stop_all_workers(fs_info); + del_fs_roots(fs_info); - iput(fs_info->btree_inode); + free_root_pointers(fs_info, 1); - btrfs_stop_all_workers(fs_info); + iput(fs_info->btree_inode); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(root, CHECK_INTEGRITY)) @@ -3537,6 +3638,9 @@ int close_ctree(struct btrfs_root *root) btrfs_free_stripe_hash_table(fs_info); + btrfs_free_block_rsv(root, root->orphan_block_rsv); + root->orphan_block_rsv = NULL; + return 0; } @@ -3572,9 +3676,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) if (transid != root->fs_info->generation) WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, " "found %llu running %llu\n", - (unsigned long long)buf->start, - (unsigned long long)transid, - (unsigned long long)root->fs_info->generation); + buf->start, transid, root->fs_info->generation); was_dirty = set_extent_buffer_dirty(buf); if (!was_dirty) __percpu_counter_add(&root->fs_info->dirty_metadata_bytes, @@ -3653,7 +3755,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, INIT_LIST_HEAD(&splice); mutex_lock(&root->fs_info->ordered_operations_mutex); - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); list_splice_init(&t->ordered_operations, &splice); while (!list_empty(&splice)) { @@ -3661,14 +3763,14 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, ordered_operations); list_del_init(&btrfs_inode->ordered_operations); - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); btrfs_invalidate_inodes(btrfs_inode->root); - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); } - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); mutex_unlock(&root->fs_info->ordered_operations_mutex); } @@ -3676,19 +3778,40 @@ static void btrfs_destroy_ordered_extents(struct btrfs_root *root) { struct btrfs_ordered_extent *ordered; - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->ordered_extent_lock); /* * This will just short circuit the ordered completion stuff which will * make sure the ordered extent gets properly cleaned up. */ - list_for_each_entry(ordered, &root->fs_info->ordered_extents, + list_for_each_entry(ordered, &root->ordered_extents, root_extent_list) set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->ordered_extent_lock); } -int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_root *root) +static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + spin_lock(&fs_info->ordered_root_lock); + list_splice_init(&fs_info->ordered_roots, &splice); + while (!list_empty(&splice)) { + root = list_first_entry(&splice, struct btrfs_root, + ordered_root); + list_del_init(&root->ordered_root); + + btrfs_destroy_ordered_extents(root); + + cond_resched_lock(&fs_info->ordered_root_lock); + } + spin_unlock(&fs_info->ordered_root_lock); +} + +static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root) { struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; @@ -3706,6 +3829,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, while ((node = rb_first(&delayed_refs->root)) != NULL) { struct btrfs_delayed_ref_head *head = NULL; + bool pin_bytes = false; ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); atomic_set(&ref->refs, 1); @@ -3726,8 +3850,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, } if (head->must_insert_reserved) - btrfs_pin_extent(root, ref->bytenr, - ref->num_bytes, 1); + pin_bytes = true; btrfs_free_delayed_extent_op(head->extent_op); delayed_refs->num_heads--; if (list_empty(&head->cluster)) @@ -3738,9 +3861,13 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - if (head) - mutex_unlock(&head->mutex); spin_unlock(&delayed_refs->lock); + if (head) { + if (pin_bytes) + btrfs_pin_extent(root, ref->bytenr, + ref->num_bytes, 1); + mutex_unlock(&head->mutex); + } btrfs_put_delayed_ref(ref); cond_resched(); @@ -3777,24 +3904,49 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) INIT_LIST_HEAD(&splice); - spin_lock(&root->fs_info->delalloc_lock); - list_splice_init(&root->fs_info->delalloc_inodes, &splice); + spin_lock(&root->delalloc_lock); + list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { - btrfs_inode = list_entry(splice.next, struct btrfs_inode, - delalloc_inodes); + btrfs_inode = list_first_entry(&splice, struct btrfs_inode, + delalloc_inodes); list_del_init(&btrfs_inode->delalloc_inodes); clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, &btrfs_inode->runtime_flags); - spin_unlock(&root->fs_info->delalloc_lock); + spin_unlock(&root->delalloc_lock); btrfs_invalidate_inodes(btrfs_inode->root); - spin_lock(&root->fs_info->delalloc_lock); + spin_lock(&root->delalloc_lock); } - spin_unlock(&root->fs_info->delalloc_lock); + spin_unlock(&root->delalloc_lock); +} + +static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + spin_lock(&fs_info->delalloc_root_lock); + list_splice_init(&fs_info->delalloc_roots, &splice); + while (!list_empty(&splice)) { + root = list_first_entry(&splice, struct btrfs_root, + delalloc_root); + list_del_init(&root->delalloc_root); + root = btrfs_grab_fs_root(root); + BUG_ON(!root); + spin_unlock(&fs_info->delalloc_root_lock); + + btrfs_destroy_delalloc_inodes(root); + btrfs_put_fs_root(root); + + spin_lock(&fs_info->delalloc_root_lock); + } + spin_unlock(&fs_info->delalloc_root_lock); } static int btrfs_destroy_marked_extents(struct btrfs_root *root, @@ -3878,19 +4030,14 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, cur_trans->dirty_pages.dirty_bytes); - /* FIXME: cleanup wait for commit */ - cur_trans->in_commit = 1; - cur_trans->blocked = 1; + cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&root->fs_info->transaction_blocked_wait); btrfs_evict_pending_snapshots(cur_trans); - cur_trans->blocked = 0; + cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&root->fs_info->transaction_wait); - cur_trans->commit_done = 1; - wake_up(&cur_trans->commit_wait); - btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); @@ -3899,6 +4046,9 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents); + cur_trans->state =TRANS_STATE_COMPLETED; + wake_up(&cur_trans->commit_wait); + /* memset(cur_trans, 0, sizeof(*cur_trans)); kmem_cache_free(btrfs_transaction_cachep, cur_trans); @@ -3914,7 +4064,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) spin_lock(&root->fs_info->trans_lock); list_splice_init(&root->fs_info->trans_list, &list); - root->fs_info->trans_no_join = 1; + root->fs_info->running_transaction = NULL; spin_unlock(&root->fs_info->trans_lock); while (!list_empty(&list)) { @@ -3922,37 +4072,31 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) btrfs_destroy_ordered_operations(t, root); - btrfs_destroy_ordered_extents(root); + btrfs_destroy_all_ordered_extents(root->fs_info); btrfs_destroy_delayed_refs(t, root); - /* FIXME: cleanup wait for commit */ - t->in_commit = 1; - t->blocked = 1; + /* + * FIXME: cleanup wait for commit + * We needn't acquire the lock here, because we are during + * the umount, there is no other task which will change it. + */ + t->state = TRANS_STATE_COMMIT_START; smp_mb(); if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) wake_up(&root->fs_info->transaction_blocked_wait); btrfs_evict_pending_snapshots(t); - t->blocked = 0; + t->state = TRANS_STATE_UNBLOCKED; smp_mb(); if (waitqueue_active(&root->fs_info->transaction_wait)) wake_up(&root->fs_info->transaction_wait); - t->commit_done = 1; - smp_mb(); - if (waitqueue_active(&t->commit_wait)) - wake_up(&t->commit_wait); - btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); - btrfs_destroy_delalloc_inodes(root); - - spin_lock(&root->fs_info->trans_lock); - root->fs_info->running_transaction = NULL; - spin_unlock(&root->fs_info->trans_lock); + btrfs_destroy_all_delalloc_inodes(root->fs_info); btrfs_destroy_marked_extents(root, &t->dirty_pages, EXTENT_DIRTY); @@ -3960,15 +4104,17 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents); + t->state = TRANS_STATE_COMPLETED; + smp_mb(); + if (waitqueue_active(&t->commit_wait)) + wake_up(&t->commit_wait); + atomic_set(&t->use_count, 0); list_del_init(&t->list); memset(t, 0, sizeof(*t)); kmem_cache_free(btrfs_transaction_cachep, t); } - spin_lock(&root->fs_info->trans_lock); - root->fs_info->trans_no_join = 0; - spin_unlock(&root->fs_info->trans_lock); mutex_unlock(&root->fs_info->transaction_kthread_mutex); return 0; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index be69ce1b07a2..b71acd6e1e5b 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -63,14 +63,40 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_commit_super(struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); -struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, - struct btrfs_key *location); +struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, + struct btrfs_key *location); +int btrfs_init_fs_root(struct btrfs_root *root); +int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root); struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, struct btrfs_key *location); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty(struct btrfs_root *root); void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); -void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); +void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root); +void btrfs_free_fs_root(struct btrfs_root *root); + +/* + * This function is used to grab the root, and avoid it is freed when we + * access it. But it doesn't ensure that the tree is not dropped. + * + * If you want to ensure the whole tree is safe, you should use + * fs_info->subvol_srcu + */ +static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) +{ + if (atomic_inc_not_zero(&root->refs)) + return root; + return NULL; +} + +static inline void btrfs_put_fs_root(struct btrfs_root *root) +{ + if (atomic_dec_and_test(&root->refs)) + kfree(root); +} + void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 81ee29eeb7ca..4b8691607373 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -82,11 +82,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, goto fail; } - if (btrfs_root_refs(&root->root_item) == 0) { - err = -ENOENT; - goto fail; - } - key.objectid = objectid; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index df472ab1b5ac..d58bef130a41 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -24,6 +24,7 @@ #include <linux/kthread.h> #include <linux/slab.h> #include <linux/ratelimit.h> +#include <linux/percpu_counter.h> #include "compat.h" #include "hash.h" #include "ctree.h" @@ -112,7 +113,8 @@ static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) { smp_mb(); - return cache->cached == BTRFS_CACHE_FINISHED; + return cache->cached == BTRFS_CACHE_FINISHED || + cache->cached == BTRFS_CACHE_ERROR; } static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) @@ -388,7 +390,7 @@ static noinline void caching_thread(struct btrfs_work *work) u64 total_found = 0; u64 last = 0; u32 nritems; - int ret = 0; + int ret = -ENOMEM; caching_ctl = container_of(work, struct btrfs_caching_control, work); block_group = caching_ctl->block_group; @@ -419,6 +421,7 @@ again: /* need to make sure the commit_root doesn't disappear */ down_read(&fs_info->extent_commit_sem); +next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto err; @@ -458,6 +461,16 @@ again: continue; } + if (key.objectid < last) { + key.objectid = last; + key.offset = 0; + key.type = BTRFS_EXTENT_ITEM_KEY; + + caching_ctl->progress = last; + btrfs_release_path(path); + goto next; + } + if (key.objectid < block_group->key.objectid) { path->slots[0]++; continue; @@ -505,6 +518,12 @@ err: mutex_unlock(&caching_ctl->mutex); out: + if (ret) { + spin_lock(&block_group->lock); + block_group->caching_ctl = NULL; + block_group->cached = BTRFS_CACHE_ERROR; + spin_unlock(&block_group->lock); + } wake_up(&caching_ctl->wait); put_caching_control(caching_ctl); @@ -770,10 +789,23 @@ again: goto out_free; if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = root->leafsize; - btrfs_release_path(path); - goto again; + metadata = 0; + if (path->slots[0]) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == root->leafsize) + ret = 0; + } + if (ret) { + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = root->leafsize; + btrfs_release_path(path); + goto again; + } } if (ret == 0) { @@ -2010,6 +2042,8 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; ref = btrfs_delayed_node_to_data_ref(node); + trace_run_delayed_data_ref(node, ref, node->action); + if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; else @@ -2153,6 +2187,8 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, SKINNY_METADATA); ref = btrfs_delayed_node_to_tree_ref(node); + trace_run_delayed_tree_ref(node, ref, node->action); + if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) parent = ref->parent; else @@ -2211,6 +2247,8 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, */ BUG_ON(extent_op); head = btrfs_delayed_node_to_head(node); + trace_run_delayed_ref_head(node, head, node->action); + if (insert_reserved) { btrfs_pin_extent(root, node->bytenr, node->num_bytes, 1); @@ -2402,6 +2440,8 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, default: WARN_ON(1); } + } else { + list_del_init(&locked_ref->cluster); } spin_unlock(&delayed_refs->lock); @@ -2424,7 +2464,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, * list before we release it. */ if (btrfs_delayed_ref_is_head(ref)) { - list_del_init(&locked_ref->cluster); btrfs_delayed_ref_unlock(locked_ref); locked_ref = NULL; } @@ -2526,6 +2565,51 @@ static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, return 0; } +static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) +{ + u64 num_bytes; + + num_bytes = heads * (sizeof(struct btrfs_extent_item) + + sizeof(struct btrfs_extent_inline_ref)); + if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) + num_bytes += heads * sizeof(struct btrfs_tree_block_info); + + /* + * We don't ever fill up leaves all the way so multiply by 2 just to be + * closer to what we're really going to want to ouse. + */ + return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); +} + +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_rsv *global_rsv; + u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; + u64 num_bytes; + int ret = 0; + + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + num_heads = heads_to_leaves(root, num_heads); + if (num_heads > 1) + num_bytes += (num_heads - 1) * root->leafsize; + num_bytes <<= 1; + global_rsv = &root->fs_info->global_block_rsv; + + /* + * If we can't allocate any more chunks lets make sure we have _lots_ of + * wiggle room since running delayed refs can create more delayed refs. + */ + if (global_rsv->space_info->full) + num_bytes <<= 1; + + spin_lock(&global_rsv->lock); + if (global_rsv->reserved <= num_bytes) + ret = 1; + spin_unlock(&global_rsv->lock); + return ret; +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -2573,7 +2657,8 @@ progress: old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); if (old) { DEFINE_WAIT(__wait); - if (delayed_refs->num_entries < 16348) + if (delayed_refs->flushing || + !btrfs_should_throttle_delayed_refs(trans, root)) return 0; prepare_to_wait(&delayed_refs->wait, &__wait, @@ -2608,7 +2693,7 @@ again: while (1) { if (!(run_all || run_most) && - delayed_refs->num_heads_ready < 64) + !btrfs_should_throttle_delayed_refs(trans, root)) break; /* @@ -2629,6 +2714,7 @@ again: spin_unlock(&delayed_refs->lock); btrfs_abort_transaction(trans, root, ret); atomic_dec(&delayed_refs->procs_running_refs); + wake_up(&delayed_refs->wait); return ret; } @@ -3310,6 +3396,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, struct btrfs_space_info *found; int i; int factor; + int ret; if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) @@ -3333,6 +3420,12 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (!found) return -ENOMEM; + ret = percpu_counter_init(&found->total_bytes_pinned, 0); + if (ret) { + kfree(found); + return ret; + } + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&found->block_groups[i]); init_rwsem(&found->groups_sem); @@ -3565,10 +3658,11 @@ alloc: } /* - * If we have less pinned bytes than we want to allocate then - * don't bother committing the transaction, it won't help us. + * If we don't have enough pinned space to deal with this + * allocation don't bother committing the transaction. */ - if (data_sinfo->bytes_pinned < bytes) + if (percpu_counter_compare(&data_sinfo->total_bytes_pinned, + bytes) < 0) committed = 1; spin_unlock(&data_sinfo->lock); @@ -3577,6 +3671,7 @@ commit_trans: if (!committed && !atomic_read(&root->fs_info->open_ioctl_trans)) { committed = 1; + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -3609,6 +3704,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) data_sinfo = root->fs_info->data_sinfo; spin_lock(&data_sinfo->lock); + WARN_ON(data_sinfo->bytes_may_use < bytes); data_sinfo->bytes_may_use -= bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", data_sinfo->flags, bytes, 0); @@ -3741,8 +3837,12 @@ again: if (force < space_info->force_alloc) force = space_info->force_alloc; if (space_info->full) { + if (should_alloc_chunk(extent_root, space_info, force)) + ret = -ENOSPC; + else + ret = 0; spin_unlock(&space_info->lock); - return 0; + return ret; } if (!should_alloc_chunk(extent_root, space_info, force)) { @@ -3825,7 +3925,6 @@ static int can_overcommit(struct btrfs_root *root, u64 space_size; u64 avail; u64 used; - u64 to_add; used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly; @@ -3859,25 +3958,17 @@ static int can_overcommit(struct btrfs_root *root, BTRFS_BLOCK_GROUP_RAID10)) avail >>= 1; - to_add = space_info->total_bytes; - /* * If we aren't flushing all things, let us overcommit up to * 1/2th of the space. If we can flush, don't let us overcommit * too much, let it overcommit up to 1/8 of the space. */ if (flush == BTRFS_RESERVE_FLUSH_ALL) - to_add >>= 3; + avail >>= 3; else - to_add >>= 1; - - /* - * Limit the overcommit to the amount of free space we could possibly - * allocate for chunks. - */ - to_add = min(avail, to_add); + avail >>= 1; - if (used + bytes < space_info->total_bytes + to_add) + if (used + bytes < space_info->total_bytes + avail) return 1; return 0; } @@ -3886,12 +3977,11 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, unsigned long nr_pages) { struct super_block *sb = root->fs_info->sb; - int started; - /* If we can not start writeback, just sync all the delalloc file. */ - started = try_to_writeback_inodes_sb_nr(sb, nr_pages, - WB_REASON_FS_FREE_SPACE); - if (!started) { + if (down_read_trylock(&sb->s_umount)) { + writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); + up_read(&sb->s_umount); + } else { /* * We needn't worry the filesystem going from r/w to r/o though * we don't acquire ->s_umount mutex, because the filesystem @@ -3899,9 +3989,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, * the filesystem is readonly(all dirty pages are written to * the disk). */ - btrfs_start_delalloc_inodes(root, 0); + btrfs_start_all_delalloc_inodes(root->fs_info, 0); if (!current->journal_info) - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(root->fs_info); } } @@ -3931,7 +4021,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, if (delalloc_bytes == 0) { if (trans) return; - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(root->fs_info); return; } @@ -3959,7 +4049,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, loops++; if (wait_ordered && !trans) { - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(root->fs_info); } else { time_left = schedule_timeout_killable(1); if (time_left) @@ -3997,7 +4087,8 @@ static int may_commit_transaction(struct btrfs_root *root, /* See if there is enough pinned space to make this reservation */ spin_lock(&space_info->lock); - if (space_info->bytes_pinned >= bytes) { + if (percpu_counter_compare(&space_info->total_bytes_pinned, + bytes) >= 0) { spin_unlock(&space_info->lock); goto commit; } @@ -4012,7 +4103,8 @@ static int may_commit_transaction(struct btrfs_root *root, spin_lock(&space_info->lock); spin_lock(&delayed_rsv->lock); - if (space_info->bytes_pinned + delayed_rsv->size < bytes) { + if (percpu_counter_compare(&space_info->total_bytes_pinned, + bytes - delayed_rsv->size) >= 0) { spin_unlock(&delayed_rsv->lock); spin_unlock(&space_info->lock); return -ENOSPC; @@ -4261,6 +4353,9 @@ static struct btrfs_block_rsv *get_block_rsv( if (root == root->fs_info->csum_root && trans->adding_csums) block_rsv = trans->block_rsv; + if (root == root->fs_info->uuid_root) + block_rsv = trans->block_rsv; + if (!block_rsv) block_rsv = root->block_rsv; @@ -4297,6 +4392,31 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, spin_unlock(&block_rsv->lock); } +int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *dest, u64 num_bytes, + int min_factor) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 min_bytes; + + if (global_rsv->space_info != dest->space_info) + return -ENOSPC; + + spin_lock(&global_rsv->lock); + min_bytes = div_factor(global_rsv->size, min_factor); + if (global_rsv->reserved < min_bytes + num_bytes) { + spin_unlock(&global_rsv->lock); + return -ENOSPC; + } + global_rsv->reserved -= num_bytes; + if (global_rsv->reserved < global_rsv->size) + global_rsv->full = 0; + spin_unlock(&global_rsv->lock); + + block_rsv_add_bytes(dest, num_bytes, 1); + return 0; +} + static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes) @@ -4336,7 +4456,6 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, space_info->bytes_may_use -= num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 0); - space_info->reservation_progress++; spin_unlock(&space_info->lock); } } @@ -4537,7 +4656,6 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) sinfo->bytes_may_use -= num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", sinfo->flags, num_bytes, 0); - sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; } @@ -4645,10 +4763,12 @@ void btrfs_orphan_release_metadata(struct inode *inode) int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int items, - u64 *qgroup_reserved) + u64 *qgroup_reserved, + bool use_global_rsv) { u64 num_bytes; int ret; + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; if (root->fs_info->quota_enabled) { /* One for parent inode, two for dir entries */ @@ -4667,6 +4787,10 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, BTRFS_BLOCK_GROUP_METADATA); ret = btrfs_block_rsv_add(root, rsv, num_bytes, BTRFS_RESERVE_FLUSH_ALL); + + if (ret == -ENOSPC && use_global_rsv) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); + if (ret) { if (*qgroup_reserved) btrfs_qgroup_free(root, *qgroup_reserved); @@ -5030,14 +5154,14 @@ static int update_block_group(struct btrfs_root *root, int factor; /* block accounting for super block */ - spin_lock(&info->delalloc_lock); + spin_lock(&info->delalloc_root_lock); old_val = btrfs_super_bytes_used(info->super_copy); if (alloc) old_val += num_bytes; else old_val -= num_bytes; btrfs_set_super_bytes_used(info->super_copy, old_val); - spin_unlock(&info->delalloc_lock); + spin_unlock(&info->delalloc_root_lock); while (total) { cache = btrfs_lookup_block_group(info, bytenr); @@ -5189,6 +5313,80 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, return ret; } +static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes) +{ + int ret; + struct btrfs_block_group_cache *block_group; + struct btrfs_caching_control *caching_ctl; + + block_group = btrfs_lookup_block_group(root->fs_info, start); + if (!block_group) + return -EINVAL; + + cache_block_group(block_group, 0); + caching_ctl = get_caching_control(block_group); + + if (!caching_ctl) { + /* Logic error */ + BUG_ON(!block_group_cache_done(block_group)); + ret = btrfs_remove_free_space(block_group, start, num_bytes); + } else { + mutex_lock(&caching_ctl->mutex); + + if (start >= caching_ctl->progress) { + ret = add_excluded_extent(root, start, num_bytes); + } else if (start + num_bytes <= caching_ctl->progress) { + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + } else { + num_bytes = caching_ctl->progress - start; + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + if (ret) + goto out_lock; + + num_bytes = (start + num_bytes) - + caching_ctl->progress; + start = caching_ctl->progress; + ret = add_excluded_extent(root, start, num_bytes); + } +out_lock: + mutex_unlock(&caching_ctl->mutex); + put_caching_control(caching_ctl); + } + btrfs_put_block_group(block_group); + return ret; +} + +int btrfs_exclude_logged_extents(struct btrfs_root *log, + struct extent_buffer *eb) +{ + struct btrfs_file_extent_item *item; + struct btrfs_key key; + int found_type; + int i; + + if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) + return 0; + + for (i = 0; i < btrfs_header_nritems(eb); i++) { + btrfs_item_key_to_cpu(eb, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(eb, item); + if (found_type == BTRFS_FILE_EXTENT_INLINE) + continue; + if (btrfs_file_extent_disk_bytenr(eb, item) == 0) + continue; + key.objectid = btrfs_file_extent_disk_bytenr(eb, item); + key.offset = btrfs_file_extent_disk_num_bytes(eb, item); + __exclude_logged_extent(log, key.objectid, key.offset); + } + + return 0; +} + /** * btrfs_update_reserved_bytes - update the block_group and space info counters * @cache: The cache we are manipulating @@ -5237,7 +5435,6 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, space_info->bytes_readonly += num_bytes; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; - space_info->reservation_progress++; } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); @@ -5251,6 +5448,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_caching_control *next; struct btrfs_caching_control *caching_ctl; struct btrfs_block_group_cache *cache; + struct btrfs_space_info *space_info; down_write(&fs_info->extent_commit_sem); @@ -5273,6 +5471,9 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, up_write(&fs_info->extent_commit_sem); + list_for_each_entry_rcu(space_info, &fs_info->space_info, list) + percpu_counter_set(&space_info->total_bytes_pinned, 0); + update_global_block_rsv(fs_info); } @@ -5370,6 +5571,27 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, return 0; } +static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, + u64 owner, u64 root_objectid) +{ + struct btrfs_space_info *space_info; + u64 flags; + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + } else { + flags = BTRFS_BLOCK_GROUP_DATA; + } + + space_info = __find_space_info(fs_info, flags); + BUG_ON(!space_info); /* Logic bug */ + percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); +} + + static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -5485,7 +5707,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (ret) { btrfs_err(info, "umm, got %d back from search, was looking for %llu", - ret, (unsigned long long)bytenr); + ret, bytenr); if (ret > 0) btrfs_print_leaf(extent_root, path->nodes[0]); @@ -5501,11 +5723,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, WARN_ON(1); btrfs_err(info, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", - (unsigned long long)bytenr, - (unsigned long long)parent, - (unsigned long long)root_objectid, - (unsigned long long)owner_objectid, - (unsigned long long)owner_offset); + bytenr, parent, root_objectid, owner_objectid, + owner_offset); } else { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5534,7 +5753,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, -1, 1); if (ret) { btrfs_err(info, "umm, got %d back from search, was looking for %llu", - ret, (unsigned long long)bytenr); + ret, bytenr); btrfs_print_leaf(extent_root, path->nodes[0]); } if (ret < 0) { @@ -5590,6 +5809,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } } + add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid, + root_objectid); } else { if (found_extent) { BUG_ON(is_data && refs_to_drop != @@ -5713,6 +5934,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, u64 parent, int last_ref) { struct btrfs_block_group_cache *cache = NULL; + int pin = 1; int ret; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { @@ -5745,8 +5967,14 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_add_free_space(cache, buf->start, buf->len); btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); + pin = 0; } out: + if (pin) + add_pinned_bytes(root->fs_info, buf->len, + btrfs_header_level(buf), + root->root_key.objectid); + /* * Deleting the buffer, clear the corrupt flag since it doesn't matter * anymore. @@ -5763,6 +5991,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, int ret; struct btrfs_fs_info *fs_info = root->fs_info; + add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); + /* * tree log blocks never actually go into the extent allocation * tree, just update pinning info and exit early. @@ -5805,8 +6035,11 @@ static u64 stripe_align(struct btrfs_root *root, * for our min num_bytes. Another option is to have it go ahead * and look in the rbtree for a free extent of a given size, but this * is a good start. + * + * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using + * any of the information in this block group. */ -static noinline int +static noinline void wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, u64 num_bytes) { @@ -5814,28 +6047,29 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, caching_ctl = get_caching_control(cache); if (!caching_ctl) - return 0; + return; wait_event(caching_ctl->wait, block_group_cache_done(cache) || (cache->free_space_ctl->free_space >= num_bytes)); put_caching_control(caching_ctl); - return 0; } static noinline int wait_block_group_cache_done(struct btrfs_block_group_cache *cache) { struct btrfs_caching_control *caching_ctl; + int ret = 0; caching_ctl = get_caching_control(cache); if (!caching_ctl) - return 0; + return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; wait_event(caching_ctl->wait, block_group_cache_done(cache)); - + if (cache->cached == BTRFS_CACHE_ERROR) + ret = -EIO; put_caching_control(caching_ctl); - return 0; + return ret; } int __get_raid_index(u64 flags) @@ -5871,13 +6105,15 @@ enum btrfs_loop_type { /* * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: - * ins->objectid == block start + * ins->objectid == start position * ins->flags = BTRFS_EXTENT_ITEM_KEY - * ins->offset == number of blocks + * ins->offset == the size of the hole. * Any available blocks before search_start are skipped. + * + * If there is no suitable free space, we will record the max size of + * the free space extent currently. */ -static noinline int find_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *orig_root, +static noinline int find_free_extent(struct btrfs_root *orig_root, u64 num_bytes, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, u64 flags) @@ -5888,6 +6124,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group = NULL; struct btrfs_block_group_cache *used_block_group; u64 search_start = 0; + u64 max_extent_size = 0; int empty_cluster = 2 * 1024 * 1024; struct btrfs_space_info *space_info; int loop = 0; @@ -6018,6 +6255,8 @@ have_block_group: ret = 0; } + if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) + goto loop; if (unlikely(block_group->ro)) goto loop; @@ -6045,7 +6284,10 @@ have_block_group: btrfs_get_block_group(used_block_group); offset = btrfs_alloc_from_cluster(used_block_group, - last_ptr, num_bytes, used_block_group->key.objectid); + last_ptr, + num_bytes, + used_block_group->key.objectid, + &max_extent_size); if (offset) { /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); @@ -6098,18 +6340,20 @@ refill_cluster: block_group->full_stripe_len); /* allocate a cluster in this block group */ - ret = btrfs_find_space_cluster(trans, root, - block_group, last_ptr, - search_start, num_bytes, - aligned_cluster); + ret = btrfs_find_space_cluster(root, block_group, + last_ptr, search_start, + num_bytes, + aligned_cluster); if (ret == 0) { /* * now pull our allocation out of this * cluster */ offset = btrfs_alloc_from_cluster(block_group, - last_ptr, num_bytes, - search_start); + last_ptr, + num_bytes, + search_start, + &max_extent_size); if (offset) { /* we found one, proceed */ spin_unlock(&last_ptr->refill_lock); @@ -6144,13 +6388,18 @@ unclustered_alloc: if (cached && block_group->free_space_ctl->free_space < num_bytes + empty_cluster + empty_size) { + if (block_group->free_space_ctl->free_space > + max_extent_size) + max_extent_size = + block_group->free_space_ctl->free_space; spin_unlock(&block_group->free_space_ctl->tree_lock); goto loop; } spin_unlock(&block_group->free_space_ctl->tree_lock); offset = btrfs_find_space_for_alloc(block_group, search_start, - num_bytes, empty_size); + num_bytes, empty_size, + &max_extent_size); /* * If we didn't find a chunk, and we haven't failed on this * block group before, and this block group is in the middle of @@ -6232,17 +6481,28 @@ loop: index = 0; loop++; if (loop == LOOP_ALLOC_CHUNK) { + struct btrfs_trans_handle *trans; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + ret = do_chunk_alloc(trans, root, flags, CHUNK_ALLOC_FORCE); /* * Do not bail out on ENOSPC since we * can do more things. */ - if (ret < 0 && ret != -ENOSPC) { + if (ret < 0 && ret != -ENOSPC) btrfs_abort_transaction(trans, root, ret); + else + ret = 0; + btrfs_end_transaction(trans, root); + if (ret) goto out; - } } if (loop == LOOP_NO_EMPTY_SIZE) { @@ -6257,7 +6517,8 @@ loop: ret = 0; } out: - + if (ret == -ENOSPC) + ins->offset = max_extent_size; return ret; } @@ -6269,19 +6530,15 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, spin_lock(&info->lock); printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", - (unsigned long long)info->flags, - (unsigned long long)(info->total_bytes - info->bytes_used - - info->bytes_pinned - info->bytes_reserved - - info->bytes_readonly), + info->flags, + info->total_bytes - info->bytes_used - info->bytes_pinned - + info->bytes_reserved - info->bytes_readonly, (info->full) ? "" : "not "); printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " "reserved=%llu, may_use=%llu, readonly=%llu\n", - (unsigned long long)info->total_bytes, - (unsigned long long)info->bytes_used, - (unsigned long long)info->bytes_pinned, - (unsigned long long)info->bytes_reserved, - (unsigned long long)info->bytes_may_use, - (unsigned long long)info->bytes_readonly); + info->total_bytes, info->bytes_used, info->bytes_pinned, + info->bytes_reserved, info->bytes_may_use, + info->bytes_readonly); spin_unlock(&info->lock); if (!dump_block_groups) @@ -6292,12 +6549,9 @@ again: list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n", - (unsigned long long)cache->key.objectid, - (unsigned long long)cache->key.offset, - (unsigned long long)btrfs_block_group_used(&cache->item), - (unsigned long long)cache->pinned, - (unsigned long long)cache->reserved, - cache->ro ? "[readonly]" : ""); + cache->key.objectid, cache->key.offset, + btrfs_block_group_used(&cache->item), cache->pinned, + cache->reserved, cache->ro ? "[readonly]" : ""); btrfs_dump_free_space(cache, bytes); spin_unlock(&cache->lock); } @@ -6306,8 +6560,7 @@ again: up_read(&info->groups_sem); } -int btrfs_reserve_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data) @@ -6319,12 +6572,12 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, flags = btrfs_get_alloc_profile(root, is_data); again: WARN_ON(num_bytes < root->sectorsize); - ret = find_free_extent(trans, root, num_bytes, empty_size, - hint_byte, ins, flags); + ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, + flags); if (ret == -ENOSPC) { - if (!final_tried) { - num_bytes = num_bytes >> 1; + if (!final_tried && ins->offset) { + num_bytes = min(num_bytes >> 1, ins->offset); num_bytes = round_down(num_bytes, root->sectorsize); num_bytes = max(num_bytes, min_alloc_size); if (num_bytes == min_alloc_size) @@ -6335,8 +6588,7 @@ again: sinfo = __find_space_info(root->fs_info, flags); btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu", - (unsigned long long)flags, - (unsigned long long)num_bytes); + flags, num_bytes); if (sinfo) dump_space_info(sinfo, num_bytes, 1); } @@ -6356,7 +6608,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, cache = btrfs_lookup_block_group(root->fs_info, start); if (!cache) { btrfs_err(root->fs_info, "Unable to find block group for %llu", - (unsigned long long)start); + start); return -ENOSPC; } @@ -6452,8 +6704,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, ret = update_block_group(root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", - (unsigned long long)ins->objectid, - (unsigned long long)ins->offset); + ins->objectid, ins->offset); BUG(); } return ret; @@ -6525,8 +6776,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, ret = update_block_group(root, ins->objectid, root->leafsize, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", - (unsigned long long)ins->objectid, - (unsigned long long)ins->offset); + ins->objectid, ins->offset); BUG(); } return ret; @@ -6560,52 +6810,26 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, { int ret; struct btrfs_block_group_cache *block_group; - struct btrfs_caching_control *caching_ctl; - u64 start = ins->objectid; - u64 num_bytes = ins->offset; - block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); - cache_block_group(block_group, 0); - caching_ctl = get_caching_control(block_group); - - if (!caching_ctl) { - BUG_ON(!block_group_cache_done(block_group)); - ret = btrfs_remove_free_space(block_group, start, num_bytes); - if (ret) - goto out; - } else { - mutex_lock(&caching_ctl->mutex); - - if (start >= caching_ctl->progress) { - ret = add_excluded_extent(root, start, num_bytes); - } else if (start + num_bytes <= caching_ctl->progress) { - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - } else { - num_bytes = caching_ctl->progress - start; - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - if (ret) - goto out_lock; - - start = caching_ctl->progress; - num_bytes = ins->objectid + ins->offset - - caching_ctl->progress; - ret = add_excluded_extent(root, start, num_bytes); - } -out_lock: - mutex_unlock(&caching_ctl->mutex); - put_caching_control(caching_ctl); + /* + * Mixed block groups will exclude before processing the log so we only + * need to do the exlude dance if this fs isn't mixed. + */ + if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { + ret = __exclude_logged_extent(root, ins->objectid, ins->offset); if (ret) - goto out; + return ret; } + block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); + if (!block_group) + return -EINVAL; + ret = btrfs_update_reserved_bytes(block_group, ins->offset, RESERVE_ALLOC_NO_ACCOUNT); BUG_ON(ret); /* logic error */ ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); -out: btrfs_put_block_group(block_group); return ret; } @@ -6734,7 +6958,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); - ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, + ret = btrfs_reserve_extent(root, blocksize, blocksize, empty_size, hint, &ins, 0); if (ret) { unuse_block_rsv(root->fs_info, block_rsv, blocksize); @@ -7005,6 +7229,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, next = btrfs_find_create_tree_block(root, bytenr, blocksize); if (!next) return -ENOMEM; + btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, + level - 1); reada = 1; } btrfs_tree_lock(next); @@ -7298,6 +7524,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int err = 0; int ret; int level; + bool root_dropped = false; path = btrfs_alloc_path(); if (!path) { @@ -7355,6 +7582,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, while (1) { btrfs_tree_lock(path->nodes[level]); btrfs_set_lock_blocking(path->nodes[level]); + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; ret = btrfs_lookup_extent_info(trans, root, path->nodes[level]->start, @@ -7370,6 +7598,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, break; btrfs_tree_unlock(path->nodes[level]); + path->locks[level] = 0; WARN_ON(wc->refs[level] != 1); level--; } @@ -7384,11 +7613,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { - if (!for_reloc && btrfs_fs_closing(root->fs_info)) { - pr_debug("btrfs: drop snapshot early exit\n"); - err = -EAGAIN; - goto out_end_trans; - } ret = walk_down_tree(trans, root, path, wc); if (ret < 0) { @@ -7416,7 +7640,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } BUG_ON(wc->level == 0); - if (btrfs_should_end_transaction(trans, tree_root)) { + if (btrfs_should_end_transaction(trans, tree_root) || + (!for_reloc && btrfs_need_cleaner_sleep(root))) { ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); @@ -7427,6 +7652,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } btrfs_end_transaction_throttle(trans, tree_root); + if (!for_reloc && btrfs_need_cleaner_sleep(root)) { + pr_debug("btrfs: drop snapshot early exit\n"); + err = -EAGAIN; + goto out_free; + } + trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { err = PTR_ERR(trans); @@ -7447,8 +7678,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_find_last_root(tree_root, root->root_key.objectid, - NULL, NULL); + ret = btrfs_find_root(tree_root, &root->root_key, path, + NULL, NULL); if (ret < 0) { btrfs_abort_transaction(trans, tree_root, ret); err = ret; @@ -7465,18 +7696,28 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } if (root->in_radix) { - btrfs_free_fs_root(tree_root->fs_info, root); + btrfs_drop_and_free_fs_root(tree_root->fs_info, root); } else { free_extent_buffer(root->node); free_extent_buffer(root->commit_root); - kfree(root); + btrfs_put_fs_root(root); } + root_dropped = true; out_end_trans: btrfs_end_transaction_throttle(trans, tree_root); out_free: kfree(wc); btrfs_free_path(path); out: + /* + * So if we need to stop dropping the snapshot for whatever reason we + * need to make sure to add it back to the dead root list so that we + * keep trying to do the work later. This also cleans up roots if we + * don't have it in the radix (like when we recover after a power fail + * or unmount) so we don't leak memory. + */ + if (!for_reloc && root_dropped == false) + btrfs_add_dead_root(root); if (err) btrfs_std_error(root->fs_info, err); return err; @@ -7782,6 +8023,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) struct btrfs_space_info *space_info; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_device *device; + struct btrfs_trans_handle *trans; u64 min_free; u64 dev_min = 1; u64 dev_nr = 0; @@ -7868,6 +8110,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) do_div(min_free, dev_min); } + /* We need to do this so that we can look at pending chunks */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + mutex_lock(&root->fs_info->chunk_mutex); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { u64 dev_offset; @@ -7878,7 +8127,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) */ if (device->total_bytes > device->bytes_used + min_free && !device->is_tgtdev_for_dev_replace) { - ret = find_free_dev_extent(device, min_free, + ret = find_free_dev_extent(trans, device, min_free, &dev_offset, NULL); if (!ret) dev_nr++; @@ -7890,6 +8139,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) } } mutex_unlock(&root->fs_info->chunk_mutex); + btrfs_end_transaction(trans, root); out: btrfs_put_block_group(block_group); return ret; @@ -8000,7 +8250,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) * We haven't cached this block group, which means we could * possibly have excluded extents on this block group. */ - if (block_group->cached == BTRFS_CACHE_NO) + if (block_group->cached == BTRFS_CACHE_NO || + block_group->cached == BTRFS_CACHE_ERROR) free_excluded_extents(info->extent_root, block_group); btrfs_remove_free_space_cache(block_group); @@ -8032,6 +8283,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) dump_space_info(space_info, 0, 0); } } + percpu_counter_destroy(&space_info->total_bytes_pinned); list_del(&space_info->list); kfree(space_info); } @@ -8216,9 +8468,13 @@ int btrfs_read_block_groups(struct btrfs_root *root) * avoid allocating from un-mirrored block group if there are * mirrored block groups. */ - list_for_each_entry(cache, &space_info->block_groups[3], list) + list_for_each_entry(cache, + &space_info->block_groups[BTRFS_RAID_RAID0], + list) set_block_group_ro(cache, 1); - list_for_each_entry(cache, &space_info->block_groups[4], list) + list_for_each_entry(cache, + &space_info->block_groups[BTRFS_RAID_SINGLE], + list) set_block_group_ro(cache, 1); } @@ -8254,6 +8510,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, sizeof(item)); if (ret) btrfs_abort_transaction(trans, extent_root, ret); + ret = btrfs_finish_chunk_alloc(trans, extent_root, + key.objectid, key.offset); + if (ret) + btrfs_abort_transaction(trans, extent_root, ret); } } @@ -8591,8 +8851,15 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) if (end - start >= range->minlen) { if (!block_group_cache_done(cache)) { ret = cache_block_group(cache, 0); - if (!ret) - wait_block_group_cache_done(cache); + if (ret) { + btrfs_put_block_group(cache); + break; + } + ret = wait_block_group_cache_done(cache); + if (ret) { + btrfs_put_block_group(cache); + break; + } } ret = btrfs_trim_block_group(cache, &group_trimmed, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e7e7afb4a872..22bda32acb89 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -61,9 +61,8 @@ void btrfs_leak_debug_check(void) state = list_entry(states.next, struct extent_state, leak_list); printk(KERN_ERR "btrfs state leak: start %llu end %llu " "state %lu in tree %p refs %d\n", - (unsigned long long)state->start, - (unsigned long long)state->end, - state->state, state->tree, atomic_read(&state->refs)); + state->start, state->end, state->state, state->tree, + atomic_read(&state->refs)); list_del(&state->leak_list); kmem_cache_free(extent_state_cache, state); } @@ -71,16 +70,31 @@ void btrfs_leak_debug_check(void) while (!list_empty(&buffers)) { eb = list_entry(buffers.next, struct extent_buffer, leak_list); printk(KERN_ERR "btrfs buffer leak start %llu len %lu " - "refs %d\n", (unsigned long long)eb->start, - eb->len, atomic_read(&eb->refs)); + "refs %d\n", + eb->start, eb->len, atomic_read(&eb->refs)); list_del(&eb->leak_list); kmem_cache_free(extent_buffer_cache, eb); } } + +#define btrfs_debug_check_extent_io_range(inode, start, end) \ + __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end)) +static inline void __btrfs_debug_check_extent_io_range(const char *caller, + struct inode *inode, u64 start, u64 end) +{ + u64 isize = i_size_read(inode); + + if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { + printk_ratelimited(KERN_DEBUG + "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n", + caller, btrfs_ino(inode), isize, start, end); + } +} #else #define btrfs_leak_debug_add(new, head) do {} while (0) #define btrfs_leak_debug_del(entry) do {} while (0) #define btrfs_leak_debug_check() do {} while (0) +#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) #endif #define BUFFER_LRU_MAX 64 @@ -131,8 +145,16 @@ int __init extent_io_init(void) offsetof(struct btrfs_io_bio, bio)); if (!btrfs_bioset) goto free_buffer_cache; + + if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE)) + goto free_bioset; + return 0; +free_bioset: + bioset_free(btrfs_bioset); + btrfs_bioset = NULL; + free_buffer_cache: kmem_cache_destroy(extent_buffer_cache); extent_buffer_cache = NULL; @@ -369,8 +391,7 @@ static int insert_state(struct extent_io_tree *tree, if (end < start) WARN(1, KERN_ERR "btrfs end < start %llu %llu\n", - (unsigned long long)end, - (unsigned long long)start); + end, start); state->start = start; state->end = end; @@ -381,9 +402,8 @@ static int insert_state(struct extent_io_tree *tree, struct extent_state *found; found = rb_entry(node, struct extent_state, rb_node); printk(KERN_ERR "btrfs found node %llu %llu on insert of " - "%llu %llu\n", (unsigned long long)found->start, - (unsigned long long)found->end, - (unsigned long long)start, (unsigned long long)end); + "%llu %llu\n", + found->start, found->end, start, end); return -EEXIST; } state->tree = tree; @@ -522,6 +542,11 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int err; int clear = 0; + btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + + if (bits & EXTENT_DELALLOC) + bits |= EXTENT_NORESERVE; + if (delete) bits |= ~EXTENT_CTLBITS; bits |= EXTENT_FIRST_DELALLOC; @@ -677,6 +702,8 @@ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct rb_node *node; + btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + spin_lock(&tree->lock); again: while (1) { @@ -736,15 +763,6 @@ static void cache_state(struct extent_state *state, } } -static void uncache_state(struct extent_state **cached_ptr) -{ - if (cached_ptr && (*cached_ptr)) { - struct extent_state *state = *cached_ptr; - *cached_ptr = NULL; - free_extent_state(state); - } -} - /* * set some bits on a range in the tree. This may require allocations or * sleeping, so the gfp mask is used to indicate what is allowed. @@ -769,6 +787,8 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u64 last_start; u64 last_end; + btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + bits |= EXTENT_FIRST_DELALLOC; again: if (!prealloc && (mask & __GFP_WAIT)) { @@ -989,6 +1009,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u64 last_start; u64 last_end; + btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + again: if (!prealloc && (mask & __GFP_WAIT)) { prealloc = alloc_extent_state(mask); @@ -1467,10 +1489,12 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, *end = state->end; cur_start = state->end + 1; node = rb_next(node); - if (!node) - break; total_bytes += state->end - state->start + 1; - if (total_bytes >= max_bytes) + if (total_bytes >= max_bytes) { + *end = *start + max_bytes - 1; + break; + } + if (!node) break; } out: @@ -1598,7 +1622,7 @@ again: *start = delalloc_start; *end = delalloc_end; free_extent_state(cached_state); - return found; + return 0; } /* @@ -1657,31 +1681,21 @@ out_failed: return found; } -int extent_clear_unlock_delalloc(struct inode *inode, - struct extent_io_tree *tree, - u64 start, u64 end, struct page *locked_page, - unsigned long op) +int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, + struct page *locked_page, + unsigned long clear_bits, + unsigned long page_ops) { + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; int ret; struct page *pages[16]; unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = end >> PAGE_CACHE_SHIFT; unsigned long nr_pages = end_index - index + 1; int i; - unsigned long clear_bits = 0; - - if (op & EXTENT_CLEAR_UNLOCK) - clear_bits |= EXTENT_LOCKED; - if (op & EXTENT_CLEAR_DIRTY) - clear_bits |= EXTENT_DIRTY; - - if (op & EXTENT_CLEAR_DELALLOC) - clear_bits |= EXTENT_DELALLOC; clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); - if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | - EXTENT_SET_PRIVATE2))) + if (page_ops == 0) return 0; while (nr_pages > 0) { @@ -1690,20 +1704,20 @@ int extent_clear_unlock_delalloc(struct inode *inode, nr_pages, ARRAY_SIZE(pages)), pages); for (i = 0; i < ret; i++) { - if (op & EXTENT_SET_PRIVATE2) + if (page_ops & PAGE_SET_PRIVATE2) SetPagePrivate2(pages[i]); if (pages[i] == locked_page) { page_cache_release(pages[i]); continue; } - if (op & EXTENT_CLEAR_DIRTY) + if (page_ops & PAGE_CLEAR_DIRTY) clear_page_dirty_for_io(pages[i]); - if (op & EXTENT_SET_WRITEBACK) + if (page_ops & PAGE_SET_WRITEBACK) set_page_writeback(pages[i]); - if (op & EXTENT_END_WRITEBACK) + if (page_ops & PAGE_END_WRITEBACK) end_page_writeback(pages[i]); - if (op & EXTENT_CLEAR_UNLOCK_PAGE) + if (page_ops & PAGE_UNLOCK) unlock_page(pages[i]); page_cache_release(pages[i]); } @@ -1780,7 +1794,7 @@ out: * set the private field for a given byte offset in the tree. If there isn't * an extent_state there already, this does nothing. */ -int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) +static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) { struct rb_node *node; struct extent_state *state; @@ -1807,64 +1821,6 @@ out: return ret; } -void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[], - int count) -{ - struct rb_node *node; - struct extent_state *state; - - spin_lock(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - BUG_ON(!node); - - state = rb_entry(node, struct extent_state, rb_node); - BUG_ON(state->start != start); - - while (count) { - state->private = *csums++; - count--; - state = next_state(state); - } - spin_unlock(&tree->lock); -} - -static inline u64 __btrfs_get_bio_offset(struct bio *bio, int bio_index) -{ - struct bio_vec *bvec = bio->bi_io_vec + bio_index; - - return page_offset(bvec->bv_page) + bvec->bv_offset; -} - -void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio, int bio_index, - u32 csums[], int count) -{ - struct rb_node *node; - struct extent_state *state = NULL; - u64 start; - - spin_lock(&tree->lock); - do { - start = __btrfs_get_bio_offset(bio, bio_index); - if (state == NULL || state->start != start) { - node = tree_search(tree, start); - BUG_ON(!node); - - state = rb_entry(node, struct extent_state, rb_node); - BUG_ON(state->start != start); - } - state->private = *csums++; - count--; - bio_index++; - - state = next_state(state); - } while (count); - spin_unlock(&tree->lock); -} - int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) { struct rb_node *node; @@ -2143,7 +2099,8 @@ static int clean_io_failure(u64 start, struct page *page) EXTENT_LOCKED); spin_unlock(&BTRFS_I(inode)->io_tree.lock); - if (state && state->start == failrec->start) { + if (state && state->start <= failrec->start && + state->end >= failrec->start + failrec->len - 1) { fs_info = BTRFS_I(inode)->root->fs_info; num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); @@ -2171,9 +2128,9 @@ out: * needed */ -static int bio_readpage_error(struct bio *failed_bio, struct page *page, - u64 start, u64 end, int failed_mirror, - struct extent_state *state) +static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int failed_mirror) { struct io_failure_record *failrec = NULL; u64 private; @@ -2183,6 +2140,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct bio *bio; + struct btrfs_io_bio *btrfs_failed_bio; + struct btrfs_io_bio *btrfs_bio; int num_copies; int ret; int read_mode; @@ -2266,23 +2225,12 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, * all the retry and error correction code that follows. no * matter what the error is, it is very likely to persist. */ - pr_debug("bio_readpage_error: cannot repair, num_copies == 1. " - "state=%p, num_copies=%d, next_mirror %d, " - "failed_mirror %d\n", state, num_copies, - failrec->this_mirror, failed_mirror); + pr_debug("bio_readpage_error: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n", + num_copies, failrec->this_mirror, failed_mirror); free_io_failure(inode, failrec, 0); return -EIO; } - if (!state) { - spin_lock(&tree->lock); - state = find_first_extent_bit_state(tree, failrec->start, - EXTENT_LOCKED); - if (state && state->start != failrec->start) - state = NULL; - spin_unlock(&tree->lock); - } - /* * there are two premises: * a) deliver good data to the caller @@ -2319,9 +2267,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, read_mode = READ_SYNC; } - if (!state || failrec->this_mirror > num_copies) { - pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, " - "next_mirror %d, failed_mirror %d\n", state, + if (failrec->this_mirror > num_copies) { + pr_debug("bio_readpage_error: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n", num_copies, failrec->this_mirror, failed_mirror); free_io_failure(inode, failrec, 0); return -EIO; @@ -2332,12 +2279,24 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, free_io_failure(inode, failrec, 0); return -EIO; } - bio->bi_private = state; bio->bi_end_io = failed_bio->bi_end_io; bio->bi_sector = failrec->logical >> 9; bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; bio->bi_size = 0; + btrfs_failed_bio = btrfs_io_bio(failed_bio); + if (btrfs_failed_bio->csum) { + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + + btrfs_bio = btrfs_io_bio(bio); + btrfs_bio->csum = btrfs_bio->csum_inline; + phy_offset >>= inode->i_sb->s_blocksize_bits; + phy_offset *= csum_size; + memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + phy_offset, + csum_size); + } + bio_add_page(bio, page, failrec->len, start - page_offset(page)); pr_debug("bio_readpage_error: submitting new read[%#x] to " @@ -2420,6 +2379,18 @@ static void end_bio_extent_writepage(struct bio *bio, int err) bio_put(bio); } +static void +endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, + int uptodate) +{ + struct extent_state *cached = NULL; + u64 end = start + len - 1; + + if (uptodate && tree->track_uptodate) + set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); + unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); +} + /* * after a readpage IO is done, we need to: * clear the uptodate bits on error @@ -2436,9 +2407,14 @@ static void end_bio_extent_readpage(struct bio *bio, int err) int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; struct bio_vec *bvec = bio->bi_io_vec; + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree; + u64 offset = 0; u64 start; u64 end; + u64 len; + u64 extent_start = 0; + u64 extent_len = 0; int mirror; int ret; @@ -2447,14 +2423,12 @@ static void end_bio_extent_readpage(struct bio *bio, int err) do { struct page *page = bvec->bv_page; - struct extent_state *cached = NULL; - struct extent_state *state; - struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct inode *inode = page->mapping->host; pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " "mirror=%lu\n", (u64)bio->bi_sector, err, io_bio->mirror_num); - tree = &BTRFS_I(page->mapping->host)->io_tree; + tree = &BTRFS_I(inode)->io_tree; /* We always issue full-page reads, but if some block * in a page fails to read, blk_update_request() will @@ -2469,37 +2443,32 @@ static void end_bio_extent_readpage(struct bio *bio, int err) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; + len = bvec->bv_len; if (++bvec <= bvec_end) prefetchw(&bvec->bv_page->flags); - spin_lock(&tree->lock); - state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED); - if (state && state->start == start) { - /* - * take a reference on the state, unlock will drop - * the ref - */ - cache_state(state, &cached); - } - spin_unlock(&tree->lock); - mirror = io_bio->mirror_num; - if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { - ret = tree->ops->readpage_end_io_hook(page, start, end, - state, mirror); + if (likely(uptodate && tree->ops && + tree->ops->readpage_end_io_hook)) { + ret = tree->ops->readpage_end_io_hook(io_bio, offset, + page, start, end, + mirror); if (ret) uptodate = 0; else clean_io_failure(start, page); } - if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { + if (likely(uptodate)) + goto readpage_ok; + + if (tree->ops && tree->ops->readpage_io_failed_hook) { ret = tree->ops->readpage_io_failed_hook(page, mirror); if (!ret && !err && test_bit(BIO_UPTODATE, &bio->bi_flags)) uptodate = 1; - } else if (!uptodate) { + } else { /* * The generic bio_readpage_error handles errors the * following way: If possible, new read requests are @@ -2510,32 +2479,62 @@ static void end_bio_extent_readpage(struct bio *bio, int err) * can't handle the error it will return -EIO and we * remain responsible for that page. */ - ret = bio_readpage_error(bio, page, start, end, mirror, NULL); + ret = bio_readpage_error(bio, offset, page, start, end, + mirror); if (ret == 0) { uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) uptodate = 0; - uncache_state(&cached); continue; } } - - if (uptodate && tree->track_uptodate) { - set_extent_uptodate(tree, start, end, &cached, - GFP_ATOMIC); - } - unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); - - if (uptodate) { +readpage_ok: + if (likely(uptodate)) { + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + + /* Zero out the end if this page straddles i_size */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index == end_index && offset) + zero_user_segment(page, offset, PAGE_CACHE_SIZE); SetPageUptodate(page); } else { ClearPageUptodate(page); SetPageError(page); } unlock_page(page); + offset += len; + + if (unlikely(!uptodate)) { + if (extent_len) { + endio_readpage_release_extent(tree, + extent_start, + extent_len, 1); + extent_start = 0; + extent_len = 0; + } + endio_readpage_release_extent(tree, start, + end - start + 1, 0); + } else if (!extent_len) { + extent_start = start; + extent_len = end + 1 - start; + } else if (extent_start + extent_len == start) { + extent_len += end + 1 - start; + } else { + endio_readpage_release_extent(tree, extent_start, + extent_len, uptodate); + extent_start = start; + extent_len = end + 1 - start; + } } while (bvec <= bvec_end); + if (extent_len) + endio_readpage_release_extent(tree, extent_start, extent_len, + uptodate); + if (io_bio->end_io) + io_bio->end_io(io_bio, err); bio_put(bio); } @@ -2547,6 +2546,7 @@ struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags) { + struct btrfs_io_bio *btrfs_bio; struct bio *bio; bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); @@ -2562,6 +2562,10 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, bio->bi_size = 0; bio->bi_bdev = bdev; bio->bi_sector = first_sector; + btrfs_bio = btrfs_io_bio(bio); + btrfs_bio->csum = NULL; + btrfs_bio->csum_allocated = NULL; + btrfs_bio->end_io = NULL; } return bio; } @@ -2575,7 +2579,17 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) /* this also allocates from the btrfs_bioset */ struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) { - return bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); + struct btrfs_io_bio *btrfs_bio; + struct bio *bio; + + bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); + if (bio) { + btrfs_bio = btrfs_io_bio(bio); + btrfs_bio->csum = NULL; + btrfs_bio->csum_allocated = NULL; + btrfs_bio->end_io = NULL; + } + return bio; } @@ -2699,17 +2713,45 @@ void set_page_extent_mapped(struct page *page) } } +static struct extent_map * +__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, + u64 start, u64 len, get_extent_t *get_extent, + struct extent_map **em_cached) +{ + struct extent_map *em; + + if (em_cached && *em_cached) { + em = *em_cached; + if (em->in_tree && start >= em->start && + start < extent_map_end(em)) { + atomic_inc(&em->refs); + return em; + } + + free_extent_map(em); + *em_cached = NULL; + } + + em = get_extent(inode, page, pg_offset, start, len, 0); + if (em_cached && !IS_ERR_OR_NULL(em)) { + BUG_ON(*em_cached); + atomic_inc(&em->refs); + *em_cached = em; + } + return em; +} /* * basic readpage implementation. Locked extent state structs are inserted * into the tree that are removed when the IO is done (by the end_io * handlers) * XXX JDM: This needs looking at to ensure proper page locking */ -static int __extent_read_full_page(struct extent_io_tree *tree, - struct page *page, - get_extent_t *get_extent, - struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw) +static int __do_readpage(struct extent_io_tree *tree, + struct page *page, + get_extent_t *get_extent, + struct extent_map **em_cached, + struct bio **bio, int mirror_num, + unsigned long *bio_flags, int rw) { struct inode *inode = page->mapping->host; u64 start = page_offset(page); @@ -2723,35 +2765,26 @@ static int __extent_read_full_page(struct extent_io_tree *tree, sector_t sector; struct extent_map *em; struct block_device *bdev; - struct btrfs_ordered_extent *ordered; int ret; int nr = 0; + int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED; size_t pg_offset = 0; size_t iosize; size_t disk_io_size; size_t blocksize = inode->i_sb->s_blocksize; - unsigned long this_bio_flag = 0; + unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED; set_page_extent_mapped(page); + end = page_end; if (!PageUptodate(page)) { if (cleancache_get_page(page) == 0) { BUG_ON(blocksize != PAGE_SIZE); + unlock_extent(tree, start, end); goto out; } } - end = page_end; - while (1) { - lock_extent(tree, start, end); - ordered = btrfs_lookup_ordered_extent(inode, start); - if (!ordered) - break; - unlock_extent(tree, start, end); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } - if (page->index == last_byte >> PAGE_CACHE_SHIFT) { char *userpage; size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); @@ -2778,15 +2811,18 @@ static int __extent_read_full_page(struct extent_io_tree *tree, kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - unlock_extent_cached(tree, cur, cur + iosize - 1, - &cached, GFP_NOFS); + if (!parent_locked) + unlock_extent_cached(tree, cur, + cur + iosize - 1, + &cached, GFP_NOFS); break; } - em = get_extent(inode, page, pg_offset, cur, - end - cur + 1, 0); + em = __get_extent_map(inode, page, pg_offset, cur, + end - cur + 1, get_extent, em_cached); if (IS_ERR_OR_NULL(em)) { SetPageError(page); - unlock_extent(tree, cur, end); + if (!parent_locked) + unlock_extent(tree, cur, end); break; } extent_offset = cur - em->start; @@ -2794,7 +2830,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, BUG_ON(end < cur); if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - this_bio_flag = EXTENT_BIO_COMPRESSED; + this_bio_flag |= EXTENT_BIO_COMPRESSED; extent_set_compress_type(&this_bio_flag, em->compress_type); } @@ -2838,7 +2874,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); - unlock_extent(tree, cur, cur + iosize - 1); + if (!parent_locked) + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -2848,7 +2885,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, */ if (block_start == EXTENT_MAP_INLINE) { SetPageError(page); - unlock_extent(tree, cur, cur + iosize - 1); + if (!parent_locked) + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -2866,7 +2904,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, *bio_flags = this_bio_flag; } else { SetPageError(page); - unlock_extent(tree, cur, cur + iosize - 1); + if (!parent_locked) + unlock_extent(tree, cur, cur + iosize - 1); } cur = cur + iosize; pg_offset += iosize; @@ -2880,6 +2919,104 @@ out: return 0; } +static inline void __do_contiguous_readpages(struct extent_io_tree *tree, + struct page *pages[], int nr_pages, + u64 start, u64 end, + get_extent_t *get_extent, + struct extent_map **em_cached, + struct bio **bio, int mirror_num, + unsigned long *bio_flags, int rw) +{ + struct inode *inode; + struct btrfs_ordered_extent *ordered; + int index; + + inode = pages[0]->mapping->host; + while (1) { + lock_extent(tree, start, end); + ordered = btrfs_lookup_ordered_range(inode, start, + end - start + 1); + if (!ordered) + break; + unlock_extent(tree, start, end); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } + + for (index = 0; index < nr_pages; index++) { + __do_readpage(tree, pages[index], get_extent, em_cached, bio, + mirror_num, bio_flags, rw); + page_cache_release(pages[index]); + } +} + +static void __extent_readpages(struct extent_io_tree *tree, + struct page *pages[], + int nr_pages, get_extent_t *get_extent, + struct extent_map **em_cached, + struct bio **bio, int mirror_num, + unsigned long *bio_flags, int rw) +{ + u64 start = 0; + u64 end = 0; + u64 page_start; + int index; + int first_index = 0; + + for (index = 0; index < nr_pages; index++) { + page_start = page_offset(pages[index]); + if (!end) { + start = page_start; + end = start + PAGE_CACHE_SIZE - 1; + first_index = index; + } else if (end + 1 == page_start) { + end += PAGE_CACHE_SIZE; + } else { + __do_contiguous_readpages(tree, &pages[first_index], + index - first_index, start, + end, get_extent, em_cached, + bio, mirror_num, bio_flags, + rw); + start = page_start; + end = start + PAGE_CACHE_SIZE - 1; + first_index = index; + } + } + + if (end) + __do_contiguous_readpages(tree, &pages[first_index], + index - first_index, start, + end, get_extent, em_cached, bio, + mirror_num, bio_flags, rw); +} + +static int __extent_read_full_page(struct extent_io_tree *tree, + struct page *page, + get_extent_t *get_extent, + struct bio **bio, int mirror_num, + unsigned long *bio_flags, int rw) +{ + struct inode *inode = page->mapping->host; + struct btrfs_ordered_extent *ordered; + u64 start = page_offset(page); + u64 end = start + PAGE_CACHE_SIZE - 1; + int ret; + + while (1) { + lock_extent(tree, start, end); + ordered = btrfs_lookup_ordered_extent(inode, start); + if (!ordered) + break; + unlock_extent(tree, start, end); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } + + ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, + bio_flags, rw); + return ret; +} + int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, int mirror_num) { @@ -2894,6 +3031,20 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, return ret; } +int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, int mirror_num) +{ + struct bio *bio = NULL; + unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED; + int ret; + + ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num, + &bio_flags, READ); + if (bio) + ret = submit_one_bio(READ, bio, mirror_num, bio_flags); + return ret; +} + static noinline void update_nr_written(struct page *page, struct writeback_control *wbc, unsigned long nr_written) @@ -2957,7 +3108,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, pg_offset = i_size & (PAGE_CACHE_SIZE - 1); if (page->index > end_index || (page->index == end_index && !pg_offset)) { - page->mapping->a_ops->invalidatepage(page, 0); + page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); unlock_page(page); return 0; } @@ -3150,8 +3301,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (!PageWriteback(page)) { printk(KERN_ERR "btrfs warning page %lu not " "writeback, cur %llu end %llu\n", - page->index, (unsigned long long)cur, - (unsigned long long)end); + page->index, cur, end); } ret = submit_extent_page(write_flags, tree, page, @@ -3730,7 +3880,7 @@ int extent_readpages(struct extent_io_tree *tree, unsigned long bio_flags = 0; struct page *pagepool[16]; struct page *page; - int i = 0; + struct extent_map *em_cached = NULL; int nr = 0; for (page_idx = 0; page_idx < nr_pages; page_idx++) { @@ -3747,18 +3897,16 @@ int extent_readpages(struct extent_io_tree *tree, pagepool[nr++] = page; if (nr < ARRAY_SIZE(pagepool)) continue; - for (i = 0; i < nr; i++) { - __extent_read_full_page(tree, pagepool[i], get_extent, - &bio, 0, &bio_flags, READ); - page_cache_release(pagepool[i]); - } + __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, + &bio, 0, &bio_flags, READ); nr = 0; } - for (i = 0; i < nr; i++) { - __extent_read_full_page(tree, pagepool[i], get_extent, - &bio, 0, &bio_flags, READ); - page_cache_release(pagepool[i]); - } + if (nr) + __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, + &bio, 0, &bio_flags, READ); + + if (em_cached) + free_extent_map(em_cached); BUG_ON(!list_empty(pages)); if (bio) @@ -4009,7 +4157,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } while (!end) { - u64 offset_in_extent; + u64 offset_in_extent = 0; /* break if the extent we found is outside the range */ if (em->start >= max || extent_map_end(em) < off) @@ -4025,9 +4173,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* * record the offset from the start of the extent - * for adjusting the disk offset below + * for adjusting the disk offset below. Only do this if the + * extent isn't compressed since our in ram offset may be past + * what we have actually allocated on disk. */ - offset_in_extent = em_start - em->start; + if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + offset_in_extent = em_start - em->start; em_end = extent_map_end(em); em_len = em_end - em_start; emflags = em->flags; @@ -4094,6 +4245,76 @@ static void __free_extent_buffer(struct extent_buffer *eb) kmem_cache_free(extent_buffer_cache, eb); } +static int extent_buffer_under_io(struct extent_buffer *eb) +{ + return (atomic_read(&eb->io_pages) || + test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || + test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); +} + +/* + * Helper for releasing extent buffer page. + */ +static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, + unsigned long start_idx) +{ + unsigned long index; + unsigned long num_pages; + struct page *page; + int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); + + BUG_ON(extent_buffer_under_io(eb)); + + num_pages = num_extent_pages(eb->start, eb->len); + index = start_idx + num_pages; + if (start_idx >= index) + return; + + do { + index--; + page = extent_buffer_page(eb, index); + if (page && mapped) { + spin_lock(&page->mapping->private_lock); + /* + * We do this since we'll remove the pages after we've + * removed the eb from the radix tree, so we could race + * and have this page now attached to the new eb. So + * only clear page_private if it's still connected to + * this eb. + */ + if (PagePrivate(page) && + page->private == (unsigned long)eb) { + BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(PageDirty(page)); + BUG_ON(PageWriteback(page)); + /* + * We need to make sure we haven't be attached + * to a new eb. + */ + ClearPagePrivate(page); + set_page_private(page, 0); + /* One for the page private */ + page_cache_release(page); + } + spin_unlock(&page->mapping->private_lock); + + } + if (page) { + /* One for when we alloced the page */ + page_cache_release(page); + } + } while (index != start_idx); +} + +/* + * Helper for releasing the extent buffer. + */ +static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) +{ + btrfs_release_extent_buffer_page(eb, 0); + __free_extent_buffer(eb); +} + static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len, @@ -4142,13 +4363,16 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) struct extent_buffer *new; unsigned long num_pages = num_extent_pages(src->start, src->len); - new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC); + new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS); if (new == NULL) return NULL; for (i = 0; i < num_pages; i++) { - p = alloc_page(GFP_ATOMIC); - BUG_ON(!p); + p = alloc_page(GFP_NOFS); + if (!p) { + btrfs_release_extent_buffer(new); + return NULL; + } attach_extent_buffer_page(new, p); WARN_ON(PageDirty(p)); SetPageUptodate(p); @@ -4168,12 +4392,12 @@ struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) unsigned long num_pages = num_extent_pages(0, len); unsigned long i; - eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC); + eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS); if (!eb) return NULL; for (i = 0; i < num_pages; i++) { - eb->pages[i] = alloc_page(GFP_ATOMIC); + eb->pages[i] = alloc_page(GFP_NOFS); if (!eb->pages[i]) goto err; } @@ -4189,76 +4413,6 @@ err: return NULL; } -static int extent_buffer_under_io(struct extent_buffer *eb) -{ - return (atomic_read(&eb->io_pages) || - test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || - test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); -} - -/* - * Helper for releasing extent buffer page. - */ -static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, - unsigned long start_idx) -{ - unsigned long index; - unsigned long num_pages; - struct page *page; - int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); - - BUG_ON(extent_buffer_under_io(eb)); - - num_pages = num_extent_pages(eb->start, eb->len); - index = start_idx + num_pages; - if (start_idx >= index) - return; - - do { - index--; - page = extent_buffer_page(eb, index); - if (page && mapped) { - spin_lock(&page->mapping->private_lock); - /* - * We do this since we'll remove the pages after we've - * removed the eb from the radix tree, so we could race - * and have this page now attached to the new eb. So - * only clear page_private if it's still connected to - * this eb. - */ - if (PagePrivate(page) && - page->private == (unsigned long)eb) { - BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); - BUG_ON(PageDirty(page)); - BUG_ON(PageWriteback(page)); - /* - * We need to make sure we haven't be attached - * to a new eb. - */ - ClearPagePrivate(page); - set_page_private(page, 0); - /* One for the page private */ - page_cache_release(page); - } - spin_unlock(&page->mapping->private_lock); - - } - if (page) { - /* One for when we alloced the page */ - page_cache_release(page); - } - } while (index != start_idx); -} - -/* - * Helper for releasing the extent buffer. - */ -static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) -{ - btrfs_release_extent_buffer_page(eb, 0); - __free_extent_buffer(eb); -} - static void check_buffer_tree_ref(struct extent_buffer *eb) { int refs; @@ -4729,7 +4883,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { page = extent_buffer_page(eb, i); @@ -4771,8 +4925,8 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, if (start + min_len > eb->len) { WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, " - "wanted %lu %lu\n", (unsigned long long)eb->start, - eb->len, start, min_len); + "wanted %lu %lu\n", + eb->start, eb->len, start, min_len); return -EINVAL; } @@ -4799,7 +4953,7 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { page = extent_buffer_page(eb, i); @@ -4833,7 +4987,7 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { page = extent_buffer_page(eb, i); @@ -4863,7 +5017,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { page = extent_buffer_page(eb, i); @@ -4894,7 +5048,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, WARN_ON(src->len != dst_len); offset = (start_offset + dst_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + (PAGE_CACHE_SIZE - 1); while (len > 0) { page = extent_buffer_page(dst, i); @@ -4980,9 +5134,9 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, while (len > 0) { dst_off_in_page = (start_offset + dst_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + (PAGE_CACHE_SIZE - 1); src_off_in_page = (start_offset + src_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + (PAGE_CACHE_SIZE - 1); dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; @@ -5033,9 +5187,9 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; dst_off_in_page = (start_offset + dst_end) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + (PAGE_CACHE_SIZE - 1); src_off_in_page = (start_offset + src_end) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + (PAGE_CACHE_SIZE - 1); cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 41fb81e7ec53..6dbc645f1f3d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -19,6 +19,7 @@ #define EXTENT_FIRST_DELALLOC (1 << 12) #define EXTENT_NEED_WAIT (1 << 13) #define EXTENT_DAMAGED (1 << 14) +#define EXTENT_NORESERVE (1 << 15) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) @@ -28,6 +29,7 @@ */ #define EXTENT_BIO_COMPRESSED 1 #define EXTENT_BIO_TREE_LOG 2 +#define EXTENT_BIO_PARENT_LOCKED 4 #define EXTENT_BIO_FLAG_SHIFT 16 /* these are bit numbers for test/set bit */ @@ -43,14 +45,11 @@ #define EXTENT_BUFFER_DUMMY 9 /* these are flags for extent_clear_unlock_delalloc */ -#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 -#define EXTENT_CLEAR_UNLOCK 0x2 -#define EXTENT_CLEAR_DELALLOC 0x4 -#define EXTENT_CLEAR_DIRTY 0x8 -#define EXTENT_SET_WRITEBACK 0x10 -#define EXTENT_END_WRITEBACK 0x20 -#define EXTENT_SET_PRIVATE2 0x40 -#define EXTENT_CLEAR_ACCOUNTING 0x80 +#define PAGE_UNLOCK (1 << 0) +#define PAGE_CLEAR_DIRTY (1 << 1) +#define PAGE_SET_WRITEBACK (1 << 2) +#define PAGE_END_WRITEBACK (1 << 3) +#define PAGE_SET_PRIVATE2 (1 << 4) /* * page->private values. Every page that is controlled by the extent @@ -61,6 +60,7 @@ struct extent_state; struct btrfs_root; +struct btrfs_io_bio; typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, struct bio *bio, int mirror_num, @@ -76,8 +76,9 @@ struct extent_io_ops { size_t size, struct bio *bio, unsigned long bio_flags); int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); - int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, - struct extent_state *state, int mirror); + int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int mirror); int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); void (*set_bit_hook)(struct inode *inode, struct extent_state *state, @@ -199,6 +200,8 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, int mirror_num); +int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, int mirror_num); int __init extent_io_init(void); void extent_io_exit(void); @@ -260,11 +263,6 @@ int extent_readpages(struct extent_io_tree *tree, get_extent_t get_extent); int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent); -int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); -void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[], - int count); -void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio, - int bvec_index, u32 csums[], int count); int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); @@ -329,10 +327,10 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long *map_len); int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); -int extent_clear_unlock_delalloc(struct inode *inode, - struct extent_io_tree *tree, - u64 start, u64 end, struct page *locked_page, - unsigned long op); +int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, + struct page *locked_page, + unsigned long bits_to_clear, + unsigned long page_ops); struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b193bf324a41..4f53159bdb9d 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -23,6 +23,7 @@ #include "ctree.h" #include "disk-io.h" #include "transaction.h" +#include "volumes.h" #include "print-tree.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ @@ -34,8 +35,7 @@ #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ - sizeof(struct btrfs_sector_sum) * \ - (r)->sectorsize - (r)->sectorsize) + sizeof(u32) * (r)->sectorsize) int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -153,28 +153,54 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, return ret; } +static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err) +{ + kfree(bio->csum_allocated); +} + static int __btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { - u32 sum[16]; - int len; struct bio_vec *bvec = bio->bi_io_vec; - int bio_index = 0; + struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); + struct btrfs_csum_item *item = NULL; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_path *path; + u8 *csum; u64 offset = 0; u64 item_start_offset = 0; u64 item_last_offset = 0; u64 disk_bytenr; u32 diff; - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + int nblocks; + int bio_index = 0; int count; - struct btrfs_path *path; - struct btrfs_csum_item *item = NULL; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); if (!path) return -ENOMEM; + + nblocks = bio->bi_size >> inode->i_sb->s_blocksize_bits; + if (!dst) { + if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { + btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size, + GFP_NOFS); + if (!btrfs_bio->csum_allocated) { + btrfs_free_path(path); + return -ENOMEM; + } + btrfs_bio->csum = btrfs_bio->csum_allocated; + btrfs_bio->end_io = btrfs_io_bio_endio_readpage; + } else { + btrfs_bio->csum = btrfs_bio->csum_inline; + } + csum = btrfs_bio->csum; + } else { + csum = (u8 *)dst; + } + if (bio->bi_size > PAGE_CACHE_SIZE * 8) path->reada = 2; @@ -195,11 +221,10 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, if (dio) offset = logical_offset; while (bio_index < bio->bi_vcnt) { - len = min_t(int, ARRAY_SIZE(sum), bio->bi_vcnt - bio_index); if (!dio) offset = page_offset(bvec->bv_page) + bvec->bv_offset; - count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, sum, - len); + count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, + (u32 *)csum, nblocks); if (count) goto found; @@ -214,7 +239,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, path, disk_bytenr, 0); if (IS_ERR(item)) { count = 1; - sum[0] = 0; + memset(csum, 0, csum_size); if (BTRFS_I(inode)->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { set_extent_bits(io_tree, offset, @@ -223,9 +248,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, } else { printk(KERN_INFO "btrfs no csum found " "for inode %llu start %llu\n", - (unsigned long long) - btrfs_ino(inode), - (unsigned long long)offset); + btrfs_ino(inode), offset); } item = NULL; btrfs_release_path(path); @@ -250,23 +273,14 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, diff = disk_bytenr - item_start_offset; diff = diff / root->sectorsize; diff = diff * csum_size; - count = min_t(int, len, (item_last_offset - disk_bytenr) >> - inode->i_sb->s_blocksize_bits); - read_extent_buffer(path->nodes[0], sum, + count = min_t(int, nblocks, (item_last_offset - disk_bytenr) >> + inode->i_sb->s_blocksize_bits); + read_extent_buffer(path->nodes[0], csum, ((unsigned long)item) + diff, csum_size * count); found: - if (dst) { - memcpy(dst, sum, count * csum_size); - dst += count; - } else { - if (dio) - extent_cache_csums_dio(io_tree, offset, sum, - count); - else - extent_cache_csums(io_tree, bio, bio_index, sum, - count); - } + csum += count * csum_size; + nblocks -= count; while (count--) { disk_bytenr += bvec->bv_len; offset += bvec->bv_len; @@ -285,9 +299,19 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, } int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u64 offset) + struct btrfs_dio_private *dip, struct bio *bio, + u64 offset) { - return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1); + int len = (bio->bi_sector << 9) - dip->disk_bytenr; + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + int ret; + + len >>= inode->i_sb->s_blocksize_bits; + len *= csum_size; + + ret = __btrfs_lookup_bio_sums(root, inode, bio, offset, + (u32 *)(dip->csum + len), 1); + return ret; } int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, @@ -297,7 +321,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_csum_item *item; LIST_HEAD(tmplist); unsigned long offset; @@ -368,34 +391,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_csum_item); while (start < csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(root)); + MAX_ORDERED_SUM_BYTES(root)); sums = kzalloc(btrfs_ordered_sum_size(root, size), - GFP_NOFS); + GFP_NOFS); if (!sums) { ret = -ENOMEM; goto fail; } - sector_sum = sums->sums; sums->bytenr = start; - sums->len = size; + sums->len = (int)size; offset = (start - key.offset) >> root->fs_info->sb->s_blocksize_bits; offset *= csum_size; + size >>= root->fs_info->sb->s_blocksize_bits; - while (size > 0) { - read_extent_buffer(path->nodes[0], - §or_sum->sum, - ((unsigned long)item) + - offset, csum_size); - sector_sum->bytenr = start; - - size -= root->sectorsize; - start += root->sectorsize; - offset += csum_size; - sector_sum++; - } + read_extent_buffer(path->nodes[0], + sums->sums, + ((unsigned long)item) + offset, + csum_size * size); + + start += root->sectorsize * size; list_add_tail(&sums->list, &tmplist); } path->slots[0]++; @@ -417,23 +434,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_ordered_extent *ordered; char *data; struct bio_vec *bvec = bio->bi_io_vec; int bio_index = 0; + int index; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; u64 offset; - u64 disk_bytenr; WARN_ON(bio->bi_vcnt <= 0); sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); if (!sums) return -ENOMEM; - sector_sum = sums->sums; - disk_bytenr = (u64)bio->bi_sector << 9; sums->len = bio->bi_size; INIT_LIST_HEAD(&sums->list); @@ -444,7 +458,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, ordered = btrfs_lookup_ordered_extent(inode, offset); BUG_ON(!ordered); /* Logic error */ - sums->bytenr = ordered->start; + sums->bytenr = (u64)bio->bi_sector << 9; + index = 0; while (bio_index < bio->bi_vcnt) { if (!contig) @@ -463,28 +478,27 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), GFP_NOFS); BUG_ON(!sums); /* -ENOMEM */ - sector_sum = sums->sums; sums->len = bytes_left; ordered = btrfs_lookup_ordered_extent(inode, offset); BUG_ON(!ordered); /* Logic error */ - sums->bytenr = ordered->start; + sums->bytenr = ((u64)bio->bi_sector << 9) + + total_bytes; + index = 0; } data = kmap_atomic(bvec->bv_page); - sector_sum->sum = ~(u32)0; - sector_sum->sum = btrfs_csum_data(data + bvec->bv_offset, - sector_sum->sum, - bvec->bv_len); + sums->sums[index] = ~(u32)0; + sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset, + sums->sums[index], + bvec->bv_len); kunmap_atomic(data); - btrfs_csum_final(sector_sum->sum, - (char *)§or_sum->sum); - sector_sum->bytenr = disk_bytenr; + btrfs_csum_final(sums->sums[index], + (char *)(sums->sums + index)); - sector_sum++; bio_index++; + index++; total_bytes += bvec->bv_len; this_sum_bytes += bvec->bv_len; - disk_bytenr += bvec->bv_len; offset += bvec->bv_len; bvec++; } @@ -672,62 +686,46 @@ out: return ret; } -static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums, - struct btrfs_sector_sum *sector_sum, - u64 total_bytes, u64 sectorsize) -{ - u64 tmp = sectorsize; - u64 next_sector = sector_sum->bytenr; - struct btrfs_sector_sum *next = sector_sum + 1; - - while ((tmp + total_bytes) < sums->len) { - if (next_sector + sectorsize != next->bytenr) - break; - tmp += sectorsize; - next_sector = next->bytenr; - next++; - } - return tmp; -} - int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums) { - u64 bytenr; - int ret; struct btrfs_key file_key; struct btrfs_key found_key; - u64 next_offset; - u64 total_bytes = 0; - int found_next; struct btrfs_path *path; struct btrfs_csum_item *item; struct btrfs_csum_item *item_end; struct extent_buffer *leaf = NULL; + u64 next_offset; + u64 total_bytes = 0; u64 csum_offset; - struct btrfs_sector_sum *sector_sum; + u64 bytenr; u32 nritems; u32 ins_size; + int index = 0; + int found_next; + int ret; u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); if (!path) return -ENOMEM; - - sector_sum = sums->sums; again: next_offset = (u64)-1; found_next = 0; + bytenr = sums->bytenr + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = sector_sum->bytenr; - bytenr = sector_sum->bytenr; + file_key.offset = bytenr; btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); - item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1); + item = btrfs_lookup_csum(trans, root, path, bytenr, 1); if (!IS_ERR(item)) { - leaf = path->nodes[0]; ret = 0; + leaf = path->nodes[0]; + item_end = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_csum_item); + item_end = (struct btrfs_csum_item *)((char *)item_end + + btrfs_item_size_nr(leaf, path->slots[0])); goto found; } ret = PTR_ERR(item); @@ -807,8 +805,7 @@ again: free_space = btrfs_leaf_free_space(root, leaf) - sizeof(struct btrfs_item) - csum_size; - tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, - root->sectorsize); + tmp = sums->len - total_bytes; tmp >>= root->fs_info->sb->s_blocksize_bits; WARN_ON(tmp < 1); @@ -822,6 +819,7 @@ again: diff *= csum_size; btrfs_extend_item(root, path, diff); + ret = 0; goto csum; } @@ -831,8 +829,7 @@ insert: if (found_next) { u64 tmp; - tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, - root->sectorsize); + tmp = sums->len - total_bytes; tmp >>= root->fs_info->sb->s_blocksize_bits; tmp = min(tmp, (next_offset - file_key.offset) >> root->fs_info->sb->s_blocksize_bits); @@ -853,31 +850,25 @@ insert: WARN_ON(1); goto fail_unlock; } -csum: leaf = path->nodes[0]; +csum: item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); - ret = 0; + item_end = (struct btrfs_csum_item *)((unsigned char *)item + + btrfs_item_size_nr(leaf, path->slots[0])); item = (struct btrfs_csum_item *)((unsigned char *)item + csum_offset * csum_size); found: - item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); - item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + - btrfs_item_size_nr(leaf, path->slots[0])); -next_sector: - - write_extent_buffer(leaf, §or_sum->sum, (unsigned long)item, csum_size); - - total_bytes += root->sectorsize; - sector_sum++; - if (total_bytes < sums->len) { - item = (struct btrfs_csum_item *)((char *)item + - csum_size); - if (item < item_end && bytenr + PAGE_CACHE_SIZE == - sector_sum->bytenr) { - bytenr = sector_sum->bytenr; - goto next_sector; - } - } + ins_size = (u32)(sums->len - total_bytes) >> + root->fs_info->sb->s_blocksize_bits; + ins_size *= csum_size; + ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item, + ins_size); + write_extent_buffer(leaf, sums->sums + index, (unsigned long)item, + ins_size); + + ins_size /= csum_size; + total_bytes += ins_size * root->sectorsize; + index += ins_size; btrfs_mark_buffer_dirty(path->nodes[0]); if (total_bytes < sums->len) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4205ba752d40..72da4df53c9a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -309,10 +309,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, ret = PTR_ERR(inode_root); goto cleanup; } - if (btrfs_root_refs(&inode_root->root_item) == 0) { - ret = -ENOENT; - goto cleanup; - } key.objectid = defrag->ino; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); @@ -600,20 +596,29 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, if (no_splits) goto next; - if (em->block_start < EXTENT_MAP_LAST_BYTE && - em->start < start) { + if (em->start < start) { split->start = em->start; split->len = start - em->start; - split->orig_start = em->orig_start; - split->block_start = em->block_start; - if (compressed) - split->block_len = em->block_len; - else - split->block_len = split->len; - split->ram_bytes = em->ram_bytes; - split->orig_block_len = max(split->block_len, - em->orig_block_len); + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_start = em->orig_start; + split->block_start = em->block_start; + + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + split->orig_block_len = max(split->block_len, + em->orig_block_len); + split->ram_bytes = em->ram_bytes; + } else { + split->orig_start = split->start; + split->block_len = 0; + split->block_start = em->block_start; + split->orig_block_len = 0; + split->ram_bytes = split->len; + } + split->generation = gen; split->bdev = em->bdev; split->flags = flags; @@ -624,8 +629,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split = split2; split2 = NULL; } - if (em->block_start < EXTENT_MAP_LAST_BYTE && - testend && em->start + em->len > start + len) { + if (testend && em->start + em->len > start + len) { u64 diff = start + len - em->start; split->start = start + len; @@ -634,18 +638,28 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->flags = flags; split->compress_type = em->compress_type; split->generation = gen; - split->orig_block_len = max(em->block_len, + + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_block_len = max(em->block_len, em->orig_block_len); - split->ram_bytes = em->ram_bytes; - if (compressed) { - split->block_len = em->block_len; - split->block_start = em->block_start; - split->orig_start = em->orig_start; + split->ram_bytes = em->ram_bytes; + if (compressed) { + split->block_len = em->block_len; + split->block_start = em->block_start; + split->orig_start = em->orig_start; + } else { + split->block_len = split->len; + split->block_start = em->block_start + + diff; + split->orig_start = em->orig_start; + } } else { - split->block_len = split->len; - split->block_start = em->block_start + diff; - split->orig_start = em->orig_start; + split->ram_bytes = split->len; + split->orig_start = split->start; + split->block_len = 0; + split->block_start = em->block_start; + split->orig_block_len = 0; } ret = add_extent_mapping(em_tree, split, modified); @@ -1317,6 +1331,47 @@ fail: } +static noinline int check_can_nocow(struct inode *inode, loff_t pos, + size_t *write_bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ordered_extent *ordered; + u64 lockstart, lockend; + u64 num_bytes; + int ret; + + lockstart = round_down(pos, root->sectorsize); + lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1; + + while (1) { + lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + ordered = btrfs_lookup_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (!ordered) { + break; + } + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } + + num_bytes = lockend - lockstart + 1; + ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); + if (ret <= 0) { + ret = 0; + } else { + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, + NULL, GFP_NOFS); + *write_bytes = min_t(size_t, *write_bytes, num_bytes); + } + + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + + return ret; +} + static noinline ssize_t __btrfs_buffered_write(struct file *file, struct iov_iter *i, loff_t pos) @@ -1324,10 +1379,12 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; + u64 release_bytes = 0; unsigned long first_index; size_t num_written = 0; int nrptrs; int ret = 0; + bool only_release_metadata = false; bool force_page_uptodate = false; nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / @@ -1348,6 +1405,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, offset); size_t num_pages = (write_bytes + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + size_t reserve_bytes; size_t dirty_pages; size_t copied; @@ -1362,11 +1420,41 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, break; } - ret = btrfs_delalloc_reserve_space(inode, - num_pages << PAGE_CACHE_SHIFT); + reserve_bytes = num_pages << PAGE_CACHE_SHIFT; + ret = btrfs_check_data_free_space(inode, reserve_bytes); + if (ret == -ENOSPC && + (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC))) { + ret = check_can_nocow(inode, pos, &write_bytes); + if (ret > 0) { + only_release_metadata = true; + /* + * our prealloc extent may be smaller than + * write_bytes, so scale down. + */ + num_pages = (write_bytes + offset + + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + reserve_bytes = num_pages << PAGE_CACHE_SHIFT; + ret = 0; + } else { + ret = -ENOSPC; + } + } + if (ret) break; + ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); + if (ret) { + if (!only_release_metadata) + btrfs_free_reserved_data_space(inode, + reserve_bytes); + break; + } + + release_bytes = reserve_bytes; + /* * This is going to setup the pages array with the number of * pages we want, so we don't really need to worry about the @@ -1375,11 +1463,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, ret = prepare_pages(root, file, pages, num_pages, pos, first_index, write_bytes, force_page_uptodate); - if (ret) { - btrfs_delalloc_release_space(inode, - num_pages << PAGE_CACHE_SHIFT); + if (ret) break; - } copied = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, i); @@ -1409,30 +1494,46 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, * managed to copy. */ if (num_pages > dirty_pages) { + release_bytes = (num_pages - dirty_pages) << + PAGE_CACHE_SHIFT; if (copied > 0) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); } - btrfs_delalloc_release_space(inode, - (num_pages - dirty_pages) << - PAGE_CACHE_SHIFT); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, + release_bytes); + else + btrfs_delalloc_release_space(inode, + release_bytes); } + release_bytes = dirty_pages << PAGE_CACHE_SHIFT; if (copied > 0) { ret = btrfs_dirty_pages(root, inode, pages, dirty_pages, pos, copied, NULL); if (ret) { - btrfs_delalloc_release_space(inode, - dirty_pages << PAGE_CACHE_SHIFT); btrfs_drop_pages(pages, num_pages); break; } } + release_bytes = 0; btrfs_drop_pages(pages, num_pages); + if (only_release_metadata && copied > 0) { + u64 lockstart = round_down(pos, root->sectorsize); + u64 lockend = lockstart + + (dirty_pages << PAGE_CACHE_SHIFT) - 1; + + set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, EXTENT_NORESERVE, NULL, + NULL, GFP_NOFS); + only_release_metadata = false; + } + cond_resched(); balance_dirty_pages_ratelimited(inode->i_mapping); @@ -1445,6 +1546,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, kfree(pages); + if (release_bytes) { + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, release_bytes); + else + btrfs_delalloc_release_space(inode, release_bytes); + } + return num_written ? num_written : ret; } @@ -1610,7 +1718,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, */ BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; BTRFS_I(inode)->last_sub_trans = root->log_transid; - if (num_written > 0 || num_written == -EIOCBQUEUED) { + if (num_written > 0) { err = generic_write_sync(file, pos, num_written); if (err < 0 && num_written > 0) num_written = err; @@ -1751,8 +1859,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = btrfs_log_dentry_safe(trans, root, dentry); if (ret < 0) { - mutex_unlock(&inode->i_mutex); - goto out; + /* Fallthrough and commit/free transaction. */ + ret = 1; } /* we've logged all the items and now have a consistent @@ -2175,12 +2283,6 @@ static long btrfs_fallocate(struct file *file, int mode, goto out_reserve_fail; } - /* - * wait for ordered IO before we have any locks. We'll loop again - * below with the locks held. - */ - btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); - mutex_lock(&inode->i_mutex); ret = inode_newsize_ok(inode, alloc_end); if (ret) @@ -2191,8 +2293,23 @@ static long btrfs_fallocate(struct file *file, int mode, alloc_start); if (ret) goto out; + } else { + /* + * If we are fallocating from the end of the file onward we + * need to zero out the end of the page if i_size lands in the + * middle of a page. + */ + ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); + if (ret) + goto out; } + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. + */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; @@ -2425,20 +2542,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) } } - if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) { - offset = -EINVAL; - goto out; - } - if (offset > inode->i_sb->s_maxbytes) { - offset = -EINVAL; - goto out; - } - - /* Special lock needed here? */ - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: mutex_unlock(&inode->i_mutex); return offset; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index e53009657f0e..b4f9904c4c6b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -213,7 +213,7 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, else ret = 0; spin_unlock(&rsv->lock); - return 0; + return ret; } int btrfs_truncate_free_space_cache(struct btrfs_root *root, @@ -221,12 +221,10 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_path *path, struct inode *inode) { - loff_t oldsize; int ret = 0; - oldsize = i_size_read(inode); btrfs_i_size_write(inode, 0); - truncate_pagecache(inode, oldsize, 0); + truncate_pagecache(inode, 0); /* * We don't need an orphan item because truncating the free space cache @@ -308,7 +306,7 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl) static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) { - BUG_ON(io_ctl->index >= io_ctl->num_pages); + ASSERT(io_ctl->index < io_ctl->num_pages); io_ctl->page = io_ctl->pages[io_ctl->index++]; io_ctl->cur = kmap(io_ctl->page); io_ctl->orig = io_ctl->cur; @@ -673,8 +671,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, btrfs_err(root->fs_info, "free space inode generation (%llu) " "did not match free space cache generation (%llu)", - (unsigned long long)BTRFS_I(inode)->generation, - (unsigned long long)generation); + BTRFS_I(inode)->generation, generation); return 0; } @@ -729,7 +726,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, goto free_cache; } } else { - BUG_ON(!num_bitmaps); + ASSERT(num_bitmaps); num_bitmaps--; e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); if (!e->bitmap) { @@ -1029,7 +1026,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, leaf = path->nodes[0]; if (ret > 0) { struct btrfs_key found_key; - BUG_ON(!path->slots[0]); + ASSERT(path->slots[0]); path->slots[0]--; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || @@ -1117,7 +1114,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, u64 offset) { - BUG_ON(offset < bitmap_start); + ASSERT(offset >= bitmap_start); offset -= bitmap_start; return (unsigned long)(div_u64(offset, unit)); } @@ -1272,7 +1269,7 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, if (n) { entry = rb_entry(n, struct btrfs_free_space, offset_index); - BUG_ON(entry->offset > offset); + ASSERT(entry->offset <= offset); } else { if (fuzzy) return entry; @@ -1336,7 +1333,7 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, { int ret = 0; - BUG_ON(!info->bitmap && !info->bytes); + ASSERT(info->bytes || info->bitmap); ret = tree_insert_offset(&ctl->free_space_offset, info->offset, &info->offset_index, (info->bitmap != NULL)); if (ret) @@ -1359,7 +1356,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) max_bitmaps = max(max_bitmaps, 1); - BUG_ON(ctl->total_bitmaps > max_bitmaps); + ASSERT(ctl->total_bitmaps <= max_bitmaps); /* * The goal is to keep the total amount of memory used per 1gb of space @@ -1403,7 +1400,7 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, start = offset_to_bit(info->offset, ctl->unit, offset); count = bytes_to_bits(bytes, ctl->unit); - BUG_ON(start + count > BITS_PER_BITMAP); + ASSERT(start + count <= BITS_PER_BITMAP); bitmap_clear(info->bitmap, start, count); @@ -1426,7 +1423,7 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, start = offset_to_bit(info->offset, ctl->unit, offset); count = bytes_to_bits(bytes, ctl->unit); - BUG_ON(start + count > BITS_PER_BITMAP); + ASSERT(start + count <= BITS_PER_BITMAP); bitmap_set(info->bitmap, start, count); @@ -1434,13 +1431,19 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, ctl->free_space += bytes; } +/* + * If we can not find suitable extent, we will use bytes to record + * the size of the max extent. + */ static int search_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info, u64 *offset, u64 *bytes) { unsigned long found_bits = 0; + unsigned long max_bits = 0; unsigned long bits, i; unsigned long next_zero; + unsigned long extent_bits; i = offset_to_bit(bitmap_info->offset, ctl->unit, max_t(u64, *offset, bitmap_info->offset)); @@ -1449,9 +1452,12 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { next_zero = find_next_zero_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); - if ((next_zero - i) >= bits) { - found_bits = next_zero - i; + extent_bits = next_zero - i; + if (extent_bits >= bits) { + found_bits = extent_bits; break; + } else if (extent_bits > max_bits) { + max_bits = extent_bits; } i = next_zero; } @@ -1462,38 +1468,41 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, return 0; } + *bytes = (u64)(max_bits) * ctl->unit; return -1; } +/* Cache the size of the max extent in bytes */ static struct btrfs_free_space * find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, - unsigned long align) + unsigned long align, u64 *max_extent_size) { struct btrfs_free_space *entry; struct rb_node *node; - u64 ctl_off; u64 tmp; u64 align_off; int ret; if (!ctl->free_space_offset.rb_node) - return NULL; + goto out; entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); if (!entry) - return NULL; + goto out; for (node = &entry->offset_index; node; node = rb_next(node)) { entry = rb_entry(node, struct btrfs_free_space, offset_index); - if (entry->bytes < *bytes) + if (entry->bytes < *bytes) { + if (entry->bytes > *max_extent_size) + *max_extent_size = entry->bytes; continue; + } /* make sure the space returned is big enough * to match our requested alignment */ if (*bytes >= align) { - ctl_off = entry->offset - ctl->start; - tmp = ctl_off + align - 1;; + tmp = entry->offset - ctl->start + align - 1; do_div(tmp, align); tmp = tmp * align + ctl->start; align_off = tmp - entry->offset; @@ -1502,14 +1511,22 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, tmp = entry->offset; } - if (entry->bytes < *bytes + align_off) + if (entry->bytes < *bytes + align_off) { + if (entry->bytes > *max_extent_size) + *max_extent_size = entry->bytes; continue; + } if (entry->bitmap) { - ret = search_bitmap(ctl, entry, &tmp, bytes); + u64 size = *bytes; + + ret = search_bitmap(ctl, entry, &tmp, &size); if (!ret) { *offset = tmp; + *bytes = size; return entry; + } else if (size > *max_extent_size) { + *max_extent_size = size; } continue; } @@ -1518,7 +1535,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, *bytes = entry->bytes - align_off; return entry; } - +out: return NULL; } @@ -1742,7 +1759,7 @@ no_cluster_bitmap: bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!bitmap_info) { - BUG_ON(added); + ASSERT(added == 0); goto new_bitmap; } @@ -1882,7 +1899,7 @@ out: if (ret) { printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret); - BUG_ON(ret == -EEXIST); + ASSERT(ret != -EEXIST); } return ret; @@ -1991,8 +2008,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, if (info->bytes >= bytes && !block_group->ro) count++; printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", - (unsigned long long)info->offset, - (unsigned long long)info->bytes, + info->offset, info->bytes, (info->bitmap) ? "yes" : "no"); } printk(KERN_INFO "block group has cluster?: %s\n", @@ -2120,7 +2136,8 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) } u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes, u64 empty_size) + u64 offset, u64 bytes, u64 empty_size, + u64 *max_extent_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry = NULL; @@ -2131,7 +2148,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, spin_lock(&ctl->tree_lock); entry = find_free_space(ctl, &offset, &bytes_search, - block_group->full_stripe_len); + block_group->full_stripe_len, max_extent_size); if (!entry) goto out; @@ -2141,7 +2158,6 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, if (!entry->bytes) free_bitmap(ctl, entry); } else { - unlink_free_space(ctl, entry); align_gap_len = offset - entry->offset; align_gap = entry->offset; @@ -2155,7 +2171,6 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, else link_free_space(ctl, entry); } - out: spin_unlock(&ctl->tree_lock); @@ -2210,7 +2225,8 @@ int btrfs_return_cluster_to_free_space( static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, struct btrfs_free_space *entry, - u64 bytes, u64 min_start) + u64 bytes, u64 min_start, + u64 *max_extent_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int err; @@ -2222,8 +2238,11 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, search_bytes = bytes; err = search_bitmap(ctl, entry, &search_start, &search_bytes); - if (err) + if (err) { + if (search_bytes > *max_extent_size) + *max_extent_size = search_bytes; return 0; + } ret = search_start; __bitmap_clear_bits(ctl, entry, ret, bytes); @@ -2238,7 +2257,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, */ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 bytes, - u64 min_start) + u64 min_start, u64 *max_extent_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry = NULL; @@ -2258,6 +2277,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, entry = rb_entry(node, struct btrfs_free_space, offset_index); while(1) { + if (entry->bytes < bytes && entry->bytes > *max_extent_size) + *max_extent_size = entry->bytes; + if (entry->bytes < bytes || (!entry->bitmap && entry->offset < min_start)) { node = rb_next(&entry->offset_index); @@ -2271,7 +2293,8 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, if (entry->bitmap) { ret = btrfs_alloc_from_bitmap(block_group, cluster, entry, bytes, - cluster->window_start); + cluster->window_start, + max_extent_size); if (ret == 0) { node = rb_next(&entry->offset_index); if (!node) @@ -2371,7 +2394,7 @@ again: rb_erase(&entry->offset_index, &ctl->free_space_offset); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 1); - BUG_ON(ret); /* -EEXIST; Logic error */ + ASSERT(!ret); /* -EEXIST; Logic error */ trace_btrfs_setup_cluster(block_group, cluster, total_found * ctl->unit, 1); @@ -2464,7 +2487,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); total_size += entry->bytes; - BUG_ON(ret); /* -EEXIST; Logic error */ + ASSERT(!ret); /* -EEXIST; Logic error */ } while (node && entry != last); cluster->max_size = max_extent; @@ -2525,8 +2548,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, * returns zero and sets up cluster if things worked out, otherwise * it returns -enospc */ -int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_find_space_cluster(struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size) @@ -2856,7 +2878,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) ret = search_bitmap(ctl, entry, &offset, &count); /* Logic error; Should be empty if it can't find anything */ - BUG_ON(ret); + ASSERT(!ret); ino = offset; bitmap_clear_bits(ctl, entry, offset, 1); @@ -2973,33 +2995,68 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -static struct btrfs_block_group_cache *init_test_block_group(void) +/* + * Use this if you need to make a bitmap or extent entry specifically, it + * doesn't do any of the merging that add_free_space does, this acts a lot like + * how the free space cache loading stuff works, so you can get really weird + * configurations. + */ +int test_add_free_space_entry(struct btrfs_block_group_cache *cache, + u64 offset, u64 bytes, bool bitmap) { - struct btrfs_block_group_cache *cache; + struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; + struct btrfs_free_space *info = NULL, *bitmap_info; + void *map = NULL; + u64 bytes_added; + int ret; - cache = kzalloc(sizeof(*cache), GFP_NOFS); - if (!cache) - return NULL; - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - return NULL; +again: + if (!info) { + info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); + if (!info) + return -ENOMEM; } - cache->key.objectid = 0; - cache->key.offset = 1024 * 1024 * 1024; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = 4096; + if (!bitmap) { + spin_lock(&ctl->tree_lock); + info->offset = offset; + info->bytes = bytes; + ret = link_free_space(ctl, info); + spin_unlock(&ctl->tree_lock); + if (ret) + kmem_cache_free(btrfs_free_space_cachep, info); + return ret; + } - spin_lock_init(&cache->lock); - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->new_bg_list); + if (!map) { + map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!map) { + kmem_cache_free(btrfs_free_space_cachep, info); + return -ENOMEM; + } + } - btrfs_init_free_space_ctl(cache); + spin_lock(&ctl->tree_lock); + bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), + 1, 0); + if (!bitmap_info) { + info->bitmap = map; + map = NULL; + add_new_bitmap(ctl, info, offset); + bitmap_info = info; + } - return cache; + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes -= bytes_added; + offset += bytes_added; + spin_unlock(&ctl->tree_lock); + + if (bytes) + goto again; + + if (map) + kfree(map); + return 0; } /* @@ -3007,8 +3064,8 @@ static struct btrfs_block_group_cache *init_test_block_group(void) * just used to check the absence of space, so if there is free space in the * range at all we will return 1. */ -static int check_exists(struct btrfs_block_group_cache *cache, u64 offset, - u64 bytes) +int test_check_exists(struct btrfs_block_group_cache *cache, + u64 offset, u64 bytes) { struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; struct btrfs_free_space *info; @@ -3085,408 +3142,4 @@ out: spin_unlock(&ctl->tree_lock); return ret; } - -/* - * Use this if you need to make a bitmap or extent entry specifically, it - * doesn't do any of the merging that add_free_space does, this acts a lot like - * how the free space cache loading stuff works, so you can get really weird - * configurations. - */ -static int add_free_space_entry(struct btrfs_block_group_cache *cache, - u64 offset, u64 bytes, bool bitmap) -{ - struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; - struct btrfs_free_space *info = NULL, *bitmap_info; - void *map = NULL; - u64 bytes_added; - int ret; - -again: - if (!info) { - info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); - if (!info) - return -ENOMEM; - } - - if (!bitmap) { - spin_lock(&ctl->tree_lock); - info->offset = offset; - info->bytes = bytes; - ret = link_free_space(ctl, info); - spin_unlock(&ctl->tree_lock); - if (ret) - kmem_cache_free(btrfs_free_space_cachep, info); - return ret; - } - - if (!map) { - map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); - if (!map) { - kmem_cache_free(btrfs_free_space_cachep, info); - return -ENOMEM; - } - } - - spin_lock(&ctl->tree_lock); - bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), - 1, 0); - if (!bitmap_info) { - info->bitmap = map; - map = NULL; - add_new_bitmap(ctl, info, offset); - bitmap_info = info; - } - - bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); - bytes -= bytes_added; - offset += bytes_added; - spin_unlock(&ctl->tree_lock); - - if (bytes) - goto again; - - if (map) - kfree(map); - return 0; -} - -/* - * This test just does basic sanity checking, making sure we can add an exten - * entry and remove space from either end and the middle, and make sure we can - * remove space that covers adjacent extent entries. - */ -static int test_extents(struct btrfs_block_group_cache *cache) -{ - int ret = 0; - - printk(KERN_ERR "Running extent only tests\n"); - - /* First just make sure we can remove an entire entry */ - ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); - if (ret) { - printk(KERN_ERR "Error adding initial extents %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); - if (ret) { - printk(KERN_ERR "Error removing extent %d\n", ret); - return ret; - } - - if (check_exists(cache, 0, 4 * 1024 * 1024)) { - printk(KERN_ERR "Full remove left some lingering space\n"); - return -1; - } - - /* Ok edge and middle cases now */ - ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); - if (ret) { - printk(KERN_ERR "Error adding half extent %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024); - if (ret) { - printk(KERN_ERR "Error removing tail end %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); - if (ret) { - printk(KERN_ERR "Error removing front end %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); - if (ret) { - printk(KERN_ERR "Error removing middle peice %d\n", ret); - return ret; - } - - if (check_exists(cache, 0, 1 * 1024 * 1024)) { - printk(KERN_ERR "Still have space at the front\n"); - return -1; - } - - if (check_exists(cache, 2 * 1024 * 1024, 4096)) { - printk(KERN_ERR "Still have space in the middle\n"); - return -1; - } - - if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) { - printk(KERN_ERR "Still have space at the end\n"); - return -1; - } - - /* Cleanup */ - __btrfs_remove_free_space_cache(cache->free_space_ctl); - - return 0; -} - -static int test_bitmaps(struct btrfs_block_group_cache *cache) -{ - u64 next_bitmap_offset; - int ret; - - printk(KERN_ERR "Running bitmap only tests\n"); - - ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); - if (ret) { - printk(KERN_ERR "Couldn't create a bitmap entry %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); - if (ret) { - printk(KERN_ERR "Error removing bitmap full range %d\n", ret); - return ret; - } - - if (check_exists(cache, 0, 4 * 1024 * 1024)) { - printk(KERN_ERR "Left some space in bitmap\n"); - return -1; - } - - ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); - if (ret) { - printk(KERN_ERR "Couldn't add to our bitmap entry %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024); - if (ret) { - printk(KERN_ERR "Couldn't remove middle chunk %d\n", ret); - return ret; - } - - /* - * The first bitmap we have starts at offset 0 so the next one is just - * at the end of the first bitmap. - */ - next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); - - /* Test a bit straddling two bitmaps */ - ret = add_free_space_entry(cache, next_bitmap_offset - - (2 * 1024 * 1024), 4 * 1024 * 1024, 1); - if (ret) { - printk(KERN_ERR "Couldn't add space that straddles two bitmaps" - " %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, next_bitmap_offset - - (1 * 1024 * 1024), 2 * 1024 * 1024); - if (ret) { - printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret); - return ret; - } - - if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024), - 2 * 1024 * 1024)) { - printk(KERN_ERR "Left some space when removing overlapping\n"); - return -1; - } - - __btrfs_remove_free_space_cache(cache->free_space_ctl); - - return 0; -} - -/* This is the high grade jackassery */ -static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) -{ - u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); - int ret; - - printk(KERN_ERR "Running bitmap and extent tests\n"); - - /* - * First let's do something simple, an extent at the same offset as the - * bitmap, but the free space completely in the extent and then - * completely in the bitmap. - */ - ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1); - if (ret) { - printk(KERN_ERR "Couldn't create bitmap entry %d\n", ret); - return ret; - } - - ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); - if (ret) { - printk(KERN_ERR "Couldn't add extent entry %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); - if (ret) { - printk(KERN_ERR "Couldn't remove extent entry %d\n", ret); - return ret; - } - - if (check_exists(cache, 0, 1 * 1024 * 1024)) { - printk(KERN_ERR "Left remnants after our remove\n"); - return -1; - } - - /* Now to add back the extent entry and remove from the bitmap */ - ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); - if (ret) { - printk(KERN_ERR "Couldn't re-add extent entry %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024); - if (ret) { - printk(KERN_ERR "Couldn't remove from bitmap %d\n", ret); - return ret; - } - - if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) { - printk(KERN_ERR "Left remnants in the bitmap\n"); - return -1; - } - - /* - * Ok so a little more evil, extent entry and bitmap at the same offset, - * removing an overlapping chunk. - */ - ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1); - if (ret) { - printk(KERN_ERR "Couldn't add to a bitmap %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024); - if (ret) { - printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret); - return ret; - } - - if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { - printk(KERN_ERR "Left over peices after removing " - "overlapping\n"); - return -1; - } - - __btrfs_remove_free_space_cache(cache->free_space_ctl); - - /* Now with the extent entry offset into the bitmap */ - ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1); - if (ret) { - printk(KERN_ERR "Couldn't add space to the bitmap %d\n", ret); - return ret; - } - - ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0); - if (ret) { - printk(KERN_ERR "Couldn't add extent to the cache %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024); - if (ret) { - printk(KERN_ERR "Problem removing overlapping space %d\n", ret); - return ret; - } - - if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) { - printk(KERN_ERR "Left something behind when removing space"); - return -1; - } - - /* - * This has blown up in the past, the extent entry starts before the - * bitmap entry, but we're trying to remove an offset that falls - * completely within the bitmap range and is in both the extent entry - * and the bitmap entry, looks like this - * - * [ extent ] - * [ bitmap ] - * [ del ] - */ - __btrfs_remove_free_space_cache(cache->free_space_ctl); - ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024, - 4 * 1024 * 1024, 1); - if (ret) { - printk(KERN_ERR "Couldn't add bitmap %d\n", ret); - return ret; - } - - ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024, - 5 * 1024 * 1024, 0); - if (ret) { - printk(KERN_ERR "Couldn't add extent entry %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024, - 5 * 1024 * 1024); - if (ret) { - printk(KERN_ERR "Failed to free our space %d\n", ret); - return ret; - } - - if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024, - 5 * 1024 * 1024)) { - printk(KERN_ERR "Left stuff over\n"); - return -1; - } - - __btrfs_remove_free_space_cache(cache->free_space_ctl); - - /* - * This blew up before, we have part of the free space in a bitmap and - * then the entirety of the rest of the space in an extent. This used - * to return -EAGAIN back from btrfs_remove_extent, make sure this - * doesn't happen. - */ - ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1); - if (ret) { - printk(KERN_ERR "Couldn't add bitmap entry %d\n", ret); - return ret; - } - - ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0); - if (ret) { - printk(KERN_ERR "Couldn't add extent entry %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024); - if (ret) { - printk(KERN_ERR "Error removing bitmap and extent " - "overlapping %d\n", ret); - return ret; - } - - __btrfs_remove_free_space_cache(cache->free_space_ctl); - return 0; -} - -void btrfs_test_free_space_cache(void) -{ - struct btrfs_block_group_cache *cache; - - printk(KERN_ERR "Running btrfs free space cache tests\n"); - - cache = init_test_block_group(); - if (!cache) { - printk(KERN_ERR "Couldn't run the tests\n"); - return; - } - - if (test_extents(cache)) - goto out; - if (test_bitmaps(cache)) - goto out; - if (test_bitmaps_and_extents(cache)) - goto out; -out: - __btrfs_remove_free_space_cache(cache->free_space_ctl); - kfree(cache->free_space_ctl); - kfree(cache); - printk(KERN_ERR "Free space cache tests finished\n"); -} #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 8b7f19f44961..e737f92cf6d0 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -94,27 +94,31 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl); void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group); u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes, u64 empty_size); + u64 offset, u64 bytes, u64 empty_size, + u64 *max_extent_size); u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root); void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, u64 bytes); -int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_find_space_cluster(struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size); void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster); u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 bytes, - u64 min_start); + u64 min_start, u64 *max_extent_size); int btrfs_return_cluster_to_free_space( struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster); int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen); +/* Support functions for runnint our sanity tests */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_free_space_cache(void); +int test_add_free_space_entry(struct btrfs_block_group_cache *cache, + u64 offset, u64 bytes, bool bitmap); +int test_check_exists(struct btrfs_block_group_cache *cache, + u64 offset, u64 bytes); #endif #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index af978f7682b3..22ebc13b6c99 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -42,6 +42,7 @@ #include <linux/mount.h> #include <linux/btrfs.h> #include <linux/blkdev.h> +#include <linux/posix_acl_xattr.h> #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -57,6 +58,7 @@ #include "free-space-cache.h" #include "inode-map.h" #include "backref.h" +#include "hash.h" struct btrfs_iget_args { u64 ino; @@ -228,12 +230,13 @@ fail: * does the checks required to make sure the data is small enough * to fit as an inline extent. */ -static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode, u64 start, u64 end, - size_t compressed_size, int compress_type, - struct page **compressed_pages) +static noinline int cow_file_range_inline(struct btrfs_root *root, + struct inode *inode, u64 start, + u64 end, size_t compressed_size, + int compress_type, + struct page **compressed_pages) { + struct btrfs_trans_handle *trans; u64 isize = i_size_read(inode); u64 actual_end = min(end + 1, isize); u64 inline_len = actual_end - start; @@ -254,9 +257,16 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, return 1; } + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1); - if (ret) - return ret; + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } if (isize > actual_end) inline_len = min_t(u64, isize, actual_end); @@ -265,15 +275,18 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, compress_type, compressed_pages); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); - return ret; + goto out; } else if (ret == -ENOSPC) { - return 1; + ret = 1; + goto out; } set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); - return 0; +out: + btrfs_end_transaction(trans, root); + return ret; } struct async_extent { @@ -341,7 +354,6 @@ static noinline int compress_file_range(struct inode *inode, int *num_added) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; u64 num_bytes; u64 blocksize = root->sectorsize; u64 actual_end; @@ -459,45 +471,36 @@ again: } cont: if (start == 0) { - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto cleanup_and_out; - } - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - /* lets try to make an inline extent */ if (ret || total_in < (actual_end - start)) { /* we didn't compress the entire range, try * to make an uncompressed inline extent. */ - ret = cow_file_range_inline(trans, root, inode, - start, end, 0, 0, NULL); + ret = cow_file_range_inline(root, inode, start, end, + 0, 0, NULL); } else { /* try making a compressed inline extent */ - ret = cow_file_range_inline(trans, root, inode, - start, end, + ret = cow_file_range_inline(root, inode, start, end, total_compressed, compress_type, pages); } if (ret <= 0) { + unsigned long clear_flags = EXTENT_DELALLOC | + EXTENT_DEFRAG; + clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; + /* * inline extent creation worked or returned error, * we don't need to create any more async work items. * Unlock and free up our temp pages. */ - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, NULL, - EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | - EXTENT_CLEAR_DELALLOC | - EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); - - btrfs_end_transaction(trans, root); + extent_clear_unlock_delalloc(inode, start, end, NULL, + clear_flags, PAGE_UNLOCK | + PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK); goto free_pages_out; } - btrfs_end_transaction(trans, root); } if (will_compress) { @@ -588,20 +591,6 @@ free_pages_out: kfree(pages); goto out; - -cleanup_and_out: - extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - start, end, NULL, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_DIRTY | - EXTENT_CLEAR_DELALLOC | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); - if (!trans || IS_ERR(trans)) - btrfs_error(root->fs_info, ret, "Failed to join transaction"); - else - btrfs_abort_transaction(trans, root, ret); - goto free_pages_out; } /* @@ -615,7 +604,6 @@ static noinline int submit_compressed_extents(struct inode *inode, { struct async_extent *async_extent; u64 alloc_hint = 0; - struct btrfs_trans_handle *trans; struct btrfs_key ins; struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -676,20 +664,10 @@ retry: lock_extent(io_tree, async_extent->start, async_extent->start + async_extent->ram_size - 1); - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - } else { - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_reserve_extent(trans, root, + ret = btrfs_reserve_extent(root, async_extent->compressed_size, async_extent->compressed_size, 0, alloc_hint, &ins, 1); - if (ret && ret != -ENOSPC) - btrfs_abort_transaction(trans, root, ret); - btrfs_end_transaction(trans, root); - } - if (ret) { int i; @@ -701,8 +679,12 @@ retry: async_extent->nr_pages = 0; async_extent->pages = NULL; - if (ret == -ENOSPC) + if (ret == -ENOSPC) { + unlock_extent(io_tree, async_extent->start, + async_extent->start + + async_extent->ram_size - 1); goto retry; + } goto out_free; } @@ -764,16 +746,12 @@ retry: /* * clear dirty, set writeback and unlock the pages. */ - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - async_extent->start, + extent_clear_unlock_delalloc(inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, - NULL, EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); - + NULL, EXTENT_LOCKED | EXTENT_DELALLOC, + PAGE_UNLOCK | PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK); ret = btrfs_submit_compressed_write(inode, async_extent->start, async_extent->ram_size, @@ -792,16 +770,13 @@ out: out_free_reserve: btrfs_free_reserved_extent(root, ins.objectid, ins.offset); out_free: - extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - async_extent->start, + extent_clear_unlock_delalloc(inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, - NULL, EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); + NULL, EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, + PAGE_UNLOCK | PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); kfree(async_extent); goto again; } @@ -851,14 +826,13 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start, * required to start IO on it. It may be clean and already done with * IO when we return. */ -static noinline int __cow_file_range(struct btrfs_trans_handle *trans, - struct inode *inode, - struct btrfs_root *root, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - int unlock) +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, + int unlock) { + struct btrfs_root *root = BTRFS_I(inode)->root; u64 alloc_hint = 0; u64 num_bytes; unsigned long ram_size; @@ -879,29 +853,24 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, /* if this is a small write inside eof, kick off defrag */ if (num_bytes < 64 * 1024 && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) - btrfs_add_inode_defrag(trans, inode); + btrfs_add_inode_defrag(NULL, inode); if (start == 0) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(trans, root, inode, - start, end, 0, 0, NULL); + ret = cow_file_range_inline(root, inode, start, end, 0, 0, + NULL); if (ret == 0) { - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, NULL, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); + extent_clear_unlock_delalloc(inode, start, end, NULL, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DEFRAG, PAGE_UNLOCK | + PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK); *nr_written = *nr_written + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; *page_started = 1; goto out; } else if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); goto out_unlock; } } @@ -916,13 +885,11 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, unsigned long op; cur_alloc_size = disk_num_bytes; - ret = btrfs_reserve_extent(trans, root, cur_alloc_size, + ret = btrfs_reserve_extent(root, cur_alloc_size, root->sectorsize, 0, alloc_hint, &ins, 1); - if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + if (ret < 0) goto out_unlock; - } em = alloc_extent_map(); if (!em) { @@ -968,10 +935,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); - if (ret) { - btrfs_abort_transaction(trans, root, ret); + if (ret) goto out_reserve; - } } if (disk_num_bytes < cur_alloc_size) @@ -984,13 +949,13 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, * Do set the Private2 bit so we know this page was properly * setup for writepage */ - op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0; - op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | - EXTENT_SET_PRIVATE2; + op = unlock ? PAGE_UNLOCK : 0; + op |= PAGE_SET_PRIVATE2; - extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - start, start + ram_size - 1, - locked_page, op); + extent_clear_unlock_delalloc(inode, start, + start + ram_size - 1, locked_page, + EXTENT_LOCKED | EXTENT_DELALLOC, + op); disk_num_bytes -= cur_alloc_size; num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; @@ -1002,52 +967,14 @@ out: out_reserve: btrfs_free_reserved_extent(root, ins.objectid, ins.offset); out_unlock: - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, locked_page, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); - + extent_clear_unlock_delalloc(inode, start, end, locked_page, + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | + EXTENT_DELALLOC | EXTENT_DEFRAG, + PAGE_UNLOCK | PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); goto out; } -static noinline int cow_file_range(struct inode *inode, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - int unlock) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret; - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, locked_page, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); - return PTR_ERR(trans); - } - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - - ret = __cow_file_range(trans, inode, root, locked_page, start, end, - page_started, nr_written, unlock); - - btrfs_end_transaction(trans, root); - - return ret; -} - /* * work queue call back to started compression on a file and pages */ @@ -1215,15 +1142,13 @@ static noinline int run_delalloc_nocow(struct inode *inode, path = btrfs_alloc_path(); if (!path) { - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, locked_page, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); + extent_clear_unlock_delalloc(inode, start, end, locked_page, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, PAGE_UNLOCK | + PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK); return -ENOMEM; } @@ -1235,15 +1160,13 @@ static noinline int run_delalloc_nocow(struct inode *inode, trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, locked_page, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); + extent_clear_unlock_delalloc(inode, start, end, locked_page, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, PAGE_UNLOCK | + PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK); btrfs_free_path(path); return PTR_ERR(trans); } @@ -1363,9 +1286,9 @@ out_check: btrfs_release_path(path); if (cow_start != (u64)-1) { - ret = __cow_file_range(trans, inode, root, locked_page, - cow_start, found_key.offset - 1, - page_started, nr_written, 1); + ret = cow_file_range(inode, locked_page, + cow_start, found_key.offset - 1, + page_started, nr_written, 1); if (ret) { btrfs_abort_transaction(trans, root, ret); goto error; @@ -1422,11 +1345,11 @@ out_check: } } - extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - cur_offset, cur_offset + num_bytes - 1, - locked_page, EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | - EXTENT_SET_PRIVATE2); + extent_clear_unlock_delalloc(inode, cur_offset, + cur_offset + num_bytes - 1, + locked_page, EXTENT_LOCKED | + EXTENT_DELALLOC, PAGE_UNLOCK | + PAGE_SET_PRIVATE2); cur_offset = extent_end; if (cur_offset > end) break; @@ -1439,9 +1362,8 @@ out_check: } if (cow_start != (u64)-1) { - ret = __cow_file_range(trans, inode, root, locked_page, - cow_start, end, - page_started, nr_written, 1); + ret = cow_file_range(inode, locked_page, cow_start, end, + page_started, nr_written, 1); if (ret) { btrfs_abort_transaction(trans, root, ret); goto error; @@ -1454,16 +1376,13 @@ error: ret = err; if (ret && cur_offset < end) - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - cur_offset, end, locked_page, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); - + extent_clear_unlock_delalloc(inode, cur_offset, end, + locked_page, EXTENT_LOCKED | + EXTENT_DELALLOC | EXTENT_DEFRAG | + EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | + PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK); btrfs_free_path(path); return ret; } @@ -1529,6 +1448,46 @@ static void btrfs_merge_extent_hook(struct inode *inode, spin_unlock(&BTRFS_I(inode)->lock); } +static void btrfs_add_delalloc_inodes(struct btrfs_root *root, + struct inode *inode) +{ + spin_lock(&root->delalloc_lock); + if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_add_tail(&BTRFS_I(inode)->delalloc_inodes, + &root->delalloc_inodes); + set_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags); + root->nr_delalloc_inodes++; + if (root->nr_delalloc_inodes == 1) { + spin_lock(&root->fs_info->delalloc_root_lock); + BUG_ON(!list_empty(&root->delalloc_root)); + list_add_tail(&root->delalloc_root, + &root->fs_info->delalloc_roots); + spin_unlock(&root->fs_info->delalloc_root_lock); + } + } + spin_unlock(&root->delalloc_lock); +} + +static void btrfs_del_delalloc_inode(struct btrfs_root *root, + struct inode *inode) +{ + spin_lock(&root->delalloc_lock); + if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_del_init(&BTRFS_I(inode)->delalloc_inodes); + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags); + root->nr_delalloc_inodes--; + if (!root->nr_delalloc_inodes) { + spin_lock(&root->fs_info->delalloc_root_lock); + BUG_ON(list_empty(&root->delalloc_root)); + list_del_init(&root->delalloc_root); + spin_unlock(&root->fs_info->delalloc_root_lock); + } + } + spin_unlock(&root->delalloc_lock); +} + /* * extent_io.c set_bit_hook, used to track delayed allocation * bytes in this file, and to maintain the list of inodes that @@ -1561,16 +1520,8 @@ static void btrfs_set_bit_hook(struct inode *inode, spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_add_tail(&BTRFS_I(inode)->delalloc_inodes, - &root->fs_info->delalloc_inodes); - set_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags); - } - spin_unlock(&root->fs_info->delalloc_lock); - } + &BTRFS_I(inode)->runtime_flags)) + btrfs_add_delalloc_inodes(root, inode); spin_unlock(&BTRFS_I(inode)->lock); } } @@ -1604,7 +1555,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, btrfs_delalloc_release_metadata(inode, len); if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID - && do_list) + && do_list && !(state->state & EXTENT_NORESERVE)) btrfs_free_reserved_data_space(inode, len); __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, @@ -1613,15 +1564,8 @@ static void btrfs_clear_bit_hook(struct inode *inode, BTRFS_I(inode)->delalloc_bytes -= len; if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_del_init(&BTRFS_I(inode)->delalloc_inodes); - clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags); - } - spin_unlock(&root->fs_info->delalloc_lock); - } + &BTRFS_I(inode)->runtime_flags)) + btrfs_del_delalloc_inode(root, inode); spin_unlock(&BTRFS_I(inode)->lock); } } @@ -2101,6 +2045,7 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, WARN_ON(1); return ret; } + ret = 0; while (1) { cond_resched(); @@ -2135,16 +2080,21 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) continue; - extent_offset = btrfs_file_extent_offset(leaf, extent); - if (key.offset - extent_offset != offset) + /* + * 'offset' refers to the exact key.offset, + * NOT the 'offset' field in btrfs_extent_data_ref, ie. + * (key.offset - extent_offset). + */ + if (key.offset != offset) continue; + extent_offset = btrfs_file_extent_offset(leaf, extent); num_bytes = btrfs_file_extent_num_bytes(leaf, extent); + if (extent_offset >= old->extent_offset + old->offset + old->len || extent_offset + num_bytes <= old->extent_offset + old->offset) continue; - break; } @@ -2156,7 +2106,7 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, backref->root_id = root_id; backref->inum = inum; - backref->file_pos = offset + extent_offset; + backref->file_pos = offset; backref->num_bytes = num_bytes; backref->extent_offset = extent_offset; backref->generation = btrfs_file_extent_generation(leaf, extent); @@ -2179,7 +2129,8 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path, new->path = path; list_for_each_entry_safe(old, tmp, &new->head, list) { - ret = iterate_inodes_from_logical(old->bytenr, fs_info, + ret = iterate_inodes_from_logical(old->bytenr + + old->extent_offset, fs_info, path, record_one_backref, old); BUG_ON(ret < 0 && ret != -ENOENT); @@ -2199,16 +2150,18 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path, static int relink_is_mergable(struct extent_buffer *leaf, struct btrfs_file_extent_item *fi, - u64 disk_bytenr) + struct new_sa_defrag_extent *new) { - if (btrfs_file_extent_disk_bytenr(leaf, fi) != disk_bytenr) + if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr) return 0; if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) return 0; - if (btrfs_file_extent_compression(leaf, fi) || - btrfs_file_extent_encryption(leaf, fi) || + if (btrfs_file_extent_compression(leaf, fi) != new->compress_type) + return 0; + + if (btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) return 0; @@ -2263,11 +2216,6 @@ static noinline int relink_extent_backref(struct btrfs_path *path, return 0; return PTR_ERR(root); } - if (btrfs_root_refs(&root->root_item) == 0) { - srcu_read_unlock(&fs_info->subvol_srcu, index); - /* parse ENOENT to 0 */ - return 0; - } /* step 2: get inode */ key.objectid = backref->inum; @@ -2357,8 +2305,8 @@ again: struct btrfs_file_extent_item); extent_len = btrfs_file_extent_num_bytes(leaf, fi); - if (relink_is_mergable(leaf, fi, new->bytenr) && - extent_len + found_key.offset == start) { + if (extent_len + found_key.offset == start && + relink_is_mergable(leaf, fi, new)) { btrfs_set_file_extent_num_bytes(leaf, fi, extent_len + len); btrfs_mark_buffer_dirty(leaf); @@ -2614,8 +2562,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) struct extent_state *cached_state = NULL; struct new_sa_defrag_extent *new = NULL; int compress_type = 0; - int ret; + int ret = 0; + u64 logical_len = ordered_extent->len; bool nolock; + bool truncated = false; nolock = btrfs_is_free_space_inode(inode); @@ -2624,6 +2574,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { + truncated = true; + logical_len = ordered_extent->truncated_len; + /* Truncated the entire extent, don't bother adding */ + if (!logical_len) + goto out; + } + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ btrfs_ordered_update_i_size(inode, 0, ordered_extent); @@ -2679,15 +2637,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ret = btrfs_mark_extent_written(trans, inode, ordered_extent->file_offset, ordered_extent->file_offset + - ordered_extent->len); + logical_len); } else { BUG_ON(root == root->fs_info->tree_root); ret = insert_reserved_file_extent(trans, inode, ordered_extent->file_offset, ordered_extent->start, ordered_extent->disk_len, - ordered_extent->len, - ordered_extent->len, + logical_len, logical_len, compress_type, 0, 0, BTRFS_FILE_EXTENT_REG); } @@ -2719,17 +2676,27 @@ out: if (trans) btrfs_end_transaction(trans, root); - if (ret) { - clear_extent_uptodate(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1, NULL, GFP_NOFS); + if (ret || truncated) { + u64 start, end; + + if (truncated) + start = ordered_extent->file_offset + logical_len; + else + start = ordered_extent->file_offset; + end = ordered_extent->file_offset + ordered_extent->len - 1; + clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS); + + /* Drop the cache for the part of the extent we didn't write. */ + btrfs_drop_extent_cache(inode, start, end, 0); /* * If the ordered extent had an IOERR or something else went * wrong we need to return the space for this ordered extent - * back to the allocator. + * back to the allocator. We only free the extent in the + * truncated case if we didn't write out the extent at all. */ - if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && + if ((ret || !logical_len) && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) btrfs_free_reserved_extent(root, ordered_extent->start, ordered_extent->disk_len); @@ -2793,16 +2760,16 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, * if there's a match, we allow the bio to finish. If not, the code in * extent_io.c will try to find good copies for us. */ -static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, - struct extent_state *state, int mirror) +static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, + u64 phy_offset, struct page *page, + u64 start, u64 end, int mirror) { size_t offset = start - page_offset(page); struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; char *kaddr; - u64 private = ~(u32)0; - int ret; struct btrfs_root *root = BTRFS_I(inode)->root; + u32 csum_expected; u32 csum = ~(u32)0; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -2822,19 +2789,13 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, return 0; } - if (state && state->start == start) { - private = state->private; - ret = 0; - } else { - ret = get_state_private(io_tree, start, &private); - } - kaddr = kmap_atomic(page); - if (ret) - goto zeroit; + phy_offset >>= inode->i_sb->s_blocksize_bits; + csum_expected = *(((u32 *)io_bio->csum) + phy_offset); + kaddr = kmap_atomic(page); csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1); btrfs_csum_final(csum, (char *)&csum); - if (csum != private) + if (csum != csum_expected) goto zeroit; kunmap_atomic(kaddr); @@ -2843,14 +2804,12 @@ good: zeroit: if (__ratelimit(&_rs)) - btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u private %llu", - (unsigned long long)btrfs_ino(page->mapping->host), - (unsigned long long)start, csum, - (unsigned long long)private); + btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", + btrfs_ino(page->mapping->host), start, csum, csum_expected); memset(kaddr + offset, 1, end - start + 1); flush_dcache_page(page); kunmap_atomic(kaddr); - if (private == 0) + if (csum_expected == 0) return 0; return -EIO; } @@ -2937,8 +2896,10 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, btrfs_root_refs(&root->root_item) > 0) { ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, root->root_key.objectid); - BUG_ON(ret); - root->orphan_item_inserted = 0; + if (ret) + btrfs_abort_transaction(trans, root, ret); + else + root->orphan_item_inserted = 0; } if (block_rsv) { @@ -3007,11 +2968,18 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) /* insert an orphan item to track this unlinked/truncated file */ if (insert >= 1) { ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); - if (ret && ret != -EEXIST) { - clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &BTRFS_I(inode)->runtime_flags); - btrfs_abort_transaction(trans, root, ret); - return ret; + if (ret) { + if (reserve) { + clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, + &BTRFS_I(inode)->runtime_flags); + btrfs_orphan_release_metadata(inode); + } + if (ret != -EEXIST) { + clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags); + btrfs_abort_transaction(trans, root, ret); + return ret; + } } ret = 0; } @@ -3050,17 +3018,15 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, release_rsv = 1; spin_unlock(&root->orphan_lock); - if (trans && delete_item) { + if (trans && delete_item) ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); - BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ - } if (release_rsv) { btrfs_orphan_release_metadata(inode); atomic_dec(&root->orphan_inodes); } - return 0; + return ret; } /* @@ -3140,7 +3106,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); - ret = PTR_RET(inode); + ret = PTR_ERR_OR_ZERO(inode); if (ret && ret != -ESTALE) goto out; @@ -3190,8 +3156,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.objectid); ret = btrfs_del_orphan_item(trans, root, found_key.objectid); - BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ btrfs_end_transaction(trans, root); + if (ret) + goto out; continue; } @@ -3215,13 +3182,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) /* 1 for the orphan item deletion. */ trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { + iput(inode); ret = PTR_ERR(trans); goto out; } ret = btrfs_orphan_add(trans, inode); btrfs_end_transaction(trans, root); - if (ret) + if (ret) { + iput(inode); goto out; + } ret = btrfs_truncate(inode); if (ret) @@ -3274,8 +3244,17 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, { u32 nritems = btrfs_header_nritems(leaf); struct btrfs_key found_key; + static u64 xattr_access = 0; + static u64 xattr_default = 0; int scanned = 0; + if (!xattr_access) { + xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS, + strlen(POSIX_ACL_XATTR_ACCESS)); + xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT, + strlen(POSIX_ACL_XATTR_DEFAULT)); + } + slot++; while (slot < nritems) { btrfs_item_key_to_cpu(leaf, &found_key, slot); @@ -3285,8 +3264,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, return 0; /* we found an xattr, assume we've got an acl */ - if (found_key.type == BTRFS_XATTR_ITEM_KEY) - return 1; + if (found_key.type == BTRFS_XATTR_ITEM_KEY) { + if (found_key.offset == xattr_access || + found_key.offset == xattr_default) + return 1; + } /* * we found a key greater than an xattr key, there can't @@ -3608,8 +3590,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, if (ret) { btrfs_info(root->fs_info, "failed to delete reference to %.*s, inode %llu parent %llu", - name_len, name, - (unsigned long long)ino, (unsigned long long)dir_ino); + name_len, name, ino, dir_ino); btrfs_abort_transaction(trans, root, ret); goto err; } @@ -3660,53 +3641,20 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, } return ret; } - - -/* helper to check if there is any shared block in the path */ -static int check_path_shared(struct btrfs_root *root, - struct btrfs_path *path) -{ - struct extent_buffer *eb; - int level; - u64 refs = 1; - - for (level = 0; level < BTRFS_MAX_LEVEL; level++) { - int ret; - - if (!path->nodes[level]) - break; - eb = path->nodes[level]; - if (!btrfs_block_can_be_shared(root, eb)) - continue; - ret = btrfs_lookup_extent_info(NULL, root, eb->start, level, 1, - &refs, NULL); - if (refs > 1) - return 1; - } - return 0; -} /* * helper to start transaction for unlink and rmdir. * - * unlink and rmdir are special in btrfs, they do not always free space. - * so in enospc case, we should make sure they will free space before - * allowing them to use the global metadata reservation. + * unlink and rmdir are special in btrfs, they do not always free space, so + * if we cannot make our reservations the normal way try and see if there is + * plenty of slack room in the global reserve to migrate, otherwise we cannot + * allow the unlink to occur. */ -static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, - struct dentry *dentry) +static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; - struct btrfs_path *path; - struct btrfs_dir_item *di; - struct inode *inode = dentry->d_inode; - u64 index; - int check_link = 1; - int err = -ENOSPC; int ret; - u64 ino = btrfs_ino(inode); - u64 dir_ino = btrfs_ino(dir); /* * 1 for the possible orphan item @@ -3719,158 +3667,23 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) return trans; - if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) - return ERR_PTR(-ENOSPC); - - /* check if there is someone else holds reference */ - if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1) - return ERR_PTR(-ENOSPC); - - if (atomic_read(&inode->i_count) > 2) - return ERR_PTR(-ENOSPC); - - if (xchg(&root->fs_info->enospc_unlink, 1)) - return ERR_PTR(-ENOSPC); - - path = btrfs_alloc_path(); - if (!path) { - root->fs_info->enospc_unlink = 0; - return ERR_PTR(-ENOMEM); - } - - /* 1 for the orphan item */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - btrfs_free_path(path); - root->fs_info->enospc_unlink = 0; - return trans; - } - - path->skip_locking = 1; - path->search_commit_root = 1; - - ret = btrfs_lookup_inode(trans, root, path, - &BTRFS_I(dir)->location, 0); - if (ret < 0) { - err = ret; - goto out; - } - if (ret == 0) { - if (check_path_shared(root, path)) - goto out; - } else { - check_link = 0; - } - btrfs_release_path(path); - - ret = btrfs_lookup_inode(trans, root, path, - &BTRFS_I(inode)->location, 0); - if (ret < 0) { - err = ret; - goto out; - } - if (ret == 0) { - if (check_path_shared(root, path)) - goto out; - } else { - check_link = 0; - } - btrfs_release_path(path); + if (PTR_ERR(trans) == -ENOSPC) { + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); - if (ret == 0 && S_ISREG(inode->i_mode)) { - ret = btrfs_lookup_file_extent(trans, root, path, - ino, (u64)-1, 0); - if (ret < 0) { - err = ret; - goto out; + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return trans; + ret = btrfs_cond_migrate_bytes(root->fs_info, + &root->fs_info->trans_block_rsv, + num_bytes, 5); + if (ret) { + btrfs_end_transaction(trans, root); + return ERR_PTR(ret); } - BUG_ON(ret == 0); /* Corruption */ - if (check_path_shared(root, path)) - goto out; - btrfs_release_path(path); - } - - if (!check_link) { - err = 0; - goto out; - } - - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, - dentry->d_name.name, dentry->d_name.len, 0); - if (IS_ERR(di)) { - err = PTR_ERR(di); - goto out; - } - if (di) { - if (check_path_shared(root, path)) - goto out; - } else { - err = 0; - goto out; - } - btrfs_release_path(path); - - ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name, - dentry->d_name.len, ino, dir_ino, 0, - &index); - if (ret) { - err = ret; - goto out; - } - - if (check_path_shared(root, path)) - goto out; - - btrfs_release_path(path); - - /* - * This is a commit root search, if we can lookup inode item and other - * relative items in the commit root, it means the transaction of - * dir/file creation has been committed, and the dir index item that we - * delay to insert has also been inserted into the commit root. So - * we needn't worry about the delayed insertion of the dir index item - * here. - */ - di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index, - dentry->d_name.name, dentry->d_name.len, 0); - if (IS_ERR(di)) { - err = PTR_ERR(di); - goto out; - } - BUG_ON(ret == -ENOENT); - if (check_path_shared(root, path)) - goto out; - - err = 0; -out: - btrfs_free_path(path); - /* Migrate the orphan reservation over */ - if (!err) - err = btrfs_block_rsv_migrate(trans->block_rsv, - &root->fs_info->global_block_rsv, - trans->bytes_reserved); - - if (err) { - btrfs_end_transaction(trans, root); - root->fs_info->enospc_unlink = 0; - return ERR_PTR(err); - } - - trans->block_rsv = &root->fs_info->global_block_rsv; - return trans; -} - -static void __unlink_end_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) { - btrfs_block_rsv_release(root, trans->block_rsv, - trans->bytes_reserved); trans->block_rsv = &root->fs_info->trans_block_rsv; - BUG_ON(!root->fs_info->enospc_unlink); - root->fs_info->enospc_unlink = 0; + trans->bytes_reserved = num_bytes; } - btrfs_end_transaction(trans, root); + return trans; } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) @@ -3880,7 +3693,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; int ret; - trans = __unlink_start_trans(dir, dentry); + trans = __unlink_start_trans(dir); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -3898,7 +3711,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) } out: - __unlink_end_trans(trans, root); + btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); return ret; } @@ -3995,7 +3808,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) return -EPERM; - trans = __unlink_start_trans(dir, dentry); + trans = __unlink_start_trans(dir); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -4017,7 +3830,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (!err) btrfs_i_size_write(inode, 0); out: - __unlink_end_trans(trans, root); + btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); return err; @@ -4048,6 +3861,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 extent_num_bytes = 0; u64 extent_offset = 0; u64 item_end = 0; + u64 last_size = (u64)-1; u32 found_type = (u8)-1; int found_extent; int del_item; @@ -4145,6 +3959,11 @@ search_again: if (found_type != BTRFS_EXTENT_DATA_KEY) goto delete; + if (del_item) + last_size = found_key.offset; + else + last_size = new_size; + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); @@ -4256,6 +4075,8 @@ out: btrfs_abort_transaction(trans, root, ret); } error: + if (last_size != (u64)-1) + btrfs_ordered_update_i_size(inode, last_size, NULL); btrfs_free_path(path); return err; } @@ -4395,6 +4216,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) u64 hole_size; int err = 0; + /* + * If our size started in the middle of a page we need to zero out the + * rest of the page before we expand the i_size, otherwise we could + * expose stale data. + */ + err = btrfs_truncate_page(inode, oldsize, 0, 0); + if (err) + return err; + if (size <= hole_start) return 0; @@ -4509,9 +4339,6 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) int mask = attr->ia_valid; int ret; - if (newsize == oldsize) - return 0; - /* * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a * special case where we need to update the times despite not having @@ -4522,7 +4349,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); if (newsize > oldsize) { - truncate_pagecache(inode, oldsize, newsize); + truncate_pagecache(inode, newsize); ret = btrfs_cont_expand(inode, oldsize, newsize); if (ret) return ret; @@ -4578,8 +4405,26 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) btrfs_inode_resume_unlocked_dio(inode); ret = btrfs_truncate(inode); - if (ret && inode->i_nlink) - btrfs_orphan_del(NULL, inode); + if (ret && inode->i_nlink) { + int err; + + /* + * failed to truncate, disk_i_size is only adjusted down + * as we remove extents, so it should represent the true + * size of the inode, so reset the in memory size and + * delete our orphan entry. + */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + btrfs_orphan_del(NULL, inode); + return ret; + } + i_size_write(inode, BTRFS_I(inode)->disk_i_size); + err = btrfs_orphan_del(trans, inode); + if (err) + btrfs_abort_transaction(trans, root, err); + btrfs_end_transaction(trans, root); + } } return ret; @@ -4714,10 +4559,15 @@ void btrfs_evict_inode(struct inode *inode) btrfs_free_block_rsv(root, rsv); + /* + * Errors here aren't a big deal, it just means we leave orphan items + * in the tree. They will be cleaned up on the next mount. + */ if (ret == 0) { trans->block_rsv = root->orphan_block_rsv; - ret = btrfs_orphan_del(trans, inode); - BUG_ON(ret); + btrfs_orphan_del(trans, inode); + } else { + btrfs_orphan_del(NULL, inode); } trans->block_rsv = &root->fs_info->trans_block_rsv; @@ -4822,11 +4672,6 @@ static int fixup_tree_root_location(struct btrfs_root *root, goto out; } - if (btrfs_root_refs(&new_root->root_item) == 0) { - err = -ENOENT; - goto out; - } - *sub_root = new_root; location->objectid = btrfs_root_dirid(&new_root->root_item); location->type = BTRFS_INODE_ITEM_KEY; @@ -4843,11 +4688,11 @@ static void inode_tree_add(struct inode *inode) struct btrfs_inode *entry; struct rb_node **p; struct rb_node *parent; + struct rb_node *new = &BTRFS_I(inode)->rb_node; u64 ino = btrfs_ino(inode); if (inode_unhashed(inode)) return; -again: parent = NULL; spin_lock(&root->inode_lock); p = &root->inode_tree.rb_node; @@ -4862,14 +4707,14 @@ again: else { WARN_ON(!(entry->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); - rb_erase(parent, &root->inode_tree); + rb_replace_node(parent, new, &root->inode_tree); RB_CLEAR_NODE(parent); spin_unlock(&root->inode_lock); - goto again; + return; } } - rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); - rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); + rb_link_node(new, parent, p); + rb_insert_color(new, &root->inode_tree); spin_unlock(&root->inode_lock); } @@ -5092,8 +4937,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (!(inode->i_sb->s_flags & MS_RDONLY)) ret = btrfs_orphan_cleanup(sub_root); up_read(&root->fs_info->cleanup_work_sem); - if (ret) + if (ret) { + iput(inode); inode = ERR_PTR(ret); + } } return inode; @@ -5137,10 +4984,9 @@ unsigned char btrfs_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; -static int btrfs_real_readdir(struct file *filp, void *dirent, - filldir_t filldir) +static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_item *item; struct btrfs_dir_item *di; @@ -5161,29 +5007,15 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, char tmp_name[32]; char *name_ptr; int name_len; - int is_curr = 0; /* filp->f_pos points to the current index? */ + int is_curr = 0; /* ctx->pos points to the current index? */ /* FIXME, use a real flag for deciding about the key type */ if (root->fs_info->tree_root == root) key_type = BTRFS_DIR_ITEM_KEY; - /* special case for "." */ - if (filp->f_pos == 0) { - over = filldir(dirent, ".", 1, - filp->f_pos, btrfs_ino(inode), DT_DIR); - if (over) - return 0; - filp->f_pos = 1; - } - /* special case for .., just use the back ref */ - if (filp->f_pos == 1) { - u64 pino = parent_ino(filp->f_path.dentry); - over = filldir(dirent, "..", 2, - filp->f_pos, pino, DT_DIR); - if (over) - return 0; - filp->f_pos = 2; - } + if (!dir_emit_dots(file, ctx)) + return 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -5197,7 +5029,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, } btrfs_set_key_type(&key, key_type); - key.offset = filp->f_pos; + key.offset = ctx->pos; key.objectid = btrfs_ino(inode); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -5223,14 +5055,14 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, break; if (btrfs_key_type(&found_key) != key_type) break; - if (found_key.offset < filp->f_pos) + if (found_key.offset < ctx->pos) goto next; if (key_type == BTRFS_DIR_INDEX_KEY && btrfs_should_delete_dir_index(&del_list, found_key.offset)) goto next; - filp->f_pos = found_key.offset; + ctx->pos = found_key.offset; is_curr = 1; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); @@ -5274,9 +5106,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, over = 0; goto skip; } - over = filldir(dirent, name_ptr, name_len, - found_key.offset, location.objectid, - d_type); + over = !dir_emit(ctx, name_ptr, name_len, + location.objectid, d_type); skip: if (name_ptr != tmp_name) @@ -5295,22 +5126,38 @@ next: if (key_type == BTRFS_DIR_INDEX_KEY) { if (is_curr) - filp->f_pos++; - ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir, - &ins_list); + ctx->pos++; + ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); if (ret) goto nopos; } /* Reached end of directory/root. Bump pos past the last item. */ - if (key_type == BTRFS_DIR_INDEX_KEY) - /* - * 32-bit glibc will use getdents64, but then strtol - - * so the last number we can serve is this. - */ - filp->f_pos = 0x7fffffff; - else - filp->f_pos++; + ctx->pos++; + + /* + * Stop new entries from being returned after we return the last + * entry. + * + * New directory entries are assigned a strictly increasing + * offset. This means that new entries created during readdir + * are *guaranteed* to be seen in the future by that readdir. + * This has broken buggy programs which operate on names as + * they're returned by readdir. Until we re-use freed offsets + * we have this hack to stop new entries from being returned + * under the assumption that they'll never reach this huge + * offset. + * + * This is being careful not to overflow 32bit loff_t unless the + * last entry requires it because doing so has broken 32bit apps + * in the past. + */ + if (key_type == BTRFS_DIR_INDEX_KEY) { + if (ctx->pos >= INT_MAX) + ctx->pos = LLONG_MAX; + else + ctx->pos = INT_MAX; + } nopos: ret = 0; err: @@ -6277,10 +6124,7 @@ insert: btrfs_release_path(path); if (em->start > start || extent_map_end(em) <= start) { btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", - (unsigned long long)em->start, - (unsigned long long)em->len, - (unsigned long long)start, - (unsigned long long)len); + em->start, em->len, start, len); err = -EIO; goto out; } @@ -6478,39 +6322,32 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, u64 start, u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; struct extent_map *em; struct btrfs_key ins; u64 alloc_hint; int ret; - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return ERR_CAST(trans); - - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - alloc_hint = get_extent_allocation_hint(inode, start, len); - ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, + ret = btrfs_reserve_extent(root, len, root->sectorsize, 0, alloc_hint, &ins, 1); - if (ret) { - em = ERR_PTR(ret); - goto out; - } + if (ret) + return ERR_PTR(ret); em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, ins.offset, ins.offset, ins.offset, 0); - if (IS_ERR(em)) - goto out; + if (IS_ERR(em)) { + btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + return em; + } ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, ins.offset, ins.offset, 0); if (ret) { btrfs_free_reserved_extent(root, ins.objectid, ins.offset); - em = ERR_PTR(ret); + free_extent_map(em); + return ERR_PTR(ret); } -out: - btrfs_end_transaction(trans, root); + return em; } @@ -6518,11 +6355,11 @@ out: * returns 1 when the nocow is safe, < 1 on error, 0 if the * block must be cow'd */ -static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, - struct inode *inode, u64 offset, u64 *len, - u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes) +noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes) { + struct btrfs_trans_handle *trans; struct btrfs_path *path; int ret; struct extent_buffer *leaf; @@ -6535,12 +6372,12 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, u64 num_bytes; int slot; int found_type; - + bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); path = btrfs_alloc_path(); if (!path) return -ENOMEM; - ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), offset, 0); if (ret < 0) goto out; @@ -6575,18 +6412,28 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, /* not a regular extent, must cow */ goto out; } + + if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) + goto out; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (disk_bytenr == 0) + goto out; + + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + goto out; + backref_offset = btrfs_file_extent_offset(leaf, fi); - *orig_start = key.offset - backref_offset; - *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); - *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + if (orig_start) { + *orig_start = key.offset - backref_offset; + *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); + *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + } extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - if (extent_end < offset + *len) { - /* extent doesn't include our full range, must cow */ - goto out; - } if (btrfs_extent_readonly(root, disk_bytenr)) goto out; @@ -6595,9 +6442,19 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, * look for other files referencing this extent, if we * find any we must cow */ - if (btrfs_cross_ref_exist(trans, root, btrfs_ino(inode), - key.offset - backref_offset, disk_bytenr)) + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = 0; goto out; + } + + ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode), + key.offset - backref_offset, disk_bytenr); + btrfs_end_transaction(trans, root); + if (ret) { + ret = 0; + goto out; + } /* * adjust disk_bytenr and num_bytes to cover just the bytes @@ -6739,7 +6596,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; u64 len = bh_result->b_size; - struct btrfs_trans_handle *trans; int unlock_bits = EXTENT_LOCKED; int ret = 0; @@ -6821,17 +6677,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, len = min(len, em->len - (start - em->start)); block_start = em->block_start + (start - em->start); - /* - * we're not going to log anything, but we do need - * to make sure the current transaction stays open - * while we look for nocow cross refs - */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - goto must_cow; - - if (can_nocow_odirect(trans, inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes) == 1) { + if (can_nocow_extent(inode, start, &len, &orig_start, + &orig_block_len, &ram_bytes) == 1) { if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); em = create_pinned_em(inode, start, len, @@ -6839,24 +6686,20 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, block_start, len, orig_block_len, ram_bytes, type); - if (IS_ERR(em)) { - btrfs_end_transaction(trans, root); + if (IS_ERR(em)) goto unlock_err; - } } ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len, len, type); - btrfs_end_transaction(trans, root); if (ret) { free_extent_map(em); goto unlock_err; } goto unlock; } - btrfs_end_transaction(trans, root); } -must_cow: + /* * this will cow the extent, reset the len in case we changed * it above @@ -6919,26 +6762,6 @@ unlock_err: return ret; } -struct btrfs_dio_private { - struct inode *inode; - u64 logical_offset; - u64 disk_bytenr; - u64 bytes; - void *private; - - /* number of bios pending for this dio */ - atomic_t pending_bios; - - /* IO errors */ - int errors; - - /* orig_bio is our btrfs_io_bio */ - struct bio *orig_bio; - - /* dio_bio came from fs/direct-io.c */ - struct bio *dio_bio; -}; - static void btrfs_endio_direct_read(struct bio *bio, int err) { struct btrfs_dio_private *dip = bio->bi_private; @@ -6947,6 +6770,8 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct bio *dio_bio; + u32 *csums = (u32 *)dip->csum; + int index = 0; u64 start; start = dip->logical_offset; @@ -6955,12 +6780,8 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) struct page *page = bvec->bv_page; char *kaddr; u32 csum = ~(u32)0; - u64 private = ~(u32)0; unsigned long flags; - if (get_state_private(&BTRFS_I(inode)->io_tree, - start, &private)) - goto failed; local_irq_save(flags); kaddr = kmap_atomic(page); csum = btrfs_csum_data(kaddr + bvec->bv_offset, @@ -6970,18 +6791,17 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) local_irq_restore(flags); flush_dcache_page(bvec->bv_page); - if (csum != private) { -failed: - btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u private %u", - (unsigned long long)btrfs_ino(inode), - (unsigned long long)start, - csum, (unsigned)private); + if (csum != csums[index]) { + btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", + btrfs_ino(inode), start, csum, + csums[index]); err = -EIO; } } start += bvec->bv_len; bvec++; + index++; } while (bvec <= bvec_end); unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, @@ -7062,7 +6882,7 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) if (err) { printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu " "sector %#Lx len %u err no %d\n", - (unsigned long long)btrfs_ino(dip->inode), bio->bi_rw, + btrfs_ino(dip->inode), bio->bi_rw, (unsigned long long)bio->bi_sector, bio->bi_size, err); dip->errors = 1; @@ -7098,6 +6918,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, int rw, u64 file_offset, int skip_sum, int async_submit) { + struct btrfs_dio_private *dip = bio->bi_private; int write = rw & REQ_WRITE; struct btrfs_root *root = BTRFS_I(inode)->root; int ret; @@ -7132,7 +6953,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, if (ret) goto err; } else if (!skip_sum) { - ret = btrfs_lookup_bio_sums_dio(root, inode, bio, file_offset); + ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio, + file_offset); if (ret) goto err; } @@ -7167,6 +6989,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio_put(orig_bio); return -EIO; } + if (map_length >= orig_bio->bi_size) { bio = orig_bio; goto submit; @@ -7260,38 +7083,39 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_dio_private *dip; - struct bio_vec *bvec = dio_bio->bi_io_vec; struct bio *io_bio; int skip_sum; + int sum_len; int write = rw & REQ_WRITE; int ret = 0; + u16 csum_size; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS); - if (!io_bio) { ret = -ENOMEM; goto free_ordered; } - dip = kmalloc(sizeof(*dip), GFP_NOFS); + if (!skip_sum && !write) { + csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits; + sum_len *= csum_size; + } else { + sum_len = 0; + } + + dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS); if (!dip) { ret = -ENOMEM; goto free_io_bio; } dip->private = dio_bio->bi_private; - io_bio->bi_private = dio_bio->bi_private; dip->inode = inode; dip->logical_offset = file_offset; - - dip->bytes = 0; - do { - dip->bytes += bvec->bv_len; - bvec++; - } while (bvec <= (dio_bio->bi_io_vec + dio_bio->bi_vcnt - 1)); - + dip->bytes = dio_bio->bi_size; dip->disk_bytenr = (u64)dio_bio->bi_sector << 9; io_bio->bi_private = dip; dip->errors = 0; @@ -7390,8 +7214,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, atomic_inc(&inode->i_dio_count); smp_mb__after_atomic_inc(); + /* + * The generic stuff only does filemap_write_and_wait_range, which isn't + * enough if we've written compressed pages to this area, so we need to + * call btrfs_wait_ordered_range to make absolutely sure that any + * outstanding dirty pages are on disk. + */ + count = iov_length(iov, nr_segs); + btrfs_wait_ordered_range(inode, offset, count); + if (rw & WRITE) { - count = iov_length(iov, nr_segs); /* * If the write DIO is beyond the EOF, we need update * the isize, but it is protected by i_mutex. So we can @@ -7510,7 +7342,8 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); } -static void btrfs_invalidatepage(struct page *page, unsigned long offset) +static void btrfs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct inode *inode = page->mapping->host; struct extent_io_tree *tree; @@ -7548,10 +7381,23 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) * whoever cleared the private bit is responsible * for the finish_ordered_io */ - if (TestClearPagePrivate2(page) && - btrfs_dec_test_ordered_pending(inode, &ordered, page_start, - PAGE_CACHE_SIZE, 1)) { - btrfs_finish_ordered_io(ordered); + if (TestClearPagePrivate2(page)) { + struct btrfs_ordered_inode_tree *tree; + u64 new_len; + + tree = &BTRFS_I(inode)->ordered_tree; + + spin_lock_irq(&tree->lock); + set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); + new_len = page_start - ordered->file_offset; + if (new_len < ordered->truncated_len) + ordered->truncated_len = new_len; + spin_unlock_irq(&tree->lock); + + if (btrfs_dec_test_ordered_pending(inode, &ordered, + page_start, + PAGE_CACHE_SIZE, 1)) + btrfs_finish_ordered_io(ordered); } btrfs_put_ordered_extent(ordered); cached_state = NULL; @@ -7710,18 +7556,13 @@ static int btrfs_truncate(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv; - int ret; + int ret = 0; int err = 0; struct btrfs_trans_handle *trans; u64 mask = root->sectorsize - 1; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); - ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); - if (ret) - return ret; - btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); - btrfs_ordered_update_i_size(inode, inode->i_size, NULL); /* * Yes ladies and gentelment, this is indeed ugly. The fact is we have @@ -7977,15 +7818,15 @@ void btrfs_destroy_inode(struct inode *inode) */ smp_mb(); if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); list_del_init(&BTRFS_I(inode)->ordered_operations); - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); } if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags)) { btrfs_info(root->fs_info, "inode %llu still on the orphan list", - (unsigned long long)btrfs_ino(inode)); + btrfs_ino(inode)); atomic_dec(&root->orphan_inodes); } @@ -7995,8 +7836,7 @@ void btrfs_destroy_inode(struct inode *inode) break; else { btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", - (unsigned long long)ordered->file_offset, - (unsigned long long)ordered->len); + ordered->file_offset, ordered->len); btrfs_remove_ordered_extent(inode, ordered); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); @@ -8012,6 +7852,9 @@ int btrfs_drop_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + if (root == NULL) + return 1; + /* the snap/subvol tree is on deleting */ if (btrfs_root_refs(&root->root_item) == 0 && root != root->fs_info->tree_root) @@ -8267,10 +8110,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dentry->d_name.name, new_dentry->d_name.len); } - if (!ret && new_inode->i_nlink == 0) { + if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, new_dentry->d_inode); - BUG_ON(ret); - } if (ret) { btrfs_abort_transaction(trans, root, ret); goto out_fail; @@ -8346,7 +8187,7 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) { struct btrfs_inode *binode; struct inode *inode; @@ -8355,33 +8196,30 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) struct list_head splice; int ret = 0; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - INIT_LIST_HEAD(&works); INIT_LIST_HEAD(&splice); - spin_lock(&root->fs_info->delalloc_lock); - list_splice_init(&root->fs_info->delalloc_inodes, &splice); + spin_lock(&root->delalloc_lock); + list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { binode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes); - list_del_init(&binode->delalloc_inodes); - + list_move_tail(&binode->delalloc_inodes, + &root->delalloc_inodes); inode = igrab(&binode->vfs_inode); if (!inode) { - clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &binode->runtime_flags); + cond_resched_lock(&root->delalloc_lock); continue; } - - list_add_tail(&binode->delalloc_inodes, - &root->fs_info->delalloc_inodes); - spin_unlock(&root->fs_info->delalloc_lock); + spin_unlock(&root->delalloc_lock); work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); if (unlikely(!work)) { + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); ret = -ENOMEM; goto out; } @@ -8390,16 +8228,39 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) &work->work); cond_resched(); - spin_lock(&root->fs_info->delalloc_lock); + spin_lock(&root->delalloc_lock); } - spin_unlock(&root->fs_info->delalloc_lock); + spin_unlock(&root->delalloc_lock); list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); btrfs_wait_and_free_delalloc_work(work); } + return 0; +out: + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + btrfs_wait_and_free_delalloc_work(work); + } + + if (!list_empty_careful(&splice)) { + spin_lock(&root->delalloc_lock); + list_splice_tail(&splice, &root->delalloc_inodes); + spin_unlock(&root->delalloc_lock); + } + return ret; +} + +int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +{ + int ret; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; - /* the filemap_flush will queue IO into the worker threads, but + ret = __start_delalloc_inodes(root, delay_iput); + /* + * the filemap_flush will queue IO into the worker threads, but * we have to make sure the IO is actually started and that * ordered extents get created before we return */ @@ -8411,17 +8272,55 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) atomic_read(&root->fs_info->async_delalloc_pages) == 0)); } atomic_dec(&root->fs_info->async_submit_draining); - return 0; -out: - list_for_each_entry_safe(work, next, &works, list) { - list_del_init(&work->list); - btrfs_wait_and_free_delalloc_work(work); + return ret; +} + +int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info, + int delay_iput) +{ + struct btrfs_root *root; + struct list_head splice; + int ret; + + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + INIT_LIST_HEAD(&splice); + + spin_lock(&fs_info->delalloc_root_lock); + list_splice_init(&fs_info->delalloc_roots, &splice); + while (!list_empty(&splice)) { + root = list_first_entry(&splice, struct btrfs_root, + delalloc_root); + root = btrfs_grab_fs_root(root); + BUG_ON(!root); + list_move_tail(&root->delalloc_root, + &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); + + ret = __start_delalloc_inodes(root, delay_iput); + btrfs_put_fs_root(root); + if (ret) + goto out; + + spin_lock(&fs_info->delalloc_root_lock); } + spin_unlock(&fs_info->delalloc_root_lock); + atomic_inc(&fs_info->async_submit_draining); + while (atomic_read(&fs_info->nr_async_submits) || + atomic_read(&fs_info->async_delalloc_pages)) { + wait_event(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_submits) == 0 && + atomic_read(&fs_info->async_delalloc_pages) == 0)); + } + atomic_dec(&fs_info->async_submit_draining); + return 0; +out: if (!list_empty_careful(&splice)) { - spin_lock(&root->fs_info->delalloc_lock); - list_splice_tail(&splice, &root->fs_info->delalloc_inodes); - spin_unlock(&root->fs_info->delalloc_lock); + spin_lock(&fs_info->delalloc_root_lock); + list_splice_tail(&splice, &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); } return ret; } @@ -8577,8 +8476,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); cur_bytes = max(cur_bytes, min_size); - ret = btrfs_reserve_extent(trans, root, cur_bytes, - min_size, 0, *alloc_hint, &ins, 1); + ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, + *alloc_hint, &ins, 1); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); @@ -8718,17 +8617,19 @@ static const struct inode_operations btrfs_dir_inode_operations = { .removexattr = btrfs_removexattr, .permission = btrfs_permission, .get_acl = btrfs_get_acl, + .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_dir_ro_inode_operations = { .lookup = btrfs_lookup, .permission = btrfs_permission, .get_acl = btrfs_get_acl, + .update_time = btrfs_update_time, }; static const struct file_operations btrfs_dir_file_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = btrfs_real_readdir, + .iterate = btrfs_real_readdir, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_ioctl, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0f81d67cdc8d..9d46f60cb943 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -43,6 +43,7 @@ #include <linux/blkdev.h> #include <linux/uuid.h> #include <linux/btrfs.h> +#include <linux/uaccess.h> #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -57,6 +58,9 @@ #include "send.h" #include "dev-replace.h" +static int btrfs_clone(struct inode *src, struct inode *inode, + u64 off, u64 olen, u64 olen_aligned, u64 destoff); + /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) { @@ -363,6 +367,13 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) return 0; } +int btrfs_is_empty_uuid(u8 *uuid) +{ + static char empty_uuid[BTRFS_UUID_SIZE] = {0}; + + return !memcmp(uuid, empty_uuid, BTRFS_UUID_SIZE); +} + static noinline int create_subvol(struct inode *dir, struct dentry *dentry, char *name, int namelen, @@ -396,7 +407,7 @@ static noinline int create_subvol(struct inode *dir, * of create_snapshot(). */ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, - 7, &qgroup_reserved); + 8, &qgroup_reserved, false); if (ret) return ret; @@ -425,26 +436,25 @@ static noinline int create_subvol(struct inode *dir, btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(leaf, objectid); - write_extent_buffer(leaf, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(leaf), + write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(leaf), BTRFS_FSID_SIZE); write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(leaf), + btrfs_header_chunk_tree_uuid(leaf), BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); memset(&root_item, 0, sizeof(root_item)); inode_item = &root_item.inode; - inode_item->generation = cpu_to_le64(1); - inode_item->size = cpu_to_le64(3); - inode_item->nlink = cpu_to_le32(1); - inode_item->nbytes = cpu_to_le64(root->leafsize); - inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + btrfs_set_stack_inode_generation(inode_item, 1); + btrfs_set_stack_inode_size(inode_item, 3); + btrfs_set_stack_inode_nlink(inode_item, 1); + btrfs_set_stack_inode_nbytes(inode_item, root->leafsize); + btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); - root_item.flags = 0; - root_item.byte_limit = 0; - inode_item->flags = cpu_to_le64(BTRFS_INODE_ROOT_ITEM_INIT); + btrfs_set_root_flags(&root_item, 0); + btrfs_set_root_limit(&root_item, 0); + btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT); btrfs_set_root_bytenr(&root_item, leaf->start); btrfs_set_root_generation(&root_item, trans->transid); @@ -457,8 +467,8 @@ static noinline int create_subvol(struct inode *dir, btrfs_root_generation(&root_item)); uuid_le_gen(&new_uuid); memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE); - root_item.otime.sec = cpu_to_le64(cur_time.tv_sec); - root_item.otime.nsec = cpu_to_le32(cur_time.tv_nsec); + btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec); + btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec); root_item.ctime = root_item.otime; btrfs_set_root_ctransid(&root_item, trans->transid); btrfs_set_root_otransid(&root_item, trans->transid); @@ -518,9 +528,14 @@ static noinline int create_subvol(struct inode *dir, ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, objectid, root->root_key.objectid, btrfs_ino(dir), index, name, namelen); - BUG_ON(ret); + ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root, + root_item.uuid, BTRFS_UUID_KEY_SUBVOL, + objectid); + if (ret) + btrfs_abort_transaction(trans, root, ret); + fail: trans->block_rsv = NULL; trans->bytes_reserved = 0; @@ -555,6 +570,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!root->ref_cows) return -EINVAL; + ret = btrfs_start_delalloc_inodes(root, 0); + if (ret) + return ret; + + btrfs_wait_ordered_extents(root); + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); if (!pending_snapshot) return -ENOMEM; @@ -567,10 +588,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, * 1 - root item * 2 - root ref/backref * 1 - root of snapshot + * 1 - UUID item */ ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, - &pending_snapshot->block_rsv, 7, - &pending_snapshot->qgroup_reserved); + &pending_snapshot->block_rsv, 8, + &pending_snapshot->qgroup_reserved, + false); if (ret) goto out; @@ -1261,9 +1284,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, cluster = max_cluster; } - if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) - BTRFS_I(inode)->force_compress = compress_type; - if (i + cluster > ra_index) { ra_index = max(i, ra_index); btrfs_force_ra(inode->i_mapping, ra, file, ra_index, @@ -1272,6 +1292,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, } mutex_lock(&inode->i_mutex); + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) + BTRFS_I(inode)->force_compress = compress_type; ret = cluster_pages_for_defrag(inode, pages, i, cluster); if (ret < 0) { mutex_unlock(&inode->i_mutex); @@ -1328,10 +1350,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, atomic_read(&root->fs_info->async_delalloc_pages) == 0)); } atomic_dec(&root->fs_info->async_submit_draining); - - mutex_lock(&inode->i_mutex); - BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; - mutex_unlock(&inode->i_mutex); } if (range->compress_type == BTRFS_COMPRESS_LZO) { @@ -1341,6 +1359,11 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ret = defrag_count; out_ra: + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { + mutex_lock(&inode->i_mutex); + BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; + mutex_unlock(&inode->i_mutex); + } if (!file) kfree(ra); kfree(pages); @@ -1371,9 +1394,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); mnt_drop_write_file(file); - return -EINVAL; + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } mutex_lock(&root->fs_info->volume_mutex); @@ -1397,14 +1419,13 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = -EINVAL; goto out_free; } - printk(KERN_INFO "btrfs: resizing devid %llu\n", - (unsigned long long)devid); + printk(KERN_INFO "btrfs: resizing devid %llu\n", devid); } device = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (!device) { printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", - (unsigned long long)devid); + devid); ret = -ENODEV; goto out_free; } @@ -1412,7 +1433,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (!device->writeable) { printk(KERN_INFO "btrfs: resizer unable to apply on " "readonly device %llu\n", - (unsigned long long)devid); + devid); ret = -EPERM; goto out_free; } @@ -1464,8 +1485,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size *= root->sectorsize; printk_in_rcu(KERN_INFO "btrfs: new size for %s is %llu\n", - rcu_str_deref(device->name), - (unsigned long long)new_size); + rcu_str_deref(device->name), new_size); if (new_size > old_size) { trans = btrfs_start_transaction(root, 0); @@ -1715,13 +1735,28 @@ out: static noinline int may_destroy_subvol(struct btrfs_root *root) { struct btrfs_path *path; + struct btrfs_dir_item *di; struct btrfs_key key; + u64 dir_id; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + /* Make sure this root isn't set as the default subvol */ + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); + di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, path, + dir_id, "default", 7, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); + if (key.objectid == root->root_key.objectid) { + ret = -ENOTEMPTY; + goto out; + } + btrfs_release_path(path); + } + key.objectid = root->root_key.objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = (u64)-1; @@ -1987,25 +2022,29 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; + else if (ret > 0) { + ret = btrfs_previous_item(root, path, dirid, + BTRFS_INODE_REF_KEY); + if (ret < 0) + goto out; + else if (ret > 0) { + ret = -ENOENT; + goto out; + } + } l = path->nodes[0]; slot = path->slots[0]; - if (ret > 0 && slot > 0) - slot--; btrfs_item_key_to_cpu(l, &key, slot); - if (ret > 0 && (key.objectid != dirid || - key.type != BTRFS_INODE_REF_KEY)) { - ret = -ENOENT; - goto out; - } - iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref); len = btrfs_inode_ref_name_len(l, iref); ptr -= len + 1; total_len += len + 1; - if (ptr < name) + if (ptr < name) { + ret = -ENAMETOOLONG; goto out; + } *(ptr + len) = '/'; read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len); @@ -2018,8 +2057,6 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.offset = (u64)-1; dirid = key.objectid; } - if (ptr < name) - goto out; memmove(name, ptr, total_len); name[total_len]='\0'; ret = 0; @@ -2168,7 +2205,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * ref/backref. */ err = btrfs_subvolume_reserve_metadata(root, &block_rsv, - 5, &qgroup_reserved); + 5, &qgroup_reserved, true); if (err) goto out_up_write; @@ -2207,6 +2244,27 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_end_trans; } } + + ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root, + dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL, + dest->root_key.objectid); + if (ret && ret != -ENOENT) { + btrfs_abort_transaction(trans, root, ret); + err = ret; + goto out_end_trans; + } + if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { + ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root, + dest->root_item.received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + dest->root_key.objectid); + if (ret && ret != -ENOENT) { + btrfs_abort_transaction(trans, root, ret); + err = ret; + goto out_end_trans; + } + } + out_end_trans: trans->block_rsv = NULL; trans->bytes_reserved = 0; @@ -2320,8 +2378,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - return -EINVAL; + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } mutex_lock(&root->fs_info->volume_mutex); @@ -2354,14 +2411,6 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, - 1)) { - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - mnt_drop_write_file(file); - return -EINVAL; - } - - mutex_lock(&root->fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -2369,12 +2418,20 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = btrfs_rm_device(root, vol_args->name); - kfree(vol_args); -out: + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out; + } + + mutex_lock(&root->fs_info->volume_mutex); + ret = btrfs_rm_device(root, vol_args->name); mutex_unlock(&root->fs_info->volume_mutex); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + +out: + kfree(vol_args); mnt_drop_write_file(file); return ret; } @@ -2394,10 +2451,10 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) if (!fi_args) return -ENOMEM; + mutex_lock(&fs_devices->device_list_mutex); fi_args->num_devices = fs_devices->num_devices; memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid)); - mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { if (device->devid > fi_args->max_id) fi_args->max_id = device->devid; @@ -2418,7 +2475,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; int ret = 0; char *s_uuid = NULL; - char empty_uuid[BTRFS_UUID_SIZE] = {0}; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2427,7 +2483,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) if (IS_ERR(di_args)) return PTR_ERR(di_args); - if (memcmp(empty_uuid, di_args->uuid, BTRFS_UUID_SIZE) != 0) + if (!btrfs_is_empty_uuid(di_args->uuid)) s_uuid = di_args->uuid; mutex_lock(&fs_devices->device_list_mutex); @@ -2463,139 +2519,350 @@ out: return ret; } -static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, - u64 off, u64 olen, u64 destoff) +static struct page *extent_same_get_page(struct inode *inode, u64 off) +{ + struct page *page; + pgoff_t index; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + + index = off >> PAGE_CACHE_SHIFT; + + page = grab_cache_page(inode->i_mapping, index); + if (!page) + return NULL; + + if (!PageUptodate(page)) { + if (extent_read_full_page_nolock(tree, page, btrfs_get_extent, + 0)) + return NULL; + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + return NULL; + } + } + unlock_page(page); + + return page; +} + +static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) +{ + /* do any pending delalloc/csum calc on src, one way or + another, and lock file content */ + while (1) { + struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); + ordered = btrfs_lookup_first_ordered_extent(inode, + off + len - 1); + if (!ordered && + !test_range_bit(&BTRFS_I(inode)->io_tree, off, + off + len - 1, EXTENT_DELALLOC, 0, NULL)) + break; + unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); + if (ordered) + btrfs_put_ordered_extent(ordered); + btrfs_wait_ordered_range(inode, off, len); + } +} + +static void btrfs_double_unlock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); + unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); + + mutex_unlock(&inode1->i_mutex); + mutex_unlock(&inode2->i_mutex); +} + +static void btrfs_double_lock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + if (inode1 < inode2) { + swap(inode1, inode2); + swap(loff1, loff2); + } + + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); + lock_extent_range(inode1, loff1, len); + if (inode1 != inode2) { + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + lock_extent_range(inode2, loff2, len); + } +} + +static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, + u64 dst_loff, u64 len) +{ + int ret = 0; + struct page *src_page, *dst_page; + unsigned int cmp_len = PAGE_CACHE_SIZE; + void *addr, *dst_addr; + + while (len) { + if (len < PAGE_CACHE_SIZE) + cmp_len = len; + + src_page = extent_same_get_page(src, loff); + if (!src_page) + return -EINVAL; + dst_page = extent_same_get_page(dst, dst_loff); + if (!dst_page) { + page_cache_release(src_page); + return -EINVAL; + } + addr = kmap_atomic(src_page); + dst_addr = kmap_atomic(dst_page); + + flush_dcache_page(src_page); + flush_dcache_page(dst_page); + + if (memcmp(addr, dst_addr, cmp_len)) + ret = BTRFS_SAME_DATA_DIFFERS; + + kunmap_atomic(addr); + kunmap_atomic(dst_addr); + page_cache_release(src_page); + page_cache_release(dst_page); + + if (ret) + break; + + loff += cmp_len; + dst_loff += cmp_len; + len -= cmp_len; + } + + return ret; +} + +static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) +{ + u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; + + if (off + len > inode->i_size || off + len < off) + return -EINVAL; + /* Check that we are block aligned - btrfs_clone() requires this */ + if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) + return -EINVAL; + + return 0; +} + +static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, + struct inode *dst, u64 dst_loff) { - struct inode *inode = file_inode(file); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct fd src_file; - struct inode *src; - struct btrfs_trans_handle *trans; - struct btrfs_path *path; - struct extent_buffer *leaf; - char *buf; - struct btrfs_key key; - u32 nritems; - int slot; int ret; - u64 len = olen; - u64 bs = root->fs_info->sb->s_blocksize; /* - * TODO: - * - split compressed inline extents. annoying: we need to - * decompress into destination's address_space (the file offset - * may change, so source mapping won't do), then recompress (or - * otherwise reinsert) a subrange. - * - allow ranges within the same file to be cloned (provided - * they don't overlap)? + * btrfs_clone() can't handle extents in the same file + * yet. Once that works, we can drop this check and replace it + * with a check for the same inode, but overlapping extents. */ - - /* the destination must be opened for writing */ - if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) + if (src == dst) return -EINVAL; - if (btrfs_root_readonly(root)) - return -EROFS; + btrfs_double_lock(src, loff, dst, dst_loff, len); + + ret = extent_same_check_offsets(src, loff, len); + if (ret) + goto out_unlock; + + ret = extent_same_check_offsets(dst, dst_loff, len); + if (ret) + goto out_unlock; + + /* don't make the dst file partly checksummed */ + if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != + (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) { + ret = -EINVAL; + goto out_unlock; + } + + ret = btrfs_cmp_data(src, loff, dst, dst_loff, len); + if (ret == 0) + ret = btrfs_clone(src, dst, loff, len, len, dst_loff); + +out_unlock: + btrfs_double_unlock(src, loff, dst, dst_loff, len); + + return ret; +} + +#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) + +static long btrfs_ioctl_file_extent_same(struct file *file, + void __user *argp) +{ + struct btrfs_ioctl_same_args tmp; + struct btrfs_ioctl_same_args *same; + struct btrfs_ioctl_same_extent_info *info; + struct inode *src = file->f_dentry->d_inode; + struct file *dst_file = NULL; + struct inode *dst; + u64 off; + u64 len; + int i; + int ret; + unsigned long size; + u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; + bool is_admin = capable(CAP_SYS_ADMIN); + + if (!(file->f_mode & FMODE_READ)) + return -EINVAL; ret = mnt_want_write_file(file); if (ret) return ret; - src_file = fdget(srcfd); - if (!src_file.file) { - ret = -EBADF; - goto out_drop_write; + if (copy_from_user(&tmp, + (struct btrfs_ioctl_same_args __user *)argp, + sizeof(tmp))) { + ret = -EFAULT; + goto out; } - ret = -EXDEV; - if (src_file.file->f_path.mnt != file->f_path.mnt) - goto out_fput; + size = sizeof(tmp) + + tmp.dest_count * sizeof(struct btrfs_ioctl_same_extent_info); - src = file_inode(src_file.file); + same = kmalloc(size, GFP_NOFS); + if (!same) { + ret = -EFAULT; + goto out; + } - ret = -EINVAL; - if (src == inode) - goto out_fput; + if (copy_from_user(same, + (struct btrfs_ioctl_same_args __user *)argp, size)) { + ret = -EFAULT; + goto out; + } - /* the src must be open for reading */ - if (!(src_file.file->f_mode & FMODE_READ)) - goto out_fput; + off = same->logical_offset; + len = same->length; - /* don't make the dst file partly checksummed */ - if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) - goto out_fput; + /* + * Limit the total length we will dedupe for each operation. + * This is intended to bound the total time spent in this + * ioctl to something sane. + */ + if (len > BTRFS_MAX_DEDUPE_LEN) + len = BTRFS_MAX_DEDUPE_LEN; - ret = -EISDIR; - if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) - goto out_fput; + if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) { + /* + * Btrfs does not support blocksize < page_size. As a + * result, btrfs_cmp_data() won't correctly handle + * this situation without an update. + */ + ret = -EINVAL; + goto out; + } - ret = -EXDEV; - if (src->i_sb != inode->i_sb) - goto out_fput; + ret = -EISDIR; + if (S_ISDIR(src->i_mode)) + goto out; - ret = -ENOMEM; - buf = vmalloc(btrfs_level_size(root, 0)); - if (!buf) - goto out_fput; + ret = -EACCES; + if (!S_ISREG(src->i_mode)) + goto out; - path = btrfs_alloc_path(); - if (!path) { - vfree(buf); - goto out_fput; + /* pre-format output fields to sane values */ + for (i = 0; i < same->dest_count; i++) { + same->info[i].bytes_deduped = 0ULL; + same->info[i].status = 0; } - path->reada = 2; - if (inode < src) { - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); - } else { - mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); - } + ret = 0; + for (i = 0; i < same->dest_count; i++) { + info = &same->info[i]; - /* determine range to clone */ - ret = -EINVAL; - if (off + len > src->i_size || off + len < off) - goto out_unlock; - if (len == 0) - olen = len = src->i_size - off; - /* if we extend to eof, continue to block boundary */ - if (off + len == src->i_size) - len = ALIGN(src->i_size, bs) - off; + dst_file = fget(info->fd); + if (!dst_file) { + info->status = -EBADF; + goto next; + } - /* verify the end result is block aligned */ - if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || - !IS_ALIGNED(destoff, bs)) - goto out_unlock; + if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) { + info->status = -EINVAL; + goto next; + } - if (destoff > inode->i_size) { - ret = btrfs_cont_expand(inode, inode->i_size, destoff); - if (ret) - goto out_unlock; + info->status = -EXDEV; + if (file->f_path.mnt != dst_file->f_path.mnt) + goto next; + + dst = dst_file->f_dentry->d_inode; + if (src->i_sb != dst->i_sb) + goto next; + + if (S_ISDIR(dst->i_mode)) { + info->status = -EISDIR; + goto next; + } + + if (!S_ISREG(dst->i_mode)) { + info->status = -EACCES; + goto next; + } + + info->status = btrfs_extent_same(src, off, len, dst, + info->logical_offset); + if (info->status == 0) + info->bytes_deduped += len; + +next: + if (dst_file) + fput(dst_file); } - /* truncate page cache pages from target inode range */ - truncate_inode_pages_range(&inode->i_data, destoff, - PAGE_CACHE_ALIGN(destoff + len) - 1); + ret = copy_to_user(argp, same, size); + if (ret) + ret = -EFAULT; - /* do any pending delalloc/csum calc on src, one way or - another, and lock file content */ - while (1) { - struct btrfs_ordered_extent *ordered; - lock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); - ordered = btrfs_lookup_first_ordered_extent(src, off + len - 1); - if (!ordered && - !test_range_bit(&BTRFS_I(src)->io_tree, off, off + len - 1, - EXTENT_DELALLOC, 0, NULL)) - break; - unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); - if (ordered) - btrfs_put_ordered_extent(ordered); - btrfs_wait_ordered_range(src, off, len); +out: + mnt_drop_write_file(file); + return ret; +} + +/** + * btrfs_clone() - clone a range from inode file to another + * + * @src: Inode to clone from + * @inode: Inode to clone to + * @off: Offset within source to start clone from + * @olen: Original length, passed by user, of range to clone + * @olen_aligned: Block-aligned value of olen, extent_same uses + * identical values here + * @destoff: Offset within @inode to start clone + */ +static int btrfs_clone(struct inode *src, struct inode *inode, + u64 off, u64 olen, u64 olen_aligned, u64 destoff) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path = NULL; + struct extent_buffer *leaf; + struct btrfs_trans_handle *trans; + char *buf = NULL; + struct btrfs_key key; + u32 nritems; + int slot; + int ret; + u64 len = olen_aligned; + + ret = -ENOMEM; + buf = vmalloc(btrfs_level_size(root, 0)); + if (!buf) + return ret; + + path = btrfs_alloc_path(); + if (!path) { + vfree(buf); + return ret; } + path->reada = 2; /* clone data */ key.objectid = btrfs_ino(src); key.type = BTRFS_EXTENT_DATA_KEY; @@ -2841,14 +3108,132 @@ next: key.offset++; } ret = 0; + out: btrfs_release_path(path); + btrfs_free_path(path); + vfree(buf); + return ret; +} + +static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct fd src_file; + struct inode *src; + int ret; + u64 len = olen; + u64 bs = root->fs_info->sb->s_blocksize; + int same_inode = 0; + + /* + * TODO: + * - split compressed inline extents. annoying: we need to + * decompress into destination's address_space (the file offset + * may change, so source mapping won't do), then recompress (or + * otherwise reinsert) a subrange. + * - allow ranges within the same file to be cloned (provided + * they don't overlap)? + */ + + /* the destination must be opened for writing */ + if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) + return -EINVAL; + + if (btrfs_root_readonly(root)) + return -EROFS; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + src_file = fdget(srcfd); + if (!src_file.file) { + ret = -EBADF; + goto out_drop_write; + } + + ret = -EXDEV; + if (src_file.file->f_path.mnt != file->f_path.mnt) + goto out_fput; + + src = file_inode(src_file.file); + + ret = -EINVAL; + if (src == inode) + same_inode = 1; + + /* the src must be open for reading */ + if (!(src_file.file->f_mode & FMODE_READ)) + goto out_fput; + + /* don't make the dst file partly checksummed */ + if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != + (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) + goto out_fput; + + ret = -EISDIR; + if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) + goto out_fput; + + ret = -EXDEV; + if (src->i_sb != inode->i_sb) + goto out_fput; + + if (!same_inode) { + if (inode < src) { + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); + } else { + mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + } + } else { + mutex_lock(&src->i_mutex); + } + + /* determine range to clone */ + ret = -EINVAL; + if (off + len > src->i_size || off + len < off) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - off; + /* if we extend to eof, continue to block boundary */ + if (off + len == src->i_size) + len = ALIGN(src->i_size, bs) - off; + + /* verify the end result is block aligned */ + if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || + !IS_ALIGNED(destoff, bs)) + goto out_unlock; + + /* verify if ranges are overlapped within the same file */ + if (same_inode) { + if (destoff + len > off && destoff < off + len) + goto out_unlock; + } + + if (destoff > inode->i_size) { + ret = btrfs_cont_expand(inode, inode->i_size, destoff); + if (ret) + goto out_unlock; + } + + /* truncate page cache pages from target inode range */ + truncate_inode_pages_range(&inode->i_data, destoff, + PAGE_CACHE_ALIGN(destoff + len) - 1); + + lock_extent_range(src, off, len); + + ret = btrfs_clone(src, inode, off, olen, len, destoff); + unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); out_unlock: mutex_unlock(&src->i_mutex); - mutex_unlock(&inode->i_mutex); - vfree(buf); - btrfs_free_path(path); + if (!same_inode) + mutex_unlock(&inode->i_mutex); out_fput: fdput(src_file); out_drop_write: @@ -2939,7 +3324,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) } if (!objectid) - objectid = root->root_key.objectid; + objectid = BTRFS_FS_TREE_OBJECTID; location.objectid = objectid; location.type = BTRFS_ROOT_ITEM_KEY; @@ -2951,11 +3336,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) goto out; } - if (btrfs_root_refs(&new_root->root_item) == 0) { - ret = -ENOENT; - goto out; - } - path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -3299,11 +3679,13 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) switch (p->cmd) { case BTRFS_IOCTL_DEV_REPLACE_CMD_START: + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + if (atomic_xchg( &root->fs_info->mutually_exclusive_operation_running, 1)) { - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - ret = -EINPROGRESS; + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { ret = btrfs_dev_replace_start(root, p); atomic_set( @@ -3547,8 +3929,7 @@ again: } else { /* this is (1) */ mutex_unlock(&fs_info->balance_mutex); - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out; } @@ -3719,9 +4100,6 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) break; } - if (copy_to_user(arg, sa, sizeof(*sa))) - ret = -EFAULT; - err = btrfs_commit_transaction(trans, root->fs_info->tree_root); if (err && !ret) ret = err; @@ -3881,7 +4259,7 @@ drop_write: static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_ioctl_quota_rescan_args *qsa; int ret; @@ -3914,7 +4292,7 @@ drop_write: static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_ioctl_quota_rescan_args *qsa; int ret = 0; @@ -3937,6 +4315,16 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) return ret; } +static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return btrfs_qgroup_wait_for_completion(root->fs_info); +} + static long btrfs_ioctl_set_received_subvol(struct file *file, void __user *arg) { @@ -3947,6 +4335,7 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, struct btrfs_trans_handle *trans; struct timespec ct = CURRENT_TIME; int ret = 0; + int received_uuid_changed; ret = mnt_want_write_file(file); if (ret < 0) @@ -3976,7 +4365,11 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, goto out; } - trans = btrfs_start_transaction(root, 1); + /* + * 1 - root item + * 2 - uuid items (received uuid + subvol uuid) + */ + trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; @@ -3987,24 +4380,42 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, sa->rtime.sec = ct.tv_sec; sa->rtime.nsec = ct.tv_nsec; + received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid, + BTRFS_UUID_SIZE); + if (received_uuid_changed && + !btrfs_is_empty_uuid(root_item->received_uuid)) + btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root, + root_item->received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + root->root_key.objectid); memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE); btrfs_set_root_stransid(root_item, sa->stransid); btrfs_set_root_rtransid(root_item, sa->rtransid); - root_item->stime.sec = cpu_to_le64(sa->stime.sec); - root_item->stime.nsec = cpu_to_le32(sa->stime.nsec); - root_item->rtime.sec = cpu_to_le64(sa->rtime.sec); - root_item->rtime.nsec = cpu_to_le32(sa->rtime.nsec); + btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec); + btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec); + btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec); + btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec); ret = btrfs_update_root(trans, root->fs_info->tree_root, &root->root_key, &root->root_item); if (ret < 0) { btrfs_end_transaction(trans, root); - trans = NULL; goto out; - } else { - ret = btrfs_commit_transaction(trans, root); - if (ret < 0) + } + if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) { + ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root, + sa->uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + root->root_key.objectid); + if (ret < 0 && ret != -EEXIST) { + btrfs_abort_transaction(trans, root, ret); goto out; + } + } + ret = btrfs_commit_transaction(trans, root); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; } ret = copy_to_user(arg, sa, sizeof(*sa)); @@ -4020,26 +4431,30 @@ out: static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; - const char *label = root->fs_info->super_copy->label; - size_t len = strnlen(label, BTRFS_LABEL_SIZE); + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + size_t len; int ret; + char label[BTRFS_LABEL_SIZE]; + + spin_lock(&root->fs_info->super_lock); + memcpy(label, root->fs_info->super_copy->label, BTRFS_LABEL_SIZE); + spin_unlock(&root->fs_info->super_lock); + + len = strnlen(label, BTRFS_LABEL_SIZE); if (len == BTRFS_LABEL_SIZE) { pr_warn("btrfs: label is too long, return the first %zu bytes\n", --len); } - mutex_lock(&root->fs_info->volume_mutex); ret = copy_to_user(arg, label, len); - mutex_unlock(&root->fs_info->volume_mutex); return ret ? -EFAULT : 0; } static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_super_block *super_block = root->fs_info->super_copy; struct btrfs_trans_handle *trans; char label[BTRFS_LABEL_SIZE]; @@ -4061,18 +4476,18 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) if (ret) return ret; - mutex_lock(&root->fs_info->volume_mutex); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_unlock; } + spin_lock(&root->fs_info->super_lock); strcpy(super_block->label, label); + spin_unlock(&root->fs_info->super_lock); ret = btrfs_end_transaction(trans, root); out_unlock: - mutex_unlock(&root->fs_info->volume_mutex); mnt_drop_write_file(file); return ret; } @@ -4179,12 +4594,16 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_quota_rescan(file, argp); case BTRFS_IOC_QUOTA_RESCAN_STATUS: return btrfs_ioctl_quota_rescan_status(file, argp); + case BTRFS_IOC_QUOTA_RESCAN_WAIT: + return btrfs_ioctl_quota_rescan_wait(file, argp); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(root, argp); case BTRFS_IOC_GET_FSLABEL: return btrfs_ioctl_get_fslabel(file, argp); case BTRFS_IOC_SET_FSLABEL: return btrfs_ioctl_set_fslabel(file, argp); + case BTRFS_IOC_FILE_EXTENT_SAME: + return btrfs_ioctl_file_extent_same(file, argp); } return -ENOTTY; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 743b86fa4fcb..b6a6f07c5ce2 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -31,8 +31,8 @@ struct workspace { void *mem; - void *buf; /* where compressed data goes */ - void *cbuf; /* where decompressed data goes */ + void *buf; /* where decompressed data goes */ + void *cbuf; /* where compressed data goes */ struct list_head list; }; @@ -207,8 +207,10 @@ static int lzo_compress_pages(struct list_head *ws, } /* we're making it bigger, give up */ - if (tot_in > 8192 && tot_in < tot_out) + if (tot_in > 8192 && tot_in < tot_out) { + ret = -1; goto out; + } /* we're all done */ if (tot_in >= len) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 1ddd728541ee..c702cb62f78a 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -24,6 +24,7 @@ #include "transaction.h" #include "btrfs_inode.h" #include "extent_io.h" +#include "disk-io.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -66,7 +67,7 @@ static void ordered_data_tree_panic(struct inode *inode, int errno, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset " - "%llu\n", (unsigned long long)offset); + "%llu\n", offset); } /* @@ -184,6 +185,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type, int dio, int compress_type) { + struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry; @@ -203,6 +205,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->bytes_left = len; entry->inode = igrab(inode); entry->compress_type = compress_type; + entry->truncated_len = (u64)-1; if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); @@ -227,10 +230,18 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, ordered_data_tree_panic(inode, -EEXIST, file_offset); spin_unlock_irq(&tree->lock); - spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + spin_lock(&root->ordered_extent_lock); list_add_tail(&entry->root_extent_list, - &BTRFS_I(inode)->root->fs_info->ordered_extents); - spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + &root->ordered_extents); + root->nr_ordered_extents++; + if (root->nr_ordered_extents == 1) { + spin_lock(&root->fs_info->ordered_root_lock); + BUG_ON(!list_empty(&root->ordered_root)); + list_add_tail(&root->ordered_root, + &root->fs_info->ordered_roots); + spin_unlock(&root->fs_info->ordered_root_lock); + } + spin_unlock(&root->ordered_extent_lock); return 0; } @@ -326,14 +337,12 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, *file_offset = dec_end; if (dec_start > dec_end) { printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n", - (unsigned long long)dec_start, - (unsigned long long)dec_end); + dec_start, dec_end); } to_dec = dec_end - dec_start; if (to_dec > entry->bytes_left) { printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", - (unsigned long long)entry->bytes_left, - (unsigned long long)to_dec); + entry->bytes_left, to_dec); } entry->bytes_left -= to_dec; if (!uptodate) @@ -393,8 +402,7 @@ have_entry: if (io_size > entry->bytes_left) { printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", - (unsigned long long)entry->bytes_left, - (unsigned long long)io_size); + entry->bytes_left, io_size); } entry->bytes_left -= io_size; if (!uptodate) @@ -516,8 +524,9 @@ void btrfs_remove_ordered_extent(struct inode *inode, set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); spin_unlock_irq(&tree->lock); - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); + root->nr_ordered_extents--; trace_btrfs_ordered_extent_remove(inode, entry); @@ -530,7 +539,14 @@ void btrfs_remove_ordered_extent(struct inode *inode, !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { list_del_init(&BTRFS_I(inode)->ordered_operations); } - spin_unlock(&root->fs_info->ordered_extent_lock); + + if (!root->nr_ordered_extents) { + spin_lock(&root->fs_info->ordered_root_lock); + BUG_ON(list_empty(&root->ordered_root)); + list_del_init(&root->ordered_root); + spin_unlock(&root->fs_info->ordered_root_lock); + } + spin_unlock(&root->ordered_extent_lock); wake_up(&entry->wait); } @@ -547,63 +563,70 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) +void btrfs_wait_ordered_extents(struct btrfs_root *root) { struct list_head splice, works; - struct list_head *cur; struct btrfs_ordered_extent *ordered, *next; - struct inode *inode; INIT_LIST_HEAD(&splice); INIT_LIST_HEAD(&works); mutex_lock(&root->fs_info->ordered_operations_mutex); - spin_lock(&root->fs_info->ordered_extent_lock); - list_splice_init(&root->fs_info->ordered_extents, &splice); + spin_lock(&root->ordered_extent_lock); + list_splice_init(&root->ordered_extents, &splice); while (!list_empty(&splice)) { - cur = splice.next; - ordered = list_entry(cur, struct btrfs_ordered_extent, - root_extent_list); - list_del_init(&ordered->root_extent_list); + ordered = list_first_entry(&splice, struct btrfs_ordered_extent, + root_extent_list); + list_move_tail(&ordered->root_extent_list, + &root->ordered_extents); atomic_inc(&ordered->refs); + spin_unlock(&root->ordered_extent_lock); - /* - * the inode may be getting freed (in sys_unlink path). - */ - inode = igrab(ordered->inode); - - spin_unlock(&root->fs_info->ordered_extent_lock); - - if (inode) { - ordered->flush_work.func = btrfs_run_ordered_extent_work; - list_add_tail(&ordered->work_list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - &ordered->flush_work); - } else { - btrfs_put_ordered_extent(ordered); - } + ordered->flush_work.func = btrfs_run_ordered_extent_work; + list_add_tail(&ordered->work_list, &works); + btrfs_queue_worker(&root->fs_info->flush_workers, + &ordered->flush_work); cond_resched(); - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->ordered_extent_lock); } - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->ordered_extent_lock); list_for_each_entry_safe(ordered, next, &works, work_list) { list_del_init(&ordered->work_list); wait_for_completion(&ordered->completion); - - inode = ordered->inode; btrfs_put_ordered_extent(ordered); - if (delay_iput) - btrfs_add_delayed_iput(inode); - else - iput(inode); - cond_resched(); } mutex_unlock(&root->fs_info->ordered_operations_mutex); } +void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + spin_lock(&fs_info->ordered_root_lock); + list_splice_init(&fs_info->ordered_roots, &splice); + while (!list_empty(&splice)) { + root = list_first_entry(&splice, struct btrfs_root, + ordered_root); + root = btrfs_grab_fs_root(root); + BUG_ON(!root); + list_move_tail(&root->ordered_root, + &fs_info->ordered_roots); + spin_unlock(&fs_info->ordered_root_lock); + + btrfs_wait_ordered_extents(root); + btrfs_put_fs_root(root); + + spin_lock(&fs_info->ordered_root_lock); + } + spin_unlock(&fs_info->ordered_root_lock); +} + /* * this is used during transaction commit to write all the inodes * added to the ordered operation list. These files must be fully on @@ -628,8 +651,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&splice); INIT_LIST_HEAD(&works); - mutex_lock(&root->fs_info->ordered_operations_mutex); - spin_lock(&root->fs_info->ordered_extent_lock); + mutex_lock(&root->fs_info->ordered_extent_flush_mutex); + spin_lock(&root->fs_info->ordered_root_lock); list_splice_init(&cur_trans->ordered_operations, &splice); while (!list_empty(&splice)) { btrfs_inode = list_entry(splice.next, struct btrfs_inode, @@ -648,17 +671,17 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, if (!wait) list_add_tail(&BTRFS_I(inode)->ordered_operations, &cur_trans->ordered_operations); - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); work = btrfs_alloc_delalloc_work(inode, wait, 1); if (!work) { - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); if (list_empty(&BTRFS_I(inode)->ordered_operations)) list_add_tail(&btrfs_inode->ordered_operations, &splice); list_splice_tail(&splice, &cur_trans->ordered_operations); - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); ret = -ENOMEM; goto out; } @@ -667,15 +690,15 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, &work->work); cond_resched(); - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); } - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); out: list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); btrfs_wait_and_free_delalloc_work(work); } - mutex_unlock(&root->fs_info->ordered_operations_mutex); + mutex_unlock(&root->fs_info->ordered_extent_flush_mutex); return ret; } @@ -880,12 +903,16 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *test; int ret = 1; - if (ordered) + spin_lock_irq(&tree->lock); + if (ordered) { offset = entry_end(ordered); - else + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) + offset = min(offset, + ordered->file_offset + + ordered->truncated_len); + } else { offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); - - spin_lock_irq(&tree->lock); + } disk_i_size = BTRFS_I(inode)->disk_i_size; /* truncate file */ @@ -989,7 +1016,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum, int len) { struct btrfs_ordered_sum *ordered_sum; - struct btrfs_sector_sum *sector_sums; struct btrfs_ordered_extent *ordered; struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; unsigned long num_sectors; @@ -1007,18 +1033,16 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { i = (disk_bytenr - ordered_sum->bytenr) >> inode->i_sb->s_blocksize_bits; - sector_sums = ordered_sum->sums + i; num_sectors = ordered_sum->len >> inode->i_sb->s_blocksize_bits; - for (; i < num_sectors; i++) { - if (sector_sums[i].bytenr == disk_bytenr) { - sum[index] = sector_sums[i].sum; - index++; - if (index == len) - goto out; - disk_bytenr += sectorsize; - } - } + num_sectors = min_t(int, len - index, num_sectors - i); + memcpy(sum + index, ordered_sum->sums + i, + num_sectors); + + index += (int)num_sectors; + if (index == len) + goto out; + disk_bytenr += num_sectors * sectorsize; } } out: @@ -1055,12 +1079,12 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, if (last_mod < root->fs_info->last_trans_committed) return; - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); if (list_empty(&BTRFS_I(inode)->ordered_operations)) { list_add_tail(&BTRFS_I(inode)->ordered_operations, &cur_trans->ordered_operations); } - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); } int __init ordered_data_init(void) diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 58b0e3b0ebad..0c0b35612d7a 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -26,18 +26,6 @@ struct btrfs_ordered_inode_tree { struct rb_node *last; }; -/* - * these are used to collect checksums done just before bios submission. - * They are attached via a list into the ordered extent, and - * checksum items are inserted into the tree after all the blocks in - * the ordered extent are on disk - */ -struct btrfs_sector_sum { - /* bytenr on disk */ - u64 bytenr; - u32 sum; -}; - struct btrfs_ordered_sum { /* bytenr is the start of this extent on disk */ u64 bytenr; @@ -45,10 +33,10 @@ struct btrfs_ordered_sum { /* * this is the length in bytes covered by the sums array below. */ - unsigned long len; + int len; struct list_head list; - /* last field is a variable length array of btrfs_sector_sums */ - struct btrfs_sector_sum sums[]; + /* last field is a variable length array of csums */ + u32 sums[]; }; /* @@ -81,6 +69,7 @@ struct btrfs_ordered_sum { * the isize. */ #define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this ordered ordered extent */ +#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */ struct btrfs_ordered_extent { /* logical offset in the file */ @@ -108,6 +97,12 @@ struct btrfs_ordered_extent { */ u64 outstanding_isize; + /* + * If we get truncated we need to adjust the file extent we enter for + * this ordered extent so that we do not expose stale data. + */ + u64 truncated_len; + /* flags (described above) */ unsigned long flags; @@ -149,11 +144,8 @@ struct btrfs_ordered_extent { static inline int btrfs_ordered_sum_size(struct btrfs_root *root, unsigned long bytes) { - unsigned long num_sectors = (bytes + root->sectorsize - 1) / - root->sectorsize; - num_sectors++; - return sizeof(struct btrfs_ordered_sum) + - num_sectors * sizeof(struct btrfs_sector_sum); + int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize); + return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32); } static inline void @@ -203,7 +195,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); -void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); +void btrfs_wait_ordered_extents(struct btrfs_root *root); +void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info); void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index dc0024f17c1f..0088bedc8631 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -26,14 +26,12 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) int i; printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu " "num_stripes %d\n", - (unsigned long long)btrfs_chunk_length(eb, chunk), - (unsigned long long)btrfs_chunk_owner(eb, chunk), - (unsigned long long)btrfs_chunk_type(eb, chunk), - num_stripes); + btrfs_chunk_length(eb, chunk), btrfs_chunk_owner(eb, chunk), + btrfs_chunk_type(eb, chunk), num_stripes); for (i = 0 ; i < num_stripes ; i++) { printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i, - (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i), - (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i)); + btrfs_stripe_devid_nr(eb, chunk, i), + btrfs_stripe_offset_nr(eb, chunk, i)); } } static void print_dev_item(struct extent_buffer *eb, @@ -41,18 +39,18 @@ static void print_dev_item(struct extent_buffer *eb, { printk(KERN_INFO "\t\tdev item devid %llu " "total_bytes %llu bytes used %llu\n", - (unsigned long long)btrfs_device_id(eb, dev_item), - (unsigned long long)btrfs_device_total_bytes(eb, dev_item), - (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); + btrfs_device_id(eb, dev_item), + btrfs_device_total_bytes(eb, dev_item), + btrfs_device_bytes_used(eb, dev_item)); } static void print_extent_data_ref(struct extent_buffer *eb, struct btrfs_extent_data_ref *ref) { printk(KERN_INFO "\t\textent data backref root %llu " "objectid %llu offset %llu count %u\n", - (unsigned long long)btrfs_extent_data_ref_root(eb, ref), - (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref), - (unsigned long long)btrfs_extent_data_ref_offset(eb, ref), + btrfs_extent_data_ref_root(eb, ref), + btrfs_extent_data_ref_objectid(eb, ref), + btrfs_extent_data_ref_offset(eb, ref), btrfs_extent_data_ref_count(eb, ref)); } @@ -87,19 +85,17 @@ static void print_extent_item(struct extent_buffer *eb, int slot) flags = btrfs_extent_flags(eb, ei); printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n", - (unsigned long long)btrfs_extent_refs(eb, ei), - (unsigned long long)btrfs_extent_generation(eb, ei), - (unsigned long long)flags); + btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei), + flags); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)(ei + 1); btrfs_tree_block_key(eb, info, &key); - printk(KERN_INFO "\t\ttree block key (%llu %x %llu) " + printk(KERN_INFO "\t\ttree block key (%llu %u %llu) " "level %d\n", - (unsigned long long)btrfs_disk_key_objectid(&key), - key.type, - (unsigned long long)btrfs_disk_key_offset(&key), + btrfs_disk_key_objectid(&key), key.type, + btrfs_disk_key_offset(&key), btrfs_tree_block_level(eb, info)); iref = (struct btrfs_extent_inline_ref *)(info + 1); } else { @@ -115,11 +111,11 @@ static void print_extent_item(struct extent_buffer *eb, int slot) switch (type) { case BTRFS_TREE_BLOCK_REF_KEY: printk(KERN_INFO "\t\ttree block backref " - "root %llu\n", (unsigned long long)offset); + "root %llu\n", offset); break; case BTRFS_SHARED_BLOCK_REF_KEY: printk(KERN_INFO "\t\tshared block backref " - "parent %llu\n", (unsigned long long)offset); + "parent %llu\n", offset); break; case BTRFS_EXTENT_DATA_REF_KEY: dref = (struct btrfs_extent_data_ref *)(&iref->offset); @@ -129,8 +125,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot) sref = (struct btrfs_shared_data_ref *)(iref + 1); printk(KERN_INFO "\t\tshared data backref " "parent %llu count %u\n", - (unsigned long long)offset, - btrfs_shared_data_ref_count(eb, sref)); + offset, btrfs_shared_data_ref_count(eb, sref)); break; default: BUG(); @@ -148,13 +143,32 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot) ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0); printk("\t\textent back ref root %llu gen %llu " "owner %llu num_refs %lu\n", - (unsigned long long)btrfs_ref_root_v0(eb, ref0), - (unsigned long long)btrfs_ref_generation_v0(eb, ref0), - (unsigned long long)btrfs_ref_objectid_v0(eb, ref0), + btrfs_ref_root_v0(eb, ref0), + btrfs_ref_generation_v0(eb, ref0), + btrfs_ref_objectid_v0(eb, ref0), (unsigned long)btrfs_ref_count_v0(eb, ref0)); } #endif +static void print_uuid_item(struct extent_buffer *l, unsigned long offset, + u32 item_size) +{ + if (!IS_ALIGNED(item_size, sizeof(u64))) { + pr_warn("btrfs: uuid item with illegal size %lu!\n", + (unsigned long)item_size); + return; + } + while (item_size) { + __le64 subvol_id; + + read_extent_buffer(l, &subvol_id, offset, sizeof(subvol_id)); + printk(KERN_INFO "\t\tsubvol_id %llu\n", + (unsigned long long)le64_to_cpu(subvol_id)); + item_size -= sizeof(u64); + offset += sizeof(u64); + } +} + void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) { int i; @@ -177,39 +191,34 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) nr = btrfs_header_nritems(l); btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d", - (unsigned long long)btrfs_header_bytenr(l), nr, - btrfs_leaf_free_space(root, l)); + btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l)); for (i = 0 ; i < nr ; i++) { item = btrfs_item_nr(l, i); btrfs_item_key_to_cpu(l, &key, i); type = btrfs_key_type(&key); - printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d " + printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d " "itemsize %d\n", - i, - (unsigned long long)key.objectid, type, - (unsigned long long)key.offset, + i, key.objectid, type, key.offset, btrfs_item_offset(l, item), btrfs_item_size(l, item)); switch (type) { case BTRFS_INODE_ITEM_KEY: ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); printk(KERN_INFO "\t\tinode generation %llu size %llu " "mode %o\n", - (unsigned long long) btrfs_inode_generation(l, ii), - (unsigned long long)btrfs_inode_size(l, ii), + btrfs_inode_size(l, ii), btrfs_inode_mode(l, ii)); break; case BTRFS_DIR_ITEM_KEY: di = btrfs_item_ptr(l, i, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(l, di, &found_key); printk(KERN_INFO "\t\tdir oid %llu type %u\n", - (unsigned long long)found_key.objectid, + found_key.objectid, btrfs_dir_type(l, di)); break; case BTRFS_ROOT_ITEM_KEY: ri = btrfs_item_ptr(l, i, struct btrfs_root_item); printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n", - (unsigned long long) btrfs_disk_root_bytenr(l, ri), btrfs_disk_root_refs(l, ri)); break; @@ -245,17 +254,12 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) } printk(KERN_INFO "\t\textent data disk bytenr %llu " "nr %llu\n", - (unsigned long long) btrfs_file_extent_disk_bytenr(l, fi), - (unsigned long long) btrfs_file_extent_disk_num_bytes(l, fi)); printk(KERN_INFO "\t\textent data offset %llu " "nr %llu ram %llu\n", - (unsigned long long) btrfs_file_extent_offset(l, fi), - (unsigned long long) btrfs_file_extent_num_bytes(l, fi), - (unsigned long long) btrfs_file_extent_ram_bytes(l, fi)); break; case BTRFS_EXTENT_REF_V0_KEY: @@ -269,7 +273,6 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); printk(KERN_INFO "\t\tblock group used %llu\n", - (unsigned long long) btrfs_disk_block_group_used(l, bi)); break; case BTRFS_CHUNK_ITEM_KEY: @@ -286,13 +289,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n" "\t\tchunk objectid %llu chunk offset %llu " "length %llu\n", - (unsigned long long) btrfs_dev_extent_chunk_tree(l, dev_extent), - (unsigned long long) btrfs_dev_extent_chunk_objectid(l, dev_extent), - (unsigned long long) btrfs_dev_extent_chunk_offset(l, dev_extent), - (unsigned long long) btrfs_dev_extent_length(l, dev_extent)); break; case BTRFS_DEV_STATS_KEY: @@ -301,6 +300,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) case BTRFS_DEV_REPLACE_KEY: printk(KERN_INFO "\t\tdev replace\n"); break; + case BTRFS_UUID_KEY_SUBVOL: + case BTRFS_UUID_KEY_RECEIVED_SUBVOL: + print_uuid_item(l, btrfs_item_ptr_offset(l, i), + btrfs_item_size_nr(l, i)); + break; }; } } @@ -320,16 +324,13 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) return; } btrfs_info(root->fs_info, "node %llu level %d total ptrs %d free spc %u", - (unsigned long long)btrfs_header_bytenr(c), - level, nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); + btrfs_header_bytenr(c), level, nr, + (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); for (i = 0; i < nr; i++) { btrfs_node_key_to_cpu(c, &key, i); printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n", - i, - (unsigned long long)key.objectid, - key.type, - (unsigned long long)key.offset, - (unsigned long long)btrfs_node_blockptr(c, i)); + i, key.objectid, key.type, key.offset, + btrfs_node_blockptr(c, i)); } for (i = 0; i < nr; i++) { struct extent_buffer *next = read_tree_block(root, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9d49c586995a..4e6ef490619e 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -98,13 +98,10 @@ struct btrfs_qgroup_list { struct btrfs_qgroup *member; }; -struct qgroup_rescan { - struct btrfs_work work; - struct btrfs_fs_info *fs_info; -}; - -static void qgroup_rescan_start(struct btrfs_fs_info *fs_info, - struct qgroup_rescan *qscan); +static int +qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, + int init_flags); +static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info); /* must be called with qgroup_ioctl_lock held */ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, @@ -160,18 +157,11 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, return qgroup; } -/* must be called with qgroup_lock held */ -static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) +static void __del_qgroup_rb(struct btrfs_qgroup *qgroup) { - struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid); struct btrfs_qgroup_list *list; - if (!qgroup) - return -ENOENT; - - rb_erase(&qgroup->node, &fs_info->qgroup_tree); list_del(&qgroup->dirty); - while (!list_empty(&qgroup->groups)) { list = list_first_entry(&qgroup->groups, struct btrfs_qgroup_list, next_group); @@ -188,7 +178,18 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) kfree(list); } kfree(qgroup); +} + +/* must be called with qgroup_lock held */ +static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) +{ + struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid); + + if (!qgroup) + return -ENOENT; + rb_erase(&qgroup->node, &fs_info->qgroup_tree); + __del_qgroup_rb(qgroup); return 0; } @@ -255,10 +256,17 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) int slot; int ret = 0; u64 flags = 0; + u64 rescan_progress = 0; if (!fs_info->quota_enabled) return 0; + fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS); + if (!fs_info->qgroup_ulist) { + ret = -ENOMEM; + goto out; + } + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -306,20 +314,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) } fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); - fs_info->qgroup_rescan_progress.objectid = - btrfs_qgroup_status_rescan(l, ptr); - if (fs_info->qgroup_flags & - BTRFS_QGROUP_STATUS_FLAG_RESCAN) { - struct qgroup_rescan *qscan = - kmalloc(sizeof(*qscan), GFP_NOFS); - if (!qscan) { - ret = -ENOMEM; - goto out; - } - fs_info->qgroup_rescan_progress.type = 0; - fs_info->qgroup_rescan_progress.offset = 0; - qgroup_rescan_start(fs_info, qscan); - } + rescan_progress = btrfs_qgroup_status_rescan(l, ptr); goto next1; } @@ -403,8 +398,7 @@ next1: if (ret == -ENOENT) { printk(KERN_WARNING "btrfs: orphan qgroup relation 0x%llx->0x%llx\n", - (unsigned long long)found_key.objectid, - (unsigned long long)found_key.offset); + found_key.objectid, found_key.offset); ret = 0; /* ignore the error */ } if (ret) @@ -421,45 +415,44 @@ out: if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; + } else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN && + ret >= 0) { + ret = qgroup_rescan_init(fs_info, rescan_progress, 0); } btrfs_free_path(path); + if (ret < 0) { + ulist_free(fs_info->qgroup_ulist); + fs_info->qgroup_ulist = NULL; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + } + return ret < 0 ? ret : 0; } /* - * This is only called from close_ctree() or open_ctree(), both in single- - * treaded paths. Clean up the in-memory structures. No locking needed. + * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), + * first two are in single-threaded paths.And for the third one, we have set + * quota_root to be null with qgroup_lock held before, so it is safe to clean + * up the in-memory structures without qgroup_lock held. */ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) { struct rb_node *n; struct btrfs_qgroup *qgroup; - struct btrfs_qgroup_list *list; while ((n = rb_first(&fs_info->qgroup_tree))) { qgroup = rb_entry(n, struct btrfs_qgroup, node); rb_erase(n, &fs_info->qgroup_tree); - - while (!list_empty(&qgroup->groups)) { - list = list_first_entry(&qgroup->groups, - struct btrfs_qgroup_list, - next_group); - list_del(&list->next_group); - list_del(&list->next_member); - kfree(list); - } - - while (!list_empty(&qgroup->members)) { - list = list_first_entry(&qgroup->members, - struct btrfs_qgroup_list, - next_member); - list_del(&list->next_group); - list_del(&list->next_member); - kfree(list); - } - kfree(qgroup); + __del_qgroup_rb(qgroup); } + /* + * we call btrfs_free_qgroup_config() when umounting + * filesystem and disabling quota, so we set qgroup_ulit + * to be null here to avoid double free. + */ + ulist_free(fs_info->qgroup_ulist); + fs_info->qgroup_ulist = NULL; } static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, @@ -819,6 +812,12 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, goto out; } + fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS); + if (!fs_info->qgroup_ulist) { + ret = -ENOMEM; + goto out; + } + /* * initially create the quota tree */ @@ -916,6 +915,10 @@ out_free_root: kfree(quota_root); } out: + if (ret) { + ulist_free(fs_info->qgroup_ulist); + fs_info->qgroup_ulist = NULL; + } mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } @@ -935,13 +938,9 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, fs_info->pending_quota_state = 0; quota_root = fs_info->quota_root; fs_info->quota_root = NULL; - btrfs_free_qgroup_config(fs_info); spin_unlock(&fs_info->qgroup_lock); - if (!quota_root) { - ret = -EINVAL; - goto out; - } + btrfs_free_qgroup_config(fs_info); ret = btrfs_clean_quota_tree(trans, quota_root); if (ret) @@ -1163,7 +1162,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, if (ret) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; printk(KERN_INFO "unable to update quota limit for %llu\n", - (unsigned long long)qgroupid); + qgroupid); } spin_lock(&fs_info->qgroup_lock); @@ -1355,7 +1354,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, u64 ref_root; struct btrfs_qgroup *qgroup; struct ulist *roots = NULL; - struct ulist *tmp = NULL; u64 seq; int ret = 0; int sgn; @@ -1428,14 +1426,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, if (ret < 0) return ret; - mutex_lock(&fs_info->qgroup_rescan_lock); spin_lock(&fs_info->qgroup_lock); - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { - if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) { - ret = 0; - goto unlock; - } - } quota_root = fs_info->quota_root; if (!quota_root) @@ -1448,39 +1439,34 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, /* * step 1: for each old ref, visit all nodes once and inc refcnt */ - tmp = ulist_alloc(GFP_ATOMIC); - if (!tmp) { - ret = -ENOMEM; - goto unlock; - } + ulist_reinit(fs_info->qgroup_ulist); seq = fs_info->qgroup_seq; fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ - ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq); + ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist, + seq); if (ret) goto unlock; /* * step 2: walk from the new root */ - ret = qgroup_account_ref_step2(fs_info, roots, tmp, seq, sgn, - node->num_bytes, qgroup); + ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist, + seq, sgn, node->num_bytes, qgroup); if (ret) goto unlock; /* * step 3: walk again from old refs */ - ret = qgroup_account_ref_step3(fs_info, roots, tmp, seq, sgn, - node->num_bytes); + ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist, + seq, sgn, node->num_bytes); if (ret) goto unlock; unlock: spin_unlock(&fs_info->qgroup_lock); - mutex_unlock(&fs_info->qgroup_rescan_lock); ulist_free(roots); - ulist_free(tmp); return ret; } @@ -1527,9 +1513,12 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; if (!ret && start_rescan_worker) { - ret = btrfs_qgroup_rescan(fs_info); - if (ret) - pr_err("btrfs: start rescan quota failed: %d\n", ret); + ret = qgroup_rescan_init(fs_info, 0, 1); + if (!ret) { + qgroup_rescan_zero_tracking(fs_info); + btrfs_queue_worker(&fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); + } ret = 0; } @@ -1720,7 +1709,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) struct btrfs_fs_info *fs_info = root->fs_info; u64 ref_root = root->root_key.objectid; int ret = 0; - struct ulist *ulist = NULL; struct ulist_node *unode; struct ulist_iterator uiter; @@ -1743,17 +1731,13 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) * in a first step, we check all affected qgroups if any limits would * be exceeded */ - ulist = ulist_alloc(GFP_ATOMIC); - if (!ulist) { - ret = -ENOMEM; - goto out; - } - ret = ulist_add(ulist, qgroup->qgroupid, + ulist_reinit(fs_info->qgroup_ulist); + ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); if (ret < 0) goto out; ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(ulist, &uiter))) { + while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; @@ -1774,7 +1758,8 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) } list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(ulist, glist->group->qgroupid, + ret = ulist_add(fs_info->qgroup_ulist, + glist->group->qgroupid, (uintptr_t)glist->group, GFP_ATOMIC); if (ret < 0) goto out; @@ -1785,7 +1770,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) * no limits exceeded, now record the reservation into all qgroups */ ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(ulist, &uiter))) { + while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { struct btrfs_qgroup *qg; qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; @@ -1795,8 +1780,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) out: spin_unlock(&fs_info->qgroup_lock); - ulist_free(ulist); - return ret; } @@ -1805,7 +1788,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; struct btrfs_fs_info *fs_info = root->fs_info; - struct ulist *ulist = NULL; struct ulist_node *unode; struct ulist_iterator uiter; u64 ref_root = root->root_key.objectid; @@ -1827,17 +1809,13 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) if (!qgroup) goto out; - ulist = ulist_alloc(GFP_ATOMIC); - if (!ulist) { - btrfs_std_error(fs_info, -ENOMEM); - goto out; - } - ret = ulist_add(ulist, qgroup->qgroupid, + ulist_reinit(fs_info->qgroup_ulist); + ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); if (ret < 0) goto out; ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(ulist, &uiter))) { + while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; @@ -1846,7 +1824,8 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) qg->reserved -= num_bytes; list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(ulist, glist->group->qgroupid, + ret = ulist_add(fs_info->qgroup_ulist, + glist->group->qgroupid, (uintptr_t)glist->group, GFP_ATOMIC); if (ret < 0) goto out; @@ -1855,7 +1834,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) out: spin_unlock(&fs_info->qgroup_lock); - ulist_free(ulist); } void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) @@ -1874,12 +1852,11 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared. */ static int -qgroup_rescan_leaf(struct qgroup_rescan *qscan, struct btrfs_path *path, +qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_trans_handle *trans, struct ulist *tmp, struct extent_buffer *scratch_leaf) { struct btrfs_key found; - struct btrfs_fs_info *fs_info = qscan->fs_info; struct ulist *roots = NULL; struct ulist_node *unode; struct ulist_iterator uiter; @@ -1895,10 +1872,9 @@ qgroup_rescan_leaf(struct qgroup_rescan *qscan, struct btrfs_path *path, path, 1, 0); pr_debug("current progress key (%llu %u %llu), search_slot ret %d\n", - (unsigned long long)fs_info->qgroup_rescan_progress.objectid, + fs_info->qgroup_rescan_progress.objectid, fs_info->qgroup_rescan_progress.type, - (unsigned long long)fs_info->qgroup_rescan_progress.offset, - ret); + fs_info->qgroup_rescan_progress.offset, ret); if (ret) { /* @@ -2007,11 +1983,10 @@ out: static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) { - struct qgroup_rescan *qscan = container_of(work, struct qgroup_rescan, - work); + struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, + qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - struct btrfs_fs_info *fs_info = qscan->fs_info; struct ulist *tmp = NULL; struct extent_buffer *scratch_leaf = NULL; int err = -ENOMEM; @@ -2036,7 +2011,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) if (!fs_info->quota_enabled) { err = -EINTR; } else { - err = qgroup_rescan_leaf(qscan, path, trans, + err = qgroup_rescan_leaf(fs_info, path, trans, tmp, scratch_leaf); } if (err > 0) @@ -2049,7 +2024,6 @@ out: kfree(scratch_leaf); ulist_free(tmp); btrfs_free_path(path); - kfree(qscan); mutex_lock(&fs_info->qgroup_rescan_lock); fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; @@ -2068,47 +2042,74 @@ out: } else { pr_err("btrfs: qgroup scan failed with %d\n", err); } -} - -static void -qgroup_rescan_start(struct btrfs_fs_info *fs_info, struct qgroup_rescan *qscan) -{ - memset(&qscan->work, 0, sizeof(qscan->work)); - qscan->work.func = btrfs_qgroup_rescan_worker; - qscan->fs_info = fs_info; - pr_info("btrfs: qgroup scan started\n"); - btrfs_queue_worker(&fs_info->qgroup_rescan_workers, &qscan->work); + complete_all(&fs_info->qgroup_rescan_completion); } -int -btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) +/* + * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all + * memory required for the rescan context. + */ +static int +qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, + int init_flags) { int ret = 0; - struct rb_node *n; - struct btrfs_qgroup *qgroup; - struct qgroup_rescan *qscan = kmalloc(sizeof(*qscan), GFP_NOFS); - if (!qscan) - return -ENOMEM; + if (!init_flags && + (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) || + !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) { + ret = -EINVAL; + goto err; + } mutex_lock(&fs_info->qgroup_rescan_lock); spin_lock(&fs_info->qgroup_lock); - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) - ret = -EINPROGRESS; - else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) - ret = -EINVAL; - if (ret) { - spin_unlock(&fs_info->qgroup_lock); - mutex_unlock(&fs_info->qgroup_rescan_lock); - kfree(qscan); - return ret; + + if (init_flags) { + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) + ret = -EINPROGRESS; + else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) + ret = -EINVAL; + + if (ret) { + spin_unlock(&fs_info->qgroup_lock); + mutex_unlock(&fs_info->qgroup_rescan_lock); + goto err; + } + + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; } - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; memset(&fs_info->qgroup_rescan_progress, 0, sizeof(fs_info->qgroup_rescan_progress)); + fs_info->qgroup_rescan_progress.objectid = progress_objectid; + + spin_unlock(&fs_info->qgroup_lock); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + init_completion(&fs_info->qgroup_rescan_completion); + + memset(&fs_info->qgroup_rescan_work, 0, + sizeof(fs_info->qgroup_rescan_work)); + fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker; + + if (ret) { +err: + pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret); + return ret; + } + + return 0; +} +static void +qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) +{ + struct rb_node *n; + struct btrfs_qgroup *qgroup; + + spin_lock(&fs_info->qgroup_lock); /* clear all current qgroup tracking information */ for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { qgroup = rb_entry(n, struct btrfs_qgroup, node); @@ -2118,9 +2119,74 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) qgroup->excl_cmpr = 0; } spin_unlock(&fs_info->qgroup_lock); - mutex_unlock(&fs_info->qgroup_rescan_lock); +} + +int +btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) +{ + int ret = 0; + struct btrfs_trans_handle *trans; + + ret = qgroup_rescan_init(fs_info, 0, 1); + if (ret) + return ret; + + /* + * We have set the rescan_progress to 0, which means no more + * delayed refs will be accounted by btrfs_qgroup_account_ref. + * However, btrfs_qgroup_account_ref may be right after its call + * to btrfs_find_all_roots, in which case it would still do the + * accounting. + * To solve this, we're committing the transaction, which will + * ensure we run all delayed refs and only after that, we are + * going to clear all tracking information for a clean start. + */ + + trans = btrfs_join_transaction(fs_info->fs_root); + if (IS_ERR(trans)) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, fs_info->fs_root); + if (ret) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + return ret; + } - qgroup_rescan_start(fs_info, qscan); + qgroup_rescan_zero_tracking(fs_info); + + btrfs_queue_worker(&fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); return 0; } + +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info) +{ + int running; + int ret = 0; + + mutex_lock(&fs_info->qgroup_rescan_lock); + spin_lock(&fs_info->qgroup_lock); + running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN; + spin_unlock(&fs_info->qgroup_lock); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + if (running) + ret = wait_for_completion_interruptible( + &fs_info->qgroup_rescan_completion); + + return ret; +} + +/* + * this is only called from open_ctree where we're still single threaded, thus + * locking is omitted here. + */ +void +btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) +{ + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) + btrfs_queue_worker(&fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); +} diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0525e1389f5b..d0ecfbd9cc9f 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1540,8 +1540,10 @@ static int full_stripe_write(struct btrfs_raid_bio *rbio) int ret; ret = alloc_rbio_parity_pages(rbio); - if (ret) + if (ret) { + __free_raid_bio(rbio); return ret; + } ret = lock_stripe_add(rbio); if (ret == 0) @@ -1687,11 +1689,8 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, struct blk_plug_cb *cb; rbio = alloc_rbio(root, bbio, raid_map, stripe_len); - if (IS_ERR(rbio)) { - kfree(raid_map); - kfree(bbio); + if (IS_ERR(rbio)) return PTR_ERR(rbio); - } bio_list_add(&rbio->bio_list, bio); rbio->bio_list_bytes = bio->bi_size; @@ -2041,9 +2040,8 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, int ret; rbio = alloc_rbio(root, bbio, raid_map, stripe_len); - if (IS_ERR(rbio)) { + if (IS_ERR(rbio)) return PTR_ERR(rbio); - } rbio->read_rebuild = 1; bio_list_add(&rbio->bio_list, bio); @@ -2052,6 +2050,8 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { BUG(); + kfree(raid_map); + kfree(bbio); kfree(rbio); return -EIO; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 395b82031a42..a5a26320503f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -335,7 +335,7 @@ static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr) if (bnode->root) fs_info = bnode->root->fs_info; btrfs_panic(fs_info, errno, "Inconsistency in backref cache " - "found at offset %llu\n", (unsigned long long)bytenr); + "found at offset %llu\n", bytenr); } /* @@ -641,6 +641,11 @@ int find_inline_backref(struct extent_buffer *leaf, int slot, WARN_ON(item_size < sizeof(*ei) + sizeof(*bi)); return 1; } + if (key.type == BTRFS_METADATA_ITEM_KEY && + item_size <= sizeof(*ei)) { + WARN_ON(item_size < sizeof(*ei)); + return 1; + } if (key.type == BTRFS_EXTENT_ITEM_KEY) { bi = (struct btrfs_tree_block_info *)(ei + 1); @@ -691,6 +696,7 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, int cowonly; int ret; int err = 0; + bool need_check = true; path1 = btrfs_alloc_path(); path2 = btrfs_alloc_path(); @@ -914,6 +920,7 @@ again: cur->bytenr); lower = cur; + need_check = true; for (; level < BTRFS_MAX_LEVEL; level++) { if (!path2->nodes[level]) { BUG_ON(btrfs_root_bytenr(&root->root_item) != @@ -957,14 +964,12 @@ again: /* * add the block to pending list if we - * need check its backrefs. only block - * at 'cur->level + 1' is added to the - * tail of pending list. this guarantees - * we check backrefs from lower level - * blocks to upper level blocks. + * need check its backrefs, we only do this once + * while walking up a tree as we will catch + * anything else later on. */ - if (!upper->checked && - level == cur->level + 1) { + if (!upper->checked && need_check) { + need_check = false; list_add_tail(&edge->list[UPPER], &list); } else @@ -1305,6 +1310,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, struct extent_buffer *eb; struct btrfs_root_item *root_item; struct btrfs_key root_key; + u64 last_snap = 0; int ret; root_item = kmalloc(sizeof(*root_item), GFP_NOFS); @@ -1320,6 +1326,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, BTRFS_TREE_RELOC_OBJECTID); BUG_ON(ret); + last_snap = btrfs_root_last_snapshot(&root->root_item); btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1); } else { @@ -1345,6 +1352,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); root_item->drop_level = 0; + /* + * abuse rtransid, it is safe because it is impossible to + * receive data into a relocation tree. + */ + btrfs_set_root_rtransid(root_item, last_snap); + btrfs_set_root_otransid(root_item, trans->transid); } btrfs_tree_unlock(eb); @@ -1355,8 +1368,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, BUG_ON(ret); kfree(root_item); - reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, - &root_key); + reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key); BUG_ON(IS_ERR(reloc_root)); reloc_root->last_trans = trans->transid; return reloc_root; @@ -1536,7 +1548,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, btrfs_file_extent_other_encoding(leaf, fi)); if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { - ret = 1; + ret = -EINVAL; goto out; } @@ -1567,7 +1579,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, u64 end; u32 nritems; u32 i; - int ret; + int ret = 0; int first = 1; int dirty = 0; @@ -1630,11 +1642,13 @@ int replace_file_extents(struct btrfs_trans_handle *trans, ret = get_new_location(rc->data_inode, &new_bytenr, bytenr, num_bytes); - if (ret > 0) { - WARN_ON(1); - continue; + if (ret) { + /* + * Don't have to abort since we've not changed anything + * in the file extent yet. + */ + break; } - BUG_ON(ret < 0); btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); dirty = 1; @@ -1644,18 +1658,24 @@ int replace_file_extents(struct btrfs_trans_handle *trans, num_bytes, parent, btrfs_header_owner(leaf), key.objectid, key.offset, 1); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + break; + } ret = btrfs_free_extent(trans, root, bytenr, num_bytes, parent, btrfs_header_owner(leaf), key.objectid, key.offset, 1); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + break; + } } if (dirty) btrfs_mark_buffer_dirty(leaf); if (inode) btrfs_add_delayed_iput(inode); - return 0; + return ret; } static noinline_for_stack @@ -2273,8 +2293,12 @@ void free_reloc_roots(struct list_head *list) static noinline_for_stack int merge_reloc_roots(struct reloc_control *rc) { + struct btrfs_trans_handle *trans; struct btrfs_root *root; struct btrfs_root *reloc_root; + u64 last_snap; + u64 otransid; + u64 objectid; LIST_HEAD(reloc_roots); int found = 0; int ret = 0; @@ -2303,17 +2327,51 @@ again: BUG_ON(root->reloc_root != reloc_root); ret = merge_reloc_root(rc, root); - if (ret) + if (ret) { + __update_reloc_root(reloc_root, 1); + free_extent_buffer(reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + kfree(reloc_root); goto out; + } } else { list_del_init(&reloc_root->root_list); } + + /* + * we keep the old last snapshod transid in rtranid when we + * created the relocation tree. + */ + last_snap = btrfs_root_rtransid(&reloc_root->root_item); + otransid = btrfs_root_otransid(&reloc_root->root_item); + objectid = reloc_root->root_key.offset; + ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); if (ret < 0) { if (list_empty(&reloc_root->root_list)) list_add_tail(&reloc_root->root_list, &reloc_roots); goto out; + } else if (!ret) { + /* + * recover the last snapshot tranid to avoid + * the space balance break NOCOW. + */ + root = read_fs_root(rc->extent_root->fs_info, + objectid); + if (IS_ERR(root)) + continue; + + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + + /* Check if the fs/file tree was snapshoted or not. */ + if (btrfs_root_last_snapshot(&root->root_item) == + otransid - 1) + btrfs_set_root_last_snapshot(&root->root_item, + last_snap); + + btrfs_end_transaction(trans, root); } } @@ -3266,6 +3324,8 @@ static int __add_tree_block(struct reloc_control *rc, struct btrfs_path *path; struct btrfs_key key; int ret; + bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info, + SKINNY_METADATA); if (tree_block_processed(bytenr, blocksize, rc)) return 0; @@ -3276,10 +3336,15 @@ static int __add_tree_block(struct reloc_control *rc, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - +again: key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = blocksize; + if (skinny) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = (u64)-1; + } else { + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = blocksize; + } path->search_commit_root = 1; path->skip_locking = 1; @@ -3287,11 +3352,23 @@ static int __add_tree_block(struct reloc_control *rc, if (ret < 0) goto out; - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (ret > 0) { - if (key.objectid == bytenr && - key.type == BTRFS_METADATA_ITEM_KEY) - ret = 0; + if (ret > 0 && skinny) { + if (path->slots[0]) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + (key.type == BTRFS_METADATA_ITEM_KEY || + (key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == blocksize))) + ret = 0; + } + + if (ret) { + skinny = false; + btrfs_release_path(path); + goto again; + } } BUG_ON(ret); @@ -3566,7 +3643,7 @@ int add_data_references(struct reloc_control *rc, unsigned long ptr; unsigned long end; u32 blocksize = btrfs_level_size(rc->extent_root, 0); - int ret; + int ret = 0; int err = 0; eb = path->nodes[0]; @@ -3593,6 +3670,10 @@ int add_data_references(struct reloc_control *rc, } else { BUG(); } + if (ret) { + err = ret; + goto out; + } ptr += btrfs_extent_inline_ref_size(key.type); } WARN_ON(ptr > end); @@ -3638,6 +3719,7 @@ int add_data_references(struct reloc_control *rc, } path->slots[0]++; } +out: btrfs_release_path(path); if (err) free_block_list(blocks); @@ -4082,7 +4164,7 @@ out: return inode; } -static struct reloc_control *alloc_reloc_control(void) +static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) { struct reloc_control *rc; @@ -4093,7 +4175,8 @@ static struct reloc_control *alloc_reloc_control(void) INIT_LIST_HEAD(&rc->reloc_roots); backref_cache_init(&rc->backref_cache); mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(&rc->processed_blocks, NULL); + extent_io_tree_init(&rc->processed_blocks, + fs_info->btree_inode->i_mapping); return rc; } @@ -4110,7 +4193,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) int rw = 0; int err = 0; - rc = alloc_reloc_control(); + rc = alloc_reloc_control(fs_info); if (!rc) return -ENOMEM; @@ -4156,15 +4239,14 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) } printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", - (unsigned long long)rc->block_group->key.objectid, - (unsigned long long)rc->block_group->flags); + rc->block_group->key.objectid, rc->block_group->flags); - ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0); + ret = btrfs_start_all_delalloc_inodes(fs_info, 0); if (ret < 0) { err = ret; goto out; } - btrfs_wait_ordered_extents(fs_info->tree_root, 0); + btrfs_wait_all_ordered_extents(fs_info); while (1) { mutex_lock(&fs_info->cleaner_mutex); @@ -4179,7 +4261,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) break; printk(KERN_INFO "btrfs: found %llu extents\n", - (unsigned long long)rc->extents_found); + rc->extents_found); if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); @@ -4276,7 +4358,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) key.type != BTRFS_ROOT_ITEM_KEY) break; - reloc_root = btrfs_read_fs_root_no_radix(root, &key); + reloc_root = btrfs_read_fs_root(root, &key); if (IS_ERR(reloc_root)) { err = PTR_ERR(reloc_root); goto out; @@ -4311,7 +4393,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) if (list_empty(&reloc_roots)) goto out; - rc = alloc_reloc_control(); + rc = alloc_reloc_control(root->fs_info); if (!rc) { err = -ENOMEM; goto out; @@ -4395,10 +4477,8 @@ out: int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) { struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_ordered_extent *ordered; struct btrfs_root *root = BTRFS_I(inode)->root; - size_t offset; int ret; u64 disk_bytenr; LIST_HEAD(list); @@ -4412,19 +4492,13 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) if (ret) goto out; + disk_bytenr = ordered->start; while (!list_empty(&list)) { sums = list_entry(list.next, struct btrfs_ordered_sum, list); list_del_init(&sums->list); - sector_sum = sums->sums; - sums->bytenr = ordered->start; - - offset = 0; - while (offset < sums->len) { - sector_sum->bytenr += ordered->start - disk_bytenr; - sector_sum++; - offset += root->sectorsize; - } + sums->bytenr = disk_bytenr; + disk_bytenr += sums->len; btrfs_add_ordered_sum(inode, ordered, sums); } @@ -4433,19 +4507,19 @@ out: return ret; } -void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf, - struct extent_buffer *cow) +int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow) { struct reloc_control *rc; struct backref_node *node; int first_cow = 0; int level; - int ret; + int ret = 0; rc = root->fs_info->reloc_ctl; if (!rc) - return; + return 0; BUG_ON(rc->stage == UPDATE_DATA_PTRS && root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); @@ -4481,10 +4555,9 @@ void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, rc->nodes_relocated += buf->len; } - if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) { + if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) ret = replace_file_extents(trans, rc, root, cow); - BUG_ON(ret); - } + return ret; } /* diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 5bf1ed57f178..0b1f4ef8db98 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -29,8 +29,8 @@ * generation numbers as then we know the root was once mounted with an older * kernel that was not aware of the root item structure change. */ -void btrfs_read_root_item(struct extent_buffer *eb, int slot, - struct btrfs_root_item *item) +static void btrfs_read_root_item(struct extent_buffer *eb, int slot, + struct btrfs_root_item *item) { uuid_le uuid; int len; @@ -64,52 +64,59 @@ void btrfs_read_root_item(struct extent_buffer *eb, int slot, } /* - * lookup the root with the highest offset for a given objectid. The key we do - * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 - * on error. + * btrfs_find_root - lookup the root by the key. + * root: the root of the root tree + * search_key: the key to search + * path: the path we search + * root_item: the root item of the tree we look for + * root_key: the reak key of the tree we look for + * + * If ->offset of 'seach_key' is -1ULL, it means we are not sure the offset + * of the search key, just lookup the root with the highest offset for a + * given objectid. + * + * If we find something return 0, otherwise > 0, < 0 on error. */ -int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, - struct btrfs_root_item *item, struct btrfs_key *key) +int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key, + struct btrfs_path *path, struct btrfs_root_item *root_item, + struct btrfs_key *root_key) { - struct btrfs_path *path; - struct btrfs_key search_key; struct btrfs_key found_key; struct extent_buffer *l; int ret; int slot; - search_key.objectid = objectid; - search_key.type = BTRFS_ROOT_ITEM_KEY; - search_key.offset = (u64)-1; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0); if (ret < 0) - goto out; + return ret; - BUG_ON(ret == 0); - if (path->slots[0] == 0) { - ret = 1; - goto out; + if (search_key->offset != -1ULL) { /* the search key is exact */ + if (ret > 0) + goto out; + } else { + BUG_ON(ret == 0); /* Logical error */ + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + ret = 0; } + l = path->nodes[0]; - slot = path->slots[0] - 1; + slot = path->slots[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); - if (found_key.objectid != objectid || + if (found_key.objectid != search_key->objectid || found_key.type != BTRFS_ROOT_ITEM_KEY) { ret = 1; goto out; } - if (item) - btrfs_read_root_item(l, slot, item); - if (key) - memcpy(key, &found_key, sizeof(found_key)); - ret = 0; + if (root_item) + btrfs_read_root_item(l, slot, root_item); + if (root_key) + memcpy(root_key, &found_key, sizeof(found_key)); out: - btrfs_free_path(path); + btrfs_release_path(path); return ret; } @@ -148,8 +155,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root if (ret != 0) { btrfs_print_leaf(root, path->nodes[0]); printk(KERN_CRIT "unable to update root key %llu %u %llu\n", - (unsigned long long)key->objectid, key->type, - (unsigned long long)key->offset); + key->objectid, key->type, key->offset); BUG_ON(1); } @@ -212,86 +218,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, return btrfs_insert_item(trans, root, key, item, sizeof(*item)); } -/* - * at mount time we want to find all the old transaction snapshots that were in - * the process of being deleted if we crashed. This is any root item with an - * offset lower than the latest root. They need to be queued for deletion to - * finish what was happening when we crashed. - */ -int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) -{ - struct btrfs_root *dead_root; - struct btrfs_root_item *ri; - struct btrfs_key key; - struct btrfs_key found_key; - struct btrfs_path *path; - int ret; - u32 nritems; - struct extent_buffer *leaf; - int slot; - - key.objectid = objectid; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); - key.offset = 0; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - -again: - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto err; - while (1) { - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - slot = path->slots[0]; - if (slot >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - slot = path->slots[0]; - } - btrfs_item_key_to_cpu(leaf, &key, slot); - if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) - goto next; - - if (key.objectid < objectid) - goto next; - - if (key.objectid > objectid) - break; - - ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item); - if (btrfs_disk_root_refs(leaf, ri) != 0) - goto next; - - memcpy(&found_key, &key, sizeof(key)); - key.offset++; - btrfs_release_path(path); - dead_root = - btrfs_read_fs_root_no_radix(root->fs_info->tree_root, - &found_key); - if (IS_ERR(dead_root)) { - ret = PTR_ERR(dead_root); - goto err; - } - - ret = btrfs_add_dead_root(dead_root); - if (ret) - goto err; - goto again; -next: - slot++; - path->slots[0]++; - } - ret = 0; -err: - btrfs_free_path(path); - return ret; -} - int btrfs_find_orphan_roots(struct btrfs_root *tree_root) { struct extent_buffer *leaf; @@ -301,6 +227,10 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) struct btrfs_root *root; int err = 0; int ret; + bool can_recover = true; + + if (tree_root->fs_info->sb->s_flags & MS_RDONLY) + can_recover = false; path = btrfs_alloc_path(); if (!path) @@ -340,20 +270,52 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) root_key.objectid = key.offset; key.offset++; - root = btrfs_read_fs_root_no_name(tree_root->fs_info, - &root_key); - if (!IS_ERR(root)) + root = btrfs_read_fs_root(tree_root, &root_key); + err = PTR_RET(root); + if (err && err != -ENOENT) { + break; + } else if (err == -ENOENT) { + struct btrfs_trans_handle *trans; + + btrfs_release_path(path); + + trans = btrfs_join_transaction(tree_root); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + btrfs_error(tree_root->fs_info, err, + "Failed to start trans to delete " + "orphan item"); + break; + } + err = btrfs_del_orphan_item(trans, tree_root, + root_key.objectid); + btrfs_end_transaction(trans, tree_root); + if (err) { + btrfs_error(tree_root->fs_info, err, + "Failed to delete root orphan " + "item"); + break; + } continue; + } - ret = PTR_ERR(root); - if (ret != -ENOENT) { - err = ret; + if (btrfs_root_refs(&root->root_item) == 0) { + btrfs_add_dead_root(root); + continue; + } + + err = btrfs_init_fs_root(root); + if (err) { + btrfs_free_fs_root(root); break; } - ret = btrfs_find_dead_roots(tree_root, root_key.objectid); - if (ret) { - err = ret; + root->orphan_item_inserted = 1; + + err = btrfs_insert_fs_root(root->fs_info, root); + if (err) { + BUG_ON(err == -EEXIST); + btrfs_free_fs_root(root); break; } } @@ -368,8 +330,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct btrfs_path *path; int ret; - struct btrfs_root_item *ri; - struct extent_buffer *leaf; path = btrfs_alloc_path(); if (!path) @@ -379,8 +339,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, goto out; BUG_ON(ret != 0); - leaf = path->nodes[0]; - ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); ret = btrfs_del_item(trans, root, path); out: @@ -531,13 +489,13 @@ again: */ void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item) { - u64 inode_flags = le64_to_cpu(root_item->inode.flags); + u64 inode_flags = btrfs_stack_inode_flags(&root_item->inode); if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) { inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT; - root_item->inode.flags = cpu_to_le64(inode_flags); - root_item->flags = 0; - root_item->byte_limit = 0; + btrfs_set_stack_inode_flags(&root_item->inode, inode_flags); + btrfs_set_root_flags(root_item, 0); + btrfs_set_root_limit(root_item, 0); } } @@ -548,8 +506,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct timespec ct = CURRENT_TIME; spin_lock(&root->root_item_lock); - item->ctransid = cpu_to_le64(trans->transid); - item->ctime.sec = cpu_to_le64(ct.tv_sec); - item->ctime.nsec = cpu_to_le32(ct.tv_nsec); + btrfs_set_root_ctransid(item, trans->transid); + btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec); + btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec); spin_unlock(&root->root_item_lock); } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 79bd479317cb..a18e0e23f6a6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -158,12 +158,20 @@ struct scrub_fixup_nodatasum { int mirror_num; }; +struct scrub_nocow_inode { + u64 inum; + u64 offset; + u64 root; + struct list_head list; +}; + struct scrub_copy_nocow_ctx { struct scrub_ctx *sctx; u64 logical; u64 len; int mirror_num; u64 physical_for_dev_replace; + struct list_head inodes; struct btrfs_work work; }; @@ -245,7 +253,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); static int write_page_nocow(struct scrub_ctx *sctx, u64 physical_for_dev_replace, struct page *page); static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, - void *ctx); + struct scrub_copy_nocow_ctx *ctx); static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, int mirror_num, u64 physical_for_dev_replace); static void copy_nocow_pages_worker(struct btrfs_work *work); @@ -754,8 +762,7 @@ out: num_uncorrectable_read_errors); printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", - (unsigned long long)fixup->logical, - rcu_str_deref(fixup->dev->name)); + fixup->logical, rcu_str_deref(fixup->dev->name)); } btrfs_free_path(path); @@ -1154,8 +1161,7 @@ corrected_error: spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR "btrfs: fixed up error at logical %llu on dev %s\n", - (unsigned long long)logical, - rcu_str_deref(dev->name)); + logical, rcu_str_deref(dev->name)); } } else { did_not_correct_error: @@ -1164,8 +1170,7 @@ did_not_correct_error: spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", - (unsigned long long)logical, - rcu_str_deref(dev->name)); + logical, rcu_str_deref(dev->name)); } out: @@ -1345,12 +1350,12 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, mapped_buffer = kmap_atomic(sblock->pagev[0]->page); h = (struct btrfs_header *)mapped_buffer; - if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) || + if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) || memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { sblock->header_error = 1; - } else if (generation != le64_to_cpu(h->generation)) { + } else if (generation != btrfs_stack_header_generation(h)) { sblock->header_error = 1; sblock->generation_error = 1; } @@ -1720,10 +1725,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * b) the page is already kmapped */ - if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr)) + if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h)) ++fail; - if (sblock->pagev[0]->generation != le64_to_cpu(h->generation)) + if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) ++fail; if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) @@ -1786,10 +1791,10 @@ static int scrub_checksum_super(struct scrub_block *sblock) s = (struct btrfs_super_block *)mapped_buffer; memcpy(on_disk_csum, s->csum, sctx->csum_size); - if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr)) + if (sblock->pagev[0]->logical != btrfs_super_bytenr(s)) ++fail_cor; - if (sblock->pagev[0]->generation != le64_to_cpu(s->generation)) + if (sblock->pagev[0]->generation != btrfs_super_generation(s)) ++fail_gen; if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) @@ -2126,8 +2131,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, u8 *csum) { struct btrfs_ordered_sum *sum = NULL; - int ret = 0; - unsigned long i; + unsigned long index; unsigned long num_sectors; while (!list_empty(&sctx->csum_list)) { @@ -2146,19 +2150,14 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, if (!sum) return 0; + index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize; num_sectors = sum->len / sctx->sectorsize; - for (i = 0; i < num_sectors; ++i) { - if (sum->sums[i].bytenr == logical) { - memcpy(csum, &sum->sums[i].sum, sctx->csum_size); - ret = 1; - break; - } - } - if (ret && i == num_sectors - 1) { + memcpy(csum, sum->sums + index, sctx->csum_size); + if (index == num_sectors - 1) { list_del(&sum->list); kfree(sum); } - return ret; + return 1; } /* scrub extent tries to collect up to 64 kB for each bio */ @@ -2461,8 +2460,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, printk(KERN_ERR "btrfs scrub: tree block %llu spanning " "stripes, ignored. logical=%llu\n", - (unsigned long long)key.objectid, - (unsigned long long)logical); + key.objectid, logical); goto next; } @@ -2501,10 +2499,11 @@ again: ret = scrub_extent(sctx, extent_logical, extent_len, extent_physical, extent_dev, flags, generation, extent_mirror_num, - extent_physical); + extent_logical - logical + physical); if (ret) goto out; + scrub_free_csums(sctx); if (extent_logical + extent_len < key.objectid + bytes) { logical += increment; @@ -2868,9 +2867,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { /* not supported for data w/o checksums */ printk(KERN_ERR - "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", - fs_info->chunk_root->sectorsize, - (unsigned long long)PAGE_SIZE); + "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails\n", + fs_info->chunk_root->sectorsize, PAGE_SIZE); return -EINVAL; } @@ -3136,12 +3134,30 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, nocow_ctx->mirror_num = mirror_num; nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; nocow_ctx->work.func = copy_nocow_pages_worker; + INIT_LIST_HEAD(&nocow_ctx->inodes); btrfs_queue_worker(&fs_info->scrub_nocow_workers, &nocow_ctx->work); return 0; } +static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct scrub_copy_nocow_ctx *nocow_ctx = ctx; + struct scrub_nocow_inode *nocow_inode; + + nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS); + if (!nocow_inode) + return -ENOMEM; + nocow_inode->inum = inum; + nocow_inode->offset = offset; + nocow_inode->root = root; + list_add_tail(&nocow_inode->list, &nocow_ctx->inodes); + return 0; +} + +#define COPY_COMPLETE 1 + static void copy_nocow_pages_worker(struct btrfs_work *work) { struct scrub_copy_nocow_ctx *nocow_ctx = @@ -3177,19 +3193,42 @@ static void copy_nocow_pages_worker(struct btrfs_work *work) } ret = iterate_inodes_from_logical(logical, fs_info, path, - copy_nocow_pages_for_inode, - nocow_ctx); + record_inode_for_nocow, nocow_ctx); if (ret != 0 && ret != -ENOENT) { - pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n", - (unsigned long long)logical, - (unsigned long long)physical_for_dev_replace, - (unsigned long long)len, - (unsigned long long)mirror_num, ret); + pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d\n", + logical, physical_for_dev_replace, len, mirror_num, + ret); not_written = 1; goto out; } + btrfs_end_transaction(trans, root); + trans = NULL; + while (!list_empty(&nocow_ctx->inodes)) { + struct scrub_nocow_inode *entry; + entry = list_first_entry(&nocow_ctx->inodes, + struct scrub_nocow_inode, + list); + list_del_init(&entry->list); + ret = copy_nocow_pages_for_inode(entry->inum, entry->offset, + entry->root, nocow_ctx); + kfree(entry); + if (ret == COPY_COMPLETE) { + ret = 0; + break; + } else if (ret) { + break; + } + } out: + while (!list_empty(&nocow_ctx->inodes)) { + struct scrub_nocow_inode *entry; + entry = list_first_entry(&nocow_ctx->inodes, + struct scrub_nocow_inode, + list); + list_del_init(&entry->list); + kfree(entry); + } if (trans && !IS_ERR(trans)) btrfs_end_transaction(trans, root); if (not_written) @@ -3202,18 +3241,25 @@ out: scrub_pending_trans_workers_dec(sctx); } -static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) +static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, + struct scrub_copy_nocow_ctx *nocow_ctx) { - unsigned long index; - struct scrub_copy_nocow_ctx *nocow_ctx = ctx; - int ret = 0; + struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; struct btrfs_key key; - struct inode *inode = NULL; + struct inode *inode; + struct page *page; struct btrfs_root *local_root; + struct btrfs_ordered_extent *ordered; + struct extent_map *em; + struct extent_state *cached_state = NULL; + struct extent_io_tree *io_tree; u64 physical_for_dev_replace; - u64 len; - struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; + u64 len = nocow_ctx->len; + u64 lockstart = offset, lockend = offset + len - 1; + unsigned long index; int srcu_index; + int ret = 0; + int err = 0; key.objectid = root; key.type = BTRFS_ROOT_ITEM_KEY; @@ -3235,19 +3281,45 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) if (IS_ERR(inode)) return PTR_ERR(inode); + /* Avoid truncate/dio/punch hole.. */ + mutex_lock(&inode->i_mutex); + inode_dio_wait(inode); + physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; - len = nocow_ctx->len; - while (len >= PAGE_CACHE_SIZE) { - struct page *page = NULL; - int ret_sub; + io_tree = &BTRFS_I(inode)->io_tree; - index = offset >> PAGE_CACHE_SHIFT; + lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, lockstart, len); + if (ordered) { + btrfs_put_ordered_extent(ordered); + goto out_unlock; + } + + em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_unlock; + } + /* + * This extent does not actually cover the logical extent anymore, + * move on to the next inode. + */ + if (em->block_start > nocow_ctx->logical || + em->block_start + em->block_len < nocow_ctx->logical + len) { + free_extent_map(em); + goto out_unlock; + } + free_extent_map(em); + + while (len >= PAGE_CACHE_SIZE) { + index = offset >> PAGE_CACHE_SHIFT; +again: page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); if (!page) { pr_err("find_or_create_page() failed\n"); ret = -ENOMEM; - goto next_page; + goto out; } if (PageUptodate(page)) { @@ -3255,39 +3327,53 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) goto next_page; } else { ClearPageError(page); - ret_sub = extent_read_full_page(&BTRFS_I(inode)-> - io_tree, - page, btrfs_get_extent, - nocow_ctx->mirror_num); - if (ret_sub) { - ret = ret_sub; + err = extent_read_full_page_nolock(io_tree, page, + btrfs_get_extent, + nocow_ctx->mirror_num); + if (err) { + ret = err; goto next_page; } - wait_on_page_locked(page); + + lock_page(page); + /* + * If the page has been remove from the page cache, + * the data on it is meaningless, because it may be + * old one, the new data may be written into the new + * page in the page cache. + */ + if (page->mapping != inode->i_mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } if (!PageUptodate(page)) { ret = -EIO; goto next_page; } } - ret_sub = write_page_nocow(nocow_ctx->sctx, - physical_for_dev_replace, page); - if (ret_sub) { - ret = ret_sub; - goto next_page; - } - + err = write_page_nocow(nocow_ctx->sctx, + physical_for_dev_replace, page); + if (err) + ret = err; next_page: - if (page) { - unlock_page(page); - put_page(page); - } + unlock_page(page); + page_cache_release(page); + + if (ret) + break; + offset += PAGE_CACHE_SIZE; physical_for_dev_replace += PAGE_CACHE_SIZE; len -= PAGE_CACHE_SIZE; } - - if (inode) - iput(inode); + ret = COPY_COMPLETE; +out_unlock: + unlock_extent_cached(io_tree, lockstart, lockend, &cached_state, + GFP_NOFS); +out: + mutex_unlock(&inode->i_mutex); + iput(inode); return ret; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index ff40f1c00ce3..e46e0ed74925 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -26,6 +26,7 @@ #include <linux/radix-tree.h> #include <linux/crc32c.h> #include <linux/vmalloc.h> +#include <linux/string.h> #include "send.h" #include "backref.h" @@ -54,8 +55,8 @@ struct fs_path { char *buf; int buf_len; - int reversed:1; - int virtual_mem:1; + unsigned int reversed:1; + unsigned int virtual_mem:1; char inline_buf[]; }; char pad[PAGE_SIZE]; @@ -158,7 +159,7 @@ static void fs_path_reset(struct fs_path *p) } } -static struct fs_path *fs_path_alloc(struct send_ctx *sctx) +static struct fs_path *fs_path_alloc(void) { struct fs_path *p; @@ -173,11 +174,11 @@ static struct fs_path *fs_path_alloc(struct send_ctx *sctx) return p; } -static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx) +static struct fs_path *fs_path_alloc_reversed(void) { struct fs_path *p; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return NULL; p->reversed = 1; @@ -185,7 +186,7 @@ static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx) return p; } -static void fs_path_free(struct send_ctx *sctx, struct fs_path *p) +static void fs_path_free(struct fs_path *p) { if (!p) return; @@ -219,7 +220,7 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) len = PAGE_ALIGN(len); if (p->buf == p->inline_buf) { - tmp_buf = kmalloc(len, GFP_NOFS); + tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN); if (!tmp_buf) { tmp_buf = vmalloc(len); if (!tmp_buf) @@ -753,8 +754,7 @@ typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index, * * path must point to the INODE_REF or INODE_EXTREF when called. */ -static int iterate_inode_ref(struct send_ctx *sctx, - struct btrfs_root *root, struct btrfs_path *path, +static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *found_key, int resolve, iterate_inode_ref_t iterate, void *ctx) { @@ -777,13 +777,13 @@ static int iterate_inode_ref(struct send_ctx *sctx, unsigned long elem_size; unsigned long ptr; - p = fs_path_alloc_reversed(sctx); + p = fs_path_alloc_reversed(); if (!p) return -ENOMEM; tmp_path = alloc_path_for_send(); if (!tmp_path) { - fs_path_free(sctx, p); + fs_path_free(p); return -ENOMEM; } @@ -858,7 +858,7 @@ static int iterate_inode_ref(struct send_ctx *sctx, out: btrfs_free_path(tmp_path); - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -874,8 +874,7 @@ typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key, * * path must point to the dir item when called. */ -static int iterate_dir_item(struct send_ctx *sctx, - struct btrfs_root *root, struct btrfs_path *path, +static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *found_key, iterate_dir_item_t iterate, void *ctx) { @@ -990,7 +989,7 @@ static int __copy_first_ref(int num, u64 dir, int index, * Retrieve the first path of an inode. If an inode has more then one * ref/hardlink, this is ignored. */ -static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root, +static int get_inode_path(struct btrfs_root *root, u64 ino, struct fs_path *path) { int ret; @@ -1022,8 +1021,8 @@ static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root, goto out; } - ret = iterate_inode_ref(sctx, root, p, &found_key, 1, - __copy_first_ref, path); + ret = iterate_inode_ref(root, p, &found_key, 1, + __copy_first_ref, path); if (ret < 0) goto out; ret = 0; @@ -1314,8 +1313,7 @@ out: return ret; } -static int read_symlink(struct send_ctx *sctx, - struct btrfs_root *root, +static int read_symlink(struct btrfs_root *root, u64 ino, struct fs_path *dest) { @@ -1562,8 +1560,7 @@ out: * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, * generation of the parent dir and the name of the dir entry. */ -static int get_first_ref(struct send_ctx *sctx, - struct btrfs_root *root, u64 ino, +static int get_first_ref(struct btrfs_root *root, u64 ino, u64 *dir, u64 *dir_gen, struct fs_path *name) { int ret; @@ -1628,8 +1625,7 @@ out: return ret; } -static int is_first_ref(struct send_ctx *sctx, - struct btrfs_root *root, +static int is_first_ref(struct btrfs_root *root, u64 ino, u64 dir, const char *name, int name_len) { @@ -1638,11 +1634,11 @@ static int is_first_ref(struct send_ctx *sctx, u64 tmp_dir; u64 tmp_dir_gen; - tmp_name = fs_path_alloc(sctx); + tmp_name = fs_path_alloc(); if (!tmp_name) return -ENOMEM; - ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); + ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); if (ret < 0) goto out; @@ -1654,7 +1650,7 @@ static int is_first_ref(struct send_ctx *sctx, ret = !memcmp(tmp_name->start, name, name_len); out: - fs_path_free(sctx, tmp_name); + fs_path_free(tmp_name); return ret; } @@ -1673,6 +1669,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, u64 *who_ino, u64 *who_gen) { int ret = 0; + u64 gen; u64 other_inode = 0; u8 other_type = 0; @@ -1683,6 +1680,24 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, if (ret <= 0) goto out; + /* + * If we have a parent root we need to verify that the parent dir was + * not delted and then re-created, if it was then we have no overwrite + * and we can just unlink this entry. + */ + if (sctx->parent_root) { + ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, + NULL, NULL, NULL); + if (ret < 0 && ret != -ENOENT) + goto out; + if (ret) { + ret = 0; + goto out; + } + if (gen != dir_gen) + goto out; + } + ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, &other_inode, &other_type); if (ret < 0 && ret != -ENOENT) @@ -1783,11 +1798,11 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) if (!sctx->parent_root) goto out; - name = fs_path_alloc(sctx); + name = fs_path_alloc(); if (!name) return -ENOMEM; - ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name); + ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name); if (ret < 0) goto out; @@ -1795,7 +1810,7 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) name->start, fs_path_len(name)); out: - fs_path_free(sctx, name); + fs_path_free(name); return ret; } @@ -1979,11 +1994,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, * send_root or parent_root for ref lookup. */ if (ino < sctx->send_progress) - ret = get_first_ref(sctx, sctx->send_root, ino, - parent_ino, parent_gen, dest); + ret = get_first_ref(sctx->send_root, ino, + parent_ino, parent_gen, dest); else - ret = get_first_ref(sctx, sctx->parent_root, ino, - parent_ino, parent_gen, dest); + ret = get_first_ref(sctx->parent_root, ino, + parent_ino, parent_gen, dest); if (ret < 0) goto out; @@ -2070,7 +2085,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, u64 parent_gen = 0; int stop = 0; - name = fs_path_alloc(sctx); + name = fs_path_alloc(); if (!name) { ret = -ENOMEM; goto out; @@ -2098,7 +2113,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, } out: - fs_path_free(sctx, name); + fs_path_free(name); if (!ret) fs_path_unreverse(dest); return ret; @@ -2263,7 +2278,7 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -2281,7 +2296,7 @@ verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size); tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -2292,7 +2307,7 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -2310,7 +2325,7 @@ verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode); tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -2321,7 +2336,7 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -2340,7 +2355,7 @@ verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid); tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -2356,7 +2371,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) verbose_printk("btrfs: send_utimes %llu\n", ino); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -2397,7 +2412,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino); tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); btrfs_free_path(path); return ret; } @@ -2418,7 +2433,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) verbose_printk("btrfs: send_create_inode %llu\n", ino); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -2459,7 +2474,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino); if (S_ISLNK(mode)) { fs_path_reset(p); - ret = read_symlink(sctx, sctx->send_root, ino, p); + ret = read_symlink(sctx->send_root, ino, p); if (ret < 0) goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); @@ -2476,7 +2491,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino); tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -2524,7 +2539,8 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(eb, di, &di_key); - if (di_key.objectid < sctx->send_progress) { + if (di_key.type != BTRFS_ROOT_ITEM_KEY && + di_key.objectid < sctx->send_progress) { ret = 1; goto out; } @@ -2586,7 +2602,6 @@ static int record_ref(struct list_head *head, u64 dir, u64 dir_gen, struct fs_path *path) { struct recorded_ref *ref; - char *tmp; ref = kmalloc(sizeof(*ref), GFP_NOFS); if (!ref) @@ -2596,32 +2611,42 @@ static int record_ref(struct list_head *head, u64 dir, ref->dir_gen = dir_gen; ref->full_path = path; - tmp = strrchr(ref->full_path->start, '/'); - if (!tmp) { - ref->name_len = ref->full_path->end - ref->full_path->start; - ref->name = ref->full_path->start; + ref->name = (char *)kbasename(ref->full_path->start); + ref->name_len = ref->full_path->end - ref->name; + ref->dir_path = ref->full_path->start; + if (ref->name == ref->full_path->start) ref->dir_path_len = 0; - ref->dir_path = ref->full_path->start; - } else { - tmp++; - ref->name_len = ref->full_path->end - tmp; - ref->name = tmp; - ref->dir_path = ref->full_path->start; + else ref->dir_path_len = ref->full_path->end - ref->full_path->start - 1 - ref->name_len; - } list_add_tail(&ref->list, head); return 0; } -static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head) +static int dup_ref(struct recorded_ref *ref, struct list_head *list) +{ + struct recorded_ref *new; + + new = kmalloc(sizeof(*ref), GFP_NOFS); + if (!new) + return -ENOMEM; + + new->dir = ref->dir; + new->dir_gen = ref->dir_gen; + new->full_path = NULL; + INIT_LIST_HEAD(&new->list); + list_add_tail(&new->list, list); + return 0; +} + +static void __free_recorded_refs(struct list_head *head) { struct recorded_ref *cur; while (!list_empty(head)) { cur = list_entry(head->next, struct recorded_ref, list); - fs_path_free(sctx, cur->full_path); + fs_path_free(cur->full_path); list_del(&cur->list); kfree(cur); } @@ -2629,8 +2654,8 @@ static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head) static void free_recorded_refs(struct send_ctx *sctx) { - __free_recorded_refs(sctx, &sctx->new_refs); - __free_recorded_refs(sctx, &sctx->deleted_refs); + __free_recorded_refs(&sctx->new_refs); + __free_recorded_refs(&sctx->deleted_refs); } /* @@ -2644,7 +2669,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen, int ret; struct fs_path *orphan; - orphan = fs_path_alloc(sctx); + orphan = fs_path_alloc(); if (!orphan) return -ENOMEM; @@ -2655,7 +2680,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen, ret = send_rename(sctx, path, orphan); out: - fs_path_free(sctx, orphan); + fs_path_free(orphan); return ret; } @@ -2729,9 +2754,7 @@ static int process_recorded_refs(struct send_ctx *sctx) int ret = 0; struct recorded_ref *cur; struct recorded_ref *cur2; - struct ulist *check_dirs = NULL; - struct ulist_iterator uit; - struct ulist_node *un; + struct list_head check_dirs; struct fs_path *valid_path = NULL; u64 ow_inode = 0; u64 ow_gen; @@ -2745,19 +2768,14 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * which is always '..' */ BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); + INIT_LIST_HEAD(&check_dirs); - valid_path = fs_path_alloc(sctx); + valid_path = fs_path_alloc(); if (!valid_path) { ret = -ENOMEM; goto out; } - check_dirs = ulist_alloc(GFP_NOFS); - if (!check_dirs) { - ret = -ENOMEM; - goto out; - } - /* * First, check if the first ref of the current inode was overwritten * before. If yes, we know that the current inode was already orphanized @@ -2843,9 +2861,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); if (ret < 0) goto out; if (ret) { - ret = is_first_ref(sctx, sctx->parent_root, - ow_inode, cur->dir, cur->name, - cur->name_len); + ret = is_first_ref(sctx->parent_root, + ow_inode, cur->dir, cur->name, + cur->name_len); if (ret < 0) goto out; if (ret) { @@ -2894,8 +2912,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); goto out; } } - ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, - GFP_NOFS); + ret = dup_ref(cur, &check_dirs); if (ret < 0) goto out; } @@ -2923,8 +2940,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); } list_for_each_entry(cur, &sctx->deleted_refs, list) { - ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, - GFP_NOFS); + ret = dup_ref(cur, &check_dirs); if (ret < 0) goto out; } @@ -2935,8 +2951,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); */ cur = list_entry(sctx->deleted_refs.next, struct recorded_ref, list); - ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, - GFP_NOFS); + ret = dup_ref(cur, &check_dirs); if (ret < 0) goto out; } else if (!S_ISDIR(sctx->cur_inode_mode)) { @@ -2956,12 +2971,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); if (ret < 0) goto out; } - ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, - GFP_NOFS); + ret = dup_ref(cur, &check_dirs); if (ret < 0) goto out; } - /* * If the inode is still orphan, unlink the orphan. This may * happen when a previous inode did overwrite the first ref @@ -2983,33 +2996,32 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * deletion and if it's finally possible to perform the rmdir now. * We also update the inode stats of the parent dirs here. */ - ULIST_ITER_INIT(&uit); - while ((un = ulist_next(check_dirs, &uit))) { + list_for_each_entry(cur, &check_dirs, list) { /* * In case we had refs into dirs that were not processed yet, * we don't need to do the utime and rmdir logic for these dirs. * The dir will be processed later. */ - if (un->val > sctx->cur_ino) + if (cur->dir > sctx->cur_ino) continue; - ret = get_cur_inode_state(sctx, un->val, un->aux); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; if (ret == inode_state_did_create || ret == inode_state_no_change) { /* TODO delayed utimes */ - ret = send_utimes(sctx, un->val, un->aux); + ret = send_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; } else if (ret == inode_state_did_delete) { - ret = can_rmdir(sctx, un->val, sctx->cur_ino); + ret = can_rmdir(sctx, cur->dir, sctx->cur_ino); if (ret < 0) goto out; if (ret) { - ret = get_cur_path(sctx, un->val, un->aux, - valid_path); + ret = get_cur_path(sctx, cur->dir, + cur->dir_gen, valid_path); if (ret < 0) goto out; ret = send_rmdir(sctx, valid_path); @@ -3022,9 +3034,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); ret = 0; out: + __free_recorded_refs(&check_dirs); free_recorded_refs(sctx); - ulist_free(check_dirs); - fs_path_free(sctx, valid_path); + fs_path_free(valid_path); return ret; } @@ -3037,7 +3049,7 @@ static int __record_new_ref(int num, u64 dir, int index, struct fs_path *p; u64 gen; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3057,7 +3069,7 @@ static int __record_new_ref(int num, u64 dir, int index, out: if (ret) - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -3070,7 +3082,7 @@ static int __record_deleted_ref(int num, u64 dir, int index, struct fs_path *p; u64 gen; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3090,7 +3102,7 @@ static int __record_deleted_ref(int num, u64 dir, int index, out: if (ret) - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -3098,8 +3110,8 @@ static int record_new_ref(struct send_ctx *sctx) { int ret; - ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, __record_new_ref, sctx); + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, + sctx->cmp_key, 0, __record_new_ref, sctx); if (ret < 0) goto out; ret = 0; @@ -3112,8 +3124,8 @@ static int record_deleted_ref(struct send_ctx *sctx) { int ret; - ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, __record_deleted_ref, sctx); + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, + sctx->cmp_key, 0, __record_deleted_ref, sctx); if (ret < 0) goto out; ret = 0; @@ -3124,6 +3136,8 @@ out: struct find_ref_ctx { u64 dir; + u64 dir_gen; + struct btrfs_root *root; struct fs_path *name; int found_idx; }; @@ -3133,29 +3147,42 @@ static int __find_iref(int num, u64 dir, int index, void *ctx_) { struct find_ref_ctx *ctx = ctx_; + u64 dir_gen; + int ret; if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) && strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) { + /* + * To avoid doing extra lookups we'll only do this if everything + * else matches. + */ + ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL); + if (ret) + return ret; + if (dir_gen != ctx->dir_gen) + return 0; ctx->found_idx = num; return 1; } return 0; } -static int find_iref(struct send_ctx *sctx, - struct btrfs_root *root, +static int find_iref(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, - u64 dir, struct fs_path *name) + u64 dir, u64 dir_gen, struct fs_path *name) { int ret; struct find_ref_ctx ctx; ctx.dir = dir; ctx.name = name; + ctx.dir_gen = dir_gen; ctx.found_idx = -1; + ctx.root = root; - ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx); + ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx); if (ret < 0) return ret; @@ -3169,11 +3196,17 @@ static int __record_changed_new_ref(int num, u64 dir, int index, struct fs_path *name, void *ctx) { + u64 dir_gen; int ret; struct send_ctx *sctx = ctx; - ret = find_iref(sctx, sctx->parent_root, sctx->right_path, - sctx->cmp_key, dir, name); + ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL); + if (ret) + return ret; + + ret = find_iref(sctx->parent_root, sctx->right_path, + sctx->cmp_key, dir, dir_gen, name); if (ret == -ENOENT) ret = __record_new_ref(num, dir, index, name, sctx); else if (ret > 0) @@ -3186,11 +3219,17 @@ static int __record_changed_deleted_ref(int num, u64 dir, int index, struct fs_path *name, void *ctx) { + u64 dir_gen; int ret; struct send_ctx *sctx = ctx; - ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, - dir, name); + ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL); + if (ret) + return ret; + + ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key, + dir, dir_gen, name); if (ret == -ENOENT) ret = __record_deleted_ref(num, dir, index, name, sctx); else if (ret > 0) @@ -3203,11 +3242,11 @@ static int record_changed_ref(struct send_ctx *sctx) { int ret = 0; - ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, 0, __record_changed_new_ref, sctx); if (ret < 0) goto out; - ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key, 0, __record_changed_deleted_ref, sctx); if (ret < 0) goto out; @@ -3266,8 +3305,7 @@ static int process_all_refs(struct send_ctx *sctx, found_key.type != BTRFS_INODE_EXTREF_KEY)) break; - ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb, - sctx); + ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); btrfs_release_path(path); if (ret < 0) goto out; @@ -3335,7 +3373,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, struct fs_path *p; posix_acl_xattr_header dummy_acl; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3362,7 +3400,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, ret = send_set_xattr(sctx, p, name, name_len, data, data_len); out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -3375,7 +3413,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key, struct send_ctx *sctx = ctx; struct fs_path *p; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3386,7 +3424,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key, ret = send_remove_xattr(sctx, p, name, name_len); out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -3394,8 +3432,8 @@ static int process_new_xattr(struct send_ctx *sctx) { int ret = 0; - ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, - sctx->cmp_key, __process_new_xattr, sctx); + ret = iterate_dir_item(sctx->send_root, sctx->left_path, + sctx->cmp_key, __process_new_xattr, sctx); return ret; } @@ -3404,8 +3442,8 @@ static int process_deleted_xattr(struct send_ctx *sctx) { int ret; - ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, - sctx->cmp_key, __process_deleted_xattr, sctx); + ret = iterate_dir_item(sctx->parent_root, sctx->right_path, + sctx->cmp_key, __process_deleted_xattr, sctx); return ret; } @@ -3429,17 +3467,15 @@ static int __find_xattr(int num, struct btrfs_key *di_key, strncmp(name, ctx->name, name_len) == 0) { ctx->found_idx = num; ctx->found_data_len = data_len; - ctx->found_data = kmalloc(data_len, GFP_NOFS); + ctx->found_data = kmemdup(data, data_len, GFP_NOFS); if (!ctx->found_data) return -ENOMEM; - memcpy(ctx->found_data, data, data_len); return 1; } return 0; } -static int find_xattr(struct send_ctx *sctx, - struct btrfs_root *root, +static int find_xattr(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, const char *name, int name_len, @@ -3454,7 +3490,7 @@ static int find_xattr(struct send_ctx *sctx, ctx.found_data = NULL; ctx.found_data_len = 0; - ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx); + ret = iterate_dir_item(root, path, key, __find_xattr, &ctx); if (ret < 0) return ret; @@ -3480,9 +3516,9 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, char *found_data = NULL; int found_data_len = 0; - ret = find_xattr(sctx, sctx->parent_root, sctx->right_path, - sctx->cmp_key, name, name_len, &found_data, - &found_data_len); + ret = find_xattr(sctx->parent_root, sctx->right_path, + sctx->cmp_key, name, name_len, &found_data, + &found_data_len); if (ret == -ENOENT) { ret = __process_new_xattr(num, di_key, name, name_len, data, data_len, type, ctx); @@ -3508,8 +3544,8 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, int ret; struct send_ctx *sctx = ctx; - ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, - name, name_len, NULL, NULL); + ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key, + name, name_len, NULL, NULL); if (ret == -ENOENT) ret = __process_deleted_xattr(num, di_key, name, name_len, data, data_len, type, ctx); @@ -3523,11 +3559,11 @@ static int process_changed_xattr(struct send_ctx *sctx) { int ret = 0; - ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, + ret = iterate_dir_item(sctx->send_root, sctx->left_path, sctx->cmp_key, __process_changed_new_xattr, sctx); if (ret < 0) goto out; - ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, + ret = iterate_dir_item(sctx->parent_root, sctx->right_path, sctx->cmp_key, __process_changed_deleted_xattr, sctx); out: @@ -3572,8 +3608,8 @@ static int process_all_new_xattrs(struct send_ctx *sctx) goto out; } - ret = iterate_dir_item(sctx, root, path, &found_key, - __process_new_xattr, sctx); + ret = iterate_dir_item(root, path, &found_key, + __process_new_xattr, sctx); if (ret < 0) goto out; @@ -3598,7 +3634,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) int num_read = 0; mm_segment_t old_fs; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3640,7 +3676,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len); tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); set_fs(old_fs); if (ret < 0) return ret; @@ -3663,7 +3699,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " clone_root->root->objectid, clone_root->ino, clone_root->offset); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3686,8 +3722,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " goto out; ret = get_cur_path(sctx, clone_root->ino, gen, p); } else { - ret = get_inode_path(sctx, clone_root->root, - clone_root->ino, p); + ret = get_inode_path(clone_root->root, clone_root->ino, p); } if (ret < 0) goto out; @@ -3704,7 +3739,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -3717,7 +3752,7 @@ static int send_update_extent(struct send_ctx *sctx, int ret = 0; struct fs_path *p; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3737,7 +3772,7 @@ static int send_update_extent(struct send_ctx *sctx, tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -3879,7 +3914,8 @@ static int is_extent_unchanged(struct send_ctx *sctx, btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || found_key.type != key.type) { - ret = 0; + /* If we're a hole then just pretend nothing changed */ + ret = (left_disknr) ? 0 : 1; goto out; } @@ -3905,7 +3941,8 @@ static int is_extent_unchanged(struct send_ctx *sctx, * This may only happen on the first iteration. */ if (found_key.offset + right_len <= ekey->offset) { - ret = 0; + /* If we're a hole just pretend nothing changed */ + ret = (left_disknr) ? 0 : 1; goto out; } @@ -3970,8 +4007,8 @@ static int process_extent(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key) { - int ret = 0; struct clone_root *found_clone = NULL; + int ret = 0; if (S_ISLNK(sctx->cur_inode_mode)) return 0; @@ -3984,6 +4021,32 @@ static int process_extent(struct send_ctx *sctx, ret = 0; goto out; } + } else { + struct btrfs_file_extent_item *ei; + u8 type; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], ei); + if (type == BTRFS_FILE_EXTENT_PREALLOC || + type == BTRFS_FILE_EXTENT_REG) { + /* + * The send spec does not have a prealloc command yet, + * so just leave a hole for prealloc'ed extents until + * we have enough commands queued up to justify rev'ing + * the send spec. + */ + if (type == BTRFS_FILE_EXTENT_PREALLOC) { + ret = 0; + goto out; + } + + /* Have a hole, just skip it. */ + if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) { + ret = 0; + goto out; + } + } } ret = find_extent_clone(sctx, path, key->objectid, key->offset, @@ -4371,6 +4434,64 @@ static int changed_extent(struct send_ctx *sctx, return ret; } +static int dir_changed(struct send_ctx *sctx, u64 dir) +{ + u64 orig_gen, new_gen; + int ret; + + ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL, + NULL, NULL); + if (ret) + return ret; + + ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL, + NULL, NULL, NULL); + if (ret) + return ret; + + return (orig_gen != new_gen) ? 1 : 0; +} + +static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path, + struct btrfs_key *key) +{ + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + u64 dirid = 0, last_dirid = 0; + unsigned long ptr; + u32 item_size; + u32 cur_offset = 0; + int ref_name_len; + int ret = 0; + + /* Easy case, just check this one dirid */ + if (key->type == BTRFS_INODE_REF_KEY) { + dirid = key->offset; + + ret = dir_changed(sctx, dirid); + goto out; + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *)(ptr + + cur_offset); + dirid = btrfs_inode_extref_parent(leaf, extref); + ref_name_len = btrfs_inode_extref_name_len(leaf, extref); + cur_offset += ref_name_len + sizeof(*extref); + if (dirid == last_dirid) + continue; + ret = dir_changed(sctx, dirid); + if (ret) + break; + last_dirid = dirid; + } +out: + return ret; +} + /* * Updates compare related fields in sctx and simply forwards to the actual * changed_xxx functions. @@ -4386,6 +4507,19 @@ static int changed_cb(struct btrfs_root *left_root, int ret = 0; struct send_ctx *sctx = ctx; + if (result == BTRFS_COMPARE_TREE_SAME) { + if (key->type != BTRFS_INODE_REF_KEY && + key->type != BTRFS_INODE_EXTREF_KEY) + return 0; + ret = compare_refs(sctx, left_path, key); + if (!ret) + return 0; + if (ret < 0) + return ret; + result = BTRFS_COMPARE_TREE_CHANGED; + ret = 0; + } + sctx->left_path = left_path; sctx->right_path = right_path; sctx->cmp_key = key; @@ -4579,6 +4713,41 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) send_root = BTRFS_I(file_inode(mnt_file))->root; fs_info = send_root->fs_info; + /* + * This is done when we lookup the root, it should already be complete + * by the time we get here. + */ + WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE); + + /* + * If we just created this root we need to make sure that the orphan + * cleanup has been done and committed since we search the commit root, + * so check its commit root transid with our otransid and if they match + * commit the transaction to make sure everything is updated. + */ + down_read(&send_root->fs_info->extent_commit_sem); + if (btrfs_header_generation(send_root->commit_root) == + btrfs_root_otransid(&send_root->root_item)) { + struct btrfs_trans_handle *trans; + + up_read(&send_root->fs_info->extent_commit_sem); + + trans = btrfs_attach_transaction_barrier(send_root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) { + ret = PTR_ERR(trans); + goto out; + } + /* ENOENT means theres no transaction */ + } else { + ret = btrfs_commit_transaction(trans, send_root); + if (ret) + goto out; + } + } else { + up_read(&send_root->fs_info->extent_commit_sem); + } + arg = memdup_user(arg_, sizeof(*arg)); if (IS_ERR(arg)) { ret = PTR_ERR(arg); @@ -4663,10 +4832,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; clone_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (!clone_root) { - ret = -EINVAL; - goto out; - } if (IS_ERR(clone_root)) { ret = PTR_ERR(clone_root); goto out; @@ -4682,8 +4847,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (!sctx->parent_root) { - ret = -EINVAL; + if (IS_ERR(sctx->parent_root)) { + ret = PTR_ERR(sctx->parent_root); goto out; } } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f0857e092a3c..e913328d0f2a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -51,12 +51,13 @@ #include "print-tree.h" #include "xattr.h" #include "volumes.h" -#include "version.h" #include "export.h" #include "compression.h" #include "rcu-string.h" #include "dev-replace.h" #include "free-space-cache.h" +#include "backref.h" +#include "tests/btrfs-tests.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> @@ -266,6 +267,9 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, return; } ACCESS_ONCE(trans->transaction->aborted) = errno; + /* Wake up anybody who may be waiting on this transaction */ + wake_up(&root->fs_info->transaction_wait); + wake_up(&root->fs_info->transaction_blocked_wait); __btrfs_std_error(root->fs_info, function, line, errno, NULL); } /* @@ -318,14 +322,15 @@ enum { Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_skip_balance, Opt_check_integrity, Opt_check_integrity_including_extent_data, - Opt_check_integrity_print_mask, Opt_fatal_errors, + Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, + Opt_commit_interval, Opt_err, }; static match_table_t tokens = { {Opt_degraded, "degraded"}, {Opt_subvol, "subvol=%s"}, - {Opt_subvolid, "subvolid=%d"}, + {Opt_subvolid, "subvolid=%s"}, {Opt_device, "device=%s"}, {Opt_nodatasum, "nodatasum"}, {Opt_nodatacow, "nodatacow"}, @@ -358,7 +363,9 @@ static match_table_t tokens = { {Opt_check_integrity, "check_int"}, {Opt_check_integrity_including_extent_data, "check_int_data"}, {Opt_check_integrity_print_mask, "check_int_print_mask=%d"}, + {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, {Opt_fatal_errors, "fatal_errors=%s"}, + {Opt_commit_interval, "commit=%d"}, {Opt_err, NULL}, }; @@ -494,10 +501,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, NOBARRIER); break; case Opt_thread_pool: - intarg = 0; - match_int(&args[0], &intarg); - if (intarg) + ret = match_int(&args[0], &intarg); + if (ret) { + goto out; + } else if (intarg > 0) { info->thread_pool_size = intarg; + } else { + ret = -EINVAL; + goto out; + } break; case Opt_max_inline: num = match_strdup(&args[0]); @@ -511,7 +523,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) root->sectorsize); } printk(KERN_INFO "btrfs: max_inline at %llu\n", - (unsigned long long)info->max_inline); + info->max_inline); + } else { + ret = -ENOMEM; + goto out; } break; case Opt_alloc_start: @@ -523,7 +538,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) kfree(num); printk(KERN_INFO "btrfs: allocations start at %llu\n", - (unsigned long long)info->alloc_start); + info->alloc_start); + } else { + ret = -ENOMEM; + goto out; } break; case Opt_noacl: @@ -538,12 +556,16 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); break; case Opt_ratio: - intarg = 0; - match_int(&args[0], &intarg); - if (intarg) { + ret = match_int(&args[0], &intarg); + if (ret) { + goto out; + } else if (intarg >= 0) { info->metadata_ratio = intarg; printk(KERN_INFO "btrfs: metadata ratio %d\n", info->metadata_ratio); + } else { + ret = -EINVAL; + goto out; } break; case Opt_discard: @@ -552,6 +574,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_space_cache: btrfs_set_opt(info->mount_opt, SPACE_CACHE); break; + case Opt_rescan_uuid_tree: + btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); + break; case Opt_no_space_cache: printk(KERN_INFO "btrfs: disabling disk space caching\n"); btrfs_clear_opt(info->mount_opt, SPACE_CACHE); @@ -594,13 +619,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); break; case Opt_check_integrity_print_mask: - intarg = 0; - match_int(&args[0], &intarg); - if (intarg) { + ret = match_int(&args[0], &intarg); + if (ret) { + goto out; + } else if (intarg >= 0) { info->check_integrity_print_mask = intarg; printk(KERN_INFO "btrfs:" " check_integrity_print_mask 0x%x\n", info->check_integrity_print_mask); + } else { + ret = -EINVAL; + goto out; } break; #else @@ -624,6 +653,29 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) goto out; } break; + case Opt_commit_interval: + intarg = 0; + ret = match_int(&args[0], &intarg); + if (ret < 0) { + printk(KERN_ERR + "btrfs: invalid commit interval\n"); + ret = -EINVAL; + goto out; + } + if (intarg > 0) { + if (intarg > 300) { + printk(KERN_WARNING + "btrfs: excessive commit interval %d\n", + intarg); + } + info->commit_interval = intarg; + } else { + printk(KERN_INFO + "btrfs: using default commit interval %ds\n", + BTRFS_DEFAULT_COMMIT_INTERVAL); + info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; + } + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); @@ -652,8 +704,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, { substring_t args[MAX_OPT_ARGS]; char *device_name, *opts, *orig, *p; + char *num = NULL; int error = 0; - int intarg; if (!options) return 0; @@ -677,17 +729,23 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, case Opt_subvol: kfree(*subvol_name); *subvol_name = match_strdup(&args[0]); + if (!*subvol_name) { + error = -ENOMEM; + goto out; + } break; case Opt_subvolid: - intarg = 0; - error = match_int(&args[0], &intarg); - if (!error) { + num = match_strdup(&args[0]); + if (num) { + *subvol_objectid = memparse(num, NULL); + kfree(num); /* we want the original fs_tree */ - if (!intarg) + if (!*subvol_objectid) *subvol_objectid = BTRFS_FS_TREE_OBJECTID; - else - *subvol_objectid = intarg; + } else { + error = -EINVAL; + goto out; } break; case Opt_subvolrootid: @@ -776,9 +834,6 @@ find_root: if (IS_ERR(new_root)) return ERR_CAST(new_root); - if (btrfs_root_refs(&new_root->root_item) == 0) - return ERR_PTR(-ENOENT); - dir_id = btrfs_root_dirid(&new_root->root_item); setup_root: location.objectid = dir_id; @@ -866,7 +921,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_wait_ordered_extents(root, 1); + btrfs_wait_all_ordered_extents(fs_info); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { @@ -893,11 +948,9 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (btrfs_test_opt(root, NOBARRIER)) seq_puts(seq, ",nobarrier"); if (info->max_inline != 8192 * 1024) - seq_printf(seq, ",max_inline=%llu", - (unsigned long long)info->max_inline); + seq_printf(seq, ",max_inline=%llu", info->max_inline); if (info->alloc_start != 0) - seq_printf(seq, ",alloc_start=%llu", - (unsigned long long)info->alloc_start); + seq_printf(seq, ",alloc_start=%llu", info->alloc_start); if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); @@ -929,6 +982,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",space_cache"); else seq_puts(seq, ",nospace_cache"); + if (btrfs_test_opt(root, RESCAN_UUID_TREE)) + seq_puts(seq, ",rescan_uuid_tree"); if (btrfs_test_opt(root, CLEAR_CACHE)) seq_puts(seq, ",clear_cache"); if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) @@ -941,8 +996,24 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",inode_cache"); if (btrfs_test_opt(root, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); + if (btrfs_test_opt(root, RECOVERY)) + seq_puts(seq, ",recovery"); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) + seq_puts(seq, ",check_int_data"); + else if (btrfs_test_opt(root, CHECK_INTEGRITY)) + seq_puts(seq, ",check_int"); + if (info->check_integrity_print_mask) + seq_printf(seq, ",check_int_print_mask=%d", + info->check_integrity_print_mask); +#endif + if (info->metadata_ratio) + seq_printf(seq, ",metadata_ratio=%d", + info->metadata_ratio); if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR)) seq_puts(seq, ",fatal_errors=panic"); + if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) + seq_printf(seq, ",commit=%d", info->commit_interval); return 0; } @@ -1269,6 +1340,12 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; } else { + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { + btrfs_err(fs_info, + "Remounting read-write after error is not allowed\n"); + ret = -EINVAL; + goto restore; + } if (fs_info->fs_devices->rw_devices == 0) { ret = -EACCES; goto restore; @@ -1306,6 +1383,16 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) pr_warn("btrfs: failed to resume dev_replace\n"); goto restore; } + + if (!fs_info->uuid_root) { + pr_info("btrfs: creating UUID tree\n"); + ret = btrfs_create_uuid_tree(fs_info); + if (ret) { + pr_warn("btrfs: failed to create the uuid tree" + "%d\n", ret); + goto restore; + } + } sb->s_flags &= ~MS_RDONLY; } out: @@ -1685,6 +1772,26 @@ static void btrfs_interface_exit(void) printk(KERN_INFO "btrfs: misc_deregister failed for control device\n"); } +static void btrfs_print_info(void) +{ + printk(KERN_INFO "Btrfs loaded" +#ifdef CONFIG_BTRFS_DEBUG + ", debug=on" +#endif +#ifdef CONFIG_BTRFS_ASSERT + ", assert=on" +#endif +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + ", integrity-checker=on" +#endif + "\n"); +} + +static int btrfs_run_sanity_tests(void) +{ + return btrfs_test_free_space_cache(); +} + static int __init init_btrfs_fs(void) { int err; @@ -1723,25 +1830,32 @@ static int __init init_btrfs_fs(void) if (err) goto free_auto_defrag; + err = btrfs_prelim_ref_init(); + if (err) + goto free_prelim_ref; + err = btrfs_interface_init(); if (err) goto free_delayed_ref; - err = register_filesystem(&btrfs_fs_type); + btrfs_init_lockdep(); + + btrfs_print_info(); + + err = btrfs_run_sanity_tests(); if (err) goto unregister_ioctl; - btrfs_init_lockdep(); - -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - btrfs_test_free_space_cache(); -#endif + err = register_filesystem(&btrfs_fs_type); + if (err) + goto unregister_ioctl; - printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION); return 0; unregister_ioctl: btrfs_interface_exit(); +free_prelim_ref: + btrfs_prelim_ref_exit(); free_delayed_ref: btrfs_delayed_ref_exit(); free_auto_defrag: @@ -1768,6 +1882,7 @@ static void __exit exit_btrfs_fs(void) btrfs_delayed_ref_exit(); btrfs_auto_defrag_exit(); btrfs_delayed_inode_exit(); + btrfs_prelim_ref_exit(); ordered_data_exit(); extent_map_exit(); extent_io_exit(); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h new file mode 100644 index 000000000000..580877625776 --- /dev/null +++ b/fs/btrfs/tests/btrfs-tests.h @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_TESTS +#define __BTRFS_TESTS + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + +#define test_msg(fmt, ...) pr_info("btrfs: selftest: " fmt, ##__VA_ARGS__) + +int btrfs_test_free_space_cache(void); +#else +static inline int btrfs_test_free_space_cache(void) +{ + return 0; +} +#endif + +#endif diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c new file mode 100644 index 000000000000..6fc82010dc15 --- /dev/null +++ b/fs/btrfs/tests/free-space-tests.c @@ -0,0 +1,395 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/slab.h> +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../free-space-cache.h" + +#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) +static struct btrfs_block_group_cache *init_test_block_group(void) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + + cache->key.objectid = 0; + cache->key.offset = 1024 * 1024 * 1024; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + cache->sectorsize = 4096; + + spin_lock_init(&cache->lock); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->new_bg_list); + + btrfs_init_free_space_ctl(cache); + + return cache; +} + +/* + * This test just does basic sanity checking, making sure we can add an exten + * entry and remove space from either end and the middle, and make sure we can + * remove space that covers adjacent extent entries. + */ +static int test_extents(struct btrfs_block_group_cache *cache) +{ + int ret = 0; + + test_msg("Running extent only tests\n"); + + /* First just make sure we can remove an entire entry */ + ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + test_msg("Error adding initial extents %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + test_msg("Error removing extent %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 0, 4 * 1024 * 1024)) { + test_msg("Full remove left some lingering space\n"); + return -1; + } + + /* Ok edge and middle cases now */ + ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + test_msg("Error adding half extent %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024); + if (ret) { + test_msg("Error removing tail end %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); + if (ret) { + test_msg("Error removing front end %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); + if (ret) { + test_msg("Error removing middle peice %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 0, 1 * 1024 * 1024)) { + test_msg("Still have space at the front\n"); + return -1; + } + + if (test_check_exists(cache, 2 * 1024 * 1024, 4096)) { + test_msg("Still have space in the middle\n"); + return -1; + } + + if (test_check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) { + test_msg("Still have space at the end\n"); + return -1; + } + + /* Cleanup */ + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + return 0; +} + +static int test_bitmaps(struct btrfs_block_group_cache *cache) +{ + u64 next_bitmap_offset; + int ret; + + test_msg("Running bitmap only tests\n"); + + ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't create a bitmap entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + test_msg("Error removing bitmap full range %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 0, 4 * 1024 * 1024)) { + test_msg("Left some space in bitmap\n"); + return -1; + } + + ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add to our bitmap entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove middle chunk %d\n", ret); + return ret; + } + + /* + * The first bitmap we have starts at offset 0 so the next one is just + * at the end of the first bitmap. + */ + next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + + /* Test a bit straddling two bitmaps */ + ret = test_add_free_space_entry(cache, next_bitmap_offset - + (2 * 1024 * 1024), 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add space that straddles two bitmaps %d\n", + ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, next_bitmap_offset - + (1 * 1024 * 1024), 2 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove overlapping space %d\n", ret); + return ret; + } + + if (test_check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024), + 2 * 1024 * 1024)) { + test_msg("Left some space when removing overlapping\n"); + return -1; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + return 0; +} + +/* This is the high grade jackassery */ +static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) +{ + u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + int ret; + + test_msg("Running bitmap and extent tests\n"); + + /* + * First let's do something simple, an extent at the same offset as the + * bitmap, but the free space completely in the extent and then + * completely in the bitmap. + */ + ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't create bitmap entry %d\n", ret); + return ret; + } + + ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove extent entry %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 0, 1 * 1024 * 1024)) { + test_msg("Left remnants after our remove\n"); + return -1; + } + + /* Now to add back the extent entry and remove from the bitmap */ + ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't re-add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove from bitmap %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) { + test_msg("Left remnants in the bitmap\n"); + return -1; + } + + /* + * Ok so a little more evil, extent entry and bitmap at the same offset, + * removing an overlapping chunk. + */ + ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add to a bitmap %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove overlapping space %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { + test_msg("Left over peices after removing overlapping\n"); + return -1; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + /* Now with the extent entry offset into the bitmap */ + ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add space to the bitmap %d\n", ret); + return ret; + } + + ret = test_add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent to the cache %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024); + if (ret) { + test_msg("Problem removing overlapping space %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) { + test_msg("Left something behind when removing space"); + return -1; + } + + /* + * This has blown up in the past, the extent entry starts before the + * bitmap entry, but we're trying to remove an offset that falls + * completely within the bitmap range and is in both the extent entry + * and the bitmap entry, looks like this + * + * [ extent ] + * [ bitmap ] + * [ del ] + */ + __btrfs_remove_free_space_cache(cache->free_space_ctl); + ret = test_add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024, + 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add bitmap %d\n", ret); + return ret; + } + + ret = test_add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024, + 5 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024, + 5 * 1024 * 1024); + if (ret) { + test_msg("Failed to free our space %d\n", ret); + return ret; + } + + if (test_check_exists(cache, bitmap_offset + 1 * 1024 * 1024, + 5 * 1024 * 1024)) { + test_msg("Left stuff over\n"); + return -1; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + /* + * This blew up before, we have part of the free space in a bitmap and + * then the entirety of the rest of the space in an extent. This used + * to return -EAGAIN back from btrfs_remove_extent, make sure this + * doesn't happen. + */ + ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add bitmap entry %d\n", ret); + return ret; + } + + ret = test_add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024); + if (ret) { + test_msg("Error removing bitmap and extent overlapping %d\n", ret); + return ret; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + return 0; +} + +int btrfs_test_free_space_cache(void) +{ + struct btrfs_block_group_cache *cache; + int ret; + + test_msg("Running btrfs free space cache tests\n"); + + cache = init_test_block_group(); + if (!cache) { + test_msg("Couldn't run the tests\n"); + return 0; + } + + ret = test_extents(cache); + if (ret) + goto out; + ret = test_bitmaps(cache); + if (ret) + goto out; + ret = test_bitmaps_and_extents(cache); + if (ret) + goto out; +out: + __btrfs_remove_free_space_cache(cache->free_space_ctl); + kfree(cache->free_space_ctl); + kfree(cache); + test_msg("Free space cache tests finished\n"); + return ret; +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0544587d74f4..8c81bdc1ef9b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -34,12 +34,43 @@ #define BTRFS_ROOT_TRANS_TAG 0 +static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { + [TRANS_STATE_RUNNING] = 0U, + [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE | + __TRANS_START), + [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE | + __TRANS_START | + __TRANS_ATTACH), + [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE | + __TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN), + [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE | + __TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN | + __TRANS_JOIN_NOLOCK), + [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE | + __TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN | + __TRANS_JOIN_NOLOCK), +}; + static void put_transaction(struct btrfs_transaction *transaction) { WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(transaction->delayed_refs.root.rb_node); + while (!list_empty(&transaction->pending_chunks)) { + struct extent_map *em; + + em = list_first_entry(&transaction->pending_chunks, + struct extent_map, list); + list_del_init(&em->list); + free_extent_map(em); + } kmem_cache_free(btrfs_transaction_cachep, transaction); } } @@ -50,18 +81,35 @@ static noinline void switch_commit_root(struct btrfs_root *root) root->commit_root = btrfs_root_node(root); } -static inline int can_join_transaction(struct btrfs_transaction *trans, - int type) +static inline void extwriter_counter_inc(struct btrfs_transaction *trans, + unsigned int type) +{ + if (type & TRANS_EXTWRITERS) + atomic_inc(&trans->num_extwriters); +} + +static inline void extwriter_counter_dec(struct btrfs_transaction *trans, + unsigned int type) { - return !(trans->in_commit && - type != TRANS_JOIN && - type != TRANS_JOIN_NOLOCK); + if (type & TRANS_EXTWRITERS) + atomic_dec(&trans->num_extwriters); +} + +static inline void extwriter_counter_init(struct btrfs_transaction *trans, + unsigned int type) +{ + atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0)); +} + +static inline int extwriter_counter_read(struct btrfs_transaction *trans) +{ + return atomic_read(&trans->num_extwriters); } /* * either allocate a new transaction or hop into the existing one */ -static noinline int join_transaction(struct btrfs_root *root, int type) +static noinline int join_transaction(struct btrfs_root *root, unsigned int type) { struct btrfs_transaction *cur_trans; struct btrfs_fs_info *fs_info = root->fs_info; @@ -74,32 +122,19 @@ loop: return -EROFS; } - if (fs_info->trans_no_join) { - /* - * If we are JOIN_NOLOCK we're already committing a current - * transaction, we just need a handle to deal with something - * when committing the transaction, such as inode cache and - * space cache. It is a special case. - */ - if (type != TRANS_JOIN_NOLOCK) { - spin_unlock(&fs_info->trans_lock); - return -EBUSY; - } - } - cur_trans = fs_info->running_transaction; if (cur_trans) { if (cur_trans->aborted) { spin_unlock(&fs_info->trans_lock); return cur_trans->aborted; } - if (!can_join_transaction(cur_trans, type)) { + if (btrfs_blocked_trans_types[cur_trans->state] & type) { spin_unlock(&fs_info->trans_lock); return -EBUSY; } atomic_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); - cur_trans->num_joined++; + extwriter_counter_inc(cur_trans, type); spin_unlock(&fs_info->trans_lock); return 0; } @@ -112,6 +147,12 @@ loop: if (type == TRANS_ATTACH) return -ENOENT; + /* + * JOIN_NOLOCK only happens during the transaction commit, so + * it is impossible that ->running_transaction is NULL + */ + BUG_ON(type == TRANS_JOIN_NOLOCK); + cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); if (!cur_trans) return -ENOMEM; @@ -120,7 +161,7 @@ loop: if (fs_info->running_transaction) { /* * someone started a transaction after we unlocked. Make sure - * to redo the trans_no_join checks above + * to redo the checks above */ kmem_cache_free(btrfs_transaction_cachep, cur_trans); goto loop; @@ -131,17 +172,15 @@ loop: } atomic_set(&cur_trans->num_writers, 1); - cur_trans->num_joined = 0; + extwriter_counter_init(cur_trans, type); init_waitqueue_head(&cur_trans->writer_wait); init_waitqueue_head(&cur_trans->commit_wait); - cur_trans->in_commit = 0; - cur_trans->blocked = 0; + cur_trans->state = TRANS_STATE_RUNNING; /* * One for this trans handle, one so it will live on until we * commit the transaction. */ atomic_set(&cur_trans->use_count, 2); - cur_trans->commit_done = 0; cur_trans->start_time = get_seconds(); cur_trans->delayed_refs.root = RB_ROOT; @@ -164,7 +203,6 @@ loop: "creating a fresh transaction\n"); atomic64_set(&fs_info->tree_mod_seq, 0); - spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); atomic_set(&cur_trans->delayed_refs.ref_seq, 0); @@ -172,6 +210,7 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->ordered_operations); + INIT_LIST_HEAD(&cur_trans->pending_chunks); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -269,6 +308,13 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, return 0; } +static inline int is_transaction_blocked(struct btrfs_transaction *trans) +{ + return (trans->state >= TRANS_STATE_BLOCKED && + trans->state < TRANS_STATE_UNBLOCKED && + !trans->aborted); +} + /* wait for commit against the current transaction to become unblocked * when this is done, it is safe to start a new transaction, but the current * transaction might not be fully on disk. @@ -279,12 +325,13 @@ static void wait_current_trans(struct btrfs_root *root) spin_lock(&root->fs_info->trans_lock); cur_trans = root->fs_info->running_transaction; - if (cur_trans && cur_trans->blocked) { + if (cur_trans && is_transaction_blocked(cur_trans)) { atomic_inc(&cur_trans->use_count); spin_unlock(&root->fs_info->trans_lock); wait_event(root->fs_info->transaction_wait, - !cur_trans->blocked); + cur_trans->state >= TRANS_STATE_UNBLOCKED || + cur_trans->aborted); put_transaction(cur_trans); } else { spin_unlock(&root->fs_info->trans_lock); @@ -307,7 +354,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type) } static struct btrfs_trans_handle * -start_transaction(struct btrfs_root *root, u64 num_items, int type, +start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, enum btrfs_reserve_flush_enum flush) { struct btrfs_trans_handle *h; @@ -320,7 +367,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, return ERR_PTR(-EROFS); if (current->journal_info) { - WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); + WARN_ON(type & TRANS_EXTWRITERS); h = current->journal_info; h->use_count++; WARN_ON(h->use_count > 2); @@ -366,7 +413,7 @@ again: * If we are ATTACH, it means we just want to catch the current * transaction and commit it, so we needn't do sb_start_intwrite(). */ - if (type < TRANS_JOIN_NOLOCK) + if (type & __TRANS_FREEZABLE) sb_start_intwrite(root->fs_info->sb); if (may_wait_transaction(root, type)) @@ -408,7 +455,8 @@ again: INIT_LIST_HEAD(&h->new_bgs); smp_mb(); - if (cur_trans->blocked && may_wait_transaction(root, type)) { + if (cur_trans->state >= TRANS_STATE_BLOCKED && + may_wait_transaction(root, type)) { btrfs_commit_transaction(h, root); goto again; } @@ -429,7 +477,7 @@ got_it: return h; join_fail: - if (type < TRANS_JOIN_NOLOCK) + if (type & __TRANS_FREEZABLE) sb_end_intwrite(root->fs_info->sb); kmem_cache_free(btrfs_trans_handle_cachep, h); alloc_fail: @@ -490,7 +538,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) } /* - * btrfs_attach_transaction() - catch the running transaction + * btrfs_attach_transaction_barrier() - catch the running transaction * * It is similar to the above function, the differentia is this one * will wait for all the inactive transactions until they fully @@ -512,7 +560,7 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) static noinline void wait_for_commit(struct btrfs_root *root, struct btrfs_transaction *commit) { - wait_event(commit->commit_wait, commit->commit_done); + wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED); } int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) @@ -548,8 +596,8 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) spin_lock(&root->fs_info->trans_lock); list_for_each_entry_reverse(t, &root->fs_info->trans_list, list) { - if (t->in_commit) { - if (t->commit_done) + if (t->state >= TRANS_STATE_COMMIT_START) { + if (t->state == TRANS_STATE_COMPLETED) break; cur_trans = t; atomic_inc(&cur_trans->use_count); @@ -576,10 +624,11 @@ void btrfs_throttle(struct btrfs_root *root) static int should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - int ret; + if (root->fs_info->global_block_rsv.space_info->full && + btrfs_should_throttle_delayed_refs(trans, root)) + return 1; - ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); - return ret ? 1 : 0; + return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); } int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, @@ -590,7 +639,8 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, int err; smp_mb(); - if (cur_trans->blocked || cur_trans->delayed_refs.flushing) + if (cur_trans->state >= TRANS_STATE_BLOCKED || + cur_trans->delayed_refs.flushing) return 1; updates = trans->delayed_ref_updates; @@ -609,7 +659,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *info = root->fs_info; - int count = 0; + unsigned long cur = trans->delayed_ref_updates; int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; @@ -638,17 +688,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); - while (count < 1) { - unsigned long cur = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (btrfs_should_throttle_delayed_refs(trans, root)) { + cur = max_t(unsigned long, cur, 1); trans->delayed_ref_updates = 0; - if (cur && - trans->transaction->delayed_refs.num_heads_ready > 64) { - trans->delayed_ref_updates = 0; - btrfs_run_delayed_refs(trans, root, cur); - } else { - break; - } - count++; + btrfs_run_delayed_refs(trans, root, cur); } btrfs_trans_release_metadata(trans, root); @@ -658,12 +702,15 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_create_pending_block_groups(trans, root); if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && - should_end_transaction(trans, root)) { - trans->transaction->blocked = 1; - smp_wmb(); + should_end_transaction(trans, root) && + ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { + spin_lock(&info->trans_lock); + if (cur_trans->state == TRANS_STATE_RUNNING) + cur_trans->state = TRANS_STATE_BLOCKED; + spin_unlock(&info->trans_lock); } - if (lock && cur_trans->blocked && !cur_trans->in_commit) { + if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { if (throttle) { /* * We may race with somebody else here so end up having @@ -677,12 +724,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } } - if (trans->type < TRANS_JOIN_NOLOCK) + if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(root->fs_info->sb); WARN_ON(cur_trans != info->running_transaction); WARN_ON(atomic_read(&cur_trans->num_writers) < 1); atomic_dec(&cur_trans->num_writers); + extwriter_counter_dec(cur_trans, trans->type); smp_mb(); if (waitqueue_active(&cur_trans->writer_wait)) @@ -736,9 +784,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, struct extent_state *cached_state = NULL; u64 start = 0; u64 end; - struct blk_plug plug; - blk_start_plug(&plug); while (!find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, @@ -752,7 +798,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root, } if (err) werr = err; - blk_finish_plug(&plug); return werr; } @@ -792,13 +837,16 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, * them in one of two extent_io trees. This is used to make sure all of * those extents are on disk for transaction or log commit */ -int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, +static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark) { int ret; int ret2; + struct blk_plug plug; + blk_start_plug(&plug); ret = btrfs_write_marked_extents(root, dirty_pages, mark); + blk_finish_plug(&plug); ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); if (ret) @@ -935,12 +983,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, * a dirty root struct and adds it into the list of dead roots that need to * be deleted */ -int btrfs_add_dead_root(struct btrfs_root *root) +void btrfs_add_dead_root(struct btrfs_root *root) { spin_lock(&root->fs_info->trans_lock); - list_add_tail(&root->root_list, &root->fs_info->dead_roots); + if (list_empty(&root->root_list)) + list_add_tail(&root->root_list, &root->fs_info->dead_roots); spin_unlock(&root->fs_info->trans_lock); - return 0; } /* @@ -1177,8 +1225,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_root_stransid(new_root_item, 0); btrfs_set_root_rtransid(new_root_item, 0); } - new_root_item->otime.sec = cpu_to_le64(cur_time.tv_sec); - new_root_item->otime.nsec = cpu_to_le32(cur_time.tv_nsec); + btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec); + btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec); btrfs_set_root_otransid(new_root_item, trans->transid); old = btrfs_lock_root_node(root); @@ -1263,8 +1311,26 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dentry->d_name.len * 2); parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode); - if (ret) + if (ret) { btrfs_abort_transaction(trans, root, ret); + goto fail; + } + ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, new_uuid.b, + BTRFS_UUID_KEY_SUBVOL, objectid); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) { + ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, + new_root_item->received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + objectid); + if (ret && ret != -EEXIST) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + } fail: pending->error = ret; dir_item_existed: @@ -1314,24 +1380,32 @@ static void update_super_roots(struct btrfs_root *root) super->root_level = root_item->level; if (btrfs_test_opt(root, SPACE_CACHE)) super->cache_generation = root_item->generation; + if (root->fs_info->update_uuid_tree_gen) + super->uuid_tree_generation = root_item->generation; } int btrfs_transaction_in_commit(struct btrfs_fs_info *info) { + struct btrfs_transaction *trans; int ret = 0; + spin_lock(&info->trans_lock); - if (info->running_transaction) - ret = info->running_transaction->in_commit; + trans = info->running_transaction; + if (trans) + ret = (trans->state >= TRANS_STATE_COMMIT_START); spin_unlock(&info->trans_lock); return ret; } int btrfs_transaction_blocked(struct btrfs_fs_info *info) { + struct btrfs_transaction *trans; int ret = 0; + spin_lock(&info->trans_lock); - if (info->running_transaction) - ret = info->running_transaction->blocked; + trans = info->running_transaction; + if (trans) + ret = is_transaction_blocked(trans); spin_unlock(&info->trans_lock); return ret; } @@ -1343,7 +1417,9 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info) static void wait_current_trans_commit_start(struct btrfs_root *root, struct btrfs_transaction *trans) { - wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit); + wait_event(root->fs_info->transaction_blocked_wait, + trans->state >= TRANS_STATE_COMMIT_START || + trans->aborted); } /* @@ -1354,7 +1430,8 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, struct btrfs_transaction *trans) { wait_event(root->fs_info->transaction_wait, - trans->commit_done || (trans->in_commit && !trans->blocked)); + trans->state >= TRANS_STATE_UNBLOCKED || + trans->aborted); } /* @@ -1450,26 +1527,31 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->trans_lock); - if (list_empty(&cur_trans->list)) { - spin_unlock(&root->fs_info->trans_lock); - btrfs_end_transaction(trans, root); - return; - } + /* + * If the transaction is removed from the list, it means this + * transaction has been committed successfully, so it is impossible + * to call the cleanup function. + */ + BUG_ON(list_empty(&cur_trans->list)); list_del_init(&cur_trans->list); if (cur_trans == root->fs_info->running_transaction) { - root->fs_info->trans_no_join = 1; + cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&root->fs_info->trans_lock); wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); spin_lock(&root->fs_info->trans_lock); - root->fs_info->running_transaction = NULL; } spin_unlock(&root->fs_info->trans_lock); btrfs_cleanup_one_transaction(trans->transaction, root); + spin_lock(&root->fs_info->trans_lock); + if (cur_trans == root->fs_info->running_transaction) + root->fs_info->running_transaction = NULL; + spin_unlock(&root->fs_info->trans_lock); + put_transaction(cur_trans); put_transaction(cur_trans); @@ -1481,33 +1563,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, current->journal_info = NULL; kmem_cache_free(btrfs_trans_handle_cachep, trans); - - spin_lock(&root->fs_info->trans_lock); - root->fs_info->trans_no_join = 0; - spin_unlock(&root->fs_info->trans_lock); } static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); - int snap_pending = 0; int ret; - if (!flush_on_commit) { - spin_lock(&root->fs_info->trans_lock); - if (!list_empty(&trans->transaction->pending_snapshots)) - snap_pending = 1; - spin_unlock(&root->fs_info->trans_lock); - } - - if (flush_on_commit || snap_pending) { - ret = btrfs_start_delalloc_inodes(root, 1); - if (ret) - return ret; - btrfs_wait_ordered_extents(root, 1); - } - ret = btrfs_run_delayed_items(trans, root); if (ret) return ret; @@ -1531,23 +1593,25 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, return ret; } -/* - * btrfs_transaction state sequence: - * in_commit = 0, blocked = 0 (initial) - * in_commit = 1, blocked = 1 - * blocked = 0 - * commit_done = 1 - */ +static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) +{ + if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) + return btrfs_start_all_delalloc_inodes(fs_info, 1); + return 0; +} + +static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) +{ + if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) + btrfs_wait_all_ordered_extents(fs_info); +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - unsigned long joined = 0; struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; - DEFINE_WAIT(wait); int ret; - int should_grow = 0; - unsigned long now = get_seconds(); ret = btrfs_run_ordered_operations(trans, root, 0); if (ret) { @@ -1586,6 +1650,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * start sending their work down. */ cur_trans->delayed_refs.flushing = 1; + smp_wmb(); if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); @@ -1596,9 +1661,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; } - spin_lock(&cur_trans->commit_lock); - if (cur_trans->in_commit) { - spin_unlock(&cur_trans->commit_lock); + spin_lock(&root->fs_info->trans_lock); + if (cur_trans->state >= TRANS_STATE_COMMIT_START) { + spin_unlock(&root->fs_info->trans_lock); atomic_inc(&cur_trans->use_count); ret = btrfs_end_transaction(trans, root); @@ -1609,16 +1674,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; } - trans->transaction->in_commit = 1; - trans->transaction->blocked = 1; - spin_unlock(&cur_trans->commit_lock); + cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&root->fs_info->transaction_blocked_wait); - spin_lock(&root->fs_info->trans_lock); if (cur_trans->list.prev != &root->fs_info->trans_list) { prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); - if (!prev_trans->commit_done) { + if (prev_trans->state != TRANS_STATE_COMPLETED) { atomic_inc(&prev_trans->use_count); spin_unlock(&root->fs_info->trans_lock); @@ -1632,42 +1694,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, spin_unlock(&root->fs_info->trans_lock); } - if (!btrfs_test_opt(root, SSD) && - (now < cur_trans->start_time || now - cur_trans->start_time < 1)) - should_grow = 1; - - do { - joined = cur_trans->num_joined; - - WARN_ON(cur_trans != trans->transaction); - - ret = btrfs_flush_all_pending_stuffs(trans, root); - if (ret) - goto cleanup_transaction; + extwriter_counter_dec(cur_trans, trans->type); - prepare_to_wait(&cur_trans->writer_wait, &wait, - TASK_UNINTERRUPTIBLE); + ret = btrfs_start_delalloc_flush(root->fs_info); + if (ret) + goto cleanup_transaction; - if (atomic_read(&cur_trans->num_writers) > 1) - schedule_timeout(MAX_SCHEDULE_TIMEOUT); - else if (should_grow) - schedule_timeout(1); + ret = btrfs_flush_all_pending_stuffs(trans, root); + if (ret) + goto cleanup_transaction; - finish_wait(&cur_trans->writer_wait, &wait); - } while (atomic_read(&cur_trans->num_writers) > 1 || - (should_grow && cur_trans->num_joined != joined)); + wait_event(cur_trans->writer_wait, + extwriter_counter_read(cur_trans) == 0); + /* some pending stuffs might be added after the previous flush. */ ret = btrfs_flush_all_pending_stuffs(trans, root); if (ret) goto cleanup_transaction; + btrfs_wait_delalloc_flush(root->fs_info); /* * Ok now we need to make sure to block out any other joins while we * commit the transaction. We could have started a join before setting - * no_join so make sure to wait for num_writers to == 1 again. + * COMMIT_DOING so make sure to wait for num_writers to == 1 again. */ spin_lock(&root->fs_info->trans_lock); - root->fs_info->trans_no_join = 1; + cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&root->fs_info->trans_lock); wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); @@ -1786,18 +1838,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, assert_qgroups_uptodate(trans); update_super_roots(root); - if (!root->fs_info->log_root_recovering) { - btrfs_set_super_log_root(root->fs_info->super_copy, 0); - btrfs_set_super_log_root_level(root->fs_info->super_copy, 0); - } - + btrfs_set_super_log_root(root->fs_info->super_copy, 0); + btrfs_set_super_log_root_level(root->fs_info->super_copy, 0); memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, sizeof(*root->fs_info->super_copy)); - trans->transaction->blocked = 0; spin_lock(&root->fs_info->trans_lock); + cur_trans->state = TRANS_STATE_UNBLOCKED; root->fs_info->running_transaction = NULL; - root->fs_info->trans_no_join = 0; spin_unlock(&root->fs_info->trans_lock); mutex_unlock(&root->fs_info->reloc_mutex); @@ -1825,10 +1873,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root); - cur_trans->commit_done = 1; - root->fs_info->last_trans_committed = cur_trans->transid; - + /* + * We needn't acquire the lock here because there is no other task + * which can change it. + */ + cur_trans->state = TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); spin_lock(&root->fs_info->trans_lock); @@ -1838,7 +1888,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, put_transaction(cur_trans); put_transaction(cur_trans); - if (trans->type < TRANS_JOIN_NOLOCK) + if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(root->fs_info->sb); trace_btrfs_transaction_commit(root); @@ -1885,11 +1935,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) int ret; struct btrfs_fs_info *fs_info = root->fs_info; - if (fs_info->sb->s_flags & MS_RDONLY) { - pr_debug("btrfs: cleaner called for RO fs!\n"); - return 0; - } - spin_lock(&fs_info->trans_lock); if (list_empty(&fs_info->dead_roots)) { spin_unlock(&fs_info->trans_lock); @@ -1897,11 +1942,10 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) } root = list_first_entry(&fs_info->dead_roots, struct btrfs_root, root_list); - list_del(&root->root_list); + list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); - pr_debug("btrfs: cleaner removing %llu\n", - (unsigned long long)root->objectid); + pr_debug("btrfs: cleaner removing %llu\n", root->objectid); btrfs_kill_all_delayed_nodes(root); @@ -1914,6 +1958,5 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) * If we encounter a transaction abort during snapshot cleaning, we * don't want to crash here */ - BUG_ON(ret < 0 && ret != -EAGAIN && ret != -EROFS); - return 1; + return (ret < 0) ? 0 : 1; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 24c97335a59f..5c2af8491621 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -22,21 +22,33 @@ #include "delayed-ref.h" #include "ctree.h" +enum btrfs_trans_state { + TRANS_STATE_RUNNING = 0, + TRANS_STATE_BLOCKED = 1, + TRANS_STATE_COMMIT_START = 2, + TRANS_STATE_COMMIT_DOING = 3, + TRANS_STATE_UNBLOCKED = 4, + TRANS_STATE_COMPLETED = 5, + TRANS_STATE_MAX = 6, +}; + struct btrfs_transaction { u64 transid; /* + * total external writers(USERSPACE/START/ATTACH) in this + * transaction, it must be zero before the transaction is + * being committed + */ + atomic_t num_extwriters; + /* * total writers in this transaction, it must be zero before the * transaction can end */ atomic_t num_writers; atomic_t use_count; - unsigned long num_joined; - - spinlock_t commit_lock; - int in_commit; - int commit_done; - int blocked; + /* Be protected by fs_info->trans_lock when we want to change it. */ + enum btrfs_trans_state state; struct list_head list; struct extent_io_tree dirty_pages; unsigned long start_time; @@ -44,17 +56,27 @@ struct btrfs_transaction { wait_queue_head_t commit_wait; struct list_head pending_snapshots; struct list_head ordered_operations; + struct list_head pending_chunks; struct btrfs_delayed_ref_root delayed_refs; int aborted; }; -enum btrfs_trans_type { - TRANS_START, - TRANS_JOIN, - TRANS_USERSPACE, - TRANS_JOIN_NOLOCK, - TRANS_ATTACH, -}; +#define __TRANS_FREEZABLE (1U << 0) + +#define __TRANS_USERSPACE (1U << 8) +#define __TRANS_START (1U << 9) +#define __TRANS_ATTACH (1U << 10) +#define __TRANS_JOIN (1U << 11) +#define __TRANS_JOIN_NOLOCK (1U << 12) + +#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE) +#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) +#define TRANS_ATTACH (__TRANS_ATTACH) +#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE) +#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK) + +#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \ + __TRANS_ATTACH) struct btrfs_trans_handle { u64 transid; @@ -70,7 +92,7 @@ struct btrfs_trans_handle { short aborted; short adding_csums; bool allocating_chunk; - enum btrfs_trans_type type; + unsigned int type; /* * this root is only needed to validate that the root passed to * start_transaction is the same as the one passed to end_transaction. @@ -121,7 +143,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_add_dead_root(struct btrfs_root *root); +void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, @@ -138,8 +160,6 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, void btrfs_throttle(struct btrfs_root *root); int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages, int mark); int btrfs_write_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark); int btrfs_wait_marked_extents(struct btrfs_root *root, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c276ac9a0ec3..79f057c0619a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -18,6 +18,7 @@ #include <linux/sched.h> #include <linux/slab.h> +#include <linux/blkdev.h> #include <linux/list_sort.h> #include "ctree.h" #include "transaction.h" @@ -92,7 +93,8 @@ */ #define LOG_WALK_PIN_ONLY 0 #define LOG_WALK_REPLAY_INODES 1 -#define LOG_WALK_REPLAY_ALL 2 +#define LOG_WALK_REPLAY_DIR_INDEX 2 +#define LOG_WALK_REPLAY_ALL 3 static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, @@ -279,11 +281,23 @@ static int process_one_buffer(struct btrfs_root *log, { int ret = 0; + /* + * If this fs is mixed then we need to be able to process the leaves to + * pin down any logged extents, so we have to read the block. + */ + if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) { + ret = btrfs_read_buffer(eb, gen); + if (ret) + return ret; + } + if (wc->pin) ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, eb->start, eb->len); if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { + if (wc->pin && btrfs_header_level(eb) == 0) + ret = btrfs_exclude_logged_extents(log, eb); if (wc->write) btrfs_write_tree_block(eb); if (wc->wait) @@ -380,6 +394,7 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, if (inode_item) { struct btrfs_inode_item *item; u64 nbytes; + u32 mode; item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); @@ -387,9 +402,19 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); btrfs_set_inode_nbytes(eb, item, nbytes); + + /* + * If this is a directory we need to reset the i_size to + * 0 so that we can set it up properly when replaying + * the rest of the items in this log. + */ + mode = btrfs_inode_mode(eb, item); + if (S_ISDIR(mode)) + btrfs_set_inode_size(eb, item, 0); } } else if (inode_item) { struct btrfs_inode_item *item; + u32 mode; /* * New inode, set nbytes to 0 so that the nbytes comes out @@ -397,6 +422,15 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, */ item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); btrfs_set_inode_nbytes(eb, item, 0); + + /* + * If this is a directory we need to reset the i_size to 0 so + * that we can set it up properly when replaying the rest of + * the items in this log. + */ + mode = btrfs_inode_mode(eb, item); + if (S_ISDIR(mode)) + btrfs_set_inode_size(eb, item, 0); } insert: btrfs_release_path(path); @@ -734,7 +768,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); if (ret) goto out; - btrfs_run_delayed_items(trans, root); + else + ret = btrfs_run_delayed_items(trans, root); out: kfree(name); iput(inode); @@ -910,7 +945,9 @@ again: kfree(victim_name); if (ret) return ret; - btrfs_run_delayed_items(trans, root); + ret = btrfs_run_delayed_items(trans, root); + if (ret) + return ret; *search_done = 1; goto again; } @@ -977,7 +1014,9 @@ again: inode, victim_name, victim_name_len); - btrfs_run_delayed_items(trans, root); + if (!ret) + ret = btrfs_run_delayed_items( + trans, root); } iput(victim_parent); kfree(victim_name); @@ -1478,6 +1517,7 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, iput(inode); return -EIO; } + ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); /* FIXME, put inode into FIXUP list */ @@ -1516,6 +1556,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, u8 log_type; int exists; int ret = 0; + bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); dir = read_one_inode(root, key->objectid); if (!dir) @@ -1523,8 +1564,10 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, name_len = btrfs_dir_name_len(eb, di); name = kmalloc(name_len, GFP_NOFS); - if (!name) - return -ENOMEM; + if (!name) { + ret = -ENOMEM; + goto out; + } log_type = btrfs_dir_type(eb, di); read_extent_buffer(eb, name, (unsigned long)(di + 1), @@ -1584,6 +1627,10 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, goto insert; out: btrfs_release_path(path); + if (!ret && update_size) { + btrfs_i_size_write(dir, dir->i_size + name_len * 2); + ret = btrfs_update_inode(trans, root, dir); + } kfree(name); iput(dir); return ret; @@ -1594,6 +1641,7 @@ insert: name, name_len, log_type, &log_key); if (ret && ret != -ENOENT) goto out; + update_size = false; ret = 0; goto out; } @@ -1797,7 +1845,7 @@ again: ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); if (!ret) - btrfs_run_delayed_items(trans, root); + ret = btrfs_run_delayed_items(trans, root); kfree(name); iput(inode); if (ret) @@ -2007,6 +2055,15 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, if (ret) break; } + + if (key.type == BTRFS_DIR_INDEX_KEY && + wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { + ret = replay_one_dir_item(wc->trans, root, path, + eb, i, &key); + if (ret) + break; + } + if (wc->stage < LOG_WALK_REPLAY_ALL) continue; @@ -2016,13 +2073,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, eb, i, &key); if (ret) break; - } else if (key.type == BTRFS_INODE_REF_KEY) { - ret = add_inode_ref(wc->trans, root, log, path, - eb, i, &key); - if (ret && ret != -ENOENT) - break; - ret = 0; - } else if (key.type == BTRFS_INODE_EXTREF_KEY) { + } else if (key.type == BTRFS_INODE_REF_KEY || + key.type == BTRFS_INODE_EXTREF_KEY) { ret = add_inode_ref(wc->trans, root, log, path, eb, i, &key); if (ret && ret != -ENOENT) @@ -2033,8 +2085,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, eb, i, &key); if (ret) break; - } else if (key.type == BTRFS_DIR_ITEM_KEY || - key.type == BTRFS_DIR_INDEX_KEY) { + } else if (key.type == BTRFS_DIR_ITEM_KEY) { ret = replay_one_dir_item(wc->trans, root, path, eb, i, &key); if (ret) @@ -2358,6 +2409,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; unsigned long log_transid = 0; + struct blk_plug plug; mutex_lock(&root->log_mutex); log_transid = root->log_transid; @@ -2401,8 +2453,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* we start IO on all the marked extents here, but we don't actually * wait for them until later. */ + blk_start_plug(&plug); ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); if (ret) { + blk_finish_plug(&plug); btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); @@ -2437,6 +2491,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } if (ret) { + blk_finish_plug(&plug); if (ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); mutex_unlock(&log_root_tree->log_mutex); @@ -2452,6 +2507,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { + blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); wait_log_commit(trans, log_root_tree, log_root_tree->log_transid); @@ -2474,6 +2530,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * check the full commit flag again */ if (root->fs_info->last_trans_log_full_commit == trans->transid) { + blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2481,9 +2538,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out_wake_log_root; } - ret = btrfs_write_and_wait_marked_extents(log_root_tree, - &log_root_tree->dirty_log_pages, - EXTENT_DIRTY | EXTENT_NEW); + ret = btrfs_write_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages, + EXTENT_DIRTY | EXTENT_NEW); + blk_finish_plug(&plug); if (ret) { btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); @@ -2491,6 +2549,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out_wake_log_root; } btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + btrfs_wait_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages, + EXTENT_NEW | EXTENT_DIRTY); btrfs_wait_logged_extents(log, log_transid); btrfs_set_super_log_root(root->fs_info->super_for_commit, @@ -3728,8 +3789,9 @@ next_slot: } log_extents: + btrfs_release_path(path); + btrfs_release_path(dst_path); if (fast_search) { - btrfs_release_path(dst_path); ret = btrfs_log_changed_extents(trans, root, inode, dst_path); if (ret) { err = ret; @@ -3746,8 +3808,6 @@ log_extents: } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { - btrfs_release_path(path); - btrfs_release_path(dst_path); ret = log_directory_changes(trans, root, inode, path, dst_path); if (ret) { err = ret; @@ -3781,6 +3841,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, int ret = 0; struct btrfs_root *root; struct dentry *old_parent = NULL; + struct inode *orig_inode = inode; /* * for regular files, if its inode is already on disk, we don't @@ -3800,7 +3861,14 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, } while (1) { - BTRFS_I(inode)->logged_trans = trans->transid; + /* + * If we are logging a directory then we start with our inode, + * not our parents inode, so we need to skipp setting the + * logged_trans so that further down in the log code we don't + * think this inode has already been logged. + */ + if (inode != orig_inode) + BTRFS_I(inode)->logged_trans = trans->transid; smp_mb(); if (BTRFS_I(inode)->last_unlink_trans > last_committed) { @@ -4016,8 +4084,7 @@ again: if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) break; - log = btrfs_read_fs_root_no_radix(log_root_tree, - &found_key); + log = btrfs_read_fs_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); btrfs_error(fs_info, ret, diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 7b417e20efe2..b0a523b2c60e 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -205,6 +205,10 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, u64 new_alloced = ulist->nodes_alloced + 128; struct ulist_node *new_nodes; void *old = NULL; + int i; + + for (i = 0; i < ulist->nnodes; i++) + rb_erase(&ulist->nodes[i].rb_node, &ulist->root); /* * if nodes_alloced == ULIST_SIZE no memory has been allocated @@ -224,6 +228,17 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, ulist->nodes = new_nodes; ulist->nodes_alloced = new_alloced; + + /* + * krealloc actually uses memcpy, which does not copy rb_node + * pointers, so we have to do it ourselves. Otherwise we may + * be bitten by crashes. + */ + for (i = 0; i < ulist->nnodes; i++) { + ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]); + if (ret < 0) + return ret; + } } ulist->nodes[ulist->nnodes].val = val; ulist->nodes[ulist->nnodes].aux = aux; diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c new file mode 100644 index 000000000000..dd0dea3766f7 --- /dev/null +++ b/fs/btrfs/uuid-tree.c @@ -0,0 +1,358 @@ +/* + * Copyright (C) STRATO AG 2013. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/uuid.h> +#include <asm/unaligned.h> +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "print-tree.h" + + +static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key) +{ + key->type = type; + key->objectid = get_unaligned_le64(uuid); + key->offset = get_unaligned_le64(uuid + sizeof(u64)); +} + +/* return -ENOENT for !found, < 0 for errors, or 0 if an item was found */ +static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid, + u8 type, u64 subid) +{ + int ret; + struct btrfs_path *path = NULL; + struct extent_buffer *eb; + int slot; + u32 item_size; + unsigned long offset; + struct btrfs_key key; + + if (WARN_ON_ONCE(!uuid_root)) { + ret = -ENOENT; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + btrfs_uuid_to_key(uuid, type, &key); + ret = btrfs_search_slot(NULL, uuid_root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -ENOENT; + goto out; + } + + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size_nr(eb, slot); + offset = btrfs_item_ptr_offset(eb, slot); + ret = -ENOENT; + + if (!IS_ALIGNED(item_size, sizeof(u64))) { + pr_warn("btrfs: uuid item with illegal size %lu!\n", + (unsigned long)item_size); + goto out; + } + while (item_size) { + __le64 data; + + read_extent_buffer(eb, &data, offset, sizeof(data)); + if (le64_to_cpu(data) == subid) { + ret = 0; + break; + } + offset += sizeof(data); + item_size -= sizeof(data); + } + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, + struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid_cpu) +{ + int ret; + struct btrfs_path *path = NULL; + struct btrfs_key key; + struct extent_buffer *eb; + int slot; + unsigned long offset; + __le64 subid_le; + + ret = btrfs_uuid_tree_lookup(uuid_root, uuid, type, subid_cpu); + if (ret != -ENOENT) + return ret; + + if (WARN_ON_ONCE(!uuid_root)) { + ret = -EINVAL; + goto out; + } + + btrfs_uuid_to_key(uuid, type, &key); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_insert_empty_item(trans, uuid_root, path, &key, + sizeof(subid_le)); + if (ret >= 0) { + /* Add an item for the type for the first time */ + eb = path->nodes[0]; + slot = path->slots[0]; + offset = btrfs_item_ptr_offset(eb, slot); + } else if (ret == -EEXIST) { + /* + * An item with that type already exists. + * Extend the item and store the new subid at the end. + */ + btrfs_extend_item(uuid_root, path, sizeof(subid_le)); + eb = path->nodes[0]; + slot = path->slots[0]; + offset = btrfs_item_ptr_offset(eb, slot); + offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le); + } else if (ret < 0) { + pr_warn("btrfs: insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!\n", + ret, (unsigned long long)key.objectid, + (unsigned long long)key.offset, type); + goto out; + } + + ret = 0; + subid_le = cpu_to_le64(subid_cpu); + write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le)); + btrfs_mark_buffer_dirty(eb); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, + struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid) +{ + int ret; + struct btrfs_path *path = NULL; + struct btrfs_key key; + struct extent_buffer *eb; + int slot; + unsigned long offset; + u32 item_size; + unsigned long move_dst; + unsigned long move_src; + unsigned long move_len; + + if (WARN_ON_ONCE(!uuid_root)) { + ret = -EINVAL; + goto out; + } + + btrfs_uuid_to_key(uuid, type, &key); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1); + if (ret < 0) { + pr_warn("btrfs: error %d while searching for uuid item!\n", + ret); + goto out; + } + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + eb = path->nodes[0]; + slot = path->slots[0]; + offset = btrfs_item_ptr_offset(eb, slot); + item_size = btrfs_item_size_nr(eb, slot); + if (!IS_ALIGNED(item_size, sizeof(u64))) { + pr_warn("btrfs: uuid item with illegal size %lu!\n", + (unsigned long)item_size); + ret = -ENOENT; + goto out; + } + while (item_size) { + __le64 read_subid; + + read_extent_buffer(eb, &read_subid, offset, sizeof(read_subid)); + if (le64_to_cpu(read_subid) == subid) + break; + offset += sizeof(read_subid); + item_size -= sizeof(read_subid); + } + + if (!item_size) { + ret = -ENOENT; + goto out; + } + + item_size = btrfs_item_size_nr(eb, slot); + if (item_size == sizeof(subid)) { + ret = btrfs_del_item(trans, uuid_root, path); + goto out; + } + + move_dst = offset; + move_src = offset + sizeof(subid); + move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot)); + memmove_extent_buffer(eb, move_dst, move_src, move_len); + btrfs_truncate_item(uuid_root, path, item_size - sizeof(subid), 1); + +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid) +{ + struct btrfs_trans_handle *trans; + int ret; + + /* 1 - for the uuid item */ + trans = btrfs_start_transaction(uuid_root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + ret = btrfs_uuid_tree_rem(trans, uuid_root, uuid, type, subid); + btrfs_end_transaction(trans, uuid_root); + +out: + return ret; +} + +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, + int (*check_func)(struct btrfs_fs_info *, u8 *, u8, + u64)) +{ + struct btrfs_root *root = fs_info->uuid_root; + struct btrfs_key key; + struct btrfs_key max_key; + struct btrfs_path *path; + int ret = 0; + struct extent_buffer *leaf; + int slot; + u32 item_size; + unsigned long offset; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = 0; + key.offset = 0; + max_key.objectid = (u64)-1; + max_key.type = (u8)-1; + max_key.offset = (u64)-1; + +again_search_slot: + path->keep_locks = 1; + ret = btrfs_search_forward(root, &key, &max_key, path, 0); + if (ret) { + if (ret > 0) + ret = 0; + goto out; + } + + while (1) { + cond_resched(); + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.type != BTRFS_UUID_KEY_SUBVOL && + key.type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) + goto skip; + + offset = btrfs_item_ptr_offset(leaf, slot); + item_size = btrfs_item_size_nr(leaf, slot); + if (!IS_ALIGNED(item_size, sizeof(u64))) { + pr_warn("btrfs: uuid item with illegal size %lu!\n", + (unsigned long)item_size); + goto skip; + } + while (item_size) { + u8 uuid[BTRFS_UUID_SIZE]; + __le64 subid_le; + u64 subid_cpu; + + put_unaligned_le64(key.objectid, uuid); + put_unaligned_le64(key.offset, uuid + sizeof(u64)); + read_extent_buffer(leaf, &subid_le, offset, + sizeof(subid_le)); + subid_cpu = le64_to_cpu(subid_le); + ret = check_func(fs_info, uuid, key.type, subid_cpu); + if (ret < 0) + goto out; + if (ret > 0) { + btrfs_release_path(path); + ret = btrfs_uuid_iter_rem(root, uuid, key.type, + subid_cpu); + if (ret == 0) { + /* + * this might look inefficient, but the + * justification is that it is an + * exception that check_func returns 1, + * and that in the regular case only one + * entry per UUID exists. + */ + goto again_search_slot; + } + if (ret < 0 && ret != -ENOENT) + goto out; + } + item_size -= sizeof(subid_le); + offset += sizeof(subid_le); + } + +skip: + ret = btrfs_next_item(root, path); + if (ret == 0) + continue; + else if (ret > 0) + ret = 0; + break; + } + +out: + btrfs_free_path(path); + if (ret) + pr_warn("btrfs: btrfs_uuid_tree_iterate failed %d\n", ret); + return 0; +} diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h deleted file mode 100644 index 9bf3946d5ef2..000000000000 --- a/fs/btrfs/version.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef __BTRFS_VERSION_H -#define __BTRFS_VERSION_H -#define BTRFS_BUILD_VERSION "Btrfs" -#endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8bffb9174afb..043b215769c2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -26,6 +26,7 @@ #include <linux/ratelimit.h> #include <linux/kthread.h> #include <linux/raid/pq.h> +#include <linux/semaphore.h> #include <asm/div64.h> #include "compat.h" #include "ctree.h" @@ -62,6 +63,48 @@ static void unlock_chunks(struct btrfs_root *root) mutex_unlock(&root->fs_info->chunk_mutex); } +static struct btrfs_fs_devices *__alloc_fs_devices(void) +{ + struct btrfs_fs_devices *fs_devs; + + fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS); + if (!fs_devs) + return ERR_PTR(-ENOMEM); + + mutex_init(&fs_devs->device_list_mutex); + + INIT_LIST_HEAD(&fs_devs->devices); + INIT_LIST_HEAD(&fs_devs->alloc_list); + INIT_LIST_HEAD(&fs_devs->list); + + return fs_devs; +} + +/** + * alloc_fs_devices - allocate struct btrfs_fs_devices + * @fsid: a pointer to UUID for this FS. If NULL a new UUID is + * generated. + * + * Return: a pointer to a new &struct btrfs_fs_devices on success; + * ERR_PTR() on error. Returned struct is not linked onto any lists and + * can be destroyed with kfree() right away. + */ +static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) +{ + struct btrfs_fs_devices *fs_devs; + + fs_devs = __alloc_fs_devices(); + if (IS_ERR(fs_devs)) + return fs_devs; + + if (fsid) + memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); + else + generate_random_uuid(fs_devs->fsid); + + return fs_devs; +} + static void free_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device; @@ -101,6 +144,27 @@ void btrfs_cleanup_fs_uuids(void) } } +static struct btrfs_device *__alloc_device(void) +{ + struct btrfs_device *dev; + + dev = kzalloc(sizeof(*dev), GFP_NOFS); + if (!dev) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&dev->dev_list); + INIT_LIST_HEAD(&dev->dev_alloc_list); + + spin_lock_init(&dev->io_lock); + + spin_lock_init(&dev->reada_lock); + atomic_set(&dev->reada_in_flight, 0); + INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT); + + return dev; +} + static noinline struct btrfs_device *__find_device(struct list_head *head, u64 devid, u8 *uuid) { @@ -395,16 +459,14 @@ static noinline int device_list_add(const char *path, fs_devices = find_fsid(disk_super->fsid); if (!fs_devices) { - fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); - if (!fs_devices) - return -ENOMEM; - INIT_LIST_HEAD(&fs_devices->devices); - INIT_LIST_HEAD(&fs_devices->alloc_list); + fs_devices = alloc_fs_devices(disk_super->fsid); + if (IS_ERR(fs_devices)) + return PTR_ERR(fs_devices); + list_add(&fs_devices->list, &fs_uuids); - memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); fs_devices->latest_devid = devid; fs_devices->latest_trans = found_transid; - mutex_init(&fs_devices->device_list_mutex); + device = NULL; } else { device = __find_device(&fs_devices->devices, devid, @@ -414,17 +476,12 @@ static noinline int device_list_add(const char *path, if (fs_devices->opened) return -EBUSY; - device = kzalloc(sizeof(*device), GFP_NOFS); - if (!device) { + device = btrfs_alloc_device(NULL, &devid, + disk_super->dev_item.uuid); + if (IS_ERR(device)) { /* we can safely leave the fs_devices entry around */ - return -ENOMEM; + return PTR_ERR(device); } - device->devid = devid; - device->dev_stats_valid = 0; - device->work.func = pending_bios_fn; - memcpy(device->uuid, disk_super->dev_item.uuid, - BTRFS_UUID_SIZE); - spin_lock_init(&device->io_lock); name = rcu_string_strdup(path, GFP_NOFS); if (!name) { @@ -432,22 +489,13 @@ static noinline int device_list_add(const char *path, return -ENOMEM; } rcu_assign_pointer(device->name, name); - INIT_LIST_HEAD(&device->dev_alloc_list); - - /* init readahead state */ - spin_lock_init(&device->reada_lock); - device->reada_curr_zone = NULL; - atomic_set(&device->reada_in_flight, 0); - device->reada_next = 0; - INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT); - INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT); mutex_lock(&fs_devices->device_list_mutex); list_add_rcu(&device->dev_list, &fs_devices->devices); + fs_devices->num_devices++; mutex_unlock(&fs_devices->device_list_mutex); device->fs_devices = fs_devices; - fs_devices->num_devices++; } else if (!device->name || strcmp(device->name->str, path)) { name = rcu_string_strdup(path, GFP_NOFS); if (!name) @@ -474,25 +522,21 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) struct btrfs_device *device; struct btrfs_device *orig_dev; - fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); - if (!fs_devices) - return ERR_PTR(-ENOMEM); + fs_devices = alloc_fs_devices(orig->fsid); + if (IS_ERR(fs_devices)) + return fs_devices; - INIT_LIST_HEAD(&fs_devices->devices); - INIT_LIST_HEAD(&fs_devices->alloc_list); - INIT_LIST_HEAD(&fs_devices->list); - mutex_init(&fs_devices->device_list_mutex); fs_devices->latest_devid = orig->latest_devid; fs_devices->latest_trans = orig->latest_trans; fs_devices->total_devices = orig->total_devices; - memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); /* We have held the volume lock, it is safe to get the devices. */ list_for_each_entry(orig_dev, &orig->devices, dev_list) { struct rcu_string *name; - device = kzalloc(sizeof(*device), GFP_NOFS); - if (!device) + device = btrfs_alloc_device(NULL, &orig_dev->devid, + orig_dev->uuid); + if (IS_ERR(device)) goto error; /* @@ -506,13 +550,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) } rcu_assign_pointer(device->name, name); - device->devid = orig_dev->devid; - device->work.func = pending_bios_fn; - memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); - spin_lock_init(&device->io_lock); - INIT_LIST_HEAD(&device->dev_list); - INIT_LIST_HEAD(&device->dev_alloc_list); - list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; fs_devices->num_devices++; @@ -636,23 +673,22 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) if (device->can_discard) fs_devices->num_can_discard--; + if (device->missing) + fs_devices->missing_devices--; - new_device = kmalloc(sizeof(*new_device), GFP_NOFS); - BUG_ON(!new_device); /* -ENOMEM */ - memcpy(new_device, device, sizeof(*new_device)); + new_device = btrfs_alloc_device(NULL, &device->devid, + device->uuid); + BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ /* Safe because we are under uuid_mutex */ if (device->name) { name = rcu_string_strdup(device->name->str, GFP_NOFS); - BUG_ON(device->name && !name); /* -ENOMEM */ + BUG_ON(!name); /* -ENOMEM */ rcu_assign_pointer(new_device->name, name); } - new_device->bdev = NULL; - new_device->writeable = 0; - new_device->in_fs_metadata = 0; - new_device->can_discard = 0; - spin_lock_init(&new_device->io_lock); + list_replace_rcu(&device->dev_list, &new_device->dev_list); + new_device->fs_devices = device->fs_devices; call_rcu(&device->rcu, free_device); } @@ -760,7 +796,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fs_devices->rotating = 1; fs_devices->open_devices++; - if (device->writeable && !device->is_tgtdev_for_dev_replace) { + if (device->writeable && + device->devid != BTRFS_DEV_REPLACE_DEVID) { fs_devices->rw_devices++; list_add(&device->dev_alloc_list, &fs_devices->alloc_list); @@ -865,7 +902,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, disk_super = p + (bytenr & ~PAGE_CACHE_MASK); if (btrfs_super_bytenr(disk_super) != bytenr || - disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) + btrfs_super_magic(disk_super) != BTRFS_MAGIC) goto error_unmap; devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -875,13 +912,12 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, if (disk_super->label[0]) { if (disk_super->label[BTRFS_LABEL_SIZE - 1]) disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; - printk(KERN_INFO "device label %s ", disk_super->label); + printk(KERN_INFO "btrfs: device label %s ", disk_super->label); } else { - printk(KERN_INFO "device fsid %pU ", disk_super->fsid); + printk(KERN_INFO "btrfs: device fsid %pU ", disk_super->fsid); } - printk(KERN_CONT "devid %llu transid %llu %s\n", - (unsigned long long)devid, (unsigned long long)transid, path); + printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); ret = device_list_add(path, disk_super, devid, fs_devices_ret); if (!ret && fs_devices_ret) @@ -982,6 +1018,35 @@ out: return ret; } +static int contains_pending_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 *start, u64 len) +{ + struct extent_map *em; + int ret = 0; + + list_for_each_entry(em, &trans->transaction->pending_chunks, list) { + struct map_lookup *map; + int i; + + map = (struct map_lookup *)em->bdev; + for (i = 0; i < map->num_stripes; i++) { + if (map->stripes[i].dev != device) + continue; + if (map->stripes[i].physical >= *start + len || + map->stripes[i].physical + em->orig_block_len <= + *start) + continue; + *start = map->stripes[i].physical + + em->orig_block_len; + ret = 1; + } + } + + return ret; +} + + /* * find_free_dev_extent - find free space in the specified device * @device: the device which we search the free space in @@ -1002,7 +1067,8 @@ out: * But if we don't find suitable free space, it is used to store the size of * the max free space. */ -int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *len) { struct btrfs_key key; @@ -1026,21 +1092,22 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, */ search_start = max(root->fs_info->alloc_start, 1024ull * 1024); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; +again: max_hole_start = search_start; max_hole_size = 0; hole_size = 0; if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { ret = -ENOSPC; - goto error; + goto out; } - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto error; - } path->reada = 2; + path->search_commit_root = 1; + path->skip_locking = 1; key.objectid = device->devid; key.offset = search_start; @@ -1081,6 +1148,15 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, if (key.offset > search_start) { hole_size = key.offset - search_start; + /* + * Have to check before we set max_hole_start, otherwise + * we could end up sending back this offset anyway. + */ + if (contains_pending_extent(trans, device, + &search_start, + hole_size)) + hole_size = 0; + if (hole_size > max_hole_size) { max_hole_start = search_start; max_hole_size = hole_size; @@ -1124,6 +1200,11 @@ next: max_hole_size = hole_size; } + if (contains_pending_extent(trans, device, &search_start, hole_size)) { + btrfs_release_path(path); + goto again; + } + /* See above. */ if (hole_size < num_bytes) ret = -ENOSPC; @@ -1132,7 +1213,6 @@ next: out: btrfs_free_path(path); -error: *start = max_hole_start; if (len) *len = max_hole_size; @@ -1234,8 +1314,7 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), - BTRFS_UUID_SIZE); + btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE); btrfs_set_dev_extent_length(leaf, extent, num_bytes); btrfs_mark_buffer_dirty(leaf); @@ -1244,59 +1323,33 @@ out: return ret; } -static noinline int find_next_chunk(struct btrfs_root *root, - u64 objectid, u64 *offset) +static u64 find_next_chunk(struct btrfs_fs_info *fs_info) { - struct btrfs_path *path; - int ret; - struct btrfs_key key; - struct btrfs_chunk *chunk; - struct btrfs_key found_key; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = objectid; - key.offset = (u64)-1; - key.type = BTRFS_CHUNK_ITEM_KEY; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto error; - - BUG_ON(ret == 0); /* Corruption */ + struct extent_map_tree *em_tree; + struct extent_map *em; + struct rb_node *n; + u64 ret = 0; - ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); - if (ret) { - *offset = 0; - } else { - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); - if (found_key.objectid != objectid) - *offset = 0; - else { - chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_chunk); - *offset = found_key.offset + - btrfs_chunk_length(path->nodes[0], chunk); - } + em_tree = &fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + n = rb_last(&em_tree->map); + if (n) { + em = rb_entry(n, struct extent_map, rb_node); + ret = em->start + em->len; } - ret = 0; -error: - btrfs_free_path(path); + read_unlock(&em_tree->lock); + return ret; } -static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) +static noinline int find_next_devid(struct btrfs_fs_info *fs_info, + u64 *devid_ret) { int ret; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_path *path; - root = root->fs_info->chunk_root; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1305,20 +1358,21 @@ static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) key.type = BTRFS_DEV_ITEM_KEY; key.offset = (u64)-1; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); if (ret < 0) goto error; BUG_ON(ret == 0); /* Corruption */ - ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, + ret = btrfs_previous_item(fs_info->chunk_root, path, + BTRFS_DEV_ITEMS_OBJECTID, BTRFS_DEV_ITEM_KEY); if (ret) { - *objectid = 1; + *devid_ret = 1; } else { btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - *objectid = found_key.offset + 1; + *devid_ret = found_key.offset + 1; } ret = 0; error: @@ -1372,9 +1426,9 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans, btrfs_set_device_bandwidth(leaf, dev_item, 0); btrfs_set_device_start_offset(leaf, dev_item, 0); - ptr = (unsigned long)btrfs_device_uuid(dev_item); + ptr = btrfs_device_uuid(dev_item); write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); - ptr = (unsigned long)btrfs_device_fsid(dev_item); + ptr = btrfs_device_fsid(dev_item); write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); @@ -1462,31 +1516,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) btrfs_dev_replace_unlock(&root->fs_info->dev_replace); if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { - printk(KERN_ERR "btrfs: unable to go below four devices " - "on raid10\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; goto out; } if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { - printk(KERN_ERR "btrfs: unable to go below two " - "devices on raid1\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET; goto out; } if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && root->fs_info->fs_devices->rw_devices <= 2) { - printk(KERN_ERR "btrfs: unable to go below two " - "devices on raid5\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET; goto out; } if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && root->fs_info->fs_devices->rw_devices <= 3) { - printk(KERN_ERR "btrfs: unable to go below three " - "devices on raid6\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET; goto out; } @@ -1512,8 +1558,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) bh = NULL; disk_super = NULL; if (!device) { - printk(KERN_ERR "btrfs: no missing devices found to " - "remove\n"); + ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; goto out; } } else { @@ -1535,15 +1580,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } if (device->is_tgtdev_for_dev_replace) { - pr_err("btrfs: unable to remove the dev_replace target dev\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_TGT_REPLACE; goto error_brelse; } if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { - printk(KERN_ERR "btrfs: unable to remove the only writeable " - "device\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; goto error_brelse; } @@ -1555,7 +1597,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) clear_super = true; } + mutex_unlock(&uuid_mutex); ret = btrfs_shrink_device(device, 0); + mutex_lock(&uuid_mutex); if (ret) goto error_undo; @@ -1579,7 +1623,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) /* * the device list mutex makes sure that we don't change * the device list while someone else is writing out all - * the device supers. + * the device supers. Whoever is writing all supers, should + * lock the device list mutex before getting the number of + * devices in the super block (super_copy). Conversely, + * whoever updates the number of devices in the super block + * (super_copy) should hold the device list mutex. */ cur_devices = device->fs_devices; @@ -1603,10 +1651,10 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) device->fs_devices->open_devices--; call_rcu(&device->rcu, free_device); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (cur_devices->open_devices == 0) { struct btrfs_fs_devices *fs_devices; @@ -1668,6 +1716,7 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev) { WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); + list_del_rcu(&srcdev->dev_list); list_del_rcu(&srcdev->dev_alloc_list); fs_info->fs_devices->num_devices--; @@ -1677,9 +1726,13 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, } if (srcdev->can_discard) fs_info->fs_devices->num_can_discard--; - if (srcdev->bdev) + if (srcdev->bdev) { fs_info->fs_devices->open_devices--; + /* zero out the old super */ + btrfs_scratch_superblock(srcdev); + } + call_rcu(&srcdev->rcu, free_device); } @@ -1786,9 +1839,9 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) if (!fs_devices->seeding) return -EINVAL; - seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); - if (!seed_devices) - return -ENOMEM; + seed_devices = __alloc_fs_devices(); + if (IS_ERR(seed_devices)) + return PTR_ERR(seed_devices); old_devices = clone_fs_devices(fs_devices); if (IS_ERR(old_devices)) { @@ -1807,7 +1860,6 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, synchronize_rcu); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); list_for_each_entry(device, &seed_devices->devices, dev_list) { @@ -1823,6 +1875,8 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) generate_random_uuid(fs_devices->fsid); memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + super_flags = btrfs_super_flags(disk_super) & ~BTRFS_SUPER_FLAG_SEEDING; btrfs_set_super_flags(disk_super, super_flags); @@ -1882,11 +1936,9 @@ next_slot: dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); devid = btrfs_device_id(leaf, dev_item); - read_extent_buffer(leaf, dev_uuid, - (unsigned long)btrfs_device_uuid(dev_item), + read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); - read_extent_buffer(leaf, fs_uuid, - (unsigned long)btrfs_device_fsid(dev_item), + read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_UUID_SIZE); device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); @@ -1949,10 +2001,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) } mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - device = kzalloc(sizeof(*device), GFP_NOFS); - if (!device) { + device = btrfs_alloc_device(root->fs_info, NULL, NULL); + if (IS_ERR(device)) { /* we can safely leave the fs_devices entry around */ - ret = -ENOMEM; + ret = PTR_ERR(device); goto error; } @@ -1964,13 +2016,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) } rcu_assign_pointer(device->name, name); - ret = find_next_devid(root, &device->devid); - if (ret) { - rcu_string_free(device->name); - kfree(device); - goto error; - } - trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { rcu_string_free(device->name); @@ -1985,9 +2030,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (blk_queue_discard(q)) device->can_discard = 1; device->writeable = 1; - device->work.func = pending_bios_fn; - generate_random_uuid(device->uuid); - spin_lock_init(&device->io_lock); device->generation = trans->transid; device->io_width = root->sectorsize; device->io_align = root->sectorsize; @@ -2114,6 +2156,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, struct btrfs_fs_info *fs_info = root->fs_info; struct list_head *devices; struct rcu_string *name; + u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; *device_out = NULL; @@ -2135,9 +2178,9 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, } } - device = kzalloc(sizeof(*device), GFP_NOFS); - if (!device) { - ret = -ENOMEM; + device = btrfs_alloc_device(NULL, &devid, NULL); + if (IS_ERR(device)) { + ret = PTR_ERR(device); goto error; } @@ -2154,10 +2197,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, device->can_discard = 1; mutex_lock(&root->fs_info->fs_devices->device_list_mutex); device->writeable = 1; - device->work.func = pending_bios_fn; - generate_random_uuid(device->uuid); - device->devid = BTRFS_DEV_REPLACE_DEVID; - spin_lock_init(&device->io_lock); device->generation = 0; device->io_width = root->sectorsize; device->io_align = root->sectorsize; @@ -2964,10 +3003,6 @@ again: if (found_key.objectid != key.objectid) break; - /* chunk zero is special */ - if (found_key.offset == 0) - break; - chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); if (!counting) { @@ -3003,6 +3038,8 @@ again: spin_unlock(&fs_info->balance_lock); } loop: + if (found_key.offset == 0) + break; key.offset = found_key.offset - 1; } @@ -3067,9 +3104,6 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } -void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, - struct btrfs_ioctl_balance_args *bargs); - /* * Should be called with both balance and volume mutexes held */ @@ -3132,7 +3166,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, (bctl->data.target & ~allowed))) { printk(KERN_ERR "btrfs: unable to start balance with target " "data profile %llu\n", - (unsigned long long)bctl->data.target); + bctl->data.target); ret = -EINVAL; goto out; } @@ -3141,7 +3175,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, (bctl->meta.target & ~allowed))) { printk(KERN_ERR "btrfs: unable to start balance with target " "metadata profile %llu\n", - (unsigned long long)bctl->meta.target); + bctl->meta.target); ret = -EINVAL; goto out; } @@ -3150,7 +3184,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, (bctl->sys.target & ~allowed))) { printk(KERN_ERR "btrfs: unable to start balance with target " "system profile %llu\n", - (unsigned long long)bctl->sys.target); + bctl->sys.target); ret = -EINVAL; goto out; } @@ -3295,10 +3329,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) } tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); - if (IS_ERR(tsk)) - return PTR_ERR(tsk); - - return 0; + return PTR_ERR_OR_ZERO(tsk); } int btrfs_recover_balance(struct btrfs_fs_info *fs_info) @@ -3426,6 +3457,264 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) return 0; } +static int btrfs_uuid_scan_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_key key; + struct btrfs_key max_key; + struct btrfs_path *path = NULL; + int ret = 0; + struct extent_buffer *eb; + int slot; + struct btrfs_root_item root_item; + u32 item_size; + struct btrfs_trans_handle *trans = NULL; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; + + max_key.objectid = (u64)-1; + max_key.type = BTRFS_ROOT_ITEM_KEY; + max_key.offset = (u64)-1; + + path->keep_locks = 1; + + while (1) { + ret = btrfs_search_forward(root, &key, &max_key, path, 0); + if (ret) { + if (ret > 0) + ret = 0; + break; + } + + if (key.type != BTRFS_ROOT_ITEM_KEY || + (key.objectid < BTRFS_FIRST_FREE_OBJECTID && + key.objectid != BTRFS_FS_TREE_OBJECTID) || + key.objectid > BTRFS_LAST_FREE_OBJECTID) + goto skip; + + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size_nr(eb, slot); + if (item_size < sizeof(root_item)) + goto skip; + + read_extent_buffer(eb, &root_item, + btrfs_item_ptr_offset(eb, slot), + (int)sizeof(root_item)); + if (btrfs_root_refs(&root_item) == 0) + goto skip; + + if (!btrfs_is_empty_uuid(root_item.uuid) || + !btrfs_is_empty_uuid(root_item.received_uuid)) { + if (trans) + goto update_tree; + + btrfs_release_path(path); + /* + * 1 - subvol uuid item + * 1 - received_subvol uuid item + */ + trans = btrfs_start_transaction(fs_info->uuid_root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + continue; + } else { + goto skip; + } +update_tree: + if (!btrfs_is_empty_uuid(root_item.uuid)) { + ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, + root_item.uuid, + BTRFS_UUID_KEY_SUBVOL, + key.objectid); + if (ret < 0) { + pr_warn("btrfs: uuid_tree_add failed %d\n", + ret); + break; + } + } + + if (!btrfs_is_empty_uuid(root_item.received_uuid)) { + ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, + root_item.received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + key.objectid); + if (ret < 0) { + pr_warn("btrfs: uuid_tree_add failed %d\n", + ret); + break; + } + } + +skip: + if (trans) { + ret = btrfs_end_transaction(trans, fs_info->uuid_root); + trans = NULL; + if (ret) + break; + } + + btrfs_release_path(path); + if (key.offset < (u64)-1) { + key.offset++; + } else if (key.type < BTRFS_ROOT_ITEM_KEY) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + } else if (key.objectid < (u64)-1) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.objectid++; + } else { + break; + } + cond_resched(); + } + +out: + btrfs_free_path(path); + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans, fs_info->uuid_root); + if (ret) + pr_warn("btrfs: btrfs_uuid_scan_kthread failed %d\n", ret); + else + fs_info->update_uuid_tree_gen = 1; + up(&fs_info->uuid_tree_rescan_sem); + return 0; +} + +/* + * Callback for btrfs_uuid_tree_iterate(). + * returns: + * 0 check succeeded, the entry is not outdated. + * < 0 if an error occured. + * > 0 if the check failed, which means the caller shall remove the entry. + */ +static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, + u8 *uuid, u8 type, u64 subid) +{ + struct btrfs_key key; + int ret = 0; + struct btrfs_root *subvol_root; + + if (type != BTRFS_UUID_KEY_SUBVOL && + type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) + goto out; + + key.objectid = subid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(subvol_root)) { + ret = PTR_ERR(subvol_root); + if (ret == -ENOENT) + ret = 1; + goto out; + } + + switch (type) { + case BTRFS_UUID_KEY_SUBVOL: + if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) + ret = 1; + break; + case BTRFS_UUID_KEY_RECEIVED_SUBVOL: + if (memcmp(uuid, subvol_root->root_item.received_uuid, + BTRFS_UUID_SIZE)) + ret = 1; + break; + } + +out: + return ret; +} + +static int btrfs_uuid_rescan_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; + int ret; + + /* + * 1st step is to iterate through the existing UUID tree and + * to delete all entries that contain outdated data. + * 2nd step is to add all missing entries to the UUID tree. + */ + ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); + if (ret < 0) { + pr_warn("btrfs: iterating uuid_tree failed %d\n", ret); + up(&fs_info->uuid_tree_rescan_sem); + return ret; + } + return btrfs_uuid_scan_kthread(data); +} + +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *uuid_root; + struct task_struct *task; + int ret; + + /* + * 1 - root node + * 1 - root item + */ + trans = btrfs_start_transaction(tree_root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + uuid_root = btrfs_create_tree(trans, fs_info, + BTRFS_UUID_TREE_OBJECTID); + if (IS_ERR(uuid_root)) { + btrfs_abort_transaction(trans, tree_root, + PTR_ERR(uuid_root)); + return PTR_ERR(uuid_root); + } + + fs_info->uuid_root = uuid_root; + + ret = btrfs_commit_transaction(trans, tree_root); + if (ret) + return ret; + + down(&fs_info->uuid_tree_rescan_sem); + task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); + if (IS_ERR(task)) { + /* fs_info->update_uuid_tree_gen remains 0 in all error case */ + pr_warn("btrfs: failed to start uuid_scan task\n"); + up(&fs_info->uuid_tree_rescan_sem); + return PTR_ERR(task); + } + + return 0; +} + +int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) +{ + struct task_struct *task; + + down(&fs_info->uuid_tree_rescan_sem); + task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); + if (IS_ERR(task)) { + /* fs_info->update_uuid_tree_gen remains 0 in all error case */ + pr_warn("btrfs: failed to start uuid_rescan task\n"); + up(&fs_info->uuid_tree_rescan_sem); + return PTR_ERR(task); + } + + return 0; +} + /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. @@ -3681,10 +3970,8 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) } static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct map_lookup **map_ret, - u64 *num_bytes_out, u64 *stripe_size_out, - u64 start, u64 type) + struct btrfs_root *extent_root, u64 start, + u64 type) { struct btrfs_fs_info *info = extent_root->fs_info; struct btrfs_fs_devices *fs_devices = info->fs_devices; @@ -3791,7 +4078,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (total_avail == 0) continue; - ret = find_free_dev_extent(device, + ret = find_free_dev_extent(trans, device, max_stripe_size * dev_stripes, &dev_offset, &max_avail); if (ret && ret != -ENOSPC) @@ -3903,12 +4190,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, map->type = type; map->sub_stripes = sub_stripes; - *map_ret = map; num_bytes = stripe_size * data_stripes; - *stripe_size_out = stripe_size; - *num_bytes_out = num_bytes; - trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); em = alloc_extent_map(); @@ -3921,38 +4204,26 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, em->len = num_bytes; em->block_start = 0; em->block_len = em->len; + em->orig_block_len = stripe_size; em_tree = &extent_root->fs_info->mapping_tree.map_tree; write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + if (!ret) { + list_add_tail(&em->list, &trans->transaction->pending_chunks); + atomic_inc(&em->refs); + } write_unlock(&em_tree->lock); if (ret) { free_extent_map(em); goto error; } - for (i = 0; i < map->num_stripes; ++i) { - struct btrfs_device *device; - u64 dev_offset; - - device = map->stripes[i].dev; - dev_offset = map->stripes[i].physical; - - ret = btrfs_alloc_dev_extent(trans, device, - info->chunk_root->root_key.objectid, - BTRFS_FIRST_CHUNK_TREE_OBJECTID, - start, dev_offset, stripe_size); - if (ret) - goto error_dev_extent; - } - ret = btrfs_make_block_group(trans, extent_root, 0, type, BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes); - if (ret) { - i = map->num_stripes - 1; - goto error_dev_extent; - } + if (ret) + goto error_del_extent; free_extent_map(em); check_raid56_incompat_flag(extent_root->fs_info, type); @@ -3960,18 +4231,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, kfree(devices_info); return 0; -error_dev_extent: - for (; i >= 0; i--) { - struct btrfs_device *device; - int err; - - device = map->stripes[i].dev; - err = btrfs_free_dev_extent(trans, device, start); - if (err) { - btrfs_abort_transaction(trans, extent_root, err); - break; - } - } +error_del_extent: write_lock(&em_tree->lock); remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); @@ -3986,33 +4246,68 @@ error: return ret; } -static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, +int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, - struct map_lookup *map, u64 chunk_offset, - u64 chunk_size, u64 stripe_size) + u64 chunk_offset, u64 chunk_size) { - u64 dev_offset; struct btrfs_key key; struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; struct btrfs_device *device; struct btrfs_chunk *chunk; struct btrfs_stripe *stripe; - size_t item_size = btrfs_chunk_item_size(map->num_stripes); - int index = 0; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct map_lookup *map; + size_t item_size; + u64 dev_offset; + u64 stripe_size; + int i = 0; int ret; + em_tree = &extent_root->fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size); + read_unlock(&em_tree->lock); + + if (!em) { + btrfs_crit(extent_root->fs_info, "unable to find logical " + "%Lu len %Lu", chunk_offset, chunk_size); + return -EINVAL; + } + + if (em->start != chunk_offset || em->len != chunk_size) { + btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted" + " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset, + chunk_size, em->start, em->len); + free_extent_map(em); + return -EINVAL; + } + + map = (struct map_lookup *)em->bdev; + item_size = btrfs_chunk_item_size(map->num_stripes); + stripe_size = em->orig_block_len; + chunk = kzalloc(item_size, GFP_NOFS); - if (!chunk) - return -ENOMEM; + if (!chunk) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + dev_offset = map->stripes[i].physical; - index = 0; - while (index < map->num_stripes) { - device = map->stripes[index].dev; device->bytes_used += stripe_size; ret = btrfs_update_device(trans, device); if (ret) - goto out_free; - index++; + goto out; + ret = btrfs_alloc_dev_extent(trans, device, + chunk_root->root_key.objectid, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + chunk_offset, dev_offset, + stripe_size); + if (ret) + goto out; } spin_lock(&extent_root->fs_info->free_chunk_lock); @@ -4020,17 +4315,15 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, map->num_stripes); spin_unlock(&extent_root->fs_info->free_chunk_lock); - index = 0; stripe = &chunk->stripe; - while (index < map->num_stripes) { - device = map->stripes[index].dev; - dev_offset = map->stripes[index].physical; + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + dev_offset = map->stripes[i].physical; btrfs_set_stack_stripe_devid(stripe, device->devid); btrfs_set_stack_stripe_offset(stripe, dev_offset); memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); stripe++; - index++; } btrfs_set_stack_chunk_length(chunk, chunk_size); @@ -4048,7 +4341,6 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, key.offset = chunk_offset; ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); - if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { /* * TODO: Cleanup of inserted chunk root in case of @@ -4058,8 +4350,9 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, item_size); } -out_free: +out: kfree(chunk); + free_extent_map(em); return ret; } @@ -4074,27 +4367,9 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 type) { u64 chunk_offset; - u64 chunk_size; - u64 stripe_size; - struct map_lookup *map; - struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; - int ret; - ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, - &chunk_offset); - if (ret) - return ret; - - ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, - &stripe_size, chunk_offset, type); - if (ret) - return ret; - - ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, - chunk_size, stripe_size); - if (ret) - return ret; - return 0; + chunk_offset = find_next_chunk(extent_root->fs_info); + return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type); } static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, @@ -4103,66 +4378,31 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, { u64 chunk_offset; u64 sys_chunk_offset; - u64 chunk_size; - u64 sys_chunk_size; - u64 stripe_size; - u64 sys_stripe_size; u64 alloc_profile; - struct map_lookup *map; - struct map_lookup *sys_map; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *extent_root = fs_info->extent_root; int ret; - ret = find_next_chunk(fs_info->chunk_root, - BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); - if (ret) - return ret; - + chunk_offset = find_next_chunk(fs_info); alloc_profile = btrfs_get_alloc_profile(extent_root, 0); - ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, - &stripe_size, chunk_offset, alloc_profile); + ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset, + alloc_profile); if (ret) return ret; - sys_chunk_offset = chunk_offset + chunk_size; - + sys_chunk_offset = find_next_chunk(root->fs_info); alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); - ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, - &sys_chunk_size, &sys_stripe_size, - sys_chunk_offset, alloc_profile); + ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset, + alloc_profile); if (ret) { btrfs_abort_transaction(trans, root, ret); goto out; } ret = btrfs_add_device(trans, fs_info->chunk_root, device); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto out; - } - - /* - * Modifying chunk tree needs allocating new blocks from both - * system block group and metadata block group. So we only can - * do operations require modifying the chunk tree after both - * block groups were created. - */ - ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, - chunk_size, stripe_size); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto out; - } - - ret = __finish_chunk_alloc(trans, extent_root, sys_map, - sys_chunk_offset, sys_chunk_size, - sys_stripe_size); if (ret) btrfs_abort_transaction(trans, root, ret); - out: - return ret; } @@ -4239,13 +4479,13 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) * and exit, so return 1 so the callers don't try to use other copies. */ if (!em) { - btrfs_emerg(fs_info, "No mapping for %Lu-%Lu\n", logical, + btrfs_crit(fs_info, "No mapping for %Lu-%Lu\n", logical, logical+len); return 1; } if (em->start > logical || em->start + em->len < logical) { - btrfs_emerg(fs_info, "Invalid mapping for %Lu-%Lu, got " + btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got " "%Lu-%Lu\n", logical, logical+len, em->start, em->start + em->len); return 1; @@ -4420,8 +4660,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (!em) { btrfs_crit(fs_info, "unable to find logical %llu len %llu", - (unsigned long long)logical, - (unsigned long long)*length); + logical, *length); return -EINVAL; } @@ -4435,9 +4674,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, map = (struct map_lookup *)em->bdev; offset = logical - em->start; - if (mirror_num > map->num_stripes) - mirror_num = 0; - stripe_len = map->stripe_len; stripe_nr = offset; /* @@ -4719,6 +4955,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); if (!bbio) { + kfree(raid_map); ret = -ENOMEM; goto out; } @@ -5294,9 +5531,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, if (map_length < length) { btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu", - (unsigned long long)logical, - (unsigned long long)length, - (unsigned long long)map_length); + logical, length, map_length); BUG(); } @@ -5362,24 +5597,72 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root, struct btrfs_device *device; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; - device = kzalloc(sizeof(*device), GFP_NOFS); - if (!device) + device = btrfs_alloc_device(NULL, &devid, dev_uuid); + if (IS_ERR(device)) return NULL; - list_add(&device->dev_list, - &fs_devices->devices); - device->dev_root = root->fs_info->dev_root; - device->devid = devid; - device->work.func = pending_bios_fn; + + list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; - device->missing = 1; fs_devices->num_devices++; + + device->missing = 1; fs_devices->missing_devices++; - spin_lock_init(&device->io_lock); - INIT_LIST_HEAD(&device->dev_alloc_list); - memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); + return device; } +/** + * btrfs_alloc_device - allocate struct btrfs_device + * @fs_info: used only for generating a new devid, can be NULL if + * devid is provided (i.e. @devid != NULL). + * @devid: a pointer to devid for this device. If NULL a new devid + * is generated. + * @uuid: a pointer to UUID for this device. If NULL a new UUID + * is generated. + * + * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() + * on error. Returned struct is not linked onto any lists and can be + * destroyed with kfree() right away. + */ +struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, + const u64 *devid, + const u8 *uuid) +{ + struct btrfs_device *dev; + u64 tmp; + + if (!devid && !fs_info) { + WARN_ON(1); + return ERR_PTR(-EINVAL); + } + + dev = __alloc_device(); + if (IS_ERR(dev)) + return dev; + + if (devid) + tmp = *devid; + else { + int ret; + + ret = find_next_devid(fs_info, &tmp); + if (ret) { + kfree(dev); + return ERR_PTR(ret); + } + } + dev->devid = tmp; + + if (uuid) + memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); + else + generate_random_uuid(dev->uuid); + + dev->work.func = pending_bios_fn; + + return dev; +} + static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) @@ -5486,7 +5769,7 @@ static void fill_device_from_item(struct extent_buffer *leaf, WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); device->is_tgtdev_for_dev_replace = 0; - ptr = (unsigned long)btrfs_device_uuid(dev_item); + ptr = btrfs_device_uuid(dev_item); read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); } @@ -5549,11 +5832,9 @@ static int read_one_dev(struct btrfs_root *root, u8 dev_uuid[BTRFS_UUID_SIZE]; devid = btrfs_device_id(leaf, dev_item); - read_extent_buffer(leaf, dev_uuid, - (unsigned long)btrfs_device_uuid(dev_item), + read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); - read_extent_buffer(leaf, fs_uuid, - (unsigned long)btrfs_device_fsid(dev_item), + read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_UUID_SIZE); if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { @@ -5568,8 +5849,7 @@ static int read_one_dev(struct btrfs_root *root, return -EIO; if (!device) { - btrfs_warn(root->fs_info, "devid %llu missing", - (unsigned long long)devid); + btrfs_warn(root->fs_info, "devid %llu missing", devid); device = add_missing_dev(root, devid, dev_uuid); if (!device) return -ENOMEM; @@ -5593,7 +5873,6 @@ static int read_one_dev(struct btrfs_root *root, } fill_device_from_item(leaf, dev_item, device); - device->dev_root = root->fs_info->dev_root; device->in_fs_metadata = 1; if (device->writeable && !device->is_tgtdev_for_dev_replace) { device->fs_devices->total_rw_bytes += device->total_bytes; @@ -5694,14 +5973,15 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) mutex_lock(&uuid_mutex); lock_chunks(root); - /* first we search for all of the device items, and then we - * read in all of the chunk items. This way we can create chunk - * mappings that reference all of the devices that are afound + /* + * Read all device items, and then all the chunk items. All + * device items are found before any chunk item (their object id + * is smaller than the lowest possible object id for a chunk + * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). */ key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.offset = 0; key.type = 0; -again: ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto error; @@ -5717,17 +5997,13 @@ again: break; } btrfs_item_key_to_cpu(leaf, &found_key, slot); - if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { - if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) - break; - if (found_key.type == BTRFS_DEV_ITEM_KEY) { - struct btrfs_dev_item *dev_item; - dev_item = btrfs_item_ptr(leaf, slot, + if (found_key.type == BTRFS_DEV_ITEM_KEY) { + struct btrfs_dev_item *dev_item; + dev_item = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); - ret = read_one_dev(root, leaf, dev_item); - if (ret) - goto error; - } + ret = read_one_dev(root, leaf, dev_item); + if (ret) + goto error; } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { struct btrfs_chunk *chunk; chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); @@ -5737,11 +6013,6 @@ again: } path->slots[0]++; } - if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { - key.objectid = 0; - btrfs_release_path(path); - goto again; - } ret = 0; error: unlock_chunks(root); @@ -5751,6 +6022,17 @@ error: return ret; } +void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) + device->dev_root = fs_info->dev_root; + mutex_unlock(&fs_devices->device_list_mutex); +} + static void __btrfs_reset_dev_stats(struct btrfs_device *dev) { int i; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index f6247e2a47f7..b72f540c8b29 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -152,6 +152,8 @@ struct btrfs_fs_devices { int rotating; }; +#define BTRFS_BIO_INLINE_CSUM_SIZE 64 + /* * we need the mirror number and stripe index to be passed around * the call chain while we are processing end_io (especially errors). @@ -161,9 +163,14 @@ struct btrfs_fs_devices { * we allocate are actually btrfs_io_bios. We'll cram as much of * struct btrfs_bio as we can into this over time. */ +typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err); struct btrfs_io_bio { unsigned long mirror_num; unsigned long stripe_index; + u8 *csum; + u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; + u8 *csum_allocated; + btrfs_io_bio_end_io_t *end_io; struct bio bio; }; @@ -298,6 +305,9 @@ void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device); +struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, + const u64 *devid, + const u8 *uuid); int btrfs_rm_device(struct btrfs_root *root, char *device_path); void btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); @@ -315,12 +325,16 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); +int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); -int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_root *root, struct btrfs_ioctl_get_dev_stats *stats); +void btrfs_init_devices_late(struct btrfs_fs_info *fs_info); int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); @@ -336,6 +350,9 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, unsigned long btrfs_full_stripe_len(struct btrfs_root *root, struct btrfs_mapping_tree *map_tree, u64 logical); +int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + u64 chunk_offset, u64 chunk_size); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) { |