diff options
Diffstat (limited to 'fs')
197 files changed, 5785 insertions, 3019 deletions
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c index 025a9a5e1c32..55a756c60746 100644 --- a/fs/afs/addr_list.c +++ b/fs/afs/addr_list.c @@ -17,11 +17,6 @@ #include "internal.h" #include "afs_fs.h" -//#define AFS_MAX_ADDRESSES -// ((unsigned int)((PAGE_SIZE - sizeof(struct afs_addr_list)) / -// sizeof(struct sockaddr_rxrpc))) -#define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8)) - /* * Release an address list. */ @@ -43,11 +38,15 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, _enter("%u,%u,%u", nr, service, port); + if (nr > AFS_MAX_ADDRESSES) + nr = AFS_MAX_ADDRESSES; + alist = kzalloc(struct_size(alist, addrs, nr), GFP_KERNEL); if (!alist) return NULL; refcount_set(&alist->usage, 1); + alist->max_addrs = nr; for (i = 0; i < nr; i++) { struct sockaddr_rxrpc *srx = &alist->addrs[i]; @@ -109,8 +108,6 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, } while (p < end); _debug("%u/%u addresses", nr, AFS_MAX_ADDRESSES); - if (nr > AFS_MAX_ADDRESSES) - nr = AFS_MAX_ADDRESSES; alist = afs_alloc_addrlist(nr, service, port); if (!alist) @@ -119,8 +116,10 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, /* Extract the addresses */ p = text; do { - struct sockaddr_rxrpc *srx = &alist->addrs[alist->nr_addrs]; const char *q, *stop; + unsigned int xport = port; + __be32 x[4]; + int family; if (*p == delim) { p++; @@ -136,19 +135,12 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, break; } - if (in4_pton(p, q - p, - (u8 *)&srx->transport.sin6.sin6_addr.s6_addr32[3], - -1, &stop)) { - srx->transport.sin6.sin6_addr.s6_addr32[0] = 0; - srx->transport.sin6.sin6_addr.s6_addr32[1] = 0; - srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff); - } else if (in6_pton(p, q - p, - srx->transport.sin6.sin6_addr.s6_addr, - -1, &stop)) { - /* Nothing to do */ - } else { + if (in4_pton(p, q - p, (u8 *)&x[0], -1, &stop)) + family = AF_INET; + else if (in6_pton(p, q - p, (u8 *)x, -1, &stop)) + family = AF_INET6; + else goto bad_address; - } if (stop != q) goto bad_address; @@ -160,7 +152,7 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, if (p < end) { if (*p == '+') { /* Port number specification "+1234" */ - unsigned int xport = 0; + xport = 0; p++; if (p >= end || !isdigit(*p)) goto bad_address; @@ -171,7 +163,6 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, goto bad_address; p++; } while (p < end && isdigit(*p)); - srx->transport.sin6.sin6_port = htons(xport); } else if (*p == delim) { p++; } else { @@ -179,8 +170,12 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, } } - alist->nr_addrs++; - } while (p < end && alist->nr_addrs < AFS_MAX_ADDRESSES); + if (family == AF_INET) + afs_merge_fs_addr4(alist, x[0], xport); + else + afs_merge_fs_addr6(alist, x, xport); + + } while (p < end); _leave(" = [nr %u]", alist->nr_addrs); return alist; @@ -237,19 +232,23 @@ struct afs_addr_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry) */ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) { - struct sockaddr_in6 *a; - __be16 xport = htons(port); + struct sockaddr_rxrpc *srx; + u32 addr = ntohl(xdr); int i; + if (alist->nr_addrs >= alist->max_addrs) + return; + for (i = 0; i < alist->nr_ipv4; i++) { - a = &alist->addrs[i].transport.sin6; - if (xdr == a->sin6_addr.s6_addr32[3] && - xport == a->sin6_port) + struct sockaddr_in *a = &alist->addrs[i].transport.sin; + u32 a_addr = ntohl(a->sin_addr.s_addr); + u16 a_port = ntohs(a->sin_port); + + if (addr == a_addr && port == a_port) return; - if (xdr == a->sin6_addr.s6_addr32[3] && - (u16 __force)xport < (u16 __force)a->sin6_port) + if (addr == a_addr && port < a_port) break; - if ((u32 __force)xdr < (u32 __force)a->sin6_addr.s6_addr32[3]) + if (addr < a_addr) break; } @@ -258,12 +257,11 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) alist->addrs + i, sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); - a = &alist->addrs[i].transport.sin6; - a->sin6_port = xport; - a->sin6_addr.s6_addr32[0] = 0; - a->sin6_addr.s6_addr32[1] = 0; - a->sin6_addr.s6_addr32[2] = htonl(0xffff); - a->sin6_addr.s6_addr32[3] = xdr; + srx = &alist->addrs[i]; + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.sin.sin_family = AF_INET; + srx->transport.sin.sin_port = htons(port); + srx->transport.sin.sin_addr.s_addr = xdr; alist->nr_ipv4++; alist->nr_addrs++; } @@ -273,18 +271,20 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) */ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) { - struct sockaddr_in6 *a; - __be16 xport = htons(port); + struct sockaddr_rxrpc *srx; int i, diff; + if (alist->nr_addrs >= alist->max_addrs) + return; + for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { - a = &alist->addrs[i].transport.sin6; + struct sockaddr_in6 *a = &alist->addrs[i].transport.sin6; + u16 a_port = ntohs(a->sin6_port); + diff = memcmp(xdr, &a->sin6_addr, 16); - if (diff == 0 && - xport == a->sin6_port) + if (diff == 0 && port == a_port) return; - if (diff == 0 && - (u16 __force)xport < (u16 __force)a->sin6_port) + if (diff == 0 && port < a_port) break; if (diff < 0) break; @@ -295,12 +295,11 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) alist->addrs + i, sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); - a = &alist->addrs[i].transport.sin6; - a->sin6_port = xport; - a->sin6_addr.s6_addr32[0] = xdr[0]; - a->sin6_addr.s6_addr32[1] = xdr[1]; - a->sin6_addr.s6_addr32[2] = xdr[2]; - a->sin6_addr.s6_addr32[3] = xdr[3]; + srx = &alist->addrs[i]; + srx->transport_len = sizeof(srx->transport.sin6); + srx->transport.sin6.sin6_family = AF_INET6; + srx->transport.sin6.sin6_port = htons(port); + memcpy(&srx->transport.sin6.sin6_addr, xdr, 16); alist->nr_addrs++; } diff --git a/fs/afs/cell.c b/fs/afs/cell.c index f3d0bef16d78..6127f0fcd62c 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -514,6 +514,8 @@ static int afs_alloc_anon_key(struct afs_cell *cell) */ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) { + struct hlist_node **p; + struct afs_cell *pcell; int ret; if (!cell->anonymous_key) { @@ -534,7 +536,18 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) return ret; mutex_lock(&net->proc_cells_lock); - list_add_tail(&cell->proc_link, &net->proc_cells); + for (p = &net->proc_cells.first; *p; p = &(*p)->next) { + pcell = hlist_entry(*p, struct afs_cell, proc_link); + if (strcmp(cell->name, pcell->name) < 0) + break; + } + + cell->proc_link.pprev = p; + cell->proc_link.next = *p; + rcu_assign_pointer(*p, &cell->proc_link.next); + if (cell->proc_link.next) + cell->proc_link.next->pprev = &cell->proc_link.next; + afs_dynroot_mkdir(net, cell); mutex_unlock(&net->proc_cells_lock); return 0; @@ -550,7 +563,7 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell) afs_proc_cell_remove(cell); mutex_lock(&net->proc_cells_lock); - list_del_init(&cell->proc_link); + hlist_del_rcu(&cell->proc_link); afs_dynroot_rmdir(net, cell); mutex_unlock(&net->proc_cells_lock); diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 1cde710a8013..f29c6dade7f6 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -265,7 +265,7 @@ int afs_dynroot_populate(struct super_block *sb) return -ERESTARTSYS; net->dynroot_sb = sb; - list_for_each_entry(cell, &net->proc_cells, proc_link) { + hlist_for_each_entry(cell, &net->proc_cells, proc_link) { ret = afs_dynroot_mkdir(net, cell); if (ret < 0) goto error; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 871a228d7f37..72de1f157d20 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -73,12 +73,14 @@ struct afs_addr_list { struct rcu_head rcu; /* Must be first */ refcount_t usage; u32 version; /* Version */ - unsigned short nr_addrs; - unsigned short index; /* Address currently in use */ - unsigned short nr_ipv4; /* Number of IPv4 addresses */ + unsigned char max_addrs; + unsigned char nr_addrs; + unsigned char index; /* Address currently in use */ + unsigned char nr_ipv4; /* Number of IPv4 addresses */ unsigned long probed; /* Mask of servers that have been probed */ unsigned long yfs; /* Mask of servers that are YFS */ struct sockaddr_rxrpc addrs[]; +#define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8)) }; /* @@ -242,7 +244,7 @@ struct afs_net { seqlock_t cells_lock; struct mutex proc_cells_lock; - struct list_head proc_cells; + struct hlist_head proc_cells; /* Known servers. Theoretically each fileserver can only be in one * cell, but in practice, people create aliases and subsets and there's @@ -320,7 +322,7 @@ struct afs_cell { struct afs_net *net; struct key *anonymous_key; /* anonymous user key for this cell */ struct work_struct manager; /* Manager for init/deinit/dns */ - struct list_head proc_link; /* /proc cell list link */ + struct hlist_node proc_link; /* /proc cell list link */ #ifdef CONFIG_AFS_FSCACHE struct fscache_cookie *cache; /* caching cookie */ #endif diff --git a/fs/afs/main.c b/fs/afs/main.c index e84fe822a960..107427688edd 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -87,7 +87,7 @@ static int __net_init afs_net_init(struct net *net_ns) timer_setup(&net->cells_timer, afs_cells_timer, 0); mutex_init(&net->proc_cells_lock); - INIT_LIST_HEAD(&net->proc_cells); + INIT_HLIST_HEAD(&net->proc_cells); seqlock_init(&net->fs_lock); net->fs_servers = RB_ROOT; diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 476dcbb79713..9101f62707af 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -33,9 +33,8 @@ static inline struct afs_net *afs_seq2net_single(struct seq_file *m) static int afs_proc_cells_show(struct seq_file *m, void *v) { struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link); - struct afs_net *net = afs_seq2net(m); - if (v == &net->proc_cells) { + if (v == SEQ_START_TOKEN) { /* display header on line 1 */ seq_puts(m, "USE NAME\n"); return 0; @@ -50,12 +49,12 @@ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos) __acquires(rcu) { rcu_read_lock(); - return seq_list_start_head(&afs_seq2net(m)->proc_cells, *_pos); + return seq_hlist_start_head_rcu(&afs_seq2net(m)->proc_cells, *_pos); } static void *afs_proc_cells_next(struct seq_file *m, void *v, loff_t *pos) { - return seq_list_next(v, &afs_seq2net(m)->proc_cells, pos); + return seq_hlist_next_rcu(v, &afs_seq2net(m)->proc_cells, pos); } static void afs_proc_cells_stop(struct seq_file *m, void *v) diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 35f2ae30f31f..77a83790a31f 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -690,8 +690,6 @@ static void afs_process_async_call(struct work_struct *work) } if (call->state == AFS_CALL_COMPLETE) { - call->reply[0] = NULL; - /* We have two refs to release - one from the alloc and one * queued with the work item - and we can't just deallocate the * call because the work item may be queued again. diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index efae2fb0930a..54207327f98f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1580,7 +1580,7 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm) } static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, - const siginfo_t *siginfo) + const kernel_siginfo_t *siginfo) { mm_segment_t old_fs = get_fs(); set_fs(KERNEL_DS); @@ -1782,7 +1782,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, - const siginfo_t *siginfo, struct pt_regs *regs) + const kernel_siginfo_t *siginfo, struct pt_regs *regs) { struct task_struct *dump_task = current; const struct user_regset_view *view = task_user_regset_view(dump_task); @@ -2031,7 +2031,7 @@ static int elf_note_info_init(struct elf_note_info *info) static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, - const siginfo_t *siginfo, struct pt_regs *regs) + const kernel_siginfo_t *siginfo, struct pt_regs *regs) { struct list_head *t; struct core_thread *ct; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ae750b1574a2..68ebe188446a 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -112,11 +112,11 @@ static int find_extent_in_eb(const struct extent_buffer *eb, } struct preftree { - struct rb_root root; + struct rb_root_cached root; unsigned int count; }; -#define PREFTREE_INIT { .root = RB_ROOT, .count = 0 } +#define PREFTREE_INIT { .root = RB_ROOT_CACHED, .count = 0 } struct preftrees { struct preftree direct; /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */ @@ -225,14 +225,15 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, struct prelim_ref *newref, struct share_check *sc) { - struct rb_root *root; + struct rb_root_cached *root; struct rb_node **p; struct rb_node *parent = NULL; struct prelim_ref *ref; int result; + bool leftmost = true; root = &preftree->root; - p = &root->rb_node; + p = &root->rb_root.rb_node; while (*p) { parent = *p; @@ -242,6 +243,7 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, p = &(*p)->rb_left; } else if (result > 0) { p = &(*p)->rb_right; + leftmost = false; } else { /* Identical refs, merge them and free @newref */ struct extent_inode_elem *eie = ref->inode_list; @@ -272,7 +274,7 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, preftree->count++; trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count); rb_link_node(&newref->rbnode, parent, p); - rb_insert_color(&newref->rbnode, root); + rb_insert_color_cached(&newref->rbnode, root, leftmost); } /* @@ -283,11 +285,11 @@ static void prelim_release(struct preftree *preftree) { struct prelim_ref *ref, *next_ref; - rbtree_postorder_for_each_entry_safe(ref, next_ref, &preftree->root, - rbnode) + rbtree_postorder_for_each_entry_safe(ref, next_ref, + &preftree->root.rb_root, rbnode) free_pref(ref); - preftree->root = RB_ROOT; + preftree->root = RB_ROOT_CACHED; preftree->count = 0; } @@ -627,7 +629,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, * freeing the entire indirect tree when we're done. In some test * cases, the tree can grow quite large (~200k objects). */ - while ((rnode = rb_first(&preftrees->indirect.root))) { + while ((rnode = rb_first_cached(&preftrees->indirect.root))) { struct prelim_ref *ref; ref = rb_entry(rnode, struct prelim_ref, rbnode); @@ -637,7 +639,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, goto out; } - rb_erase(&ref->rbnode, &preftrees->indirect.root); + rb_erase_cached(&ref->rbnode, &preftrees->indirect.root); preftrees->indirect.count--; if (ref->count == 0) { @@ -717,9 +719,9 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, struct preftree *tree = &preftrees->indirect_missing_keys; struct rb_node *node; - while ((node = rb_first(&tree->root))) { + while ((node = rb_first_cached(&tree->root))) { ref = rb_entry(node, struct prelim_ref, rbnode); - rb_erase(node, &tree->root); + rb_erase_cached(node, &tree->root); BUG_ON(ref->parent); /* should not be a direct ref */ BUG_ON(ref->key_for_search.type); @@ -769,7 +771,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key); spin_lock(&head->lock); - for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) { + for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) { node = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); if (node->seq > seq) @@ -1229,14 +1231,14 @@ again: if (ret) goto out; - WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root)); + WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root)); ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, extent_item_pos, total_refs, sc, ignore_offset); if (ret) goto out; - WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root)); + WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root.rb_root)); /* * This walks the tree of merged and resolved refs. Tree blocks are @@ -1245,7 +1247,7 @@ again: * * We release the entire tree in one go before returning. */ - node = rb_first(&preftrees.direct.root); + node = rb_first_cached(&preftrees.direct.root); while (node) { ref = rb_entry(node, struct prelim_ref, rbnode); node = rb_next(&ref->rbnode); @@ -1468,7 +1470,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) struct seq_list elem = SEQ_LIST_INIT(elem); int ret = 0; struct share_check shared = { - .root_objectid = root->objectid, + .root_objectid = root->root_key.objectid, .inum = inum, .share_count = 0, }; @@ -2031,7 +2033,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, /* path must be released before calling iterate()! */ btrfs_debug(fs_root->fs_info, "following ref at offset %u for inode %llu in tree %llu", - cur, found_key.objectid, fs_root->objectid); + cur, found_key.objectid, + fs_root->root_key.objectid); ret = iterate(parent, name_len, (unsigned long)(iref + 1), eb, ctx); if (ret) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 1343ac57b438..97d91e55b70a 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -206,7 +206,7 @@ static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) static inline unsigned long btrfs_inode_hash(u64 objectid, const struct btrfs_root *root) { - u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME); + u64 h = objectid ^ (root->root_key.objectid * GOLDEN_RATIO_PRIME); #if BITS_PER_LONG == 32 h = (h >> 32) ^ (h & 0xffffffff); @@ -339,15 +339,15 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, struct btrfs_root *root = inode->root; /* Output minus objectid, which is more meaningful */ - if (root->objectid >= BTRFS_LAST_FREE_OBJECTID) + if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) btrfs_warn_rl(root->fs_info, "csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d", - root->objectid, btrfs_ino(inode), + root->root_key.objectid, btrfs_ino(inode), logical_start, csum, csum_expected, mirror_num); else btrfs_warn_rl(root->fs_info, "csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d", - root->objectid, btrfs_ino(inode), + root->root_key.objectid, btrfs_ino(inode), logical_start, csum, csum_expected, mirror_num); } diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 833cf3c35b4d..2e43fba44035 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1594,6 +1594,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, { unsigned int num_pages; unsigned int i; + size_t size; u64 dev_bytenr; int ret; @@ -1608,9 +1609,8 @@ static int btrfsic_read_block(struct btrfsic_state *state, num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> PAGE_SHIFT; - block_ctx->mem_to_free = kcalloc(sizeof(*block_ctx->datav) + - sizeof(*block_ctx->pagev), - num_pages, GFP_NOFS); + size = sizeof(*block_ctx->datav) + sizeof(*block_ctx->pagev); + block_ctx->mem_to_free = kcalloc(num_pages, size, GFP_NOFS); if (!block_ctx->mem_to_free) return -ENOMEM; block_ctx->datav = block_ctx->mem_to_free; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 9bfa66592aa7..8703ce68fe9d 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -528,7 +528,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *tree; struct extent_map_tree *em_tree; struct compressed_bio *cb; unsigned long compressed_len; @@ -545,7 +544,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int faili = 0; u32 *sums; - tree = &BTRFS_I(inode)->io_tree; em_tree = &BTRFS_I(inode)->extent_tree; /* we need the actual starting offset of this extent in the file */ diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d436fb4c002e..2ee43b6a4f09 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -52,42 +52,6 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p) } } -/* - * reset all the locked nodes in the patch to spinning locks. - * - * held is used to keep lockdep happy, when lockdep is enabled - * we set held to a blocking lock before we go around and - * retake all the spinlocks in the path. You can safely use NULL - * for held - */ -noinline void btrfs_clear_path_blocking(struct btrfs_path *p, - struct extent_buffer *held, int held_rw) -{ - int i; - - if (held) { - btrfs_set_lock_blocking_rw(held, held_rw); - if (held_rw == BTRFS_WRITE_LOCK) - held_rw = BTRFS_WRITE_LOCK_BLOCKING; - else if (held_rw == BTRFS_READ_LOCK) - held_rw = BTRFS_READ_LOCK_BLOCKING; - } - btrfs_set_path_blocking(p); - - for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { - if (p->nodes[i] && p->locks[i]) { - btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]); - if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING) - p->locks[i] = BTRFS_WRITE_LOCK; - else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING) - p->locks[i] = BTRFS_READ_LOCK; - } - } - - if (held) - btrfs_clear_lock_blocking_rw(held, held_rw); -} - /* this also releases the path */ void btrfs_free_path(struct btrfs_path *p) { @@ -207,7 +171,7 @@ static void add_root_to_dirty_list(struct btrfs_root *root) spin_lock(&fs_info->trans_lock); if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { /* Want the extent tree to be the last on the list */ - if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID) + if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID) list_move_tail(&root->dirty_list, &fs_info->dirty_cowonly_roots); else @@ -1306,7 +1270,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, } } - btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK); btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); @@ -1815,8 +1778,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, int orig_slot = path->slots[level]; u64 orig_ptr; - if (level == 0) - return 0; + ASSERT(level > 0); mid = path->nodes[level]; @@ -2483,7 +2445,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, btrfs_set_path_blocking(p); reada_for_balance(fs_info, p, level); sret = split_node(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL, 0); BUG_ON(sret > 0); if (sret) { @@ -2504,7 +2465,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, btrfs_set_path_blocking(p); reada_for_balance(fs_info, p, level); sret = balance_level(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL, 0); if (sret) { ret = sret; @@ -2789,7 +2749,10 @@ again: } cow_done: p->nodes[level] = b; - btrfs_clear_path_blocking(p, NULL, 0); + /* + * Leave path with blocking locks to avoid massive + * lock context switch, this is made on purpose. + */ /* * we have a lock on b and as long as we aren't changing @@ -2871,8 +2834,6 @@ cow_done: if (!err) { btrfs_set_path_blocking(p); btrfs_tree_lock(b); - btrfs_clear_path_blocking(p, b, - BTRFS_WRITE_LOCK); } p->locks[level] = BTRFS_WRITE_LOCK; } else { @@ -2880,8 +2841,6 @@ cow_done: if (!err) { btrfs_set_path_blocking(p); btrfs_tree_read_lock(b); - btrfs_clear_path_blocking(p, b, - BTRFS_READ_LOCK); } p->locks[level] = BTRFS_READ_LOCK; } @@ -2900,7 +2859,6 @@ cow_done: btrfs_set_path_blocking(p); err = split_leaf(trans, root, key, p, ins_len, ret == 0); - btrfs_clear_path_blocking(p, NULL, 0); BUG_ON(err > 0); if (err) { @@ -2910,7 +2868,7 @@ cow_done: } if (!p->search_for_split) unlock_up(p, level, lowest_unlock, - min_write_lock_level, &write_lock_level); + min_write_lock_level, NULL); goto done; } } @@ -2961,13 +2919,16 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, again: b = get_old_root(root, time_seq); + if (!b) { + ret = -EIO; + goto done; + } level = btrfs_header_level(b); p->locks[level] = BTRFS_READ_LOCK; while (b) { level = btrfs_header_level(b); p->nodes[level] = b; - btrfs_clear_path_blocking(p, NULL, 0); /* * we have a lock on b and as long as we aren't changing @@ -3013,8 +2974,6 @@ again: if (!err) { btrfs_set_path_blocking(p); btrfs_tree_read_lock(b); - btrfs_clear_path_blocking(p, b, - BTRFS_READ_LOCK); } b = tree_mod_log_rewind(fs_info, p, b, time_seq); if (!b) { @@ -5198,7 +5157,6 @@ find_next_key: path->locks[level - 1] = BTRFS_READ_LOCK; path->nodes[level - 1] = cur; unlock_up(path, level, 1, 0, NULL); - btrfs_clear_path_blocking(path, NULL, 0); } out: path->keep_locks = keep_locks; @@ -5783,8 +5741,6 @@ again: if (!ret) { btrfs_set_path_blocking(path); btrfs_tree_read_lock(next); - btrfs_clear_path_blocking(path, next, - BTRFS_READ_LOCK); } next_rw_lock = BTRFS_READ_LOCK; } @@ -5820,8 +5776,6 @@ again: if (!ret) { btrfs_set_path_blocking(path); btrfs_tree_read_lock(next); - btrfs_clear_path_blocking(path, next, - BTRFS_READ_LOCK); } next_rw_lock = BTRFS_READ_LOCK; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2cddfe7806a4..68ca41dbbef3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -41,12 +41,6 @@ extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; struct btrfs_ordered_sum; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -#define STATIC noinline -#else -#define STATIC static noinline -#endif - #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ #define BTRFS_MAX_MIRRORS 3 @@ -367,11 +361,13 @@ struct btrfs_dev_replace { struct mutex lock_finishing_cancel_unmount; rwlock_t lock; - atomic_t read_locks; atomic_t blocking_readers; wait_queue_head_t read_lock_wq; struct btrfs_scrub_progress scrub_progress; + + struct percpu_counter bio_counter; + wait_queue_head_t replace_wait; }; /* For raid type sysfs entries */ @@ -1094,9 +1090,6 @@ struct btrfs_fs_info { /* device replace state */ struct btrfs_dev_replace dev_replace; - struct percpu_counter bio_counter; - wait_queue_head_t replace_wait; - struct semaphore uuid_tree_rescan_sem; /* Used to reclaim the metadata space in the background. */ @@ -1202,18 +1195,12 @@ struct btrfs_root { int last_log_commit; pid_t log_start_pid; - u64 objectid; u64 last_trans; u32 type; u64 highest_objectid; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */ - u64 alloc_bytenr; -#endif - u64 defrag_trans_start; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; @@ -1286,6 +1273,10 @@ struct btrfs_root { spinlock_t qgroup_meta_rsv_lock; u64 qgroup_meta_rsv_pertrans; u64 qgroup_meta_rsv_prealloc; + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + u64 alloc_bytenr; +#endif }; struct btrfs_file_private { @@ -2607,10 +2598,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); @@ -2771,7 +2760,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes, - int update_size); + bool update_size); int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *dest, u64 num_bytes, int min_factor); @@ -2877,8 +2866,6 @@ void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); void btrfs_set_path_blocking(struct btrfs_path *p); -void btrfs_clear_path_blocking(struct btrfs_path *p, - struct extent_buffer *held, int held_rw); void btrfs_unlock_up_safe(struct btrfs_path *p, int level); int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3021,8 +3008,7 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, /* dir-item.c */ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, const char *name, int name_len); -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *name, +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index); struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, @@ -3180,8 +3166,8 @@ void __cold btrfs_destroy_cachep(void); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, struct btrfs_root *root, int *was_new); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 end, int create); + struct page *page, size_t pg_offset, + u64 start, u64 end, int create); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); @@ -3201,9 +3187,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); extern const struct dentry_operations btrfs_dentry_operations; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_inode_set_ops(struct inode *inode); -#endif /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -3716,18 +3699,19 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_inode_set_ops(struct inode *inode); void btrfs_test_destroy_inode(struct inode *inode); -#endif static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, - &fs_info->fs_state))) - return 1; -#endif + return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); +} +#else +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) +{ return 0; } +#endif static inline void cond_wake_up(struct wait_queue_head *wq) { diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index f51b509f2d9b..c669f250d4a0 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -42,8 +42,8 @@ static inline void btrfs_init_delayed_node( delayed_node->root = root; delayed_node->inode_id = inode_id; refcount_set(&delayed_node->refs, 0); - delayed_node->ins_root = RB_ROOT; - delayed_node->del_root = RB_ROOT; + delayed_node->ins_root = RB_ROOT_CACHED; + delayed_node->del_root = RB_ROOT_CACHED; mutex_init(&delayed_node->mutex); INIT_LIST_HEAD(&delayed_node->n_list); INIT_LIST_HEAD(&delayed_node->p_list); @@ -390,7 +390,7 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( struct btrfs_delayed_node *delayed_node, struct btrfs_key *key) { - return __btrfs_lookup_delayed_item(&delayed_node->ins_root, key, + return __btrfs_lookup_delayed_item(&delayed_node->ins_root.rb_root, key, NULL, NULL); } @@ -400,9 +400,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, { struct rb_node **p, *node; struct rb_node *parent_node = NULL; - struct rb_root *root; + struct rb_root_cached *root; struct btrfs_delayed_item *item; int cmp; + bool leftmost = true; if (action == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; @@ -410,7 +411,7 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, root = &delayed_node->del_root; else BUG(); - p = &root->rb_node; + p = &root->rb_root.rb_node; node = &ins->rb_node; while (*p) { @@ -419,16 +420,18 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, rb_node); cmp = btrfs_comp_cpu_keys(&item->key, &ins->key); - if (cmp < 0) + if (cmp < 0) { p = &(*p)->rb_right; - else if (cmp > 0) + leftmost = false; + } else if (cmp > 0) { p = &(*p)->rb_left; - else + } else { return -EEXIST; + } } rb_link_node(node, parent_node, p); - rb_insert_color(node, root); + rb_insert_color_cached(node, root, leftmost); ins->delayed_node = delayed_node; ins->ins_or_del = action; @@ -468,7 +471,7 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root) static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) { - struct rb_root *root; + struct rb_root_cached *root; struct btrfs_delayed_root *delayed_root; delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root; @@ -482,7 +485,7 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) else root = &delayed_item->delayed_node->del_root; - rb_erase(&delayed_item->rb_node, root); + rb_erase_cached(&delayed_item->rb_node, root); delayed_item->delayed_node->count--; finish_one_item(delayed_root); @@ -503,7 +506,7 @@ static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( struct rb_node *p; struct btrfs_delayed_item *item = NULL; - p = rb_first(&delayed_node->ins_root); + p = rb_first_cached(&delayed_node->ins_root); if (p) item = rb_entry(p, struct btrfs_delayed_item, rb_node); @@ -516,7 +519,7 @@ static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( struct rb_node *p; struct btrfs_delayed_item *item = NULL; - p = rb_first(&delayed_node->del_root); + p = rb_first_cached(&delayed_node->del_root); if (p) item = rb_entry(p, struct btrfs_delayed_item, rb_node); @@ -559,7 +562,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, * reserved space when starting a transaction. So no need to reserve * qgroup space here. */ - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); if (!ret) { trace_btrfs_space_reservation(fs_info, "delayed_item", item->key.objectid, @@ -647,7 +650,7 @@ static int btrfs_delayed_inode_reserve_metadata( return ret; } - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); if (!ret) { trace_btrfs_space_reservation(fs_info, "delayed_inode", btrfs_ino(inode), num_bytes, 1); @@ -762,9 +765,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, i++; } - /* reset all the locked nodes in the patch to spinning locks. */ - btrfs_clear_path_blocking(path, NULL, 0); - /* insert the keys of the items */ setup_items_for_insert(root, path, keys, data_size, total_data_size, total_size, nitems); @@ -1462,7 +1462,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, if (unlikely(ret)) { btrfs_err(trans->fs_info, "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", - name_len, name, delayed_node->root->objectid, + name_len, name, delayed_node->root->root_key.objectid, delayed_node->inode_id, ret); BUG(); } @@ -1533,7 +1533,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, if (unlikely(ret)) { btrfs_err(trans->fs_info, "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", - index, node->root->objectid, node->inode_id, ret); + index, node->root->root_key.objectid, + node->inode_id, ret); BUG(); } mutex_unlock(&node->mutex); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 33536cd681d4..74ae226ffaf0 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -50,8 +50,8 @@ struct btrfs_delayed_node { * is waiting to be dealt with by the async worker. */ struct list_head p_list; - struct rb_root ins_root; - struct rb_root del_root; + struct rb_root_cached ins_root; + struct rb_root_cached del_root; struct mutex mutex; struct btrfs_inode_item inode_item; refcount_t refs; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 62ff545ba1f7..5149165b49a4 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -101,14 +101,15 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1, } /* insert a new ref to head ref rbtree */ -static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, +static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root, struct rb_node *node) { - struct rb_node **p = &root->rb_node; + struct rb_node **p = &root->rb_root.rb_node; struct rb_node *parent_node = NULL; struct btrfs_delayed_ref_head *entry; struct btrfs_delayed_ref_head *ins; u64 bytenr; + bool leftmost = true; ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); bytenr = ins->bytenr; @@ -117,26 +118,29 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, href_node); - if (bytenr < entry->bytenr) + if (bytenr < entry->bytenr) { p = &(*p)->rb_left; - else if (bytenr > entry->bytenr) + } else if (bytenr > entry->bytenr) { p = &(*p)->rb_right; - else + leftmost = false; + } else { return entry; + } } rb_link_node(node, parent_node, p); - rb_insert_color(node, root); + rb_insert_color_cached(node, root, leftmost); return NULL; } -static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root, +static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, struct btrfs_delayed_ref_node *ins) { - struct rb_node **p = &root->rb_node; + struct rb_node **p = &root->rb_root.rb_node; struct rb_node *node = &ins->ref_node; struct rb_node *parent_node = NULL; struct btrfs_delayed_ref_node *entry; + bool leftmost = true; while (*p) { int comp; @@ -145,16 +149,18 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root, entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, ref_node); comp = comp_refs(ins, entry, true); - if (comp < 0) + if (comp < 0) { p = &(*p)->rb_left; - else if (comp > 0) + } else if (comp > 0) { p = &(*p)->rb_right; - else + leftmost = false; + } else { return entry; + } } rb_link_node(node, parent_node, p); - rb_insert_color(node, root); + rb_insert_color_cached(node, root, leftmost); return NULL; } @@ -162,12 +168,14 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root, * find an head entry based on bytenr. This returns the delayed ref * head if it was able to find one, or NULL if nothing was in that spot. * If return_bigger is given, the next bigger entry is returned if no exact - * match is found. + * match is found. But if no bigger one is found then the first node of the + * ref head tree will be returned. */ -static struct btrfs_delayed_ref_head * -find_ref_head(struct rb_root *root, u64 bytenr, - int return_bigger) +static struct btrfs_delayed_ref_head* find_ref_head( + struct btrfs_delayed_ref_root *dr, u64 bytenr, + bool return_bigger) { + struct rb_root *root = &dr->href_root.rb_root; struct rb_node *n; struct btrfs_delayed_ref_head *entry; @@ -187,7 +195,7 @@ find_ref_head(struct rb_root *root, u64 bytenr, if (bytenr > entry->bytenr) { n = rb_next(&entry->href_node); if (!n) - n = rb_first(root); + n = rb_first_cached(&dr->href_root); entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); return entry; @@ -197,12 +205,9 @@ find_ref_head(struct rb_root *root, u64 bytenr, return NULL; } -int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, +int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { - struct btrfs_delayed_ref_root *delayed_refs; - - delayed_refs = &trans->transaction->delayed_refs; lockdep_assert_held(&delayed_refs->lock); if (mutex_trylock(&head->mutex)) return 0; @@ -227,7 +232,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref) { lockdep_assert_held(&head->lock); - rb_erase(&ref->ref_node, &head->ref_tree); + rb_erase_cached(&ref->ref_node, &head->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); @@ -296,7 +301,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, lockdep_assert_held(&head->lock); - if (RB_EMPTY_ROOT(&head->ref_tree)) + if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) return; /* We don't have too many refs to merge for data. */ @@ -314,7 +319,8 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, spin_unlock(&fs_info->tree_mod_seq_lock); again: - for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) { + for (node = rb_first_cached(&head->ref_tree); node; + node = rb_next(node)) { ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) continue; @@ -345,24 +351,21 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) return ret; } -struct btrfs_delayed_ref_head * -btrfs_select_ref_head(struct btrfs_trans_handle *trans) +struct btrfs_delayed_ref_head *btrfs_select_ref_head( + struct btrfs_delayed_ref_root *delayed_refs) { - struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *head; u64 start; bool loop = false; - delayed_refs = &trans->transaction->delayed_refs; - again: start = delayed_refs->run_delayed_start; - head = find_ref_head(&delayed_refs->href_root, start, 1); + head = find_ref_head(delayed_refs, start, true); if (!head && !loop) { delayed_refs->run_delayed_start = 0; start = 0; loop = true; - head = find_ref_head(&delayed_refs->href_root, start, 1); + head = find_ref_head(delayed_refs, start, true); if (!head) return NULL; } else if (!head && loop) { @@ -569,7 +572,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; head_ref->is_system = is_system; - head_ref->ref_tree = RB_ROOT; + head_ref->ref_tree = RB_ROOT_CACHED; INIT_LIST_HEAD(&head_ref->ref_add_list); RB_CLEAR_NODE(&head_ref->href_node); head_ref->processing = 0; @@ -903,7 +906,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) { - return find_ref_head(&delayed_refs->href_root, bytenr, 0); + return find_ref_head(delayed_refs, bytenr, false); } void __cold btrfs_delayed_ref_exit(void) diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index d9f2a4ebd5db..8e20c5cb5404 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -79,7 +79,7 @@ struct btrfs_delayed_ref_head { struct mutex mutex; spinlock_t lock; - struct rb_root ref_tree; + struct rb_root_cached ref_tree; /* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */ struct list_head ref_add_list; @@ -148,7 +148,7 @@ struct btrfs_delayed_data_ref { struct btrfs_delayed_ref_root { /* head ref rbtree */ - struct rb_root href_root; + struct rb_root_cached href_root; /* dirty extent records */ struct rb_root dirty_extent_root; @@ -255,7 +255,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr); -int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, +int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) { @@ -263,8 +263,8 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) } -struct btrfs_delayed_ref_head * -btrfs_select_ref_head(struct btrfs_trans_handle *trans); +struct btrfs_delayed_ref_head *btrfs_select_ref_head( + struct btrfs_delayed_ref_root *delayed_refs); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index dec01970d8c5..2aa48aecc52b 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -382,14 +382,6 @@ out: return ret; } -void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) -{ - struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - - dev_replace->committed_cursor_left = - dev_replace->cursor_left_last_write_of_item; -} - static char* btrfs_dev_name(struct btrfs_device *device) { if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) @@ -408,11 +400,12 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, int ret; struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; + bool need_unlock; - ret = btrfs_find_device_by_devspec(fs_info, srcdevid, - srcdev_name, &src_device); - if (ret) - return ret; + src_device = btrfs_find_device_by_devspec(fs_info, srcdevid, + srcdev_name); + if (IS_ERR(src_device)) + return PTR_ERR(src_device); ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, src_device, &tgt_device); @@ -432,6 +425,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return PTR_ERR(trans); } + need_unlock = true; btrfs_dev_replace_write_lock(dev_replace); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: @@ -440,6 +434,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + ASSERT(0); ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; goto leave; } @@ -470,6 +465,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, atomic64_set(&dev_replace->num_write_errors, 0); atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); btrfs_dev_replace_write_unlock(dev_replace); + need_unlock = false; ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); if (ret) @@ -481,7 +477,12 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + need_unlock = true; btrfs_dev_replace_write_lock(dev_replace); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; goto leave; } @@ -503,9 +504,8 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return ret; leave: - dev_replace->srcdev = NULL; - dev_replace->tgtdev = NULL; - btrfs_dev_replace_write_unlock(dev_replace); + if (need_unlock) + btrfs_dev_replace_write_unlock(dev_replace); btrfs_destroy_dev_replace_tgtdev(tgt_device); return ret; } @@ -545,8 +545,8 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) { set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); - wait_event(fs_info->replace_wait, !percpu_counter_sum( - &fs_info->bio_counter)); + wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum( + &fs_info->dev_replace.bio_counter)); } /* @@ -555,7 +555,7 @@ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) { clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); - wake_up(&fs_info->replace_wait); + wake_up(&fs_info->dev_replace.replace_wait); } static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, @@ -961,13 +961,10 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace) { read_lock(&dev_replace->lock); - atomic_inc(&dev_replace->read_locks); } void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace) { - ASSERT(atomic_read(&dev_replace->read_locks) > 0); - atomic_dec(&dev_replace->read_locks); read_unlock(&dev_replace->lock); } @@ -985,7 +982,6 @@ again: void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace) { - ASSERT(atomic_read(&dev_replace->blocking_readers) == 0); write_unlock(&dev_replace->lock); } @@ -994,45 +990,31 @@ void btrfs_dev_replace_set_lock_blocking( struct btrfs_dev_replace *dev_replace) { /* only set blocking for read lock */ - ASSERT(atomic_read(&dev_replace->read_locks) > 0); atomic_inc(&dev_replace->blocking_readers); read_unlock(&dev_replace->lock); } -/* acquire read lock and dec blocking cnt */ -void btrfs_dev_replace_clear_lock_blocking( - struct btrfs_dev_replace *dev_replace) -{ - /* only set blocking for read lock */ - ASSERT(atomic_read(&dev_replace->read_locks) > 0); - ASSERT(atomic_read(&dev_replace->blocking_readers) > 0); - read_lock(&dev_replace->lock); - /* Barrier implied by atomic_dec_and_test */ - if (atomic_dec_and_test(&dev_replace->blocking_readers)) - cond_wake_up_nomb(&dev_replace->read_lock_wq); -} - void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) { - percpu_counter_inc(&fs_info->bio_counter); + percpu_counter_inc(&fs_info->dev_replace.bio_counter); } void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) { - percpu_counter_sub(&fs_info->bio_counter, amount); - cond_wake_up_nomb(&fs_info->replace_wait); + percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount); + cond_wake_up_nomb(&fs_info->dev_replace.replace_wait); } void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) { while (1) { - percpu_counter_inc(&fs_info->bio_counter); + percpu_counter_inc(&fs_info->dev_replace.bio_counter); if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state))) break; btrfs_bio_counter_dec(fs_info); - wait_event(fs_info->replace_wait, + wait_event(fs_info->dev_replace.replace_wait, !test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)); } diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index b6d4206188bb..795c551f5b5e 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -11,7 +11,6 @@ struct btrfs_ioctl_dev_replace_args; int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info); int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, @@ -28,12 +27,5 @@ void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace); void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace); void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace); void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_clear_lock_blocking( - struct btrfs_dev_replace *dev_replace); - -static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) -{ - atomic64_inc(stat_value); -} #endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index a678b07fcf01..8de74d835dba 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -105,13 +105,13 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, * to use for the second index (if one is created). * Will return 0 or -ENOMEM */ -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root - *root, const char *name, int name_len, - struct btrfs_inode *dir, struct btrfs_key *location, - u8 type, u64 index) +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, + int name_len, struct btrfs_inode *dir, + struct btrfs_key *location, u8 type, u64 index) { int ret = 0; int ret2 = 0; + struct btrfs_root *root = dir->root; struct btrfs_path *path; struct btrfs_dir_item *dir_item; struct extent_buffer *leaf; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 05dc3c17cb62..b0ab41da91d1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -125,8 +125,8 @@ struct async_submit_bio { * Different roots are used for different purposes and may nest inside each * other and they require separate keysets. As lockdep keys should be * static, assign keysets according to the purpose of the root as indicated - * by btrfs_root->objectid. This ensures that all special purpose roots - * have separate keysets. + * by btrfs_root->root_key.objectid. This ensures that all special purpose + * roots have separate keysets. * * Lock-nesting across peer nodes is always done with the immediate parent * node locked thus preventing deadlock. As lockdep doesn't know this, use @@ -1148,7 +1148,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->state = 0; root->orphan_cleanup_state = 0; - root->objectid = objectid; root->last_trans = 0; root->highest_objectid = 0; root->nr_delalloc_inodes = 0; @@ -2156,9 +2155,8 @@ static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) { mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); rwlock_init(&fs_info->dev_replace.lock); - atomic_set(&fs_info->dev_replace.read_locks, 0); atomic_set(&fs_info->dev_replace.blocking_readers, 0); - init_waitqueue_head(&fs_info->replace_wait); + init_waitqueue_head(&fs_info->dev_replace.replace_wait); init_waitqueue_head(&fs_info->dev_replace.read_lock_wq); } @@ -2648,7 +2646,8 @@ int open_ctree(struct super_block *sb, goto fail_dirty_metadata_bytes; } - ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL); + ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0, + GFP_KERNEL); if (ret) { err = ret; goto fail_delalloc_bytes; @@ -3309,7 +3308,7 @@ fail_iput: iput(fs_info->btree_inode); fail_bio_counter: - percpu_counter_destroy(&fs_info->bio_counter); + percpu_counter_destroy(&fs_info->dev_replace.bio_counter); fail_delalloc_bytes: percpu_counter_destroy(&fs_info->delalloc_bytes); fail_dirty_metadata_bytes: @@ -3977,6 +3976,7 @@ void close_ctree(struct btrfs_fs_info *fs_info) kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); + ASSERT(list_empty(&fs_info->delayed_iputs)); set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags); btrfs_free_qgroup_config(fs_info); @@ -4018,7 +4018,7 @@ void close_ctree(struct btrfs_fs_info *fs_info) percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); - percpu_counter_destroy(&fs_info->bio_counter); + percpu_counter_destroy(&fs_info->dev_replace.bio_counter); cleanup_srcu_struct(&fs_info->subvol_srcu); btrfs_free_stripe_hash_table(fs_info); @@ -4204,7 +4204,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, return ret; } - while ((node = rb_first(&delayed_refs->href_root)) != NULL) { + while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { struct btrfs_delayed_ref_head *head; struct rb_node *n; bool pin_bytes = false; @@ -4222,11 +4222,11 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, continue; } spin_lock(&head->lock); - while ((n = rb_first(&head->ref_tree)) != NULL) { + while ((n = rb_first_cached(&head->ref_tree)) != NULL) { ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); ref->in_tree = 0; - rb_erase(&ref->ref_node, &head->ref_tree); + rb_erase_cached(&ref->ref_node, &head->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); @@ -4240,7 +4240,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, if (head->processing == 0) delayed_refs->num_heads_ready--; atomic_dec(&delayed_refs->num_entries); - rb_erase(&head->href_node, &delayed_refs->href_root); + rb_erase_cached(&head->href_node, &delayed_refs->href_root); RB_CLEAR_NODE(&head->href_node); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 1f3755b3a37a..ddf28ecf17f9 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -33,7 +33,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, type = FILEID_BTRFS_WITHOUT_PARENT; fid->objectid = btrfs_ino(BTRFS_I(inode)); - fid->root_objectid = BTRFS_I(inode)->root->objectid; + fid->root_objectid = BTRFS_I(inode)->root->root_key.objectid; fid->gen = inode->i_generation; if (parent) { @@ -41,7 +41,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, fid->parent_objectid = BTRFS_I(parent)->location.objectid; fid->parent_gen = parent->i_generation; - parent_root_id = BTRFS_I(parent)->root->objectid; + parent_root_id = BTRFS_I(parent)->root->root_key.objectid; if (parent_root_id != fid->root_objectid) { fid->parent_root_objectid = parent_root_id; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2d9074295d7f..a4cd0221bc8d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2374,7 +2374,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_ref_node *ref; - if (RB_EMPTY_ROOT(&head->ref_tree)) + if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) return NULL; /* @@ -2387,7 +2387,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) return list_first_entry(&head->ref_add_list, struct btrfs_delayed_ref_node, add_list); - ref = rb_entry(rb_first(&head->ref_tree), + ref = rb_entry(rb_first_cached(&head->ref_tree), struct btrfs_delayed_ref_node, ref_node); ASSERT(list_empty(&ref->add_list)); return ref; @@ -2448,13 +2448,13 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&head->lock); spin_lock(&delayed_refs->lock); spin_lock(&head->lock); - if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) { + if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) { spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); return 1; } delayed_refs->num_heads--; - rb_erase(&head->href_node, &delayed_refs->href_root); + rb_erase_cached(&head->href_node, &delayed_refs->href_root); RB_CLEAR_NODE(&head->href_node); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); @@ -2502,102 +2502,66 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, return 0; } -/* - * Returns 0 on success or if called with an already aborted transaction. - * Returns -ENOMEM or -EIO on failure and will abort the transaction. - */ -static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - unsigned long nr) +static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( + struct btrfs_trans_handle *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_delayed_ref_head *head = NULL; + int ret; + + spin_lock(&delayed_refs->lock); + head = btrfs_select_ref_head(delayed_refs); + if (!head) { + spin_unlock(&delayed_refs->lock); + return head; + } + + /* + * Grab the lock that says we are going to process all the refs for + * this head + */ + ret = btrfs_delayed_ref_lock(delayed_refs, head); + spin_unlock(&delayed_refs->lock); + + /* + * We may have dropped the spin lock to get the head mutex lock, and + * that might have given someone else time to free the head. If that's + * true, it has been removed from our list and we can move on. + */ + if (ret == -EAGAIN) + head = ERR_PTR(-EAGAIN); + + return head; +} + +static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *locked_ref, + unsigned long *run_refs) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_head *locked_ref = NULL; struct btrfs_delayed_extent_op *extent_op; - ktime_t start = ktime_get(); - int ret; - unsigned long count = 0; - unsigned long actual_count = 0; + struct btrfs_delayed_ref_node *ref; int must_insert_reserved = 0; + int ret; delayed_refs = &trans->transaction->delayed_refs; - while (1) { - if (!locked_ref) { - if (count >= nr) - break; - spin_lock(&delayed_refs->lock); - locked_ref = btrfs_select_ref_head(trans); - if (!locked_ref) { - spin_unlock(&delayed_refs->lock); - break; - } + lockdep_assert_held(&locked_ref->mutex); + lockdep_assert_held(&locked_ref->lock); - /* grab the lock that says we are going to process - * all the refs for this head */ - ret = btrfs_delayed_ref_lock(trans, locked_ref); - spin_unlock(&delayed_refs->lock); - /* - * we may have dropped the spin lock to get the head - * mutex lock, and that might have given someone else - * time to free the head. If that's true, it has been - * removed from our list and we can move on. - */ - if (ret == -EAGAIN) { - locked_ref = NULL; - count++; - continue; - } - } - - /* - * We need to try and merge add/drops of the same ref since we - * can run into issues with relocate dropping the implicit ref - * and then it being added back again before the drop can - * finish. If we merged anything we need to re-loop so we can - * get a good ref. - * Or we can get node references of the same type that weren't - * merged when created due to bumps in the tree mod seq, and - * we need to merge them to prevent adding an inline extent - * backref before dropping it (triggering a BUG_ON at - * insert_inline_extent_backref()). - */ - spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); - - ref = select_delayed_ref(locked_ref); - - if (ref && ref->seq && + while ((ref = select_delayed_ref(locked_ref))) { + if (ref->seq && btrfs_check_delayed_seq(fs_info, ref->seq)) { spin_unlock(&locked_ref->lock); unselect_delayed_ref_head(delayed_refs, locked_ref); - locked_ref = NULL; - cond_resched(); - count++; - continue; - } - - /* - * We're done processing refs in this ref_head, clean everything - * up and move on to the next ref_head. - */ - if (!ref) { - ret = cleanup_ref_head(trans, locked_ref); - if (ret > 0 ) { - /* We dropped our lock, we need to loop. */ - ret = 0; - continue; - } else if (ret) { - return ret; - } - locked_ref = NULL; - count++; - continue; + return -EAGAIN; } - actual_count++; + (*run_refs)++; ref->in_tree = 0; - rb_erase(&ref->ref_node, &locked_ref->ref_tree); + rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); @@ -2619,8 +2583,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, atomic_dec(&delayed_refs->num_entries); /* - * Record the must-insert_reserved flag before we drop the spin - * lock. + * Record the must_insert_reserved flag before we drop the + * spin lock. */ must_insert_reserved = locked_ref->must_insert_reserved; locked_ref->must_insert_reserved = 0; @@ -2642,10 +2606,90 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } btrfs_put_delayed_ref(ref); - count++; cond_resched(); + + spin_lock(&locked_ref->lock); + btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); } + return 0; +} + +/* + * Returns 0 on success or if called with an already aborted transaction. + * Returns -ENOMEM or -EIO on failure and will abort the transaction. + */ +static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + unsigned long nr) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *locked_ref = NULL; + ktime_t start = ktime_get(); + int ret; + unsigned long count = 0; + unsigned long actual_count = 0; + + delayed_refs = &trans->transaction->delayed_refs; + do { + if (!locked_ref) { + locked_ref = btrfs_obtain_ref_head(trans); + if (IS_ERR_OR_NULL(locked_ref)) { + if (PTR_ERR(locked_ref) == -EAGAIN) { + continue; + } else { + break; + } + } + count++; + } + /* + * We need to try and merge add/drops of the same ref since we + * can run into issues with relocate dropping the implicit ref + * and then it being added back again before the drop can + * finish. If we merged anything we need to re-loop so we can + * get a good ref. + * Or we can get node references of the same type that weren't + * merged when created due to bumps in the tree mod seq, and + * we need to merge them to prevent adding an inline extent + * backref before dropping it (triggering a BUG_ON at + * insert_inline_extent_backref()). + */ + spin_lock(&locked_ref->lock); + btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); + + ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, + &actual_count); + if (ret < 0 && ret != -EAGAIN) { + /* + * Error, btrfs_run_delayed_refs_for_head already + * unlocked everything so just bail out + */ + return ret; + } else if (!ret) { + /* + * Success, perform the usual cleanup of a processed + * head + */ + ret = cleanup_ref_head(trans, locked_ref); + if (ret > 0 ) { + /* We dropped our lock, we need to loop. */ + ret = 0; + continue; + } else if (ret) { + return ret; + } + } + + /* + * Either success case or btrfs_run_delayed_refs_for_head + * returned -EAGAIN, meaning we need to select another head + */ + + locked_ref = NULL; + cond_resched(); + } while ((nr != -1 && count < nr) || locked_ref); + /* * We don't want to include ref heads since we can have empty ref heads * and those will drastically skew our runtime down since we just do @@ -2745,9 +2789,9 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) return num_csums; } -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_rsv *global_rsv; u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; @@ -2782,8 +2826,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, return ret; } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) { u64 num_entries = atomic_read(&trans->transaction->delayed_refs.num_entries); @@ -2791,14 +2834,14 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, u64 val; smp_mb(); - avg_runtime = fs_info->avg_delayed_ref_runtime; + avg_runtime = trans->fs_info->avg_delayed_ref_runtime; val = num_entries * avg_runtime; if (val >= NSEC_PER_SEC) return 1; if (val >= NSEC_PER_SEC / 2) return 2; - return btrfs_check_space_for_delayed_refs(trans, fs_info); + return btrfs_check_space_for_delayed_refs(trans); } struct async_delayed_refs { @@ -2940,7 +2983,7 @@ again: btrfs_create_pending_block_groups(trans); spin_lock(&delayed_refs->lock); - node = rb_first(&delayed_refs->href_root); + node = rb_first_cached(&delayed_refs->href_root); if (!node) { spin_unlock(&delayed_refs->lock); goto out; @@ -3040,7 +3083,8 @@ static noinline int check_delayed_ref(struct btrfs_root *root, * XXX: We should replace this with a proper search function in the * future. */ - for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) { + for (node = rb_first_cached(&head->ref_tree); node; + node = rb_next(node)) { ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); /* If it's a shared ref we know a cross reference exists */ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { @@ -3139,7 +3183,6 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, { struct btrfs_path *path; int ret; - int ret2; path = btrfs_alloc_path(); if (!path) @@ -3151,17 +3194,9 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, if (ret && ret != -ENOENT) goto out; - ret2 = check_delayed_ref(root, path, objectid, - offset, bytenr); - } while (ret2 == -EAGAIN); - - if (ret2 && ret2 != -ENOENT) { - ret = ret2; - goto out; - } + ret = check_delayed_ref(root, path, objectid, offset, bytenr); + } while (ret == -EAGAIN); - if (ret != -ENOENT || ret2 != -ENOENT) - ret = 0; out: btrfs_free_path(path); if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) @@ -5284,7 +5319,7 @@ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, } static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int update_size) + u64 num_bytes, bool update_size) { spin_lock(&block_rsv->lock); block_rsv->reserved += num_bytes; @@ -5316,7 +5351,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, global_rsv->full = 0; spin_unlock(&global_rsv->lock); - block_rsv_add_bytes(dest, num_bytes, 1); + block_rsv_add_bytes(dest, num_bytes, true); return 0; } @@ -5479,7 +5514,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, struct btrfs_block_rsv *dst, u64 num_bytes, - int update_size) + bool update_size) { int ret; @@ -5539,10 +5574,8 @@ int btrfs_block_rsv_add(struct btrfs_root *root, return 0; ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 1); - return 0; - } + if (!ret) + block_rsv_add_bytes(block_rsv, num_bytes, true); return ret; } @@ -5587,7 +5620,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 0); + block_rsv_add_bytes(block_rsv, num_bytes, false); return 0; } @@ -5629,7 +5662,7 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, return ret; ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 0); + block_rsv_add_bytes(block_rsv, num_bytes, false); trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), num_bytes, 1); @@ -5835,7 +5868,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, BTRFS_RESERVE_FLUSH_ALL); if (ret == -ENOSPC && use_global_rsv) - ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1); + ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true); if (ret && qgroup_num_bytes) btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); @@ -6399,10 +6432,6 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, } else { cache->reserved += num_bytes; space_info->bytes_reserved += num_bytes; - - trace_btrfs_space_reservation(cache->fs_info, - "space_info", space_info->flags, - ram_bytes, 0); space_info->bytes_may_use -= ram_bytes; if (delalloc) cache->delalloc_bytes += num_bytes; @@ -6424,11 +6453,10 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, * reserve set to 0 in order to clear the reservation. */ -static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int delalloc) +static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int delalloc) { struct btrfs_space_info *space_info = cache->space_info; - int ret = 0; spin_lock(&space_info->lock); spin_lock(&cache->lock); @@ -6441,7 +6469,6 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, cache->delalloc_bytes -= num_bytes; spin_unlock(&cache->lock); spin_unlock(&space_info->lock); - return ret; } void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) { @@ -6925,7 +6952,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, goto out_delayed_unlock; spin_lock(&head->lock); - if (!RB_EMPTY_ROOT(&head->ref_tree)) + if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) goto out; if (head->extent_op) { @@ -6946,7 +6973,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, * at this point we have a head with no other entries. Go * ahead and process it. */ - rb_erase(&head->href_node, &delayed_refs->href_root); + rb_erase_cached(&head->href_node, &delayed_refs->href_root); RB_CLEAR_NODE(&head->href_node); atomic_dec(&delayed_refs->num_entries); @@ -8119,6 +8146,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (IS_ERR(buf)) return buf; + /* + * Extra safety check in case the extent tree is corrupted and extent + * allocator chooses to use a tree block which is already used and + * locked. + */ + if (buf->lock_owner == current->pid) { + btrfs_err_rl(fs_info, +"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", + buf->start, btrfs_header_owner(buf), current->pid); + free_extent_buffer(buf); + return ERR_PTR(-EUCLEAN); + } + btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); clean_tree_block(fs_info, buf); @@ -8215,7 +8255,7 @@ try_reserve: static void unuse_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u32 blocksize) { - block_rsv_add_bytes(block_rsv, blocksize, 0); + block_rsv_add_bytes(block_rsv, blocksize, false); block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL); } @@ -8642,7 +8682,13 @@ skip: parent = 0; } - if (need_account) { + /* + * Reloc tree doesn't contribute to qgroup numbers, and we have + * already accounted them at merge time (replace_path), + * thus we could skip expensive subtree trace here. + */ + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + need_account) { ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1); if (ret) { @@ -8763,15 +8809,14 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (eb == root->node) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = eb->start; - else - BUG_ON(root->root_key.objectid != - btrfs_header_owner(eb)); + else if (root->root_key.objectid != btrfs_header_owner(eb)) + goto owner_mismatch; } else { if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = path->nodes[level + 1]->start; - else - BUG_ON(root->root_key.objectid != - btrfs_header_owner(path->nodes[level + 1])); + else if (root->root_key.objectid != + btrfs_header_owner(path->nodes[level + 1])) + goto owner_mismatch; } btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); @@ -8779,6 +8824,11 @@ out: wc->refs[level] = 0; wc->flags[level] = 0; return 0; + +owner_mismatch: + btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", + btrfs_header_owner(eb), root->root_key.objectid); + return -EUCLEAN; } static noinline int walk_down_tree(struct btrfs_trans_handle *trans, @@ -8832,6 +8882,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, ret = walk_up_proc(trans, root, path, wc); if (ret > 0) return 0; + if (ret < 0) + return ret; if (path->locks[level]) { btrfs_tree_unlock_rw(path->nodes[level], @@ -8875,7 +8927,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int level; bool root_dropped = false; - btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid); + btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); path = btrfs_alloc_path(); if (!path) { @@ -9613,6 +9665,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) block_group = btrfs_lookup_first_block_group(info, last); while (block_group) { + wait_block_group_cache_done(block_group); spin_lock(&block_group->lock); if (block_group->iref) break; @@ -10074,7 +10127,7 @@ error: void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *block_group, *tmp; + struct btrfs_block_group_cache *block_group; struct btrfs_root *extent_root = fs_info->extent_root; struct btrfs_block_group_item item; struct btrfs_key key; @@ -10082,7 +10135,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) bool can_flush_pending_bgs = trans->can_flush_pending_bgs; trans->can_flush_pending_bgs = false; - list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { + while (!list_empty(&trans->new_bgs)) { + block_group = list_first_entry(&trans->new_bgs, + struct btrfs_block_group_cache, + bg_list); if (ret) goto next; @@ -10753,14 +10809,16 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, * We don't want a transaction for this since the discard may take a * substantial amount of time. We don't require that a transaction be * running, but we do need to take a running transaction into account - * to ensure that we're not discarding chunks that were released in - * the current transaction. + * to ensure that we're not discarding chunks that were released or + * allocated in the current transaction. * * Holding the chunks lock will prevent other threads from allocating * or releasing chunks, but it won't prevent a running transaction * from committing and releasing the memory that the pending chunks * list head uses. For that, we need to take a reference to the - * transaction. + * transaction and hold the commit root sem. We only need to hold + * it while performing the free space search since we have already + * held back allocations. */ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 minlen, u64 *trimmed) @@ -10770,6 +10828,10 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, *trimmed = 0; + /* Discard not supported = nothing to do. */ + if (!blk_queue_discard(bdev_get_queue(device->bdev))) + return 0; + /* Not writeable = nothing to do. */ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return 0; @@ -10787,9 +10849,13 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, ret = mutex_lock_interruptible(&fs_info->chunk_mutex); if (ret) - return ret; + break; - down_read(&fs_info->commit_root_sem); + ret = down_read_killable(&fs_info->commit_root_sem); + if (ret) { + mutex_unlock(&fs_info->chunk_mutex); + break; + } spin_lock(&fs_info->trans_lock); trans = fs_info->running_transaction; @@ -10797,13 +10863,17 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, refcount_inc(&trans->use_count); spin_unlock(&fs_info->trans_lock); + if (!trans) + up_read(&fs_info->commit_root_sem); + ret = find_free_dev_extent_start(trans, device, minlen, start, &start, &len); - if (trans) + if (trans) { + up_read(&fs_info->commit_root_sem); btrfs_put_transaction(trans); + } if (ret) { - up_read(&fs_info->commit_root_sem); mutex_unlock(&fs_info->chunk_mutex); if (ret == -ENOSPC) ret = 0; @@ -10811,7 +10881,6 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, } ret = btrfs_issue_discard(device->bdev, start, len, &bytes); - up_read(&fs_info->commit_root_sem); mutex_unlock(&fs_info->chunk_mutex); if (ret) @@ -10831,6 +10900,15 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, return ret; } +/* + * Trim the whole filesystem by: + * 1) trimming the free space in each block group + * 2) trimming the unallocated space on each device + * + * This will also continue trimming even if a block group or device encounters + * an error. The return value will be the last error, or 0 if nothing bad + * happens. + */ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) { struct btrfs_block_group_cache *cache = NULL; @@ -10840,18 +10918,14 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) u64 start; u64 end; u64 trimmed = 0; - u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); + u64 bg_failed = 0; + u64 dev_failed = 0; + int bg_ret = 0; + int dev_ret = 0; int ret = 0; - /* - * try to trim all FS space, our block group may start from non-zero. - */ - if (range->len == total_bytes) - cache = btrfs_lookup_first_block_group(fs_info, range->start); - else - cache = btrfs_lookup_block_group(fs_info, range->start); - - while (cache) { + cache = btrfs_lookup_first_block_group(fs_info, range->start); + for (; cache; cache = next_block_group(fs_info, cache)) { if (cache->key.objectid >= (range->start + range->len)) { btrfs_put_block_group(cache); break; @@ -10865,13 +10939,15 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) if (!block_group_cache_done(cache)) { ret = cache_block_group(cache, 0); if (ret) { - btrfs_put_block_group(cache); - break; + bg_failed++; + bg_ret = ret; + continue; } ret = wait_block_group_cache_done(cache); if (ret) { - btrfs_put_block_group(cache); - break; + bg_failed++; + bg_ret = ret; + continue; } } ret = btrfs_trim_block_group(cache, @@ -10882,28 +10958,40 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) trimmed += group_trimmed; if (ret) { - btrfs_put_block_group(cache); - break; + bg_failed++; + bg_ret = ret; + continue; } } - - cache = next_block_group(fs_info, cache); } + if (bg_failed) + btrfs_warn(fs_info, + "failed to trim %llu block group(s), last error %d", + bg_failed, bg_ret); mutex_lock(&fs_info->fs_devices->device_list_mutex); - devices = &fs_info->fs_devices->alloc_list; - list_for_each_entry(device, devices, dev_alloc_list) { + devices = &fs_info->fs_devices->devices; + list_for_each_entry(device, devices, dev_list) { ret = btrfs_trim_free_extents(device, range->minlen, &group_trimmed); - if (ret) + if (ret) { + dev_failed++; + dev_ret = ret; break; + } trimmed += group_trimmed; } mutex_unlock(&fs_info->fs_devices->device_list_mutex); + if (dev_failed) + btrfs_warn(fs_info, + "failed to trim %llu device(s), last error %d", + dev_failed, dev_ret); range->len = trimmed; - return ret; + if (bg_ret) + return bg_ret; + return dev_ret; } /* diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4dd6faab02bb..6877a74c7469 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1424,20 +1424,15 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, struct extent_state **cached_state) { struct extent_state *state; - struct rb_node *n; int ret = 1; spin_lock(&tree->lock); if (cached_state && *cached_state) { state = *cached_state; if (state->end == start - 1 && extent_state_in_tree(state)) { - n = rb_next(&state->rb_node); - while (n) { - state = rb_entry(n, struct extent_state, - rb_node); + while ((state = next_state(state)) != NULL) { if (state->state & bits) goto got_it; - n = rb_next(n); } free_extent_state(*cached_state); *cached_state = NULL; @@ -1568,7 +1563,7 @@ static noinline int lock_delalloc_pages(struct inode *inode, * * 1 is returned if we find something, 0 if nothing was in the tree */ -STATIC u64 find_lock_delalloc_range(struct inode *inode, +static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, u64 *end, u64 max_bytes) @@ -1648,6 +1643,17 @@ out_failed: return found; } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +u64 btrfs_find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, u64 *start, + u64 *end, u64 max_bytes) +{ + return find_lock_delalloc_range(inode, tree, locked_page, start, end, + max_bytes); +} +#endif + static int __process_pages_contig(struct address_space *mapping, struct page *locked_page, pgoff_t start_index, pgoff_t end_index, @@ -5165,11 +5171,11 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(atomic_read(&eb->refs) == 0); } -int set_extent_buffer_dirty(struct extent_buffer *eb) +bool set_extent_buffer_dirty(struct extent_buffer *eb) { int i; int num_pages; - int was_dirty = 0; + bool was_dirty; check_buffer_tree_ref(eb); @@ -5179,8 +5185,15 @@ int set_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); + if (!was_dirty) + for (i = 0; i < num_pages; i++) + set_page_dirty(eb->pages[i]); + +#ifdef CONFIG_BTRFS_DEBUG for (i = 0; i < num_pages; i++) - set_page_dirty(eb->pages[i]); + ASSERT(PageDirty(eb->pages[i])); +#endif + return was_dirty; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b4d03e677e1d..369daa5d4f73 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -479,7 +479,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len); void clear_extent_buffer_dirty(struct extent_buffer *eb); -int set_extent_buffer_dirty(struct extent_buffer *eb); +bool set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_under_io(struct extent_buffer *eb); @@ -546,7 +546,7 @@ int free_io_failure(struct extent_io_tree *failure_tree, struct extent_io_tree *io_tree, struct io_failure_record *rec); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -noinline u64 find_lock_delalloc_range(struct inode *inode, +u64 btrfs_find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, u64 *end, u64 max_bytes); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6648d55e5339..7eea8b6e2cd3 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -34,7 +34,7 @@ void __cold extent_map_exit(void) */ void extent_map_tree_init(struct extent_map_tree *tree) { - tree->map = RB_ROOT; + tree->map = RB_ROOT_CACHED; INIT_LIST_HEAD(&tree->modified_extents); rwlock_init(&tree->lock); } @@ -90,24 +90,27 @@ static u64 range_end(u64 start, u64 len) return start + len; } -static int tree_insert(struct rb_root *root, struct extent_map *em) +static int tree_insert(struct rb_root_cached *root, struct extent_map *em) { - struct rb_node **p = &root->rb_node; + struct rb_node **p = &root->rb_root.rb_node; struct rb_node *parent = NULL; struct extent_map *entry = NULL; struct rb_node *orig_parent = NULL; u64 end = range_end(em->start, em->len); + bool leftmost = true; while (*p) { parent = *p; entry = rb_entry(parent, struct extent_map, rb_node); - if (em->start < entry->start) + if (em->start < entry->start) { p = &(*p)->rb_left; - else if (em->start >= extent_map_end(entry)) + } else if (em->start >= extent_map_end(entry)) { p = &(*p)->rb_right; - else + leftmost = false; + } else { return -EEXIST; + } } orig_parent = parent; @@ -130,7 +133,7 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) return -EEXIST; rb_link_node(&em->rb_node, orig_parent, p); - rb_insert_color(&em->rb_node, root); + rb_insert_color_cached(&em->rb_node, root, leftmost); return 0; } @@ -242,7 +245,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); - rb_erase(&merge->rb_node, &tree->map); + rb_erase_cached(&merge->rb_node, &tree->map); RB_CLEAR_NODE(&merge->rb_node); free_extent_map(merge); } @@ -254,7 +257,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) if (rb && mergable_maps(em, merge)) { em->len += merge->len; em->block_len += merge->block_len; - rb_erase(&merge->rb_node, &tree->map); + rb_erase_cached(&merge->rb_node, &tree->map); RB_CLEAR_NODE(&merge->rb_node); em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; em->generation = max(em->generation, merge->generation); @@ -367,7 +370,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree, struct rb_node *next = NULL; u64 end = range_end(start, len); - rb_node = __tree_search(&tree->map, start, &prev, &next); + rb_node = __tree_search(&tree->map.rb_root, start, &prev, &next); if (!rb_node) { if (prev) rb_node = prev; @@ -428,16 +431,13 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, * Removes @em from @tree. No reference counts are dropped, and no checks * are done to see if the range is in use */ -int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) +void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { - int ret = 0; - WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); - rb_erase(&em->rb_node, &tree->map); + rb_erase_cached(&em->rb_node, &tree->map); if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) list_del_init(&em->list); RB_CLEAR_NODE(&em->rb_node); - return ret; } void replace_extent_mapping(struct extent_map_tree *tree, @@ -449,7 +449,7 @@ void replace_extent_mapping(struct extent_map_tree *tree, ASSERT(extent_map_in_tree(cur)); if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags)) list_del_init(&cur->list); - rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map); + rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map); RB_CLEAR_NODE(&cur->rb_node); setup_extent_mapping(tree, new, modified); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 25d985e7532a..31977ffd6190 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -49,7 +49,7 @@ struct extent_map { }; struct extent_map_tree { - struct rb_root map; + struct rb_root_cached map; struct list_head modified_extents; rwlock_t lock; }; @@ -78,7 +78,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified); -int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); +void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); void replace_extent_mapping(struct extent_map_tree *tree, struct extent_map *cur, struct extent_map *new, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2be00e873e92..15b925142793 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -531,6 +531,14 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, end_of_last_block = start_pos + num_bytes - 1; + /* + * The pages may have already been dirty, clear out old accounting so + * we can set things up properly + */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached); + if (!btrfs_is_free_space_inode(BTRFS_I(inode))) { if (start_pos >= isize && !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) { @@ -1500,18 +1508,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } if (ordered) btrfs_put_ordered_extent(ordered); - clear_extent_bit(&inode->io_tree, start_pos, last_pos, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, cached_state); + *lockstart = start_pos; *lockend = last_pos; ret = 1; } + /* + * It's possible the pages are dirty right now, but we don't want + * to clean them yet because copy_from_user may catch a page fault + * and we might have to fall back to one page at a time. If that + * happens, we'll unlock these pages and we'd have a window where + * reclaim could sneak in and drop the once-dirty page on the floor + * without writing it. + * + * We have the pages locked and the extent range locked, so there's + * no way someone can start IO on any dirty pages in this range. + * + * We'll call btrfs_dirty_pages() later on, and that will flip around + * delalloc bits and dirty the pages as required. + */ for (i = 0; i < num_pages; i++) { - if (clear_page_dirty_for_io(pages[i])) - account_page_redirty(pages[i]); set_page_extent_mapped(pages[i]); WARN_ON(!PageLocked(pages[i])); } @@ -2544,7 +2561,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, - min_size, 0); + min_size, false); BUG_ON(ret); trans->block_rsv = rsv; @@ -2594,7 +2611,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, - rsv, min_size, 0); + rsv, min_size, false); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0adf38b00fa0..67441219d6c9 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -10,6 +10,7 @@ #include <linux/math64.h> #include <linux/ratelimit.h> #include <linux/error-injection.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" @@ -47,6 +48,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header *header; struct extent_buffer *leaf; struct inode *inode = NULL; + unsigned nofs_flag; int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; @@ -68,7 +70,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, btrfs_disk_key_to_cpu(&location, &disk_key); btrfs_release_path(path); + /* + * We are often under a trans handle at this point, so we need to make + * sure NOFS is set to keep us from deadlocking. + */ + nofs_flag = memalloc_nofs_save(); inode = btrfs_iget(fs_info->sb, &location, root, NULL); + memalloc_nofs_restore(nofs_flag); if (IS_ERR(inode)) return inode; @@ -1679,6 +1687,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, bitmap_clear(info->bitmap, start, count); info->bytes -= bytes; + if (info->max_extent_size > ctl->unit) + info->max_extent_size = 0; } static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, @@ -2110,8 +2120,7 @@ new_bitmap: out: if (info) { - if (info->bitmap) - kfree(info->bitmap); + kfree(info->bitmap); kmem_cache_free(btrfs_free_space_cachep, info); } @@ -3601,8 +3610,7 @@ again: if (info) kmem_cache_free(btrfs_free_space_cachep, info); - if (map) - kfree(map); + kfree(map); return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3ea5339603cf..f22f77172c5f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -64,7 +64,6 @@ static const struct inode_operations btrfs_dir_ro_inode_operations; static const struct inode_operations btrfs_special_inode_operations; static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; -static const struct address_space_operations btrfs_symlink_aops; static const struct file_operations btrfs_dir_file_operations; static const struct extent_io_ops btrfs_extent_io_ops; @@ -2750,12 +2749,9 @@ static void relink_file_extents(struct new_sa_defrag_extent *new) struct btrfs_path *path; struct sa_defrag_extent_backref *backref; struct sa_defrag_extent_backref *prev = NULL; - struct inode *inode; struct rb_node *node; int ret; - inode = new->inode; - path = btrfs_alloc_path(); if (!path) return; @@ -3471,8 +3467,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) /* this will do delete_inode and everything for us */ iput(inode); - if (ret) - goto out; } /* release the path since we're done with it */ btrfs_release_path(path); @@ -3738,7 +3732,7 @@ cache_acl: case S_IFLNK: inode->i_op = &btrfs_symlink_inode_operations; inode_nohighmem(inode); - inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->a_ops = &btrfs_aops; break; default: inode->i_op = &btrfs_special_inode_operations; @@ -3910,12 +3904,8 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, path->leave_spinning = 1; di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, name_len, -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto err; - } - if (!di) { - ret = -ENOENT; + if (IS_ERR_OR_NULL(di)) { + ret = di ? PTR_ERR(di) : -ENOENT; goto err; } leaf = path->nodes[0]; @@ -4075,10 +4065,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, name_len, -1); if (IS_ERR_OR_NULL(di)) { - if (!di) - ret = -ENOENT; - else - ret = PTR_ERR(di); + ret = di ? PTR_ERR(di) : -ENOENT; goto out; } @@ -4270,18 +4257,17 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) * again is not run concurrently. */ spin_lock(&dest->root_item_lock); - root_flags = btrfs_root_flags(&dest->root_item); - if (dest->send_in_progress == 0) { - btrfs_set_root_flags(&dest->root_item, - root_flags | BTRFS_ROOT_SUBVOL_DEAD); - spin_unlock(&dest->root_item_lock); - } else { + if (dest->send_in_progress) { spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu during send", dest->root_key.objectid); return -EPERM; } + root_flags = btrfs_root_flags(&dest->root_item); + btrfs_set_root_flags(&dest->root_item, + root_flags | BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); down_write(&fs_info->subvol_sem); @@ -4727,7 +4713,7 @@ delete: btrfs_abort_transaction(trans, ret); break; } - if (btrfs_should_throttle_delayed_refs(trans, fs_info)) + if (btrfs_should_throttle_delayed_refs(trans)) btrfs_async_run_delayed_refs(fs_info, trans->delayed_ref_updates * 2, trans->transid, 0); @@ -4736,8 +4722,7 @@ delete: extent_num_bytes)) { should_end = true; } - if (btrfs_should_throttle_delayed_refs(trans, - fs_info)) + if (btrfs_should_throttle_delayed_refs(trans)) should_throttle = true; } } @@ -5235,10 +5220,10 @@ static void evict_inode_truncate_pages(struct inode *inode) truncate_inode_pages_final(&inode->i_data); write_lock(&map_tree->lock); - while (!RB_EMPTY_ROOT(&map_tree->map)) { + while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) { struct extent_map *em; - node = rb_first(&map_tree->map); + node = rb_first_cached(&map_tree->map); em = rb_entry(node, struct extent_map, rb_node); clear_bit(EXTENT_FLAG_PINNED, &em->flags); clear_bit(EXTENT_FLAG_LOGGING, &em->flags); @@ -5306,8 +5291,7 @@ static void evict_inode_truncate_pages(struct inode *inode) } static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, - struct btrfs_block_rsv *rsv, - u64 min_size) + struct btrfs_block_rsv *rsv) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; @@ -5317,7 +5301,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, struct btrfs_trans_handle *trans; int ret; - ret = btrfs_block_rsv_refill(root, rsv, min_size, + ret = btrfs_block_rsv_refill(root, rsv, rsv->size, BTRFS_RESERVE_FLUSH_LIMIT); if (ret && ++failures > 2) { @@ -5334,8 +5318,8 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, * Try to steal from the global reserve if there is space for * it. */ - if (!btrfs_check_space_for_delayed_refs(trans, fs_info) && - !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0)) + if (!btrfs_check_space_for_delayed_refs(trans) && + !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, false)) return trans; /* If not, commit and try again. */ @@ -5351,7 +5335,6 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv; - u64 min_size; int ret; trace_btrfs_inode_evict(inode); @@ -5361,8 +5344,6 @@ void btrfs_evict_inode(struct inode *inode) return; } - min_size = btrfs_calc_trunc_metadata_size(fs_info, 1); - evict_inode_truncate_pages(inode); if (inode->i_nlink && @@ -5373,9 +5354,6 @@ void btrfs_evict_inode(struct inode *inode) if (is_bad_inode(inode)) goto no_delete; - /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ - if (!special_file(inode->i_mode)) - btrfs_wait_ordered_range(inode, 0, (u64)-1); btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); @@ -5395,13 +5373,13 @@ void btrfs_evict_inode(struct inode *inode) rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) goto no_delete; - rsv->size = min_size; + rsv->size = btrfs_calc_trunc_metadata_size(fs_info, 1); rsv->failfast = 1; btrfs_i_size_write(BTRFS_I(inode), 0); while (1) { - trans = evict_refill_and_join(root, rsv, min_size); + trans = evict_refill_and_join(root, rsv); if (IS_ERR(trans)) goto free_rsv; @@ -5426,7 +5404,7 @@ void btrfs_evict_inode(struct inode *inode) * If it turns out that we are dropping too many of these, we might want * to add a mechanism for retrying these after a commit. */ - trans = evict_refill_and_join(root, rsv, min_size); + trans = evict_refill_and_join(root, rsv); if (!IS_ERR(trans)) { trans->block_rsv = rsv; btrfs_orphan_del(trans, BTRFS_I(inode)); @@ -5471,12 +5449,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), name, namelen, 0); - if (!di) { - ret = -ENOENT; - goto out; - } - if (IS_ERR(di)) { - ret = PTR_ERR(di); + if (IS_ERR_OR_NULL(di)) { + ret = di ? PTR_ERR(di) : -ENOENT; goto out; } @@ -6390,8 +6364,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = btrfs_insert_dir_item(trans, root, name, name_len, - parent_inode, &key, + ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key, btrfs_inode_type(&inode->vfs_inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; @@ -6584,7 +6557,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, int drop_inode = 0; /* do not allow sys_link's with other subvols of the same device */ - if (root->objectid != BTRFS_I(inode)->root->objectid) + if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid) return -EXDEV; if (inode->i_nlink >= BTRFS_LINK_MAX) @@ -6777,9 +6750,9 @@ static noinline int uncompress_inline(struct btrfs_path *path, * This also copies inline extents directly into the page. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, - size_t pg_offset, u64 start, u64 len, - int create) + struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret; @@ -6823,19 +6796,21 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, em->len = (u64)-1; em->block_len = (u64)-1; + path = btrfs_alloc_path(); if (!path) { - path = btrfs_alloc_path(); - if (!path) { - err = -ENOMEM; - goto out; - } - /* - * Chances are we'll be called again, so go ahead and do - * readahead - */ - path->reada = READA_FORWARD; + err = -ENOMEM; + goto out; } + /* Chances are we'll be called again, so go ahead and do readahead */ + path->reada = READA_FORWARD; + + /* + * Unless we're going to uncompress the inline extent, no sleep would + * happen. + */ + path->leave_spinning = 1; + ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); if (ret < 0) { err = ret; @@ -6938,6 +6913,8 @@ next: em->orig_block_len = em->len; em->orig_start = em->start; ptr = btrfs_file_extent_inline_start(item) + extent_offset; + + btrfs_set_path_blocking(path); if (!PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != BTRFS_COMPRESS_NONE) { @@ -6985,10 +6962,10 @@ insert: err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); write_unlock(&em_tree->lock); out: + btrfs_free_path(path); trace_btrfs_get_extent(root, inode, em); - btrfs_free_path(path); if (err) { free_extent_map(em); return ERR_PTR(err); @@ -9021,7 +8998,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) /* Migrate the slack space for the truncate to our reserve */ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, - min_size, 0); + min_size, false); BUG_ON(ret); /* @@ -9058,7 +9035,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) btrfs_block_rsv_release(fs_info, rsv, -1); ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, - rsv, min_size, 0); + rsv, min_size, false); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; } @@ -10191,7 +10168,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_symlink_inode_operations; inode_nohighmem(inode); - inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->a_ops = &btrfs_aops; inode_set_bytes(inode, name_len); btrfs_i_size_write(BTRFS_I(inode), name_len); err = btrfs_update_inode(trans, root, inode); @@ -10567,13 +10544,6 @@ static const struct address_space_operations btrfs_aops = { .error_remove_page = generic_error_remove_page, }; -static const struct address_space_operations btrfs_symlink_aops = { - .readpage = btrfs_readpage, - .writepage = btrfs_writepage, - .invalidatepage = btrfs_invalidatepage, - .releasepage = btrfs_releasepage, -}; - static const struct inode_operations btrfs_file_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d60b6caf09e8..a990a9045139 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -491,7 +491,6 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; - u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); int ret; if (!capable(CAP_SYS_ADMIN)) @@ -515,11 +514,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) return -EOPNOTSUPP; if (copy_from_user(&range, arg, sizeof(range))) return -EFAULT; - if (range.start > total_bytes || - range.len < fs_info->sb->s_blocksize) + + /* + * NOTE: Don't truncate the range using super->total_bytes. Bytenr of + * block group is in the logical address space, which can be any + * sectorsize aligned bytenr in the range [0, U64_MAX]. + */ + if (range.len < fs_info->sb->s_blocksize) return -EINVAL; - range.len = min(range.len, total_bytes - range.start); range.minlen = max(range.minlen, minlen); ret = btrfs_trim_fs(fs_info, &range); if (ret < 0) @@ -686,8 +689,7 @@ static noinline int create_subvol(struct inode *dir, goto fail; } - ret = btrfs_insert_dir_item(trans, root, - name, namelen, BTRFS_I(dir), &key, + ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key, BTRFS_FT_DIR, index); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1324,7 +1326,7 @@ again: if (i_done != page_cnt) { spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; + btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, @@ -4393,7 +4395,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = PTR_ERR(new_root); goto out; } - if (!is_fstree(new_root->objectid)) { + if (!is_fstree(new_root->root_key.objectid)) { ret = -ENOENT; goto out; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index d4917c0cddf5..45868fd76209 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1416,13 +1416,14 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) if (!qgroup) { ret = -ENOENT; goto out; - } else { - /* check if there are no children of this qgroup */ - if (!list_empty(&qgroup->members)) { - ret = -EBUSY; - goto out; - } } + + /* Check if there are no children of this qgroup */ + if (!list_empty(&qgroup->members)) { + ret = -EBUSY; + goto out; + } + ret = del_qgroup_item(trans, qgroupid); if (ret && ret != -ENOENT) goto out; @@ -1712,6 +1713,416 @@ static int adjust_slots_upwards(struct btrfs_path *path, int root_level) return 0; } +/* + * Helper function to trace a subtree tree block swap. + * + * The swap will happen in highest tree block, but there may be a lot of + * tree blocks involved. + * + * For example: + * OO = Old tree blocks + * NN = New tree blocks allocated during balance + * + * File tree (257) Reloc tree for 257 + * L2 OO NN + * / \ / \ + * L1 OO OO (a) OO NN (a) + * / \ / \ / \ / \ + * L0 OO OO OO OO OO OO NN NN + * (b) (c) (b) (c) + * + * When calling qgroup_trace_extent_swap(), we will pass: + * @src_eb = OO(a) + * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ] + * @dst_level = 0 + * @root_level = 1 + * + * In that case, qgroup_trace_extent_swap() will search from OO(a) to + * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty. + * + * The main work of qgroup_trace_extent_swap() can be split into 3 parts: + * + * 1) Tree search from @src_eb + * It should acts as a simplified btrfs_search_slot(). + * The key for search can be extracted from @dst_path->nodes[dst_level] + * (first key). + * + * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty + * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty. + * They should be marked during preivous (@dst_level = 1) iteration. + * + * 3) Mark file extents in leaves dirty + * We don't have good way to pick out new file extents only. + * So we still follow the old method by scanning all file extents in + * the leave. + * + * This function can free us from keeping two pathes, thus later we only need + * to care about how to iterate all new tree blocks in reloc tree. + */ +static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, + struct extent_buffer *src_eb, + struct btrfs_path *dst_path, + int dst_level, int root_level, + bool trace_leaf) +{ + struct btrfs_key key; + struct btrfs_path *src_path; + struct btrfs_fs_info *fs_info = trans->fs_info; + u32 nodesize = fs_info->nodesize; + int cur_level = root_level; + int ret; + + BUG_ON(dst_level > root_level); + /* Level mismatch */ + if (btrfs_header_level(src_eb) != root_level) + return -EINVAL; + + src_path = btrfs_alloc_path(); + if (!src_path) { + ret = -ENOMEM; + goto out; + } + + if (dst_level) + btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0); + else + btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0); + + /* For src_path */ + extent_buffer_get(src_eb); + src_path->nodes[root_level] = src_eb; + src_path->slots[root_level] = dst_path->slots[root_level]; + src_path->locks[root_level] = 0; + + /* A simplified version of btrfs_search_slot() */ + while (cur_level >= dst_level) { + struct btrfs_key src_key; + struct btrfs_key dst_key; + + if (src_path->nodes[cur_level] == NULL) { + struct btrfs_key first_key; + struct extent_buffer *eb; + int parent_slot; + u64 child_gen; + u64 child_bytenr; + + eb = src_path->nodes[cur_level + 1]; + parent_slot = src_path->slots[cur_level + 1]; + child_bytenr = btrfs_node_blockptr(eb, parent_slot); + child_gen = btrfs_node_ptr_generation(eb, parent_slot); + btrfs_node_key_to_cpu(eb, &first_key, parent_slot); + + eb = read_tree_block(fs_info, child_bytenr, child_gen, + cur_level, &first_key); + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } else if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + ret = -EIO; + goto out; + } + + src_path->nodes[cur_level] = eb; + + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING; + } + + src_path->slots[cur_level] = dst_path->slots[cur_level]; + if (cur_level) { + btrfs_node_key_to_cpu(dst_path->nodes[cur_level], + &dst_key, dst_path->slots[cur_level]); + btrfs_node_key_to_cpu(src_path->nodes[cur_level], + &src_key, src_path->slots[cur_level]); + } else { + btrfs_item_key_to_cpu(dst_path->nodes[cur_level], + &dst_key, dst_path->slots[cur_level]); + btrfs_item_key_to_cpu(src_path->nodes[cur_level], + &src_key, src_path->slots[cur_level]); + } + /* Content mismatch, something went wrong */ + if (btrfs_comp_cpu_keys(&dst_key, &src_key)) { + ret = -ENOENT; + goto out; + } + cur_level--; + } + + /* + * Now both @dst_path and @src_path have been populated, record the tree + * blocks for qgroup accounting. + */ + ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start, + nodesize, GFP_NOFS); + if (ret < 0) + goto out; + ret = btrfs_qgroup_trace_extent(trans, + dst_path->nodes[dst_level]->start, + nodesize, GFP_NOFS); + if (ret < 0) + goto out; + + /* Record leaf file extents */ + if (dst_level == 0 && trace_leaf) { + ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]); + if (ret < 0) + goto out; + ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]); + } +out: + btrfs_free_path(src_path); + return ret; +} + +/* + * Helper function to do recursive generation-aware depth-first search, to + * locate all new tree blocks in a subtree of reloc tree. + * + * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot) + * reloc tree + * L2 NN (a) + * / \ + * L1 OO NN (b) + * / \ / \ + * L0 OO OO OO NN + * (c) (d) + * If we pass: + * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ], + * @cur_level = 1 + * @root_level = 1 + * + * We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace + * above tree blocks along with their counter parts in file tree. + * While during search, old tree blocsk OO(c) will be skiped as tree block swap + * won't affect OO(c). + */ +static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, + struct extent_buffer *src_eb, + struct btrfs_path *dst_path, + int cur_level, int root_level, + u64 last_snapshot, bool trace_leaf) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct extent_buffer *eb; + bool need_cleanup = false; + int ret = 0; + int i; + + /* Level sanity check */ + if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL || + root_level < 0 || root_level >= BTRFS_MAX_LEVEL || + root_level < cur_level) { + btrfs_err_rl(fs_info, + "%s: bad levels, cur_level=%d root_level=%d", + __func__, cur_level, root_level); + return -EUCLEAN; + } + + /* Read the tree block if needed */ + if (dst_path->nodes[cur_level] == NULL) { + struct btrfs_key first_key; + int parent_slot; + u64 child_gen; + u64 child_bytenr; + + /* + * dst_path->nodes[root_level] must be initialized before + * calling this function. + */ + if (cur_level == root_level) { + btrfs_err_rl(fs_info, + "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d", + __func__, root_level, root_level, cur_level); + return -EUCLEAN; + } + + /* + * We need to get child blockptr/gen from parent before we can + * read it. + */ + eb = dst_path->nodes[cur_level + 1]; + parent_slot = dst_path->slots[cur_level + 1]; + child_bytenr = btrfs_node_blockptr(eb, parent_slot); + child_gen = btrfs_node_ptr_generation(eb, parent_slot); + btrfs_node_key_to_cpu(eb, &first_key, parent_slot); + + /* This node is old, no need to trace */ + if (child_gen < last_snapshot) + goto out; + + eb = read_tree_block(fs_info, child_bytenr, child_gen, + cur_level, &first_key); + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } else if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + ret = -EIO; + goto out; + } + + dst_path->nodes[cur_level] = eb; + dst_path->slots[cur_level] = 0; + + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING; + need_cleanup = true; + } + + /* Now record this tree block and its counter part for qgroups */ + ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level, + root_level, trace_leaf); + if (ret < 0) + goto cleanup; + + eb = dst_path->nodes[cur_level]; + + if (cur_level > 0) { + /* Iterate all child tree blocks */ + for (i = 0; i < btrfs_header_nritems(eb); i++) { + /* Skip old tree blocks as they won't be swapped */ + if (btrfs_node_ptr_generation(eb, i) < last_snapshot) + continue; + dst_path->slots[cur_level] = i; + + /* Recursive call (at most 7 times) */ + ret = qgroup_trace_new_subtree_blocks(trans, src_eb, + dst_path, cur_level - 1, root_level, + last_snapshot, trace_leaf); + if (ret < 0) + goto cleanup; + } + } + +cleanup: + if (need_cleanup) { + /* Clean up */ + btrfs_tree_unlock_rw(dst_path->nodes[cur_level], + dst_path->locks[cur_level]); + free_extent_buffer(dst_path->nodes[cur_level]); + dst_path->nodes[cur_level] = NULL; + dst_path->slots[cur_level] = 0; + dst_path->locks[cur_level] = 0; + } +out: + return ret; +} + +/* + * Inform qgroup to trace subtree swap used in balance. + * + * Unlike btrfs_qgroup_trace_subtree(), this function will only trace + * new tree blocks whose generation is equal to (or larger than) @last_snapshot. + * + * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and + * @dst_slot), and find any tree blocks whose generation is at @last_snapshot, + * and then go down @src_eb (pointed by @src_parent and @src_slot) to find + * the conterpart of the tree block, then mark both tree blocks as qgroup dirty, + * and skip all tree blocks whose generation is smaller than last_snapshot. + * + * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(), + * which could be the cause of very slow balance if the file tree is large. + * + * @src_parent, @src_slot: pointer to src (file tree) eb. + * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb. + */ +int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *bg_cache, + struct extent_buffer *src_parent, int src_slot, + struct extent_buffer *dst_parent, int dst_slot, + u64 last_snapshot) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_path *dst_path = NULL; + struct btrfs_key first_key; + struct extent_buffer *src_eb = NULL; + struct extent_buffer *dst_eb = NULL; + bool trace_leaf = false; + u64 child_gen; + u64 child_bytenr; + int level; + int ret; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return 0; + + /* Check parameter order */ + if (btrfs_node_ptr_generation(src_parent, src_slot) > + btrfs_node_ptr_generation(dst_parent, dst_slot)) { + btrfs_err_rl(fs_info, + "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, + btrfs_node_ptr_generation(src_parent, src_slot), + btrfs_node_ptr_generation(dst_parent, dst_slot)); + return -EUCLEAN; + } + + /* + * Only trace leaf if we're relocating data block groups, this could + * reduce tons of data extents tracing for meta/sys bg relocation. + */ + if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA) + trace_leaf = true; + /* Read out real @src_eb, pointed by @src_parent and @src_slot */ + child_bytenr = btrfs_node_blockptr(src_parent, src_slot); + child_gen = btrfs_node_ptr_generation(src_parent, src_slot); + btrfs_node_key_to_cpu(src_parent, &first_key, src_slot); + + src_eb = read_tree_block(fs_info, child_bytenr, child_gen, + btrfs_header_level(src_parent) - 1, &first_key); + if (IS_ERR(src_eb)) { + ret = PTR_ERR(src_eb); + goto out; + } + + /* Read out real @dst_eb, pointed by @src_parent and @src_slot */ + child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot); + child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot); + btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot); + + dst_eb = read_tree_block(fs_info, child_bytenr, child_gen, + btrfs_header_level(dst_parent) - 1, &first_key); + if (IS_ERR(dst_eb)) { + ret = PTR_ERR(dst_eb); + goto out; + } + + if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { + ret = -EINVAL; + goto out; + } + + level = btrfs_header_level(dst_eb); + dst_path = btrfs_alloc_path(); + if (!dst_path) { + ret = -ENOMEM; + goto out; + } + + /* For dst_path */ + extent_buffer_get(dst_eb); + dst_path->nodes[level] = dst_eb; + dst_path->slots[level] = 0; + dst_path->locks[level] = 0; + + /* Do the generation-aware breadth-first search */ + ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level, + level, last_snapshot, trace_leaf); + if (ret < 0) + goto out; + ret = 0; + +out: + free_extent_buffer(src_eb); + free_extent_buffer(dst_eb); + btrfs_free_path(dst_path); + if (ret < 0) + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + return ret; +} + int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *root_eb, u64 root_gen, int root_level) @@ -2132,6 +2543,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) struct btrfs_delayed_ref_root *delayed_refs; struct ulist *new_roots = NULL; struct rb_node *node; + u64 num_dirty_extents = 0; u64 qgroup_to_skip; int ret = 0; @@ -2141,6 +2553,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) record = rb_entry(node, struct btrfs_qgroup_extent_record, node); + num_dirty_extents++; trace_btrfs_qgroup_account_extents(fs_info, record); if (!ret) { @@ -2186,6 +2599,8 @@ cleanup: kfree(record); } + trace_qgroup_num_dirty_extents(fs_info, trans->transid, + num_dirty_extents); return ret; } @@ -2897,6 +3312,7 @@ qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) qgroup->rfer_cmpr = 0; qgroup->excl = 0; qgroup->excl_cmpr = 0; + qgroup_dirty(fs_info, qgroup); } spin_unlock(&fs_info->qgroup_lock); } @@ -3004,7 +3420,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode, int ret; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || - !is_fstree(root->objectid) || len == 0) + !is_fstree(root->root_key.objectid) || len == 0) return 0; /* @reserved parameter is mandatory for qgroup */ @@ -3090,7 +3506,7 @@ static int qgroup_free_reserved_data(struct inode *inode, goto out; freed += changeset.bytes_changed; } - btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed, + btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, BTRFS_QGROUP_RSV_DATA); ret = freed; out: @@ -3106,6 +3522,10 @@ static int __btrfs_qgroup_release_data(struct inode *inode, int trace_op = QGROUP_RELEASE; int ret; + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, + &BTRFS_I(inode)->root->fs_info->flags)) + return 0; + /* In release case, we shouldn't have @reserved */ WARN_ON(!free && reserved); if (free && reserved) @@ -3122,7 +3542,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, changeset.bytes_changed, trace_op); if (free) btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, - BTRFS_I(inode)->root->objectid, + BTRFS_I(inode)->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); ret = changeset.bytes_changed; out: @@ -3215,7 +3635,7 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, int ret; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - !is_fstree(root->objectid) || num_bytes == 0) + !is_fstree(root->root_key.objectid) || num_bytes == 0) return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); @@ -3240,13 +3660,13 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) struct btrfs_fs_info *fs_info = root->fs_info; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - !is_fstree(root->objectid)) + !is_fstree(root->root_key.objectid)) return; /* TODO: Update trace point to handle such free */ trace_qgroup_meta_free_all_pertrans(root); /* Special value -1 means to free all reserved space */ - btrfs_qgroup_free_refroot(fs_info, root->objectid, (u64)-1, + btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1, BTRFS_QGROUP_RSV_META_PERTRANS); } @@ -3256,7 +3676,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, struct btrfs_fs_info *fs_info = root->fs_info; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - !is_fstree(root->objectid)) + !is_fstree(root->root_key.objectid)) return; /* @@ -3267,7 +3687,8 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, num_bytes = sub_root_meta_rsv(root, num_bytes, type); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); trace_qgroup_meta_reserve(root, type, -(s64)num_bytes); - btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type); + btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, + num_bytes, type); } static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, @@ -3321,13 +3742,13 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) struct btrfs_fs_info *fs_info = root->fs_info; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - !is_fstree(root->objectid)) + !is_fstree(root->root_key.objectid)) return; /* Same as btrfs_qgroup_free_meta_prealloc() */ num_bytes = sub_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); trace_qgroup_meta_convert(root, num_bytes); - qgroup_convert_meta(fs_info, root->objectid, num_bytes); + qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes); } /* @@ -3354,7 +3775,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) inode->i_ino, unode->val, unode->aux); } btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, - BTRFS_I(inode)->root->objectid, + BTRFS_I(inode)->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 54b8bb282c0e..d8f78f5ab854 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -236,6 +236,12 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *root_eb, u64 root_gen, int root_level); + +int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *bg_cache, + struct extent_buffer *src_parent, int src_slot, + struct extent_buffer *dst_parent, int dst_slot, + u64 last_snapshot); int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct ulist *old_roots, struct ulist *new_roots); @@ -249,6 +255,8 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes) { + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return; trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes, BTRFS_QGROUP_RSV_DATA); diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index e5b9e596bb92..d69fbfb30aa9 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -732,7 +732,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, INIT_LIST_HEAD(&ra->list); ra->action = action; - ra->root = root->objectid; + ra->root = root->root_key.objectid; /* * This is an allocation, preallocate the block_entry in case we haven't @@ -787,8 +787,8 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, * one we want to lookup below when we modify the * re->num_refs. */ - ref_root = root->objectid; - re->root_objectid = root->objectid; + ref_root = root->root_key.objectid; + re->root_objectid = root->root_key.objectid; re->num_refs = 0; } @@ -862,7 +862,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, * didn't thik of some other corner case. */ btrfs_err(fs_info, "failed to find root %llu for %llu", - root->objectid, be->bytenr); + root->root_key.objectid, be->bytenr); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); kfree(ra); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8783a1776540..924116f654a1 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -648,8 +648,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, int level, u64 bytenr) { struct backref_cache *cache = &rc->backref_cache; - struct btrfs_path *path1; - struct btrfs_path *path2; + struct btrfs_path *path1; /* For searching extent root */ + struct btrfs_path *path2; /* For searching parent of TREE_BLOCK_REF */ struct extent_buffer *eb; struct btrfs_root *root; struct backref_node *cur; @@ -662,7 +662,7 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, struct btrfs_key key; unsigned long end; unsigned long ptr; - LIST_HEAD(list); + LIST_HEAD(list); /* Pending edge list, upper node needs to be checked */ LIST_HEAD(useless); int cowonly; int ret; @@ -778,6 +778,10 @@ again: key.type != BTRFS_SHARED_BLOCK_REF_KEY); } + /* + * Parent node found and matches current inline ref, no need to + * rebuild this node for this inline ref. + */ if (exist && ((key.type == BTRFS_TREE_BLOCK_REF_KEY && exist->owner == key.offset) || @@ -787,11 +791,12 @@ again: goto next; } + /* SHARED_BLOCK_REF means key.offset is the parent bytenr */ if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { if (key.objectid == key.offset) { /* - * only root blocks of reloc trees use - * backref of this type. + * Only root blocks of reloc trees use backref + * pointing to itself. */ root = find_reloc_root(rc, cur->bytenr); ASSERT(root); @@ -840,7 +845,11 @@ again: goto next; } - /* key.type == BTRFS_TREE_BLOCK_REF_KEY */ + /* + * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset + * means the root objectid. We need to search the tree to get + * its parent bytenr. + */ root = read_fs_root(rc->extent_root->fs_info, key.offset); if (IS_ERR(root)) { err = PTR_ERR(root); @@ -863,10 +872,7 @@ again: level = cur->level + 1; - /* - * searching the tree to find upper level blocks - * reference the block. - */ + /* Search the tree to find parent blocks referring the block. */ path2->search_commit_root = 1; path2->skip_locking = 1; path2->lowest_level = level; @@ -884,7 +890,8 @@ again: cur->bytenr) { btrfs_err(root->fs_info, "couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)", - cur->bytenr, level - 1, root->objectid, + cur->bytenr, level - 1, + root->root_key.objectid, node_key->objectid, node_key->type, node_key->offset); err = -ENOENT; @@ -892,6 +899,8 @@ again: } lower = cur; need_check = true; + + /* Add all nodes and edges in the path */ for (; level < BTRFS_MAX_LEVEL; level++) { if (!path2->nodes[level]) { ASSERT(btrfs_root_bytenr(&root->root_item) == @@ -1281,7 +1290,7 @@ static void __del_reloc_root(struct btrfs_root *root) struct mapping_node *node = NULL; struct reloc_control *rc = fs_info->reloc_ctl; - if (rc) { + if (rc && root->node) { spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, root->node->start); @@ -1735,7 +1744,7 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot, * errors, a negative error number is returned. */ static noinline_for_stack -int replace_path(struct btrfs_trans_handle *trans, +int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct btrfs_root *dest, struct btrfs_root *src, struct btrfs_path *path, struct btrfs_key *next_key, int lowest_level, int max_level) @@ -1879,14 +1888,9 @@ again: * and tree block numbers, if current trans doesn't free * data reloc tree inode. */ - ret = btrfs_qgroup_trace_subtree(trans, parent, - btrfs_header_generation(parent), - btrfs_header_level(parent)); - if (ret < 0) - break; - ret = btrfs_qgroup_trace_subtree(trans, path->nodes[level], - btrfs_header_generation(path->nodes[level]), - btrfs_header_level(path->nodes[level])); + ret = btrfs_qgroup_trace_subtree_swap(trans, rc->block_group, + parent, slot, path->nodes[level], + path->slots[level], last_snapshot); if (ret < 0) break; @@ -2205,7 +2209,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, btrfs_comp_cpu_keys(&next_key, &key) >= 0) { ret = 0; } else { - ret = replace_path(trans, root, reloc_root, path, + ret = replace_path(trans, rc, root, reloc_root, path, &next_key, level, max_level); } if (ret < 0) { @@ -2911,7 +2915,6 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, free_extent_buffer(eb); return -EIO; } - WARN_ON(btrfs_header_level(eb) != block->level); if (block->level == 0) btrfs_item_key_to_cpu(eb, &block->key, 0); else @@ -2987,7 +2990,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, struct backref_node *node; struct btrfs_path *path; struct tree_block *block; - struct rb_node *rb_node; + struct tree_block *next; int ret; int err = 0; @@ -2997,29 +3000,23 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, goto out_free_blocks; } - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); + /* Kick in readahead for tree blocks with missing keys */ + rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { if (!block->key_ready) readahead_tree_block(fs_info, block->bytenr); - rb_node = rb_next(rb_node); } - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); + /* Get first keys */ + rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { if (!block->key_ready) { err = get_tree_block_key(fs_info, block); if (err) goto out_free_path; } - rb_node = rb_next(rb_node); } - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); - + /* Do tree relocation */ + rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { node = build_backref_tree(rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { @@ -3030,11 +3027,10 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, ret = relocate_tree_block(trans, rc, node, &block->key, path); if (ret < 0) { - if (ret != -EAGAIN || rb_node == rb_first(blocks)) + if (ret != -EAGAIN || &block->rb_node == rb_first(blocks)) err = ret; goto out; } - rb_node = rb_next(rb_node); } out: err = finish_pending_nodes(trans, rc, path, err); @@ -4669,7 +4665,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, if (rc->merge_reloc_tree) { ret = btrfs_block_rsv_migrate(&pending->block_rsv, rc->block_rsv, - rc->nodes_relocated, 1); + rc->nodes_relocated, true); if (ret) return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3be1456b5116..902819d3cf41 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1124,7 +1124,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (scrub_write_page_to_dev_replace(sblock_other, page_num) != 0) { - btrfs_dev_replace_stats_inc( + atomic64_inc( &fs_info->dev_replace.num_write_errors); success = 0; } @@ -1564,8 +1564,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, if (btrfsic_submit_bio_wait(bio)) { btrfs_dev_stat_inc_and_print(page_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); - btrfs_dev_replace_stats_inc( - &fs_info->dev_replace.num_write_errors); + atomic64_inc(&fs_info->dev_replace.num_write_errors); bio_put(bio); return -EIO; } @@ -1592,8 +1591,7 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) ret = scrub_write_page_to_dev_replace(sblock, page_num); if (ret) - btrfs_dev_replace_stats_inc( - &fs_info->dev_replace.num_write_errors); + atomic64_inc(&fs_info->dev_replace.num_write_errors); } } @@ -1726,8 +1724,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) struct scrub_page *spage = sbio->pagev[i]; spage->io_error = 1; - btrfs_dev_replace_stats_inc(&dev_replace-> - num_write_errors); + atomic64_inc(&dev_replace->num_write_errors); } } @@ -3022,8 +3019,7 @@ out: static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, - int num, u64 base, u64 length, - int is_dev_replace) + int num, u64 base, u64 length) { struct btrfs_path *path, *ppath; struct btrfs_fs_info *fs_info = sctx->fs_info; @@ -3299,7 +3295,7 @@ again: extent_physical = extent_logical - logical + physical; extent_dev = scrub_dev; extent_mirror_num = mirror_num; - if (is_dev_replace) + if (sctx->is_dev_replace) scrub_remap_extent(fs_info, extent_logical, extent_len, &extent_physical, &extent_dev, @@ -3397,8 +3393,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, u64 chunk_offset, u64 length, u64 dev_offset, - struct btrfs_block_group_cache *cache, - int is_dev_replace) + struct btrfs_block_group_cache *cache) { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; @@ -3435,8 +3430,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { ret = scrub_stripe(sctx, map, scrub_dev, i, - chunk_offset, length, - is_dev_replace); + chunk_offset, length); if (ret) goto out; } @@ -3449,8 +3443,7 @@ out: static noinline_for_stack int scrub_enumerate_chunks(struct scrub_ctx *sctx, - struct btrfs_device *scrub_dev, u64 start, u64 end, - int is_dev_replace) + struct btrfs_device *scrub_dev, u64 start, u64 end) { struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; @@ -3544,7 +3537,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ scrub_pause_on(fs_info); ret = btrfs_inc_block_group_ro(cache); - if (!ret && is_dev_replace) { + if (!ret && sctx->is_dev_replace) { /* * If we are doing a device replace wait for any tasks * that started dellaloc right before we set the block @@ -3609,7 +3602,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, dev_replace->item_needs_writeback = 1; btrfs_dev_replace_write_unlock(&fs_info->dev_replace); ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, - found_key.offset, cache, is_dev_replace); + found_key.offset, cache); /* * flush, submit all pending read and write bios, afterwards @@ -3670,7 +3663,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, btrfs_put_block_group(cache); if (ret) break; - if (is_dev_replace && + if (sctx->is_dev_replace && atomic64_read(&dev_replace->num_write_errors) > 0) { ret = -EIO; break; @@ -3893,8 +3886,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, } if (!ret) - ret = scrub_enumerate_chunks(sctx, dev, start, end, - is_dev_replace); + ret = scrub_enumerate_chunks(sctx, dev, start, end); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_dec(&fs_info->scrubs_running); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index ba8950bfd9c7..094cc1444a90 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1186,9 +1186,9 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt) u64 root = (u64)(uintptr_t)key; struct clone_root *cr = (struct clone_root *)elt; - if (root < cr->root->objectid) + if (root < cr->root->root_key.objectid) return -1; - if (root > cr->root->objectid) + if (root > cr->root->root_key.objectid) return 1; return 0; } @@ -1198,9 +1198,9 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2) struct clone_root *cr1 = (struct clone_root *)e1; struct clone_root *cr2 = (struct clone_root *)e2; - if (cr1->root->objectid < cr2->root->objectid) + if (cr1->root->root_key.objectid < cr2->root->root_key.objectid) return -1; - if (cr1->root->objectid > cr2->root->objectid) + if (cr1->root->root_key.objectid > cr2->root->root_key.objectid) return 1; return 0; } @@ -1693,12 +1693,8 @@ static int lookup_dir_item_inode(struct btrfs_root *root, di = btrfs_lookup_dir_item(NULL, root, path, dir, name, name_len, 0); - if (!di) { - ret = -ENOENT; - goto out; - } - if (IS_ERR(di)) { - ret = PTR_ERR(di); + if (IS_ERR_OR_NULL(di)) { + ret = di ? PTR_ERR(di) : -ENOENT; goto out; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); @@ -2346,7 +2342,7 @@ static int send_subvol_begin(struct send_ctx *sctx) return -ENOMEM; } - key.objectid = send_root->objectid; + key.objectid = send_root->root_key.objectid; key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = 0; @@ -2362,7 +2358,7 @@ static int send_subvol_begin(struct send_ctx *sctx) leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.type != BTRFS_ROOT_BACKREF_KEY || - key.objectid != send_root->objectid) { + key.objectid != send_root->root_key.objectid) { ret = -ENOENT; goto out; } @@ -4907,8 +4903,8 @@ static int send_clone(struct send_ctx *sctx, btrfs_debug(sctx->send_root->fs_info, "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu", - offset, len, clone_root->root->objectid, clone_root->ino, - clone_root->offset); + offset, len, clone_root->root->root_key.objectid, + clone_root->ino, clone_root->offset); p = fs_path_alloc(); if (!p) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6601c9aa5e35..b362b45dd757 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2177,8 +2177,10 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); /* Mask in the root object ID too, to disambiguate subvols */ - buf->f_fsid.val[0] ^= BTRFS_I(d_inode(dentry))->root->objectid >> 32; - buf->f_fsid.val[1] ^= BTRFS_I(d_inode(dentry))->root->objectid; + buf->f_fsid.val[0] ^= + BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32; + buf->f_fsid.val[1] ^= + BTRFS_I(d_inode(dentry))->root->root_key.objectid; return 0; } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index d9269a531a4d..9e0f4a01be14 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -106,7 +106,7 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL); start = 0; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (!found) { test_err("should have found at least one delalloc"); @@ -137,7 +137,7 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL); start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (!found) { test_err("couldn't find delalloc in our range"); @@ -171,7 +171,7 @@ static int test_find_delalloc(u32 sectorsize) } start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (found) { test_err("found range when we shouldn't have"); @@ -192,7 +192,7 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL); start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (!found) { test_err("didn't find our range"); @@ -233,7 +233,7 @@ static int test_find_delalloc(u32 sectorsize) * this changes at any point in the future we will need to fix this * tests expected behavior. */ - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (!found) { test_err("didn't find our range"); diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 385a5316e4bf..bf15d3a7f20e 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -12,8 +12,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) struct extent_map *em; struct rb_node *node; - while (!RB_EMPTY_ROOT(&em_tree->map)) { - node = rb_first(&em_tree->map); + while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) { + node = rb_first_cached(&em_tree->map); em = rb_entry(node, struct extent_map, rb_node); remove_extent_mapping(em_tree, em); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3b84f5015029..5686290a50e1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -44,7 +44,8 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) WARN_ON(refcount_read(&transaction->use_count) == 0); if (refcount_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); - WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); + WARN_ON(!RB_EMPTY_ROOT( + &transaction->delayed_refs.href_root.rb_root)); if (transaction->delayed_refs.pending_csums) btrfs_err(transaction->fs_info, "pending csums is %llu", @@ -118,7 +119,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans) list_del_init(&root->dirty_list); free_extent_buffer(root->commit_root); root->commit_root = btrfs_root_node(root); - if (is_fstree(root->objectid)) + if (is_fstree(root->root_key.objectid)) btrfs_unpin_free_ino(root); clear_btree_io_tree(&root->dirty_log_pages); } @@ -245,7 +246,7 @@ loop: memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); - cur_trans->delayed_refs.href_root = RB_ROOT; + cur_trans->delayed_refs.href_root = RB_ROOT_CACHED; cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; atomic_set(&cur_trans->delayed_refs.num_entries, 0); @@ -759,7 +760,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - if (btrfs_check_space_for_delayed_refs(trans, fs_info)) + if (btrfs_check_space_for_delayed_refs(trans)) return 1; return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); @@ -834,7 +835,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, trans->delayed_ref_updates = 0; if (!trans->sync) { must_run_delayed_refs = - btrfs_should_throttle_delayed_refs(trans, info); + btrfs_should_throttle_delayed_refs(trans); cur = max_t(unsigned long, cur, 32); /* @@ -1197,7 +1198,10 @@ again: list_add_tail(&fs_info->extent_root->dirty_list, &trans->transaction->switch_commits); - btrfs_after_dev_replace_commit(fs_info); + + /* Update dev-replace pointer once everything is committed */ + fs_info->dev_replace.committed_cursor_left = + fs_info->dev_replace.cursor_left_last_write_of_item; return 0; } @@ -1613,10 +1617,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret < 0) goto fail; - ret = btrfs_insert_dir_item(trans, parent_root, - dentry->d_name.name, dentry->d_name.len, - BTRFS_I(parent_inode), &key, - BTRFS_FT_DIR, index); + ret = btrfs_insert_dir_item(trans, dentry->d_name.name, + dentry->d_name.len, BTRFS_I(parent_inode), + &key, BTRFS_FT_DIR, index); /* We have check then name at the beginning, so it is impossible. */ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { @@ -1929,6 +1932,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) return ret; } + btrfs_trans_release_metadata(trans); + trans->block_rsv = NULL; + /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here */ @@ -1938,9 +1944,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) return ret; } - btrfs_trans_release_metadata(trans); - trans->block_rsv = NULL; - cur_trans = trans->transaction; /* @@ -2330,7 +2333,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); - btrfs_debug(fs_info, "cleaner removing %llu", root->objectid); + btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid); btrfs_kill_all_delayed_nodes(root); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index db835635372f..cab0b1f1f741 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -487,6 +487,13 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, u32 nritems = btrfs_header_nritems(leaf); int slot; + if (btrfs_header_level(leaf) != 0) { + generic_err(fs_info, leaf, 0, + "invalid level for leaf, have %d expect 0", + btrfs_header_level(leaf)); + return -EUCLEAN; + } + /* * Extent buffers from a relocation tree have a owner field that * corresponds to the subvolume tree they are based on. So just from an @@ -645,9 +652,16 @@ int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node) unsigned long nr = btrfs_header_nritems(node); struct btrfs_key key, next_key; int slot; + int level = btrfs_header_level(node); u64 bytenr; int ret = 0; + if (level <= 0 || level >= BTRFS_MAX_LEVEL) { + generic_err(fs_info, node, 0, + "invalid level for node, have %d expect [1, %d]", + level, BTRFS_MAX_LEVEL - 1); + return -EUCLEAN; + } if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info)) { btrfs_crit(fs_info, "corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]", diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3c2ae0e4f25a..0dba09334a16 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -205,14 +205,11 @@ static int join_running_log_trans(struct btrfs_root *root) * until you call btrfs_end_log_trans() or it makes any future * log transactions wait until you call btrfs_end_log_trans() */ -int btrfs_pin_log_trans(struct btrfs_root *root) +void btrfs_pin_log_trans(struct btrfs_root *root) { - int ret = -ENOENT; - mutex_lock(&root->log_mutex); atomic_inc(&root->log_writers); mutex_unlock(&root->log_mutex); - return ret; } /* @@ -258,6 +255,13 @@ struct walk_control { /* what stage of the replay code we're currently in */ int stage; + /* + * Ignore any items from the inode currently being processed. Needs + * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in + * the LOG_WALK_REPLAY_INODES stage. + */ + bool ignore_cur_inode; + /* the root we are currently replaying */ struct btrfs_root *replay_dest; @@ -2487,6 +2491,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item); + /* + * If we have a tmpfile (O_TMPFILE) that got fsync'ed + * and never got linked before the fsync, skip it, as + * replaying it is pointless since it would be deleted + * later. We skip logging tmpfiles, but it's always + * possible we are replaying a log created with a kernel + * that used to log tmpfiles. + */ + if (btrfs_inode_nlink(eb, inode_item) == 0) { + wc->ignore_cur_inode = true; + continue; + } else { + wc->ignore_cur_inode = false; + } ret = replay_xattr_deletes(wc->trans, root, log, path, key.objectid); if (ret) @@ -2524,16 +2542,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, root->fs_info->sectorsize); ret = btrfs_drop_extents(wc->trans, root, inode, from, (u64)-1, 1); - /* - * If the nlink count is zero here, the iput - * will free the inode. We bump it to make - * sure it doesn't get freed until the link - * count fixup is done. - */ if (!ret) { - if (inode->i_nlink == 0) - inc_nlink(inode); - /* Update link count and nbytes. */ + /* Update the inode's nbytes. */ ret = btrfs_update_inode(wc->trans, root, inode); } @@ -2548,6 +2558,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, break; } + if (wc->ignore_cur_inode) + continue; + if (key.type == BTRFS_DIR_INDEX_KEY && wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { ret = replay_one_dir_item(wc->trans, root, path, @@ -3196,9 +3209,12 @@ static void free_log_tree(struct btrfs_trans_handle *trans, }; ret = walk_log_tree(trans, log, &wc); - /* I don't think this can happen but just in case */ - if (ret) - btrfs_abort_transaction(trans, ret); + if (ret) { + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(log->fs_info, ret, NULL); + } while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, @@ -5564,9 +5580,33 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, dir_inode = btrfs_iget(fs_info->sb, &inode_key, root, NULL); - /* If parent inode was deleted, skip it. */ - if (IS_ERR(dir_inode)) - continue; + /* + * If the parent inode was deleted, return an error to + * fallback to a transaction commit. This is to prevent + * getting an inode that was moved from one parent A to + * a parent B, got its former parent A deleted and then + * it got fsync'ed, from existing at both parents after + * a log replay (and the old parent still existing). + * Example: + * + * mkdir /mnt/A + * mkdir /mnt/B + * touch /mnt/B/bar + * sync + * mv /mnt/B/bar /mnt/A/bar + * mv -T /mnt/A /mnt/B + * fsync /mnt/B/bar + * <power fail> + * + * If we ignore the old parent B which got deleted, + * after a log replay we would have file bar linked + * at both parents and the old parent B would still + * exist. + */ + if (IS_ERR(dir_inode)) { + ret = PTR_ERR(dir_inode); + goto out; + } if (ctx) ctx->log_new_dentries = false; @@ -5640,7 +5680,13 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - if (btrfs_inode_in_log(inode, trans->transid)) { + /* + * Skip already logged inodes or inodes corresponding to tmpfiles + * (since logging them is pointless, a link count of 0 means they + * will never be accessible). + */ + if (btrfs_inode_in_log(inode, trans->transid) || + inode->vfs_inode.i_nlink == 0) { ret = BTRFS_NO_LOG_SYNC; goto end_no_trans; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 7ab9bb88a639..767765031e59 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -65,7 +65,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *inode, u64 dirid); void btrfs_end_log_trans(struct btrfs_root *root); -int btrfs_pin_log_trans(struct btrfs_root *root); +void btrfs_pin_log_trans(struct btrfs_root *root); void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, int for_rename); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4405e430da6..f435d397019e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1613,7 +1613,7 @@ static u64 find_next_chunk(struct btrfs_fs_info *fs_info) em_tree = &fs_info->mapping_tree.map_tree; read_lock(&em_tree->lock); - n = rb_last(&em_tree->map); + n = rb_last(&em_tree->map.rb_root); if (n) { em = rb_entry(n, struct extent_map, rb_node); ret = em->start + em->len; @@ -1854,6 +1854,24 @@ void btrfs_assign_next_active_device(struct btrfs_device *device, fs_info->fs_devices->latest_bdev = next_device->bdev; } +/* + * Return btrfs_fs_devices::num_devices excluding the device that's being + * currently replaced. + */ +static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) +{ + u64 num_devices = fs_info->fs_devices->num_devices; + + btrfs_dev_replace_read_lock(&fs_info->dev_replace); + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { + ASSERT(num_devices > 1); + num_devices--; + } + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + + return num_devices; +} + int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, u64 devid) { @@ -1865,22 +1883,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, mutex_lock(&uuid_mutex); - num_devices = fs_devices->num_devices; - btrfs_dev_replace_read_lock(&fs_info->dev_replace); - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { - WARN_ON(num_devices < 1); - num_devices--; - } - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + num_devices = btrfs_num_devices(fs_info); ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) goto out; - ret = btrfs_find_device_by_devspec(fs_info, devid, device_path, - &device); - if (ret) + device = btrfs_find_device_by_devspec(fs_info, devid, device_path); + + if (IS_ERR(device)) { + if (PTR_ERR(device) == -ENOENT && + strcmp(device_path, "missing") == 0) + ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; + else + ret = PTR_ERR(device); goto out; + } if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = BTRFS_ERROR_DEV_TGT_REPLACE; @@ -2096,9 +2114,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) call_rcu(&tgtdev->rcu, free_device_rcu); } -static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, - const char *device_path, - struct btrfs_device **device) +static struct btrfs_device *btrfs_find_device_by_path( + struct btrfs_fs_info *fs_info, const char *device_path) { int ret = 0; struct btrfs_super_block *disk_super; @@ -2106,28 +2123,27 @@ static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, u8 *dev_uuid; struct block_device *bdev; struct buffer_head *bh; + struct btrfs_device *device; - *device = NULL; ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, fs_info->bdev_holder, 0, &bdev, &bh); if (ret) - return ret; + return ERR_PTR(ret); disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; - *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid); + device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid); brelse(bh); - if (!*device) - ret = -ENOENT; + if (!device) + device = ERR_PTR(-ENOENT); blkdev_put(bdev, FMODE_READ); - return ret; + return device; } -int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, - const char *device_path, - struct btrfs_device **device) +static struct btrfs_device *btrfs_find_device_missing_or_by_path( + struct btrfs_fs_info *fs_info, const char *device_path) { - *device = NULL; + struct btrfs_device *device = NULL; if (strcmp(device_path, "missing") == 0) { struct list_head *devices; struct btrfs_device *tmp; @@ -2136,42 +2152,38 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, list_for_each_entry(tmp, devices, dev_list) { if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &tmp->dev_state) && !tmp->bdev) { - *device = tmp; + device = tmp; break; } } - if (!*device) - return BTRFS_ERROR_DEV_MISSING_NOT_FOUND; - - return 0; + if (!device) + return ERR_PTR(-ENOENT); } else { - return btrfs_find_device_by_path(fs_info, device_path, device); + device = btrfs_find_device_by_path(fs_info, device_path); } + + return device; } /* * Lookup a device given by device id, or the path if the id is 0. */ -int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, - const char *devpath, - struct btrfs_device **device) +struct btrfs_device *btrfs_find_device_by_devspec( + struct btrfs_fs_info *fs_info, u64 devid, const char *devpath) { - int ret; + struct btrfs_device *device; if (devid) { - ret = 0; - *device = btrfs_find_device(fs_info, devid, NULL, NULL); - if (!*device) - ret = -ENOENT; + device = btrfs_find_device(fs_info, devid, NULL, NULL); + if (!device) + return ERR_PTR(-ENOENT); } else { if (!devpath || !devpath[0]) - return -EINVAL; - - ret = btrfs_find_device_missing_or_by_path(fs_info, devpath, - device); + return ERR_PTR(-EINVAL); + device = btrfs_find_device_missing_or_by_path(fs_info, devpath); } - return ret; + return device; } /* @@ -3679,7 +3691,7 @@ static int alloc_profile_is_valid(u64 flags, int extended) return !extended; /* "0" is valid for usual profiles */ /* true if exactly one bit set */ - return (flags & (flags - 1)) == 0; + return is_power_of_2(flags); } static inline int balance_need_close(struct btrfs_fs_info *fs_info) @@ -3740,13 +3752,8 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, } } - num_devices = fs_info->fs_devices->num_devices; - btrfs_dev_replace_read_lock(&fs_info->dev_replace); - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { - BUG_ON(num_devices < 1); - num_devices--; - } - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + num_devices = btrfs_num_devices(fs_info); + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; if (num_devices > 1) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); @@ -5897,7 +5904,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } out: if (dev_replace_is_ongoing) { - btrfs_dev_replace_clear_lock_blocking(dev_replace); + ASSERT(atomic_read(&dev_replace->blocking_readers) > 0); + btrfs_dev_replace_read_lock(dev_replace); + /* Barrier implied by atomic_dec_and_test */ + if (atomic_dec_and_test(&dev_replace->blocking_readers)) + cond_wake_up_nomb(&dev_replace->read_lock_wq); btrfs_dev_replace_read_unlock(dev_replace); } free_extent_map(em); @@ -7438,7 +7449,7 @@ static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) int ret = 0; read_lock(&em_tree->lock); - for (node = rb_first(&em_tree->map); node; node = rb_next(node)) { + for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { em = rb_entry(node, struct extent_map, rb_node); if (em->map_lookup->num_stripes != em->map_lookup->verified_stripes) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 23e9285d88de..aefce895e994 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -410,12 +410,9 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *this_dev); -int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, - const char *device_path, - struct btrfs_device **device); -int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, - const char *devpath, - struct btrfs_device **device); +struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, + u64 devid, + const char *devpath); struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid); diff --git a/fs/buffer.c b/fs/buffer.c index 6f1ae3ac9789..109f55196866 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3060,11 +3060,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, */ bio = bio_alloc(GFP_NOIO, 1); - if (wbc) { - wbc_init_bio(wbc, bio); - wbc_account_io(wbc, bh->b_page, bh->b_size); - } - bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_write_hint = write_hint; @@ -3084,6 +3079,11 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, op_flags |= REQ_PRIO; bio_set_op_attrs(bio, op, op_flags); + if (wbc) { + wbc_init_bio(wbc, bio); + wbc_account_io(wbc, bh->b_page, bh->b_size); + } + submit_bio(bio); return 0; } diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index af2b17b21b94..95983c744164 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -343,7 +343,7 @@ try_again: trap = lock_rename(cache->graveyard, dir); /* do some checks before getting the grave dentry */ - if (rep->d_parent != dir) { + if (rep->d_parent != dir || IS_DEADDIR(d_inode(rep))) { /* the entry was probably culled when we dropped the parent dir * lock */ unlock_rename(cache->graveyard, dir); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 0c9ab62c3df4..9dcaed031843 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1553,6 +1553,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param, /* Flags */ #define MID_WAIT_CANCELLED 1 /* Cancelled while waiting for response */ +#define MID_DELETED 2 /* Mid has been dequeued/deleted */ /* Types of response buffer returned from SendReceive2 */ #define CIFS_NO_BUFFER 0 /* Response buffer not returned */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7aa08dba4719..52d71b64c0c6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -659,7 +659,15 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed) mid->mid_state = MID_RESPONSE_RECEIVED; else mid->mid_state = MID_RESPONSE_MALFORMED; - list_del_init(&mid->qhead); + /* + * Trying to handle/dequeue a mid after the send_recv() + * function has finished processing it is a bug. + */ + if (mid->mid_flags & MID_DELETED) + printk_once(KERN_WARNING + "trying to dequeue a deleted mid\n"); + else + list_del_init(&mid->qhead); spin_unlock(&GlobalMid_Lock); } @@ -938,8 +946,7 @@ next_pdu: } else { mids[0] = server->ops->find_mid(server, buf); bufs[0] = buf; - if (mids[0]) - num_mids = 1; + num_mids = 1; if (!mids[0] || !mids[0]->receive) length = standard_receive3(server, mids[0]); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index d954ce36b473..89985a0a6819 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1477,7 +1477,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, } srch_inf->entries_in_buffer = 0; - srch_inf->index_of_last_entry = 0; + srch_inf->index_of_last_entry = 2; rc = SMB2_query_directory(xid, tcon, fid->persistent_fid, fid->volatile_fid, 0, srch_inf); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 78f96fa3d7d9..b48f43963da6 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -142,7 +142,8 @@ void cifs_delete_mid(struct mid_q_entry *mid) { spin_lock(&GlobalMid_Lock); - list_del(&mid->qhead); + list_del_init(&mid->qhead); + mid->mid_flags |= MID_DELETED; spin_unlock(&GlobalMid_Lock); DeleteMidQEntry(mid); @@ -772,6 +773,11 @@ cifs_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) return mid; } +static void +cifs_noop_callback(struct mid_q_entry *mid) +{ +} + int compound_send_recv(const unsigned int xid, struct cifs_ses *ses, const int flags, const int num_rqst, struct smb_rqst *rqst, @@ -826,8 +832,13 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, } midQ[i]->mid_state = MID_REQUEST_SUBMITTED; + /* + * We don't invoke the callback compounds unless it is the last + * request. + */ + if (i < num_rqst - 1) + midQ[i]->callback = cifs_noop_callback; } - cifs_in_send_inc(ses->server); rc = smb_send_rqst(ses->server, num_rqst, rqst, flags); cifs_in_send_dec(ses->server); @@ -908,6 +919,12 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, midQ[i]->resp_buf = NULL; } out: + /* + * This will dequeue all mids. After this it is important that the + * demultiplex_thread will not process any of these mids any futher. + * This is prevented above by using a noop callback that will not + * wake this thread except for the very last PDU. + */ for (i = 0; i < num_rqst; i++) cifs_delete_mid(midQ[i]); add_credits(ses->server, credits, optype); diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index a9b00942e87d..0c445a03e682 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -560,69 +560,6 @@ static int mt_ioctl_trans(struct file *file, #define HIDPGETCONNLIST _IOR('H', 210, int) #define HIDPGETCONNINFO _IOR('H', 211, int) - -struct serial_struct32 { - compat_int_t type; - compat_int_t line; - compat_uint_t port; - compat_int_t irq; - compat_int_t flags; - compat_int_t xmit_fifo_size; - compat_int_t custom_divisor; - compat_int_t baud_base; - unsigned short close_delay; - char io_type; - char reserved_char[1]; - compat_int_t hub6; - unsigned short closing_wait; /* time to wait before closing */ - unsigned short closing_wait2; /* no longer used... */ - compat_uint_t iomem_base; - unsigned short iomem_reg_shift; - unsigned int port_high; - /* compat_ulong_t iomap_base FIXME */ - compat_int_t reserved[1]; -}; - -static int serial_struct_ioctl(struct file *file, - unsigned cmd, struct serial_struct32 __user *ss32) -{ - typedef struct serial_struct32 SS32; - int err; - struct serial_struct __user *ss = compat_alloc_user_space(sizeof(*ss)); - __u32 udata; - unsigned int base; - unsigned char *iomem_base; - - if (ss == NULL) - return -EFAULT; - if (cmd == TIOCSSERIAL) { - if (copy_in_user(ss, ss32, offsetof(SS32, iomem_base)) || - get_user(udata, &ss32->iomem_base)) - return -EFAULT; - iomem_base = compat_ptr(udata); - if (put_user(iomem_base, &ss->iomem_base) || - convert_in_user(&ss32->iomem_reg_shift, - &ss->iomem_reg_shift) || - convert_in_user(&ss32->port_high, &ss->port_high) || - put_user(0UL, &ss->iomap_base)) - return -EFAULT; - } - err = do_ioctl(file, cmd, (unsigned long)ss); - if (cmd == TIOCGSERIAL && err >= 0) { - if (copy_in_user(ss32, ss, offsetof(SS32, iomem_base)) || - get_user(iomem_base, &ss->iomem_base)) - return -EFAULT; - base = (unsigned long)iomem_base >> 32 ? - 0xffffffff : (unsigned)(unsigned long)iomem_base; - if (put_user(base, &ss32->iomem_base) || - convert_in_user(&ss->iomem_reg_shift, - &ss32->iomem_reg_shift) || - convert_in_user(&ss->port_high, &ss32->port_high)) - return -EFAULT; - } - return err; -} - #define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) #define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t) #define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) @@ -707,60 +644,8 @@ static int compat_ioctl_preallocate(struct file *file, static unsigned int ioctl_pointer[] = { /* compatible ioctls first */ -COMPATIBLE_IOCTL(0x4B50) /* KDGHWCLK - not in the kernel, but don't complain */ -COMPATIBLE_IOCTL(0x4B51) /* KDSHWCLK - not in the kernel, but don't complain */ - -/* Big T */ -COMPATIBLE_IOCTL(TCGETA) -COMPATIBLE_IOCTL(TCSETA) -COMPATIBLE_IOCTL(TCSETAW) -COMPATIBLE_IOCTL(TCSETAF) -COMPATIBLE_IOCTL(TCSBRK) -COMPATIBLE_IOCTL(TCXONC) -COMPATIBLE_IOCTL(TCFLSH) -COMPATIBLE_IOCTL(TCGETS) -COMPATIBLE_IOCTL(TCSETS) -COMPATIBLE_IOCTL(TCSETSW) -COMPATIBLE_IOCTL(TCSETSF) -COMPATIBLE_IOCTL(TIOCLINUX) -COMPATIBLE_IOCTL(TIOCSBRK) -COMPATIBLE_IOCTL(TIOCGDEV) -COMPATIBLE_IOCTL(TIOCCBRK) -COMPATIBLE_IOCTL(TIOCGSID) -COMPATIBLE_IOCTL(TIOCGICOUNT) -COMPATIBLE_IOCTL(TIOCGEXCL) /* Little t */ -COMPATIBLE_IOCTL(TIOCGETD) -COMPATIBLE_IOCTL(TIOCSETD) -COMPATIBLE_IOCTL(TIOCEXCL) -COMPATIBLE_IOCTL(TIOCNXCL) -COMPATIBLE_IOCTL(TIOCCONS) -COMPATIBLE_IOCTL(TIOCGSOFTCAR) -COMPATIBLE_IOCTL(TIOCSSOFTCAR) -COMPATIBLE_IOCTL(TIOCSWINSZ) -COMPATIBLE_IOCTL(TIOCGWINSZ) -COMPATIBLE_IOCTL(TIOCMGET) -COMPATIBLE_IOCTL(TIOCMBIC) -COMPATIBLE_IOCTL(TIOCMBIS) -COMPATIBLE_IOCTL(TIOCMSET) -COMPATIBLE_IOCTL(TIOCNOTTY) -COMPATIBLE_IOCTL(TIOCSTI) COMPATIBLE_IOCTL(TIOCOUTQ) -COMPATIBLE_IOCTL(TIOCSPGRP) -COMPATIBLE_IOCTL(TIOCGPGRP) -COMPATIBLE_IOCTL(TIOCSERGETLSR) -#ifdef TIOCSRS485 -COMPATIBLE_IOCTL(TIOCSRS485) -#endif -#ifdef TIOCGRS485 -COMPATIBLE_IOCTL(TIOCGRS485) -#endif -#ifdef TCGETS2 -COMPATIBLE_IOCTL(TCGETS2) -COMPATIBLE_IOCTL(TCSETS2) -COMPATIBLE_IOCTL(TCSETSW2) -COMPATIBLE_IOCTL(TCSETSF2) -#endif /* Little f */ COMPATIBLE_IOCTL(FIOCLEX) COMPATIBLE_IOCTL(FIONCLEX) @@ -775,23 +660,6 @@ COMPATIBLE_IOCTL(FIGETBSZ) COMPATIBLE_IOCTL(FIFREEZE) COMPATIBLE_IOCTL(FITHAW) COMPATIBLE_IOCTL(FITRIM) -COMPATIBLE_IOCTL(KDGETKEYCODE) -COMPATIBLE_IOCTL(KDSETKEYCODE) -COMPATIBLE_IOCTL(KDGKBTYPE) -COMPATIBLE_IOCTL(KDGETMODE) -COMPATIBLE_IOCTL(KDGKBMODE) -COMPATIBLE_IOCTL(KDGKBMETA) -COMPATIBLE_IOCTL(KDGKBENT) -COMPATIBLE_IOCTL(KDSKBENT) -COMPATIBLE_IOCTL(KDGKBSENT) -COMPATIBLE_IOCTL(KDSKBSENT) -COMPATIBLE_IOCTL(KDGKBDIACR) -COMPATIBLE_IOCTL(KDSKBDIACR) -COMPATIBLE_IOCTL(KDGKBDIACRUC) -COMPATIBLE_IOCTL(KDSKBDIACRUC) -COMPATIBLE_IOCTL(KDKBDREP) -COMPATIBLE_IOCTL(KDGKBLED) -COMPATIBLE_IOCTL(KDGETLED) #ifdef CONFIG_BLOCK /* Big S */ COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) @@ -1133,11 +1001,6 @@ COMPATIBLE_IOCTL(CAPI_SET_FLAGS) COMPATIBLE_IOCTL(CAPI_CLR_FLAGS) COMPATIBLE_IOCTL(CAPI_NCCI_OPENCOUNT) COMPATIBLE_IOCTL(CAPI_NCCI_GETUNIT) -/* Siemens Gigaset */ -COMPATIBLE_IOCTL(GIGASET_REDIR) -COMPATIBLE_IOCTL(GIGASET_CONFIG) -COMPATIBLE_IOCTL(GIGASET_BRKCHARS) -COMPATIBLE_IOCTL(GIGASET_VERSION) /* Misc. */ COMPATIBLE_IOCTL(0x41545900) /* ATYIO_CLKR */ COMPATIBLE_IOCTL(0x41545901) /* ATYIO_CLKW */ @@ -1223,21 +1086,6 @@ COMPATIBLE_IOCTL(JSIOCGAXES) COMPATIBLE_IOCTL(JSIOCGBUTTONS) COMPATIBLE_IOCTL(JSIOCGNAME(0)) -#ifdef TIOCGLTC -COMPATIBLE_IOCTL(TIOCGLTC) -COMPATIBLE_IOCTL(TIOCSLTC) -#endif -#ifdef TIOCSTART -/* - * For these two we have definitions in ioctls.h and/or termios.h on - * some architectures but no actual implemention. Some applications - * like bash call them if they are defined in the headers, so we provide - * entries here to avoid syslog message spew. - */ -COMPATIBLE_IOCTL(TIOCSTART) -COMPATIBLE_IOCTL(TIOCSTOP) -#endif - /* fat 'r' ioctls. These are handled by fat with ->compat_ioctl, but we don't want warnings on other file systems. So declare them as compatible here. */ @@ -1293,10 +1141,6 @@ static long do_ioctl_trans(unsigned int cmd, case MTIOCPOS32: return mt_ioctl_trans(file, cmd, argp); #endif - /* Serial */ - case TIOCGSERIAL: - case TIOCSSERIAL: - return serial_struct_ioctl(file, cmd, argp); /* Not implemented in the native kernel */ case RTC_IRQP_READ32: case RTC_IRQP_SET32: @@ -1316,24 +1160,11 @@ static long do_ioctl_trans(unsigned int cmd, * so we must not do a compat_ptr() translation. */ switch (cmd) { - /* Big T */ - case TCSBRKP: - case TIOCMIWAIT: - case TIOCSCTTY: /* RAID */ case HOT_REMOVE_DISK: case HOT_ADD_DISK: case SET_DISK_FAULTY: case SET_BITMAP_FILE: - /* Big K */ - case KDSIGACCEPT: - case KIOCSOUND: - case KDMKTONE: - case KDSETMODE: - case KDSKBMODE: - case KDSKBMETA: - case KDSKBLED: - case KDSETLED: return vfs_ioctl(file, cmd, arg); } diff --git a/fs/coredump.c b/fs/coredump.c index 1e2c87acac9b..e42e17e55bfd 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -536,7 +536,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) return err; } -void do_coredump(const siginfo_t *siginfo) +void do_coredump(const kernel_siginfo_t *siginfo) { struct core_state core_state; struct core_name cn; @@ -447,6 +447,7 @@ bool dax_lock_mapping_entry(struct page *page) xa_unlock_irq(&mapping->i_pages); break; } else if (IS_ERR(entry)) { + xa_unlock_irq(&mapping->i_pages); WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN); continue; } @@ -665,6 +666,8 @@ struct page *dax_layout_busy_page(struct address_space *mapping) while (index < end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { + pgoff_t nr_pages = 1; + for (i = 0; i < pagevec_count(&pvec); i++) { struct page *pvec_ent = pvec.pages[i]; void *entry; @@ -679,8 +682,15 @@ struct page *dax_layout_busy_page(struct address_space *mapping) xa_lock_irq(&mapping->i_pages); entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (entry) + if (entry) { page = dax_busy_page(entry); + /* + * Account for multi-order entries at + * the end of the pagevec. + */ + if (i + 1 >= pagevec_count(&pvec)) + nr_pages = 1UL << dax_radix_order(entry); + } put_unlocked_mapping_entry(mapping, index, entry); xa_unlock_irq(&mapping->i_pages); if (page) @@ -695,7 +705,7 @@ struct page *dax_layout_busy_page(struct address_space *mapping) */ pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - index++; + index += nr_pages; if (page) break; @@ -1120,21 +1130,12 @@ static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, { struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; - vm_fault_t ret = VM_FAULT_NOPAGE; - struct page *zero_page; - pfn_t pfn; - - zero_page = ZERO_PAGE(0); - if (unlikely(!zero_page)) { - ret = VM_FAULT_OOM; - goto out; - } + pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); + vm_fault_t ret; - pfn = page_to_pfn_t(zero_page); dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE, false); ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); -out: trace_dax_load_hole(inode, vmf, ret); return ret; } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 49121e5a8de2..5c36ceecb5c1 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -593,11 +593,16 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, lower_new_dir_dentry = dget_parent(lower_new_dentry); target_inode = d_inode(new_dentry); trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + rc = -EINVAL; + if (lower_old_dentry->d_parent != lower_old_dir_dentry) + goto out_lock; + if (lower_new_dentry->d_parent != lower_new_dir_dentry) + goto out_lock; + if (d_unhashed(lower_old_dentry) || d_unhashed(lower_new_dentry)) + goto out_lock; /* source should not be ancestor of target */ - if (trap == lower_old_dentry) { - rc = -EINVAL; + if (trap == lower_old_dentry) goto out_lock; - } /* target should not be ancestor of source */ if (trap == lower_new_dentry) { rc = -ENOTEMPTY; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 7f7ee18fe179..e4bb9386c045 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1448,6 +1448,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) } inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); ei->i_flags = le32_to_cpu(raw_inode->i_flags); + ext2_set_inode_flags(inode); ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); ei->i_frag_no = raw_inode->i_frag; ei->i_frag_size = raw_inode->i_fsize; @@ -1517,7 +1518,6 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } brelse (bh); - ext2_set_inode_flags(inode); unlock_new_inode(inode); return inode; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index fb50f9aa6ead..c1d570ee1d9f 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -284,12 +284,16 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, default_acl, XATTR_CREATE); posix_acl_release(default_acl); + } else { + inode->i_default_acl = NULL; } if (acl) { if (!error) error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl, XATTR_CREATE); posix_acl_release(acl); + } else { + inode->i_acl = NULL; } return error; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index e2902d394f1b..f93f9881ec18 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -76,7 +76,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; else if (unlikely(((char *) de - buf) + rlen > size)) - error_msg = "directory entry across range"; + error_msg = "directory entry overrun"; else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; @@ -85,18 +85,16 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, if (filp) ext4_error_file(filp, function, line, bh->b_blocknr, - "bad entry in directory: %s - offset=%u(%u), " - "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset % size), - offset, le32_to_cpu(de->inode), - rlen, de->name_len); + "bad entry in directory: %s - offset=%u, " + "inode=%u, rec_len=%d, name_len=%d, size=%d", + error_msg, offset, le32_to_cpu(de->inode), + rlen, de->name_len, size); else ext4_error_inode(dir, function, line, bh->b_blocknr, - "bad entry in directory: %s - offset=%u(%u), " - "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset % size), - offset, le32_to_cpu(de->inode), - rlen, de->name_len); + "bad entry in directory: %s - offset=%u, " + "inode=%u, rec_len=%d, name_len=%d, size=%d", + error_msg, offset, le32_to_cpu(de->inode), + rlen, de->name_len, size); return 1; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0f0edd1cd0cd..12f90d48ba61 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -43,6 +43,17 @@ #define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION) #include <linux/fscrypt.h> +#include <linux/compiler.h> + +/* Until this gets included into linux/compiler-gcc.h */ +#ifndef __nonstring +#if defined(GCC_VERSION) && (GCC_VERSION >= 80000) +#define __nonstring __attribute__((nonstring)) +#else +#define __nonstring +#endif +#endif + /* * The fourth extended filesystem constants/structures */ @@ -617,6 +628,7 @@ enum { #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 +#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 /* * ioctl commands @@ -675,6 +687,9 @@ enum { /* Max physical block we can address w/o extents */ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF +/* Max logical block we can support */ +#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFF + /* * Structure of an inode on the disk */ @@ -1016,6 +1031,9 @@ struct ext4_inode_info { ext4_lblk_t i_da_metadata_calc_last_lblock; int i_da_metadata_calc_len; + /* pending cluster reservations for bigalloc file systems */ + struct ext4_pending_tree i_pending_tree; + /* on-disk additional length */ __u16 i_extra_isize; @@ -1226,7 +1244,7 @@ struct ext4_super_block { __le32 s_feature_ro_compat; /* readonly-compatible feature set */ /*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ /*78*/ char s_volume_name[16]; /* volume name */ -/*88*/ char s_last_mounted[64]; /* directory where last mounted */ +/*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */ /*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ /* * Performance hints. Directory preallocation should only @@ -1277,13 +1295,13 @@ struct ext4_super_block { __le32 s_first_error_time; /* first time an error happened */ __le32 s_first_error_ino; /* inode involved in first error */ __le64 s_first_error_block; /* block involved of first error */ - __u8 s_first_error_func[32]; /* function where the error happened */ + __u8 s_first_error_func[32] __nonstring; /* function where the error happened */ __le32 s_first_error_line; /* line number where error happened */ __le32 s_last_error_time; /* most recent time of an error */ __le32 s_last_error_ino; /* inode involved in last error */ __le32 s_last_error_line; /* line number where error happened */ __le64 s_last_error_block; /* block involved of last error */ - __u8 s_last_error_func[32]; /* function where the error happened */ + __u8 s_last_error_func[32] __nonstring; /* function where the error happened */ #define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) __u8 s_mount_opts[64]; __le32 s_usr_quota_inum; /* inode for tracking user quota */ @@ -1387,7 +1405,8 @@ struct ext4_sb_info { u32 s_min_batch_time; struct block_device *journal_bdev; #ifdef CONFIG_QUOTA - char *s_qf_names[EXT4_MAXQUOTAS]; /* Names of quota files with journalled quota */ + /* Names of quota files with journalled quota */ + char __rcu *s_qf_names[EXT4_MAXQUOTAS]; int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ @@ -2469,10 +2488,11 @@ extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); -extern int ext4_page_mkwrite(struct vm_fault *vmf); -extern int ext4_filemap_fault(struct vm_fault *vmf); +extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); +extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); +extern void ext4_da_release_space(struct inode *inode, int to_free); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, @@ -3128,10 +3148,6 @@ extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, int flags); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); -extern int ext4_find_delalloc_range(struct inode *inode, - ext4_lblk_t lblk_start, - ext4_lblk_t lblk_end); -extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); @@ -3142,6 +3158,7 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, int mark_unwritten,int *err); +extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index adf6668b596f..98bd0e9ee7df 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -120,6 +120,19 @@ struct ext4_ext_path { }; /* + * Used to record a portion of a cluster found at the beginning or end + * of an extent while traversing the extent tree during space removal. + * A partial cluster may be removed if it does not contain blocks shared + * with extents that aren't being deleted (tofree state). Otherwise, + * it cannot be removed (nofree state). + */ +struct partial_cluster { + ext4_fsblk_t pclu; /* physical cluster number */ + ext4_lblk_t lblk; /* logical block number within logical cluster */ + enum {initial, tofree, nofree} state; +}; + +/* * structure for external API */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 72a361d5ef74..240b6dea5441 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2351,8 +2351,8 @@ ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start, { struct extent_status es; - ext4_es_find_delayed_extent_range(inode, hole_start, - hole_start + hole_len - 1, &es); + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, + hole_start + hole_len - 1, &es); if (es.es_len) { /* There's delayed extent containing lblock? */ if (es.es_lblk <= hole_start) @@ -2490,106 +2490,157 @@ static inline int get_default_free_blocks_flags(struct inode *inode) return 0; } +/* + * ext4_rereserve_cluster - increment the reserved cluster count when + * freeing a cluster with a pending reservation + * + * @inode - file containing the cluster + * @lblk - logical block in cluster to be reserved + * + * Increments the reserved cluster count and adjusts quota in a bigalloc + * file system when freeing a partial cluster containing at least one + * delayed and unwritten block. A partial cluster meeting that + * requirement will have a pending reservation. If so, the + * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to + * defer reserved and allocated space accounting to a subsequent call + * to this function. + */ +static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + + dquot_reclaim_block(inode, EXT4_C2B(sbi, 1)); + + spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks++; + percpu_counter_add(&sbi->s_dirtyclusters_counter, 1); + spin_unlock(&ei->i_block_reservation_lock); + + percpu_counter_add(&sbi->s_freeclusters_counter, 1); + ext4_remove_pending(inode, lblk); +} + static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_extent *ex, - long long *partial_cluster, + struct partial_cluster *partial, ext4_lblk_t from, ext4_lblk_t to) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); - ext4_fsblk_t pblk; - int flags = get_default_free_blocks_flags(inode); + ext4_fsblk_t last_pblk, pblk; + ext4_lblk_t num; + int flags; + + /* only extent tail removal is allowed */ + if (from < le32_to_cpu(ex->ee_block) || + to != le32_to_cpu(ex->ee_block) + ee_len - 1) { + ext4_error(sbi->s_sb, + "strange request: removal(2) %u-%u from %u:%u", + from, to, le32_to_cpu(ex->ee_block), ee_len); + return 0; + } + +#ifdef EXTENTS_STATS + spin_lock(&sbi->s_ext_stats_lock); + sbi->s_ext_blocks += ee_len; + sbi->s_ext_extents++; + if (ee_len < sbi->s_ext_min) + sbi->s_ext_min = ee_len; + if (ee_len > sbi->s_ext_max) + sbi->s_ext_max = ee_len; + if (ext_depth(inode) > sbi->s_depth_max) + sbi->s_depth_max = ext_depth(inode); + spin_unlock(&sbi->s_ext_stats_lock); +#endif + + trace_ext4_remove_blocks(inode, ex, from, to, partial); /* - * For bigalloc file systems, we never free a partial cluster - * at the beginning of the extent. Instead, we make a note - * that we tried freeing the cluster, and check to see if we - * need to free it on a subsequent call to ext4_remove_blocks, - * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. + * if we have a partial cluster, and it's different from the + * cluster of the last block in the extent, we free it */ - flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; + last_pblk = ext4_ext_pblock(ex) + ee_len - 1; + + if (partial->state != initial && + partial->pclu != EXT4_B2C(sbi, last_pblk)) { + if (partial->state == tofree) { + flags = get_default_free_blocks_flags(inode); + if (ext4_is_pending(inode, partial->lblk)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, partial->pclu), + sbi->s_cluster_ratio, flags); + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, partial->lblk); + } + partial->state = initial; + } + + num = le32_to_cpu(ex->ee_block) + ee_len - from; + pblk = ext4_ext_pblock(ex) + ee_len - num; - trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); /* - * If we have a partial cluster, and it's different from the - * cluster of the last block, we need to explicitly free the - * partial cluster here. + * We free the partial cluster at the end of the extent (if any), + * unless the cluster is used by another extent (partial_cluster + * state is nofree). If a partial cluster exists here, it must be + * shared with the last block in the extent. */ - pblk = ext4_ext_pblock(ex) + ee_len - 1; - if (*partial_cluster > 0 && - *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { + flags = get_default_free_blocks_flags(inode); + + /* partial, left end cluster aligned, right end unaligned */ + if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) && + (EXT4_LBLK_CMASK(sbi, to) >= from) && + (partial->state != nofree)) { + if (ext4_is_pending(inode, to)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, *partial_cluster), + EXT4_PBLK_CMASK(sbi, last_pblk), sbi->s_cluster_ratio, flags); - *partial_cluster = 0; + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, to); + partial->state = initial; + flags = get_default_free_blocks_flags(inode); } -#ifdef EXTENTS_STATS - { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - spin_lock(&sbi->s_ext_stats_lock); - sbi->s_ext_blocks += ee_len; - sbi->s_ext_extents++; - if (ee_len < sbi->s_ext_min) - sbi->s_ext_min = ee_len; - if (ee_len > sbi->s_ext_max) - sbi->s_ext_max = ee_len; - if (ext_depth(inode) > sbi->s_depth_max) - sbi->s_depth_max = ext_depth(inode); - spin_unlock(&sbi->s_ext_stats_lock); - } -#endif - if (from >= le32_to_cpu(ex->ee_block) - && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { - /* tail removal */ - ext4_lblk_t num; - long long first_cluster; - - num = le32_to_cpu(ex->ee_block) + ee_len - from; - pblk = ext4_ext_pblock(ex) + ee_len - num; - /* - * Usually we want to free partial cluster at the end of the - * extent, except for the situation when the cluster is still - * used by any other extent (partial_cluster is negative). - */ - if (*partial_cluster < 0 && - *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1)) - flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; + flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; - ext_debug("free last %u blocks starting %llu partial %lld\n", - num, pblk, *partial_cluster); - ext4_free_blocks(handle, inode, NULL, pblk, num, flags); - /* - * If the block range to be freed didn't start at the - * beginning of a cluster, and we removed the entire - * extent and the cluster is not used by any other extent, - * save the partial cluster here, since we might need to - * delete if we determine that the truncate or punch hole - * operation has removed all of the blocks in the cluster. - * If that cluster is used by another extent, preserve its - * negative value so it isn't freed later on. - * - * If the whole extent wasn't freed, we've reached the - * start of the truncated/punched region and have finished - * removing blocks. If there's a partial cluster here it's - * shared with the remainder of the extent and is no longer - * a candidate for removal. - */ - if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) { - first_cluster = (long long) EXT4_B2C(sbi, pblk); - if (first_cluster != -*partial_cluster) - *partial_cluster = first_cluster; - } else { - *partial_cluster = 0; + /* + * For bigalloc file systems, we never free a partial cluster + * at the beginning of the extent. Instead, we check to see if we + * need to free it on a subsequent call to ext4_remove_blocks, + * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. + */ + flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; + ext4_free_blocks(handle, inode, NULL, pblk, num, flags); + + /* reset the partial cluster if we've freed past it */ + if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk)) + partial->state = initial; + + /* + * If we've freed the entire extent but the beginning is not left + * cluster aligned and is not marked as ineligible for freeing we + * record the partial cluster at the beginning of the extent. It + * wasn't freed by the preceding ext4_free_blocks() call, and we + * need to look farther to the left to determine if it's to be freed + * (not shared with another extent). Else, reset the partial + * cluster - we're either done freeing or the beginning of the + * extent is left cluster aligned. + */ + if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) { + if (partial->state == initial) { + partial->pclu = EXT4_B2C(sbi, pblk); + partial->lblk = from; + partial->state = tofree; } - } else - ext4_error(sbi->s_sb, "strange request: removal(2) " - "%u-%u from %u:%u", - from, to, le32_to_cpu(ex->ee_block), ee_len); + } else { + partial->state = initial; + } + return 0; } - /* * ext4_ext_rm_leaf() Removes the extents associated with the * blocks appearing between "start" and "end". Both "start" @@ -2608,7 +2659,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - long long *partial_cluster, + struct partial_cluster *partial, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2640,7 +2691,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); - trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); + trace_ext4_ext_rm_leaf(inode, start, ex, partial); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { @@ -2671,8 +2722,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, */ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex); - *partial_cluster = - -(long long) EXT4_B2C(sbi, pblk); + partial->pclu = EXT4_B2C(sbi, pblk); + partial->state = nofree; } ex--; ex_ee_block = le32_to_cpu(ex->ee_block); @@ -2714,8 +2765,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (err) goto out; - err = ext4_remove_blocks(handle, inode, ex, partial_cluster, - a, b); + err = ext4_remove_blocks(handle, inode, ex, partial, a, b); if (err) goto out; @@ -2769,18 +2819,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, * If there's a partial cluster and at least one extent remains in * the leaf, free the partial cluster if it isn't shared with the * current extent. If it is shared with the current extent - * we zero partial_cluster because we've reached the start of the + * we reset the partial cluster because we've reached the start of the * truncated/punched region and we're done removing blocks. */ - if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) { + if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) { pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; - if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { + if (partial->pclu != EXT4_B2C(sbi, pblk)) { + int flags = get_default_free_blocks_flags(inode); + + if (ext4_is_pending(inode, partial->lblk)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, *partial_cluster), - sbi->s_cluster_ratio, - get_default_free_blocks_flags(inode)); + EXT4_C2B(sbi, partial->pclu), + sbi->s_cluster_ratio, flags); + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, partial->lblk); } - *partial_cluster = 0; + partial->state = initial; } /* if this leaf is free, then we should @@ -2819,10 +2874,14 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int depth = ext_depth(inode); struct ext4_ext_path *path = NULL; - long long partial_cluster = 0; + struct partial_cluster partial; handle_t *handle; int i = 0, err = 0; + partial.pclu = 0; + partial.lblk = 0; + partial.state = initial; + ext_debug("truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ @@ -2882,8 +2941,8 @@ again: */ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex) + end - ee_block + 2; - partial_cluster = - -(long long) EXT4_B2C(sbi, pblk); + partial.pclu = EXT4_B2C(sbi, pblk); + partial.state = nofree; } /* @@ -2911,9 +2970,10 @@ again: &ex); if (err) goto out; - if (pblk) - partial_cluster = - -(long long) EXT4_B2C(sbi, pblk); + if (pblk) { + partial.pclu = EXT4_B2C(sbi, pblk); + partial.state = nofree; + } } } /* @@ -2948,8 +3008,7 @@ again: if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, - &partial_cluster, start, - end); + &partial, start, end); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -3021,21 +3080,24 @@ again: } } - trace_ext4_ext_remove_space_done(inode, start, end, depth, - partial_cluster, path->p_hdr->eh_entries); + trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial, + path->p_hdr->eh_entries); /* - * If we still have something in the partial cluster and we have removed - * even the first extent, then we should free the blocks in the partial - * cluster as well. (This code will only run when there are no leaves - * to the immediate left of the truncated/punched region.) + * if there's a partial cluster and we have removed the first extent + * in the file, then we also free the partial cluster, if any */ - if (partial_cluster > 0 && err == 0) { - /* don't zero partial_cluster since it's not used afterwards */ + if (partial.state == tofree && err == 0) { + int flags = get_default_free_blocks_flags(inode); + + if (ext4_is_pending(inode, partial.lblk)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, partial_cluster), - sbi->s_cluster_ratio, - get_default_free_blocks_flags(inode)); + EXT4_C2B(sbi, partial.pclu), + sbi->s_cluster_ratio, flags); + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, partial.lblk); + partial.state = initial; } /* TODO: flexible tree reduction should be here */ @@ -3819,114 +3881,6 @@ out: return ext4_mark_inode_dirty(handle, inode); } -/** - * ext4_find_delalloc_range: find delayed allocated block in the given range. - * - * Return 1 if there is a delalloc block in the range, otherwise 0. - */ -int ext4_find_delalloc_range(struct inode *inode, - ext4_lblk_t lblk_start, - ext4_lblk_t lblk_end) -{ - struct extent_status es; - - ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es); - if (es.es_len == 0) - return 0; /* there is no delay extent in this tree */ - else if (es.es_lblk <= lblk_start && - lblk_start < es.es_lblk + es.es_len) - return 1; - else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end) - return 1; - else - return 0; -} - -int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - ext4_lblk_t lblk_start, lblk_end; - lblk_start = EXT4_LBLK_CMASK(sbi, lblk); - lblk_end = lblk_start + sbi->s_cluster_ratio - 1; - - return ext4_find_delalloc_range(inode, lblk_start, lblk_end); -} - -/** - * Determines how many complete clusters (out of those specified by the 'map') - * are under delalloc and were reserved quota for. - * This function is called when we are writing out the blocks that were - * originally written with their allocation delayed, but then the space was - * allocated using fallocate() before the delayed allocation could be resolved. - * The cases to look for are: - * ('=' indicated delayed allocated blocks - * '-' indicates non-delayed allocated blocks) - * (a) partial clusters towards beginning and/or end outside of allocated range - * are not delalloc'ed. - * Ex: - * |----c---=|====c====|====c====|===-c----| - * |++++++ allocated ++++++| - * ==> 4 complete clusters in above example - * - * (b) partial cluster (outside of allocated range) towards either end is - * marked for delayed allocation. In this case, we will exclude that - * cluster. - * Ex: - * |----====c========|========c========| - * |++++++ allocated ++++++| - * ==> 1 complete clusters in above example - * - * Ex: - * |================c================| - * |++++++ allocated ++++++| - * ==> 0 complete clusters in above example - * - * The ext4_da_update_reserve_space will be called only if we - * determine here that there were some "entire" clusters that span - * this 'allocated' range. - * In the non-bigalloc case, this function will just end up returning num_blks - * without ever calling ext4_find_delalloc_range. - */ -static unsigned int -get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, - unsigned int num_blks) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - ext4_lblk_t alloc_cluster_start, alloc_cluster_end; - ext4_lblk_t lblk_from, lblk_to, c_offset; - unsigned int allocated_clusters = 0; - - alloc_cluster_start = EXT4_B2C(sbi, lblk_start); - alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1); - - /* max possible clusters for this allocation */ - allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; - - trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); - - /* Check towards left side */ - c_offset = EXT4_LBLK_COFF(sbi, lblk_start); - if (c_offset) { - lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); - lblk_to = lblk_from + c_offset - 1; - - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) - allocated_clusters--; - } - - /* Now check towards right. */ - c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks); - if (allocated_clusters && c_offset) { - lblk_from = lblk_start + num_blks; - lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; - - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) - allocated_clusters--; - } - - return allocated_clusters; -} - static int convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, @@ -4108,23 +4062,6 @@ out: } map->m_len = allocated; - /* - * If we have done fallocate with the offset that is already - * delayed allocated, we would have block reservation - * and quota reservation done in the delayed write path. - * But fallocate would have already updated quota and block - * count for this offset. So cancel these reservation - */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { - unsigned int reserved_clusters; - reserved_clusters = get_reserved_cluster_alloc(inode, - map->m_lblk, map->m_len); - if (reserved_clusters) - ext4_da_update_reserve_space(inode, - reserved_clusters, - 0); - } - map_out: map->m_flags |= EXT4_MAP_MAPPED; if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { @@ -4513,77 +4450,39 @@ got_allocated_blocks: map->m_flags |= EXT4_MAP_NEW; /* - * Update reserved blocks/metadata blocks after successful - * block allocation which had been deferred till now. + * Reduce the reserved cluster count to reflect successful deferred + * allocation of delayed allocated clusters or direct allocation of + * clusters discovered to be delayed allocated. Once allocated, a + * cluster is not included in the reserved count. */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { - unsigned int reserved_clusters; - /* - * Check how many clusters we had reserved this allocated range - */ - reserved_clusters = get_reserved_cluster_alloc(inode, - map->m_lblk, allocated); - if (!map_from_cluster) { - BUG_ON(allocated_clusters < reserved_clusters); - if (reserved_clusters < allocated_clusters) { - struct ext4_inode_info *ei = EXT4_I(inode); - int reservation = allocated_clusters - - reserved_clusters; - /* - * It seems we claimed few clusters outside of - * the range of this allocation. We should give - * it back to the reservation pool. This can - * happen in the following case: - * - * * Suppose s_cluster_ratio is 4 (i.e., each - * cluster has 4 blocks. Thus, the clusters - * are [0-3],[4-7],[8-11]... - * * First comes delayed allocation write for - * logical blocks 10 & 11. Since there were no - * previous delayed allocated blocks in the - * range [8-11], we would reserve 1 cluster - * for this write. - * * Next comes write for logical blocks 3 to 8. - * In this case, we will reserve 2 clusters - * (for [0-3] and [4-7]; and not for [8-11] as - * that range has a delayed allocated blocks. - * Thus total reserved clusters now becomes 3. - * * Now, during the delayed allocation writeout - * time, we will first write blocks [3-8] and - * allocate 3 clusters for writing these - * blocks. Also, we would claim all these - * three clusters above. - * * Now when we come here to writeout the - * blocks [10-11], we would expect to claim - * the reservation of 1 cluster we had made - * (and we would claim it since there are no - * more delayed allocated blocks in the range - * [8-11]. But our reserved cluster count had - * already gone to 0. - * - * Thus, at the step 4 above when we determine - * that there are still some unwritten delayed - * allocated blocks outside of our current - * block range, we should increment the - * reserved clusters count so that when the - * remaining blocks finally gets written, we - * could claim them. - */ - dquot_reserve_block(inode, - EXT4_C2B(sbi, reservation)); - spin_lock(&ei->i_block_reservation_lock); - ei->i_reserved_data_blocks += reservation; - spin_unlock(&ei->i_block_reservation_lock); - } + if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) { + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { /* - * We will claim quota for all newly allocated blocks. - * We're updating the reserved space *after* the - * correction above so we do not accidentally free - * all the metadata reservation because we might - * actually need it later on. + * When allocating delayed allocated clusters, simply + * reduce the reserved cluster count and claim quota */ ext4_da_update_reserve_space(inode, allocated_clusters, 1); + } else { + ext4_lblk_t lblk, len; + unsigned int n; + + /* + * When allocating non-delayed allocated clusters + * (from fallocate, filemap, DIO, or clusters + * allocated when delalloc has been disabled by + * ext4_nonda_switch), reduce the reserved cluster + * count by the number of allocated clusters that + * have previously been delayed allocated. Quota + * has been claimed by ext4_mb_new_blocks() above, + * so release the quota reservations made for any + * previously delayed allocated clusters. + */ + lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk); + len = allocated_clusters << sbi->s_cluster_bits; + n = ext4_es_delayed_clu(inode, lblk, len); + if (n > 0) + ext4_da_update_reserve_space(inode, (int) n, 0); } } @@ -5075,8 +4974,10 @@ static int ext4_find_delayed_extent(struct inode *inode, ext4_lblk_t block, next_del; if (newes->es_pblk == 0) { - ext4_es_find_delayed_extent_range(inode, newes->es_lblk, - newes->es_lblk + newes->es_len - 1, &es); + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, + newes->es_lblk, + newes->es_lblk + newes->es_len - 1, + &es); /* * No extent in extent-tree contains block @newes->es_pblk, @@ -5097,7 +4998,8 @@ static int ext4_find_delayed_extent(struct inode *inode, } block = newes->es_lblk + newes->es_len; - ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es); + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block, + EXT_MAX_BLOCKS, &es); if (es.es_len == 0) next_del = EXT_MAX_BLOCKS; else @@ -5958,3 +5860,82 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, } return replaced_count; } + +/* + * ext4_clu_mapped - determine whether any block in a logical cluster has + * been mapped to a physical cluster + * + * @inode - file containing the logical cluster + * @lclu - logical cluster of interest + * + * Returns 1 if any block in the logical cluster is mapped, signifying + * that a physical cluster has been allocated for it. Otherwise, + * returns 0. Can also return negative error codes. Derived from + * ext4_ext_map_blocks(). + */ +int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_ext_path *path; + int depth, mapped = 0, err = 0; + struct ext4_extent *extent; + ext4_lblk_t first_lblk, first_lclu, last_lclu; + + /* search for the extent closest to the first block in the cluster */ + path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + goto out; + } + + depth = ext_depth(inode); + + /* + * A consistent leaf must not be empty. This situation is possible, + * though, _during_ tree modification, and it's why an assert can't + * be put in ext4_find_extent(). + */ + if (unlikely(path[depth].p_ext == NULL && depth != 0)) { + EXT4_ERROR_INODE(inode, + "bad extent address - lblock: %lu, depth: %d, pblock: %lld", + (unsigned long) EXT4_C2B(sbi, lclu), + depth, path[depth].p_block); + err = -EFSCORRUPTED; + goto out; + } + + extent = path[depth].p_ext; + + /* can't be mapped if the extent tree is empty */ + if (extent == NULL) + goto out; + + first_lblk = le32_to_cpu(extent->ee_block); + first_lclu = EXT4_B2C(sbi, first_lblk); + + /* + * Three possible outcomes at this point - found extent spanning + * the target cluster, to the left of the target cluster, or to the + * right of the target cluster. The first two cases are handled here. + * The last case indicates the target cluster is not mapped. + */ + if (lclu >= first_lclu) { + last_lclu = EXT4_B2C(sbi, first_lblk + + ext4_ext_get_actual_len(extent) - 1); + if (lclu <= last_lclu) { + mapped = 1; + } else { + first_lblk = ext4_ext_next_allocated_block(path); + first_lclu = EXT4_B2C(sbi, first_lblk); + if (lclu == first_lclu) + mapped = 1; + } + } + +out: + ext4_ext_drop_refs(path); + kfree(path); + + return err ? err : mapped; +} diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index c4e6fb15101b..2b439afafe13 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -142,6 +142,7 @@ */ static struct kmem_cache *ext4_es_cachep; +static struct kmem_cache *ext4_pending_cachep; static int __es_insert_extent(struct inode *inode, struct extent_status *newes); static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, @@ -149,6 +150,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei); +static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); int __init ext4_init_es(void) { @@ -233,30 +236,38 @@ static struct extent_status *__es_tree_search(struct rb_root *root, } /* - * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering - * @es->lblk if it exists, otherwise, the next extent after @es->lblk. + * ext4_es_find_extent_range - find extent with specified status within block + * range or next extent following block range in + * extents status tree * - * @inode: the inode which owns delayed extents - * @lblk: the offset where we start to search - * @end: the offset where we stop to search - * @es: delayed extent that we found + * @inode - file containing the range + * @matching_fn - pointer to function that matches extents with desired status + * @lblk - logical block defining start of range + * @end - logical block defining end of range + * @es - extent found, if any + * + * Find the first extent within the block range specified by @lblk and @end + * in the extents status tree that satisfies @matching_fn. If a match + * is found, it's returned in @es. If not, and a matching extent is found + * beyond the block range, it's returned in @es. If no match is found, an + * extent is returned in @es whose es_lblk, es_len, and es_pblk components + * are 0. */ -void ext4_es_find_delayed_extent_range(struct inode *inode, - ext4_lblk_t lblk, ext4_lblk_t end, - struct extent_status *es) +static void __es_find_extent_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es) { struct ext4_es_tree *tree = NULL; struct extent_status *es1 = NULL; struct rb_node *node; - BUG_ON(es == NULL); - BUG_ON(end < lblk); - trace_ext4_es_find_delayed_extent_range_enter(inode, lblk); + WARN_ON(es == NULL); + WARN_ON(end < lblk); - read_lock(&EXT4_I(inode)->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; - /* find extent in cache firstly */ + /* see if the extent has been cached */ es->es_lblk = es->es_len = es->es_pblk = 0; if (tree->cache_es) { es1 = tree->cache_es; @@ -271,28 +282,133 @@ void ext4_es_find_delayed_extent_range(struct inode *inode, es1 = __es_tree_search(&tree->root, lblk); out: - if (es1 && !ext4_es_is_delayed(es1)) { + if (es1 && !matching_fn(es1)) { while ((node = rb_next(&es1->rb_node)) != NULL) { es1 = rb_entry(node, struct extent_status, rb_node); if (es1->es_lblk > end) { es1 = NULL; break; } - if (ext4_es_is_delayed(es1)) + if (matching_fn(es1)) break; } } - if (es1 && ext4_es_is_delayed(es1)) { + if (es1 && matching_fn(es1)) { tree->cache_es = es1; es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; } +} + +/* + * Locking for __es_find_extent_range() for external use + */ +void ext4_es_find_extent_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es) +{ + trace_ext4_es_find_extent_range_enter(inode, lblk); + + read_lock(&EXT4_I(inode)->i_es_lock); + __es_find_extent_range(inode, matching_fn, lblk, end, es); + read_unlock(&EXT4_I(inode)->i_es_lock); + + trace_ext4_es_find_extent_range_exit(inode, es); +} + +/* + * __es_scan_range - search block range for block with specified status + * in extents status tree + * + * @inode - file containing the range + * @matching_fn - pointer to function that matches extents with desired status + * @lblk - logical block defining start of range + * @end - logical block defining end of range + * + * Returns true if at least one block in the specified block range satisfies + * the criterion specified by @matching_fn, and false if not. If at least + * one extent has the specified status, then there is at least one block + * in the cluster with that status. Should only be called by code that has + * taken i_es_lock. + */ +static bool __es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t start, ext4_lblk_t end) +{ + struct extent_status es; + + __es_find_extent_range(inode, matching_fn, start, end, &es); + if (es.es_len == 0) + return false; /* no matching extent in the tree */ + else if (es.es_lblk <= start && + start < es.es_lblk + es.es_len) + return true; + else if (start <= es.es_lblk && es.es_lblk <= end) + return true; + else + return false; +} +/* + * Locking for __es_scan_range() for external use + */ +bool ext4_es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end) +{ + bool ret; + + read_lock(&EXT4_I(inode)->i_es_lock); + ret = __es_scan_range(inode, matching_fn, lblk, end); read_unlock(&EXT4_I(inode)->i_es_lock); - trace_ext4_es_find_delayed_extent_range_exit(inode, es); + return ret; +} + +/* + * __es_scan_clu - search cluster for block with specified status in + * extents status tree + * + * @inode - file containing the cluster + * @matching_fn - pointer to function that matches extents with desired status + * @lblk - logical block in cluster to be searched + * + * Returns true if at least one extent in the cluster containing @lblk + * satisfies the criterion specified by @matching_fn, and false if not. If at + * least one extent has the specified status, then there is at least one block + * in the cluster with that status. Should only be called by code that has + * taken i_es_lock. + */ +static bool __es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t lblk_start, lblk_end; + + lblk_start = EXT4_LBLK_CMASK(sbi, lblk); + lblk_end = lblk_start + sbi->s_cluster_ratio - 1; + + return __es_scan_range(inode, matching_fn, lblk_start, lblk_end); +} + +/* + * Locking for __es_scan_clu() for external use + */ +bool ext4_es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk) +{ + bool ret; + + read_lock(&EXT4_I(inode)->i_es_lock); + ret = __es_scan_clu(inode, matching_fn, lblk); + read_unlock(&EXT4_I(inode)->i_es_lock); + + return ret; } static void ext4_es_list_add(struct inode *inode) @@ -694,6 +810,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status newes; ext4_lblk_t end = lblk + len - 1; int err = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", lblk, len, pblk, status, inode->i_ino); @@ -730,6 +847,11 @@ retry: if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) err = 0; + if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && + (status & EXTENT_STATUS_WRITTEN || + status & EXTENT_STATUS_UNWRITTEN)) + __revise_pending(inode, lblk, len); + error: write_unlock(&EXT4_I(inode)->i_es_lock); @@ -1252,3 +1374,499 @@ static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan) ei->i_es_tree.cache_es = NULL; return nr_shrunk; } + +#ifdef ES_DEBUG__ +static void ext4_print_pending_tree(struct inode *inode) +{ + struct ext4_pending_tree *tree; + struct rb_node *node; + struct pending_reservation *pr; + + printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino); + tree = &EXT4_I(inode)->i_pending_tree; + node = rb_first(&tree->root); + while (node) { + pr = rb_entry(node, struct pending_reservation, rb_node); + printk(KERN_DEBUG " %u", pr->lclu); + node = rb_next(node); + } + printk(KERN_DEBUG "\n"); +} +#else +#define ext4_print_pending_tree(inode) +#endif + +int __init ext4_init_pending(void) +{ + ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation", + sizeof(struct pending_reservation), + 0, (SLAB_RECLAIM_ACCOUNT), NULL); + if (ext4_pending_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_pending(void) +{ + kmem_cache_destroy(ext4_pending_cachep); +} + +void ext4_init_pending_tree(struct ext4_pending_tree *tree) +{ + tree->root = RB_ROOT; +} + +/* + * __get_pending - retrieve a pointer to a pending reservation + * + * @inode - file containing the pending cluster reservation + * @lclu - logical cluster of interest + * + * Returns a pointer to a pending reservation if it's a member of + * the set, and NULL if not. Must be called holding i_es_lock. + */ +static struct pending_reservation *__get_pending(struct inode *inode, + ext4_lblk_t lclu) +{ + struct ext4_pending_tree *tree; + struct rb_node *node; + struct pending_reservation *pr = NULL; + + tree = &EXT4_I(inode)->i_pending_tree; + node = (&tree->root)->rb_node; + + while (node) { + pr = rb_entry(node, struct pending_reservation, rb_node); + if (lclu < pr->lclu) + node = node->rb_left; + else if (lclu > pr->lclu) + node = node->rb_right; + else if (lclu == pr->lclu) + return pr; + } + return NULL; +} + +/* + * __insert_pending - adds a pending cluster reservation to the set of + * pending reservations + * + * @inode - file containing the cluster + * @lblk - logical block in the cluster to be added + * + * Returns 0 on successful insertion and -ENOMEM on failure. If the + * pending reservation is already in the set, returns successfully. + */ +static int __insert_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; + struct rb_node **p = &tree->root.rb_node; + struct rb_node *parent = NULL; + struct pending_reservation *pr; + ext4_lblk_t lclu; + int ret = 0; + + lclu = EXT4_B2C(sbi, lblk); + /* search to find parent for insertion */ + while (*p) { + parent = *p; + pr = rb_entry(parent, struct pending_reservation, rb_node); + + if (lclu < pr->lclu) { + p = &(*p)->rb_left; + } else if (lclu > pr->lclu) { + p = &(*p)->rb_right; + } else { + /* pending reservation already inserted */ + goto out; + } + } + + pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC); + if (pr == NULL) { + ret = -ENOMEM; + goto out; + } + pr->lclu = lclu; + + rb_link_node(&pr->rb_node, parent, p); + rb_insert_color(&pr->rb_node, &tree->root); + +out: + return ret; +} + +/* + * __remove_pending - removes a pending cluster reservation from the set + * of pending reservations + * + * @inode - file containing the cluster + * @lblk - logical block in the pending cluster reservation to be removed + * + * Returns successfully if pending reservation is not a member of the set. + */ +static void __remove_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct pending_reservation *pr; + struct ext4_pending_tree *tree; + + pr = __get_pending(inode, EXT4_B2C(sbi, lblk)); + if (pr != NULL) { + tree = &EXT4_I(inode)->i_pending_tree; + rb_erase(&pr->rb_node, &tree->root); + kmem_cache_free(ext4_pending_cachep, pr); + } +} + +/* + * ext4_remove_pending - removes a pending cluster reservation from the set + * of pending reservations + * + * @inode - file containing the cluster + * @lblk - logical block in the pending cluster reservation to be removed + * + * Locking for external use of __remove_pending. + */ +void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + write_lock(&ei->i_es_lock); + __remove_pending(inode, lblk); + write_unlock(&ei->i_es_lock); +} + +/* + * ext4_is_pending - determine whether a cluster has a pending reservation + * on it + * + * @inode - file containing the cluster + * @lblk - logical block in the cluster + * + * Returns true if there's a pending reservation for the cluster in the + * set of pending reservations, and false if not. + */ +bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + bool ret; + + read_lock(&ei->i_es_lock); + ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL); + read_unlock(&ei->i_es_lock); + + return ret; +} + +/* + * ext4_es_insert_delayed_block - adds a delayed block to the extents status + * tree, adding a pending reservation where + * needed + * + * @inode - file containing the newly added block + * @lblk - logical block to be added + * @allocated - indicates whether a physical cluster has been allocated for + * the logical cluster that contains the block + * + * Returns 0 on success, negative error code on failure. + */ +int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated) +{ + struct extent_status newes; + int err = 0; + + es_debug("add [%u/1) delayed to extent status tree of inode %lu\n", + lblk, inode->i_ino); + + newes.es_lblk = lblk; + newes.es_len = 1; + ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED); + trace_ext4_es_insert_delayed_block(inode, &newes, allocated); + + ext4_es_insert_extent_check(inode, &newes); + + write_lock(&EXT4_I(inode)->i_es_lock); + + err = __es_remove_extent(inode, lblk, lblk); + if (err != 0) + goto error; +retry: + err = __es_insert_extent(inode, &newes); + if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), + 128, EXT4_I(inode))) + goto retry; + if (err != 0) + goto error; + + if (allocated) + __insert_pending(inode, lblk); + +error: + write_unlock(&EXT4_I(inode)->i_es_lock); + + ext4_es_print_tree(inode); + ext4_print_pending_tree(inode); + + return err; +} + +/* + * __es_delayed_clu - count number of clusters containing blocks that + * are delayed only + * + * @inode - file containing block range + * @start - logical block defining start of range + * @end - logical block defining end of range + * + * Returns the number of clusters containing only delayed (not delayed + * and unwritten) blocks in the range specified by @start and @end. Any + * cluster or part of a cluster within the range and containing a delayed + * and not unwritten block within the range is counted as a whole cluster. + */ +static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct extent_status *es; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct rb_node *node; + ext4_lblk_t first_lclu, last_lclu; + unsigned long long last_counted_lclu; + unsigned int n = 0; + + /* guaranteed to be unequal to any ext4_lblk_t value */ + last_counted_lclu = ~0ULL; + + es = __es_tree_search(&tree->root, start); + + while (es && (es->es_lblk <= end)) { + if (ext4_es_is_delonly(es)) { + if (es->es_lblk <= start) + first_lclu = EXT4_B2C(sbi, start); + else + first_lclu = EXT4_B2C(sbi, es->es_lblk); + + if (ext4_es_end(es) >= end) + last_lclu = EXT4_B2C(sbi, end); + else + last_lclu = EXT4_B2C(sbi, ext4_es_end(es)); + + if (first_lclu == last_counted_lclu) + n += last_lclu - first_lclu; + else + n += last_lclu - first_lclu + 1; + last_counted_lclu = last_lclu; + } + node = rb_next(&es->rb_node); + if (!node) + break; + es = rb_entry(node, struct extent_status, rb_node); + } + + return n; +} + +/* + * ext4_es_delayed_clu - count number of clusters containing blocks that + * are both delayed and unwritten + * + * @inode - file containing block range + * @lblk - logical block defining start of range + * @len - number of blocks in range + * + * Locking for external use of __es_delayed_clu(). + */ +unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_lblk_t end; + unsigned int n; + + if (len == 0) + return 0; + + end = lblk + len - 1; + WARN_ON(end < lblk); + + read_lock(&ei->i_es_lock); + + n = __es_delayed_clu(inode, lblk, end); + + read_unlock(&ei->i_es_lock); + + return n; +} + +/* + * __revise_pending - makes, cancels, or leaves unchanged pending cluster + * reservations for a specified block range depending + * upon the presence or absence of delayed blocks + * outside the range within clusters at the ends of the + * range + * + * @inode - file containing the range + * @lblk - logical block defining the start of range + * @len - length of range in blocks + * + * Used after a newly allocated extent is added to the extents status tree. + * Requires that the extents in the range have either written or unwritten + * status. Must be called while holding i_es_lock. + */ +static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t end = lblk + len - 1; + ext4_lblk_t first, last; + bool f_del = false, l_del = false; + + if (len == 0) + return; + + /* + * Two cases - block range within single cluster and block range + * spanning two or more clusters. Note that a cluster belonging + * to a range starting and/or ending on a cluster boundary is treated + * as if it does not contain a delayed extent. The new range may + * have allocated space for previously delayed blocks out to the + * cluster boundary, requiring that any pre-existing pending + * reservation be canceled. Because this code only looks at blocks + * outside the range, it should revise pending reservations + * correctly even if the extent represented by the range can't be + * inserted in the extents status tree due to ENOSPC. + */ + + if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) { + first = EXT4_LBLK_CMASK(sbi, lblk); + if (first != lblk) + f_del = __es_scan_range(inode, &ext4_es_is_delonly, + first, lblk - 1); + if (f_del) { + __insert_pending(inode, first); + } else { + last = EXT4_LBLK_CMASK(sbi, end) + + sbi->s_cluster_ratio - 1; + if (last != end) + l_del = __es_scan_range(inode, + &ext4_es_is_delonly, + end + 1, last); + if (l_del) + __insert_pending(inode, last); + else + __remove_pending(inode, last); + } + } else { + first = EXT4_LBLK_CMASK(sbi, lblk); + if (first != lblk) + f_del = __es_scan_range(inode, &ext4_es_is_delonly, + first, lblk - 1); + if (f_del) + __insert_pending(inode, first); + else + __remove_pending(inode, first); + + last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; + if (last != end) + l_del = __es_scan_range(inode, &ext4_es_is_delonly, + end + 1, last); + if (l_del) + __insert_pending(inode, last); + else + __remove_pending(inode, last); + } +} + +/* + * ext4_es_remove_blks - remove block range from extents status tree and + * reduce reservation count or cancel pending + * reservation as needed + * + * @inode - file containing range + * @lblk - first block in range + * @len - number of blocks to remove + * + */ +void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned int clu_size, reserved = 0; + ext4_lblk_t last_lclu, first, length, remainder, last; + bool delonly; + int err = 0; + struct pending_reservation *pr; + struct ext4_pending_tree *tree; + + /* + * Process cluster by cluster for bigalloc - there may be up to + * two clusters in a 4k page with a 1k block size and two blocks + * per cluster. Also necessary for systems with larger page sizes + * and potentially larger block sizes. + */ + clu_size = sbi->s_cluster_ratio; + last_lclu = EXT4_B2C(sbi, lblk + len - 1); + + write_lock(&EXT4_I(inode)->i_es_lock); + + for (first = lblk, remainder = len; + remainder > 0; + first += length, remainder -= length) { + + if (EXT4_B2C(sbi, first) == last_lclu) + length = remainder; + else + length = clu_size - EXT4_LBLK_COFF(sbi, first); + + /* + * The BH_Delay flag, which triggers calls to this function, + * and the contents of the extents status tree can be + * inconsistent due to writepages activity. So, note whether + * the blocks to be removed actually belong to an extent with + * delayed only status. + */ + delonly = __es_scan_clu(inode, &ext4_es_is_delonly, first); + + /* + * because of the writepages effect, written and unwritten + * blocks could be removed here + */ + last = first + length - 1; + err = __es_remove_extent(inode, first, last); + if (err) + ext4_warning(inode->i_sb, + "%s: couldn't remove page (err = %d)", + __func__, err); + + /* non-bigalloc case: simply count the cluster for release */ + if (sbi->s_cluster_ratio == 1 && delonly) { + reserved++; + continue; + } + + /* + * bigalloc case: if all delayed allocated only blocks have + * just been removed from a cluster, either cancel a pending + * reservation if it exists or count a cluster for release + */ + if (delonly && + !__es_scan_clu(inode, &ext4_es_is_delonly, first)) { + pr = __get_pending(inode, EXT4_B2C(sbi, first)); + if (pr != NULL) { + tree = &EXT4_I(inode)->i_pending_tree; + rb_erase(&pr->rb_node, &tree->root); + kmem_cache_free(ext4_pending_cachep, pr); + } else { + reserved++; + } + } + } + + write_unlock(&EXT4_I(inode)->i_es_lock); + + ext4_da_release_space(inode, reserved); +} diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 8efdeb903d6b..131a8b7df265 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -78,6 +78,51 @@ struct ext4_es_stats { struct percpu_counter es_stats_shk_cnt; }; +/* + * Pending cluster reservations for bigalloc file systems + * + * A cluster with a pending reservation is a logical cluster shared by at + * least one extent in the extents status tree with delayed and unwritten + * status and at least one other written or unwritten extent. The + * reservation is said to be pending because a cluster reservation would + * have to be taken in the event all blocks in the cluster shared with + * written or unwritten extents were deleted while the delayed and + * unwritten blocks remained. + * + * The set of pending cluster reservations is an auxiliary data structure + * used with the extents status tree to implement reserved cluster/block + * accounting for bigalloc file systems. The set is kept in memory and + * records all pending cluster reservations. + * + * Its primary function is to avoid the need to read extents from the + * disk when invalidating pages as a result of a truncate, punch hole, or + * collapse range operation. Page invalidation requires a decrease in the + * reserved cluster count if it results in the removal of all delayed + * and unwritten extents (blocks) from a cluster that is not shared with a + * written or unwritten extent, and no decrease otherwise. Determining + * whether the cluster is shared can be done by searching for a pending + * reservation on it. + * + * Secondarily, it provides a potentially faster method for determining + * whether the reserved cluster count should be increased when a physical + * cluster is deallocated as a result of a truncate, punch hole, or + * collapse range operation. The necessary information is also present + * in the extents status tree, but might be more rapidly accessed in + * the pending reservation set in many cases due to smaller size. + * + * The pending cluster reservation set is implemented as a red-black tree + * with the goal of minimizing per page search time overhead. + */ + +struct pending_reservation { + struct rb_node rb_node; + ext4_lblk_t lclu; +}; + +struct ext4_pending_tree { + struct rb_root root; +}; + extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); @@ -90,11 +135,18 @@ extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, unsigned int status); extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); -extern void ext4_es_find_delayed_extent_range(struct inode *inode, - ext4_lblk_t lblk, ext4_lblk_t end, - struct extent_status *es); +extern void ext4_es_find_extent_range(struct inode *inode, + int (*match_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); +extern bool ext4_es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end); +extern bool ext4_es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk); static inline unsigned int ext4_es_status(struct extent_status *es) { @@ -126,6 +178,16 @@ static inline int ext4_es_is_hole(struct extent_status *es) return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; } +static inline int ext4_es_is_mapped(struct extent_status *es) +{ + return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); +} + +static inline int ext4_es_is_delonly(struct extent_status *es) +{ + return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); +} + static inline void ext4_es_set_referenced(struct extent_status *es) { es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; @@ -175,4 +237,16 @@ extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); +extern int __init ext4_init_pending(void); +extern void ext4_exit_pending(void); +extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); +extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); +extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); +extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated); +extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); + #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 3543fe80a3c4..9c4bac18cc6c 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -863,7 +863,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, handle_t *handle; struct page *page; struct ext4_iloc iloc; - int retries; + int retries = 0; ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -1753,6 +1753,7 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) { int err, inline_size; struct ext4_iloc iloc; + size_t inline_len; void *inline_pos; unsigned int offset; struct ext4_dir_entry_2 *de; @@ -1780,8 +1781,9 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) goto out; } + inline_len = ext4_get_inline_size(dir); offset = EXT4_INLINE_DOTDOT_SIZE; - while (offset < dir->i_size) { + while (offset < inline_len) { de = ext4_get_inline_entry(dir, &iloc, offset, &inline_pos, &inline_size); if (ext4_check_dir_entry(dir, NULL, de, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d0dd585add6a..c3d9a42c561e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -577,8 +577,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && !(status & EXTENT_STATUS_WRITTEN) && - ext4_find_delalloc_range(inode, map->m_lblk, - map->m_lblk + map->m_len - 1)) + ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, + map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); @@ -701,8 +701,8 @@ found: EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && !(status & EXTENT_STATUS_WRITTEN) && - ext4_find_delalloc_range(inode, map->m_lblk, - map->m_lblk + map->m_len - 1)) + ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, + map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); @@ -1595,7 +1595,7 @@ static int ext4_da_reserve_space(struct inode *inode) return 0; /* success */ } -static void ext4_da_release_space(struct inode *inode, int to_free) +void ext4_da_release_space(struct inode *inode, int to_free) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); @@ -1634,13 +1634,11 @@ static void ext4_da_page_release_reservation(struct page *page, unsigned int offset, unsigned int length) { - int to_release = 0, contiguous_blks = 0; + int contiguous_blks = 0; struct buffer_head *head, *bh; unsigned int curr_off = 0; struct inode *inode = page->mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned int stop = offset + length; - int num_clusters; ext4_fsblk_t lblk; BUG_ON(stop > PAGE_SIZE || stop < length); @@ -1654,7 +1652,6 @@ static void ext4_da_page_release_reservation(struct page *page, break; if ((offset <= curr_off) && (buffer_delay(bh))) { - to_release++; contiguous_blks++; clear_buffer_delay(bh); } else if (contiguous_blks) { @@ -1662,7 +1659,7 @@ static void ext4_da_page_release_reservation(struct page *page, (PAGE_SHIFT - inode->i_blkbits); lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; - ext4_es_remove_extent(inode, lblk, contiguous_blks); + ext4_es_remove_blks(inode, lblk, contiguous_blks); contiguous_blks = 0; } curr_off = next_off; @@ -1671,21 +1668,9 @@ static void ext4_da_page_release_reservation(struct page *page, if (contiguous_blks) { lblk = page->index << (PAGE_SHIFT - inode->i_blkbits); lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; - ext4_es_remove_extent(inode, lblk, contiguous_blks); + ext4_es_remove_blks(inode, lblk, contiguous_blks); } - /* If we have released all the blocks belonging to a cluster, then we - * need to release the reserved space for that cluster. */ - num_clusters = EXT4_NUM_B2C(sbi, to_release); - while (num_clusters > 0) { - lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) + - ((num_clusters - 1) << sbi->s_cluster_bits); - if (sbi->s_cluster_ratio == 1 || - !ext4_find_delalloc_cluster(inode, lblk)) - ext4_da_release_space(inode, 1); - - num_clusters--; - } } /* @@ -1781,6 +1766,65 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) } /* + * ext4_insert_delayed_block - adds a delayed block to the extents status + * tree, incrementing the reserved cluster/block + * count or making a pending reservation + * where needed + * + * @inode - file containing the newly added block + * @lblk - logical block to be added + * + * Returns 0 on success, negative error code on failure. + */ +static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int ret; + bool allocated = false; + + /* + * If the cluster containing lblk is shared with a delayed, + * written, or unwritten extent in a bigalloc file system, it's + * already been accounted for and does not need to be reserved. + * A pending reservation must be made for the cluster if it's + * shared with a written or unwritten extent and doesn't already + * have one. Written and unwritten extents can be purged from the + * extents status tree if the system is under memory pressure, so + * it's necessary to examine the extent tree if a search of the + * extents status tree doesn't get a match. + */ + if (sbi->s_cluster_ratio == 1) { + ret = ext4_da_reserve_space(inode); + if (ret != 0) /* ENOSPC */ + goto errout; + } else { /* bigalloc */ + if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { + if (!ext4_es_scan_clu(inode, + &ext4_es_is_mapped, lblk)) { + ret = ext4_clu_mapped(inode, + EXT4_B2C(sbi, lblk)); + if (ret < 0) + goto errout; + if (ret == 0) { + ret = ext4_da_reserve_space(inode); + if (ret != 0) /* ENOSPC */ + goto errout; + } else { + allocated = true; + } + } else { + allocated = true; + } + } + } + + ret = ext4_es_insert_delayed_block(inode, lblk, allocated); + +errout: + return ret; +} + +/* * This function is grabs code from the very beginning of * ext4_map_blocks, but assumes that the caller is from delayed write * time. This function looks up the requested blocks and sets the @@ -1859,28 +1903,14 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, add_delayed: if (retval == 0) { int ret; + /* * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ - /* - * If the block was allocated from previously allocated cluster, - * then we don't need to reserve it again. However we still need - * to reserve metadata for every block we're going to write. - */ - if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 || - !ext4_find_delalloc_cluster(inode, map->m_lblk)) { - ret = ext4_da_reserve_space(inode); - if (ret) { - /* not enough space to reserve */ - retval = ret; - goto out_unlock; - } - } - ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - ~0, EXTENT_STATUS_DELAYED); - if (ret) { + ret = ext4_insert_delayed_block(inode, map->m_lblk); + if (ret != 0) { retval = ret; goto out_unlock; } @@ -3413,12 +3443,16 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned int blkbits = inode->i_blkbits; - unsigned long first_block = offset >> blkbits; - unsigned long last_block = (offset + length - 1) >> blkbits; + unsigned long first_block, last_block; struct ext4_map_blocks map; bool delalloc = false; int ret; + if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) + return -EINVAL; + first_block = offset >> blkbits; + last_block = min_t(loff_t, (offset + length - 1) >> blkbits, + EXT4_MAX_LOGICAL_BLOCK); if (flags & IOMAP_REPORT) { if (ext4_has_inline_data(inode)) { @@ -3446,7 +3480,8 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, ext4_lblk_t end = map.m_lblk + map.m_len - 1; struct extent_status es; - ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es); + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, + map.m_lblk, end, &es); if (!es.es_len || es.es_lblk > end) { /* entire range is a hole */ @@ -3948,6 +3983,7 @@ static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, .direct_IO = noop_direct_IO, .set_page_dirty = noop_set_page_dirty, + .bmap = ext4_bmap, .invalidatepage = noop_invalidatepage, }; @@ -4192,9 +4228,8 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, return 0; } -static void ext4_wait_dax_page(struct ext4_inode_info *ei, bool *did_unlock) +static void ext4_wait_dax_page(struct ext4_inode_info *ei) { - *did_unlock = true; up_write(&ei->i_mmap_sem); schedule(); down_write(&ei->i_mmap_sem); @@ -4204,14 +4239,12 @@ int ext4_break_layouts(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct page *page; - bool retry; int error; if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem))) return -EINVAL; do { - retry = false; page = dax_layout_busy_page(inode->i_mapping); if (!page) return 0; @@ -4219,8 +4252,8 @@ int ext4_break_layouts(struct inode *inode) error = ___wait_var_event(&page->_refcount, atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, 0, 0, - ext4_wait_dax_page(ei, &retry)); - } while (error == 0 && retry); + ext4_wait_dax_page(ei)); + } while (error == 0); return error; } @@ -4895,6 +4928,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); + ext4_set_inode_flags(inode); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); if (ext4_has_feature_64bit(sb)) @@ -5041,7 +5075,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) goto bad_inode; } brelse(iloc.bh); - ext4_set_inode_flags(inode); unlock_new_inode(inode); return inode; @@ -6151,13 +6184,14 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) return !buffer_mapped(bh); } -int ext4_page_mkwrite(struct vm_fault *vmf) +vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = vmf->page; loff_t size; unsigned long len; - int ret; + int err; + vm_fault_t ret; struct file *file = vma->vm_file; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; @@ -6170,8 +6204,8 @@ int ext4_page_mkwrite(struct vm_fault *vmf) down_read(&EXT4_I(inode)->i_mmap_sem); - ret = ext4_convert_inline_data(inode); - if (ret) + err = ext4_convert_inline_data(inode); + if (err) goto out_ret; /* Delalloc case is easy... */ @@ -6179,9 +6213,9 @@ int ext4_page_mkwrite(struct vm_fault *vmf) !ext4_should_journal_data(inode) && !ext4_nonda_switch(inode->i_sb)) { do { - ret = block_page_mkwrite(vma, vmf, + err = block_page_mkwrite(vma, vmf, ext4_da_get_block_prep); - } while (ret == -ENOSPC && + } while (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); goto out_ret; } @@ -6226,8 +6260,8 @@ retry_alloc: ret = VM_FAULT_SIGBUS; goto out; } - ret = block_page_mkwrite(vma, vmf, get_block); - if (!ret && ext4_should_journal_data(inode)) { + err = block_page_mkwrite(vma, vmf, get_block); + if (!err && ext4_should_journal_data(inode)) { if (ext4_walk_page_buffers(handle, page_buffers(page), 0, PAGE_SIZE, NULL, do_journal_get_write_access)) { unlock_page(page); @@ -6238,24 +6272,24 @@ retry_alloc: ext4_set_inode_state(inode, EXT4_STATE_JDATA); } ext4_journal_stop(handle); - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_alloc; out_ret: - ret = block_page_mkwrite_return(ret); + ret = block_page_mkwrite_return(err); out: up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(inode->i_sb); return ret; } -int ext4_filemap_fault(struct vm_fault *vmf) +vm_fault_t ext4_filemap_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); - int err; + vm_fault_t ret; down_read(&EXT4_I(inode)->i_mmap_sem); - err = filemap_fault(vmf); + ret = filemap_fault(vmf); up_read(&EXT4_I(inode)->i_mmap_sem); - return err; + return ret; } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a7074115d6f6..0edee31913d1 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -67,7 +67,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); - swap(inode1->i_flags, inode2->i_flags); swap(inode1->i_version, inode2->i_version); swap(inode1->i_blocks, inode2->i_blocks); swap(inode1->i_bytes, inode2->i_bytes); @@ -85,6 +84,21 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) i_size_write(inode2, isize); } +static void reset_inode_seed(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __le32 inum = cpu_to_le32(inode->i_ino); + __le32 gen = cpu_to_le32(inode->i_generation); + __u32 csum; + + if (!ext4_has_metadata_csum(inode->i_sb)) + return; + + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); + ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); +} + /** * Swap the information from the given @inode and the inode * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other @@ -102,10 +116,13 @@ static long swap_inode_boot_loader(struct super_block *sb, struct inode *inode_bl; struct ext4_inode_info *ei_bl; - if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || + IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || + ext4_has_inline_data(inode)) return -EINVAL; - if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) + if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || + !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) return -EPERM; inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); @@ -120,13 +137,13 @@ static long swap_inode_boot_loader(struct super_block *sb, * that only 1 swap_inode_boot_loader is running. */ lock_two_nondirectories(inode, inode_bl); - truncate_inode_pages(&inode->i_data, 0); - truncate_inode_pages(&inode_bl->i_data, 0); - /* Wait for all existing dio workers */ inode_dio_wait(inode); inode_dio_wait(inode_bl); + truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages(&inode_bl->i_data, 0); + handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); if (IS_ERR(handle)) { err = -EINVAL; @@ -159,6 +176,8 @@ static long swap_inode_boot_loader(struct super_block *sb, inode->i_generation = prandom_u32(); inode_bl->i_generation = prandom_u32(); + reset_inode_seed(inode); + reset_inode_seed(inode_bl); ext4_discard_preallocations(inode); @@ -169,6 +188,7 @@ static long swap_inode_boot_loader(struct super_block *sb, inode->i_ino, err); /* Revert all changes: */ swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); } else { err = ext4_mark_inode_dirty(handle, inode_bl); if (err < 0) { @@ -178,6 +198,7 @@ static long swap_inode_boot_loader(struct super_block *sb, /* Revert all changes: */ swap_inode_data(inode, inode_bl); ext4_mark_inode_dirty(handle, inode); + ext4_mark_inode_dirty(handle, inode_bl); } } ext4_journal_stop(handle); @@ -339,19 +360,14 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) return 0; - err = mnt_want_write_file(filp); - if (err) - return err; - err = -EPERM; - inode_lock(inode); /* Is it quota file? Do not allow user to mess with it */ if (ext4_is_quota_file(inode)) - goto out_unlock; + return err; err = ext4_get_inode_loc(inode, &iloc); if (err) - goto out_unlock; + return err; raw_inode = ext4_raw_inode(&iloc); if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { @@ -359,20 +375,20 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) EXT4_SB(sb)->s_want_extra_isize, &iloc); if (err) - goto out_unlock; + return err; } else { brelse(iloc.bh); } - dquot_initialize(inode); + err = dquot_initialize(inode); + if (err) + return err; handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(sb) + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto out_unlock; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) @@ -400,9 +416,6 @@ out_dirty: err = rc; out_stop: ext4_journal_stop(handle); -out_unlock: - inode_unlock(inode); - mnt_drop_write_file(filp); return err; } #else @@ -626,6 +639,30 @@ group_add_out: return err; } +static int ext4_ioctl_check_project(struct inode *inode, struct fsxattr *fa) +{ + /* + * Project Quota ID state is only allowed to change from within the init + * namespace. Enforce that restriction only if we are trying to change + * the quota ID state. Everything else is allowed in user namespaces. + */ + if (current_user_ns() == &init_user_ns) + return 0; + + if (__kprojid_val(EXT4_I(inode)->i_projid) != fa->fsx_projid) + return -EINVAL; + + if (ext4_test_inode_flag(inode, EXT4_INODE_PROJINHERIT)) { + if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT)) + return -EINVAL; + } else { + if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) + return -EINVAL; + } + + return 0; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1025,19 +1062,19 @@ resizefs_out: return err; inode_lock(inode); + err = ext4_ioctl_check_project(inode, &fa); + if (err) + goto out; flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | (flags & EXT4_FL_XFLAG_VISIBLE); err = ext4_ioctl_setflags(inode, flags); - inode_unlock(inode); - mnt_drop_write_file(filp); if (err) - return err; - + goto out; err = ext4_ioctl_setproject(filp, fa.fsx_projid); - if (err) - return err; - - return 0; +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return err; } case EXT4_IOC_SHUTDOWN: return ext4_shutdown(sb, arg); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e29fce2fbf25..e2248083cdca 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4915,9 +4915,17 @@ do_more: &sbi->s_flex_groups[flex_group].free_clusters); } - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) - dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); - percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); + /* + * on a bigalloc file system, defer the s_freeclusters_counter + * update to the caller (ext4_remove_space and friends) so they + * can determine if a cluster freed here should be rereserved + */ + if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) { + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); + percpu_counter_add(&sbi->s_freeclusters_counter, + count_clusters); + } ext4_mb_unload_buddy(&e4b); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 39b07c2d3384..2305b4374fd3 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -49,7 +49,6 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) */ sb_start_write(sb); ext4_mmp_csum_set(sb, mmp); - mark_buffer_dirty(bh); lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index a409ff70d67b..2f5be02fc6f6 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -516,9 +516,13 @@ mext_check_arguments(struct inode *orig_inode, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - if (orig_eof < orig_start + *len - 1) + if (orig_eof <= orig_start) + *len = 0; + else if (orig_eof < orig_start + *len - 1) *len = orig_eof - orig_start; - if (donor_eof < donor_start + *len - 1) + if (donor_eof <= donor_start) + *len = 0; + else if (donor_eof < donor_start + *len - 1) *len = donor_eof - donor_start; if (!*len) { ext4_debug("ext4 move extent: len should not be 0 " diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 116ff68c5bd4..67a38532032a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2261,7 +2261,7 @@ again: dxroot->info.indirect_levels += 1; dxtrace(printk(KERN_DEBUG "Creating %d level index...\n", - info->indirect_levels)); + dxroot->info.indirect_levels)); err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (err) goto journal_error; @@ -3478,6 +3478,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, int credits; u8 old_file_type; + if (new.inode && new.inode->i_nlink == 0) { + EXT4_ERROR_INODE(new.inode, + "target of rename is already freed"); + return -EFSCORRUPTED; + } + if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) && (!projid_eq(EXT4_I(new_dir)->i_projid, EXT4_I(old_dentry->d_inode)->i_projid))) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index db7590178dfc..2aa62d58d8dd 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -374,13 +374,13 @@ static int io_submit_init_bio(struct ext4_io_submit *io, bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); if (!bio) return -ENOMEM; - wbc_init_bio(io->io_wbc, bio); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; io->io_next_block = bh->b_blocknr; + wbc_init_bio(io->io_wbc, bio); return 0; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index e5fb38451a73..ebbc663d0798 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -19,6 +19,7 @@ int ext4_resize_begin(struct super_block *sb) { + struct ext4_sb_info *sbi = EXT4_SB(sb); int ret = 0; if (!capable(CAP_SYS_RESOURCE)) @@ -29,7 +30,7 @@ int ext4_resize_begin(struct super_block *sb) * because the user tools have no way of handling this. Probably a * bad time to do it anyways. */ - if (EXT4_SB(sb)->s_sbh->b_blocknr != + if (EXT4_B2C(sbi, sbi->s_sbh->b_blocknr) != le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { ext4_warning(sb, "won't resize using backup superblock at %llu", (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); @@ -1986,6 +1987,26 @@ retry: } } + /* + * Make sure the last group has enough space so that it's + * guaranteed to have enough space for all metadata blocks + * that it might need to hold. (We might not need to store + * the inode table blocks in the last block group, but there + * will be cases where this might be needed.) + */ + if ((ext4_group_first_block_no(sb, n_group) + + ext4_group_overhead_blocks(sb, n_group) + 2 + + sbi->s_itb_per_group + sbi->s_cluster_ratio) >= n_blocks_count) { + n_blocks_count = ext4_group_first_block_no(sb, n_group); + n_group--; + n_blocks_count_retry = 0; + if (resize_inode) { + iput(resize_inode); + resize_inode = NULL; + } + goto retry; + } + /* extend the last group */ if (n_group == o_group) add = n_blocks_count - o_blocks_count; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5863fd22e90b..a221f1cdf704 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -914,6 +914,18 @@ static inline void ext4_quota_off_umount(struct super_block *sb) for (type = 0; type < EXT4_MAXQUOTAS; type++) ext4_quota_off(sb, type); } + +/* + * This is a helper function which is used in the mount/remount + * codepaths (which holds s_umount) to fetch the quota file name. + */ +static inline char *get_qf_name(struct super_block *sb, + struct ext4_sb_info *sbi, + int type) +{ + return rcu_dereference_protected(sbi->s_qf_names[type], + lockdep_is_held(&sb->s_umount)); +} #else static inline void ext4_quota_off_umount(struct super_block *sb) { @@ -965,7 +977,7 @@ static void ext4_put_super(struct super_block *sb) percpu_free_rwsem(&sbi->s_journal_flag_rwsem); #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) - kfree(sbi->s_qf_names[i]); + kfree(get_qf_name(sb, sbi, i)); #endif /* Debugging code just in case the in-memory inode orphan list @@ -1040,6 +1052,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_da_metadata_calc_len = 0; ei->i_da_metadata_calc_last_lblock = 0; spin_lock_init(&(ei->i_block_reservation_lock)); + ext4_init_pending_tree(&ei->i_pending_tree); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); @@ -1530,11 +1543,10 @@ static const char deprecated_msg[] = static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) { struct ext4_sb_info *sbi = EXT4_SB(sb); - char *qname; + char *qname, *old_qname = get_qf_name(sb, sbi, qtype); int ret = -1; - if (sb_any_quota_loaded(sb) && - !sbi->s_qf_names[qtype]) { + if (sb_any_quota_loaded(sb) && !old_qname) { ext4_msg(sb, KERN_ERR, "Cannot change journaled " "quota options when quota turned on"); @@ -1551,8 +1563,8 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) "Not enough memory for storing quotafile name"); return -1; } - if (sbi->s_qf_names[qtype]) { - if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + if (old_qname) { + if (strcmp(old_qname, qname) == 0) ret = 1; else ext4_msg(sb, KERN_ERR, @@ -1565,7 +1577,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) "quotafile must be on filesystem root"); goto errout; } - sbi->s_qf_names[qtype] = qname; + rcu_assign_pointer(sbi->s_qf_names[qtype], qname); set_opt(sb, QUOTA); return 1; errout: @@ -1577,15 +1589,16 @@ static int clear_qf_name(struct super_block *sb, int qtype) { struct ext4_sb_info *sbi = EXT4_SB(sb); + char *old_qname = get_qf_name(sb, sbi, qtype); - if (sb_any_quota_loaded(sb) && - sbi->s_qf_names[qtype]) { + if (sb_any_quota_loaded(sb) && old_qname) { ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" " when quota turned on"); return -1; } - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; + rcu_assign_pointer(sbi->s_qf_names[qtype], NULL); + synchronize_rcu(); + kfree(old_qname); return 1; } #endif @@ -1960,7 +1973,7 @@ static int parse_options(char *options, struct super_block *sb, int is_remount) { struct ext4_sb_info *sbi = EXT4_SB(sb); - char *p; + char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name; substring_t args[MAX_OPT_ARGS]; int token; @@ -1991,11 +2004,13 @@ static int parse_options(char *options, struct super_block *sb, "Cannot enable project quota enforcement."); return 0; } - if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { - if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + usr_qf_name = get_qf_name(sb, sbi, USRQUOTA); + grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA); + if (usr_qf_name || grp_qf_name) { + if (test_opt(sb, USRQUOTA) && usr_qf_name) clear_opt(sb, USRQUOTA); - if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + if (test_opt(sb, GRPQUOTA) && grp_qf_name) clear_opt(sb, GRPQUOTA); if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { @@ -2029,6 +2044,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq, { #if defined(CONFIG_QUOTA) struct ext4_sb_info *sbi = EXT4_SB(sb); + char *usr_qf_name, *grp_qf_name; if (sbi->s_jquota_fmt) { char *fmtname = ""; @@ -2047,11 +2063,14 @@ static inline void ext4_show_quota_options(struct seq_file *seq, seq_printf(seq, ",jqfmt=%s", fmtname); } - if (sbi->s_qf_names[USRQUOTA]) - seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); - - if (sbi->s_qf_names[GRPQUOTA]) - seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); + rcu_read_lock(); + usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]); + grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]); + if (usr_qf_name) + seq_show_option(seq, "usrjquota", usr_qf_name); + if (grp_qf_name) + seq_show_option(seq, "grpjquota", grp_qf_name); + rcu_read_unlock(); #endif } @@ -2145,6 +2164,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); if (test_opt(sb, DATA_ERR_ABORT)) SEQ_OPTS_PUTS("data_err=abort"); + if (DUMMY_ENCRYPTION_ENABLED(sbi)) + SEQ_OPTS_PUTS("test_dummy_encryption"); ext4_show_quota_options(seq, sb); return 0; @@ -4378,11 +4399,13 @@ no_journal: block = ext4_count_free_clusters(sb); ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); + ext4_superblock_csum_set(sb); err = percpu_counter_init(&sbi->s_freeclusters_counter, block, GFP_KERNEL); if (!err) { unsigned long freei = ext4_count_free_inodes(sb); sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); + ext4_superblock_csum_set(sb); err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, GFP_KERNEL); } @@ -5099,6 +5122,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) int err = 0; #ifdef CONFIG_QUOTA int i, j; + char *to_free[EXT4_MAXQUOTAS]; #endif char *orig_data = kstrdup(data, GFP_KERNEL); @@ -5118,8 +5142,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) if (sbi->s_qf_names[i]) { - old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], - GFP_KERNEL); + char *qf_name = get_qf_name(sb, sbi, i); + + old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL); if (!old_opts.s_qf_names[i]) { for (j = 0; j < i; j++) kfree(old_opts.s_qf_names[j]); @@ -5348,9 +5373,12 @@ restore_opts: #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) { - kfree(sbi->s_qf_names[i]); - sbi->s_qf_names[i] = old_opts.s_qf_names[i]; + to_free[i] = get_qf_name(sb, sbi, i); + rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]); } + synchronize_rcu(); + for (i = 0; i < EXT4_MAXQUOTAS; i++) + kfree(to_free[i]); #endif kfree(orig_data); return err; @@ -5541,7 +5569,7 @@ static int ext4_write_info(struct super_block *sb, int type) */ static int ext4_quota_on_mount(struct super_block *sb, int type) { - return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], + return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type), EXT4_SB(sb)->s_jquota_fmt, type); } @@ -5950,6 +5978,10 @@ static int __init ext4_init_fs(void) if (err) return err; + err = ext4_init_pending(); + if (err) + goto out6; + err = ext4_init_pageio(); if (err) goto out5; @@ -5988,6 +6020,8 @@ out3: out4: ext4_exit_pageio(); out5: + ext4_exit_pending(); +out6: ext4_exit_es(); return err; @@ -6005,6 +6039,7 @@ static void __exit ext4_exit_fs(void) ext4_exit_system_zone(); ext4_exit_pageio(); ext4_exit_es(); + ext4_exit_pending(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 111824199a88..fa707cdd4120 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/acl.c * @@ -7,10 +8,6 @@ * Portions of this code from linux/fs/ext2/acl.c * * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/f2fs_fs.h> #include "f2fs.h" @@ -53,6 +50,9 @@ static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size) struct f2fs_acl_entry *entry = (struct f2fs_acl_entry *)(hdr + 1); const char *end = value + size; + if (size < sizeof(struct f2fs_acl_header)) + return ERR_PTR(-EINVAL); + if (hdr->a_version != cpu_to_le32(F2FS_ACL_VERSION)) return ERR_PTR(-EINVAL); @@ -394,12 +394,16 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, ipage); posix_acl_release(default_acl); + } else { + inode->i_default_acl = NULL; } if (acl) { if (!error) error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, ipage); posix_acl_release(acl); + } else { + inode->i_acl = NULL; } return error; diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 2c685185c24d..b96823c59b15 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/acl.h * @@ -7,10 +8,6 @@ * Portions of this code from linux/fs/ext2/acl.h * * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #ifndef __F2FS_ACL_H__ #define __F2FS_ACL_H__ diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e8b6b89bddb8..9c28ea439e0b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/checkpoint.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/fs.h> #include <linux/bio.h> @@ -122,11 +119,8 @@ retry: if (PTR_ERR(page) == -EIO && ++count <= DEFAULT_RETRY_IO_COUNT) goto retry; - f2fs_stop_checkpoint(sbi, false); - f2fs_bug_on(sbi, 1); } - return page; } @@ -282,8 +276,7 @@ static int __f2fs_write_meta_page(struct page *page, dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) - f2fs_submit_merged_write_cond(sbi, page->mapping->host, - 0, page->index, META); + f2fs_submit_merged_write_cond(sbi, NULL, page, 0, META); unlock_page(page); @@ -696,6 +689,8 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) /* clear Orphan Flag */ clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); out: + set_sbi_flag(sbi, SBI_IS_RECOVERED); + #ifdef CONFIG_QUOTA /* Turn quotas off */ if (quota_enabled) @@ -1084,6 +1079,21 @@ static void __prepare_cp_block(struct f2fs_sb_info *sbi) ckpt->next_free_nid = cpu_to_le32(last_nid); } +static bool __need_flush_quota(struct f2fs_sb_info *sbi) +{ + if (!is_journalled_quota(sbi)) + return false; + if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) + return false; + if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) + return false; + if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_FLUSH)) + return true; + if (get_pages(sbi, F2FS_DIRTY_QDATA)) + return true; + return false; +} + /* * Freeze all the FS-operations for checkpoint. */ @@ -1095,12 +1105,36 @@ static int block_operations(struct f2fs_sb_info *sbi) .for_reclaim = 0, }; struct blk_plug plug; - int err = 0; + int err = 0, cnt = 0; blk_start_plug(&plug); -retry_flush_dents: +retry_flush_quotas: + if (__need_flush_quota(sbi)) { + int locked; + + if (++cnt > DEFAULT_RETRY_QUOTA_FLUSH_COUNT) { + set_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH); + f2fs_lock_all(sbi); + goto retry_flush_dents; + } + clear_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH); + + /* only failed during mount/umount/freeze/quotactl */ + locked = down_read_trylock(&sbi->sb->s_umount); + f2fs_quota_sync(sbi->sb, -1); + if (locked) + up_read(&sbi->sb->s_umount); + } + f2fs_lock_all(sbi); + if (__need_flush_quota(sbi)) { + f2fs_unlock_all(sbi); + cond_resched(); + goto retry_flush_quotas; + } + +retry_flush_dents: /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); @@ -1108,7 +1142,7 @@ retry_flush_dents: if (err) goto out; cond_resched(); - goto retry_flush_dents; + goto retry_flush_quotas; } /* @@ -1117,6 +1151,12 @@ retry_flush_dents: */ down_write(&sbi->node_change); + if (__need_flush_quota(sbi)) { + up_write(&sbi->node_change); + f2fs_unlock_all(sbi); + goto retry_flush_quotas; + } + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { up_write(&sbi->node_change); f2fs_unlock_all(sbi); @@ -1124,7 +1164,7 @@ retry_flush_dents: if (err) goto out; cond_resched(); - goto retry_flush_dents; + goto retry_flush_quotas; } retry_flush_nodes: @@ -1215,6 +1255,19 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) __set_ckpt_flags(ckpt, CP_FSCK_FLAG); + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED)) + __set_ckpt_flags(ckpt, CP_DISABLED_FLAG); + else + __clear_ckpt_flags(ckpt, CP_DISABLED_FLAG); + + if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) + __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); + else + __clear_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); + + if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) + __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); + /* set this flag to activate crc|cp_ver for recovery */ __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); __clear_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG); @@ -1422,6 +1475,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); + clear_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH); + sbi->unusable_block_count = 0; __set_cp_next_pack(sbi); /* @@ -1446,6 +1501,12 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned long long ckpt_ver; int err = 0; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (cpc->reason != CP_PAUSE) + return 0; + f2fs_msg(sbi->sb, KERN_WARNING, + "Start checkpoint disabled!"); + } mutex_lock(&sbi->cp_mutex); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && @@ -1497,7 +1558,10 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); /* write cached NAT/SIT entries to NAT/SIT area */ - f2fs_flush_nat_entries(sbi, cpc); + err = f2fs_flush_nat_entries(sbi, cpc); + if (err) + goto stop; + f2fs_flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ @@ -1506,7 +1570,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_release_discard_addrs(sbi); else f2fs_clear_prefree_segments(sbi, cpc); - +stop: unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 382c1ef9a9e4..106f116466bf 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/data.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/fs.h> #include <linux/f2fs_fs.h> @@ -49,12 +46,29 @@ static bool __is_cp_guaranteed(struct page *page) inode->i_ino == F2FS_NODE_INO(sbi) || S_ISDIR(inode->i_mode) || (S_ISREG(inode->i_mode) && - is_inode_flag_set(inode, FI_ATOMIC_FILE)) || + (f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) || is_cold_data(page)) return true; return false; } +static enum count_type __read_io_type(struct page *page) +{ + struct address_space *mapping = page->mapping; + + if (mapping) { + struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_META_INO(sbi)) + return F2FS_RD_META; + + if (inode->i_ino == F2FS_NODE_INO(sbi)) + return F2FS_RD_NODE; + } + return F2FS_RD_DATA; +} + /* postprocessing steps for read bios */ enum bio_post_read_step { STEP_INITIAL = 0, @@ -80,10 +94,12 @@ static void __read_end_io(struct bio *bio) /* PG_error was set if any post_read step failed */ if (bio->bi_status || PageError(page)) { ClearPageUptodate(page); - SetPageError(page); + /* will re-read again later */ + ClearPageError(page); } else { SetPageUptodate(page); } + dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } if (bio->bi_private) @@ -126,8 +142,9 @@ static bool f2fs_bio_post_read_required(struct bio *bio) static void f2fs_read_end_io(struct bio *bio) { - if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), FAULT_IO)) { - f2fs_show_injection_info(FAULT_IO); + if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), + FAULT_READ_IO)) { + f2fs_show_injection_info(FAULT_READ_IO); bio->bi_status = BLK_STS_IOERR; } @@ -148,6 +165,11 @@ static void f2fs_write_end_io(struct bio *bio) struct bio_vec *bvec; int i; + if (time_to_inject(sbi, FAULT_WRITE_IO)) { + f2fs_show_injection_info(FAULT_WRITE_IO); + bio->bi_status = BLK_STS_IOERR; + } + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; enum count_type type = WB_DATA_TYPE(page); @@ -319,8 +341,8 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) io->bio = NULL; } -static bool __has_merged_page(struct f2fs_bio_info *io, - struct inode *inode, nid_t ino, pgoff_t idx) +static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, + struct page *page, nid_t ino) { struct bio_vec *bvec; struct page *target; @@ -329,7 +351,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, if (!io->bio) return false; - if (!inode && !ino) + if (!inode && !page && !ino) return true; bio_for_each_segment_all(bvec, io->bio, i) { @@ -339,11 +361,10 @@ static bool __has_merged_page(struct f2fs_bio_info *io, else target = fscrypt_control_page(bvec->bv_page); - if (idx != target->index) - continue; - if (inode && inode == target->mapping->host) return true; + if (page && page == target) + return true; if (ino && ino == ino_of_node(target)) return true; } @@ -352,7 +373,8 @@ static bool __has_merged_page(struct f2fs_bio_info *io, } static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, - nid_t ino, pgoff_t idx, enum page_type type) + struct page *page, nid_t ino, + enum page_type type) { enum page_type btype = PAGE_TYPE_OF_BIO(type); enum temp_type temp; @@ -363,7 +385,7 @@ static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, io = sbi->write_io[btype] + temp; down_read(&io->io_rwsem); - ret = __has_merged_page(io, inode, ino, idx); + ret = __has_merged_page(io, inode, page, ino); up_read(&io->io_rwsem); /* TODO: use HOT temp only for meta pages now. */ @@ -394,12 +416,12 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, } static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, - struct inode *inode, nid_t ino, pgoff_t idx, - enum page_type type, bool force) + struct inode *inode, struct page *page, + nid_t ino, enum page_type type, bool force) { enum temp_type temp; - if (!force && !has_merged_page(sbi, inode, ino, idx, type)) + if (!force && !has_merged_page(sbi, inode, page, ino, type)) return; for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { @@ -418,10 +440,10 @@ void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) } void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, - struct inode *inode, nid_t ino, pgoff_t idx, - enum page_type type) + struct inode *inode, struct page *page, + nid_t ino, enum page_type type) { - __submit_merged_write_cond(sbi, inode, ino, idx, type, false); + __submit_merged_write_cond(sbi, inode, page, ino, type, false); } void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) @@ -456,12 +478,16 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) bio_put(bio); return -EFAULT; } + + if (fio->io_wbc && !is_read_io(fio->op)) + wbc_account_io(fio->io_wbc, page, PAGE_SIZE); + bio_set_op_attrs(bio, fio->op, fio->op_flags); - __submit_bio(fio->sbi, bio, fio->type); + inc_page_count(fio->sbi, is_read_io(fio->op) ? + __read_io_type(page): WB_DATA_TYPE(fio->page)); - if (!is_read_io(fio->op)) - inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page)); + __submit_bio(fio->sbi, bio, fio->type); return 0; } @@ -533,6 +559,9 @@ skip: if (fio->in_list) goto next; out: + if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || + f2fs_is_checkpoint_ready(sbi)) + __submit_merged_bio(io); up_write(&io->io_rwsem); } @@ -565,9 +594,6 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, ctx->bio = bio; ctx->enabled_steps = post_read_steps; bio->bi_private = ctx; - - /* wait the page to be moved by cleaning */ - f2fs_wait_on_block_writeback(sbi, blkaddr); } return bio; @@ -582,10 +608,15 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, if (IS_ERR(bio)) return PTR_ERR(bio); + /* wait for GCed page writeback via META_MAPPING */ + f2fs_wait_on_block_writeback(inode, blkaddr); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); return -EFAULT; } + ClearPageError(page); + inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); __submit_bio(F2FS_I_SB(inode), bio, DATA); return 0; } @@ -876,7 +907,6 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) struct f2fs_summary sum; struct node_info ni; block_t old_blkaddr; - pgoff_t fofs; blkcnt_t count = 1; int err; @@ -889,7 +919,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) dn->data_blkaddr = datablock_addr(dn->inode, dn->node_page, dn->ofs_in_node); - if (dn->data_blkaddr == NEW_ADDR) + if (dn->data_blkaddr != NULL_ADDR) goto alloc; if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) @@ -905,12 +935,10 @@ alloc: old_blkaddr, old_blkaddr); f2fs_set_data_blkaddr(dn); - /* update i_size */ - fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + - dn->ofs_in_node; - if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT)) - f2fs_i_size_write(dn->inode, - ((loff_t)(fofs + 1) << PAGE_SHIFT)); + /* + * i_size will be updated by direct_IO. Otherwise, we'll get stale + * data from unwritten block via dio_read. + */ return 0; } @@ -945,7 +973,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) if (direct_io) { map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint); - flag = f2fs_force_buffered_io(inode, WRITE) ? + flag = f2fs_force_buffered_io(inode, iocb, from) ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; goto map_blocks; @@ -970,7 +998,7 @@ map_blocks: return err; } -static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) +void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) { if (flag == F2FS_GET_BLOCK_PRE_AIO) { if (lock) @@ -1025,6 +1053,11 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, map->m_flags = F2FS_MAP_MAPPED; if (map->m_next_extent) *map->m_next_extent = pgofs + map->m_len; + + /* for hardware encryption, but to avoid potential issue in future */ + if (flag == F2FS_GET_BLOCK_DIO) + f2fs_wait_on_block_writeback_range(inode, + map->m_pblk, map->m_len); goto out; } @@ -1064,7 +1097,15 @@ next_block: goto sync_out; } - if (!is_valid_data_blkaddr(sbi, blkaddr)) { + if (is_valid_data_blkaddr(sbi, blkaddr)) { + /* use out-place-update for driect IO under LFS mode */ + if (test_opt(sbi, LFS) && create && + flag == F2FS_GET_BLOCK_DIO) { + err = __allocate_data_block(&dn, map->m_seg_type); + if (!err) + set_inode_flag(inode, FI_APPEND_WRITE); + } + } else { if (create) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; @@ -1076,6 +1117,8 @@ next_block: last_ofs_in_node = dn.ofs_in_node; } } else { + WARN_ON(flag != F2FS_GET_BLOCK_PRE_DIO && + flag != F2FS_GET_BLOCK_DIO); err = __allocate_data_block(&dn, map->m_seg_type); if (!err) @@ -1173,6 +1216,12 @@ skip: goto next_dnode; sync_out: + + /* for hardware encryption, but to avoid potential issue in future */ + if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) + f2fs_wait_on_block_writeback_range(inode, + map->m_pblk, map->m_len); + if (flag == F2FS_GET_BLOCK_PRECACHE) { if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; @@ -1255,7 +1304,7 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DEFAULT, NULL, + F2FS_GET_BLOCK_DIO, NULL, f2fs_rw_hint_to_seg_type( inode->i_write_hint)); } @@ -1558,9 +1607,17 @@ submit_and_realloc: } } + /* + * If the page is under writeback, we need to wait for + * its completion to see the correct decrypted data. + */ + f2fs_wait_on_block_writeback(inode, block_nr); + if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; + inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); + ClearPageError(page); last_block_in_bio = block_nr; goto next_page; set_error_page: @@ -1625,7 +1682,7 @@ static int encrypt_one_page(struct f2fs_io_info *fio) return 0; /* wait for GCed page writeback via META_MAPPING */ - f2fs_wait_on_block_writeback(fio->sbi, fio->old_blkaddr); + f2fs_wait_on_block_writeback(inode, fio->old_blkaddr); retry_encrypt: fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, @@ -1682,6 +1739,10 @@ static inline bool check_inplace_update_policy(struct inode *inode, is_inode_flag_set(inode, FI_NEED_IPU)) return true; + if (unlikely(fio && is_sbi_flag_set(sbi, SBI_CP_DISABLED) && + !f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) + return true; + return false; } @@ -1705,6 +1766,8 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) return true; if (S_ISDIR(inode->i_mode)) return true; + if (IS_NOQUOTA(inode)) + return true; if (f2fs_is_atomic_file(inode)) return true; if (fio) { @@ -1712,6 +1775,9 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) return true; if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) return true; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && + f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) + return true; } return false; } @@ -1763,6 +1829,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) /* This page is already truncated */ if (fio->old_blkaddr == NULL_ADDR) { ClearPageUptodate(page); + clear_cold_data(page); goto out_writepage; } got_it: @@ -1938,18 +2005,20 @@ done: out: inode_dec_dirty_pages(inode); - if (err) + if (err) { ClearPageUptodate(page); + clear_cold_data(page); + } if (wbc->for_reclaim) { - f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA); + f2fs_submit_merged_write_cond(sbi, NULL, page, 0, DATA); clear_inode_flag(inode, FI_HOT_DATA); f2fs_remove_dirty_inode(inode); submitted = NULL; } unlock_page(page); - if (!S_ISDIR(inode->i_mode)) + if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode)) f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { @@ -2000,10 +2069,10 @@ static int f2fs_write_cache_pages(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; - pgoff_t last_idx = ULONG_MAX; int cycled; int range_whole = 0; int tag; + int nwritten = 0; pagevec_init(&pvec); @@ -2106,7 +2175,7 @@ continue_unlock: done = 1; break; } else if (submitted) { - last_idx = page->index; + nwritten++; } if (--wbc->nr_to_write <= 0 && @@ -2128,9 +2197,9 @@ continue_unlock: if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; - if (last_idx != ULONG_MAX) + if (nwritten) f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host, - 0, last_idx, DATA); + NULL, 0, DATA); return ret; } @@ -2140,6 +2209,8 @@ static inline bool __should_serialize_io(struct inode *inode, { if (!S_ISREG(inode->i_mode)) return false; + if (IS_NOQUOTA(inode)) + return false; if (wbc->sync_mode != WB_SYNC_ALL) return true; if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks) @@ -2169,7 +2240,8 @@ static int __f2fs_write_data_pages(struct address_space *mapping, if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto skip_write; - if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && + if ((S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) && + wbc->sync_mode == WB_SYNC_NONE && get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && f2fs_available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; @@ -2234,7 +2306,7 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); - f2fs_truncate_blocks(inode, i_size, true); + f2fs_truncate_blocks(inode, i_size, true, true); up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -2332,6 +2404,10 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, trace_f2fs_write_begin(inode, pos, len, flags); + err = f2fs_is_checkpoint_ready(sbi); + if (err) + goto fail; + if ((f2fs_is_atomic_file(inode) && !f2fs_available_free_memory(sbi, INMEM_PAGES)) || is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { @@ -2369,7 +2445,8 @@ repeat: if (err) goto fail; - if (need_balance && has_not_enough_free_secs(sbi, 0, 0)) { + if (need_balance && !IS_NOQUOTA(inode) && + has_not_enough_free_secs(sbi, 0, 0)) { unlock_page(page); f2fs_balance_fs(sbi, true); lock_page(page); @@ -2382,10 +2459,6 @@ repeat: f2fs_wait_on_page_writeback(page, DATA, false); - /* wait for GCed page writeback via META_MAPPING */ - if (f2fs_post_read_required(inode)) - f2fs_wait_on_block_writeback(sbi, blkaddr); - if (len == PAGE_SIZE || PageUptodate(page)) return 0; @@ -2480,36 +2553,53 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); size_t count = iov_iter_count(iter); loff_t offset = iocb->ki_pos; int rw = iov_iter_rw(iter); int err; enum rw_hint hint = iocb->ki_hint; int whint_mode = F2FS_OPTION(sbi).whint_mode; + bool do_opu; err = check_direct_IO(inode, iter, offset); if (err) return err < 0 ? err : 0; - if (f2fs_force_buffered_io(inode, rw)) + if (f2fs_force_buffered_io(inode, iocb, iter)) return 0; + do_opu = allow_outplace_dio(inode, iocb, iter); + trace_f2fs_direct_IO_enter(inode, offset, count, rw); if (rw == WRITE && whint_mode == WHINT_MODE_OFF) iocb->ki_hint = WRITE_LIFE_NOT_SET; - if (!down_read_trylock(&F2FS_I(inode)->i_gc_rwsem[rw])) { - if (iocb->ki_flags & IOCB_NOWAIT) { + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!down_read_trylock(&fi->i_gc_rwsem[rw])) { iocb->ki_hint = hint; err = -EAGAIN; goto out; } - down_read(&F2FS_I(inode)->i_gc_rwsem[rw]); + if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) { + up_read(&fi->i_gc_rwsem[rw]); + iocb->ki_hint = hint; + err = -EAGAIN; + goto out; + } + } else { + down_read(&fi->i_gc_rwsem[rw]); + if (do_opu) + down_read(&fi->i_gc_rwsem[READ]); } err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio); - up_read(&F2FS_I(inode)->i_gc_rwsem[rw]); + + if (do_opu) + up_read(&fi->i_gc_rwsem[READ]); + + up_read(&fi->i_gc_rwsem[rw]); if (rw == WRITE) { if (whint_mode == WHINT_MODE_OFF) @@ -2517,7 +2607,8 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (err > 0) { f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, err); - set_inode_flag(inode, FI_UPDATE_WRITE); + if (!do_opu) + set_inode_flag(inode, FI_UPDATE_WRITE); } else if (err < 0) { f2fs_write_failed(mapping, offset + count); } @@ -2550,6 +2641,8 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, } } + clear_cold_data(page); + /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) return f2fs_drop_inmem_page(inode, page); @@ -2568,6 +2661,7 @@ int f2fs_release_page(struct page *page, gfp_t wait) if (IS_ATOMIC_WRITTEN_PAGE(page)) return 0; + clear_cold_data(page); set_page_private(page, 0); ClearPagePrivate(page); return 1; @@ -2583,10 +2677,6 @@ static int f2fs_set_data_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); - /* don't remain PG_checked flag which was set during GC */ - if (is_cold_data(page)) - clear_cold_data(page); - if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { f2fs_register_inmem_page(inode, page); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 214a968962a1..139b4d5c83d5 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * f2fs debugging statistics * @@ -5,10 +6,6 @@ * http://www.samsung.com/ * Copyright (c) 2012 Linux Foundation * Copyright (c) 2012 Greg Kroah-Hartman <gregkh@linuxfoundation.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/fs.h> @@ -58,6 +55,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); + si->nr_rd_data = get_pages(sbi, F2FS_RD_DATA); + si->nr_rd_node = get_pages(sbi, F2FS_RD_NODE); + si->nr_rd_meta = get_pages(sbi, F2FS_RD_META); if (SM_I(sbi) && SM_I(sbi)->fcc_info) { si->nr_flushed = atomic_read(&SM_I(sbi)->fcc_info->issued_flush); @@ -104,6 +104,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->avail_nids = NM_I(sbi)->available_nids; si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; si->bg_gc = sbi->bg_gc; + si->io_skip_bggc = sbi->io_skip_bggc; + si->other_skip_bggc = sbi->other_skip_bggc; si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC]; si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC]; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) @@ -121,6 +123,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); } + for (i = META_CP; i < META_MAX; i++) + si->meta_count[i] = atomic_read(&sbi->meta_count[i]); + for (i = 0; i < 2; i++) { si->segment_count[i] = sbi->segment_count[i]; si->block_count[i] = sbi->block_count[i]; @@ -190,8 +195,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry); si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); - if (f2fs_discard_en(sbi)) - si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); si->base_mem += SIT_VBLOCK_MAP_SIZE; if (sbi->segs_per_sec > 1) si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); @@ -271,7 +275,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n", si->sbi->sb->s_bdev, i++, f2fs_readonly(si->sbi->sb) ? "RO": "RW", - f2fs_cp_error(si->sbi) ? "Error": "Good"); + is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ? + "Disabled": (f2fs_cp_error(si->sbi) ? "Error": "Good")); seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", @@ -333,6 +338,13 @@ static int stat_show(struct seq_file *s, void *v) si->prefree_count, si->free_segs, si->free_secs); seq_printf(s, "CP calls: %d (BG: %d)\n", si->cp_count, si->bg_cp_count); + seq_printf(s, " - cp blocks : %u\n", si->meta_count[META_CP]); + seq_printf(s, " - sit blocks : %u\n", + si->meta_count[META_SIT]); + seq_printf(s, " - nat blocks : %u\n", + si->meta_count[META_NAT]); + seq_printf(s, " - ssa blocks : %u\n", + si->meta_count[META_SSA]); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", @@ -349,6 +361,8 @@ static int stat_show(struct seq_file *s, void *v) si->skipped_atomic_files[BG_GC] + si->skipped_atomic_files[FG_GC], si->skipped_atomic_files[BG_GC]); + seq_printf(s, "BG skip : IO: %u, Other: %u\n", + si->io_skip_bggc, si->other_skip_bggc); seq_puts(s, "\nExtent Cache:\n"); seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n", si->hit_largest, si->hit_cached, @@ -360,7 +374,9 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), " + seq_printf(s, " - IO_R (Data: %4d, Node: %4d, Meta: %4d\n", + si->nr_rd_data, si->nr_rd_node, si->nr_rd_meta); + seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), " "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", si->nr_wb_cp_data, si->nr_wb_data, si->nr_flushing, si->nr_flushed, @@ -445,6 +461,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; + int i; si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL); if (!si) @@ -470,6 +487,8 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inline_inode, 0); atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->inplace_count, 0); + for (i = META_CP; i < META_MAX; i++) + atomic_set(&sbi->meta_count[i], 0); atomic_set(&sbi->aw_cnt, 0); atomic_set(&sbi->vw_cnt, 0); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index ecc3a4e2be96..2ef84b4590ea 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/dir.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/fs.h> #include <linux/f2fs_fs.h> @@ -658,9 +655,9 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) f2fs_put_page(page, 1); clear_inode_flag(inode, FI_NEW_INODE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); fail: up_write(&F2FS_I(inode)->i_sem); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return err; } @@ -733,6 +730,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); + clear_cold_data(page); inode_dec_dirty_pages(dir); f2fs_remove_dirty_inode(dir); } @@ -784,9 +782,15 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, struct f2fs_dir_entry *de = NULL; struct fscrypt_str de_name = FSTR_INIT(NULL, 0); struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); + struct blk_plug plug; + bool readdir_ra = sbi->readdir_ra == 1; + int err = 0; bit_pos = ((unsigned long)ctx->pos % d->max); + if (readdir_ra) + blk_start_plug(&plug); + while (bit_pos < d->max) { bit_pos = find_next_bit_le(d->bitmap, d->max, bit_pos); if (bit_pos >= d->max) @@ -806,29 +810,33 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (f2fs_encrypted_inode(d->inode)) { int save_len = fstr->len; - int err; err = fscrypt_fname_disk_to_usr(d->inode, (u32)de->hash_code, 0, &de_name, fstr); if (err) - return err; + goto out; de_name = *fstr; fstr->len = save_len; } if (!dir_emit(ctx, de_name.name, de_name.len, - le32_to_cpu(de->ino), d_type)) - return 1; + le32_to_cpu(de->ino), d_type)) { + err = 1; + goto out; + } - if (sbi->readdir_ra == 1) + if (readdir_ra) f2fs_ra_node_page(sbi, le32_to_cpu(de->ino)); bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; } - return 0; +out: + if (readdir_ra) + blk_finish_plug(&plug); + return err; } static int f2fs_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 231b77ef5a53..1cb0fcc67d2d 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * f2fs extent cache support * @@ -5,10 +6,6 @@ * Copyright (c) 2015 Samsung Electronics * Authors: Jaegeuk Kim <jaegeuk@kernel.org> * Chao Yu <chao2.yu@samsung.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/fs.h> @@ -30,10 +27,10 @@ static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, return NULL; } -static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root, +static struct rb_entry *__lookup_rb_tree_slow(struct rb_root_cached *root, unsigned int ofs) { - struct rb_node *node = root->rb_node; + struct rb_node *node = root->rb_root.rb_node; struct rb_entry *re; while (node) { @@ -49,7 +46,7 @@ static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root, return NULL; } -struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root, +struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, struct rb_entry *cached_re, unsigned int ofs) { struct rb_entry *re; @@ -62,22 +59,25 @@ struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root, } struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, - struct rb_root *root, struct rb_node **parent, - unsigned int ofs) + struct rb_root_cached *root, + struct rb_node **parent, + unsigned int ofs, bool *leftmost) { - struct rb_node **p = &root->rb_node; + struct rb_node **p = &root->rb_root.rb_node; struct rb_entry *re; while (*p) { *parent = *p; re = rb_entry(*parent, struct rb_entry, rb_node); - if (ofs < re->ofs) + if (ofs < re->ofs) { p = &(*p)->rb_left; - else if (ofs >= re->ofs + re->len) + } else if (ofs >= re->ofs + re->len) { p = &(*p)->rb_right; - else + *leftmost = false; + } else { f2fs_bug_on(sbi, 1); + } } return p; @@ -92,16 +92,16 @@ struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, * in order to simpfy the insertion after. * tree must stay unchanged between lookup and insertion. */ -struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root, +struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, struct rb_entry *cached_re, unsigned int ofs, struct rb_entry **prev_entry, struct rb_entry **next_entry, struct rb_node ***insert_p, struct rb_node **insert_parent, - bool force) + bool force, bool *leftmost) { - struct rb_node **pnode = &root->rb_node; + struct rb_node **pnode = &root->rb_root.rb_node; struct rb_node *parent = NULL, *tmp_node; struct rb_entry *re = cached_re; @@ -110,7 +110,7 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root, *prev_entry = NULL; *next_entry = NULL; - if (RB_EMPTY_ROOT(root)) + if (RB_EMPTY_ROOT(&root->rb_root)) return NULL; if (re) { @@ -118,16 +118,22 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root, goto lookup_neighbors; } + if (leftmost) + *leftmost = true; + while (*pnode) { parent = *pnode; re = rb_entry(*pnode, struct rb_entry, rb_node); - if (ofs < re->ofs) + if (ofs < re->ofs) { pnode = &(*pnode)->rb_left; - else if (ofs >= re->ofs + re->len) + } else if (ofs >= re->ofs + re->len) { pnode = &(*pnode)->rb_right; - else + if (leftmost) + *leftmost = false; + } else { goto lookup_neighbors; + } } *insert_p = pnode; @@ -160,10 +166,10 @@ lookup_neighbors: } bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root *root) + struct rb_root_cached *root) { #ifdef CONFIG_F2FS_CHECK_FS - struct rb_node *cur = rb_first(root), *next; + struct rb_node *cur = rb_first_cached(root), *next; struct rb_entry *cur_re, *next_re; if (!cur) @@ -196,7 +202,8 @@ static struct kmem_cache *extent_node_slab; static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei, - struct rb_node *parent, struct rb_node **p) + struct rb_node *parent, struct rb_node **p, + bool leftmost) { struct extent_node *en; @@ -209,7 +216,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, en->et = et; rb_link_node(&en->rb_node, parent, p); - rb_insert_color(&en->rb_node, &et->root); + rb_insert_color_cached(&en->rb_node, &et->root, leftmost); atomic_inc(&et->node_cnt); atomic_inc(&sbi->total_ext_node); return en; @@ -218,7 +225,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, static void __detach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { - rb_erase(&en->rb_node, &et->root); + rb_erase_cached(&en->rb_node, &et->root); atomic_dec(&et->node_cnt); atomic_dec(&sbi->total_ext_node); @@ -257,7 +264,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); memset(et, 0, sizeof(struct extent_tree)); et->ino = ino; - et->root = RB_ROOT; + et->root = RB_ROOT_CACHED; et->cached_en = NULL; rwlock_init(&et->lock); INIT_LIST_HEAD(&et->list); @@ -278,10 +285,10 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei) { - struct rb_node **p = &et->root.rb_node; + struct rb_node **p = &et->root.rb_root.rb_node; struct extent_node *en; - en = __attach_extent_node(sbi, et, ei, NULL, p); + en = __attach_extent_node(sbi, et, ei, NULL, p, true); if (!en) return NULL; @@ -297,7 +304,7 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, struct extent_node *en; unsigned int count = atomic_read(&et->node_cnt); - node = rb_first(&et->root); + node = rb_first_cached(&et->root); while (node) { next = rb_next(node); en = rb_entry(node, struct extent_node, rb_node); @@ -308,14 +315,13 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, return count - atomic_read(&et->node_cnt); } -static void __drop_largest_extent(struct inode *inode, +static void __drop_largest_extent(struct extent_tree *et, pgoff_t fofs, unsigned int len) { - struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest; - - if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) { - largest->len = 0; - f2fs_mark_inode_dirty_sync(inode, true); + if (fofs < et->largest.fofs + et->largest.len && + fofs + len > et->largest.fofs) { + et->largest.len = 0; + et->largest_updated = true; } } @@ -416,12 +422,11 @@ out: return ret; } -static struct extent_node *__try_merge_extent_node(struct inode *inode, +static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei, struct extent_node *prev_ex, struct extent_node *next_ex) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_node *en = NULL; if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { @@ -443,7 +448,7 @@ static struct extent_node *__try_merge_extent_node(struct inode *inode, if (!en) return NULL; - __try_update_largest_extent(inode, et, en); + __try_update_largest_extent(et, en); spin_lock(&sbi->extent_lock); if (!list_empty(&en->list)) { @@ -454,12 +459,12 @@ static struct extent_node *__try_merge_extent_node(struct inode *inode, return en; } -static struct extent_node *__insert_extent_tree(struct inode *inode, +static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei, struct rb_node **insert_p, - struct rb_node *insert_parent) + struct rb_node *insert_parent, + bool leftmost) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct rb_node **p; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -470,13 +475,16 @@ static struct extent_node *__insert_extent_tree(struct inode *inode, goto do_insert; } - p = f2fs_lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs); + leftmost = true; + + p = f2fs_lookup_rb_tree_for_insert(sbi, &et->root, &parent, + ei->fofs, &leftmost); do_insert: - en = __attach_extent_node(sbi, et, ei, parent, p); + en = __attach_extent_node(sbi, et, ei, parent, p, leftmost); if (!en) return NULL; - __try_update_largest_extent(inode, et, en); + __try_update_largest_extent(et, en); /* update in global extent list */ spin_lock(&sbi->extent_lock); @@ -497,6 +505,8 @@ static void f2fs_update_extent_tree_range(struct inode *inode, struct rb_node **insert_p = NULL, *insert_parent = NULL; unsigned int end = fofs + len; unsigned int pos = (unsigned int)fofs; + bool updated = false; + bool leftmost; if (!et) return; @@ -517,14 +527,15 @@ static void f2fs_update_extent_tree_range(struct inode *inode, * drop largest extent before lookup, in case it's already * been shrunk from extent tree */ - __drop_largest_extent(inode, fofs, len); + __drop_largest_extent(et, fofs, len); /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, (struct rb_entry *)et->cached_en, fofs, (struct rb_entry **)&prev_en, (struct rb_entry **)&next_en, - &insert_p, &insert_parent, false); + &insert_p, &insert_parent, false, + &leftmost); if (!en) en = next_en; @@ -550,8 +561,8 @@ static void f2fs_update_extent_tree_range(struct inode *inode, set_extent_info(&ei, end, end - dei.fofs + dei.blk, org_end - end); - en1 = __insert_extent_tree(inode, et, &ei, - NULL, NULL); + en1 = __insert_extent_tree(sbi, et, &ei, + NULL, NULL, true); next_en = en1; } else { en->ei.fofs = end; @@ -570,7 +581,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, } if (parts) - __try_update_largest_extent(inode, et, en); + __try_update_largest_extent(et, en); else __release_extent_node(sbi, et, en); @@ -590,15 +601,16 @@ static void f2fs_update_extent_tree_range(struct inode *inode, if (blkaddr) { set_extent_info(&ei, fofs, blkaddr, len); - if (!__try_merge_extent_node(inode, et, &ei, prev_en, next_en)) - __insert_extent_tree(inode, et, &ei, - insert_p, insert_parent); + if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) + __insert_extent_tree(sbi, et, &ei, + insert_p, insert_parent, leftmost); /* give up extent_cache, if split and small updates happen */ if (dei.len >= 1 && prev.len < F2FS_MIN_EXTENT_LEN && et->largest.len < F2FS_MIN_EXTENT_LEN) { - __drop_largest_extent(inode, 0, UINT_MAX); + et->largest.len = 0; + et->largest_updated = true; set_inode_flag(inode, FI_NO_EXTENT); } } @@ -606,7 +618,15 @@ static void f2fs_update_extent_tree_range(struct inode *inode, if (is_inode_flag_set(inode, FI_NO_EXTENT)) __free_extent_tree(sbi, et); + if (et->largest_updated) { + et->largest_updated = false; + updated = true; + } + write_unlock(&et->lock); + + if (updated) + f2fs_mark_inode_dirty_sync(inode, true); } unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) @@ -705,6 +725,7 @@ void f2fs_drop_extent_tree(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et = F2FS_I(inode)->extent_tree; + bool updated = false; if (!f2fs_may_extent_tree(inode)) return; @@ -713,8 +734,13 @@ void f2fs_drop_extent_tree(struct inode *inode) write_lock(&et->lock); __free_extent_tree(sbi, et); - __drop_largest_extent(inode, 0, UINT_MAX); + if (et->largest.len) { + et->largest.len = 0; + updated = true; + } write_unlock(&et->lock); + if (updated) + f2fs_mark_inode_dirty_sync(inode, true); } void f2fs_destroy_extent_tree(struct inode *inode) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index abf925664d9c..56204a8f8a12 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1,16 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/f2fs.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #ifndef _LINUX_F2FS_H #define _LINUX_F2FS_H +#include <linux/uio.h> #include <linux/types.h> #include <linux/page-flags.h> #include <linux/buffer_head.h> @@ -53,9 +51,10 @@ enum { FAULT_DIR_DEPTH, FAULT_EVICT_INODE, FAULT_TRUNCATE, - FAULT_IO, + FAULT_READ_IO, FAULT_CHECKPOINT, FAULT_DISCARD, + FAULT_WRITE_IO, FAULT_MAX, }; @@ -100,6 +99,7 @@ extern char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_QUOTA 0x00400000 #define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 #define F2FS_MOUNT_RESERVE_ROOT 0x01000000 +#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -150,6 +150,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_INODE_CRTIME 0x0100 #define F2FS_FEATURE_LOST_FOUND 0x0200 #define F2FS_FEATURE_VERITY 0x0400 /* reserved */ +#define F2FS_FEATURE_SB_CHKSUM 0x0800 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -178,6 +179,7 @@ enum { #define CP_RECOVERY 0x00000008 #define CP_DISCARD 0x00000010 #define CP_TRIMMED 0x00000020 +#define CP_PAUSE 0x00000040 #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ @@ -187,6 +189,7 @@ enum { #define DEF_DISCARD_URGENT_UTIL 80 /* do more discard over 80% */ #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ +#define DEF_DISABLE_INTERVAL 5 /* 5 secs */ struct cp_control { int reason; @@ -203,6 +206,7 @@ enum { META_NAT, META_SIT, META_SSA, + META_MAX, META_POR, DATA_GENERIC, META_GENERIC, @@ -324,7 +328,7 @@ struct discard_cmd_control { atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ atomic_t discard_cmd_cnt; /* # of cached cmd count */ - struct rb_root root; /* root of discard rb-tree */ + struct rb_root_cached root; /* root of discard rb-tree */ bool rbtree_check; /* config for consistence check */ }; @@ -527,6 +531,9 @@ enum { #define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */ +/* maximum retry quota flush count */ +#define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8 + #define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ @@ -566,12 +573,13 @@ struct extent_node { struct extent_tree { nid_t ino; /* inode number */ - struct rb_root root; /* root of extent info rb-tree */ + struct rb_root_cached root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ struct extent_info largest; /* largested extent info */ struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ atomic_t node_cnt; /* # of extent node in rb-tree*/ + bool largest_updated; /* largest extent updated */ }; /* @@ -600,6 +608,7 @@ enum { F2FS_GET_BLOCK_DEFAULT, F2FS_GET_BLOCK_FIEMAP, F2FS_GET_BLOCK_BMAP, + F2FS_GET_BLOCK_DIO, F2FS_GET_BLOCK_PRE_DIO, F2FS_GET_BLOCK_PRE_AIO, F2FS_GET_BLOCK_PRECACHE, @@ -754,12 +763,12 @@ static inline bool __is_front_mergeable(struct extent_info *cur, } extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); -static inline void __try_update_largest_extent(struct inode *inode, - struct extent_tree *et, struct extent_node *en) +static inline void __try_update_largest_extent(struct extent_tree *et, + struct extent_node *en) { if (en->ei.len > et->largest.len) { et->largest = en->ei; - f2fs_mark_inode_dirty_sync(inode, true); + et->largest_updated = true; } } @@ -944,6 +953,9 @@ enum count_type { F2FS_DIRTY_IMETA, F2FS_WB_CP_DATA, F2FS_WB_DATA, + F2FS_RD_DATA, + F2FS_RD_NODE, + F2FS_RD_META, NR_COUNT_TYPE, }; @@ -1088,11 +1100,19 @@ enum { SBI_NEED_SB_WRITE, /* need to recover superblock */ SBI_NEED_CP, /* need to checkpoint */ SBI_IS_SHUTDOWN, /* shutdown by ioctl */ + SBI_IS_RECOVERED, /* recovered orphan/data */ + SBI_CP_DISABLED, /* CP was disabled last mount */ + SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */ + SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */ + SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ }; enum { CP_TIME, REQ_TIME, + DISCARD_TIME, + GC_TIME, + DISABLE_TIME, MAX_TIME, }; @@ -1209,7 +1229,6 @@ struct f2fs_sb_info { unsigned int total_valid_node_count; /* valid node block count */ loff_t max_file_blocks; /* max block index of file */ int dir_level; /* directory level */ - unsigned int trigger_ssr_threshold; /* threshold to trigger ssr */ int readdir_ra; /* readahead inode in readdir */ block_t user_block_count; /* # of user blocks */ @@ -1219,6 +1238,9 @@ struct f2fs_sb_info { block_t reserved_blocks; /* configurable reserved blocks */ block_t current_reserved_blocks; /* current reserved blocks */ + /* Additional tracking for no checkpoint mode */ + block_t unusable_block_count; /* # of blocks saved by last cp */ + unsigned int nquota_files; /* # of quota sysfile */ u32 s_next_generation; /* for NFS support */ @@ -1257,6 +1279,7 @@ struct f2fs_sb_info { */ #ifdef CONFIG_F2FS_STAT_FS struct f2fs_stat_info *stat_info; /* FS status information */ + atomic_t meta_count[META_MAX]; /* # of meta blocks */ unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ @@ -1272,6 +1295,8 @@ struct f2fs_sb_info { atomic_t max_aw_cnt; /* max # of atomic writes */ atomic_t max_vw_cnt; /* max # of volatile writes */ int bg_gc; /* background gc calls */ + unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ + unsigned int other_skip_bggc; /* skip background gc for other reasons */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif spinlock_t stat_lock; /* lock for stat operations */ @@ -1306,9 +1331,9 @@ struct f2fs_sb_info { }; #ifdef CONFIG_F2FS_FAULT_INJECTION -#define f2fs_show_injection_info(type) \ - printk("%sF2FS-fs : inject %s in %s of %pF\n", \ - KERN_INFO, f2fs_fault_name[type], \ +#define f2fs_show_injection_info(type) \ + printk_ratelimited("%sF2FS-fs : inject %s in %s of %pF\n", \ + KERN_INFO, f2fs_fault_name[type], \ __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { @@ -1344,7 +1369,15 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) { - sbi->last_time[type] = jiffies; + unsigned long now = jiffies; + + sbi->last_time[type] = now; + + /* DISCARD_TIME and GC_TIME are based on REQ_TIME */ + if (type == REQ_TIME) { + sbi->last_time[DISCARD_TIME] = now; + sbi->last_time[GC_TIME] = now; + } } static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) @@ -1354,16 +1387,18 @@ static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) return time_after(jiffies, sbi->last_time[type] + interval); } -static inline bool is_idle(struct f2fs_sb_info *sbi) +static inline unsigned int f2fs_time_to_wait(struct f2fs_sb_info *sbi, + int type) { - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - struct request_list *rl = &q->root_rl; + unsigned long interval = sbi->interval_time[type] * HZ; + unsigned int wait_ms = 0; + long delta; - if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC]) - return false; + delta = (sbi->last_time[type] + interval) - jiffies; + if (delta > 0) + wait_ms = jiffies_to_msecs(delta); - return f2fs_time_over(sbi, REQ_TIME); + return wait_ms; } /* @@ -1704,7 +1739,8 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, if (!__allow_reserved_blocks(sbi, inode, true)) avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; - + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + avail_user_block_count -= sbi->unusable_block_count; if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { diff = sbi->total_valid_block_count - avail_user_block_count; if (diff > *count) @@ -1755,7 +1791,9 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) atomic_inc(&sbi->nr_pages[count_type]); if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES || - count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA) + count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA || + count_type == F2FS_RD_DATA || count_type == F2FS_RD_NODE || + count_type == F2FS_RD_META) return; set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -1891,12 +1929,18 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, { block_t valid_block_count; unsigned int valid_node_count; - bool quota = inode && !is_inode; + int err; - if (quota) { - int ret = dquot_reserve_block(inode, 1); - if (ret) - return ret; + if (is_inode) { + if (inode) { + err = dquot_alloc_inode(inode); + if (err) + return err; + } + } else { + err = dquot_reserve_block(inode, 1); + if (err) + return err; } if (time_to_inject(sbi, FAULT_BLOCK)) { @@ -1911,6 +1955,8 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, if (!__allow_reserved_blocks(sbi, inode, false)) valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + valid_block_count += sbi->unusable_block_count; if (unlikely(valid_block_count > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); @@ -1938,8 +1984,12 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, return 0; enospc: - if (quota) + if (is_inode) { + if (inode) + dquot_free_inode(inode); + } else { dquot_release_reservation_block(inode, 1); + } return -ENOSPC; } @@ -1960,7 +2010,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, spin_unlock(&sbi->stat_lock); - if (!is_inode) + if (is_inode) + dquot_free_inode(inode); + else f2fs_i_blocks_write(inode, 1, false, true); } @@ -2090,6 +2142,15 @@ static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, return bio_alloc(GFP_KERNEL, npages); } +static inline bool is_idle(struct f2fs_sb_info *sbi, int type) +{ + if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) || + get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) || + get_pages(sbi, F2FS_WB_CP_DATA)) + return false; + return f2fs_time_over(sbi, type); +} + static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item) { @@ -2739,7 +2800,8 @@ static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi, */ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void f2fs_truncate_data_blocks(struct dnode_of_data *dn); -int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, + bool buf_write); int f2fs_truncate(struct inode *inode); int f2fs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); @@ -2749,6 +2811,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count); int f2fs_precache_extents(struct inode *inode); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid); int f2fs_pin_file_control(struct inode *inode, bool inc); /* @@ -2827,6 +2890,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); +int f2fs_quota_sync(struct super_block *sb, int type); void f2fs_quota_off_umount(struct super_block *sb); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); @@ -2869,7 +2933,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); struct page *f2fs_get_node_page_ra(struct page *parent, int start); -void f2fs_move_node_page(struct page *node_page, int gc_type); +int f2fs_move_node_page(struct page *node_page, int gc_type); int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic, unsigned int *seq_id); @@ -2886,7 +2950,7 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page); int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); -void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_build_node_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi); int __init f2fs_create_node_manager_caches(void); @@ -2914,6 +2978,8 @@ void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi); bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi); +int f2fs_disable_cp_again(struct f2fs_sb_info *sbi); void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); @@ -2942,7 +3008,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_io_info *fio, bool add_list); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered); -void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); +void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, + block_t len); void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, @@ -3002,8 +3070,8 @@ int f2fs_init_post_read_processing(void); void f2fs_destroy_post_read_processing(void); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, - struct inode *inode, nid_t ino, pgoff_t idx, - enum page_type type); + struct inode *inode, struct page *page, + nid_t ino, enum page_type type); void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); int f2fs_submit_page_bio(struct f2fs_io_info *fio); void f2fs_submit_page_write(struct f2fs_io_info *fio); @@ -3025,6 +3093,7 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, struct page *f2fs_get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size); int f2fs_do_write_data_page(struct f2fs_io_info *fio); +void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock); int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -3077,6 +3146,8 @@ struct f2fs_stat_info { int free_nids, avail_nids, alloc_nids; int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; + int nr_rd_data, nr_rd_node, nr_rd_meta; + unsigned int io_skip_bggc, other_skip_bggc; int nr_flushing, nr_flushed, flush_list_empty; int nr_discarding, nr_discarded; int nr_discard_cmd; @@ -3098,6 +3169,7 @@ struct f2fs_stat_info { int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; + unsigned int meta_count[META_MAX]; unsigned int segment_count[2]; unsigned int block_count[2]; unsigned int inplace_count; @@ -3113,6 +3185,8 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++) #define stat_inc_call_count(si) ((si)->call_count++) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) +#define stat_io_skip_bggc_count(sbi) ((sbi)->io_skip_bggc++) +#define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++) #define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) #define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) #define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) @@ -3149,6 +3223,17 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (f2fs_has_inline_dentry(inode)) \ (atomic_dec(&F2FS_I_SB(inode)->inline_dir)); \ } while (0) +#define stat_inc_meta_count(sbi, blkaddr) \ + do { \ + if (blkaddr < SIT_I(sbi)->sit_base_addr) \ + atomic_inc(&(sbi)->meta_count[META_CP]); \ + else if (blkaddr < NM_I(sbi)->nat_blkaddr) \ + atomic_inc(&(sbi)->meta_count[META_SIT]); \ + else if (blkaddr < SM_I(sbi)->ssa_blkaddr) \ + atomic_inc(&(sbi)->meta_count[META_NAT]); \ + else if (blkaddr < SM_I(sbi)->main_blkaddr) \ + atomic_inc(&(sbi)->meta_count[META_SSA]); \ + } while (0) #define stat_inc_seg_type(sbi, curseg) \ ((sbi)->segment_count[(curseg)->alloc_type]++) #define stat_inc_block_count(sbi, curseg) \ @@ -3218,6 +3303,8 @@ void f2fs_destroy_root_stats(void); #define stat_inc_bg_cp_count(si) do { } while (0) #define stat_inc_call_count(si) do { } while (0) #define stat_inc_bggc_count(si) do { } while (0) +#define stat_io_skip_bggc_count(sbi) do { } while (0) +#define stat_other_skip_bggc_count(sbi) do { } while (0) #define stat_inc_dirty_inode(sbi, type) do { } while (0) #define stat_dec_dirty_inode(sbi, type) do { } while (0) #define stat_inc_total_hit(sb) do { } while (0) @@ -3236,6 +3323,7 @@ void f2fs_destroy_root_stats(void); #define stat_inc_volatile_write(inode) do { } while (0) #define stat_dec_volatile_write(inode) do { } while (0) #define stat_update_max_volatile_write(inode) do { } while (0) +#define stat_inc_meta_count(sbi, blkaddr) do { } while (0) #define stat_inc_seg_type(sbi, curseg) do { } while (0) #define stat_inc_block_count(sbi, curseg) do { } while (0) #define stat_inc_inplace_blocks(sbi) do { } while (0) @@ -3305,18 +3393,19 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ -struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root, +struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, struct rb_entry *cached_re, unsigned int ofs); struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, - struct rb_root *root, struct rb_node **parent, - unsigned int ofs); -struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root, + struct rb_root_cached *root, + struct rb_node **parent, + unsigned int ofs, bool *leftmost); +struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, struct rb_entry *cached_re, unsigned int ofs, struct rb_entry **prev_entry, struct rb_entry **next_entry, struct rb_node ***insert_p, struct rb_node **insert_parent, - bool force); + bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root *root); + struct rb_root_cached *root); unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); void f2fs_drop_extent_tree(struct inode *inode); @@ -3356,7 +3445,7 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION file_set_encrypt(inode); - inode->i_flags |= S_ENCRYPTED; + f2fs_set_inode_flags(inode); #endif } @@ -3384,6 +3473,7 @@ F2FS_FEATURE_FUNCS(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO); F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME); F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND); +F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM); #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, @@ -3399,11 +3489,20 @@ static inline int get_blkz_type(struct f2fs_sb_info *sbi, } #endif -static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) +static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi) { - struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + return f2fs_sb_has_blkzoned(sbi->sb); +} - return blk_queue_discard(q) || f2fs_sb_has_blkzoned(sbi->sb); +static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi) +{ + return blk_queue_discard(bdev_get_queue(sbi->sb->s_bdev)); +} + +static inline bool f2fs_realtime_discard_enable(struct f2fs_sb_info *sbi) +{ + return (test_opt(sbi, DISCARD) && f2fs_hw_support_discard(sbi)) || + f2fs_hw_should_discard(sbi); } static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) @@ -3432,11 +3531,50 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #endif } -static inline bool f2fs_force_buffered_io(struct inode *inode, int rw) +static inline int block_unaligned_IO(struct inode *inode, + struct kiocb *iocb, struct iov_iter *iter) { - return (f2fs_post_read_required(inode) || - (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || - F2FS_I_SB(inode)->s_ndevs); + unsigned int i_blkbits = READ_ONCE(inode->i_blkbits); + unsigned int blocksize_mask = (1 << i_blkbits) - 1; + loff_t offset = iocb->ki_pos; + unsigned long align = offset | iov_iter_alignment(iter); + + return align & blocksize_mask; +} + +static inline int allow_outplace_dio(struct inode *inode, + struct kiocb *iocb, struct iov_iter *iter) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int rw = iov_iter_rw(iter); + + return (test_opt(sbi, LFS) && (rw == WRITE) && + !block_unaligned_IO(inode, iocb, iter)); +} + +static inline bool f2fs_force_buffered_io(struct inode *inode, + struct kiocb *iocb, struct iov_iter *iter) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int rw = iov_iter_rw(iter); + + if (f2fs_post_read_required(inode)) + return true; + if (sbi->s_ndevs) + return true; + /* + * for blkzoned device, fallback direct IO to buffered IO, so + * all IOs can be serialized by log-structured write. + */ + if (f2fs_sb_has_blkzoned(sbi->sb)) + return true; + if (test_opt(sbi, LFS) && (rw == WRITE) && + block_unaligned_IO(inode, iocb, iter)) + return true; + if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED)) + return true; + + return false; } #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -3447,3 +3585,16 @@ extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, #endif #endif + +static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) +{ +#ifdef CONFIG_QUOTA + if (f2fs_sb_has_quota_ino(sbi->sb)) + return true; + if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) + return true; +#endif + return false; +} diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5474aaa274b9..88b124677189 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/file.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/fs.h> #include <linux/f2fs_fs.h> @@ -50,7 +47,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct dnode_of_data dn; + struct dnode_of_data dn = { .node_changed = false }; int err; if (unlikely(f2fs_cp_error(sbi))) { @@ -62,19 +59,6 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); - /* block allocation */ - f2fs_lock_op(sbi); - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = f2fs_reserve_block(&dn, page->index); - if (err) { - f2fs_unlock_op(sbi); - goto out; - } - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); - - f2fs_balance_fs(sbi, dn.node_changed); - file_update_time(vmf->vma->vm_file); down_read(&F2FS_I(inode)->i_mmap_sem); lock_page(page); @@ -86,11 +70,28 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) goto out_sem; } + /* block allocation */ + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_block(&dn, page->index); + f2fs_put_dnode(&dn); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + if (err) { + unlock_page(page); + goto out_sem; + } + + /* fill the page */ + f2fs_wait_on_page_writeback(page, DATA, false); + + /* wait for GCed page writeback via META_MAPPING */ + f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); + /* * check to see if the page is mapped already (no holes) */ if (PageMappedToDisk(page)) - goto mapped; + goto out_sem; /* page is wholly or partially inside EOF */ if (((loff_t)(page->index + 1) << PAGE_SHIFT) > @@ -105,21 +106,15 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) SetPageUptodate(page); f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE); + f2fs_update_time(sbi, REQ_TIME); trace_f2fs_vm_page_mkwrite(page, DATA); -mapped: - /* fill the page */ - f2fs_wait_on_page_writeback(page, DATA, false); - - /* wait for GCed page writeback via META_MAPPING */ - if (f2fs_post_read_required(inode)) - f2fs_wait_on_block_writeback(sbi, dn.data_blkaddr); - out_sem: up_read(&F2FS_I(inode)->i_mmap_sem); -out: + + f2fs_balance_fs(sbi, dn.node_changed); + sb_end_pagefault(inode->i_sb); - f2fs_update_time(sbi, REQ_TIME); err: return block_page_mkwrite_return(err); } @@ -215,7 +210,8 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, }; unsigned int seq_id = 0; - if (unlikely(f2fs_readonly(inode->i_sb))) + if (unlikely(f2fs_readonly(inode->i_sb) || + is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return 0; trace_f2fs_sync_file_enter(inode); @@ -590,7 +586,8 @@ truncate_out: return 0; } -int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, + bool buf_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; @@ -598,6 +595,7 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) int count = 0, err = 0; struct page *ipage; bool truncate_page = false; + int flag = buf_write ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; trace_f2fs_truncate_blocks_enter(inode, from); @@ -607,7 +605,7 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) goto free_partial; if (lock) - f2fs_lock_op(sbi); + __do_map_lock(sbi, flag, true); ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { @@ -645,7 +643,7 @@ free_next: err = f2fs_truncate_inode_blocks(inode, free_from); out: if (lock) - f2fs_unlock_op(sbi); + __do_map_lock(sbi, flag, false); free_partial: /* lastly zero out the first data page */ if (!err) @@ -680,7 +678,7 @@ int f2fs_truncate(struct inode *inode) return err; } - err = f2fs_truncate_blocks(inode, i_size_read(inode), true); + err = f2fs_truncate_blocks(inode, i_size_read(inode), true, false); if (err) return err; @@ -789,9 +787,24 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) !uid_eq(attr->ia_uid, inode->i_uid)) || (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { + f2fs_lock_op(F2FS_I_SB(inode)); err = dquot_transfer(inode, attr); - if (err) + if (err) { + set_sbi_flag(F2FS_I_SB(inode), + SBI_QUOTA_NEED_REPAIR); + f2fs_unlock_op(F2FS_I_SB(inode)); return err; + } + /* + * update uid/gid under lock_op(), so that dquot and inode can + * be updated atomically. + */ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + f2fs_mark_inode_dirty_sync(inode, true); + f2fs_unlock_op(F2FS_I_SB(inode)); } if (attr->ia_valid & ATTR_SIZE) { @@ -1246,7 +1259,7 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) new_size = i_size_read(inode) - len; truncate_pagecache(inode, new_size); - ret = f2fs_truncate_blocks(inode, new_size, true); + ret = f2fs_truncate_blocks(inode, new_size, true, false); up_write(&F2FS_I(inode)->i_mmap_sem); if (!ret) f2fs_i_size_write(inode, new_size); @@ -1431,7 +1444,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); down_write(&F2FS_I(inode)->i_mmap_sem); - ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); + ret = f2fs_truncate_blocks(inode, i_size_read(inode), true, false); up_write(&F2FS_I(inode)->i_mmap_sem); if (ret) return ret; @@ -1978,7 +1991,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!f2fs_hw_support_discard(F2FS_SB(sb))) return -EOPNOTSUPP; if (copy_from_user(&range, (struct fstrim_range __user *)arg, @@ -2162,6 +2175,12 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + f2fs_msg(sbi->sb, KERN_INFO, + "Skipping Checkpoint. Checkpoints currently disabled."); + return -EINVAL; + } + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -2533,6 +2552,9 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return -EINVAL; + if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg, sizeof(range))) return -EFAULT; @@ -2591,13 +2613,29 @@ static int f2fs_ioc_get_features(struct file *filp, unsigned long arg) } #ifdef CONFIG_QUOTA +int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid) +{ + struct dquot *transfer_to[MAXQUOTAS] = {}; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct super_block *sb = sbi->sb; + int err = 0; + + transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); + if (!IS_ERR(transfer_to[PRJQUOTA])) { + err = __dquot_transfer(inode, transfer_to); + if (err) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + dqput(transfer_to[PRJQUOTA]); + } + return err; +} + static int f2fs_ioc_setproject(struct file *filp, __u32 projid) { struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct super_block *sb = sbi->sb; - struct dquot *transfer_to[MAXQUOTAS] = {}; struct page *ipage; kprojid_t kprojid; int err; @@ -2617,53 +2655,45 @@ static int f2fs_ioc_setproject(struct file *filp, __u32 projid) if (projid_eq(kprojid, F2FS_I(inode)->i_projid)) return 0; - err = mnt_want_write_file(filp); - if (err) - return err; - err = -EPERM; - inode_lock(inode); - /* Is it quota file? Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) - goto out_unlock; + return err; ipage = f2fs_get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); - goto out_unlock; - } + if (IS_ERR(ipage)) + return PTR_ERR(ipage); if (!F2FS_FITS_IN_INODE(F2FS_INODE(ipage), fi->i_extra_isize, i_projid)) { err = -EOVERFLOW; f2fs_put_page(ipage, 1); - goto out_unlock; + return err; } f2fs_put_page(ipage, 1); err = dquot_initialize(inode); if (err) - goto out_unlock; + return err; - transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); - if (!IS_ERR(transfer_to[PRJQUOTA])) { - err = __dquot_transfer(inode, transfer_to); - dqput(transfer_to[PRJQUOTA]); - if (err) - goto out_dirty; - } + f2fs_lock_op(sbi); + err = f2fs_transfer_project_quota(inode, kprojid); + if (err) + goto out_unlock; F2FS_I(inode)->i_projid = kprojid; inode->i_ctime = current_time(inode); -out_dirty: f2fs_mark_inode_dirty_sync(inode, true); out_unlock: - inode_unlock(inode); - mnt_drop_write_file(filp); + f2fs_unlock_op(sbi); return err; } #else +int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid) +{ + return 0; +} + static int f2fs_ioc_setproject(struct file *filp, __u32 projid) { if (projid != F2FS_DEF_PROJID) @@ -2736,6 +2766,30 @@ static int f2fs_ioc_fsgetxattr(struct file *filp, unsigned long arg) return 0; } +static int f2fs_ioctl_check_project(struct inode *inode, struct fsxattr *fa) +{ + /* + * Project Quota ID state is only allowed to change from within the init + * namespace. Enforce that restriction only if we are trying to change + * the quota ID state. Everything else is allowed in user namespaces. + */ + if (current_user_ns() == &init_user_ns) + return 0; + + if (__kprojid_val(F2FS_I(inode)->i_projid) != fa->fsx_projid) + return -EINVAL; + + if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) { + if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT)) + return -EINVAL; + } else { + if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) + return -EINVAL; + } + + return 0; +} + static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -2763,19 +2817,20 @@ static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg) return err; inode_lock(inode); + err = f2fs_ioctl_check_project(inode, &fa); + if (err) + goto out; flags = (fi->i_flags & ~F2FS_FL_XFLAG_VISIBLE) | (flags & F2FS_FL_XFLAG_VISIBLE); err = __f2fs_ioc_setflags(inode, flags); - inode_unlock(inode); - mnt_drop_write_file(filp); if (err) - return err; + goto out; err = f2fs_ioc_setproject(filp, fa.fsx_projid); - if (err) - return err; - - return 0; +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return err; } int f2fs_pin_file_control(struct inode *inode, bool inc) @@ -2992,7 +3047,8 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!f2fs_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from)) || f2fs_has_inline_data(inode) || - f2fs_force_buffered_io(inode, WRITE)) { + f2fs_force_buffered_io(inode, + iocb, from)) { clear_inode_flag(inode, FI_NO_PREALLOC); inode_unlock(inode); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 5c8d00422237..a07241fb8537 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/gc.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/fs.h> #include <linux/module.h> @@ -43,13 +40,16 @@ static int gc_thread_func(void *data) if (gc_th->gc_wake) gc_th->gc_wake = 0; - if (try_to_freeze()) + if (try_to_freeze()) { + stat_other_skip_bggc_count(sbi); continue; + } if (kthread_should_stop()) break; if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) { increase_sleep_time(gc_th, &wait_ms); + stat_other_skip_bggc_count(sbi); continue; } @@ -58,8 +58,10 @@ static int gc_thread_func(void *data) f2fs_stop_checkpoint(sbi, false); } - if (!sb_start_write_trylock(sbi->sb)) + if (!sb_start_write_trylock(sbi->sb)) { + stat_other_skip_bggc_count(sbi); continue; + } /* * [GC triggering condition] @@ -80,12 +82,15 @@ static int gc_thread_func(void *data) goto do_gc; } - if (!mutex_trylock(&sbi->gc_mutex)) + if (!mutex_trylock(&sbi->gc_mutex)) { + stat_other_skip_bggc_count(sbi); goto next; + } - if (!is_idle(sbi)) { + if (!is_idle(sbi, GC_TIME)) { increase_sleep_time(gc_th, &wait_ms); mutex_unlock(&sbi->gc_mutex); + stat_io_skip_bggc_count(sbi); goto next; } @@ -365,6 +370,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, if (sec_usage_check(sbi, secno)) goto next; + /* Don't touch checkpointed data */ + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && + get_ckpt_valid_blocks(sbi, segno))) + goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; @@ -464,7 +473,7 @@ static int check_valid_map(struct f2fs_sb_info *sbi, * On validity, copy that node with cold status, otherwise (invalid node) * ignore that. */ -static void gc_node_segment(struct f2fs_sb_info *sbi, +static int gc_node_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, unsigned int segno, int gc_type) { struct f2fs_summary *entry; @@ -472,6 +481,7 @@ static void gc_node_segment(struct f2fs_sb_info *sbi, int off; int phase = 0; bool fggc = (gc_type == FG_GC); + int submitted = 0; start_addr = START_BLOCK(sbi, segno); @@ -485,10 +495,11 @@ next_step: nid_t nid = le32_to_cpu(entry->nid); struct page *node_page; struct node_info ni; + int err; /* stop BG_GC if there is not enough free sections. */ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) - return; + return submitted; if (check_valid_map(sbi, segno, off) == 0) continue; @@ -525,7 +536,9 @@ next_step: continue; } - f2fs_move_node_page(node_page, gc_type); + err = f2fs_move_node_page(node_page, gc_type); + if (!err && gc_type == FG_GC) + submitted++; stat_inc_node_blk_count(sbi, 1, gc_type); } @@ -534,6 +547,7 @@ next_step: if (fggc) atomic_dec(&sbi->wb_sync_req[NODE]); + return submitted; } /* @@ -669,7 +683,7 @@ put_page: * Move data block via META_MAPPING while keeping locked data page. * This can be used to move blocks, aka LBAs, directly on disk. */ -static void move_data_block(struct inode *inode, block_t bidx, +static int move_data_block(struct inode *inode, block_t bidx, int gc_type, unsigned int segno, int off) { struct f2fs_io_info fio = { @@ -688,25 +702,29 @@ static void move_data_block(struct inode *inode, block_t bidx, struct node_info ni; struct page *page, *mpage; block_t newaddr; - int err; + int err = 0; bool lfs_mode = test_opt(fio.sbi, LFS); /* do not read out */ page = f2fs_grab_cache_page(inode->i_mapping, bidx, false); if (!page) - return; + return -ENOMEM; - if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) { + err = -ENOENT; goto out; + } if (f2fs_is_atomic_file(inode)) { F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; + err = -EAGAIN; goto out; } if (f2fs_is_pinned_file(inode)) { f2fs_pin_file_control(inode, true); + err = -EAGAIN; goto out; } @@ -717,6 +735,7 @@ static void move_data_block(struct inode *inode, block_t bidx, if (unlikely(dn.data_blkaddr == NULL_ADDR)) { ClearPageUptodate(page); + err = -ENOENT; goto put_out; } @@ -799,6 +818,7 @@ write_page: fio.new_blkaddr = newaddr; f2fs_submit_page_write(&fio); if (fio.retry) { + err = -EAGAIN; if (PageWriteback(fio.encrypted_page)) end_page_writeback(fio.encrypted_page); goto put_page_out; @@ -822,34 +842,42 @@ put_out: f2fs_put_dnode(&dn); out: f2fs_put_page(page, 1); + return err; } -static void move_data_page(struct inode *inode, block_t bidx, int gc_type, +static int move_data_page(struct inode *inode, block_t bidx, int gc_type, unsigned int segno, int off) { struct page *page; + int err = 0; page = f2fs_get_lock_data_page(inode, bidx, true); if (IS_ERR(page)) - return; + return PTR_ERR(page); - if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) { + err = -ENOENT; goto out; + } if (f2fs_is_atomic_file(inode)) { F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; + err = -EAGAIN; goto out; } if (f2fs_is_pinned_file(inode)) { if (gc_type == FG_GC) f2fs_pin_file_control(inode, true); + err = -EAGAIN; goto out; } if (gc_type == BG_GC) { - if (PageWriteback(page)) + if (PageWriteback(page)) { + err = -EAGAIN; goto out; + } set_page_dirty(page); set_cold_data(page); } else { @@ -867,7 +895,6 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, .io_type = FS_GC_DATA_IO, }; bool is_dirty = PageDirty(page); - int err; retry: set_page_dirty(page); @@ -892,6 +919,7 @@ retry: } out: f2fs_put_page(page, 1); + return err; } /* @@ -901,7 +929,7 @@ out: * If the parent node is not valid or the data block address is different, * the victim data block is ignored. */ -static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct gc_inode_list *gc_list, unsigned int segno, int gc_type) { struct super_block *sb = sbi->sb; @@ -909,6 +937,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t start_addr; int off; int phase = 0; + int submitted = 0; start_addr = START_BLOCK(sbi, segno); @@ -925,7 +954,7 @@ next_step: /* stop BG_GC if there is not enough free sections. */ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) - return; + return submitted; if (check_valid_map(sbi, segno, off) == 0) continue; @@ -997,6 +1026,7 @@ next_step: if (inode) { struct f2fs_inode_info *fi = F2FS_I(inode); bool locked = false; + int err; if (S_ISREG(inode->i_mode)) { if (!down_write_trylock(&fi->i_gc_rwsem[READ])) @@ -1016,12 +1046,16 @@ next_step: start_bidx = f2fs_start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_post_read_required(inode)) - move_data_block(inode, start_bidx, gc_type, - segno, off); + err = move_data_block(inode, start_bidx, + gc_type, segno, off); else - move_data_page(inode, start_bidx, gc_type, + err = move_data_page(inode, start_bidx, gc_type, segno, off); + if (!err && (gc_type == FG_GC || + f2fs_post_read_required(inode))) + submitted++; + if (locked) { up_write(&fi->i_gc_rwsem[WRITE]); up_write(&fi->i_gc_rwsem[READ]); @@ -1033,6 +1067,8 @@ next_step: if (++phase < 5) goto next_step; + + return submitted; } static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, @@ -1060,6 +1096,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, int seg_freed = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE; + int submitted = 0; /* readahead multi ssa blocks those have contiguous address */ if (sbi->segs_per_sec > 1) @@ -1069,6 +1106,18 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, /* reference all summary page */ while (segno < end_segno) { sum_page = f2fs_get_sum_page(sbi, segno++); + if (IS_ERR(sum_page)) { + int err = PTR_ERR(sum_page); + + end_segno = segno - 1; + for (segno = start_segno; segno < end_segno; segno++) { + sum_page = find_get_page(META_MAPPING(sbi), + GET_SUM_BLOCK(sbi, segno)); + f2fs_put_page(sum_page, 0); + f2fs_put_page(sum_page, 0); + } + return err; + } unlock_page(sum_page); } @@ -1103,10 +1152,11 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, * - lock_page(sum_page) */ if (type == SUM_TYPE_NODE) - gc_node_segment(sbi, sum->entries, segno, gc_type); - else - gc_data_segment(sbi, sum->entries, gc_list, segno, + submitted += gc_node_segment(sbi, sum->entries, segno, gc_type); + else + submitted += gc_data_segment(sbi, sum->entries, gc_list, + segno, gc_type); stat_inc_seg_count(sbi, type, gc_type); @@ -1117,7 +1167,7 @@ next: f2fs_put_page(sum_page, 0); } - if (gc_type == FG_GC) + if (submitted) f2fs_submit_merged_write(sbi, (type == SUM_TYPE_NODE) ? NODE : DATA); @@ -1172,7 +1222,8 @@ gc_more: * threshold, we can make them free by checkpoint. Then, we * secure free segments which doesn't need fggc any more. */ - if (prefree_segments(sbi)) { + if (prefree_segments(sbi) && + !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; @@ -1224,7 +1275,7 @@ gc_more: segno = NULL_SEGNO; goto gc_more; } - if (gc_type == FG_GC) + if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) ret = f2fs_write_checkpoint(sbi, &cpc); } stop: @@ -1244,7 +1295,7 @@ stop: put_gc_inode(&gc_list); - if (sync) + if (sync && !ret) ret = sec_freed ? 0 : -EAGAIN; return ret; } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index c8619e408009..bbac9d3787bd 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/gc.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #define GC_THREAD_MIN_WB_PAGES 1 /* * a threshold to determine diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index eb2e031ea887..cc82f142f811 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/hash.c * @@ -7,10 +8,6 @@ * Portions of this code from linux/fs/ext3/hash.c * * Copyright (C) 2002 by Theodore Ts'o - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/types.h> #include <linux/fs.h> diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 115dc219344b..cb31a719b048 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/inline.c * Copyright (c) 2013, Intel Corporation * Authors: Huajun Li <huajun.li@intel.com> * Haicheng Li <haicheng.li@intel.com> - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/fs.h> @@ -300,7 +298,7 @@ process_inline: clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { - if (f2fs_truncate_blocks(inode, 0, false)) + if (f2fs_truncate_blocks(inode, 0, false, false)) return false; goto process_inline; } @@ -472,7 +470,7 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) return 0; punch_dentry_pages: truncate_inode_pages(&dir->i_data, 0); - f2fs_truncate_blocks(dir, 0, false); + f2fs_truncate_blocks(dir, 0, false, false); f2fs_remove_dirty_inode(dir); return err; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 959df2249875..91ceee0ed4c4 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/inode.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/fs.h> #include <linux/f2fs_fs.h> @@ -368,6 +365,12 @@ static int do_read_inode(struct inode *inode) if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); + /* try to recover cold bit for non-dir inode */ + if (!S_ISDIR(inode->i_mode) && !is_cold_node(node_page)) { + set_cold_node(node_page, false); + set_page_dirty(node_page); + } + /* get rdev by using inline_info */ __get_inode_rdev(inode, ri); @@ -610,6 +613,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) return 0; + if (f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; + /* * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. @@ -648,7 +654,11 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; - dquot_initialize(inode); + err = dquot_initialize(inode); + if (err) { + err = 0; + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + } f2fs_remove_ino_entry(sbi, inode->i_ino, APPEND_INO); f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); @@ -680,9 +690,10 @@ retry: goto retry; } - if (err) + if (err) { f2fs_update_inode_page(inode); - dquot_free_inode(inode); + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + } sb_end_intwrite(inode->i_sb); no_delete: dquot_drop(inode); @@ -691,7 +702,8 @@ no_delete: stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); - if (likely(!is_set_ckpt_flags(sbi, CP_ERROR_FLAG))) + if (likely(!is_set_ckpt_flags(sbi, CP_ERROR_FLAG) && + !is_sbi_flag_set(sbi, SBI_CP_DISABLED))) f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); else f2fs_inode_synced(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 1f67e389169f..99299ede7429 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/namei.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/fs.h> #include <linux/f2fs_fs.h> @@ -19,6 +16,7 @@ #include "f2fs.h" #include "node.h" +#include "segment.h" #include "xattr.h" #include "acl.h" #include <trace/events/f2fs.h> @@ -74,10 +72,6 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (err) goto fail_drop; - err = dquot_alloc_inode(inode); - if (err) - goto fail_drop; - set_inode_flag(inode, FI_NEW_INODE); /* If the directory encrypted, then we should encrypt the inode. */ @@ -124,6 +118,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); + f2fs_set_inode_flags(inode); + trace_f2fs_new_inode(inode, 0); return inode; @@ -184,16 +180,19 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode * hot_count = sbi->raw_super->hot_ext_count; for (i = 0; i < cold_count + hot_count; i++) { - if (!is_extension_exist(name, extlist[i])) - continue; - if (i < cold_count) - file_set_cold(inode); - else - file_set_hot(inode); - break; + if (is_extension_exist(name, extlist[i])) + break; } up_read(&sbi->sb_lock); + + if (i == cold_count + hot_count) + return; + + if (i < cold_count) + file_set_cold(inode); + else + file_set_hot(inode); } int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, @@ -272,6 +271,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (unlikely(f2fs_cp_error(sbi))) return -EIO; + err = f2fs_is_checkpoint_ready(sbi); + if (err) + return err; err = dquot_initialize(dir); if (err) @@ -318,6 +320,9 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, if (unlikely(f2fs_cp_error(sbi))) return -EIO; + err = f2fs_is_checkpoint_ready(sbi); + if (err) + return err; err = fscrypt_prepare_link(old_dentry, dir, dentry); if (err) @@ -564,6 +569,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (unlikely(f2fs_cp_error(sbi))) return -EIO; + err = f2fs_is_checkpoint_ready(sbi); + if (err) + return err; err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, &disk_link); @@ -693,6 +701,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, if (unlikely(f2fs_cp_error(sbi))) return -EIO; + err = f2fs_is_checkpoint_ready(sbi); + if (err) + return err; err = dquot_initialize(dir); if (err) @@ -823,10 +834,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; bool is_old_inline = f2fs_has_inline_dentry(old_dir); - int err = -ENOENT; + int err; if (unlikely(f2fs_cp_error(sbi))) return -EIO; + err = f2fs_is_checkpoint_ready(sbi); + if (err) + return err; if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && (!projid_eq(F2FS_I(new_dir)->i_projid, @@ -847,6 +861,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } + err = -ENOENT; old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) @@ -983,6 +998,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_update_time(sbi, REQ_TIME); return 0; put_out_dir: @@ -1012,10 +1029,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL; struct f2fs_dir_entry *old_entry, *new_entry; int old_nlink = 0, new_nlink = 0; - int err = -ENOENT; + int err; if (unlikely(f2fs_cp_error(sbi))) return -EIO; + err = f2fs_is_checkpoint_ready(sbi); + if (err) + return err; if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && !projid_eq(F2FS_I(new_dir)->i_projid, @@ -1033,6 +1053,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto out; + err = -ENOENT; old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) @@ -1136,6 +1157,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_update_time(sbi, REQ_TIME); return 0; out_new_dir: if (new_dir_entry) { diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index dd2e45a661aa..2b34206486d8 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/node.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/fs.h> #include <linux/f2fs_fs.h> @@ -129,6 +126,8 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) /* get current nat block page with lock */ src_page = get_current_nat_page(sbi, nid); + if (IS_ERR(src_page)) + return src_page; dst_page = f2fs_grab_meta_page(sbi, dst_off); f2fs_bug_on(sbi, PageDirty(src_page)); @@ -1542,8 +1541,10 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, } if (__is_valid_data_blkaddr(ni.blk_addr) && - !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC)) + !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC)) { + up_read(&sbi->node_write); goto redirty_out; + } if (atomic && !test_opt(sbi, NOBARRIER)) fio.op_flags |= REQ_PREFLUSH | REQ_FUA; @@ -1564,8 +1565,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, up_read(&sbi->node_write); if (wbc->for_reclaim) { - f2fs_submit_merged_write_cond(sbi, page->mapping->host, 0, - page->index, NODE); + f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE); submitted = NULL; } @@ -1587,8 +1587,10 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } -void f2fs_move_node_page(struct page *node_page, int gc_type) +int f2fs_move_node_page(struct page *node_page, int gc_type) { + int err = 0; + if (gc_type == FG_GC) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, @@ -1600,12 +1602,16 @@ void f2fs_move_node_page(struct page *node_page, int gc_type) f2fs_wait_on_page_writeback(node_page, NODE, true); f2fs_bug_on(F2FS_P_SB(node_page), PageWriteback(node_page)); - if (!clear_page_dirty_for_io(node_page)) + if (!clear_page_dirty_for_io(node_page)) { + err = -EAGAIN; goto out_page; + } if (__write_node_page(node_page, false, NULL, - &wbc, false, FS_GC_NODE_IO, NULL)) + &wbc, false, FS_GC_NODE_IO, NULL)) { + err = -EAGAIN; unlock_page(node_page); + } goto release_page; } else { /* set page dirty and write it */ @@ -1616,6 +1622,7 @@ out_page: unlock_page(node_page); release_page: f2fs_put_page(node_page, 0); + return err; } static int f2fs_write_node_page(struct page *page, @@ -1630,13 +1637,13 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, unsigned int *seq_id) { pgoff_t index; - pgoff_t last_idx = ULONG_MAX; struct pagevec pvec; int ret = 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; int nr_pages; + int nwritten = 0; if (atomic) { last_page = last_fsync_dnode(sbi, ino); @@ -1714,7 +1721,7 @@ continue_unlock: f2fs_put_page(last_page, 0); break; } else if (submitted) { - last_idx = page->index; + nwritten++; } if (page == last_page) { @@ -1740,8 +1747,8 @@ continue_unlock: goto retry; } out: - if (last_idx != ULONG_MAX) - f2fs_submit_merged_write_cond(sbi, NULL, ino, last_idx, NODE); + if (nwritten) + f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE); return ret ? -EIO: 0; } @@ -2268,15 +2275,19 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, nm_i->nat_block_bitmap)) { struct page *page = get_current_nat_page(sbi, nid); - ret = scan_nat_page(sbi, page, nid); - f2fs_put_page(page, 1); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + } else { + ret = scan_nat_page(sbi, page, nid); + f2fs_put_page(page, 1); + } if (ret) { up_read(&nm_i->nat_tree_lock); f2fs_bug_on(sbi, !mount); f2fs_msg(sbi->sb, KERN_ERR, "NAT is corrupt, run fsck to fix it"); - return -EINVAL; + return ret; } } @@ -2353,8 +2364,9 @@ retry: spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - f2fs_build_free_nids(sbi, true, false); - goto retry; + if (!f2fs_build_free_nids(sbi, true, false)) + goto retry; + return false; } /* @@ -2537,7 +2549,7 @@ retry: if (!PageUptodate(ipage)) SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); - set_cold_node(page, false); + set_cold_node(ipage, false); src = F2FS_INODE(page); dst = F2FS_INODE(ipage); @@ -2560,6 +2572,13 @@ retry: F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), i_projid)) dst->i_projid = src->i_projid; + + if (f2fs_sb_has_inode_crtime(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_crtime_nsec)) { + dst->i_crtime = src->i_crtime; + dst->i_crtime_nsec = src->i_crtime_nsec; + } } new_ni = old_ni; @@ -2703,7 +2722,7 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, __clear_bit_le(nat_index, nm_i->full_nat_bits); } -static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, +static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, struct nat_entry_set *set, struct cp_control *cpc) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -2727,6 +2746,9 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, down_write(&curseg->journal_rwsem); } else { page = get_next_nat_page(sbi, start_nid); + if (IS_ERR(page)) + return PTR_ERR(page); + nat_blk = page_address(page); f2fs_bug_on(sbi, !nat_blk); } @@ -2772,12 +2794,13 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); kmem_cache_free(nat_entry_set_slab, set); } + return 0; } /* * This function is called during the checkpointing process. */ -void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) +int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -2787,6 +2810,7 @@ void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned int found; nid_t set_idx = 0; LIST_HEAD(sets); + int err = 0; /* during unmount, let's flush nat_bits before checking dirty_nat_cnt */ if (enabled_nat_bits(sbi, cpc)) { @@ -2796,7 +2820,7 @@ void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) } if (!nm_i->dirty_nat_cnt) - return; + return 0; down_write(&nm_i->nat_tree_lock); @@ -2819,11 +2843,16 @@ void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) } /* flush dirty nats in nat entry set */ - list_for_each_entry_safe(set, tmp, &sets, set_list) - __flush_nat_entry_set(sbi, set, cpc); + list_for_each_entry_safe(set, tmp, &sets, set_list) { + err = __flush_nat_entry_set(sbi, set, cpc); + if (err) + break; + } up_write(&nm_i->nat_tree_lock); /* Allow dirty nats by node block allocation in write_begin */ + + return err; } static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) @@ -2850,10 +2879,8 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) struct page *page; page = f2fs_get_meta_page(sbi, nat_bits_addr++); - if (IS_ERR(page)) { - disable_nat_bits(sbi, true); + if (IS_ERR(page)) return PTR_ERR(page); - } memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), page_address(page), F2FS_BLKSIZE); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 0f4db7a61254..1c73d879a9bc 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/node.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ /* start node id of a node block dedicated to the given node id */ #define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 95511ed11a22..1dfb17f9f9ff 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/recovery.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/fs.h> #include <linux/f2fs_fs.h> @@ -99,8 +96,12 @@ err_out: return ERR_PTR(err); } -static void del_fsync_inode(struct fsync_inode_entry *entry) +static void del_fsync_inode(struct fsync_inode_entry *entry, int drop) { + if (drop) { + /* inode should not be recovered, drop it */ + f2fs_inode_synced(entry->inode); + } iput(entry->inode); list_del(&entry->list); kmem_cache_free(fsync_entry_slab, entry); @@ -194,6 +195,33 @@ out: return err; } +static int recover_quota_data(struct inode *inode, struct page *page) +{ + struct f2fs_inode *raw = F2FS_INODE(page); + struct iattr attr; + uid_t i_uid = le32_to_cpu(raw->i_uid); + gid_t i_gid = le32_to_cpu(raw->i_gid); + int err; + + memset(&attr, 0, sizeof(attr)); + + attr.ia_uid = make_kuid(inode->i_sb->s_user_ns, i_uid); + attr.ia_gid = make_kgid(inode->i_sb->s_user_ns, i_gid); + + if (!uid_eq(attr.ia_uid, inode->i_uid)) + attr.ia_valid |= ATTR_UID; + if (!gid_eq(attr.ia_gid, inode->i_gid)) + attr.ia_valid |= ATTR_GID; + + if (!attr.ia_valid) + return 0; + + err = dquot_transfer(inode, &attr); + if (err) + set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); + return err; +} + static void recover_inline_flags(struct inode *inode, struct f2fs_inode *ri) { if (ri->i_inline & F2FS_PIN_FILE) @@ -206,12 +234,41 @@ static void recover_inline_flags(struct inode *inode, struct f2fs_inode *ri) clear_inode_flag(inode, FI_DATA_EXIST); } -static void recover_inode(struct inode *inode, struct page *page) +static int recover_inode(struct inode *inode, struct page *page) { struct f2fs_inode *raw = F2FS_INODE(page); char *name; + int err; inode->i_mode = le16_to_cpu(raw->i_mode); + + err = recover_quota_data(inode, page); + if (err) + return err; + + i_uid_write(inode, le32_to_cpu(raw->i_uid)); + i_gid_write(inode, le32_to_cpu(raw->i_gid)); + + if (raw->i_inline & F2FS_EXTRA_ATTR) { + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && + F2FS_FITS_IN_INODE(raw, le16_to_cpu(raw->i_extra_isize), + i_projid)) { + projid_t i_projid; + kprojid_t kprojid; + + i_projid = (projid_t)le32_to_cpu(raw->i_projid); + kprojid = make_kprojid(&init_user_ns, i_projid); + + if (!projid_eq(kprojid, F2FS_I(inode)->i_projid)) { + err = f2fs_transfer_project_quota(inode, + kprojid); + if (err) + return err; + F2FS_I(inode)->i_projid = kprojid; + } + } + } + f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); @@ -221,9 +278,15 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); F2FS_I(inode)->i_advise = raw->i_advise; + F2FS_I(inode)->i_flags = le32_to_cpu(raw->i_flags); + f2fs_set_inode_flags(inode); + F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = + le16_to_cpu(raw->i_gc_failures); recover_inline_flags(inode, raw); + f2fs_mark_inode_dirty_sync(inode, true); + if (file_enc_name(inode)) name = "<encrypted>"; else @@ -232,6 +295,7 @@ static void recover_inode(struct inode *inode, struct page *page) f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s, inline = %x", ino_of_node(page), name, raw->i_inline); + return 0; } static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, @@ -320,12 +384,12 @@ next: return err; } -static void destroy_fsync_dnodes(struct list_head *head) +static void destroy_fsync_dnodes(struct list_head *head, int drop) { struct fsync_inode_entry *entry, *tmp; list_for_each_entry_safe(entry, tmp, head, list) - del_fsync_inode(entry); + del_fsync_inode(entry, drop); } static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, @@ -358,6 +422,8 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, } sum_page = f2fs_get_sum_page(sbi, segno); + if (IS_ERR(sum_page)) + return PTR_ERR(sum_page); sum_node = (struct f2fs_summary_block *)page_address(sum_page); sum = sum_node->entries[blkoff]; f2fs_put_page(sum_page, 1); @@ -560,7 +626,7 @@ out: } static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, - struct list_head *dir_list) + struct list_head *tmp_inode_list, struct list_head *dir_list) { struct curseg_info *curseg; struct page *page = NULL; @@ -598,8 +664,11 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, * In this case, we can lose the latest inode(x). * So, call recover_inode for the inode update. */ - if (IS_INODE(page)) - recover_inode(entry->inode, page); + if (IS_INODE(page)) { + err = recover_inode(entry->inode, page); + if (err) + break; + } if (entry->last_dentry == blkaddr) { err = recover_dentry(entry->inode, page, dir_list); if (err) { @@ -614,7 +683,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, } if (entry->blkaddr == blkaddr) - del_fsync_inode(entry); + list_move_tail(&entry->list, tmp_inode_list); next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); @@ -627,7 +696,7 @@ next: int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { - struct list_head inode_list; + struct list_head inode_list, tmp_inode_list; struct list_head dir_list; int err; int ret = 0; @@ -658,6 +727,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) } INIT_LIST_HEAD(&inode_list); + INIT_LIST_HEAD(&tmp_inode_list); INIT_LIST_HEAD(&dir_list); /* prevent checkpoint */ @@ -676,11 +746,16 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) need_writecp = true; /* step #2: recover data */ - err = recover_data(sbi, &inode_list, &dir_list); + err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); + else { + /* restore s_flags to let iput() trash data */ + sbi->sb->s_flags = s_flags; + } skip: - destroy_fsync_dnodes(&inode_list); + destroy_fsync_dnodes(&inode_list, err); + destroy_fsync_dnodes(&tmp_inode_list, err); /* truncate meta pages to be used by the recovery */ truncate_inode_pages_range(META_MAPPING(sbi), @@ -689,19 +764,23 @@ skip: if (err) { truncate_inode_pages_final(NODE_MAPPING(sbi)); truncate_inode_pages_final(META_MAPPING(sbi)); + } else { + clear_sbi_flag(sbi, SBI_POR_DOING); } - - clear_sbi_flag(sbi, SBI_POR_DOING); mutex_unlock(&sbi->cp_mutex); /* let's drop all the directory inodes for clean checkpoint */ - destroy_fsync_dnodes(&dir_list); + destroy_fsync_dnodes(&dir_list, err); - if (!err && need_writecp) { - struct cp_control cpc = { - .reason = CP_RECOVERY, - }; - err = f2fs_write_checkpoint(sbi, &cpc); + if (need_writecp) { + set_sbi_flag(sbi, SBI_IS_RECOVERED); + + if (!err) { + struct cp_control cpc = { + .reason = CP_RECOVERY, + }; + err = f2fs_write_checkpoint(sbi, &cpc); + } } kmem_cache_destroy(fsync_entry_slab); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 30779aaa9dba..6edcf8391dd3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/segment.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/fs.h> #include <linux/f2fs_fs.h> @@ -179,6 +176,8 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) return false; if (sbi->gc_mode == GC_URGENT) return true; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return true; return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); @@ -267,8 +266,10 @@ retry: } next: /* we don't need to invalidate this in the sccessful status */ - if (drop || recover) + if (drop || recover) { ClearPageUptodate(page); + clear_cold_data(page); + } set_page_private(page, 0); ClearPagePrivate(page); f2fs_put_page(page, 1); @@ -374,7 +375,7 @@ static int __f2fs_commit_inmem_pages(struct inode *inode) .io_type = FS_DATA_IO, }; struct list_head revoke_list; - pgoff_t last_idx = ULONG_MAX; + bool submit_bio = false; int err = 0; INIT_LIST_HEAD(&revoke_list); @@ -409,14 +410,14 @@ retry: } /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; - last_idx = page->index; + submit_bio = true; } unlock_page(page); list_move_tail(&cur->list, &revoke_list); } - if (last_idx != ULONG_MAX) - f2fs_submit_merged_write_cond(sbi, inode, 0, last_idx, DATA); + if (submit_bio) + f2fs_submit_merged_write_cond(sbi, inode, NULL, 0, DATA); if (err) { /* @@ -483,6 +484,9 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) if (need && excess_cached_nats(sbi)) f2fs_balance_fs_bg(sbi); + if (f2fs_is_checkpoint_ready(sbi)) + return; + /* * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. @@ -511,7 +515,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) else f2fs_build_free_nids(sbi, false, false); - if (!is_idle(sbi) && + if (!is_idle(sbi, REQ_TIME) && (!excess_dirty_nats(sbi) && !excess_dirty_nodes(sbi))) return; @@ -799,7 +803,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned short valid_blocks; + unsigned short valid_blocks, ckpt_valid_blocks; if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno)) return; @@ -807,8 +811,10 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_lock(&dirty_i->seglist_lock); valid_blocks = get_valid_blocks(sbi, segno, false); + ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno); - if (valid_blocks == 0) { + if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) || + ckpt_valid_blocks == sbi->blocks_per_seg)) { __locate_dirty_segment(sbi, segno, PRE); __remove_dirty_segment(sbi, segno, DIRTY); } else if (valid_blocks < sbi->blocks_per_seg) { @@ -821,6 +827,66 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } +/* This moves currently empty dirty blocks to prefree. Must hold seglist_lock */ +void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int segno; + + mutex_lock(&dirty_i->seglist_lock); + for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { + if (get_valid_blocks(sbi, segno, false)) + continue; + if (IS_CURSEG(sbi, segno)) + continue; + __locate_dirty_segment(sbi, segno, PRE); + __remove_dirty_segment(sbi, segno, DIRTY); + } + mutex_unlock(&dirty_i->seglist_lock); +} + +int f2fs_disable_cp_again(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + block_t ovp = overprovision_segments(sbi) << sbi->log_blocks_per_seg; + block_t holes[2] = {0, 0}; /* DATA and NODE */ + struct seg_entry *se; + unsigned int segno; + + mutex_lock(&dirty_i->seglist_lock); + for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { + se = get_seg_entry(sbi, segno); + if (IS_NODESEG(se->type)) + holes[NODE] += sbi->blocks_per_seg - se->valid_blocks; + else + holes[DATA] += sbi->blocks_per_seg - se->valid_blocks; + } + mutex_unlock(&dirty_i->seglist_lock); + + if (holes[DATA] > ovp || holes[NODE] > ovp) + return -EAGAIN; + return 0; +} + +/* This is only used by SBI_CP_DISABLED */ +static unsigned int get_free_segment(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int segno = 0; + + mutex_lock(&dirty_i->seglist_lock); + for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { + if (get_valid_blocks(sbi, segno, false)) + continue; + if (get_ckpt_valid_blocks(sbi, segno)) + continue; + mutex_unlock(&dirty_i->seglist_lock); + return segno; + } + mutex_unlock(&dirty_i->seglist_lock); + return NULL_SEGNO; +} + static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len) @@ -856,7 +922,8 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len, - struct rb_node *parent, struct rb_node **p) + struct rb_node *parent, struct rb_node **p, + bool leftmost) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *dc; @@ -864,7 +931,7 @@ static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi, dc = __create_discard_cmd(sbi, bdev, lstart, start, len); rb_link_node(&dc->rb_node, parent, p); - rb_insert_color(&dc->rb_node, &dcc->root); + rb_insert_color_cached(&dc->rb_node, &dcc->root, leftmost); return dc; } @@ -876,7 +943,7 @@ static void __detach_discard_cmd(struct discard_cmd_control *dcc, atomic_sub(dc->issuing, &dcc->issing_discard); list_del(&dc->list); - rb_erase(&dc->rb_node, &dcc->root); + rb_erase_cached(&dc->rb_node, &dcc->root); dcc->undiscard_blks -= dc->len; kmem_cache_free(discard_cmd_slab, dc); @@ -905,9 +972,9 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, dc->error = 0; if (dc->error) - f2fs_msg(sbi->sb, KERN_INFO, - "Issue discard(%u, %u, %u) failed, ret: %d", - dc->lstart, dc->start, dc->len, dc->error); + printk_ratelimited( + "%sF2FS-fs: Issue discard(%u, %u, %u) failed, ret: %d", + KERN_INFO, dc->lstart, dc->start, dc->len, dc->error); __detach_discard_cmd(dcc, dc); } @@ -1113,6 +1180,7 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, struct rb_node **p; struct rb_node *parent = NULL; struct discard_cmd *dc = NULL; + bool leftmost = true; if (insert_p && insert_parent) { parent = insert_parent; @@ -1120,9 +1188,11 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, goto do_insert; } - p = f2fs_lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart); + p = f2fs_lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, + lstart, &leftmost); do_insert: - dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p); + dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, + p, leftmost); if (!dc) return NULL; @@ -1190,7 +1260,7 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, NULL, lstart, (struct rb_entry **)&prev_dc, (struct rb_entry **)&next_dc, - &insert_p, &insert_parent, true); + &insert_p, &insert_parent, true, NULL); if (dc) prev_dc = dc; @@ -1298,7 +1368,7 @@ static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, NULL, pos, (struct rb_entry **)&prev_dc, (struct rb_entry **)&next_dc, - &insert_p, &insert_parent, true); + &insert_p, &insert_parent, true, NULL); if (!dc) dc = next_dc; @@ -1311,7 +1381,7 @@ static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, if (dc->state != D_PREP) goto next; - if (dpolicy->io_aware && !is_idle(sbi)) { + if (dpolicy->io_aware && !is_idle(sbi, DISCARD_TIME)) { io_interrupted = true; break; } @@ -1371,7 +1441,7 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, dc->state != D_PREP); if (dpolicy->io_aware && i < dpolicy->io_aware_gran && - !is_idle(sbi)) { + !is_idle(sbi, DISCARD_TIME)) { io_interrupted = true; break; } @@ -1600,7 +1670,9 @@ static int issue_discard_thread(void *data) __wait_all_discard_cmd(sbi, &dpolicy); wait_ms = dpolicy.min_interval; } else if (issued == -1){ - wait_ms = dpolicy.mid_interval; + wait_ms = f2fs_time_to_wait(sbi, DISCARD_TIME); + if (!wait_ms) + wait_ms = dpolicy.mid_interval; } else { wait_ms = dpolicy.max_interval; } @@ -1725,11 +1797,11 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; - if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) + if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi)) return false; if (!force) { - if (!test_opt(sbi, DISCARD) || !se->valid_blocks || + if (!f2fs_realtime_discard_enable(sbi) || !se->valid_blocks || SM_I(sbi)->dcc_info->nr_discards >= SM_I(sbi)->dcc_info->max_discards) return false; @@ -1835,7 +1907,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, dirty_i->nr_dirty[PRE]--; } - if (!test_opt(sbi, DISCARD)) + if (!f2fs_realtime_discard_enable(sbi)) continue; if (force && start >= cpc->trim_start && @@ -1928,7 +2000,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; dcc->undiscard_blks = 0; dcc->next_pos = 0; - dcc->root = RB_ROOT; + dcc->root = RB_ROOT_CACHED; dcc->rbtree_check = false; init_waitqueue_head(&dcc->discard_wait_queue); @@ -2025,12 +2097,12 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) del = 0; } - if (f2fs_discard_en(sbi) && - !f2fs_test_and_set_bit(offset, se->discard_map)) + if (!f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; /* don't overwrite by SSR to keep node chain */ - if (IS_NODESEG(se->type)) { + if (IS_NODESEG(se->type) && + !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) se->ckpt_valid_blocks++; } @@ -2052,10 +2124,18 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) f2fs_bug_on(sbi, 1); se->valid_blocks++; del = 0; + } else if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + /* + * If checkpoints are off, we must not reuse data that + * was used in the previous checkpoint. If it was used + * before, we must track that to know how much space we + * really have. + */ + if (f2fs_test_bit(offset, se->ckpt_valid_map)) + sbi->unusable_block_count++; } - if (f2fs_discard_en(sbi) && - f2fs_test_and_clear_bit(offset, se->discard_map)) + if (f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) @@ -2335,6 +2415,9 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) if (sbi->segs_per_sec != 1) return CURSEG_I(sbi, type)->segno; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return 0; + if (test_opt(sbi, NOHEAP) && (type == CURSEG_HOT_DATA || IS_NODESEG(type))) return 0; @@ -2432,6 +2515,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type) __next_free_blkoff(sbi, curseg, 0); sum_page = f2fs_get_sum_page(sbi, new_segno); + f2fs_bug_on(sbi, IS_ERR(sum_page)); sum_node = (struct f2fs_summary_block *)page_address(sum_page); memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); f2fs_put_page(sum_page, 1); @@ -2478,6 +2562,15 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) return 1; } } + + /* find valid_blocks=0 in dirty list */ + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + segno = get_free_segment(sbi); + if (segno != NULL_SEGNO) { + curseg->next_segno = segno; + return 1; + } + } return 0; } @@ -2495,7 +2588,8 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); - else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) + else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type) && + likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) new_curseg(sbi, type, false); else if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type)) change_curseg(sbi, type); @@ -2570,7 +2664,7 @@ next: NULL, start, (struct rb_entry **)&prev_dc, (struct rb_entry **)&next_dc, - &insert_p, &insert_parent, true); + &insert_p, &insert_parent, true, NULL); if (!dc) dc = next_dc; @@ -2671,7 +2765,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) * discard option. User configuration looks like using runtime discard * or periodic fstrim instead of it. */ - if (test_opt(sbi, DISCARD)) + if (f2fs_realtime_discard_enable(sbi)) goto out; start_block = START_BLOCK(sbi, start_segno); @@ -3020,6 +3114,7 @@ void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, ClearPageError(page); f2fs_submit_page_write(&fio); + stat_inc_meta_count(sbi, page->index); f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); } @@ -3182,8 +3277,7 @@ void f2fs_wait_on_page_writeback(struct page *page, if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); - f2fs_submit_merged_write_cond(sbi, page->mapping->host, - 0, page->index, type); + f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type); if (ordered) wait_on_page_writeback(page); else @@ -3191,10 +3285,14 @@ void f2fs_wait_on_page_writeback(struct page *page, } } -void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) +void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *cpage; + if (!f2fs_post_read_required(inode)) + return; + if (!is_valid_data_blkaddr(sbi, blkaddr)) return; @@ -3205,6 +3303,15 @@ void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) } } +void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, + block_t len) +{ + block_t i; + + for (i = 0; i < len; i++) + f2fs_wait_on_block_writeback(inode, blkaddr + i); +} + static int read_compacted_summaries(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -3762,13 +3869,11 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; #endif - if (f2fs_discard_en(sbi)) { - sit_i->sentries[start].discard_map - = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, - GFP_KERNEL); - if (!sit_i->sentries[start].discard_map) - return -ENOMEM; - } + sit_i->sentries[start].discard_map + = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, + GFP_KERNEL); + if (!sit_i->sentries[start].discard_map) + return -ENOMEM; } sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -3904,6 +4009,8 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) se = &sit_i->sentries[start]; page = get_current_sit_page(sbi, start); + if (IS_ERR(page)) + return PTR_ERR(page); sit_blk = (struct f2fs_sit_block *)page_address(page); sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; f2fs_put_page(page, 1); @@ -3916,18 +4023,16 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) total_node_blocks += se->valid_blocks; /* build discard map only one time */ - if (f2fs_discard_en(sbi)) { - if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { - memset(se->discard_map, 0xff, - SIT_VBLOCK_MAP_SIZE); - } else { - memcpy(se->discard_map, - se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += - sbi->blocks_per_seg - - se->valid_blocks; - } + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + se->valid_blocks; } if (sbi->segs_per_sec > 1) @@ -3965,16 +4070,13 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (IS_NODESEG(se->type)) total_node_blocks += se->valid_blocks; - if (f2fs_discard_en(sbi)) { - if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { - memset(se->discard_map, 0xff, - SIT_VBLOCK_MAP_SIZE); - } else { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks; - sbi->discard_blks -= se->valid_blocks; - } + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks; + sbi->discard_blks -= se->valid_blocks; } if (sbi->segs_per_sec > 1) { diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index b3d9e317ff0c..ab3465faddf1 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/segment.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/blkdev.h> #include <linux/backing-dev.h> @@ -342,6 +339,12 @@ static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, return get_seg_entry(sbi, segno)->valid_blocks; } +static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + return get_seg_entry(sbi, segno)->ckpt_valid_blocks; +} + static inline void seg_info_from_raw_sit(struct seg_entry *se, struct f2fs_sit_entry *rs) { @@ -579,6 +582,15 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, reserved_sections(sbi) + needed); } +static inline int f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) +{ + if (likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return 0; + if (likely(!has_not_enough_free_secs(sbi, 0, 0))) + return 0; + return -ENOSPC; +} + static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) { return prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments; diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 36cfd816c160..9e13db994fdf 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -1,13 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* * f2fs shrinker support * the basic infra was copied from fs/ubifs/shrinker.c * * Copyright (c) 2015 Motorola Mobility * Copyright (c) 2015 Jaegeuk Kim <jaegeuk@kernel.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/fs.h> #include <linux/f2fs_fs.h> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 896b885f504e..af58b2cc21b8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/super.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/module.h> #include <linux/init.h> @@ -53,9 +50,10 @@ char *f2fs_fault_name[FAULT_MAX] = { [FAULT_DIR_DEPTH] = "too big dir depth", [FAULT_EVICT_INODE] = "evict_inode fail", [FAULT_TRUNCATE] = "truncate fail", - [FAULT_IO] = "IO error", + [FAULT_READ_IO] = "read IO error", [FAULT_CHECKPOINT] = "checkpoint error", [FAULT_DISCARD] = "discard error", + [FAULT_WRITE_IO] = "write IO error", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, @@ -138,6 +136,7 @@ enum { Opt_alloc, Opt_fsync, Opt_test_dummy_encryption, + Opt_checkpoint, Opt_err, }; @@ -196,6 +195,7 @@ static match_table_t f2fs_tokens = { {Opt_alloc, "alloc_mode=%s"}, {Opt_fsync, "fsync_mode=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, + {Opt_checkpoint, "checkpoint=%s"}, {Opt_err, NULL}, }; @@ -207,7 +207,7 @@ void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk_ratelimited("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); + printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); va_end(args); } @@ -360,7 +360,6 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) static int parse_options(struct super_block *sb, char *options) { struct f2fs_sb_info *sbi = F2FS_SB(sb); - struct request_queue *q; substring_t args[MAX_OPT_ARGS]; char *p, *name; int arg = 0; @@ -415,14 +414,7 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; break; case Opt_discard: - q = bdev_get_queue(sb->s_bdev); - if (blk_queue_discard(q)) { - set_opt(sbi, DISCARD); - } else if (!f2fs_sb_has_blkzoned(sb)) { - f2fs_msg(sb, KERN_WARNING, - "mounting with \"discard\" option, but " - "the device does not support discard"); - } + set_opt(sbi, DISCARD); break; case Opt_nodiscard: if (f2fs_sb_has_blkzoned(sb)) { @@ -602,28 +594,31 @@ static int parse_options(struct super_block *sb, char *options) } F2FS_OPTION(sbi).write_io_size_bits = arg; break; +#ifdef CONFIG_F2FS_FAULT_INJECTION case Opt_fault_injection: if (args->from && match_int(args, &arg)) return -EINVAL; -#ifdef CONFIG_F2FS_FAULT_INJECTION f2fs_build_fault_attr(sbi, arg, F2FS_ALL_FAULT_TYPE); set_opt(sbi, FAULT_INJECTION); -#else - f2fs_msg(sb, KERN_INFO, - "FAULT_INJECTION was not selected"); -#endif break; + case Opt_fault_type: if (args->from && match_int(args, &arg)) return -EINVAL; -#ifdef CONFIG_F2FS_FAULT_INJECTION f2fs_build_fault_attr(sbi, 0, arg); set_opt(sbi, FAULT_INJECTION); + break; #else + case Opt_fault_injection: f2fs_msg(sb, KERN_INFO, - "FAULT_INJECTION was not selected"); -#endif + "fault_injection options not supported"); + break; + + case Opt_fault_type: + f2fs_msg(sb, KERN_INFO, + "fault_type options not supported"); break; +#endif case Opt_lazytime: sb->s_flags |= SB_LAZYTIME; break; @@ -776,6 +771,23 @@ static int parse_options(struct super_block *sb, char *options) "Test dummy encryption mount option ignored"); #endif break; + case Opt_checkpoint: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + + if (strlen(name) == 6 && + !strncmp(name, "enable", 6)) { + clear_opt(sbi, DISABLE_CHECKPOINT); + } else if (strlen(name) == 7 && + !strncmp(name, "disable", 7)) { + set_opt(sbi, DISABLE_CHECKPOINT); + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -834,6 +846,12 @@ static int parse_options(struct super_block *sb, char *options) } } + if (test_opt(sbi, DISABLE_CHECKPOINT) && test_opt(sbi, LFS)) { + f2fs_msg(sb, KERN_ERR, + "LFS not compatible with checkpoint=disable\n"); + return -EINVAL; + } + /* Not pass down write hints if the number of active logs is lesser * than NR_CURSEG_TYPE. */ @@ -1021,8 +1039,8 @@ static void f2fs_put_super(struct super_block *sb) * But, the previous checkpoint was not done by umount, it needs to do * clean checkpoint again. */ - if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || - !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + if ((is_sbi_flag_set(sbi, SBI_IS_DIRTY) || + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG))) { struct cp_control cpc = { .reason = CP_UMOUNT, }; @@ -1032,7 +1050,8 @@ static void f2fs_put_super(struct super_block *sb) /* be sure to wait for any on-going discard commands */ dropped = f2fs_wait_discard_bios(sbi); - if (f2fs_discard_en(sbi) && !sbi->discard_blks && !dropped) { + if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) && + !sbi->discard_blks && !dropped) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; @@ -1093,6 +1112,8 @@ int f2fs_sync_fs(struct super_block *sb, int sync) if (unlikely(f2fs_cp_error(sbi))) return 0; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return 0; trace_f2fs_sync_fs(sb, sync); @@ -1192,6 +1213,11 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) - sbi->current_reserved_blocks; + if (unlikely(buf->f_bfree <= sbi->unusable_block_count)) + buf->f_bfree = 0; + else + buf->f_bfree -= sbi->unusable_block_count; + if (buf->f_bfree > F2FS_OPTION(sbi).root_reserved_blocks) buf->f_bavail = buf->f_bfree - F2FS_OPTION(sbi).root_reserved_blocks; @@ -1336,7 +1362,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) from_kgid_munged(&init_user_ns, F2FS_OPTION(sbi).s_resgid)); if (F2FS_IO_SIZE_BITS(sbi)) - seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); + seq_printf(seq, ",io_bits=%u", + F2FS_OPTION(sbi).write_io_size_bits); #ifdef CONFIG_F2FS_FAULT_INJECTION if (test_opt(sbi, FAULT_INJECTION)) { seq_printf(seq, ",fault_injection=%u", @@ -1370,6 +1397,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) seq_printf(seq, ",alloc_mode=%s", "reuse"); + if (test_opt(sbi, DISABLE_CHECKPOINT)) + seq_puts(seq, ",checkpoint=disable"); + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX) seq_printf(seq, ",fsync_mode=%s", "posix"); else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) @@ -1397,10 +1427,10 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); set_opt(sbi, NOHEAP); + clear_opt(sbi, DISABLE_CHECKPOINT); sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (blk_queue_discard(bdev_get_queue(sbi->sb->s_bdev))) - set_opt(sbi, DISCARD); + set_opt(sbi, DISCARD); if (f2fs_sb_has_blkzoned(sbi->sb)) set_opt_mode(sbi, F2FS_MOUNT_LFS); else @@ -1419,6 +1449,57 @@ static void default_options(struct f2fs_sb_info *sbi) #ifdef CONFIG_QUOTA static int f2fs_enable_quotas(struct super_block *sb); #endif + +static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) +{ + struct cp_control cpc; + int err; + + sbi->sb->s_flags |= SB_ACTIVE; + + mutex_lock(&sbi->gc_mutex); + f2fs_update_time(sbi, DISABLE_TIME); + + while (!f2fs_time_over(sbi, DISABLE_TIME)) { + err = f2fs_gc(sbi, true, false, NULL_SEGNO); + if (err == -ENODATA) + break; + if (err && err != -EAGAIN) { + mutex_unlock(&sbi->gc_mutex); + return err; + } + } + mutex_unlock(&sbi->gc_mutex); + + err = sync_filesystem(sbi->sb); + if (err) + return err; + + if (f2fs_disable_cp_again(sbi)) + return -EAGAIN; + + mutex_lock(&sbi->gc_mutex); + cpc.reason = CP_PAUSE; + set_sbi_flag(sbi, SBI_CP_DISABLED); + f2fs_write_checkpoint(sbi, &cpc); + + sbi->unusable_block_count = 0; + mutex_unlock(&sbi->gc_mutex); + return 0; +} + +static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) +{ + mutex_lock(&sbi->gc_mutex); + f2fs_dirty_to_prefree(sbi); + + clear_sbi_flag(sbi, SBI_CP_DISABLED); + set_sbi_flag(sbi, SBI_IS_DIRTY); + mutex_unlock(&sbi->gc_mutex); + + f2fs_sync_fs(sbi->sb, 1); +} + static int f2fs_remount(struct super_block *sb, int *flags, char *data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -1428,6 +1509,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_gc = false; bool need_stop_gc = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); + bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT); + bool checkpoint_changed; #ifdef CONFIG_QUOTA int i, j; #endif @@ -1472,6 +1555,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) err = parse_options(sb, data); if (err) goto restore_opts; + checkpoint_changed = + disable_checkpoint != test_opt(sbi, DISABLE_CHECKPOINT); /* * Previous and new state of filesystem is RO, @@ -1485,7 +1570,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; - } else if (f2fs_readonly(sb) && !(*flags & MS_RDONLY)) { + } else if (f2fs_readonly(sb) && !(*flags & SB_RDONLY)) { /* dquot_resume needs RW */ sb->s_flags &= ~SB_RDONLY; if (sb_any_quota_suspended(sb)) { @@ -1505,6 +1590,13 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { + err = -EINVAL; + f2fs_msg(sbi->sb, KERN_WARNING, + "disabling checkpoint not compatible with read-only"); + goto restore_opts; + } + /* * We stop the GC thread if FS is mounted as RO * or if background_gc = off is passed in mount @@ -1533,6 +1625,16 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) clear_sbi_flag(sbi, SBI_IS_CLOSE); } + if (checkpoint_changed) { + if (test_opt(sbi, DISABLE_CHECKPOINT)) { + err = f2fs_disable_checkpoint(sbi); + if (err) + goto restore_gc; + } else { + f2fs_enable_checkpoint(sbi); + } + } + /* * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. @@ -1556,6 +1658,7 @@ skip: (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); limit_reserve_root(sbi); + *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); return 0; restore_gc: if (need_restart_gc) { @@ -1608,6 +1711,7 @@ repeat: congestion_wait(BLK_RW_ASYNC, HZ/50); goto repeat; } + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); return PTR_ERR(page); } @@ -1619,6 +1723,7 @@ repeat: } if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); return -EIO; } @@ -1660,6 +1765,7 @@ retry: congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); break; } @@ -1696,6 +1802,12 @@ static qsize_t *f2fs_get_reserved_space(struct inode *inode) static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) { + if (is_set_ckpt_flags(sbi, CP_QUOTA_NEED_FSCK_FLAG)) { + f2fs_msg(sbi->sb, KERN_ERR, + "quota sysfile may be corrupted, skip loading it"); + return 0; + } + return dquot_quota_on_mount(sbi->sb, F2FS_OPTION(sbi).s_qf_names[type], F2FS_OPTION(sbi).s_jquota_fmt, type); } @@ -1766,7 +1878,14 @@ static int f2fs_enable_quotas(struct super_block *sb) test_opt(F2FS_SB(sb), PRJQUOTA), }; - sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; + if (is_set_ckpt_flags(F2FS_SB(sb), CP_QUOTA_NEED_FSCK_FLAG)) { + f2fs_msg(sb, KERN_ERR, + "quota file may be corrupted, skip loading it"); + return 0; + } + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; + for (type = 0; type < MAXQUOTAS; type++) { qf_inum = f2fs_qf_ino(sb, type); if (qf_inum) { @@ -1780,6 +1899,8 @@ static int f2fs_enable_quotas(struct super_block *sb) "fsck to fix.", type, err); for (type--; type >= 0; type--) dquot_quota_off(sb, type); + set_sbi_flag(F2FS_SB(sb), + SBI_QUOTA_NEED_REPAIR); return err; } } @@ -1787,35 +1908,51 @@ static int f2fs_enable_quotas(struct super_block *sb) return 0; } -static int f2fs_quota_sync(struct super_block *sb, int type) +int f2fs_quota_sync(struct super_block *sb, int type) { + struct f2fs_sb_info *sbi = F2FS_SB(sb); struct quota_info *dqopt = sb_dqopt(sb); int cnt; int ret; ret = dquot_writeback_dquots(sb, type); if (ret) - return ret; + goto out; /* * Now when everything is written we can discard the pagecache so * that userspace sees the changes. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + struct address_space *mapping; + if (type != -1 && cnt != type) continue; if (!sb_has_quota_active(sb, cnt)) continue; - ret = filemap_write_and_wait(dqopt->files[cnt]->i_mapping); + mapping = dqopt->files[cnt]->i_mapping; + + ret = filemap_fdatawrite(mapping); + if (ret) + goto out; + + /* if we are using journalled quota */ + if (is_journalled_quota(sbi)) + continue; + + ret = filemap_fdatawait(mapping); if (ret) - return ret; + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); inode_lock(dqopt->files[cnt]); truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); inode_unlock(dqopt->files[cnt]); } - return 0; +out: + if (ret) + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); + return ret; } static int f2fs_quota_on(struct super_block *sb, int type, int format_id, @@ -1836,8 +1973,7 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id, inode_lock(inode); F2FS_I(inode)->i_flags |= F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL; - inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, - S_NOATIME | S_IMMUTABLE); + f2fs_set_inode_flags(inode); inode_unlock(inode); f2fs_mark_inode_dirty_sync(inode, false); @@ -1852,7 +1988,9 @@ static int f2fs_quota_off(struct super_block *sb, int type) if (!inode || !igrab(inode)) return dquot_quota_off(sb, type); - f2fs_quota_sync(sb, type); + err = f2fs_quota_sync(sb, type); + if (err) + goto out_put; err = dquot_quota_off(sb, type); if (err || f2fs_sb_has_quota_ino(sb)) @@ -1860,7 +1998,7 @@ static int f2fs_quota_off(struct super_block *sb, int type) inode_lock(inode); F2FS_I(inode)->i_flags &= ~(F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL); - inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); + f2fs_set_inode_flags(inode); inode_unlock(inode); f2fs_mark_inode_dirty_sync(inode, false); out_put: @@ -1871,9 +2009,88 @@ out_put: void f2fs_quota_off_umount(struct super_block *sb) { int type; + int err; + + for (type = 0; type < MAXQUOTAS; type++) { + err = f2fs_quota_off(sb, type); + if (err) { + int ret = dquot_quota_off(sb, type); + + f2fs_msg(sb, KERN_ERR, + "Fail to turn off disk quota " + "(type: %d, err: %d, ret:%d), Please " + "run fsck to fix it.", type, err, ret); + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); + } + } +} + +static void f2fs_truncate_quota_inode_pages(struct super_block *sb) +{ + struct quota_info *dqopt = sb_dqopt(sb); + int type; + + for (type = 0; type < MAXQUOTAS; type++) { + if (!dqopt->files[type]) + continue; + f2fs_inode_synced(dqopt->files[type]); + } +} + +static int f2fs_dquot_commit(struct dquot *dquot) +{ + int ret; + + ret = dquot_commit(dquot); + if (ret < 0) + set_sbi_flag(F2FS_SB(dquot->dq_sb), SBI_QUOTA_NEED_REPAIR); + return ret; +} + +static int f2fs_dquot_acquire(struct dquot *dquot) +{ + int ret; + + ret = dquot_acquire(dquot); + if (ret < 0) + set_sbi_flag(F2FS_SB(dquot->dq_sb), SBI_QUOTA_NEED_REPAIR); + + return ret; +} + +static int f2fs_dquot_release(struct dquot *dquot) +{ + int ret; + + ret = dquot_release(dquot); + if (ret < 0) + set_sbi_flag(F2FS_SB(dquot->dq_sb), SBI_QUOTA_NEED_REPAIR); + return ret; +} + +static int f2fs_dquot_mark_dquot_dirty(struct dquot *dquot) +{ + struct super_block *sb = dquot->dq_sb; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int ret; + + ret = dquot_mark_dquot_dirty(dquot); + + /* if we are using journalled quota */ + if (is_journalled_quota(sbi)) + set_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH); + + return ret; +} - for (type = 0; type < MAXQUOTAS; type++) - f2fs_quota_off(sb, type); +static int f2fs_dquot_commit_info(struct super_block *sb, int type) +{ + int ret; + + ret = dquot_commit_info(sb, type); + if (ret < 0) + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); + return ret; } static int f2fs_get_projid(struct inode *inode, kprojid_t *projid) @@ -1884,11 +2101,11 @@ static int f2fs_get_projid(struct inode *inode, kprojid_t *projid) static const struct dquot_operations f2fs_quota_operations = { .get_reserved_space = f2fs_get_reserved_space, - .write_dquot = dquot_commit, - .acquire_dquot = dquot_acquire, - .release_dquot = dquot_release, - .mark_dirty = dquot_mark_dquot_dirty, - .write_info = dquot_commit_info, + .write_dquot = f2fs_dquot_commit, + .acquire_dquot = f2fs_dquot_acquire, + .release_dquot = f2fs_dquot_release, + .mark_dirty = f2fs_dquot_mark_dquot_dirty, + .write_info = f2fs_dquot_commit_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, .get_projid = f2fs_get_projid, @@ -1906,6 +2123,11 @@ static const struct quotactl_ops f2fs_quotactl_ops = { .get_nextdqblk = dquot_get_next_dqblk, }; #else +int f2fs_quota_sync(struct super_block *sb, int type) +{ + return 0; +} + void f2fs_quota_off_umount(struct super_block *sb) { } @@ -2170,6 +2392,26 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, (bh->b_data + F2FS_SUPER_OFFSET); struct super_block *sb = sbi->sb; unsigned int blocksize; + size_t crc_offset = 0; + __u32 crc = 0; + + /* Check checksum_offset and crc in superblock */ + if (le32_to_cpu(raw_super->feature) & F2FS_FEATURE_SB_CHKSUM) { + crc_offset = le32_to_cpu(raw_super->checksum_offset); + if (crc_offset != + offsetof(struct f2fs_super_block, crc)) { + f2fs_msg(sb, KERN_INFO, + "Invalid SB checksum offset: %zu", + crc_offset); + return 1; + } + crc = le32_to_cpu(raw_super->crc); + if (!f2fs_crc_valid(sbi, crc, raw_super, crc_offset)) { + f2fs_msg(sb, KERN_INFO, + "Invalid SB checksum value: %u", crc); + return 1; + } + } if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) { f2fs_msg(sb, KERN_INFO, @@ -2320,7 +2562,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int segment_count_main; unsigned int cp_pack_start_sum, cp_payload; block_t user_block_count; - int i; + int i, j; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -2361,11 +2603,43 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) return 1; + for (j = i + 1; j < NR_CURSEG_NODE_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) == + le32_to_cpu(ckpt->cur_node_segno[j])) { + f2fs_msg(sbi->sb, KERN_ERR, + "Node segment (%u, %u) has the same " + "segno: %u", i, j, + le32_to_cpu(ckpt->cur_node_segno[i])); + return 1; + } + } } for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) return 1; + for (j = i + 1; j < NR_CURSEG_DATA_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) == + le32_to_cpu(ckpt->cur_data_segno[j])) { + f2fs_msg(sbi->sb, KERN_ERR, + "Data segment (%u, %u) has the same " + "segno: %u", i, j, + le32_to_cpu(ckpt->cur_data_segno[i])); + return 1; + } + } + } + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + for (j = i; j < NR_CURSEG_DATA_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) == + le32_to_cpu(ckpt->cur_data_segno[j])) { + f2fs_msg(sbi->sb, KERN_ERR, + "Data segment (%u) and Data segment (%u)" + " has the same segno: %u", i, j, + le32_to_cpu(ckpt->cur_node_segno[i])); + return 1; + } + } } sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); @@ -2423,6 +2697,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; + sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL; + sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL; + sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); for (i = 0; i < NR_COUNT_TYPE; i++) @@ -2453,8 +2730,12 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) if (err) return err; - return percpu_counter_init(&sbi->total_valid_inode_count, 0, + err = percpu_counter_init(&sbi->total_valid_inode_count, 0, GFP_KERNEL); + if (err) + percpu_counter_destroy(&sbi->alloc_valid_block_count); + + return err; } #ifdef CONFIG_BLK_DEV_ZONED @@ -2589,6 +2870,7 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi, int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) { struct buffer_head *bh; + __u32 crc = 0; int err; if ((recover && f2fs_readonly(sbi->sb)) || @@ -2597,6 +2879,13 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return -EROFS; } + /* we should update superblock crc here */ + if (!recover && f2fs_sb_has_sb_chksum(sbi->sb)) { + crc = f2fs_crc32(sbi, F2FS_RAW_SUPER(sbi), + offsetof(struct f2fs_super_block, crc)); + F2FS_RAW_SUPER(sbi)->crc = cpu_to_le32(crc); + } + /* write back-up superblock first */ bh = sb_bread(sbi->sb, sbi->valid_super_block ? 0 : 1); if (!bh) @@ -2866,7 +3155,7 @@ try_onemore: GFP_KERNEL); if (!sbi->write_io[i]) { err = -ENOMEM; - goto free_options; + goto free_bio_info; } for (j = HOT; j < n; j++) { @@ -2909,6 +3198,9 @@ try_onemore: goto free_meta_inode; } + if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_QUOTA_NEED_FSCK_FLAG)) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + /* Initialize device list */ err = f2fs_scan_devices(sbi); if (err) { @@ -3007,11 +3299,9 @@ try_onemore: /* Enable quota usage during mount */ if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) { err = f2fs_enable_quotas(sb); - if (err) { + if (err) f2fs_msg(sb, KERN_ERR, "Cannot turn on quotas: error %d", err); - goto free_sysfs; - } } #endif /* if there are nt orphan nodes free them */ @@ -3019,6 +3309,9 @@ try_onemore: if (err) goto free_meta; + if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) + goto skip_recovery; + /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { /* @@ -3058,6 +3351,14 @@ skip_recovery: /* f2fs_recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); + if (test_opt(sbi, DISABLE_CHECKPOINT)) { + err = f2fs_disable_checkpoint(sbi); + if (err) + goto free_meta; + } else if (is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)) { + f2fs_enable_checkpoint(sbi); + } + /* * If filesystem is not mounted as read-only then * do start the gc_thread. @@ -3090,10 +3391,10 @@ skip_recovery: free_meta: #ifdef CONFIG_QUOTA + f2fs_truncate_quota_inode_pages(sb); if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) f2fs_quota_off_umount(sbi->sb); #endif - f2fs_sync_inode_meta(sbi); /* * Some dirty meta pages can be produced by f2fs_recover_orphan_inodes() * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() @@ -3101,9 +3402,6 @@ free_meta: * falls into an infinite loop in f2fs_sync_meta_pages(). */ truncate_inode_pages_final(META_MAPPING(sbi)); -#ifdef CONFIG_QUOTA -free_sysfs: -#endif f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); @@ -3175,6 +3473,9 @@ static void kill_f2fs_super(struct super_block *sb) }; f2fs_write_checkpoint(sbi, &cpc); } + + if (is_sbi_flag_set(sbi, SBI_IS_RECOVERED) && f2fs_readonly(sb)) + sb->s_flags &= ~SB_RDONLY; } kill_block_super(sb); } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 81c0e5337443..b777cbdd796b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -1,13 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* * f2fs sysfs interface * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ * Copyright (c) 2017 Chao Yu <chao@kernel.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/compiler.h> #include <linux/proc_fs.h> @@ -120,6 +117,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_lost_found(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "lost_found"); + if (f2fs_sb_has_sb_chksum(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "sb_checksum"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -337,6 +337,7 @@ enum feat_id { FEAT_QUOTA_INO, FEAT_INODE_CRTIME, FEAT_LOST_FOUND, + FEAT_SB_CHECKSUM, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -353,6 +354,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_QUOTA_INO: case FEAT_INODE_CRTIME: case FEAT_LOST_FOUND: + case FEAT_SB_CHECKSUM: return snprintf(buf, PAGE_SIZE, "supported\n"); } return 0; @@ -407,6 +409,9 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval, + interval_time[DISCARD_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); @@ -434,6 +439,7 @@ F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME); F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND); +F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -460,6 +466,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(dirty_nats_ratio), ATTR_LIST(cp_interval), ATTR_LIST(idle_interval), + ATTR_LIST(discard_idle_interval), + ATTR_LIST(gc_idle_interval), ATTR_LIST(iostat_enable), ATTR_LIST(readdir_ra), ATTR_LIST(gc_pin_file_thresh), @@ -491,6 +499,7 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(quota_ino), ATTR_LIST(inode_crtime), ATTR_LIST(lost_found), + ATTR_LIST(sb_checksum), NULL, }; diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index a1fcd00bbb2b..ce2a5eb210b6 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * f2fs IO tracer * * Copyright (c) 2014 Motorola Mobility * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/fs.h> #include <linux/f2fs_fs.h> diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h index 67db24ac1e85..e8075fc5b228 100644 --- a/fs/f2fs/trace.h +++ b/fs/f2fs/trace.h @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * f2fs IO tracer * * Copyright (c) 2014 Motorola Mobility * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #ifndef __F2FS_TRACE_H__ #define __F2FS_TRACE_H__ diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 77a010e625f5..7261245c208d 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/xattr.c * @@ -13,10 +14,6 @@ * suggestion of Luka Renko <luka.renko@hermes.si>. * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>, * Red Hat Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/rwsem.h> #include <linux/f2fs_fs.h> diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index dbcd1d16e669..67db134da0f5 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/xattr.h * @@ -9,10 +10,6 @@ * On-disk format of extended attributes for the ext2 filesystem. * * (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #ifndef __F2FS_XATTR_H__ #define __F2FS_XATTR_H__ diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index defc2168de91..f58c0cacc531 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -682,6 +682,7 @@ int fat_count_free_clusters(struct super_block *sb) if (ops->ent_get(&fatent) == FAT_ENT_FREE) free++; } while (fat_ent_next(sbi, &fatent)); + cond_resched(); } sbi->free_clusters = free; sbi->free_clus_valid = 1; diff --git a/fs/fcntl.c b/fs/fcntl.c index 4137d96534a6..083185174c6d 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -735,7 +735,7 @@ static void send_sigio_to_task(struct task_struct *p, return; switch (signum) { - siginfo_t si; + kernel_siginfo_t si; default: /* Queue a rt signal with the appropriate fd as its value. We use SI_SIGIO as the source, not diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 83bfe04456b6..c550512ce335 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -70,20 +70,7 @@ void fscache_free_cookie(struct fscache_cookie *cookie) } /* - * initialise an cookie jar slab element prior to any use - */ -void fscache_cookie_init_once(void *_cookie) -{ - struct fscache_cookie *cookie = _cookie; - - memset(cookie, 0, sizeof(*cookie)); - spin_lock_init(&cookie->lock); - spin_lock_init(&cookie->stores_lock); - INIT_HLIST_HEAD(&cookie->backing_objects); -} - -/* - * Set the index key in a cookie. The cookie struct has space for a 12-byte + * Set the index key in a cookie. The cookie struct has space for a 16-byte * key plus length and hash, but if that's not big enough, it's instead a * pointer to a buffer containing 3 bytes of hash, 1 byte of length and then * the key data. @@ -93,20 +80,18 @@ static int fscache_set_key(struct fscache_cookie *cookie, { unsigned long long h; u32 *buf; + int bufs; int i; - cookie->key_len = index_key_len; + bufs = DIV_ROUND_UP(index_key_len, sizeof(*buf)); if (index_key_len > sizeof(cookie->inline_key)) { - buf = kzalloc(index_key_len, GFP_KERNEL); + buf = kcalloc(bufs, sizeof(*buf), GFP_KERNEL); if (!buf) return -ENOMEM; cookie->key = buf; } else { buf = (u32 *)cookie->inline_key; - buf[0] = 0; - buf[1] = 0; - buf[2] = 0; } memcpy(buf, index_key, index_key_len); @@ -116,7 +101,8 @@ static int fscache_set_key(struct fscache_cookie *cookie, */ h = (unsigned long)cookie->parent; h += index_key_len + cookie->type; - for (i = 0; i < (index_key_len + sizeof(u32) - 1) / sizeof(u32); i++) + + for (i = 0; i < bufs; i++) h += buf[i]; cookie->key_hash = h ^ (h >> 32); @@ -161,7 +147,7 @@ struct fscache_cookie *fscache_alloc_cookie( struct fscache_cookie *cookie; /* allocate and initialise a cookie */ - cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL); + cookie = kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL); if (!cookie) return NULL; @@ -192,6 +178,9 @@ struct fscache_cookie *fscache_alloc_cookie( cookie->netfs_data = netfs_data; cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET); cookie->type = def->type; + spin_lock_init(&cookie->lock); + spin_lock_init(&cookie->stores_lock); + INIT_HLIST_HEAD(&cookie->backing_objects); /* radix tree insertion won't use the preallocation pool unless it's * told it may not wait */ diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index f83328a7f048..d6209022e965 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -51,7 +51,6 @@ extern struct fscache_cache *fscache_select_cache_for_object( extern struct kmem_cache *fscache_cookie_jar; extern void fscache_free_cookie(struct fscache_cookie *); -extern void fscache_cookie_init_once(void *); extern struct fscache_cookie *fscache_alloc_cookie(struct fscache_cookie *, const struct fscache_cookie_def *, const void *, size_t, diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 7dce110bf17d..30ad89db1efc 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -143,9 +143,7 @@ static int __init fscache_init(void) fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar", sizeof(struct fscache_cookie), - 0, - 0, - fscache_cookie_init_once); + 0, 0, NULL); if (!fscache_cookie_jar) { pr_notice("Failed to allocate a cookie jar\n"); ret = -ENOMEM; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 03128ed1f34e..a683d9b27d76 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1057,7 +1057,7 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, } } release_metapath(&mp); - if (gfs2_is_jdata(ip)) + if (!gfs2_is_stuffed(ip) && gfs2_is_jdata(ip)) iomap->page_done = gfs2_iomap_journaled_page_done; return 0; @@ -1566,7 +1566,7 @@ more_rgrps: continue; } if (bstart) { - __gfs2_free_blocks(ip, bstart, (u32)blen, meta); + __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta); (*btotal) += blen; gfs2_add_inode_blocks(&ip->i_inode, -blen); } @@ -1574,7 +1574,7 @@ more_rgrps: blen = 1; } if (bstart) { - __gfs2_free_blocks(ip, bstart, (u32)blen, meta); + __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta); (*btotal) += blen; gfs2_add_inode_blocks(&ip->i_inode, -blen); } diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index e37002560c11..daa14ab4e31b 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -506,7 +506,8 @@ static int gfs2_dirent_gather(const struct gfs2_dirent *dent, * For now the most important thing is to check that the various sizes * are correct. */ -static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset, +static int gfs2_check_dirent(struct gfs2_sbd *sdp, + struct gfs2_dirent *dent, unsigned int offset, unsigned int size, unsigned int len, int first) { const char *msg = "gfs2_dirent too small"; @@ -528,12 +529,12 @@ static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset, goto error; return 0; error: - pr_warn("%s: %s (%s)\n", + fs_warn(sdp, "%s: %s (%s)\n", __func__, msg, first ? "first in block" : "not first in block"); return -EIO; } -static int gfs2_dirent_offset(const void *buf) +static int gfs2_dirent_offset(struct gfs2_sbd *sdp, const void *buf) { const struct gfs2_meta_header *h = buf; int offset; @@ -552,7 +553,8 @@ static int gfs2_dirent_offset(const void *buf) } return offset; wrong_type: - pr_warn("%s: wrong block type %u\n", __func__, be32_to_cpu(h->mh_type)); + fs_warn(sdp, "%s: wrong block type %u\n", __func__, + be32_to_cpu(h->mh_type)); return -1; } @@ -566,7 +568,7 @@ static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf, unsigned size; int ret = 0; - ret = gfs2_dirent_offset(buf); + ret = gfs2_dirent_offset(GFS2_SB(inode), buf); if (ret < 0) goto consist_inode; @@ -574,7 +576,7 @@ static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf, prev = NULL; dent = buf + offset; size = be16_to_cpu(dent->de_rec_len); - if (gfs2_check_dirent(dent, offset, size, len, 1)) + if (gfs2_check_dirent(GFS2_SB(inode), dent, offset, size, len, 1)) goto consist_inode; do { ret = scan(dent, name, opaque); @@ -586,7 +588,8 @@ static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf, prev = dent; dent = buf + offset; size = be16_to_cpu(dent->de_rec_len); - if (gfs2_check_dirent(dent, offset, size, len, 0)) + if (gfs2_check_dirent(GFS2_SB(inode), dent, offset, size, + len, 0)) goto consist_inode; } while(1); @@ -1043,7 +1046,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) len = BIT(dip->i_depth - be16_to_cpu(oleaf->lf_depth)); half_len = len >> 1; if (!half_len) { - pr_warn("i_depth %u lf_depth %u index %u\n", + fs_warn(GFS2_SB(inode), "i_depth %u lf_depth %u index %u\n", dip->i_depth, be16_to_cpu(oleaf->lf_depth), index); gfs2_consist_inode(dip); error = -EIO; @@ -1351,7 +1354,7 @@ static int gfs2_set_cookies(struct gfs2_sbd *sdp, struct buffer_head *bh, if (!sdp->sd_args.ar_loccookie) continue; offset = (char *)(darr[i]) - - (bh->b_data + gfs2_dirent_offset(bh->b_data)); + (bh->b_data + gfs2_dirent_offset(sdp, bh->b_data)); offset /= GFS2_MIN_DIRENT_SIZE; offset += leaf_nr * sdp->sd_max_dents_per_leaf; if (offset >= GFS2_USE_HASH_FLAG || @@ -2018,7 +2021,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, l_blocks++; } - gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); + gfs2_rlist_alloc(&rlist); for (x = 0; x < rlist.rl_rgrps; x++) { struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl); @@ -2039,6 +2042,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, bh = leaf_bh; for (blk = leaf_no; blk; blk = nblk) { + struct gfs2_rgrpd *rgd; + if (blk != leaf_no) { error = get_leaf(dip, blk, &bh); if (error) @@ -2049,7 +2054,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (blk != leaf_no) brelse(bh); - gfs2_free_meta(dip, blk, 1); + rgd = gfs2_blk2rgrpd(sdp, blk, true); + gfs2_free_meta(dip, rgd, blk, 1); gfs2_add_inode_blocks(&dip->i_inode, -1); } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 08369c6cd127..45a17b770d97 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -314,6 +314,17 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr) return do_gfs2_set_flags(filp, gfsflags, mask); } +static int gfs2_getlabel(struct file *filp, char __user *label) +{ + struct inode *inode = file_inode(filp); + struct gfs2_sbd *sdp = GFS2_SB(inode); + + if (copy_to_user(label, sdp->sd_sb.sb_locktable, GFS2_LOCKNAME_LEN)) + return -EFAULT; + + return 0; +} + static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch(cmd) { @@ -323,7 +334,10 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return gfs2_set_flags(filp, (u32 __user *)arg); case FITRIM: return gfs2_fitrim(filp, (void __user *)arg); + case FS_IOC_GETFSLABEL: + return gfs2_getlabel(filp, (char __user *)arg); } + return -ENOTTY; } @@ -347,8 +361,8 @@ static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size) size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift; int hint = min_t(size_t, INT_MAX, blks); - if (hint > atomic_read(&ip->i_res.rs_sizehint)) - atomic_set(&ip->i_res.rs_sizehint, hint); + if (hint > atomic_read(&ip->i_sizehint)) + atomic_set(&ip->i_sizehint, hint); } /** diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 4614ee25f621..05431324b262 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -494,7 +494,8 @@ retry: do_xmote(gl, gh, LM_ST_UNLOCKED); break; default: /* Everything else */ - pr_err("wanted %u got %u\n", gl->gl_target, state); + fs_err(gl->gl_name.ln_sbd, "wanted %u got %u\n", + gl->gl_target, state); GLOCK_BUG_ON(gl, 1); } spin_unlock(&gl->gl_lockref.lock); @@ -577,7 +578,7 @@ __acquires(&gl->gl_lockref.lock) gfs2_glock_queue_work(gl, 0); } else if (ret) { - pr_err("lm_lock ret %d\n", ret); + fs_err(sdp, "lm_lock ret %d\n", ret); GLOCK_BUG_ON(gl, !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)); } @@ -1064,13 +1065,13 @@ do_cancel: return; trap_recursive: - pr_err("original: %pSR\n", (void *)gh2->gh_ip); - pr_err("pid: %d\n", pid_nr(gh2->gh_owner_pid)); - pr_err("lock type: %d req lock state : %d\n", + fs_err(sdp, "original: %pSR\n", (void *)gh2->gh_ip); + fs_err(sdp, "pid: %d\n", pid_nr(gh2->gh_owner_pid)); + fs_err(sdp, "lock type: %d req lock state : %d\n", gh2->gh_gl->gl_name.ln_type, gh2->gh_state); - pr_err("new: %pSR\n", (void *)gh->gh_ip); - pr_err("pid: %d\n", pid_nr(gh->gh_owner_pid)); - pr_err("lock type: %d req lock state : %d\n", + fs_err(sdp, "new: %pSR\n", (void *)gh->gh_ip); + fs_err(sdp, "pid: %d\n", pid_nr(gh->gh_owner_pid)); + fs_err(sdp, "lock type: %d req lock state : %d\n", gh->gh_gl->gl_name.ln_type, gh->gh_state); gfs2_dump_glock(NULL, gl); BUG(); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index b96d39c28e17..888b62cfd6d1 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -92,7 +92,7 @@ struct gfs2_bitmap { unsigned long bi_flags; u32 bi_offset; u32 bi_start; - u32 bi_len; + u32 bi_bytes; u32 bi_blocks; }; @@ -309,10 +309,6 @@ struct gfs2_qadata { /* quota allocation data */ */ struct gfs2_blkreserv { - /* components used during write (step 1): */ - atomic_t rs_sizehint; /* hint of the write size */ - - struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */ struct rb_node rs_node; /* link to other block reservations */ struct gfs2_rbm rs_rbm; /* Start of reservation */ u32 rs_free; /* how many blocks are still free */ @@ -417,8 +413,10 @@ struct gfs2_inode { struct gfs2_holder i_iopen_gh; struct gfs2_holder i_gh; /* for prepare/commit_write only */ struct gfs2_qadata *i_qadata; /* quota allocation data */ + struct gfs2_holder i_rgd_gh; struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */ u64 i_goal; /* goal block for allocations */ + atomic_t i_sizehint; /* hint of the write size */ struct rw_semaphore i_rw_mutex; struct list_head i_ordered; struct list_head i_trunc_list; @@ -623,6 +621,7 @@ enum { SDF_RORECOVERY = 7, /* read only recovery */ SDF_SKIP_DLM_UNLOCK = 8, SDF_FORCE_AIL_FLUSH = 9, + SDF_AIL1_IO_ERROR = 10, }; enum gfs2_freeze_state { diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index ac7caa267ed6..31df26ed7854 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -177,14 +177,14 @@ static void gdlm_bast(void *arg, int mode) gfs2_glock_cb(gl, LM_ST_SHARED); break; default: - pr_err("unknown bast mode %d\n", mode); + fs_err(gl->gl_name.ln_sbd, "unknown bast mode %d\n", mode); BUG(); } } /* convert gfs lock-state to dlm lock-mode */ -static int make_mode(const unsigned int lmstate) +static int make_mode(struct gfs2_sbd *sdp, const unsigned int lmstate) { switch (lmstate) { case LM_ST_UNLOCKED: @@ -196,7 +196,7 @@ static int make_mode(const unsigned int lmstate) case LM_ST_SHARED: return DLM_LOCK_PR; } - pr_err("unknown LM state %d\n", lmstate); + fs_err(sdp, "unknown LM state %d\n", lmstate); BUG(); return -1; } @@ -257,7 +257,7 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, u32 lkf; char strname[GDLM_STRNAME_BYTES] = ""; - req = make_mode(req_state); + req = make_mode(gl->gl_name.ln_sbd, req_state); lkf = make_flags(gl, flags, req); gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); @@ -309,7 +309,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl) error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, NULL, gl); if (error) { - pr_err("gdlm_unlock %x,%llx err=%d\n", + fs_err(sdp, "gdlm_unlock %x,%llx err=%d\n", gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number, error); return; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index ee20ea42e7b5..99dd58694ba1 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -108,7 +108,9 @@ __acquires(&sdp->sd_ail_lock) gfs2_assert(sdp, bd->bd_tr == tr); if (!buffer_busy(bh)) { - if (!buffer_uptodate(bh)) { + if (!buffer_uptodate(bh) && + !test_and_set_bit(SDF_AIL1_IO_ERROR, + &sdp->sd_flags)) { gfs2_io_error_bh(sdp, bh); *withdraw = true; } @@ -206,7 +208,8 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, gfs2_assert(sdp, bd->bd_tr == tr); if (buffer_busy(bh)) continue; - if (!buffer_uptodate(bh)) { + if (!buffer_uptodate(bh) && + !test_and_set_bit(SDF_AIL1_IO_ERROR, &sdp->sd_flags)) { gfs2_io_error_bh(sdp, bh); *withdraw = true; } @@ -618,7 +621,7 @@ void gfs2_write_revokes(struct gfs2_sbd *sdp) gfs2_ail1_empty(sdp); spin_lock(&sdp->sd_ail_lock); - list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) { list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) { if (list_empty(&bd->bd_list)) { have_revokes = 1; @@ -642,7 +645,7 @@ done: } gfs2_log_lock(sdp); spin_lock(&sdp->sd_ail_lock); - list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) { list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) { if (max_revokes == 0) goto out_of_blocks; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index f2567f958d00..4c7069b8f3c1 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -81,7 +81,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd) if (sdp->sd_args.ar_discard) gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL); memcpy(bi->bi_clone + bi->bi_offset, - bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); + bd->bd_bh->b_data + bi->bi_offset, bi->bi_bytes); clear_bit(GBF_FULL, &bi->bi_flags); rgd->rd_free_clone = rgd->rd_free; rgd->rd_extfail_pt = rgd->rd_free; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 2d55e2c3333c..c7603063f861 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -39,9 +39,11 @@ static void gfs2_init_inode_once(void *foo) struct gfs2_inode *ip = foo; inode_init_once(&ip->i_inode); + atomic_set(&ip->i_sizehint, 0); init_rwsem(&ip->i_rw_mutex); INIT_LIST_HEAD(&ip->i_trunc_list); ip->i_qadata = NULL; + gfs2_holder_mark_uninitialized(&ip->i_rgd_gh); memset(&ip->i_res, 0, sizeof(ip->i_res)); RB_CLEAR_NODE(&ip->i_res.rs_node); ip->i_hash_cache = NULL; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index c2469833b4fb..b041cb8ae383 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -72,13 +72,13 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) if (!sdp) return NULL; - sb->s_fs_info = sdp; sdp->sd_vfs = sb; sdp->sd_lkstats = alloc_percpu(struct gfs2_pcpu_lkstats); if (!sdp->sd_lkstats) { kfree(sdp); return NULL; } + sb->s_fs_info = sdp; set_bit(SDF_NOJOURNALID, &sdp->sd_flags); gfs2_tune_init(&sdp->sd_tune); @@ -1333,6 +1333,9 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, struct path path; int error; + if (!dev_name || !*dev_name) + return ERR_PTR(-EINVAL); + error = kern_path(dev_name, LOOKUP_FOLLOW, &path); if (error) { pr_warn("path_lookup on %s returned error %d\n", diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 0efae7a0ee80..2ae5a109eea7 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1183,7 +1183,7 @@ static int print_message(struct gfs2_quota_data *qd, char *type) * * Returns: 0 on success. * min_req = ap->min_target ? ap->min_target : ap->target; - * quota must allow atleast min_req blks for success and + * quota must allow at least min_req blks for success and * ap->allowed is set to the number of blocks allowed * * -EDQUOT otherwise, quota violation. ap->allowed is set to number diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 1ad3256b9cbc..ffe3032b1043 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -90,7 +90,7 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone, { unsigned char *byte1, *byte2, *end, cur_state; struct gfs2_bitmap *bi = rbm_bi(rbm); - unsigned int buflen = bi->bi_len; + unsigned int buflen = bi->bi_bytes; const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE; byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY); @@ -101,12 +101,16 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone, cur_state = (*byte1 >> bit) & GFS2_BIT_MASK; if (unlikely(!valid_change[new_state * 4 + cur_state])) { - pr_warn("buf_blk = 0x%x old_state=%d, new_state=%d\n", + struct gfs2_sbd *sdp = rbm->rgd->rd_sbd; + + fs_warn(sdp, "buf_blk = 0x%x old_state=%d, new_state=%d\n", rbm->offset, cur_state, new_state); - pr_warn("rgrp=0x%llx bi_start=0x%x\n", - (unsigned long long)rbm->rgd->rd_addr, bi->bi_start); - pr_warn("bi_offset=0x%x bi_len=0x%x\n", - bi->bi_offset, bi->bi_len); + fs_warn(sdp, "rgrp=0x%llx bi_start=0x%x biblk: 0x%llx\n", + (unsigned long long)rbm->rgd->rd_addr, bi->bi_start, + (unsigned long long)bi->bi_bh->b_blocknr); + fs_warn(sdp, "bi_offset=0x%x bi_bytes=0x%x block=0x%llx\n", + bi->bi_offset, bi->bi_bytes, + (unsigned long long)gfs2_rbm_to_block(rbm)); dump_stack(); gfs2_consist_rgrpd(rbm->rgd); return; @@ -269,15 +273,10 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len, static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block) { - u64 rblock = block - rbm->rgd->rd_data0; - - if (WARN_ON_ONCE(rblock > UINT_MAX)) - return -EINVAL; - if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data) + if (!rgrp_contains_block(rbm->rgd, block)) return -E2BIG; - rbm->bii = 0; - rbm->offset = (u32)(rblock); + rbm->offset = block - rbm->rgd->rd_data0; /* Check if the block is within the first block */ if (rbm->offset < rbm_bi(rbm)->bi_blocks) return 0; @@ -382,7 +381,7 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len) if (bi->bi_clone) start = bi->bi_clone; start += bi->bi_offset; - end = start + bi->bi_len; + end = start + bi->bi_bytes; BUG_ON(rbm.offset & 3); start += (rbm.offset / GFS2_NBBY); bytes = min_t(u32, len / GFS2_NBBY, (end - start)); @@ -467,7 +466,7 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) count[x] += gfs2_bitcount(rgd, bi->bi_bh->b_data + bi->bi_offset, - bi->bi_len, x); + bi->bi_bytes, x); } if (count[0] != rgd->rd_free) { @@ -642,7 +641,10 @@ static void __rs_deltree(struct gfs2_blkreserv *rs) RB_CLEAR_NODE(&rs->rs_node); if (rs->rs_free) { - struct gfs2_bitmap *bi = rbm_bi(&rs->rs_rbm); + u64 last_block = gfs2_rbm_to_block(&rs->rs_rbm) + + rs->rs_free - 1; + struct gfs2_rbm last_rbm = { .rgd = rs->rs_rbm.rgd, }; + struct gfs2_bitmap *start, *last; /* return reserved blocks to the rgrp */ BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free); @@ -653,7 +655,13 @@ static void __rs_deltree(struct gfs2_blkreserv *rs) it will force the number to be recalculated later. */ rgd->rd_extfail_pt += rs->rs_free; rs->rs_free = 0; - clear_bit(GBF_FULL, &bi->bi_flags); + if (gfs2_rbm_from_block(&last_rbm, last_block)) + return; + start = rbm_bi(&rs->rs_rbm); + last = rbm_bi(&last_rbm); + do + clear_bit(GBF_FULL, &start->bi_flags); + while (start++ != last); } } @@ -738,11 +746,13 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd) { - pr_info("ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); - pr_info("ri_length = %u\n", rgd->rd_length); - pr_info("ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0); - pr_info("ri_data = %u\n", rgd->rd_data); - pr_info("ri_bitbytes = %u\n", rgd->rd_bitbytes); + struct gfs2_sbd *sdp = rgd->rd_sbd; + + fs_info(sdp, "ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); + fs_info(sdp, "ri_length = %u\n", rgd->rd_length); + fs_info(sdp, "ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0); + fs_info(sdp, "ri_data = %u\n", rgd->rd_data); + fs_info(sdp, "ri_bitbytes = %u\n", rgd->rd_bitbytes); } /** @@ -780,21 +790,21 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) bytes = bytes_left; bi->bi_offset = sizeof(struct gfs2_rgrp); bi->bi_start = 0; - bi->bi_len = bytes; + bi->bi_bytes = bytes; bi->bi_blocks = bytes * GFS2_NBBY; /* header block */ } else if (x == 0) { bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp); bi->bi_offset = sizeof(struct gfs2_rgrp); bi->bi_start = 0; - bi->bi_len = bytes; + bi->bi_bytes = bytes; bi->bi_blocks = bytes * GFS2_NBBY; /* last block */ } else if (x + 1 == length) { bytes = bytes_left; bi->bi_offset = sizeof(struct gfs2_meta_header); bi->bi_start = rgd->rd_bitbytes - bytes_left; - bi->bi_len = bytes; + bi->bi_bytes = bytes; bi->bi_blocks = bytes * GFS2_NBBY; /* other blocks */ } else { @@ -802,7 +812,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) sizeof(struct gfs2_meta_header); bi->bi_offset = sizeof(struct gfs2_meta_header); bi->bi_start = rgd->rd_bitbytes - bytes_left; - bi->bi_len = bytes; + bi->bi_bytes = bytes; bi->bi_blocks = bytes * GFS2_NBBY; } @@ -814,11 +824,11 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) return -EIO; } bi = rgd->rd_bits + (length - 1); - if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_data) { + if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) { if (gfs2_consist_rgrpd(rgd)) { gfs2_rindex_print(rgd); fs_err(sdp, "start=%u len=%u offset=%u\n", - bi->bi_start, bi->bi_len, bi->bi_offset); + bi->bi_start, bi->bi_bytes, bi->bi_offset); } return -EIO; } @@ -1103,12 +1113,35 @@ static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd) { struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl; struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data; + int valid = 1; - if (rgl->rl_flags != str->rg_flags || rgl->rl_free != str->rg_free || - rgl->rl_dinodes != str->rg_dinodes || - rgl->rl_igeneration != str->rg_igeneration) - return 0; - return 1; + if (rgl->rl_flags != str->rg_flags) { + printk(KERN_WARNING "GFS2: rgd: %llu lvb flag mismatch %u/%u", + (unsigned long long)rgd->rd_addr, + be32_to_cpu(rgl->rl_flags), be32_to_cpu(str->rg_flags)); + valid = 0; + } + if (rgl->rl_free != str->rg_free) { + printk(KERN_WARNING "GFS2: rgd: %llu lvb free mismatch %u/%u", + (unsigned long long)rgd->rd_addr, + be32_to_cpu(rgl->rl_free), be32_to_cpu(str->rg_free)); + valid = 0; + } + if (rgl->rl_dinodes != str->rg_dinodes) { + printk(KERN_WARNING "GFS2: rgd: %llu lvb dinode mismatch %u/%u", + (unsigned long long)rgd->rd_addr, + be32_to_cpu(rgl->rl_dinodes), + be32_to_cpu(str->rg_dinodes)); + valid = 0; + } + if (rgl->rl_igeneration != str->rg_igeneration) { + printk(KERN_WARNING "GFS2: rgd: %llu lvb igen mismatch " + "%llu/%llu", (unsigned long long)rgd->rd_addr, + (unsigned long long)be64_to_cpu(rgl->rl_igeneration), + (unsigned long long)be64_to_cpu(str->rg_igeneration)); + valid = 0; + } + return valid; } static u32 count_unlinked(struct gfs2_rgrpd *rgd) @@ -1122,8 +1155,8 @@ static u32 count_unlinked(struct gfs2_rgrpd *rgd) goal = 0; buffer = bi->bi_bh->b_data + bi->bi_offset; WARN_ON(!buffer_uptodate(bi->bi_bh)); - while (goal < bi->bi_len * GFS2_NBBY) { - goal = gfs2_bitfit(buffer, bi->bi_len, goal, + while (goal < bi->bi_blocks) { + goal = gfs2_bitfit(buffer, bi->bi_bytes, goal, GFS2_BLKST_UNLINKED); if (goal == BFITNOENT) break; @@ -1226,7 +1259,7 @@ static int update_rgrp_lvb(struct gfs2_rgrpd *rgd) rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags); rl_flags &= ~GFS2_RDF_MASK; rgd->rd_flags &= GFS2_RDF_MASK; - rgd->rd_flags |= (rl_flags | GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); + rgd->rd_flags |= (rl_flags | GFS2_RDF_CHECK); if (rgd->rd_rgl->rl_unlinked == 0) rgd->rd_flags &= ~GFS2_RDF_CHECK; rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free); @@ -1295,7 +1328,7 @@ int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, u32 trimmed = 0; u8 diff; - for (x = 0; x < bi->bi_len; x++) { + for (x = 0; x < bi->bi_bytes; x++) { const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data; clone += bi->bi_offset; clone += x; @@ -1541,8 +1574,8 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, if (S_ISDIR(inode->i_mode)) extlen = 1; else { - extlen = max_t(u32, atomic_read(&rs->rs_sizehint), ap->target); - extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks); + extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target); + extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks); } if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen)) return; @@ -1728,7 +1761,7 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, if (state != GFS2_BLKST_UNLINKED && bi->bi_clone) buffer = bi->bi_clone + bi->bi_offset; initial_offset = rbm->offset; - offset = gfs2_bitfit(buffer, bi->bi_len, rbm->offset, state); + offset = gfs2_bitfit(buffer, bi->bi_bytes, rbm->offset, state); if (offset == BFITNOENT) goto bitmap_full; rbm->offset = offset; @@ -1999,7 +2032,7 @@ static inline int fast_to_acquire(struct gfs2_rgrpd *rgd) * We try our best to find an rgrp that has at least ap->target blocks * available. After a couple of passes (loops == 2), the prospects of finding * such an rgrp diminish. At this stage, we return the first rgrp that has - * atleast ap->min_target blocks available. Either way, we set ap->allowed to + * at least ap->min_target blocks available. Either way, we set ap->allowed to * the number of blocks available in the chosen rgrp. * * Returns: 0 on success, @@ -2053,7 +2086,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) } error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl, LM_ST_EXCLUSIVE, flags, - &rs->rs_rgd_gh); + &ip->i_rgd_gh); if (unlikely(error)) return error; if (!gfs2_rs_active(rs) && (loops < 2) && @@ -2062,13 +2095,13 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) if (sdp->sd_args.ar_rgrplvb) { error = update_rgrp_lvb(rs->rs_rbm.rgd); if (unlikely(error)) { - gfs2_glock_dq_uninit(&rs->rs_rgd_gh); + gfs2_glock_dq_uninit(&ip->i_rgd_gh); return error; } } } - /* Skip unuseable resource groups */ + /* Skip unusable resource groups */ if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) || (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt)) @@ -2105,7 +2138,7 @@ skip_rgrp: /* Unlock rgrp if required */ if (!rg_locked) - gfs2_glock_dq_uninit(&rs->rs_rgd_gh); + gfs2_glock_dq_uninit(&ip->i_rgd_gh); next_rgrp: /* Find the next rgrp, and continue looking */ if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin)) @@ -2142,10 +2175,8 @@ next_rgrp: void gfs2_inplace_release(struct gfs2_inode *ip) { - struct gfs2_blkreserv *rs = &ip->i_res; - - if (gfs2_holder_initialized(&rs->rs_rgd_gh)) - gfs2_glock_dq_uninit(&rs->rs_rgd_gh); + if (gfs2_holder_initialized(&ip->i_rgd_gh)) + gfs2_glock_dq_uninit(&ip->i_rgd_gh); } /** @@ -2184,27 +2215,21 @@ static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode, /** * rgblk_free - Change alloc state of given block(s) * @sdp: the filesystem + * @rgd: the resource group the blocks are in * @bstart: the start of a run of blocks to free * @blen: the length of the block run (all must lie within ONE RG!) * @new_state: GFS2_BLKST_XXX the after-allocation block state - * - * Returns: Resource group containing the block(s) */ -static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, - u32 blen, unsigned char new_state) +static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd, + u64 bstart, u32 blen, unsigned char new_state) { struct gfs2_rbm rbm; struct gfs2_bitmap *bi, *bi_prev = NULL; - rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1); - if (!rbm.rgd) { - if (gfs2_consist(sdp)) - fs_err(sdp, "block = %llu\n", (unsigned long long)bstart); - return NULL; - } - - gfs2_rbm_from_block(&rbm, bstart); + rbm.rgd = rgd; + if (WARN_ON_ONCE(gfs2_rbm_from_block(&rbm, bstart))) + return; while (blen--) { bi = rbm_bi(&rbm); if (bi != bi_prev) { @@ -2213,7 +2238,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, GFP_NOFS | __GFP_NOFAIL); memcpy(bi->bi_clone + bi->bi_offset, bi->bi_bh->b_data + bi->bi_offset, - bi->bi_len); + bi->bi_bytes); } gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh); bi_prev = bi; @@ -2221,8 +2246,6 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, gfs2_setbit(&rbm, false, new_state); gfs2_rbm_incr(&rbm); } - - return rbm.rgd; } /** @@ -2244,6 +2267,14 @@ void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) (unsigned long long)rgd->rd_addr, rgd->rd_flags, rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes, rgd->rd_reserved, rgd->rd_extfail_pt); + if (rgd->rd_sbd->sd_args.ar_rgrplvb) { + struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl; + + gfs2_print_dbg(seq, " L: f:%02x b:%u i:%u\n", + be32_to_cpu(rgl->rl_flags), + be32_to_cpu(rgl->rl_free), + be32_to_cpu(rgl->rl_dinodes)); + } spin_lock(&rgd->rd_rsspin); for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) { trs = rb_entry(n, struct gfs2_blkreserv, rs_node); @@ -2295,7 +2326,7 @@ static void gfs2_adjust_reservation(struct gfs2_inode *ip, goto out; /* We used up our block reservation, so we should reserve more blocks next time. */ - atomic_add(RGRP_RSRV_ADDBLKS, &rs->rs_sizehint); + atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint); } __rs_deltree(rs); } @@ -2329,7 +2360,10 @@ static void gfs2_set_alloc_start(struct gfs2_rbm *rbm, else goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0; - gfs2_rbm_from_block(rbm, goal); + if (WARN_ON_ONCE(gfs2_rbm_from_block(rbm, goal))) { + rbm->bii = 0; + rbm->offset = 0; + } } /** @@ -2392,7 +2426,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, } } if (rbm.rgd->rd_free < *nblocks) { - pr_warn("nblocks=%u\n", *nblocks); + fs_warn(sdp, "nblocks=%u\n", *nblocks); goto rgrp_error; } @@ -2427,20 +2461,19 @@ rgrp_error: /** * __gfs2_free_blocks - free a contiguous run of block(s) * @ip: the inode these blocks are being freed from + * @rgd: the resource group the blocks are in * @bstart: first block of a run of contiguous blocks * @blen: the length of the block run * @meta: 1 if the blocks represent metadata * */ -void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) +void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, + u64 bstart, u32 blen, int meta) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_rgrpd *rgd; - rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); - if (!rgd) - return; + rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE); trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE); rgd->rd_free += blen; rgd->rd_flags &= ~GFS2_RGF_TRIMMED; @@ -2455,16 +2488,18 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) /** * gfs2_free_meta - free a contiguous run of data block(s) * @ip: the inode these blocks are being freed from + * @rgd: the resource group the blocks are in * @bstart: first block of a run of contiguous blocks * @blen: the length of the block run * */ -void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) +void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, + u64 bstart, u32 blen) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - __gfs2_free_blocks(ip, bstart, blen, 1); + __gfs2_free_blocks(ip, rgd, bstart, blen, 1); gfs2_statfs_change(sdp, 0, +blen, 0); gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); } @@ -2476,9 +2511,10 @@ void gfs2_unlink_di(struct inode *inode) struct gfs2_rgrpd *rgd; u64 blkno = ip->i_no_addr; - rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED); + rgd = gfs2_blk2rgrpd(sdp, blkno, true); if (!rgd) return; + rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED); trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED); gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); @@ -2488,13 +2524,8 @@ void gfs2_unlink_di(struct inode *inode) void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) { struct gfs2_sbd *sdp = rgd->rd_sbd; - struct gfs2_rgrpd *tmp_rgd; - - tmp_rgd = rgblk_free(sdp, ip->i_no_addr, 1, GFS2_BLKST_FREE); - if (!tmp_rgd) - return; - gfs2_assert_withdraw(sdp, rgd == tmp_rgd); + rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE); if (!rgd->rd_dinodes) gfs2_consist_rgrpd(rgd); rgd->rd_dinodes--; @@ -2538,7 +2569,8 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) rbm.rgd = rgd; error = gfs2_rbm_from_block(&rbm, no_addr); - WARN_ON_ONCE(error != 0); + if (WARN_ON_ONCE(error)) + goto fail; if (gfs2_testbit(&rbm, false) != type) error = -ESTALE; @@ -2624,13 +2656,12 @@ void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate * and initialize an array of glock holders for them * @rlist: the list of resource groups - * @state: the lock state to acquire the RG lock in * * FIXME: Don't use NOFAIL * */ -void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state) +void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist) { unsigned int x; @@ -2639,7 +2670,7 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state) GFP_NOFS | __GFP_NOFAIL); for (x = 0; x < rlist->rl_rgrps; x++) gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, - state, 0, + LM_ST_EXCLUSIVE, 0, &rlist->rl_ghs[x]); } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index e90478e2f545..b596c3d17988 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -18,8 +18,7 @@ * By reserving 32 blocks at a time, we can optimize / shortcut how we search * through the bitmaps by looking a word at a time. */ -#define RGRP_RSRV_MINBYTES 8 -#define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY)) +#define RGRP_RSRV_MINBLKS 32 #define RGRP_RSRV_ADDBLKS 64 struct gfs2_rgrpd; @@ -52,8 +51,10 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, extern int gfs2_rsqa_alloc(struct gfs2_inode *ip); extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); extern void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount); -extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta); -extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, + u64 bstart, u32 blen, int meta); +extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, + u64 bstart, u32 blen); extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); extern void gfs2_unlink_di(struct inode *inode); extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, @@ -68,7 +69,7 @@ struct gfs2_rgrp_list { extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, u64 block); -extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); +extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist); extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index c212893534ed..ca71163ff7cf 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -854,10 +854,10 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) return error; + flush_workqueue(gfs2_delete_workqueue); kthread_stop(sdp->sd_quotad_process); kthread_stop(sdp->sd_logd_process); - flush_workqueue(gfs2_delete_workqueue); gfs2_quota_sync(sdp->sd_vfs, 0); gfs2_statfs_sync(sdp->sd_vfs, 0); @@ -971,7 +971,7 @@ void gfs2_freeze_func(struct work_struct *work) error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0, &freeze_gh); if (error) { - printk(KERN_INFO "GFS2: couln't get freeze lock : %d\n", error); + printk(KERN_INFO "GFS2: couldn't get freeze lock : %d\n", error); gfs2_assert_withdraw(sdp, 0); } else { diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 064c9a0ef046..423bc2d03dd8 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -74,13 +74,13 @@ fail: return error; } -static void gfs2_print_trans(const struct gfs2_trans *tr) +static void gfs2_print_trans(struct gfs2_sbd *sdp, const struct gfs2_trans *tr) { - pr_warn("Transaction created at: %pSR\n", (void *)tr->tr_ip); - pr_warn("blocks=%u revokes=%u reserved=%u touched=%u\n", + fs_warn(sdp, "Transaction created at: %pSR\n", (void *)tr->tr_ip); + fs_warn(sdp, "blocks=%u revokes=%u reserved=%u touched=%u\n", tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, test_bit(TR_TOUCHED, &tr->tr_flags)); - pr_warn("Buf %u/%u Databuf %u/%u Revoke %u/%u\n", + fs_warn(sdp, "Buf %u/%u Databuf %u/%u Revoke %u/%u\n", tr->tr_num_buf_new, tr->tr_num_buf_rm, tr->tr_num_databuf_new, tr->tr_num_databuf_rm, tr->tr_num_revoke, tr->tr_num_revoke_rm); @@ -109,7 +109,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) if (gfs2_assert_withdraw(sdp, (nbuf <= tr->tr_blocks) && (tr->tr_num_revoke <= tr->tr_revokes))) - gfs2_print_trans(tr); + gfs2_print_trans(sdp, tr); gfs2_log_commit(sdp, tr); if (alloced && !test_bit(TR_ATTACHED, &tr->tr_flags)) @@ -225,12 +225,13 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { - pr_err("Attempting to add uninitialised block to journal (inplace block=%lld)\n", + fs_err(sdp, "Attempting to add uninitialised block to " + "journal (inplace block=%lld)\n", (unsigned long long)bd->bd_bh->b_blocknr); BUG(); } if (unlikely(state == SFS_FROZEN)) { - printk(KERN_INFO "GFS2:adding buf while frozen\n"); + fs_info(sdp, "GFS2:adding buf while frozen\n"); gfs2_assert_withdraw(sdp, 0); } gfs2_pin(sdp, bd->bd_bh); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 59c811de0dc7..0a814ccac41d 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -19,6 +19,7 @@ #include "gfs2.h" #include "incore.h" #include "glock.h" +#include "rgrp.h" #include "util.h" struct kmem_cache *gfs2_glock_cachep __read_mostly; @@ -181,6 +182,8 @@ int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide, { struct gfs2_sbd *sdp = rgd->rd_sbd; int rv; + + gfs2_rgrp_dump(NULL, rgd->rd_gl); rv = gfs2_lm_withdraw(sdp, "fatal: filesystem consistency error\n" " RG = %llu\n" @@ -256,12 +259,13 @@ void gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, const char *function, char *file, unsigned int line, bool withdraw) { - fs_err(sdp, - "fatal: I/O error\n" - " block = %llu\n" - " function = %s, file = %s, line = %u\n", - (unsigned long long)bh->b_blocknr, - function, file, line); + if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + fs_err(sdp, + "fatal: I/O error\n" + " block = %llu\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)bh->b_blocknr, + function, file, line); if (withdraw) gfs2_lm_withdraw(sdp, NULL); } diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 96ac4aba4738..9278fecba632 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -86,7 +86,7 @@ static inline int gfs2_meta_check(struct gfs2_sbd *sdp, struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; u32 magic = be32_to_cpu(mh->mh_magic); if (unlikely(magic != GFS2_MAGIC)) { - pr_err("Magic number missing at %llu\n", + fs_err(sdp, "Magic number missing at %llu\n", (unsigned long long)bh->b_blocknr); return -EIO; } diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 38515988aaf7..996c915a9c97 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -283,7 +283,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, blen++; else { if (bstart) - gfs2_free_meta(ip, bstart, blen); + gfs2_free_meta(ip, rgd, bstart, blen); bstart = bn; blen = 1; } @@ -292,7 +292,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, gfs2_add_inode_blocks(&ip->i_inode, -1); } if (bstart) - gfs2_free_meta(ip, bstart, blen); + gfs2_free_meta(ip, rgd, bstart, blen); if (prev && !leave) { u32 len; @@ -1250,6 +1250,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrp_list rlist; + struct gfs2_rgrpd *rgd; struct buffer_head *indbh, *dibh; __be64 *eablk, *end; unsigned int rg_blocks = 0; @@ -1299,11 +1300,10 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) else goto out; - gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); + gfs2_rlist_alloc(&rlist); for (x = 0; x < rlist.rl_rgrps; x++) { - struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl); - + rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl); rg_blocks += rgd->rd_length; } @@ -1320,6 +1320,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header)); bstart = 0; + rgd = NULL; blen = 0; for (; eablk < end; eablk++) { @@ -1333,8 +1334,9 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) blen++; else { if (bstart) - gfs2_free_meta(ip, bstart, blen); + gfs2_free_meta(ip, rgd, bstart, blen); bstart = bn; + rgd = gfs2_blk2rgrpd(sdp, bstart, true); blen = 1; } @@ -1342,7 +1344,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) gfs2_add_inode_blocks(&ip->i_inode, -1); } if (bstart) - gfs2_free_meta(ip, bstart, blen); + gfs2_free_meta(ip, rgd, bstart, blen); ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT; @@ -1391,7 +1393,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip) if (error) goto out_gunlock; - gfs2_free_meta(ip, ip->i_eattr, 1); + gfs2_free_meta(ip, rgd, ip->i_eattr, 1); ip->i_eattr = 0; gfs2_add_inode_blocks(&ip->i_inode, -1); diff --git a/fs/ioctl.c b/fs/ioctl.c index 3212c29235ce..2005529af560 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -230,7 +230,7 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, ret = -EXDEV; if (src_file.file->f_path.mnt != dst_file->f_path.mnt) goto fdput; - ret = do_clone_file_range(src_file.file, off, dst_file, destoff, olen); + ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen); fdput: fdput(src_file); return ret; diff --git a/fs/iomap.c b/fs/iomap.c index 74762b1ec233..ec15cf2ec696 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1051,6 +1051,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, } else { WARN_ON_ONCE(!PageUptodate(page)); iomap_page_create(inode, page); + set_page_dirty(page); } return length; @@ -1090,7 +1091,6 @@ int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) length -= ret; } - set_page_dirty(page); wait_for_stable_page(page); return VM_FAULT_LOCKED; out_unlock: diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index c125d662777c..26f8d7e46462 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -251,8 +251,8 @@ restart: bh = jh2bh(jh); if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); get_bh(bh); + spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); @@ -333,8 +333,8 @@ restart2: jh = transaction->t_checkpoint_io_list; bh = jh2bh(jh); if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); get_bh(bh); + spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index 453a6a1fff34..2b4d5013dc5d 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -125,7 +125,7 @@ static int jffs2_garbage_collect_thread(void *_c) if (try_to_freeze()) goto again; - signr = kernel_dequeue_signal(NULL); + signr = kernel_dequeue_signal(); switch(signr) { case SIGSTOP: diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 87bdf0f4cba1..902a7dd10e5c 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -285,10 +285,8 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = c; ret = jffs2_parse_options(c, data); - if (ret) { - kfree(c); + if (ret) return -EINVAL; - } /* Initialize JFFS2 superblock locks, the further initialization will * be done later */ diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 2e71b6e7e646..8c06a6ea862d 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -146,12 +146,16 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) if (default_acl) { rc = __jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, default_acl); posix_acl_release(default_acl); + } else { + inode->i_default_acl = NULL; } if (acl) { if (!rc) rc = __jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); posix_acl_release(acl); + } else { + inode->i_acl = NULL; } JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 054cc761b426..805ae9e8944a 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -166,7 +166,6 @@ void jfs_evict_inode(struct inode *inode) /* * Free the inode from the quota allocation. */ - dquot_initialize(inode); dquot_free_inode(inode); } } else { diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 09da5cf14e27..65d8fc87ab11 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -247,7 +247,7 @@ static const match_table_t tokens = { {Opt_resize_nosize, "resize"}, {Opt_errors, "errors=%s"}, {Opt_ignore, "noquota"}, - {Opt_ignore, "quota"}, + {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, {Opt_grpquota, "grpquota"}, {Opt_uid, "uid=%u"}, diff --git a/fs/namespace.c b/fs/namespace.c index 99186556f8d3..d86830c86ce8 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2642,6 +2642,7 @@ static long exact_copy_from_user(void *to, const void __user * from, if (!access_ok(VERIFY_READ, from, n)) return n; + current->kernel_uaccess_faults_ok++; while (n) { if (__get_user(c, f)) { memset(t, 0, n); @@ -2651,6 +2652,7 @@ static long exact_copy_from_user(void *to, const void __user * from, f++; n--; } + current->kernel_uaccess_faults_ok--; return n; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 55a099e47ba2..b53e76391e52 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -541,7 +541,8 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, u64 dst_pos, u64 count) { - return nfserrno(do_clone_file_range(src, src_pos, dst, dst_pos, count)); + return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos, + count)); } ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index d9ebe11c8990..1d098c3c00e0 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -342,6 +342,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, * for this bh as it's not marked locally * uptodate. */ status = -EIO; + clear_buffer_needs_validate(bh); put_bh(bh); bhs[i] = NULL; continue; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index aaca0949fe53..826f0567ec43 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -584,9 +584,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, res->last_used = 0; - spin_lock(&dlm->spinlock); + spin_lock(&dlm->track_lock); list_add_tail(&res->tracking, &dlm->tracking_list); - spin_unlock(&dlm->spinlock); + spin_unlock(&dlm->track_lock); memset(res->lvb, 0, DLM_LVB_LEN); memset(res->refmap, 0, sizeof(res->refmap)); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 8e712b614e6e..933aac5da193 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -96,7 +96,9 @@ struct ocfs2_unblock_ctl { }; /* Lockdep class keys */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES]; +#endif static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, int new_level); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7869622af22a..7a5ee145c733 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2946,6 +2946,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, if (map_end & (PAGE_SIZE - 1)) to = map_end & (PAGE_SIZE - 1); +retry: page = find_or_create_page(mapping, page_index, GFP_NOFS); if (!page) { ret = -ENOMEM; @@ -2954,11 +2955,18 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, } /* - * In case PAGE_SIZE <= CLUSTER_SIZE, This page - * can't be dirtied before we CoW it out. + * In case PAGE_SIZE <= CLUSTER_SIZE, we do not expect a dirty + * page, so write it back. */ - if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) - BUG_ON(PageDirty(page)); + if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) { + if (PageDirty(page)) { + /* + * write_on_page will unlock the page on return + */ + ret = write_one_page(page); + goto retry; + } + } if (!PageUptodate(page)) { ret = block_read_full_page(page, ocfs2_get_block); diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 10587413b20e..72d2ff17d27b 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -167,12 +167,16 @@ int orangefs_init_acl(struct inode *inode, struct inode *dir) error = __orangefs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); posix_acl_release(default_acl); + } else { + inode->i_default_acl = NULL; } if (acl) { if (!error) error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); + } else { + inode->i_acl = NULL; } /* If mode of the inode was changed, then do a forcible ->setattr */ diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 31932879b716..5e65d818937b 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -405,7 +405,11 @@ struct inode *orangefs_iget(struct super_block *sb, orangefs_test_inode, orangefs_set_inode, ref); - if (!inode || !(inode->i_state & I_NEW)) + + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!(inode->i_state & I_NEW)) return inode; error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL); @@ -448,7 +452,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, inode = new_inode(sb); if (!inode) - return NULL; + return ERR_PTR(-ENOMEM); orangefs_set_inode(inode, ref); inode->i_ino = hash; /* needed for stat etc */ diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 625b0580f9be..c8676c996249 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -58,7 +58,6 @@ static int orangefs_create(struct inode *dir, goto out; ref = new_op->downcall.resp.create.refn; - op_release(new_op); inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0, &ref); if (IS_ERR(inode)) { @@ -92,6 +91,7 @@ static int orangefs_create(struct inode *dir, mark_inode_dirty_sync(dir); ret = 0; out: + op_release(new_op); gossip_debug(GOSSIP_NAME_DEBUG, "%s: %pd: returning %d\n", __func__, @@ -157,7 +157,7 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, new_op->downcall.resp.lookup.refn.fs_id, ret); - if (ret >= 0) { + if (ret == 0) { orangefs_set_timeout(dentry); inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn); } else if (ret == -ENOENT) { @@ -269,7 +269,6 @@ static int orangefs_symlink(struct inode *dir, } ref = new_op->downcall.resp.sym.refn; - op_release(new_op); inode = orangefs_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0, &ref); if (IS_ERR(inode)) { @@ -307,6 +306,7 @@ static int orangefs_symlink(struct inode *dir, mark_inode_dirty_sync(dir); ret = 0; out: + op_release(new_op); return ret; } @@ -346,7 +346,6 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode } ref = new_op->downcall.resp.mkdir.refn; - op_release(new_op); inode = orangefs_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0, &ref); if (IS_ERR(inode)) { @@ -379,6 +378,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode orangefs_inode_setattr(dir, &iattr); mark_inode_dirty_sync(dir); out: + op_release(new_op); return ret; } diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c index dd28079f518c..19739aaee675 100644 --- a/fs/orangefs/orangefs-sysfs.c +++ b/fs/orangefs/orangefs-sysfs.c @@ -323,7 +323,7 @@ static ssize_t sysfs_service_op_show(struct kobject *kobj, /* Can't do a service_operation if the client is not running... */ rc = is_daemon_in_service(); if (rc) { - pr_info("%s: Client not running :%d:\n", + pr_info_ratelimited("%s: Client not running :%d:\n", __func__, is_daemon_in_service()); goto out; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 296037afecdb..1cc797a08a5b 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -141,7 +141,7 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) } /* Try to use clone_file_range to clone up within the same fs */ - error = vfs_clone_file_range(old_file, 0, new_file, 0, len); + error = do_clone_file_range(old_file, 0, new_file, 0, len); if (!error) goto out; /* Couldn't clone, so now we try to copy the data */ diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index aeaefd2a551b..986313da0c88 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -240,8 +240,10 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) goto out_unlock; old_cred = ovl_override_creds(file_inode(file)->i_sb); + file_start_write(real.file); ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, ovl_iocb_to_rwf(iocb)); + file_end_write(real.file); revert_creds(old_cred); /* Update size */ diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index b6ac545b5a32..3b7ed5d2279c 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -504,7 +504,7 @@ static const struct inode_operations ovl_special_inode_operations = { .update_time = ovl_update_time, }; -const struct address_space_operations ovl_aops = { +static const struct address_space_operations ovl_aops = { /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */ .direct_IO = noop_direct_IO, }; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index f28711846dd6..9c0ca6a7becf 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -686,7 +686,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, index = NULL; goto out; } - pr_warn_ratelimited("overlayfs: failed inode index lookup (ino=%lu, key=%*s, err=%i);\n" + pr_warn_ratelimited("overlayfs: failed inode index lookup (ino=%lu, key=%.*s, err=%i);\n" "overlayfs: mount with '-o index=off' to disable inodes index.\n", d_inode(origin)->i_ino, name.len, name.name, err); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index f61839e1054c..a3c0d9584312 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -152,8 +152,8 @@ static inline int ovl_do_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { int err = vfs_setxattr(dentry, name, value, size, flags); - pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n", - dentry, name, (int) size, (char *) value, flags, err); + pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0x%x) = %i\n", + dentry, name, min((int)size, 48), value, size, flags, err); return err; } diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 8cfb62cc8672..ace4fe4c39a9 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -683,7 +683,7 @@ static void ovl_cleanup_index(struct dentry *dentry) struct dentry *upperdentry = ovl_dentry_upper(dentry); struct dentry *index = NULL; struct inode *inode; - struct qstr name; + struct qstr name = { }; int err; err = ovl_get_index_name(lowerdentry, &name); @@ -726,6 +726,7 @@ static void ovl_cleanup_index(struct dentry *dentry) goto fail; out: + kfree(name.name); dput(index); return; diff --git a/fs/proc/base.c b/fs/proc/base.c index ccf86f16d9f0..7e9f07bf260d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -407,6 +407,20 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, unsigned long *entries; int err; + /* + * The ability to racily run the kernel stack unwinder on a running task + * and then observe the unwinder output is scary; while it is useful for + * debugging kernel issues, it can also allow an attacker to leak kernel + * stack contents. + * Doing this in a manner that is at least safe from races would require + * some work to ensure that the remote task can not be scheduled; and + * even then, this would still expose the unwinder as local attack + * surface. + * Therefore, this interface is restricted to root. + */ + if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) + return -EACCES; + entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries), GFP_KERNEL); if (!entries) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index ad72261ee3fe..d297fe4472a9 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -464,6 +464,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) ret = -EFAULT; goto out; } + m = NULL; /* skip the list anchor */ } else if (m->type == KCORE_VMALLOC) { vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index cbde728f8ac6..91ae16fbd7d5 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -24,6 +24,8 @@ #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/uaccess.h> +#include <linux/mem_encrypt.h> +#include <asm/pgtable.h> #include <asm/io.h> #include "internal.h" @@ -98,7 +100,8 @@ static int pfn_is_ram(unsigned long pfn) /* Reads a page from the oldmem device from given offset. */ static ssize_t read_from_oldmem(char *buf, size_t count, - u64 *ppos, int userbuf) + u64 *ppos, int userbuf, + bool encrypted) { unsigned long pfn, offset; size_t nr_bytes; @@ -120,8 +123,15 @@ static ssize_t read_from_oldmem(char *buf, size_t count, if (pfn_is_ram(pfn) == 0) memset(buf, 0, nr_bytes); else { - tmp = copy_oldmem_page(pfn, buf, nr_bytes, - offset, userbuf); + if (encrypted) + tmp = copy_oldmem_page_encrypted(pfn, buf, + nr_bytes, + offset, + userbuf); + else + tmp = copy_oldmem_page(pfn, buf, nr_bytes, + offset, userbuf); + if (tmp < 0) return tmp; } @@ -155,7 +165,7 @@ void __weak elfcorehdr_free(unsigned long long addr) */ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0); + return read_from_oldmem(buf, count, ppos, 0, false); } /* @@ -163,7 +173,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) */ ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0); + return read_from_oldmem(buf, count, ppos, 0, sme_active()); } /* @@ -173,10 +183,21 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot) { + prot = pgprot_encrypted(prot); return remap_pfn_range(vma, from, pfn, size, prot); } /* + * Architectures which support memory encryption override this. + */ +ssize_t __weak +copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf) +{ + return copy_oldmem_page(pfn, buf, csize, offset, userbuf); +} + +/* * Copy to either kernel or user space */ static int copy_to(void *target, void *src, size_t size, int userbuf) @@ -351,7 +372,8 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, m->offset + m->size - *fpos, buflen); start = m->paddr + *fpos - m->offset; - tmp = read_from_oldmem(buffer, tsz, &start, userbuf); + tmp = read_from_oldmem(buffer, tsz, &start, + userbuf, sme_active()); if (tmp < 0) return tmp; buflen -= tsz; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 5fcb845b9fec..8cf2218b46a7 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -482,12 +482,10 @@ static struct file_system_type pstore_fs_type = { .kill_sb = pstore_kill_sb, }; -static int __init init_pstore_fs(void) +int __init pstore_init_fs(void) { int err; - pstore_choose_compression(); - /* Create a convenient mount point for people to access pstore */ err = sysfs_create_mount_point(fs_kobj, "pstore"); if (err) @@ -500,14 +498,9 @@ static int __init init_pstore_fs(void) out: return err; } -module_init(init_pstore_fs) -static void __exit exit_pstore_fs(void) +void __exit pstore_exit_fs(void) { unregister_filesystem(&pstore_fs_type); sysfs_remove_mount_point(fs_kobj, "pstore"); } -module_exit(exit_pstore_fs) - -MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>"); -MODULE_LICENSE("GPL"); diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index fb767e28aeb2..7062ea4bc57c 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -37,7 +37,8 @@ extern bool pstore_is_mounted(void); extern void pstore_record_init(struct pstore_record *record, struct pstore_info *psi); -/* Called during module_init() */ -extern void __init pstore_choose_compression(void); +/* Called during pstore init/exit. */ +int __init pstore_init_fs(void); +void __exit pstore_exit_fs(void); #endif diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 15e99d5a681d..b821054ca3ed 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -274,36 +274,56 @@ static int pstore_decompress(void *in, void *out, static void allocate_buf_for_compression(void) { + struct crypto_comp *ctx; + int size; + char *buf; + + /* Skip if not built-in or compression backend not selected yet. */ if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !zbackend) return; + /* Skip if no pstore backend yet or compression init already done. */ + if (!psinfo || tfm) + return; + if (!crypto_has_comp(zbackend->name, 0, 0)) { - pr_err("No %s compression\n", zbackend->name); + pr_err("Unknown compression: %s\n", zbackend->name); return; } - big_oops_buf_sz = zbackend->zbufsize(psinfo->bufsize); - if (big_oops_buf_sz <= 0) + size = zbackend->zbufsize(psinfo->bufsize); + if (size <= 0) { + pr_err("Invalid compression size for %s: %d\n", + zbackend->name, size); return; + } - big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL); - if (!big_oops_buf) { - pr_err("allocate compression buffer error!\n"); + buf = kmalloc(size, GFP_KERNEL); + if (!buf) { + pr_err("Failed %d byte compression buffer allocation for: %s\n", + size, zbackend->name); return; } - tfm = crypto_alloc_comp(zbackend->name, 0, 0); - if (IS_ERR_OR_NULL(tfm)) { - kfree(big_oops_buf); - big_oops_buf = NULL; - pr_err("crypto_alloc_comp() failed!\n"); + ctx = crypto_alloc_comp(zbackend->name, 0, 0); + if (IS_ERR_OR_NULL(ctx)) { + kfree(buf); + pr_err("crypto_alloc_comp('%s') failed: %ld\n", zbackend->name, + PTR_ERR(ctx)); return; } + + /* A non-NULL big_oops_buf indicates compression is available. */ + tfm = ctx; + big_oops_buf_sz = size; + big_oops_buf = buf; + + pr_info("Using compression: %s\n", zbackend->name); } static void free_buf_for_compression(void) { - if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && !IS_ERR_OR_NULL(tfm)) + if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) crypto_free_comp(tfm); kfree(big_oops_buf); big_oops_buf = NULL; @@ -774,14 +794,43 @@ void __init pstore_choose_compression(void) for (step = zbackends; step->name; step++) { if (!strcmp(compress, step->name)) { zbackend = step; - pr_info("using %s compression\n", zbackend->name); return; } } } +static int __init pstore_init(void) +{ + int ret; + + pstore_choose_compression(); + + /* + * Check if any pstore backends registered earlier but did not + * initialize compression because crypto was not ready. If so, + * initialize compression now. + */ + allocate_buf_for_compression(); + + ret = pstore_init_fs(); + if (ret) + return ret; + + return 0; +} +late_initcall(pstore_init); + +static void __exit pstore_exit(void) +{ + pstore_exit_fs(); +} +module_exit(pstore_exit) + module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "Pstore compression to use"); module_param(backend, charp, 0444); MODULE_PARM_DESC(backend, "Pstore backend to use"); + +MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>"); +MODULE_LICENSE("GPL"); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index bbd1e357c23d..ffcff6516e89 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -587,9 +587,16 @@ static int ramoops_init_przs(const char *name, goto fail; for (i = 0; i < *cnt; i++) { + char *label; + + if (*cnt == 1) + label = kasprintf(GFP_KERNEL, "ramoops:%s", name); + else + label = kasprintf(GFP_KERNEL, "ramoops:%s(%d/%d)", + name, i, *cnt - 1); prz_ar[i] = persistent_ram_new(*paddr, zone_sz, sig, - &cxt->ecc_info, - cxt->memtype, flags); + &cxt->ecc_info, + cxt->memtype, flags, label); if (IS_ERR(prz_ar[i])) { err = PTR_ERR(prz_ar[i]); dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n", @@ -619,6 +626,8 @@ static int ramoops_init_prz(const char *name, struct persistent_ram_zone **prz, phys_addr_t *paddr, size_t sz, u32 sig) { + char *label; + if (!sz) return 0; @@ -629,8 +638,9 @@ static int ramoops_init_prz(const char *name, return -ENOMEM; } + label = kasprintf(GFP_KERNEL, "ramoops:%s", name); *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, - cxt->memtype, 0); + cxt->memtype, 0, label); if (IS_ERR(*prz)) { int err = PTR_ERR(*prz); @@ -898,8 +908,22 @@ static struct platform_driver ramoops_driver = { }, }; -static void ramoops_register_dummy(void) +static inline void ramoops_unregister_dummy(void) +{ + platform_device_unregister(dummy); + dummy = NULL; + + kfree(dummy_data); + dummy_data = NULL; +} + +static void __init ramoops_register_dummy(void) { + /* + * Prepare a dummy platform data structure to carry the module + * parameters. If mem_size isn't set, then there are no module + * parameters, and we can skip this. + */ if (!mem_size) return; @@ -932,21 +956,28 @@ static void ramoops_register_dummy(void) if (IS_ERR(dummy)) { pr_info("could not create platform device: %ld\n", PTR_ERR(dummy)); + dummy = NULL; + ramoops_unregister_dummy(); } } static int __init ramoops_init(void) { + int ret; + ramoops_register_dummy(); - return platform_driver_register(&ramoops_driver); + ret = platform_driver_register(&ramoops_driver); + if (ret != 0) + ramoops_unregister_dummy(); + + return ret; } -late_initcall(ramoops_init); +postcore_initcall(ramoops_init); static void __exit ramoops_exit(void) { platform_driver_unregister(&ramoops_driver); - platform_device_unregister(dummy); - kfree(dummy_data); + ramoops_unregister_dummy(); } module_exit(ramoops_exit); diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 0792595ebcfb..12e21f789194 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -438,11 +438,11 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size, } static void *persistent_ram_iomap(phys_addr_t start, size_t size, - unsigned int memtype) + unsigned int memtype, char *label) { void *va; - if (!request_mem_region(start, size, "persistent_ram")) { + if (!request_mem_region(start, size, label ?: "ramoops")) { pr_err("request mem region (0x%llx@0x%llx) failed\n", (unsigned long long)size, (unsigned long long)start); return NULL; @@ -470,7 +470,8 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, if (pfn_valid(start >> PAGE_SHIFT)) prz->vaddr = persistent_ram_vmap(start, size, memtype); else - prz->vaddr = persistent_ram_iomap(start, size, memtype); + prz->vaddr = persistent_ram_iomap(start, size, memtype, + prz->label); if (!prz->vaddr) { pr_err("%s: Failed to map 0x%llx pages at 0x%llx\n", __func__, @@ -541,12 +542,13 @@ void persistent_ram_free(struct persistent_ram_zone *prz) prz->ecc_info.par = NULL; persistent_ram_free_old(prz); + kfree(prz->label); kfree(prz); } struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, u32 sig, struct persistent_ram_ecc_info *ecc_info, - unsigned int memtype, u32 flags) + unsigned int memtype, u32 flags, char *label) { struct persistent_ram_zone *prz; int ret = -ENOMEM; @@ -560,6 +562,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, /* Initialize general buffer state. */ raw_spin_lock_init(&prz->buffer_lock); prz->flags = flags; + prz->label = label; ret = persistent_ram_buffer_map(start, size, prz, memtype); if (ret) diff --git a/fs/read_write.c b/fs/read_write.c index 39b4a21dd933..8a2737f0d61d 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1818,8 +1818,8 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, } EXPORT_SYMBOL(vfs_clone_file_prep_inodes); -int vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len) +int do_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -1866,6 +1866,19 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, return ret; } +EXPORT_SYMBOL(do_clone_file_range); + +int vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len) +{ + int ret; + + file_start_write(file_out); + ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len); + file_end_write(file_out); + + return ret; +} EXPORT_SYMBOL(vfs_clone_file_range); /* diff --git a/fs/signalfd.c b/fs/signalfd.c index 4fcd1498acf5..757afc7c5895 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -79,7 +79,7 @@ static __poll_t signalfd_poll(struct file *file, poll_table *wait) * Copied from copy_siginfo_to_user() in kernel/signal.c */ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, - siginfo_t const *kinfo) + kernel_siginfo_t const *kinfo) { struct signalfd_siginfo new; @@ -163,7 +163,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, return sizeof(*uinfo); } -static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, siginfo_t *info, +static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info, int nonblock) { ssize_t ret; @@ -215,7 +215,7 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count, struct signalfd_siginfo __user *siginfo; int nonblock = file->f_flags & O_NONBLOCK; ssize_t ret, total = 0; - siginfo_t info; + kernel_siginfo_t info; count /= sizeof(struct signalfd_siginfo); if (!count) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 23e7042666a7..fec62e9dfbe6 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1912,7 +1912,9 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) mutex_unlock(&c->bu_mutex); } - ubifs_assert(c, c->lst.taken_empty_lebs > 0); + if (!c->need_recovery) + ubifs_assert(c, c->lst.taken_empty_lebs > 0); + return 0; } @@ -1954,6 +1956,9 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode) int dev, vol; char *endptr; + if (!name || !*name) + return ERR_PTR(-EINVAL); + /* First, try to open using the device node path method */ ubi = ubi_open_volume_path(name, mode); if (!IS_ERR(ubi)) @@ -2332,8 +2337,8 @@ late_initcall(ubifs_init); static void __exit ubifs_exit(void) { - WARN_ON(list_empty(&ubifs_infos)); - WARN_ON(atomic_long_read(&ubifs_clean_zn_cnt) == 0); + WARN_ON(!list_empty(&ubifs_infos)); + WARN_ON(atomic_long_read(&ubifs_clean_zn_cnt) != 0); dbg_debugfs_exit(); ubifs_compressors_exit(); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 61afdfee4b28..f5ad1ede7990 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -152,12 +152,6 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, ui->data_len = size; mutex_lock(&host_ui->ui_mutex); - - if (!host->i_nlink) { - err = -ENOENT; - goto out_noent; - } - host->i_ctime = current_time(host); host_ui->xattr_cnt += 1; host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm)); @@ -190,7 +184,6 @@ out_cancel: host_ui->xattr_size -= CALC_XATTR_BYTES(size); host_ui->xattr_names -= fname_len(nm); host_ui->flags &= ~UBIFS_CRYPT_FL; -out_noent: mutex_unlock(&host_ui->ui_mutex); out_free: make_bad_inode(inode); @@ -242,12 +235,6 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, mutex_unlock(&ui->ui_mutex); mutex_lock(&host_ui->ui_mutex); - - if (!host->i_nlink) { - err = -ENOENT; - goto out_noent; - } - host->i_ctime = current_time(host); host_ui->xattr_size -= CALC_XATTR_BYTES(old_size); host_ui->xattr_size += CALC_XATTR_BYTES(size); @@ -269,7 +256,6 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, out_cancel: host_ui->xattr_size -= CALC_XATTR_BYTES(size); host_ui->xattr_size += CALC_XATTR_BYTES(old_size); -out_noent: mutex_unlock(&host_ui->ui_mutex); make_bad_inode(inode); out_free: @@ -496,12 +482,6 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host, return err; mutex_lock(&host_ui->ui_mutex); - - if (!host->i_nlink) { - err = -ENOENT; - goto out_noent; - } - host->i_ctime = current_time(host); host_ui->xattr_cnt -= 1; host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm)); @@ -521,7 +501,6 @@ out_cancel: host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); host_ui->xattr_names += fname_len(nm); -out_noent: mutex_unlock(&host_ui->ui_mutex); ubifs_release_budget(c, &req); make_bad_inode(inode); @@ -561,9 +540,6 @@ static int ubifs_xattr_remove(struct inode *host, const char *name) ubifs_assert(c, inode_is_locked(host)); - if (!host->i_nlink) - return -ENOENT; - if (fname_len(&nm) > UBIFS_MAX_NLEN) return -ENAMETOOLONG; diff --git a/fs/xattr.c b/fs/xattr.c index daa732550088..0d6a6a4af861 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -948,17 +948,19 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, int err = 0; #ifdef CONFIG_FS_POSIX_ACL - if (inode->i_acl) { - err = xattr_list_one(&buffer, &remaining_size, - XATTR_NAME_POSIX_ACL_ACCESS); - if (err) - return err; - } - if (inode->i_default_acl) { - err = xattr_list_one(&buffer, &remaining_size, - XATTR_NAME_POSIX_ACL_DEFAULT); - if (err) - return err; + if (IS_POSIXACL(inode)) { + if (inode->i_acl) { + err = xattr_list_one(&buffer, &remaining_size, + XATTR_NAME_POSIX_ACL_ACCESS); + if (err) + return err; + } + if (inode->i_default_acl) { + err = xattr_list_one(&buffer, &remaining_size, + XATTR_NAME_POSIX_ACL_DEFAULT); + if (err) + return err; + } } #endif diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 1e671d4eb6fa..844ed87b1900 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -191,6 +191,128 @@ xfs_attr_calc_size( return nblks; } +STATIC int +xfs_attr_try_sf_addname( + struct xfs_inode *dp, + struct xfs_da_args *args) +{ + + struct xfs_mount *mp = dp->i_mount; + int error, error2; + + error = xfs_attr_shortform_addname(args); + if (error == -ENOSPC) + return error; + + /* + * Commit the shortform mods, and we're done. + * NOTE: this is also the error path (EEXIST, etc). + */ + if (!error && (args->flags & ATTR_KERNOTIME) == 0) + xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(args->trans); + + error2 = xfs_trans_commit(args->trans); + args->trans = NULL; + return error ? error : error2; +} + +/* + * Set the attribute specified in @args. + */ +int +xfs_attr_set_args( + struct xfs_da_args *args, + struct xfs_buf **leaf_bp) +{ + struct xfs_inode *dp = args->dp; + int error; + + /* + * If the attribute list is non-existent or a shortform list, + * upgrade it to a single-leaf-block attribute list. + */ + if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL || + (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + dp->i_d.di_anextents == 0)) { + + /* + * Build initial attribute list (if required). + */ + if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) + xfs_attr_shortform_create(args); + + /* + * Try to add the attr to the attribute list in the inode. + */ + error = xfs_attr_try_sf_addname(dp, args); + if (error != -ENOSPC) + return error; + + /* + * It won't fit in the shortform, transform to a leaf block. + * GROT: another possible req'mt for a double-split btree op. + */ + error = xfs_attr_shortform_to_leaf(args, leaf_bp); + if (error) + return error; + + /* + * Prevent the leaf buffer from being unlocked so that a + * concurrent AIL push cannot grab the half-baked leaf + * buffer and run into problems with the write verifier. + */ + xfs_trans_bhold(args->trans, *leaf_bp); + + error = xfs_defer_finish(&args->trans); + if (error) + return error; + + /* + * Commit the leaf transformation. We'll need another + * (linked) transaction to add the new attribute to the + * leaf. + */ + error = xfs_trans_roll_inode(&args->trans, dp); + if (error) + return error; + xfs_trans_bjoin(args->trans, *leaf_bp); + *leaf_bp = NULL; + } + + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) + error = xfs_attr_leaf_addname(args); + else + error = xfs_attr_node_addname(args); + return error; +} + +/* + * Remove the attribute specified in @args. + */ +int +xfs_attr_remove_args( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + int error; + + if (!xfs_inode_hasattr(dp)) { + error = -ENOATTR; + } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); + error = xfs_attr_shortform_remove(args); + } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_removename(args); + } else { + error = xfs_attr_node_removename(args); + } + + return error; +} + int xfs_attr_set( struct xfs_inode *dp, @@ -204,7 +326,7 @@ xfs_attr_set( struct xfs_da_args args; struct xfs_trans_res tres; int rsvd = (flags & ATTR_ROOT) != 0; - int error, err2, local; + int error, local; XFS_STATS_INC(mp, xs_attr_set); @@ -255,93 +377,17 @@ xfs_attr_set( error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0, rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : XFS_QMOPT_RES_REGBLKS); - if (error) { - xfs_iunlock(dp, XFS_ILOCK_EXCL); - xfs_trans_cancel(args.trans); - return error; - } + if (error) + goto out_trans_cancel; xfs_trans_ijoin(args.trans, dp, 0); - - /* - * If the attribute list is non-existent or a shortform list, - * upgrade it to a single-leaf-block attribute list. - */ - if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { - - /* - * Build initial attribute list (if required). - */ - if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) - xfs_attr_shortform_create(&args); - - /* - * Try to add the attr to the attribute list in - * the inode. - */ - error = xfs_attr_shortform_addname(&args); - if (error != -ENOSPC) { - /* - * Commit the shortform mods, and we're done. - * NOTE: this is also the error path (EEXIST, etc). - */ - ASSERT(args.trans != NULL); - - /* - * If this is a synchronous mount, make sure that - * the transaction goes to disk before returning - * to the user. - */ - if (mp->m_flags & XFS_MOUNT_WSYNC) - xfs_trans_set_sync(args.trans); - - if (!error && (flags & ATTR_KERNOTIME) == 0) { - xfs_trans_ichgtime(args.trans, dp, - XFS_ICHGTIME_CHG); - } - err2 = xfs_trans_commit(args.trans); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - - return error ? error : err2; - } - - /* - * It won't fit in the shortform, transform to a leaf block. - * GROT: another possible req'mt for a double-split btree op. - */ - error = xfs_attr_shortform_to_leaf(&args, &leaf_bp); - if (error) - goto out; - /* - * Prevent the leaf buffer from being unlocked so that a - * concurrent AIL push cannot grab the half-baked leaf - * buffer and run into problems with the write verifier. - */ - xfs_trans_bhold(args.trans, leaf_bp); - error = xfs_defer_finish(&args.trans); - if (error) - goto out; - - /* - * Commit the leaf transformation. We'll need another (linked) - * transaction to add the new attribute to the leaf, which - * means that we have to hold & join the leaf buffer here too. - */ - error = xfs_trans_roll_inode(&args.trans, dp); - if (error) - goto out; - xfs_trans_bjoin(args.trans, leaf_bp); - leaf_bp = NULL; - } - - if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) - error = xfs_attr_leaf_addname(&args); - else - error = xfs_attr_node_addname(&args); + error = xfs_attr_set_args(&args, &leaf_bp); if (error) - goto out; + goto out_release_leaf; + if (!args.trans) { + /* shortform attribute has already been committed */ + goto out_unlock; + } /* * If this is a synchronous mount, make sure that the @@ -358,17 +404,17 @@ xfs_attr_set( */ xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); error = xfs_trans_commit(args.trans); +out_unlock: xfs_iunlock(dp, XFS_ILOCK_EXCL); - return error; -out: +out_release_leaf: if (leaf_bp) xfs_trans_brelse(args.trans, leaf_bp); +out_trans_cancel: if (args.trans) xfs_trans_cancel(args.trans); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - return error; + goto out_unlock; } /* @@ -423,17 +469,7 @@ xfs_attr_remove( */ xfs_trans_ijoin(args.trans, dp, 0); - if (!xfs_inode_hasattr(dp)) { - error = -ENOATTR; - } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); - error = xfs_attr_shortform_remove(&args); - } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { - error = xfs_attr_leaf_removename(&args); - } else { - error = xfs_attr_node_removename(&args); - } - + error = xfs_attr_remove_args(&args); if (error) goto out; @@ -587,7 +623,7 @@ xfs_attr_leaf_addname( */ error = xfs_attr3_leaf_to_node(args); if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; @@ -675,7 +711,7 @@ xfs_attr_leaf_addname( error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; @@ -693,9 +729,6 @@ xfs_attr_leaf_addname( error = xfs_attr3_leaf_clearflag(args); } return error; -out_defer_cancel: - xfs_defer_cancel(args->trans); - return error; } /* @@ -738,15 +771,12 @@ xfs_attr_leaf_removename( error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; } return 0; -out_defer_cancel: - xfs_defer_cancel(args->trans); - return error; } /* @@ -864,7 +894,7 @@ restart: state = NULL; error = xfs_attr3_leaf_to_node(args); if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -888,7 +918,7 @@ restart: */ error = xfs_da3_split(state); if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -984,7 +1014,7 @@ restart: if (retval && (state->path.active > 1)) { error = xfs_da3_join(state); if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -1013,9 +1043,6 @@ out: if (error) return error; return retval; -out_defer_cancel: - xfs_defer_cancel(args->trans); - goto out; } /* @@ -1107,7 +1134,7 @@ xfs_attr_node_removename( if (retval && (state->path.active > 1)) { error = xfs_da3_join(state); if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -1138,7 +1165,7 @@ xfs_attr_node_removename( error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -1150,9 +1177,6 @@ xfs_attr_node_removename( out: xfs_da_state_free(state); return error; -out_defer_cancel: - xfs_defer_cancel(args->trans); - goto out; } /* diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 033ff8c478e2..bdf52a333f3f 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -140,7 +140,9 @@ int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, unsigned char *value, int *valuelenp, int flags); int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, unsigned char *value, int valuelen, int flags); +int xfs_attr_set_args(struct xfs_da_args *args, struct xfs_buf **leaf_bp); int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); +int xfs_attr_remove_args(struct xfs_da_args *args); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index af094063e402..d89363c6b523 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -485,7 +485,7 @@ xfs_attr_rmtval_set( blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map, &nmap); if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; @@ -553,9 +553,6 @@ xfs_attr_rmtval_set( } ASSERT(valuelen == 0); return 0; -out_defer_cancel: - xfs_defer_cancel(args->trans); - return error; } /* @@ -625,7 +622,7 @@ xfs_attr_rmtval_remove( error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK, 1, &done); if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; @@ -638,7 +635,4 @@ xfs_attr_rmtval_remove( return error; } return 0; -out_defer_cancel: - xfs_defer_cancel(args->trans); - return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 2760314fdf7f..74d7228e755b 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -673,7 +673,8 @@ xfs_bmap_extents_to_btree( ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); /* - * Make space in the inode incore. + * Make space in the inode incore. This needs to be undone if we fail + * to expand the root. */ xfs_iroot_realloc(ip, 1, whichfork); ifp->if_flags |= XFS_IFBROOT; @@ -711,16 +712,15 @@ xfs_bmap_extents_to_btree( args.minlen = args.maxlen = args.prod = 1; args.wasdel = wasdel; *logflagsp = 0; - if ((error = xfs_alloc_vextent(&args))) { - ASSERT(ifp->if_broot == NULL); - goto err1; - } + error = xfs_alloc_vextent(&args); + if (error) + goto out_root_realloc; if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { - ASSERT(ifp->if_broot == NULL); error = -ENOSPC; - goto err1; + goto out_root_realloc; } + /* * Allocation can't fail, the space was reserved. */ @@ -732,9 +732,10 @@ xfs_bmap_extents_to_btree( xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0); if (!abp) { - error = -ENOSPC; - goto err2; + error = -EFSCORRUPTED; + goto out_unreserve_dquot; } + /* * Fill in the child block. */ @@ -775,11 +776,12 @@ xfs_bmap_extents_to_btree( *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork); return 0; -err2: +out_unreserve_dquot: xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); -err1: +out_root_realloc: xfs_iroot_realloc(ip, -1, whichfork); XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + ASSERT(ifp->if_broot == NULL); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; @@ -1017,6 +1019,34 @@ xfs_bmap_add_attrfork_local( return -EFSCORRUPTED; } +/* Set an inode attr fork off based on the format */ +int +xfs_bmap_set_attrforkoff( + struct xfs_inode *ip, + int size, + int *version) +{ + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_DEV: + ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; + break; + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); + if (!ip->i_d.di_forkoff) + ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; + else if ((ip->i_mount->m_flags & XFS_MOUNT_ATTR2) && version) + *version = 2; + break; + default: + ASSERT(0); + return -EINVAL; + } + + return 0; +} + /* * Convert inode from non-attributed to attributed. * Must not be in a transaction, ip must not be locked. @@ -1068,26 +1098,9 @@ xfs_bmap_add_attrfork( xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - switch (ip->i_d.di_format) { - case XFS_DINODE_FMT_DEV: - ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; - break; - case XFS_DINODE_FMT_LOCAL: - case XFS_DINODE_FMT_EXTENTS: - case XFS_DINODE_FMT_BTREE: - ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); - if (!ip->i_d.di_forkoff) - ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; - else if (mp->m_flags & XFS_MOUNT_ATTR2) - version = 2; - break; - default: - ASSERT(0); - error = -EINVAL; + error = xfs_bmap_set_attrforkoff(ip, size, &version); + if (error) goto trans_cancel; - } - ASSERT(ip->i_afp == NULL); ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); ip->i_afp->if_flags = XFS_IFEXTENTS; @@ -4079,8 +4092,7 @@ xfs_bmapi_allocate( * extents to real extents when we're about to write the data. */ if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) && - (bma->flags & XFS_BMAPI_PREALLOC) && - xfs_sb_version_hasextflgbit(&mp->m_sb)) + (bma->flags & XFS_BMAPI_PREALLOC)) bma->got.br_state = XFS_EXT_UNWRITTEN; if (bma->wasdel) @@ -5243,8 +5255,7 @@ __xfs_bunmapi( * unmapping part of it. But we can't really * get rid of part of a realtime extent. */ - if (del.br_state == XFS_EXT_UNWRITTEN || - !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + if (del.br_state == XFS_EXT_UNWRITTEN) { /* * This piece is unwritten, or we're not * using unwritten extents. Skip over it. @@ -5294,10 +5305,9 @@ __xfs_bunmapi( del.br_blockcount -= mod; del.br_startoff += mod; del.br_startblock += mod; - } else if ((del.br_startoff == start && - (del.br_state == XFS_EXT_UNWRITTEN || - tp->t_blk_res == 0)) || - !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + } else if (del.br_startoff == start && + (del.br_state == XFS_EXT_UNWRITTEN || + tp->t_blk_res == 0)) { /* * Can't make it unwritten. There isn't * a full extent here so just skip it. @@ -6112,11 +6122,7 @@ xfs_bmap_validate_extent( XFS_FSB_TO_AGNO(mp, endfsb)) return __this_address; } - if (irec->br_state != XFS_EXT_NORM) { - if (whichfork != XFS_DATA_FORK) - return __this_address; - if (!xfs_sb_version_hasextflgbit(&mp->m_sb)) - return __this_address; - } + if (irec->br_state != XFS_EXT_NORM && whichfork != XFS_DATA_FORK) + return __this_address; return NULL; } diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index b6e9b639e731..488dc8860fd7 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -183,6 +183,7 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); +int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, struct xfs_owner_info *oinfo, diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 059bc44c27e8..9995d5ae380b 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -287,6 +287,8 @@ static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp) { if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) return false; + if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)) + return false; /* check for unknown features in the fs */ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || @@ -357,12 +359,6 @@ static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp) (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); } -static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || - (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); -} - static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp) { return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); @@ -1016,6 +1012,8 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ #define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ #define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ +/* Do not use bit 15, di_flags is legacy and unchanging now */ + #define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) #define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) #define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 30d1d60f1d46..09d9c8cfa4a0 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -415,6 +415,31 @@ xfs_dinode_verify_fork( return NULL; } +static xfs_failaddr_t +xfs_dinode_verify_forkoff( + struct xfs_dinode *dip, + struct xfs_mount *mp) +{ + if (!XFS_DFORK_Q(dip)) + return NULL; + + switch (dip->di_format) { + case XFS_DINODE_FMT_DEV: + if (dip->di_forkoff != (roundup(sizeof(xfs_dev_t), 8) >> 3)) + return __this_address; + break; + case XFS_DINODE_FMT_LOCAL: /* fall through ... */ + case XFS_DINODE_FMT_EXTENTS: /* fall through ... */ + case XFS_DINODE_FMT_BTREE: + if (dip->di_forkoff >= (XFS_LITINO(mp, dip->di_version) >> 3)) + return __this_address; + break; + default: + return __this_address; + } + return NULL; +} + xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, @@ -470,6 +495,11 @@ xfs_dinode_verify( if (mode && (flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp) return __this_address; + /* check for illegal values of forkoff */ + fa = xfs_dinode_verify_forkoff(dip, mp); + if (fa) + return fa; + /* Do we have appropriate data fork formats for the mode? */ switch (mode & S_IFMT) { case S_IFIFO: diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 081f46e30556..b5a82acd7dfe 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -1115,7 +1115,8 @@ xfs_fs_geometry( geo->version = XFS_FSOP_GEOM_VERSION; geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | - XFS_FSOP_GEOM_FLAGS_DIRV2; + XFS_FSOP_GEOM_FLAGS_DIRV2 | + XFS_FSOP_GEOM_FLAGS_EXTFLG; if (xfs_sb_version_hasattr(sbp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR; if (xfs_sb_version_hasquota(sbp)) @@ -1124,8 +1125,6 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_IALIGN; if (xfs_sb_version_hasdalign(sbp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_DALIGN; - if (xfs_sb_version_hasextflgbit(sbp)) - geo->flags |= XFS_FSOP_GEOM_FLAGS_EXTFLG; if (xfs_sb_version_hassector(sbp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR; if (xfs_sb_version_hasasciici(sbp)) diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 036b5c7021eb..376bcb585ae6 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -17,7 +17,6 @@ #include "xfs_sb.h" #include "xfs_alloc.h" #include "xfs_rmap.h" -#include "xfs_alloc.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 5b3b177c0fc9..e386c9b0b4ab 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -126,6 +126,7 @@ xchk_inode_flags( { struct xfs_mount *mp = sc->mp; + /* di_flags are all taken, last bit cannot be used */ if (flags & ~XFS_DIFLAG_ANY) goto bad; @@ -172,8 +173,9 @@ xchk_inode_flags2( { struct xfs_mount *mp = sc->mp; + /* Unknown di_flags2 could be from a future kernel */ if (flags2 & ~XFS_DIFLAG2_ANY) - goto bad; + xchk_ino_set_warning(sc, ino); /* reflink flag requires reflink feature */ if ((flags2 & XFS_DIFLAG2_REFLINK) && diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 9f08dd9bf1d5..4fc0a5ea7673 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -29,6 +29,8 @@ #include "xfs_ag_resv.h" #include "xfs_trans_space.h" #include "xfs_quota.h" +#include "xfs_attr.h" +#include "xfs_reflink.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -692,13 +694,14 @@ xrep_findroot_block( struct xrep_find_ag_btree *fab, uint64_t owner, xfs_agblock_t agbno, - bool *found_it) + bool *done_with_block) { struct xfs_mount *mp = ri->sc->mp; struct xfs_buf *bp; struct xfs_btree_block *btblock; xfs_daddr_t daddr; - int error; + int block_level; + int error = 0; daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno); @@ -717,36 +720,111 @@ xrep_findroot_block( return error; } + /* + * Read the buffer into memory so that we can see if it's a match for + * our btree type. We have no clue if it is beforehand, and we want to + * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which + * will cause needless disk reads in subsequent calls to this function) + * and logging metadata verifier failures. + * + * Therefore, pass in NULL buffer ops. If the buffer was already in + * memory from some other caller it will already have b_ops assigned. + * If it was in memory from a previous unsuccessful findroot_block + * call, the buffer won't have b_ops but it should be clean and ready + * for us to try to verify if the read call succeeds. The same applies + * if the buffer wasn't in memory at all. + * + * Note: If we never match a btree type with this buffer, it will be + * left in memory with NULL b_ops. This shouldn't be a problem unless + * the buffer gets written. + */ error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr, mp->m_bsize, 0, &bp, NULL); if (error) return error; - /* - * Does this look like a block matching our fs and higher than any - * other block we've found so far? If so, reattach buffer verifiers - * so the AIL won't complain if the buffer is also dirty. - */ + /* Ensure the block magic matches the btree type we're looking for. */ btblock = XFS_BUF_TO_BLOCK(bp); if (be32_to_cpu(btblock->bb_magic) != fab->magic) goto out; - if (xfs_sb_version_hascrc(&mp->m_sb) && - !uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) - goto out; - bp->b_ops = fab->buf_ops; - /* Ignore this block if it's lower in the tree than we've seen. */ - if (fab->root != NULLAGBLOCK && - xfs_btree_get_level(btblock) < fab->height) - goto out; + /* + * If the buffer already has ops applied and they're not the ones for + * this btree type, we know this block doesn't match the btree and we + * can bail out. + * + * If the buffer ops match ours, someone else has already validated + * the block for us, so we can move on to checking if this is a root + * block candidate. + * + * If the buffer does not have ops, nobody has successfully validated + * the contents and the buffer cannot be dirty. If the magic, uuid, + * and structure match this btree type then we'll move on to checking + * if it's a root block candidate. If there is no match, bail out. + */ + if (bp->b_ops) { + if (bp->b_ops != fab->buf_ops) + goto out; + } else { + ASSERT(!xfs_trans_buf_is_dirty(bp)); + if (!uuid_equal(&btblock->bb_u.s.bb_uuid, + &mp->m_sb.sb_meta_uuid)) + goto out; + fab->buf_ops->verify_read(bp); + if (bp->b_error) { + bp->b_error = 0; + goto out; + } - /* Make sure we pass the verifiers. */ - bp->b_ops->verify_read(bp); - if (bp->b_error) + /* + * Some read verifiers will (re)set b_ops, so we must be + * careful not to blow away any such assignment. + */ + if (!bp->b_ops) + bp->b_ops = fab->buf_ops; + } + + /* + * This block passes the magic/uuid and verifier tests for this btree + * type. We don't need the caller to try the other tree types. + */ + *done_with_block = true; + + /* + * Compare this btree block's level to the height of the current + * candidate root block. + * + * If the level matches the root we found previously, throw away both + * blocks because there can't be two candidate roots. + * + * If level is lower in the tree than the root we found previously, + * ignore this block. + */ + block_level = xfs_btree_get_level(btblock); + if (block_level + 1 == fab->height) { + fab->root = NULLAGBLOCK; goto out; - fab->root = agbno; - fab->height = xfs_btree_get_level(btblock) + 1; - *found_it = true; + } else if (block_level < fab->height) { + goto out; + } + + /* + * This is the highest block in the tree that we've found so far. + * Update the btree height to reflect what we've learned from this + * block. + */ + fab->height = block_level + 1; + + /* + * If this block doesn't have sibling pointers, then it's the new root + * block candidate. Otherwise, the root will be found farther up the + * tree. + */ + if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) && + btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK)) + fab->root = agbno; + else + fab->root = NULLAGBLOCK; trace_xrep_findroot_block(mp, ri->sc->sa.agno, agbno, be32_to_cpu(btblock->bb_magic), fab->height - 1); @@ -768,7 +846,7 @@ xrep_findroot_rmap( struct xrep_findroot *ri = priv; struct xrep_find_ag_btree *fab; xfs_agblock_t b; - bool found_it; + bool done; int error = 0; /* Ignore anything that isn't AG metadata. */ @@ -777,16 +855,16 @@ xrep_findroot_rmap( /* Otherwise scan each block + btree type. */ for (b = 0; b < rec->rm_blockcount; b++) { - found_it = false; + done = false; for (fab = ri->btree_info; fab->buf_ops; fab++) { if (rec->rm_owner != fab->rmap_owner) continue; error = xrep_findroot_block(ri, fab, rec->rm_owner, rec->rm_startblock + b, - &found_it); + &done); if (error) return error; - if (found_it) + if (done) break; } } diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 4bfae1e61d30..1b2344d00525 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -412,19 +412,6 @@ xchk_validate_inputs( goto out; } - error = -EOPNOTSUPP; - /* - * We won't scrub any filesystem that doesn't have the ability - * to record unwritten extents. The option was made default in - * 2003, removed from mkfs in 2007, and cannot be disabled in - * v5, so if we find a filesystem without this flag it's either - * really old or totally unsupported. Avoid it either way. - * We also don't support v1-v3 filesystems, which aren't - * mountable. - */ - if (!xfs_sb_version_hasextflgbit(&mp->m_sb)) - goto out; - /* * We only want to repair read-write v5+ filesystems. Defer the check * for ops->repair until after our scrub confirms that we need to diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 49f5f5896a43..338b9d9984e0 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -917,7 +917,7 @@ xfs_vm_writepage( struct writeback_control *wbc) { struct xfs_writepage_ctx wpc = { - .io_type = XFS_IO_INVALID, + .io_type = XFS_IO_HOLE, }; int ret; @@ -933,7 +933,7 @@ xfs_vm_writepages( struct writeback_control *wbc) { struct xfs_writepage_ctx wpc = { - .io_type = XFS_IO_INVALID, + .io_type = XFS_IO_HOLE, }; int ret; diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 9af867951a10..494b4338446e 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -12,21 +12,19 @@ extern struct bio_set xfs_ioend_bioset; * Types of I/O for bmap clustering and I/O completion tracking. */ enum { - XFS_IO_INVALID, /* initial state */ + XFS_IO_HOLE, /* covers region without any block allocation */ XFS_IO_DELALLOC, /* covers delalloc region */ XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ XFS_IO_OVERWRITE, /* covers already allocated extent */ XFS_IO_COW, /* covers copy-on-write extent */ - XFS_IO_HOLE, /* covers region without any block allocation */ }; #define XFS_IO_TYPES \ - { XFS_IO_INVALID, "invalid" }, \ - { XFS_IO_DELALLOC, "delalloc" }, \ - { XFS_IO_UNWRITTEN, "unwritten" }, \ - { XFS_IO_OVERWRITE, "overwrite" }, \ - { XFS_IO_COW, "CoW" }, \ - { XFS_IO_HOLE, "hole" } + { XFS_IO_HOLE, "hole" }, \ + { XFS_IO_DELALLOC, "delalloc" }, \ + { XFS_IO_UNWRITTEN, "unwritten" }, \ + { XFS_IO_OVERWRITE, "overwrite" }, \ + { XFS_IO_COW, "CoW" } /* * Structure for buffered I/O completions. diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index addbd74ecd8e..5d263dfdb3bc 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -406,10 +406,10 @@ xfs_getbmap_report_one( struct xfs_bmbt_irec *got) { struct kgetbmap *p = out + bmv->bmv_entries; - bool shared = false, trimmed = false; + bool shared = false; int error; - error = xfs_reflink_trim_around_shared(ip, got, &shared, &trimmed); + error = xfs_reflink_trim_around_shared(ip, got, &shared); if (error) return error; @@ -702,13 +702,9 @@ xfs_bmap_punch_delalloc_range( struct xfs_iext_cursor icur; int error = 0; - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); - if (error) - goto out_unlock; - } + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + xfs_ilock(ip, XFS_ILOCK_EXCL); if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) goto out_unlock; @@ -1047,44 +1043,6 @@ out_trans_cancel: } static int -xfs_adjust_extent_unmap_boundaries( - struct xfs_inode *ip, - xfs_fileoff_t *startoffset_fsb, - xfs_fileoff_t *endoffset_fsb) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_bmbt_irec imap; - int nimap, error; - xfs_extlen_t mod = 0; - - nimap = 1; - error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0); - if (error) - return error; - - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - div_u64_rem(imap.br_startblock, mp->m_sb.sb_rextsize, &mod); - if (mod) - *startoffset_fsb += mp->m_sb.sb_rextsize - mod; - } - - nimap = 1; - error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0); - if (error) - return error; - - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - mod++; - if (mod && mod != mp->m_sb.sb_rextsize) - *endoffset_fsb -= mod; - } - - return 0; -} - -static int xfs_flush_unmap_range( struct xfs_inode *ip, xfs_off_t offset, @@ -1137,19 +1095,8 @@ xfs_free_file_space( endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); /* - * Need to zero the stuff we're not freeing, on disk. If it's a RT file - * and we can't use unwritten extents then we actually need to ensure - * to zero the whole extent, otherwise we just need to take of block - * boundaries, and xfs_bunmapi will handle the rest. + * Need to zero the stuff we're not freeing, on disk. */ - if (XFS_IS_REALTIME_INODE(ip) && - !xfs_sb_version_hasextflgbit(&mp->m_sb)) { - error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb, - &endoffset_fsb); - if (error) - return error; - } - if (endoffset_fsb > startoffset_fsb) { while (!done) { error = xfs_unmap_extent(ip, startoffset_fsb, @@ -1584,7 +1531,7 @@ xfs_swap_extent_rmap( tirec.br_blockcount, &irec, &nimaps, 0); if (error) - goto out_defer; + goto out; ASSERT(nimaps == 1); ASSERT(tirec.br_startoff == irec.br_startoff); trace_xfs_swap_extent_rmap_remap_piece(ip, &irec); @@ -1599,22 +1546,22 @@ xfs_swap_extent_rmap( /* Remove the mapping from the donor file. */ error = xfs_bmap_unmap_extent(tp, tip, &uirec); if (error) - goto out_defer; + goto out; /* Remove the mapping from the source file. */ error = xfs_bmap_unmap_extent(tp, ip, &irec); if (error) - goto out_defer; + goto out; /* Map the donor file's blocks into the source file. */ error = xfs_bmap_map_extent(tp, ip, &uirec); if (error) - goto out_defer; + goto out; /* Map the source file's blocks into the donor file. */ error = xfs_bmap_map_extent(tp, tip, &irec); if (error) - goto out_defer; + goto out; error = xfs_defer_finish(tpp); tp = *tpp; @@ -1636,8 +1583,6 @@ xfs_swap_extent_rmap( tip->i_d.di_flags2 = tip_flags2; return 0; -out_defer: - xfs_defer_cancel(tp); out: trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_); tip->i_d.di_flags2 = tip_flags2; @@ -1830,6 +1775,12 @@ xfs_swap_extents( if (error) goto out_unlock; + if (xfs_inode_has_cow_data(tip)) { + error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true); + if (error) + return error; + } + /* * Extent "swapping" with rmap requires a permanent reservation and * a block reservation because it's really just a remap operation diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index e839907e8492..b21ea2ba768d 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -37,6 +37,32 @@ static kmem_zone_t *xfs_buf_zone; #define xb_to_gfp(flags) \ ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN) +/* + * Locking orders + * + * xfs_buf_ioacct_inc: + * xfs_buf_ioacct_dec: + * b_sema (caller holds) + * b_lock + * + * xfs_buf_stale: + * b_sema (caller holds) + * b_lock + * lru_lock + * + * xfs_buf_rele: + * b_lock + * pag_buf_lock + * lru_lock + * + * xfs_buftarg_wait_rele + * lru_lock + * b_lock (trylock due to inversion) + * + * xfs_buftarg_isolate + * lru_lock + * b_lock (trylock due to inversion) + */ static inline int xfs_buf_is_vmapped( @@ -749,6 +775,30 @@ _xfs_buf_read( return xfs_buf_submit(bp); } +/* + * If the caller passed in an ops structure and the buffer doesn't have ops + * assigned, set the ops and use them to verify the contents. If the contents + * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no + * recorded errors and is already in XBF_DONE state. + */ +int +xfs_buf_ensure_ops( + struct xfs_buf *bp, + const struct xfs_buf_ops *ops) +{ + ASSERT(bp->b_flags & XBF_DONE); + ASSERT(bp->b_error == 0); + + if (!ops || bp->b_ops) + return 0; + + bp->b_ops = ops; + bp->b_ops->verify_read(bp); + if (bp->b_error) + bp->b_flags &= ~XBF_DONE; + return bp->b_error; +} + xfs_buf_t * xfs_buf_read_map( struct xfs_buftarg *target, @@ -762,26 +812,32 @@ xfs_buf_read_map( flags |= XBF_READ; bp = xfs_buf_get_map(target, map, nmaps, flags); - if (bp) { - trace_xfs_buf_read(bp, flags, _RET_IP_); + if (!bp) + return NULL; - if (!(bp->b_flags & XBF_DONE)) { - XFS_STATS_INC(target->bt_mount, xb_get_read); - bp->b_ops = ops; - _xfs_buf_read(bp, flags); - } else if (flags & XBF_ASYNC) { - /* - * Read ahead call which is already satisfied, - * drop the buffer - */ - xfs_buf_relse(bp); - return NULL; - } else { - /* We do not want read in the flags */ - bp->b_flags &= ~XBF_READ; - } + trace_xfs_buf_read(bp, flags, _RET_IP_); + + if (!(bp->b_flags & XBF_DONE)) { + XFS_STATS_INC(target->bt_mount, xb_get_read); + bp->b_ops = ops; + _xfs_buf_read(bp, flags); + return bp; } + xfs_buf_ensure_ops(bp, ops); + + if (flags & XBF_ASYNC) { + /* + * Read ahead call which is already satisfied, + * drop the buffer + */ + xfs_buf_relse(bp); + return NULL; + } + + /* We do not want read in the flags */ + bp->b_flags &= ~XBF_READ; + ASSERT(bp->b_ops != NULL || ops == NULL); return bp; } @@ -1006,8 +1062,18 @@ xfs_buf_rele( ASSERT(atomic_read(&bp->b_hold) > 0); - release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); + /* + * We grab the b_lock here first to serialise racing xfs_buf_rele() + * calls. The pag_buf_lock being taken on the last reference only + * serialises against racing lookups in xfs_buf_find(). IOWs, the second + * to last reference we drop here is not serialised against the last + * reference until we take bp->b_lock. Hence if we don't grab b_lock + * first, the last "release" reference can win the race to the lock and + * free the buffer before the second-to-last reference is processed, + * leading to a use-after-free scenario. + */ spin_lock(&bp->b_lock); + release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); if (!release) { /* * Drop the in-flight state if the buffer is already on the LRU @@ -1989,6 +2055,13 @@ xfs_buf_delwri_submit_buffers( * is only safely useable for callers that can track I/O completion by higher * level means, e.g. AIL pushing as the @buffer_list is consumed in this * function. + * + * Note: this function will skip buffers it would block on, and in doing so + * leaves them on @buffer_list so they can be retried on a later pass. As such, + * it is up to the caller to ensure that the buffer list is fully submitted or + * cancelled appropriately when they are finished with the list. Failure to + * cancel or resubmit the list until it is empty will result in leaked buffers + * at unmount time. */ int xfs_buf_delwri_submit_nowait( diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 4e3171acd0f8..b9f5511ea998 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -385,4 +385,6 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) +int xfs_buf_ensure_ops(struct xfs_buf *bp, const struct xfs_buf_ops *ops); + #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 1c9d1398980b..12d8455bfbb2 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -532,6 +532,49 @@ xfs_buf_item_push( } /* + * Drop the buffer log item refcount and take appropriate action. This helper + * determines whether the bli must be freed or not, since a decrement to zero + * does not necessarily mean the bli is unused. + * + * Return true if the bli is freed, false otherwise. + */ +bool +xfs_buf_item_put( + struct xfs_buf_log_item *bip) +{ + struct xfs_log_item *lip = &bip->bli_item; + bool aborted; + bool dirty; + + /* drop the bli ref and return if it wasn't the last one */ + if (!atomic_dec_and_test(&bip->bli_refcount)) + return false; + + /* + * We dropped the last ref and must free the item if clean or aborted. + * If the bli is dirty and non-aborted, the buffer was clean in the + * transaction but still awaiting writeback from previous changes. In + * that case, the bli is freed on buffer writeback completion. + */ + aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) || + XFS_FORCED_SHUTDOWN(lip->li_mountp); + dirty = bip->bli_flags & XFS_BLI_DIRTY; + if (dirty && !aborted) + return false; + + /* + * The bli is aborted or clean. An aborted item may be in the AIL + * regardless of dirty state. For example, consider an aborted + * transaction that invalidated a dirty bli and cleared the dirty + * state. + */ + if (aborted) + xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); + xfs_buf_item_relse(bip->bli_buf); + return true; +} + +/* * Release the buffer associated with the buf log item. If there is no dirty * logged data associated with the buffer recorded in the buf log item, then * free the buf log item and remove the reference to it in the buffer. @@ -556,76 +599,42 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - bool aborted; - bool hold = !!(bip->bli_flags & XFS_BLI_HOLD); - bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY); + bool released; + bool hold = bip->bli_flags & XFS_BLI_HOLD; + bool stale = bip->bli_flags & XFS_BLI_STALE; #if defined(DEBUG) || defined(XFS_WARN) - bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED); + bool ordered = bip->bli_flags & XFS_BLI_ORDERED; + bool dirty = bip->bli_flags & XFS_BLI_DIRTY; #endif - aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags); - - /* Clear the buffer's association with this transaction. */ - bp->b_transp = NULL; - - /* - * The per-transaction state has been copied above so clear it from the - * bli. - */ - bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); - - /* - * If the buf item is marked stale, then don't do anything. We'll - * unlock the buffer and free the buf item when the buffer is unpinned - * for the last time. - */ - if (bip->bli_flags & XFS_BLI_STALE) { - trace_xfs_buf_item_unlock_stale(bip); - ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); - if (!aborted) { - atomic_dec(&bip->bli_refcount); - return; - } - } - trace_xfs_buf_item_unlock(bip); /* - * If the buf item isn't tracking any data, free it, otherwise drop the - * reference we hold to it. If we are aborting the transaction, this may - * be the only reference to the buf item, so we free it anyway - * regardless of whether it is dirty or not. A dirty abort implies a - * shutdown, anyway. - * * The bli dirty state should match whether the blf has logged segments * except for ordered buffers, where only the bli should be dirty. */ ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) || (ordered && dirty && !xfs_buf_item_dirty_format(bip))); + ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); /* - * Clean buffers, by definition, cannot be in the AIL. However, aborted - * buffers may be in the AIL regardless of dirty state. An aborted - * transaction that invalidates a buffer already in the AIL may have - * marked it stale and cleared the dirty state, for example. - * - * Therefore if we are aborting a buffer and we've just taken the last - * reference away, we have to check if it is in the AIL before freeing - * it. We need to free it in this case, because an aborted transaction - * has already shut the filesystem down and this is the last chance we - * will have to do so. + * Clear the buffer's association with this transaction and + * per-transaction state from the bli, which has been copied above. */ - if (atomic_dec_and_test(&bip->bli_refcount)) { - if (aborted) { - ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); - xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); - xfs_buf_item_relse(bp); - } else if (!dirty) - xfs_buf_item_relse(bp); - } + bp->b_transp = NULL; + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); - if (!hold) - xfs_buf_relse(bp); + /* + * Unref the item and unlock the buffer unless held or stale. Stale + * buffers remain locked until final unpin unless the bli is freed by + * the unref call. The latter implies shutdown because buffer + * invalidation dirties the bli and transaction. + */ + released = xfs_buf_item_put(bip); + if (hold || (stale && !released)) + return; + ASSERT(!stale || test_bit(XFS_LI_ABORTED, &lip->li_flags)); + xfs_buf_relse(bp); } /* diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 3f7d7b72e7e6..90f65f891fab 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -51,6 +51,7 @@ struct xfs_buf_log_item { int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); +bool xfs_buf_item_put(struct xfs_buf_log_item *); void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_attach_iodone(struct xfs_buf *, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 7c00b8bedfe3..093c2b8d7e20 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -470,20 +470,13 @@ xfs_fs_goingdown( */ void xfs_do_force_shutdown( - xfs_mount_t *mp, + struct xfs_mount *mp, int flags, char *fname, int lnnum) { - int logerror; - - logerror = flags & SHUTDOWN_LOG_IO_ERROR; + bool logerror = flags & SHUTDOWN_LOG_IO_ERROR; - if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - xfs_notice(mp, - "%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT, - __func__, flags, lnnum, fname, __return_address); - } /* * No need to duplicate efforts. */ @@ -499,27 +492,34 @@ xfs_do_force_shutdown( if (xfs_log_force_umount(mp, logerror)) return; + if (flags & SHUTDOWN_FORCE_UMOUNT) { + xfs_alert(mp, +"User initiated shutdown received. Shutting down filesystem"); + return; + } + + xfs_notice(mp, +"%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT, + __func__, flags, lnnum, fname, __return_address); + if (flags & SHUTDOWN_CORRUPT_INCORE) { xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, - "Corruption of in-memory data detected. Shutting down filesystem"); +"Corruption of in-memory data detected. Shutting down filesystem"); if (XFS_ERRLEVEL_HIGH <= xfs_error_level) xfs_stack_trace(); - } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - if (logerror) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, - "Log I/O Error Detected. Shutting down filesystem"); - } else if (flags & SHUTDOWN_DEVICE_REQ) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "All device paths lost. Shutting down filesystem"); - } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "I/O Error Detected. Shutting down filesystem"); - } - } - if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - xfs_alert(mp, - "Please umount the filesystem and rectify the problem(s)"); + } else if (logerror) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, + "Log I/O Error Detected. Shutting down filesystem"); + } else if (flags & SHUTDOWN_DEVICE_REQ) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "All device paths lost. Shutting down filesystem"); + } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "I/O Error Detected. Shutting down filesystem"); } + + xfs_alert(mp, + "Please unmount the filesystem and rectify the problem(s)"); } /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d957a46dc1cb..05db9540e459 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1563,7 +1563,7 @@ xfs_itruncate_extents_flags( error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags, XFS_ITRUNC_MAX_EXTENTS, &done); if (error) - goto out_bmap_cancel; + goto out; /* * Duplicate the transaction that has the permanent @@ -1599,14 +1599,6 @@ xfs_itruncate_extents_flags( out: *tpp = tp; return error; -out_bmap_cancel: - /* - * If the bunmapi call encounters an error, return to the caller where - * the transaction can be properly aborted. We just need to make sure - * we're not holding any resources that we were not when we came in. - */ - xfs_defer_cancel(tp); - goto out; } int diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0ef5ece5634c..6e2c08f30f60 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -604,14 +604,6 @@ xfs_ioc_space( uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; int error; - /* - * Only allow the sys admin to reserve space unless - * unwritten extents are enabled. - */ - if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) return -EPERM; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 6320aca39f39..27c93b5f029d 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -62,6 +62,21 @@ xfs_bmbt_to_iomap( iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); } +static void +xfs_hole_to_iomap( + struct xfs_inode *ip, + struct iomap *iomap, + xfs_fileoff_t offset_fsb, + xfs_fileoff_t end_fsb) +{ + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; + iomap->offset = XFS_FSB_TO_B(ip->i_mount, offset_fsb); + iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb); + iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); + iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); +} + xfs_extlen_t xfs_eof_alignment( struct xfs_inode *ip, @@ -502,6 +517,7 @@ xfs_file_iomap_begin_delay( struct inode *inode, loff_t offset, loff_t count, + unsigned flags, struct iomap *iomap) { struct xfs_inode *ip = XFS_I(inode); @@ -538,15 +554,23 @@ xfs_file_iomap_begin_delay( goto out_unlock; } + end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); + eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got); - if (!eof && got.br_startoff <= offset_fsb) { - if (xfs_is_reflink_inode(ip)) { - bool shared; + if (eof) + got.br_startoff = end_fsb; /* fake hole until the end */ - end_fsb = min(XFS_B_TO_FSB(mp, offset + count), - maxbytes_fsb); + if (got.br_startoff <= offset_fsb) { + /* + * For reflink files we may need a delalloc reservation when + * overwriting shared extents. This includes zeroing of + * existing extents that contain data. + */ + if (xfs_is_reflink_inode(ip) && + ((flags & IOMAP_WRITE) || + got.br_state != XFS_EXT_UNWRITTEN)) { xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb); - error = xfs_reflink_reserve_cow(ip, &got, &shared); + error = xfs_reflink_reserve_cow(ip, &got); if (error) goto out_unlock; } @@ -555,6 +579,11 @@ xfs_file_iomap_begin_delay( goto done; } + if (flags & IOMAP_ZERO) { + xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff); + goto out_unlock; + } + error = xfs_qm_dqattach_locked(ip, false); if (error) goto out_unlock; @@ -1003,16 +1032,17 @@ xfs_file_iomap_begin( struct xfs_bmbt_irec imap; xfs_fileoff_t offset_fsb, end_fsb; int nimaps = 1, error = 0; - bool shared = false, trimmed = false; + bool shared = false; unsigned lockmode; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) && + if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && !(flags & IOMAP_DIRECT) && !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { /* Reserve delalloc blocks for regular writeback. */ - return xfs_file_iomap_begin_delay(inode, offset, length, iomap); + return xfs_file_iomap_begin_delay(inode, offset, length, flags, + iomap); } /* @@ -1038,8 +1068,7 @@ xfs_file_iomap_begin( if (flags & IOMAP_REPORT) { /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_reflink_trim_around_shared(ip, &imap, &shared, - &trimmed); + error = xfs_reflink_trim_around_shared(ip, &imap, &shared); if (error) goto out_unlock; } @@ -1065,7 +1094,7 @@ xfs_file_iomap_begin( if (error) goto out_unlock; } else { - error = xfs_reflink_reserve_cow(ip, &imap, &shared); + error = xfs_reflink_reserve_cow(ip, &imap); if (error) goto out_unlock; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index c3e74f9128e8..f48ffd7a8d3e 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -471,8 +471,18 @@ xfs_vn_get_link_inline( struct inode *inode, struct delayed_call *done) { + char *link; + ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE); - return XFS_I(inode)->i_df.if_u1.if_data; + + /* + * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if + * if_data is junk. + */ + link = XFS_I(inode)->i_df.if_u1.if_data; + if (!link) + return ERR_PTR(-EFSCORRUPTED); + return link; } STATIC int diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a21dc61ec09e..1fc9e9042e0e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1570,16 +1570,6 @@ xlog_find_zeroed( if (last_cycle != 0) { /* log completely written to */ xlog_put_bp(bp); return 0; - } else if (first_cycle != 1) { - /* - * If the cycle of the last block is zero, the cycle of - * the first block must be 1. If it's not, maybe we're - * not looking at a log... Bail out. - */ - xfs_warn(log->l_mp, - "Log inconsistent or not a log (last==0, first!=1)"); - error = -EINVAL; - goto bp_err; } /* we have a partially zeroed log */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 38f405415b88..8eaeec9d58ed 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -182,8 +182,7 @@ int xfs_reflink_trim_around_shared( struct xfs_inode *ip, struct xfs_bmbt_irec *irec, - bool *shared, - bool *trimmed) + bool *shared) { xfs_agnumber_t agno; xfs_agblock_t agbno; @@ -209,7 +208,7 @@ xfs_reflink_trim_around_shared( if (error) return error; - *shared = *trimmed = false; + *shared = false; if (fbno == NULLAGBLOCK) { /* No shared blocks at all. */ return 0; @@ -222,8 +221,6 @@ xfs_reflink_trim_around_shared( */ irec->br_blockcount = flen; *shared = true; - if (flen != aglen) - *trimmed = true; return 0; } else { /* @@ -233,7 +230,6 @@ xfs_reflink_trim_around_shared( * start of the shared region. */ irec->br_blockcount = fbno - agbno; - *trimmed = true; return 0; } } @@ -241,7 +237,7 @@ xfs_reflink_trim_around_shared( /* * Trim the passed in imap to the next shared/unshared extent boundary, and * if imap->br_startoff points to a shared extent reserve space for it in the - * COW fork. In this case *shared is set to true, else to false. + * COW fork. * * Note that imap will always contain the block numbers for the existing blocks * in the data fork, as the upper layers need them for read-modify-write @@ -250,14 +246,14 @@ xfs_reflink_trim_around_shared( int xfs_reflink_reserve_cow( struct xfs_inode *ip, - struct xfs_bmbt_irec *imap, - bool *shared) + struct xfs_bmbt_irec *imap) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); struct xfs_bmbt_irec got; int error = 0; - bool eof = false, trimmed; + bool eof = false; struct xfs_iext_cursor icur; + bool shared; /* * Search the COW fork extent list first. This serves two purposes: @@ -273,18 +269,16 @@ xfs_reflink_reserve_cow( if (!eof && got.br_startoff <= imap->br_startoff) { trace_xfs_reflink_cow_found(ip, imap); xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); - - *shared = true; return 0; } /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed); + error = xfs_reflink_trim_around_shared(ip, imap, &shared); if (error) return error; /* Not shared? Just report the (potentially capped) extent. */ - if (!*shared) + if (!shared) return 0; /* @@ -352,6 +346,50 @@ xfs_reflink_convert_cow( return error; } +/* + * Find the extent that maps the given range in the COW fork. Even if the extent + * is not shared we might have a preallocation for it in the COW fork. If so we + * use it that rather than trigger a new allocation. + */ +static int +xfs_find_trim_cow_extent( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + bool *shared, + bool *found) +{ + xfs_fileoff_t offset_fsb = imap->br_startoff; + xfs_filblks_t count_fsb = imap->br_blockcount; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + + *found = false; + + /* + * If we don't find an overlapping extent, trim the range we need to + * allocate to fit the hole we found. + */ + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) + got.br_startoff = offset_fsb + count_fsb; + if (got.br_startoff > offset_fsb) { + xfs_trim_extent(imap, imap->br_startoff, + got.br_startoff - imap->br_startoff); + return xfs_reflink_trim_around_shared(ip, imap, shared); + } + + *shared = true; + if (isnullstartblock(got.br_startblock)) { + xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); + return 0; + } + + /* real extent found - no need to allocate */ + xfs_trim_extent(&got, offset_fsb, count_fsb); + *imap = got; + *found = true; + return 0; +} + /* Allocate all CoW reservations covering a range of blocks in a file. */ int xfs_reflink_allocate_cow( @@ -363,78 +401,64 @@ xfs_reflink_allocate_cow( struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = imap->br_startoff; xfs_filblks_t count_fsb = imap->br_blockcount; - struct xfs_bmbt_irec got; - struct xfs_trans *tp = NULL; + struct xfs_trans *tp; int nimaps, error = 0; - bool trimmed; + bool found; xfs_filblks_t resaligned; xfs_extlen_t resblks = 0; - struct xfs_iext_cursor icur; -retry: - ASSERT(xfs_is_reflink_inode(ip)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(xfs_is_reflink_inode(ip)); - /* - * Even if the extent is not shared we might have a preallocation for - * it in the COW fork. If so use it. - */ - if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) && - got.br_startoff <= offset_fsb) { - *shared = true; - - /* If we have a real allocation in the COW fork we're done. */ - if (!isnullstartblock(got.br_startblock)) { - xfs_trim_extent(&got, offset_fsb, count_fsb); - *imap = got; - goto convert; - } + error = xfs_find_trim_cow_extent(ip, imap, shared, &found); + if (error || !*shared) + return error; + if (found) + goto convert; - xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); - } else { - error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed); - if (error || !*shared) - goto out; - } + resaligned = xfs_aligned_fsb_count(imap->br_startoff, + imap->br_blockcount, xfs_get_cowextsz_hint(ip)); + resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); - if (!tp) { - resaligned = xfs_aligned_fsb_count(imap->br_startoff, - imap->br_blockcount, xfs_get_cowextsz_hint(ip)); - resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + xfs_iunlock(ip, *lockmode); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + *lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, *lockmode); - xfs_iunlock(ip, *lockmode); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); - *lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, *lockmode); + if (error) + return error; - if (error) - return error; + error = xfs_qm_dqattach_locked(ip, false); + if (error) + goto out_trans_cancel; - error = xfs_qm_dqattach_locked(ip, false); - if (error) - goto out; - goto retry; + /* + * Check for an overlapping extent again now that we dropped the ilock. + */ + error = xfs_find_trim_cow_extent(ip, imap, shared, &found); + if (error || !*shared) + goto out_trans_cancel; + if (found) { + xfs_trans_cancel(tp); + goto convert; } error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, XFS_QMOPT_RES_REGBLKS); if (error) - goto out; + goto out_trans_cancel; xfs_trans_ijoin(tp, ip, 0); - nimaps = 1; - /* Allocate the entire reservation as unwritten blocks. */ + nimaps = 1; error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, resblks, imap, &nimaps); if (error) - goto out_trans_cancel; + goto out_unreserve; xfs_inode_set_cowblocks_tag(ip); - - /* Finish up. */ error = xfs_trans_commit(tp); if (error) return error; @@ -447,12 +471,12 @@ retry: return -ENOSPC; convert: return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb); -out_trans_cancel: + +out_unreserve: xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, XFS_QMOPT_RES_REGBLKS); -out: - if (tp) - xfs_trans_cancel(tp); +out_trans_cancel: + xfs_trans_cancel(tp); return error; } @@ -666,14 +690,12 @@ xfs_reflink_end_cow( if (!del.br_blockcount) goto prev_extent; - ASSERT(!isnullstartblock(got.br_startblock)); - /* - * Don't remap unwritten extents; these are - * speculatively preallocated CoW extents that have been - * allocated but have not yet been involved in a write. + * Only remap real extent that contain data. With AIO + * speculatively preallocations can leak into the range we + * are called upon, and we need to skip them. */ - if (got.br_state == XFS_EXT_UNWRITTEN) + if (!xfs_bmap_is_real_extent(&got)) goto prev_extent; /* Unmap the old blocks in the data fork. */ @@ -1195,35 +1217,92 @@ retry: return 0; } +/* Unlock both inodes after they've been prepped for a range clone. */ +STATIC void +xfs_reflink_remap_unlock( + struct file *file_in, + struct file *file_out) +{ + struct inode *inode_in = file_inode(file_in); + struct xfs_inode *src = XFS_I(inode_in); + struct inode *inode_out = file_inode(file_out); + struct xfs_inode *dest = XFS_I(inode_out); + bool same_inode = (inode_in == inode_out); + + xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); + if (!same_inode) + xfs_iunlock(src, XFS_MMAPLOCK_SHARED); + inode_unlock(inode_out); + if (!same_inode) + inode_unlock_shared(inode_in); +} + /* - * Link a range of blocks from one file to another. + * If we're reflinking to a point past the destination file's EOF, we must + * zero any speculative post-EOF preallocations that sit between the old EOF + * and the destination file offset. */ -int -xfs_reflink_remap_range( +static int +xfs_reflink_zero_posteof( + struct xfs_inode *ip, + loff_t pos) +{ + loff_t isize = i_size_read(VFS_I(ip)); + + if (pos <= isize) + return 0; + + trace_xfs_zero_eof(ip, isize, pos - isize); + return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL, + &xfs_iomap_ops); +} + +/* + * Prepare two files for range cloning. Upon a successful return both inodes + * will have the iolock and mmaplock held, the page cache of the out file will + * be truncated, and any leases on the out file will have been broken. This + * function borrows heavily from xfs_file_aio_write_checks. + * + * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't + * checked that the bytes beyond EOF physically match. Hence we cannot use the + * EOF block in the source dedupe range because it's not a complete block match, + * hence can introduce a corruption into the file that has it's block replaced. + * + * In similar fashion, the VFS file cloning also allows partial EOF blocks to be + * "block aligned" for the purposes of cloning entire files. However, if the + * source file range includes the EOF block and it lands within the existing EOF + * of the destination file, then we can expose stale data from beyond the source + * file EOF in the destination file. + * + * XFS doesn't support partial block sharing, so in both cases we have check + * these cases ourselves. For dedupe, we can simply round the length to dedupe + * down to the previous whole block and ignore the partial EOF block. While this + * means we can't dedupe the last block of a file, this is an acceptible + * tradeoff for simplicity on implementation. + * + * For cloning, we want to share the partial EOF block if it is also the new EOF + * block of the destination file. If the partial EOF block lies inside the + * existing destination EOF, then we have to abort the clone to avoid exposing + * stale data in the destination file. Hence we reject these clone attempts with + * -EINVAL in this case. + */ +STATIC int +xfs_reflink_remap_prep( struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 len, + u64 *len, bool is_dedupe) { struct inode *inode_in = file_inode(file_in); struct xfs_inode *src = XFS_I(inode_in); struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); - struct xfs_mount *mp = src->i_mount; bool same_inode = (inode_in == inode_out); - xfs_fileoff_t sfsbno, dfsbno; - xfs_filblks_t fsblen; - xfs_extlen_t cowextsize; + u64 blkmask = i_blocksize(inode_in) - 1; ssize_t ret; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) - return -EOPNOTSUPP; - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - /* Lock both files against IO */ ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out); if (ret) @@ -1245,33 +1324,115 @@ xfs_reflink_remap_range( goto out_unlock; ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, - &len, is_dedupe); + len, is_dedupe); if (ret <= 0) goto out_unlock; + /* + * If the dedupe data matches, chop off the partial EOF block + * from the source file so we don't try to dedupe the partial + * EOF block. + */ + if (is_dedupe) { + *len &= ~blkmask; + } else if (*len & blkmask) { + /* + * The user is attempting to share a partial EOF block, + * if it's inside the destination EOF then reject it. + */ + if (pos_out + *len < i_size_read(inode_out)) { + ret = -EINVAL; + goto out_unlock; + } + } + /* Attach dquots to dest inode before changing block map */ ret = xfs_qm_dqattach(dest); if (ret) goto out_unlock; - trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); - /* - * Clear out post-eof preallocations because we don't have page cache - * backing the delayed allocations and they'll never get freed on - * their own. + * Zero existing post-eof speculative preallocations in the destination + * file. */ - if (xfs_can_free_eofblocks(dest, true)) { - ret = xfs_free_eofblocks(dest); - if (ret) - goto out_unlock; - } + ret = xfs_reflink_zero_posteof(dest, pos_out); + if (ret) + goto out_unlock; /* Set flags and remap blocks. */ ret = xfs_reflink_set_inode_flag(src, dest); if (ret) goto out_unlock; + /* Zap any page cache for the destination file's range. */ + truncate_inode_pages_range(&inode_out->i_data, pos_out, + PAGE_ALIGN(pos_out + *len) - 1); + + /* If we're altering the file contents... */ + if (!is_dedupe) { + /* + * ...update the timestamps (which will grab the ilock again + * from xfs_fs_dirty_inode, so we have to call it before we + * take the ilock). + */ + if (!(file_out->f_mode & FMODE_NOCMTIME)) { + ret = file_update_time(file_out); + if (ret) + goto out_unlock; + } + + /* + * ...clear the security bits if the process is not being run + * by root. This keeps people from modifying setuid and setgid + * binaries. + */ + ret = file_remove_privs(file_out); + if (ret) + goto out_unlock; + } + + return 1; +out_unlock: + xfs_reflink_remap_unlock(file_in, file_out); + return ret; +} + +/* + * Link a range of blocks from one file to another. + */ +int +xfs_reflink_remap_range( + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len, + bool is_dedupe) +{ + struct inode *inode_in = file_inode(file_in); + struct xfs_inode *src = XFS_I(inode_in); + struct inode *inode_out = file_inode(file_out); + struct xfs_inode *dest = XFS_I(inode_out); + struct xfs_mount *mp = src->i_mount; + xfs_fileoff_t sfsbno, dfsbno; + xfs_filblks_t fsblen; + xfs_extlen_t cowextsize; + ssize_t ret; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return -EOPNOTSUPP; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* Prepare and then clone file data. */ + ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out, + &len, is_dedupe); + if (ret <= 0) + return ret; + + trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); + dfsbno = XFS_B_TO_FSBT(mp, pos_out); sfsbno = XFS_B_TO_FSBT(mp, pos_in); fsblen = XFS_B_TO_FSB(mp, len); @@ -1280,10 +1441,6 @@ xfs_reflink_remap_range( if (ret) goto out_unlock; - /* Zap any page cache for the destination file's range. */ - truncate_inode_pages_range(&inode_out->i_data, pos_out, - PAGE_ALIGN(pos_out + len) - 1); - /* * Carry the cowextsize hint from src to dest if we're sharing the * entire source file to the entire destination file, the source file @@ -1300,12 +1457,7 @@ xfs_reflink_remap_range( is_dedupe); out_unlock: - xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); - if (!same_inode) - xfs_iunlock(src, XFS_MMAPLOCK_SHARED); - inode_unlock(inode_out); - if (!same_inode) - inode_unlock_shared(inode_in); + xfs_reflink_remap_unlock(file_in, file_out); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); return ret; diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index c585ad9552b2..7f47202b5639 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -10,10 +10,10 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal); extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, - struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed); + struct xfs_bmbt_irec *irec, bool *shared); extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, - struct xfs_bmbt_irec *imap, bool *shared); + struct xfs_bmbt_irec *imap); extern int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode); extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 4e4423153071..cc509743facd 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -29,30 +29,30 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) char *desc; int endpoint; } xstats[] = { - { "extent_alloc", XFSSTAT_END_EXTENT_ALLOC }, - { "abt", XFSSTAT_END_ALLOC_BTREE }, - { "blk_map", XFSSTAT_END_BLOCK_MAPPING }, - { "bmbt", XFSSTAT_END_BLOCK_MAP_BTREE }, - { "dir", XFSSTAT_END_DIRECTORY_OPS }, - { "trans", XFSSTAT_END_TRANSACTIONS }, - { "ig", XFSSTAT_END_INODE_OPS }, - { "log", XFSSTAT_END_LOG_OPS }, - { "push_ail", XFSSTAT_END_TAIL_PUSHING }, - { "xstrat", XFSSTAT_END_WRITE_CONVERT }, - { "rw", XFSSTAT_END_READ_WRITE_OPS }, - { "attr", XFSSTAT_END_ATTRIBUTE_OPS }, - { "icluster", XFSSTAT_END_INODE_CLUSTER }, - { "vnodes", XFSSTAT_END_VNODE_OPS }, - { "buf", XFSSTAT_END_BUF }, - { "abtb2", XFSSTAT_END_ABTB_V2 }, - { "abtc2", XFSSTAT_END_ABTC_V2 }, - { "bmbt2", XFSSTAT_END_BMBT_V2 }, - { "ibt2", XFSSTAT_END_IBT_V2 }, - { "fibt2", XFSSTAT_END_FIBT_V2 }, - { "rmapbt", XFSSTAT_END_RMAP_V2 }, - { "refcntbt", XFSSTAT_END_REFCOUNT }, + { "extent_alloc", xfsstats_offset(xs_abt_lookup) }, + { "abt", xfsstats_offset(xs_blk_mapr) }, + { "blk_map", xfsstats_offset(xs_bmbt_lookup) }, + { "bmbt", xfsstats_offset(xs_dir_lookup) }, + { "dir", xfsstats_offset(xs_trans_sync) }, + { "trans", xfsstats_offset(xs_ig_attempts) }, + { "ig", xfsstats_offset(xs_log_writes) }, + { "log", xfsstats_offset(xs_try_logspace)}, + { "push_ail", xfsstats_offset(xs_xstrat_quick)}, + { "xstrat", xfsstats_offset(xs_write_calls) }, + { "rw", xfsstats_offset(xs_attr_get) }, + { "attr", xfsstats_offset(xs_iflush_count)}, + { "icluster", xfsstats_offset(vn_active) }, + { "vnodes", xfsstats_offset(xb_get) }, + { "buf", xfsstats_offset(xs_abtb_2) }, + { "abtb2", xfsstats_offset(xs_abtc_2) }, + { "abtc2", xfsstats_offset(xs_bmbt_2) }, + { "bmbt2", xfsstats_offset(xs_ibt_2) }, + { "ibt2", xfsstats_offset(xs_fibt_2) }, + { "fibt2", xfsstats_offset(xs_rmap_2) }, + { "rmapbt", xfsstats_offset(xs_refcbt_2) }, + { "refcntbt", xfsstats_offset(xs_qm_dqreclaims)}, /* we print both series of quota information together */ - { "qm", XFSSTAT_END_QM }, + { "qm", xfsstats_offset(xs_xstrat_bytes)}, }; /* Loop over all stats groups */ @@ -104,6 +104,10 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats) #ifdef CONFIG_PROC_FS /* legacy quota interfaces */ #ifdef CONFIG_XFS_QUOTA + +#define XFSSTAT_START_XQMSTAT xfsstats_offset(xs_qm_dqreclaims) +#define XFSSTAT_END_XQMSTAT xfsstats_offset(xs_qm_dquot) + static int xqm_proc_show(struct seq_file *m, void *v) { /* maximum; incore; ratio free to inuse; freelist */ @@ -119,7 +123,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v) int j; seq_printf(m, "qm"); - for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++) + for (j = XFSSTAT_START_XQMSTAT; j < XFSSTAT_END_XQMSTAT; j++) seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j)); seq_putc(m, '\n'); return 0; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 130db070e4d8..34d704f703d2 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -41,17 +41,14 @@ enum { * XFS global statistics */ struct __xfsstats { -# define XFSSTAT_END_EXTENT_ALLOC 4 uint32_t xs_allocx; uint32_t xs_allocb; uint32_t xs_freex; uint32_t xs_freeb; -# define XFSSTAT_END_ALLOC_BTREE (XFSSTAT_END_EXTENT_ALLOC+4) uint32_t xs_abt_lookup; uint32_t xs_abt_compare; uint32_t xs_abt_insrec; uint32_t xs_abt_delrec; -# define XFSSTAT_END_BLOCK_MAPPING (XFSSTAT_END_ALLOC_BTREE+7) uint32_t xs_blk_mapr; uint32_t xs_blk_mapw; uint32_t xs_blk_unmap; @@ -59,21 +56,17 @@ struct __xfsstats { uint32_t xs_del_exlist; uint32_t xs_look_exlist; uint32_t xs_cmp_exlist; -# define XFSSTAT_END_BLOCK_MAP_BTREE (XFSSTAT_END_BLOCK_MAPPING+4) uint32_t xs_bmbt_lookup; uint32_t xs_bmbt_compare; uint32_t xs_bmbt_insrec; uint32_t xs_bmbt_delrec; -# define XFSSTAT_END_DIRECTORY_OPS (XFSSTAT_END_BLOCK_MAP_BTREE+4) uint32_t xs_dir_lookup; uint32_t xs_dir_create; uint32_t xs_dir_remove; uint32_t xs_dir_getdents; -# define XFSSTAT_END_TRANSACTIONS (XFSSTAT_END_DIRECTORY_OPS+3) uint32_t xs_trans_sync; uint32_t xs_trans_async; uint32_t xs_trans_empty; -# define XFSSTAT_END_INODE_OPS (XFSSTAT_END_TRANSACTIONS+7) uint32_t xs_ig_attempts; uint32_t xs_ig_found; uint32_t xs_ig_frecycle; @@ -81,13 +74,11 @@ struct __xfsstats { uint32_t xs_ig_dup; uint32_t xs_ig_reclaims; uint32_t xs_ig_attrchg; -# define XFSSTAT_END_LOG_OPS (XFSSTAT_END_INODE_OPS+5) uint32_t xs_log_writes; uint32_t xs_log_blocks; uint32_t xs_log_noiclogs; uint32_t xs_log_force; uint32_t xs_log_force_sleep; -# define XFSSTAT_END_TAIL_PUSHING (XFSSTAT_END_LOG_OPS+10) uint32_t xs_try_logspace; uint32_t xs_sleep_logspace; uint32_t xs_push_ail; @@ -98,22 +89,17 @@ struct __xfsstats { uint32_t xs_push_ail_flushing; uint32_t xs_push_ail_restarts; uint32_t xs_push_ail_flush; -# define XFSSTAT_END_WRITE_CONVERT (XFSSTAT_END_TAIL_PUSHING+2) uint32_t xs_xstrat_quick; uint32_t xs_xstrat_split; -# define XFSSTAT_END_READ_WRITE_OPS (XFSSTAT_END_WRITE_CONVERT+2) uint32_t xs_write_calls; uint32_t xs_read_calls; -# define XFSSTAT_END_ATTRIBUTE_OPS (XFSSTAT_END_READ_WRITE_OPS+4) uint32_t xs_attr_get; uint32_t xs_attr_set; uint32_t xs_attr_remove; uint32_t xs_attr_list; -# define XFSSTAT_END_INODE_CLUSTER (XFSSTAT_END_ATTRIBUTE_OPS+3) uint32_t xs_iflush_count; uint32_t xs_icluster_flushcnt; uint32_t xs_icluster_flushinode; -# define XFSSTAT_END_VNODE_OPS (XFSSTAT_END_INODE_CLUSTER+8) uint32_t vn_active; /* # vnodes not on free lists */ uint32_t vn_alloc; /* # times vn_alloc called */ uint32_t vn_get; /* # times vn_get called */ @@ -122,7 +108,6 @@ struct __xfsstats { uint32_t vn_reclaim; /* # times vn_reclaim called */ uint32_t vn_remove; /* # times vn_remove called */ uint32_t vn_free; /* # times vn_free called */ -#define XFSSTAT_END_BUF (XFSSTAT_END_VNODE_OPS+9) uint32_t xb_get; uint32_t xb_create; uint32_t xb_get_locked; @@ -133,28 +118,19 @@ struct __xfsstats { uint32_t xb_page_found; uint32_t xb_get_read; /* Version 2 btree counters */ -#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF + __XBTS_MAX) uint32_t xs_abtb_2[__XBTS_MAX]; -#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2 + __XBTS_MAX) uint32_t xs_abtc_2[__XBTS_MAX]; -#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2 + __XBTS_MAX) uint32_t xs_bmbt_2[__XBTS_MAX]; -#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2 + __XBTS_MAX) uint32_t xs_ibt_2[__XBTS_MAX]; -#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2 + __XBTS_MAX) uint32_t xs_fibt_2[__XBTS_MAX]; -#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2 + __XBTS_MAX) uint32_t xs_rmap_2[__XBTS_MAX]; -#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + __XBTS_MAX) uint32_t xs_refcbt_2[__XBTS_MAX]; -#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_REFCOUNT + 6) uint32_t xs_qm_dqreclaims; uint32_t xs_qm_dqreclaim_misses; uint32_t xs_qm_dquot_dups; uint32_t xs_qm_dqcachemisses; uint32_t xs_qm_dqcachehits; uint32_t xs_qm_dqwants; -#define XFSSTAT_END_QM (XFSSTAT_END_XQMSTAT+2) uint32_t xs_qm_dquot; uint32_t xs_qm_dquot_unused; /* Extra precision counters */ @@ -163,10 +139,12 @@ struct __xfsstats { uint64_t xs_read_bytes; }; +#define xfsstats_offset(f) (offsetof(struct __xfsstats, f)/sizeof(uint32_t)) + struct xfsstats { union { struct __xfsstats s; - uint32_t a[XFSSTAT_END_XQMSTAT]; + uint32_t a[xfsstats_offset(xs_qm_dquot)]; }; }; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 207ee302b1bb..d3e6cd063688 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -43,6 +43,7 @@ #include <linux/dax.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/magic.h> #include <linux/mount.h> #include <linux/mempool.h> #include <linux/writeback.h> @@ -933,6 +934,32 @@ xfs_fs_alloc_inode( return NULL; } +#ifdef DEBUG +static void +xfs_check_delalloc( + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec got; + struct xfs_iext_cursor icur; + + if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got)) + return; + do { + if (isnullstartblock(got.br_startblock)) { + xfs_warn(ip->i_mount, + "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]", + ip->i_ino, + whichfork == XFS_DATA_FORK ? "data" : "cow", + got.br_startoff, got.br_blockcount); + } + } while (xfs_iext_next_extent(ifp, &icur, &got)); +} +#else +#define xfs_check_delalloc(ip, whichfork) do { } while (0) +#endif + /* * Now that the generic code is guaranteed not to be accessing * the linux inode, we can inactivate and reclaim the inode. @@ -951,7 +978,12 @@ xfs_fs_destroy_inode( xfs_inactive(ip); - ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); + if (!XFS_FORCED_SHUTDOWN(ip->i_mount) && ip->i_delayed_blks) { + xfs_check_delalloc(ip, XFS_DATA_FORK); + xfs_check_delalloc(ip, XFS_COW_FORK); + ASSERT(0); + } + XFS_STATS_INC(ip->i_mount, vn_reclaim); /* @@ -1097,7 +1129,7 @@ xfs_fs_statfs( xfs_extlen_t lsize; int64_t ffree; - statp->f_type = XFS_SB_MAGIC; + statp->f_type = XFS_SUPER_MAGIC; statp->f_namelen = MAXNAMELEN - 1; id = huge_encode_dev(mp->m_ddev_targp->bt_dev); @@ -1650,7 +1682,7 @@ xfs_fs_fill_super( * we must configure the block size in the superblock before we run the * full mount process as the mount process can lookup and cache inodes. */ - sb->s_magic = XFS_SB_MAGIC; + sb->s_magic = XFS_SUPER_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ad315e83bc02..3043e5ed6495 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -473,7 +473,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index bedc5a5133a5..912b42f5fe4a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -259,6 +259,14 @@ xfs_trans_alloc( struct xfs_trans *tp; int error; + /* + * Allocate the handle before we do our freeze accounting and setting up + * GFP_NOFS allocation context so that we avoid lockdep false positives + * by doing GFP_KERNEL allocations inside sb_start_intwrite(). + */ + tp = kmem_zone_zalloc(xfs_trans_zone, + (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP); + if (!(flags & XFS_TRANS_NO_WRITECOUNT)) sb_start_intwrite(mp->m_super); @@ -270,8 +278,6 @@ xfs_trans_alloc( mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); atomic_inc(&mp->m_active_trans); - tp = kmem_zone_zalloc(xfs_trans_zone, - (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP); tp->t_magic = XFS_TRANS_HEADER_MAGIC; tp->t_flags = flags; tp->t_mountp = mp; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index c3d278e96ad1..a0c5dbda18aa 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -220,6 +220,7 @@ void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint, uint); void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *); +bool xfs_trans_buf_is_dirty(struct xfs_buf *bp); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); void xfs_extent_free_init_defer_op(void); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 55326f971cb3..d3a4e89bf4a0 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -531,17 +531,33 @@ xfsaild( set_current_state(TASK_INTERRUPTIBLE); /* - * Check kthread_should_stop() after we set the task state - * to guarantee that we either see the stop bit and exit or - * the task state is reset to runnable such that it's not - * scheduled out indefinitely and detects the stop bit at - * next iteration. - * + * Check kthread_should_stop() after we set the task state to + * guarantee that we either see the stop bit and exit or the + * task state is reset to runnable such that it's not scheduled + * out indefinitely and detects the stop bit at next iteration. * A memory barrier is included in above task state set to * serialize again kthread_stop(). */ if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); + + /* + * The caller forces out the AIL before stopping the + * thread in the common case, which means the delwri + * queue is drained. In the shutdown case, the queue may + * still hold relogged buffers that haven't been + * submitted because they were pinned since added to the + * queue. + * + * Log I/O error processing stales the underlying buffer + * and clears the delwri state, expecting the buf to be + * removed on the next submission attempt. That won't + * happen if we're shutting down, so this is the last + * opportunity to release such buffers from the queue. + */ + ASSERT(list_empty(&ailp->ail_buf_list) || + XFS_FORCED_SHUTDOWN(ailp->ail_mount)); + xfs_buf_delwri_cancel(&ailp->ail_buf_list); break; } diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 15919f67a88f..629f1479c9d2 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -264,11 +264,39 @@ xfs_trans_read_buf_map( return -EIO; } + /* + * Check if the caller is trying to read a buffer that is + * already attached to the transaction yet has no buffer ops + * assigned. Ops are usually attached when the buffer is + * attached to the transaction, or by the read caller if + * special circumstances. That didn't happen, which is not + * how this is supposed to go. + * + * If the buffer passes verification we'll let this go, but if + * not we have to shut down. Let the transaction cleanup code + * release this buffer when it kills the tranaction. + */ + ASSERT(bp->b_ops != NULL); + error = xfs_buf_ensure_ops(bp, ops); + if (error) { + xfs_buf_ioerror_alert(bp, __func__); + + if (tp->t_flags & XFS_TRANS_DIRTY) + xfs_force_shutdown(tp->t_mountp, + SHUTDOWN_META_IO_ERROR); + + /* bad CRC means corrupted metadata */ + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; + return error; + } + bip = bp->b_log_item; bip->bli_recur++; ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_trans_read_buf_recur(bip); + ASSERT(bp->b_ops != NULL || ops == NULL); *bpp = bp; return 0; } @@ -316,55 +344,58 @@ xfs_trans_read_buf_map( _xfs_trans_bjoin(tp, bp, 1); trace_xfs_trans_read_buf(bp->b_log_item); } + ASSERT(bp->b_ops != NULL || ops == NULL); *bpp = bp; return 0; } +/* Has this buffer been dirtied by anyone? */ +bool +xfs_trans_buf_is_dirty( + struct xfs_buf *bp) +{ + struct xfs_buf_log_item *bip = bp->b_log_item; + + if (!bip) + return false; + ASSERT(bip->bli_item.li_type == XFS_LI_BUF); + return test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags); +} + /* - * Release the buffer bp which was previously acquired with one of the - * xfs_trans_... buffer allocation routines if the buffer has not - * been modified within this transaction. If the buffer is modified - * within this transaction, do decrement the recursion count but do - * not release the buffer even if the count goes to 0. If the buffer is not - * modified within the transaction, decrement the recursion count and - * release the buffer if the recursion count goes to 0. + * Release a buffer previously joined to the transaction. If the buffer is + * modified within this transaction, decrement the recursion count but do not + * release the buffer even if the count goes to 0. If the buffer is not modified + * within the transaction, decrement the recursion count and release the buffer + * if the recursion count goes to 0. * - * If the buffer is to be released and it was not modified before - * this transaction began, then free the buf_log_item associated with it. + * If the buffer is to be released and it was not already dirty before this + * transaction began, then also free the buf_log_item associated with it. * - * If the transaction pointer is NULL, make this just a normal - * brelse() call. + * If the transaction pointer is NULL, this is a normal xfs_buf_relse() call. */ void xfs_trans_brelse( - xfs_trans_t *tp, - xfs_buf_t *bp) + struct xfs_trans *tp, + struct xfs_buf *bp) { - struct xfs_buf_log_item *bip; - int freed; + struct xfs_buf_log_item *bip = bp->b_log_item; - /* - * Default to a normal brelse() call if the tp is NULL. - */ - if (tp == NULL) { - ASSERT(bp->b_transp == NULL); + ASSERT(bp->b_transp == tp); + + if (!tp) { xfs_buf_relse(bp); return; } - ASSERT(bp->b_transp == tp); - bip = bp->b_log_item; + trace_xfs_trans_brelse(bip); ASSERT(bip->bli_item.li_type == XFS_LI_BUF); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); - trace_xfs_trans_brelse(bip); - /* - * If the release is just for a recursive lock, - * then decrement the count and return. + * If the release is for a recursive lookup, then decrement the count + * and return. */ if (bip->bli_recur > 0) { bip->bli_recur--; @@ -372,64 +403,24 @@ xfs_trans_brelse( } /* - * If the buffer is dirty within this transaction, we can't + * If the buffer is invalidated or dirty in this transaction, we can't * release it until we commit. */ if (test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)) return; - - /* - * If the buffer has been invalidated, then we can't release - * it until the transaction commits to disk unless it is re-dirtied - * as part of this transaction. This prevents us from pulling - * the item from the AIL before we should. - */ if (bip->bli_flags & XFS_BLI_STALE) return; - ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - /* - * Free up the log item descriptor tracking the released item. + * Unlink the log item from the transaction and clear the hold flag, if + * set. We wouldn't want the next user of the buffer to get confused. */ + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); xfs_trans_del_item(&bip->bli_item); + bip->bli_flags &= ~XFS_BLI_HOLD; - /* - * Clear the hold flag in the buf log item if it is set. - * We wouldn't want the next user of the buffer to - * get confused. - */ - if (bip->bli_flags & XFS_BLI_HOLD) { - bip->bli_flags &= ~XFS_BLI_HOLD; - } - - /* - * Drop our reference to the buf log item. - */ - freed = atomic_dec_and_test(&bip->bli_refcount); - - /* - * If the buf item is not tracking data in the log, then we must free it - * before releasing the buffer back to the free pool. - * - * If the fs has shutdown and we dropped the last reference, it may fall - * on us to release a (possibly dirty) bli if it never made it to the - * AIL (e.g., the aborted unpin already happened and didn't release it - * due to our reference). Since we're already shutdown and need - * ail_lock, just force remove from the AIL and release the bli here. - */ - if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { - xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); - xfs_buf_item_relse(bp); - } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) { -/*** - ASSERT(bp->b_pincount == 0); -***/ - ASSERT(atomic_read(&bip->bli_refcount) == 0); - ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); - ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF)); - xfs_buf_item_relse(bp); - } + /* drop the reference to the bli */ + xfs_buf_item_put(bip); bp->b_transp = NULL; xfs_buf_relse(bp); |