Diffstat (limited to 'fs')
137 files changed, 2257 insertions, 1009 deletions
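The bulk of this series is a mechanical bcachefs iterator API change: struct btree_iter loses its cached trans pointer (removed from btree_types.h below), and every bch2_btree_iter_* helper gains an explicit struct btree_trans * argument. A minimal caller-side sketch of the new convention follows; the wrapper function is hypothetical, but the bch2_* names, flags, and signatures are the ones visible in the hunks below:

static int example_lookup(struct btree_trans *trans, enum btree_id btree,
			  struct bpos pos)
{
	struct btree_iter iter;
	bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);

	/* was bch2_btree_iter_peek_slot(&iter); iter->trans no longer exists */
	struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, &iter);
	int ret = bkey_err(k);

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

The alloc_foreground.c hunk also rewrites the stripe clock hands: each device's hand advances by the inverse of its free space (biasing round robin toward emptier devices), using a saturating add plus a periodic rescale instead of the old per-allocation decay. Below is a self-contained, simplified sketch of that scheme; the function name and constants are illustrative rather than kernel names, and the kernel version additionally enforces an upper bound on hands after rescaling:

static void stripe_advance(u64 hands[], unsigned nr, unsigned dev, u64 free_space)
{
	const u64 hand_inv = 1ULL << 52, hand_rescale = 1ULL << 62;
	u64 inv = free_space ? div64_u64(hand_inv, free_space) : hand_inv;

	/* saturating add, as in bch2_dev_stripe_increment_inlined(): */
	u64 sum = hands[dev] + inv;
	hands[dev] = sum >= hands[dev] ? sum : U64_MAX;

	if (hands[dev] > hand_rescale) {
		/* subtract as much as possible without any hand underflowing */
		u64 scale = U64_MAX;
		for (unsigned i = 0; i < nr; i++)
			if (hands[i])
				scale = min(scale, hands[i]);
		for (unsigned i = 0; i < nr; i++)
			hands[i] = hands[i] < scale ? 0 : hands[i] - scale;
	}
}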
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index cc2007be2173..5b5fda617b80 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -407,8 +407,8 @@ static struct dentry *v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, err); goto error; } - v9fs_fid_add(dentry, &fid); v9fs_set_create_acl(inode, fid, dacl, pacl); + v9fs_fid_add(dentry, &fid); d_instantiate(dentry, inode); err = 0; inc_nlink(dir); diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index 07a8bfbdd9b9..e0030ac74ea0 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -534,6 +534,6 @@ dont_wait: */ void afs_fs_probe_cleanup(struct afs_net *net) { - if (del_timer_sync(&net->fs_probe_timer)) + if (timer_delete_sync(&net->fs_probe_timer)) afs_dec_servers_outstanding(net); } diff --git a/fs/afs/server.c b/fs/afs/server.c index c530d1ca15df..8755f2703815 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -318,7 +318,7 @@ struct afs_server *afs_use_server(struct afs_server *server, bool activate, a = atomic_inc_return(&server->active); if (a == 1 && activate && !test_bit(AFS_SERVER_FL_EXPIRED, &server->flags)) - del_timer(&server->timer); + timer_delete(&server->timer); trace_afs_server(server->debug_id, r + 1, a, reason); return server; diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index c9798750202d..bf1c94e51dd0 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -26,6 +26,7 @@ config BCACHEFS_FS select SRCU select SYMBOLIC_ERRNAME select MIN_HEAP + select XARRAY_MULTI help The bcachefs filesystem - a modern, copy on write filesystem, with support for multiple devices, compression, checksumming, etc. diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 99487727ae64..d03adc36100e 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -273,7 +273,7 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct posix_acl *acl = NULL; if (rcu) @@ -344,7 +344,7 @@ int bch2_set_acl(struct mnt_idmap *idmap, { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_iter inode_iter = { NULL }; + struct btree_iter inode_iter = {}; struct bch_inode_unpacked inode_u; struct posix_acl *acl; umode_t mode; diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index c12ca7538e4f..94ea9e49aec4 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -610,7 +610,7 @@ int bch2_alloc_read(struct bch_fs *c) * bch2_check_alloc_key() which runs later: */ if (!ca) { - bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); continue; } @@ -631,17 +631,17 @@ int bch2_alloc_read(struct bch_fs *c) * bch2_check_alloc_key() which runs later: */ if (!ca) { - bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); continue; } if (k.k->p.offset < ca->mi.first_bucket) { - bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket)); + bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode, ca->mi.first_bucket)); continue; } if (k.k->p.offset >= ca->mi.nbuckets) { - bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + bch2_btree_iter_set_pos(trans, &iter, 
POS(k.k->p.inode + 1, 0)); continue; } @@ -1039,9 +1039,10 @@ invalid_bucket: * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for * extents style btrees, but works on non-extents btrees: */ -static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) +static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end, struct bkey *hole) { - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); if (bkey_err(k)) return k; @@ -1052,9 +1053,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos struct btree_iter iter2; struct bpos next; - bch2_trans_copy_iter(&iter2, iter); + bch2_trans_copy_iter(trans, &iter2, iter); - struct btree_path *path = btree_iter_path(iter->trans, iter); + struct btree_path *path = btree_iter_path(trans, iter); if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX)) end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p)); @@ -1064,9 +1065,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos * btree node min/max is a closed interval, upto takes a half * open interval: */ - k = bch2_btree_iter_peek_max(&iter2, end); + k = bch2_btree_iter_peek_max(trans, &iter2, end); next = iter2.pos; - bch2_trans_iter_exit(iter->trans, &iter2); + bch2_trans_iter_exit(trans, &iter2); BUG_ON(next.offset >= iter->pos.offset + U32_MAX); @@ -1107,13 +1108,14 @@ static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *buck return *ca != NULL; } -static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, - struct bch_dev **ca, struct bkey *hole) +static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_dev **ca, struct bkey *hole) { - struct bch_fs *c = iter->trans->c; + struct bch_fs *c = trans->c; struct bkey_s_c k; again: - k = bch2_get_key_or_hole(iter, POS_MAX, hole); + k = bch2_get_key_or_hole(trans, iter, POS_MAX, hole); if (bkey_err(k)) return k; @@ -1126,7 +1128,7 @@ again: if (!next_bucket(c, ca, &hole_start)) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, hole_start); + bch2_btree_iter_set_pos(trans, iter, hole_start); goto again; } @@ -1167,8 +1169,8 @@ int bch2_check_alloc_key(struct btree_trans *trans, a = bch2_alloc_to_v4(alloc_k, &a_convert); - bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); - k = bch2_btree_iter_peek_slot(discard_iter); + bch2_btree_iter_set_pos(trans, discard_iter, alloc_k.k->p); + k = bch2_btree_iter_peek_slot(trans, discard_iter); ret = bkey_err(k); if (ret) goto err; @@ -1181,8 +1183,8 @@ int bch2_check_alloc_key(struct btree_trans *trans, goto err; } - bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); - k = bch2_btree_iter_peek_slot(freespace_iter); + bch2_btree_iter_set_pos(trans, freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); + k = bch2_btree_iter_peek_slot(trans, freespace_iter); ret = bkey_err(k); if (ret) goto err; @@ -1195,8 +1197,8 @@ int bch2_check_alloc_key(struct btree_trans *trans, goto err; } - bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); - k = bch2_btree_iter_peek_slot(bucket_gens_iter); + bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); + k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter); ret = bkey_err(k); if (ret) goto err; @@ -1249,9 +1251,9 @@ int 
bch2_check_alloc_hole_freespace(struct btree_trans *trans, if (!ca->mi.freespace_initialized) return 0; - bch2_btree_iter_set_pos(freespace_iter, start); + bch2_btree_iter_set_pos(trans, freespace_iter, start); - k = bch2_btree_iter_peek_slot(freespace_iter); + k = bch2_btree_iter_peek_slot(trans, freespace_iter); ret = bkey_err(k); if (ret) goto err; @@ -1300,9 +1302,9 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, unsigned i, gens_offset, gens_end_offset; int ret; - bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); + bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); - k = bch2_btree_iter_peek_slot(bucket_gens_iter); + k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter); ret = bkey_err(k); if (ret) goto err; @@ -1435,7 +1437,7 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite *gen = a->gen; out: fsck_err: - bch2_set_btree_iter_dontneed(&alloc_iter); + bch2_set_btree_iter_dontneed(trans, &alloc_iter); bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; @@ -1572,7 +1574,7 @@ int bch2_check_alloc_info(struct bch_fs *c) bch2_trans_begin(trans); - k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole); + k = bch2_get_key_or_real_bucket_hole(trans, &iter, &ca, &hole); ret = bkey_err(k); if (ret) goto bkey_err; @@ -1610,7 +1612,7 @@ int bch2_check_alloc_info(struct bch_fs *c) if (ret) goto bkey_err; - bch2_btree_iter_set_pos(&iter, next); + bch2_btree_iter_set_pos(trans, &iter, next); bkey_err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -1638,7 +1640,7 @@ bkey_err: BTREE_ITER_prefetch); while (1) { bch2_trans_begin(trans); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek(trans, &iter); if (!k.k) break; @@ -1657,7 +1659,7 @@ bkey_err: break; } - bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); + bch2_btree_iter_set_pos(trans, &iter, bpos_nosnap_successor(iter.pos)); } bch2_trans_iter_exit(trans, &iter); if (ret) @@ -1685,7 +1687,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret; - alloc_k = bch2_btree_iter_peek(alloc_iter); + alloc_k = bch2_btree_iter_peek(trans, alloc_iter); if (!alloc_k.k) return 0; @@ -1826,7 +1828,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bpos pos = need_discard_iter->pos; - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct bkey_s_c k; struct bkey_i_alloc_v4 *a; struct printbuf buf = PRINTBUF; @@ -1950,7 +1952,7 @@ static void bch2_do_discards_work(struct work_struct *work) trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[WRITE]); bch2_write_ref_put(c, BCH_WRITE_REF_discard); } @@ -1967,7 +1969,7 @@ void bch2_dev_do_discards(struct bch_dev *ca) if (queue_work(c->write_ref_wq, &ca->discard_work)) return; - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[WRITE]); put_write_ref: bch2_write_ref_put(c, BCH_WRITE_REF_discard); } @@ -2045,7 +2047,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work) trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); bch2_trans_put(trans); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[WRITE]); bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } @@ -2065,7 +2067,7 @@ static void 
bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) return; - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[WRITE]); put_ref: bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } @@ -2082,6 +2084,9 @@ static int invalidate_one_bp(struct btree_trans *trans, if (ret) return ret; + if (!extent_k.k) + return 0; + struct bkey_i *n = bch2_bkey_make_mut(trans, &extent_iter, &extent_k, BTREE_UPDATE_internal_snapshot_node); @@ -2199,9 +2204,9 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter { struct bkey_s_c k; again: - k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); + k = bch2_btree_iter_peek_max(trans, iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); if (!k.k && !*wrapped) { - bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0)); + bch2_btree_iter_set_pos(trans, iter, lru_pos(ca->dev_idx, 0, 0)); *wrapped = true; goto again; } @@ -2251,12 +2256,12 @@ restart_err: if (ret) break; - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); } bch2_trans_iter_exit(trans, &iter); err: bch2_trans_put(trans); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[WRITE]); bch2_bkey_buf_exit(&last_flushed, c); bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } @@ -2274,7 +2279,7 @@ void bch2_dev_do_invalidates(struct bch_dev *ca) if (queue_work(c->write_ref_wq, &ca->invalidate_work)) return; - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[WRITE]); put_ref: bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } @@ -2321,7 +2326,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, break; } - k = bch2_get_key_or_hole(&iter, end, &hole); + k = bch2_get_key_or_hole(trans, &iter, end, &hole); ret = bkey_err(k); if (ret) goto bkey_err; @@ -2340,7 +2345,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, if (ret) goto bkey_err; - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); } else { struct bkey_i *freespace; @@ -2360,7 +2365,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, if (ret) goto bkey_err; - bch2_btree_iter_set_pos(&iter, k.k->p); + bch2_btree_iter_set_pos(trans, &iter, k.k->p); } bkey_err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -2506,7 +2511,7 @@ void bch2_recalc_capacity(struct bch_fs *c) bch2_set_ra_pages(c, ra_pages); - for_each_rw_member(c, ca) { + __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) { u64 dev_reserve = 0; /* diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index c556ccaffe89..34b3d6ac4fbb 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -321,11 +321,11 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca, { u64 want_free = ca->mi.nbuckets >> 7; u64 free = max_t(s64, 0, - u.d[BCH_DATA_free].buckets - + u.d[BCH_DATA_need_discard].buckets + u.buckets[BCH_DATA_free] + + u.buckets[BCH_DATA_need_discard] - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe)); - return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); + return clamp_t(s64, want_free - free, 0, u.buckets[BCH_DATA_cached]); } void bch2_dev_do_invalidates(struct bch_dev *); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index da0d72928b5b..7c930ef77380 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -327,7 +327,7 @@ again: bucket = sector_to_bucket(ca, 
round_up(bucket_to_sector(ca, bucket) + 1, 1ULL << ca->mi.btree_bitmap_shift)); - bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket)); + bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket)); s->buckets_seen++; s->skipped_mi_btree_bitmap++; continue; @@ -355,7 +355,7 @@ again: watermark, s, cl) : NULL; next: - bch2_set_btree_iter_dontneed(&citer); + bch2_set_btree_iter_dontneed(trans, &citer); bch2_trans_iter_exit(trans, &citer); if (ob) break; @@ -417,7 +417,7 @@ again: 1ULL << ca->mi.btree_bitmap_shift)); alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56)); - bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor)); + bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor)); s->skipped_mi_btree_bitmap++; goto next; } @@ -426,7 +426,7 @@ again: if (ob) { if (!IS_ERR(ob)) *dev_alloc_cursor = iter.pos.offset; - bch2_set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(trans, &iter); break; } @@ -469,7 +469,7 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]); prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]); prt_printf(&buf, "blocking\t%u\n", cl != NULL); - prt_printf(&buf, "free\t%llu\n", usage->d[BCH_DATA_free].buckets); + prt_printf(&buf, "free\t%llu\n", usage->buckets[BCH_DATA_free]); prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark)); prt_printf(&buf, "copygc_wait\t%lu/%lli\n", bch2_copygc_wait_amount(c), @@ -524,10 +524,10 @@ again: bch2_dev_usage_read_fast(ca, usage); avail = dev_buckets_free(ca, *usage, watermark); - if (usage->d[BCH_DATA_need_discard].buckets > avail) + if (usage->buckets[BCH_DATA_need_discard] > avail) bch2_dev_do_discards(ca); - if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) + if (usage->buckets[BCH_DATA_need_gc_gens] > avail) bch2_gc_gens_async(c); if (should_invalidate_buckets(ca, *usage)) @@ -606,8 +606,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, static int __dev_stripe_cmp(struct dev_stripe_state *stripe, unsigned l, unsigned r) { - return ((stripe->next_alloc[l] > stripe->next_alloc[r]) - - (stripe->next_alloc[l] < stripe->next_alloc[r])); + return cmp_int(stripe->next_alloc[l], stripe->next_alloc[r]); } #define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) @@ -626,25 +625,62 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, return ret; } +static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */ +static const u64 stripe_clock_hand_max = 1ULL << 56; /* max after rescale */ +static const u64 stripe_clock_hand_inv = 1ULL << 52; /* max increment, if a device is empty */ + +static noinline void bch2_stripe_state_rescale(struct dev_stripe_state *stripe) +{ + /* + * Avoid underflowing clock hands if at all possible, if clock hands go + * to 0 then we lose information - clock hands can be in a wide range if + * we have devices we rarely try to allocate from, if we generally + * allocate from a specified target but only sometimes have to fall back + * to the whole filesystem. 
+ */ + u64 scale_max = U64_MAX; /* maximum we can subtract without underflow */ + u64 scale_min = 0; /* minimum we must subtract to avoid overflow */ + + for (u64 *v = stripe->next_alloc; + v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) { + if (*v) + scale_max = min(scale_max, *v); + if (*v > stripe_clock_hand_max) + scale_min = max(scale_min, *v - stripe_clock_hand_max); + } + + u64 scale = max(scale_min, scale_max); + + for (u64 *v = stripe->next_alloc; + v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) + *v = *v < scale ? 0 : *v - scale; +} + static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca, struct dev_stripe_state *stripe, struct bch_dev_usage *usage) { + /* + * Stripe state has a per device clock hand: we allocate from the device + * with the smallest clock hand. + * + * When we allocate, we don't do a simple increment; we add the inverse + * of the device's free space. This results in round robin behavior that + * biases in favor of the device(s) with more free space. + */ + u64 *v = stripe->next_alloc + ca->dev_idx; u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal); u64 free_space_inv = free_space - ? div64_u64(1ULL << 48, free_space) - : 1ULL << 48; - u64 scale = *v / 4; + ? div64_u64(stripe_clock_hand_inv, free_space) + : stripe_clock_hand_inv; - if (*v + free_space_inv >= *v) - *v += free_space_inv; - else - *v = U64_MAX; + /* Saturating add, avoid overflow: */ + u64 sum = *v + free_space_inv; + *v = sum >= *v ? sum : U64_MAX; - for (v = stripe->next_alloc; - v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) - *v = *v < scale ? 0 : *v - scale; + if (unlikely(*v > stripe_clock_hand_rescale)) + bch2_stripe_state_rescale(stripe); } void bch2_dev_stripe_increment(struct bch_dev *ca, @@ -1633,7 +1669,7 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) { struct bch_fs *c = ca->fs; - struct bch_dev_usage stats = bch2_dev_usage_read(ca); + struct bch_dev_usage_full stats = bch2_dev_usage_full_read(ca); unsigned nr[BCH_DATA_NR]; memset(nr, 0, sizeof(nr)); @@ -1656,7 +1692,8 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) printbuf_tabstop_push(out, 16); prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets); - prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats)); + prt_printf(out, "buckets to invalidate\t%llu\r\n", + should_invalidate_buckets(ca, bch2_dev_usage_read(ca))); } static noinline void bch2_print_allocator_stuck(struct bch_fs *c) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 21d1d86d5008..ff26bb515150 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -252,12 +252,24 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, 0, bp.v->level, iter_flags); - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); if (bkey_err(k)) { bch2_trans_iter_exit(trans, iter); return k; } + /* + * peek_slot() doesn't normally return NULL - except when we ask for a + * key at a btree level that doesn't exist.
+ * + * We may want to revisit this and change peek_slot(): + */ + if (!k.k) { + bkey_init(&iter->k); + iter->k.p = bp.v->pos; + k.k = &iter->k; + } + if (k.k && extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) return k; @@ -293,7 +305,7 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, 0, bp.v->level - 1, 0); - struct btree *b = bch2_btree_iter_peek_node(iter); + struct btree *b = bch2_btree_iter_peek_node(trans, iter); if (IS_ERR_OR_NULL(b)) goto err; @@ -321,7 +333,7 @@ static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, st return 0; struct bch_fs *c = trans->c; - struct btree_iter alloc_iter = { NULL }; + struct btree_iter alloc_iter = {}; struct bkey_s_c alloc_k; struct printbuf buf = PRINTBUF; int ret = 0; @@ -462,7 +474,7 @@ err: if (bio) bio_put(bio); kvfree(data_buf); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); printbuf_exit(&buf); return ret; } @@ -650,7 +662,7 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, retry: bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0); - b = bch2_btree_iter_peek_node(&iter); + b = bch2_btree_iter_peek_node(trans, &iter); ret = PTR_ERR_OR_ZERO(b); if (ret) goto err; @@ -934,7 +946,7 @@ static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k, { struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0); - struct btree *b = bch2_btree_iter_peek_node(&iter); + struct btree *b = bch2_btree_iter_peek_node(trans, &iter); int ret = PTR_ERR_OR_ZERO(b); if (ret) goto err; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index f52311017aee..5d9f208a1bb7 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -524,8 +524,8 @@ struct bch_dev { struct percpu_ref ref; #endif struct completion ref_completion; - struct percpu_ref io_ref; - struct completion io_ref_completion; + struct percpu_ref io_ref[2]; + struct completion io_ref_completion[2]; struct bch_fs *fs; @@ -562,7 +562,8 @@ struct bch_dev { unsigned long *bucket_backpointer_mismatches; unsigned long *bucket_backpointer_empty; - struct bch_dev_usage __percpu *usage; + struct bch_dev_usage_full __percpu + *usage; /* Allocator: */ u64 alloc_cursor[3]; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 2025d408979c..7b98ba2dec64 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -691,7 +691,7 @@ retry_root: struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, bch2_btree_id_root(c, btree)->b->c.level, 0); - struct btree *b = bch2_btree_iter_peek_node(&iter); + struct btree *b = bch2_btree_iter_peek_node(trans, &iter); ret = PTR_ERR_OR_ZERO(b); if (ret) goto err_root; @@ -1199,7 +1199,7 @@ int bch2_gc_gens(struct bch_fs *c) BCH_TRANS_COMMIT_no_enospc, ({ ca = bch2_dev_iterate(c, ca, k.k->p.inode); if (!ca) { - bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); continue; } bch2_alloc_write_oldest_gen(trans, ca, &iter, k); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 1d94a2bf706d..5fd4a58d2ad2 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1353,7 +1353,7 @@ start: "btree read error %s for %s", bch2_blk_status_to_str(bio->bi_status), buf.buf); if (rb->have_ioref) - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); rb->have_ioref = false; bch2_mark_io_failure(&failed, &rb->pick, 
false); @@ -1609,6 +1609,7 @@ static void btree_node_read_all_replicas_endio(struct bio *bio) struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); bch2_latency_acct(ca, rb->start_time, READ); + percpu_ref_put(&ca->io_ref[READ]); } ra->err[rb->idx] = bio->bi_status; @@ -1908,7 +1909,8 @@ static void btree_node_scrub_work(struct work_struct *work) scrub->key.k->k.p, 0, scrub->level - 1, 0); struct btree *b; - int ret = lockrestart_do(trans, PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(&iter))); + int ret = lockrestart_do(trans, + PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(trans, &iter))); if (ret) goto err; @@ -1927,7 +1929,7 @@ err: printbuf_exit(&err); bch2_bkey_buf_exit(&scrub->key, c);; btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf); - percpu_ref_put(&scrub->ca->io_ref); + percpu_ref_put(&scrub->ca->io_ref[READ]); kfree(scrub); bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); } @@ -1996,7 +1998,7 @@ int bch2_btree_node_scrub(struct btree_trans *trans, return 0; err_free: btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); err: bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); return ret; @@ -2144,6 +2146,7 @@ static void btree_node_write_endio(struct bio *bio) if (ca && bio->bi_status) { struct printbuf buf = PRINTBUF; + buf.atomic++; prt_printf(&buf, "btree write error: %s\n ", bch2_blk_status_to_str(bio->bi_status)); bch2_btree_pos_to_text(&buf, c, b); @@ -2158,8 +2161,12 @@ static void btree_node_write_endio(struct bio *bio) spin_unlock_irqrestore(&c->btree_write_error_lock, flags); } + /* + * XXX: we should be using io_ref[WRITE], but we aren't retrying failed + * btree writes yet (due to device removal/ro): + */ if (wbio->have_ioref) - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); if (parent) { bio_put(bio); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index a9c110b846b5..e34e9598ef25 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -244,10 +244,8 @@ void bch2_trans_verify_paths(struct btree_trans *trans) bch2_btree_path_verify(trans, path); } -static void bch2_btree_iter_verify(struct btree_iter *iter) +static void bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter) { - struct btree_trans *trans = iter->trans; - BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached); BUG_ON((iter->flags & BTREE_ITER_is_extents) && @@ -276,9 +274,9 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) bkey_gt(iter->pos, iter->k.p))); } -static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) +static int bch2_btree_iter_verify_ret(struct btree_trans *trans, + struct btree_iter *iter, struct bkey_s_c k) { - struct btree_trans *trans = iter->trans; struct btree_iter copy; struct bkey_s_c prev; int ret = 0; @@ -299,7 +297,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, BTREE_ITER_nopreserve| BTREE_ITER_all_snapshots); - prev = bch2_btree_iter_prev(©); + prev = bch2_btree_iter_prev(trans, ©); if (!prev.k) goto out; @@ -365,9 +363,11 @@ static inline void bch2_btree_path_verify_level(struct btree_trans *trans, struct btree_path *path, unsigned l) {} static inline void bch2_btree_path_verify(struct btree_trans *trans, struct btree_path *path) {} -static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} 
+static inline void bch2_btree_iter_verify(struct btree_trans *trans, + struct btree_iter *iter) {} static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} -static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; } +static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) { return 0; } #endif @@ -1855,10 +1855,8 @@ hole: return (struct bkey_s_c) { u, NULL }; } -void bch2_set_btree_iter_dontneed(struct btree_iter *iter) +void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter) { - struct btree_trans *trans = iter->trans; - if (!iter->path || trans->restarted) return; @@ -1870,17 +1868,14 @@ void bch2_set_btree_iter_dontneed(struct btree_iter *iter) /* Btree iterators: */ int __must_check -__bch2_btree_iter_traverse(struct btree_iter *iter) +__bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter) { - return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); + return bch2_btree_path_traverse(trans, iter->path, iter->flags); } int __must_check -bch2_btree_iter_traverse(struct btree_iter *iter) +bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter) { - struct btree_trans *trans = iter->trans; - int ret; - bch2_trans_verify_not_unlocked_or_in_restart(trans); iter->path = bch2_btree_path_set_pos(trans, iter->path, @@ -1888,7 +1883,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); - ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); + int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) return ret; @@ -1900,14 +1895,14 @@ bch2_btree_iter_traverse(struct btree_iter *iter) /* Iterate across nodes (leaf and interior nodes) */ -struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) +struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans, + struct btree_iter *iter) { - struct btree_trans *trans = iter->trans; struct btree *b = NULL; int ret; EBUG_ON(trans->paths[iter->path].cached); - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) @@ -1929,7 +1924,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); out: bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); return b; err: @@ -1938,26 +1933,26 @@ err: } /* Only kept for -tools */ -struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) +struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *trans, + struct btree_iter *iter) { struct btree *b; - while (b = bch2_btree_iter_peek_node(iter), + while (b = bch2_btree_iter_peek_node(trans, iter), bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) - bch2_trans_begin(iter->trans); + bch2_trans_begin(trans); return b; } -struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) +struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_iter *iter) { - struct btree_trans *trans = iter->trans; struct btree *b = NULL; int ret; EBUG_ON(trans->paths[iter->path].cached); bch2_trans_verify_not_unlocked_or_in_restart(trans); - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); 
if (ret) @@ -2024,7 +2019,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) EBUG_ON(btree_iter_path(trans, iter)->uptodate); out: bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); return b; err: @@ -2034,7 +2029,7 @@ err: /* Iterate across keys (in leaf nodes only) */ -inline bool bch2_btree_iter_advance(struct btree_iter *iter) +inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter *iter) { struct bpos pos = iter->k.p; bool ret = !(iter->flags & BTREE_ITER_all_snapshots @@ -2043,11 +2038,11 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) if (ret && !(iter->flags & BTREE_ITER_is_extents)) pos = bkey_successor(iter, pos); - bch2_btree_iter_set_pos(iter, pos); + bch2_btree_iter_set_pos(trans, iter, pos); return ret; } -inline bool bch2_btree_iter_rewind(struct btree_iter *iter) +inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter *iter) { struct bpos pos = bkey_start_pos(&iter->k); bool ret = !(iter->flags & BTREE_ITER_all_snapshots @@ -2056,7 +2051,7 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) if (ret && !(iter->flags & BTREE_ITER_is_extents)) pos = bkey_predecessor(iter, pos); - bch2_btree_iter_set_pos(iter, pos); + bch2_btree_iter_set_pos(trans, iter, pos); return ret; } @@ -2183,9 +2178,9 @@ void btree_trans_peek_prev_journal(struct btree_trans *trans, * bkey_s_c_null: */ static noinline -struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) +struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btree_iter *iter, + struct bpos pos) { - struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; struct bkey u; struct bkey_s_c k; @@ -2231,14 +2226,14 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos return k; } -static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) +static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_key) { - struct btree_trans *trans = iter->trans; struct bkey_s_c k, k2; int ret; EBUG_ON(btree_iter_path(trans, iter)->cached); - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); while (1) { iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, @@ -2248,7 +2243,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { /* ensure that iter->k is consistent with iter->pos: */ - bch2_btree_iter_set_pos(iter, iter->pos); + bch2_btree_iter_set_pos(trans, iter, iter->pos); k = bkey_s_c_err(ret); break; } @@ -2258,7 +2253,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp if (unlikely(!l->b)) { /* No btree nodes at requested level: */ - bch2_btree_iter_set_pos(iter, SPOS_MAX); + bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); k = bkey_s_c_null; break; } @@ -2269,10 +2264,10 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && k.k && - (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { + (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) { k = k2; if (bkey_err(k)) { - bch2_btree_iter_set_pos(iter, iter->pos); + bch2_btree_iter_set_pos(trans, iter, iter->pos); break; } } @@ -2305,27 +2300,28 @@ static struct bkey_s_c 
__bch2_btree_iter_peek(struct btree_iter *iter, struct bp search_key = bpos_successor(l->b->key.k.p); } else { /* End of btree: */ - bch2_btree_iter_set_pos(iter, SPOS_MAX); + bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); k = bkey_s_c_null; break; } } - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); return k; } /** * bch2_btree_iter_peek_max() - returns first key greater than or equal to * iterator's current position + * @trans: btree transaction object * @iter: iterator to peek from * @end: search limit: returns keys less than or equal to @end * * Returns: key if found, or an error extractable with bkey_err(). */ -struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end) +struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end) { - struct btree_trans *trans = iter->trans; struct bpos search_key = btree_iter_search_key(iter); struct bkey_s_c k; struct bpos iter_pos = iter->pos; @@ -2348,7 +2344,7 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en } while (1) { - k = __bch2_btree_iter_peek(iter, search_key); + k = __bch2_btree_iter_peek(trans, iter, search_key); if (unlikely(!k.k)) goto end; if (unlikely(bkey_err(k))) @@ -2462,9 +2458,9 @@ out_no_locked: if (!(iter->flags & BTREE_ITER_all_snapshots)) iter->pos.snapshot = iter->snapshot; - ret = bch2_btree_iter_verify_ret(iter, k); + ret = bch2_btree_iter_verify_ret(trans, iter, k); if (unlikely(ret)) { - bch2_btree_iter_set_pos(iter, iter->pos); + bch2_btree_iter_set_pos(trans, iter, iter->pos); k = bkey_s_c_err(ret); } @@ -2472,7 +2468,7 @@ out_no_locked: return k; end: - bch2_btree_iter_set_pos(iter, end); + bch2_btree_iter_set_pos(trans, iter, end); k = bkey_s_c_null; goto out_no_locked; } @@ -2480,24 +2476,25 @@ end: /** * bch2_btree_iter_next() - returns first key greater than iterator's current * position + * @trans: btree transaction object * @iter: iterator to peek from * * Returns: key if found, or an error extractable with bkey_err(). 
*/ -struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_next(struct btree_trans *trans, struct btree_iter *iter) { - if (!bch2_btree_iter_advance(iter)) + if (!bch2_btree_iter_advance(trans, iter)) return bkey_s_c_null; - return bch2_btree_iter_peek(iter); + return bch2_btree_iter_peek(trans, iter); } -static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key) +static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_key) { - struct btree_trans *trans = iter->trans; struct bkey_s_c k, k2; - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); while (1) { iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, @@ -2507,7 +2504,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { /* ensure that iter->k is consistent with iter->pos: */ - bch2_btree_iter_set_pos(iter, iter->pos); + bch2_btree_iter_set_pos(trans, iter, iter->pos); k = bkey_s_c_err(ret); break; } @@ -2517,7 +2514,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru if (unlikely(!l->b)) { /* No btree nodes at requested level: */ - bch2_btree_iter_set_pos(iter, SPOS_MAX); + bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); k = bkey_s_c_null; break; } @@ -2533,10 +2530,10 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && k.k && - (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { + (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) { k = k2; if (bkey_err(k2)) { - bch2_btree_iter_set_pos(iter, iter->pos); + bch2_btree_iter_set_pos(trans, iter, iter->pos); break; } } @@ -2557,25 +2554,27 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru search_key = bpos_predecessor(path->l[0].b->data->min_key); } else { /* Start of btree: */ - bch2_btree_iter_set_pos(iter, POS_MIN); + bch2_btree_iter_set_pos(trans, iter, POS_MIN); k = bkey_s_c_null; break; } } - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); return k; } /** * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to * iterator's current position + * @trans: btree transaction object * @iter: iterator to peek from * @end: search limit: returns keys greater than or equal to @end * * Returns: key if found, or an error extractable with bkey_err(). 
*/ -struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end) +struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end) { if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) && !bkey_eq(iter->pos, POS_MAX)) { @@ -2587,7 +2586,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp * real visible extents - easiest to just use peek_slot() (which * internally uses peek() for extents) */ - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); if (bkey_err(k)) return k; @@ -2597,7 +2596,6 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp return k; } - struct btree_trans *trans = iter->trans; struct bpos search_key = iter->pos; struct bkey_s_c k; btree_path_idx_t saved_path = 0; @@ -2613,7 +2611,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp } while (1) { - k = __bch2_btree_iter_peek_prev(iter, search_key); + k = __bch2_btree_iter_peek_prev(trans, iter, search_key); if (unlikely(!k.k)) goto end; if (unlikely(bkey_err(k))) @@ -2704,10 +2702,10 @@ out_no_locked: bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent); bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); return k; end: - bch2_btree_iter_set_pos(iter, end); + bch2_btree_iter_set_pos(trans, iter, end); k = bkey_s_c_null; goto out_no_locked; } @@ -2715,27 +2713,27 @@ end: /** * bch2_btree_iter_prev() - returns first key less than iterator's current * position + * @trans: btree transaction object * @iter: iterator to peek from * * Returns: key if found, or an error extractable with bkey_err(). 
*/ -struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *trans, struct btree_iter *iter) { - if (!bch2_btree_iter_rewind(iter)) + if (!bch2_btree_iter_rewind(trans, iter)) return bkey_s_c_null; - return bch2_btree_iter_peek_prev(iter); + return bch2_btree_iter_peek_prev(trans, iter); } -struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter) { - struct btree_trans *trans = iter->trans; struct bpos search_key; struct bkey_s_c k; int ret; bch2_trans_verify_not_unlocked_or_in_restart(trans); - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); bch2_btree_iter_verify_entry_exit(iter); EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); @@ -2751,7 +2749,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (iter->pos.inode == KEY_INODE_MAX) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); + bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos)); } search_key = btree_iter_search_key(iter); @@ -2785,7 +2783,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) goto out; if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && - (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { + (k = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) { if (!bkey_err(k)) iter->k = *k.k; /* We're not returning a key from iter->path: */ @@ -2812,8 +2810,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (iter->flags & BTREE_ITER_intent) { struct btree_iter iter2; - bch2_trans_copy_iter(&iter2, iter); - k = bch2_btree_iter_peek_max(&iter2, end); + bch2_trans_copy_iter(trans, &iter2, iter); + k = bch2_btree_iter_peek_max(trans, &iter2, end); if (k.k && !bkey_err(k)) { swap(iter->key_cache_path, iter2.key_cache_path); @@ -2824,9 +2822,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } else { struct bpos pos = iter->pos; - k = bch2_btree_iter_peek_max(iter, end); + k = bch2_btree_iter_peek_max(trans, iter, end); if (unlikely(bkey_err(k))) - bch2_btree_iter_set_pos(iter, pos); + bch2_btree_iter_set_pos(trans, iter, pos); else iter->pos = pos; } @@ -2857,39 +2855,39 @@ out: btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); out_no_locked: bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(iter); - ret = bch2_btree_iter_verify_ret(iter, k); + bch2_btree_iter_verify(trans, iter); + ret = bch2_btree_iter_verify_ret(trans, iter, k); if (unlikely(ret)) return bkey_s_c_err(ret); return k; } -struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *trans, struct btree_iter *iter) { - if (!bch2_btree_iter_advance(iter)) + if (!bch2_btree_iter_advance(trans, iter)) return bkey_s_c_null; - return bch2_btree_iter_peek_slot(iter); + return bch2_btree_iter_peek_slot(trans, iter); } -struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *trans, struct btree_iter *iter) { - if (!bch2_btree_iter_rewind(iter)) + if (!bch2_btree_iter_rewind(trans, iter)) return bkey_s_c_null; - return bch2_btree_iter_peek_slot(iter); + return bch2_btree_iter_peek_slot(trans, iter); } /* Obsolete, but still used by rust wrapper in -tools */ -struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct 
btree_iter *iter) +struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *trans, struct btree_iter *iter) { struct bkey_s_c k; - while (btree_trans_too_many_iters(iter->trans) || - (k = bch2_btree_iter_peek_type(iter, iter->flags), + while (btree_trans_too_many_iters(trans) || + (k = bch2_btree_iter_peek_type(trans, iter, iter->flags), bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) - bch2_trans_begin(iter->trans); + bch2_trans_begin(trans); return k; } @@ -3035,7 +3033,6 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) iter->path = 0; iter->update_path = 0; iter->key_cache_path = 0; - iter->trans = NULL; } void bch2_trans_iter_init_outlined(struct btree_trans *trans, @@ -3075,10 +3072,9 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, BUG_ON(iter->min_depth != depth); } -void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) +void bch2_trans_copy_iter(struct btree_trans *trans, + struct btree_iter *dst, struct btree_iter *src) { - struct btree_trans *trans = src->trans; - *dst = *src; #ifdef TRACK_PATH_ALLOCATED dst->ip_allocated = _RET_IP_; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index e6f51a3b8187..9d2cccf5d21a 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -393,36 +393,37 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct void bch2_trans_node_drop(struct btree_trans *trans, struct btree *); void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); -int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); -int __must_check bch2_btree_iter_traverse(struct btree_iter *); +int __must_check __bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *); +int __must_check bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *); -struct btree *bch2_btree_iter_peek_node(struct btree_iter *); -struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *); -struct btree *bch2_btree_iter_next_node(struct btree_iter *); +struct btree *bch2_btree_iter_peek_node(struct btree_trans *, struct btree_iter *); +struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *, struct btree_iter *); +struct btree *bch2_btree_iter_next_node(struct btree_trans *, struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos); -struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *, struct btree_iter *, struct bpos); +struct bkey_s_c bch2_btree_iter_next(struct btree_trans *, struct btree_iter *); -static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_trans *trans, + struct btree_iter *iter) { - return bch2_btree_iter_peek_max(iter, SPOS_MAX); + return bch2_btree_iter_peek_max(trans, iter, SPOS_MAX); } -struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos); +struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *, struct btree_iter *, struct bpos); -static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) +static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter) { - return bch2_btree_iter_peek_prev_min(iter, POS_MIN); + return bch2_btree_iter_peek_prev_min(trans, iter, POS_MIN); } -struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); +struct bkey_s_c 
bch2_btree_iter_prev(struct btree_trans *, struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *, struct btree_iter *); +struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *, struct btree_iter *); +struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *, struct btree_iter *); -bool bch2_btree_iter_advance(struct btree_iter *); -bool bch2_btree_iter_rewind(struct btree_iter *); +bool bch2_btree_iter_advance(struct btree_trans *, struct btree_iter *); +bool bch2_btree_iter_rewind(struct btree_trans *, struct btree_iter *); static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { @@ -433,10 +434,9 @@ static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpo iter->k.size = 0; } -static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +static inline void bch2_btree_iter_set_pos(struct btree_trans *trans, + struct btree_iter *iter, struct bpos new_pos) { - struct btree_trans *trans = iter->trans; - if (unlikely(iter->update_path)) bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent); @@ -454,13 +454,14 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it iter->pos = bkey_start_pos(&iter->k); } -static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot) +static inline void bch2_btree_iter_set_snapshot(struct btree_trans *trans, + struct btree_iter *iter, u32 snapshot) { struct bpos pos = iter->pos; iter->snapshot = snapshot; pos.snapshot = snapshot; - bch2_btree_iter_set_pos(iter, pos); + bch2_btree_iter_set_pos(trans, iter, pos); } void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); @@ -502,7 +503,6 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans, unsigned flags, unsigned long ip) { - iter->trans = trans; iter->update_path = 0; iter->key_cache_path = 0; iter->btree_id = btree_id; @@ -539,9 +539,9 @@ static inline void bch2_trans_iter_init(struct btree_trans *trans, void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, enum btree_id, struct bpos, unsigned, unsigned, unsigned); -void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); +void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btree_iter *); -void bch2_set_btree_iter_dontneed(struct btree_iter *); +void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *); void *__bch2_trans_kmalloc(struct btree_trans *, size_t); @@ -588,7 +588,7 @@ static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans, struct bkey_s_c k; bch2_trans_iter_init(trans, iter, btree_id, pos, flags); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(trans, iter); if (!bkey_err(k) && type && k.k->type != type) k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch); @@ -658,14 +658,14 @@ u32 bch2_trans_begin(struct btree_trans *); int _ret3 = 0; \ do { \ _ret3 = lockrestart_do((_trans), ({ \ - struct btree *_b = bch2_btree_iter_peek_node(&_iter); \ + struct btree *_b = bch2_btree_iter_peek_node(_trans, &_iter);\ if (!_b) \ break; \ \ PTR_ERR_OR_ZERO(_b) ?: (_do); \ })) ?: \ lockrestart_do((_trans), \ - PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(&_iter))); \ + PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(_trans, 
&_iter)));\ } while (!_ret3); \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ @@ -677,31 +677,34 @@ u32 bch2_trans_begin(struct btree_trans *); __for_each_btree_node(_trans, _iter, _btree_id, _start, \ 0, 0, _flags, _b, _do) -static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, +static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) : - bch2_btree_iter_peek_prev(iter); + return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) : + bch2_btree_iter_peek_prev(trans, iter); } -static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, +static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) : - bch2_btree_iter_peek(iter); + return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) : + bch2_btree_iter_peek(trans, iter); } -static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_iter *iter, - struct bpos end, - unsigned flags) +static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end, + unsigned flags) { if (!(flags & BTREE_ITER_slots)) - return bch2_btree_iter_peek_max(iter, end); + return bch2_btree_iter_peek_max(trans, iter, end); if (bkey_gt(iter->pos, end)) return bkey_s_c_null; - return bch2_btree_iter_peek_slot(iter); + return bch2_btree_iter_peek_slot(trans, iter); } int __bch2_btree_trans_too_many_iters(struct btree_trans *); @@ -768,14 +771,14 @@ transaction_restart: \ \ do { \ _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_max_type(&(_iter), \ + (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), \ _end, (_flags)); \ if (!(_k).k) \ break; \ \ bkey_err(_k) ?: (_do); \ })); \ - } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \ + } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ _ret3; \ @@ -813,14 +816,14 @@ transaction_restart: \ \ do { \ _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_prev_type(&(_iter), \ + (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), \ (_flags)); \ if (!(_k).k) \ break; \ \ bkey_err(_k) ?: (_do); \ })); \ - } while (!_ret3 && bch2_btree_iter_rewind(&(_iter))); \ + } while (!_ret3 && bch2_btree_iter_rewind(_trans, &(_iter))); \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ _ret3; \ @@ -850,37 +853,38 @@ transaction_restart: \ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) -struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *, + struct btree_iter *); #define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \ _start, _end, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags),\ + (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags),\ !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) + bch2_btree_iter_advance(_trans, &(_iter))) -#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret)\ +#define for_each_btree_key_max_continue_norestart(_trans, _iter, _end, _flags, _k, 
_ret)\ for (; \ - (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \ + (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags), \ !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) + bch2_btree_iter_advance(_trans, &(_iter))) #define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\ SPOS_MAX, _flags, _k, _ret) -#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_rewind(&(_iter))) +#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_rewind(_trans, &(_iter))) -#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ - for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) +#define for_each_btree_key_continue_norestart(_trans, _iter, _flags, _k, _ret) \ + for_each_btree_key_max_continue_norestart(_trans, _iter, SPOS_MAX, _flags, _k, _ret) /* * This should not be used in a fastpath, without first trying _do in diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index edce59433375..2b186584a291 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -287,6 +287,19 @@ err: return ret; } +static noinline_for_stack void do_trace_key_cache_fill(struct btree_trans *trans, + struct btree_path *ck_path, + struct bkey_s_c k) +{ + struct printbuf buf = PRINTBUF; + + bch2_bpos_to_text(&buf, ck_path->pos); + prt_char(&buf, ' '); + bch2_bkey_val_to_text(&buf, trans->c, k); + trace_key_cache_fill(trans, buf.buf); + printbuf_exit(&buf); +} + static noinline int btree_key_cache_fill(struct btree_trans *trans, struct btree_path *ck_path, unsigned flags) @@ -306,7 +319,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, BTREE_ITER_key_cache_fill| BTREE_ITER_cached_nofill); iter.flags &= ~BTREE_ITER_with_journal; - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; @@ -320,18 +333,11 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, if (ret) goto err; - if (trace_key_cache_fill_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bpos_to_text(&buf, ck_path->pos); - prt_char(&buf, ' '); - bch2_bkey_val_to_text(&buf, trans->c, k); - trace_key_cache_fill(trans, buf.buf); - printbuf_exit(&buf); - } + if (trace_key_cache_fill_enabled()) + do_trace_key_cache_fill(trans, ck_path, k); out: /* We're not likely to need this iterator again: */ - bch2_set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(trans, &iter); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -412,7 +418,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, BTREE_ITER_intent); b_iter.flags &= ~BTREE_ITER_with_key_cache; - ret = bch2_btree_iter_traverse(&c_iter); + ret = bch2_btree_iter_traverse(trans, &c_iter); if (ret) goto out; @@ -444,7 +450,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, !test_bit(JOURNAL_space_low, 
&c->journal.flags)) commit_flags |= BCH_TRANS_COMMIT_no_journal_res; - struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter); + struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(trans, &b_iter); ret = bkey_err(btree_k); if (ret) goto err; diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 25d54b77cdc2..8c9fdb7263fe 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -271,7 +271,7 @@ static int read_btree_nodes_worker(void *p) err: bio_put(bio); free_page((unsigned long) buf); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); closure_put(w->cl); kfree(w); return 0; @@ -291,7 +291,7 @@ static int read_btree_nodes(struct find_btree_nodes *f) struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); if (!w) { - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); ret = -ENOMEM; goto err; } @@ -303,14 +303,14 @@ static int read_btree_nodes(struct find_btree_nodes *f) struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); ret = PTR_ERR_OR_ZERO(t); if (ret) { - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); kfree(w); bch_err_msg(c, ret, "starting kthread"); break; } closure_get(&cl); - percpu_ref_get(&ca->io_ref); + percpu_ref_get(&ca->io_ref[READ]); wake_up_process(t); } err: diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 77578da2d23f..023c472dc9ee 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -367,7 +367,6 @@ static inline unsigned long btree_path_ip_allocated(struct btree_path *path) * @nodes_intent_locked - bitmask indicating which locks are intent locks */ struct btree_iter { - struct btree_trans *trans; btree_path_idx_t path; btree_path_idx_t update_path; btree_path_idx_t key_cache_path; diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index c05394f56424..1e6b7836cc01 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -126,7 +126,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, struct bpos new_pos) { struct bch_fs *c = trans->c; - struct btree_iter old_iter, new_iter = { NULL }; + struct btree_iter old_iter, new_iter = {}; struct bkey_s_c old_k, new_k; snapshot_id_list s; struct bkey_i *update; @@ -140,7 +140,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, bch2_trans_iter_init(trans, &old_iter, id, old_pos, BTREE_ITER_not_extents| BTREE_ITER_all_snapshots); - while ((old_k = bch2_btree_iter_prev(&old_iter)).k && + while ((old_k = bch2_btree_iter_prev(trans, &old_iter)).k && !(ret = bkey_err(old_k)) && bkey_eq(old_pos, old_k.k->p)) { struct bpos whiteout_pos = @@ -296,7 +296,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, BTREE_ITER_intent| BTREE_ITER_with_updates| BTREE_ITER_not_extents); - k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); + k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -322,8 +322,8 @@ static int bch2_trans_update_extent(struct btree_trans *trans, if (done) goto out; next: - bch2_btree_iter_advance(&iter); - k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); + bch2_btree_iter_advance(trans, &iter); + k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -592,13 +592,13 @@ int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, enum 
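The btree_types.h hunk above is the crux of these iterator changes: struct btree_iter loses its trans back-pointer, so every iterator entry point now takes the btree_trans explicitly. A minimal before/after sketch, using only functions that appear in these hunks (illustrative, not an excerpt):

struct bkey_s_c k;

/* before: the iterator carried a back-pointer to its transaction */
k = bch2_btree_iter_peek(&iter);

/* after: the transaction is threaded through every call */
k = bch2_btree_iter_peek(trans, &iter);
if (!bkey_err(k) && k.k)
        bch2_btree_iter_advance(trans, &iter);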
btree_id btree, struct bpos end) { bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent); - struct bkey_s_c k = bch2_btree_iter_peek_prev(iter); + struct bkey_s_c k = bch2_btree_iter_peek_prev(trans, iter); int ret = bkey_err(k); if (ret) goto err; - bch2_btree_iter_advance(iter); - k = bch2_btree_iter_peek_slot(iter); + bch2_btree_iter_advance(trans, iter); + k = bch2_btree_iter_peek_slot(trans, iter); ret = bkey_err(k); if (ret) goto err; @@ -634,7 +634,7 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans, BTREE_ITER_cached| BTREE_ITER_not_extents| BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(&iter) ?: + ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_trans_update(trans, &iter, k, flags); bch2_trans_iter_exit(trans, &iter); return ret; @@ -646,7 +646,7 @@ int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, struct btree_iter iter; bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), BTREE_ITER_intent|flags); - int ret = bch2_btree_iter_traverse(&iter) ?: + int ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_trans_update(trans, &iter, k, flags); bch2_trans_iter_exit(trans, &iter); return ret; @@ -695,7 +695,7 @@ int bch2_btree_delete(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_cached| BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(&iter) ?: + ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_btree_delete_at(trans, &iter, update_flags); bch2_trans_iter_exit(trans, &iter); @@ -713,7 +713,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, int ret = 0; bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent); - while ((k = bch2_btree_iter_peek_max(&iter, end)).k) { + while ((k = bch2_btree_iter_peek_max(trans, &iter, end)).k) { struct disk_reservation disk_res = bch2_disk_reservation_init(trans->c, 0); struct bkey_i delete; @@ -808,7 +808,7 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, struct btree_iter iter; bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); - int ret = bch2_btree_iter_traverse(&iter) ?: + int ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_btree_bit_mod_iter(trans, &iter, set); bch2_trans_iter_exit(trans, &iter); return ret; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index bf7e1dac7f46..55fbeeb8eaaa 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2147,7 +2147,7 @@ static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter, bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p, BTREE_MAX_DEPTH, b->c.level, BTREE_ITER_intent); - int ret = bch2_btree_iter_traverse(iter); + int ret = bch2_btree_iter_traverse(trans, iter); if (ret) goto err; @@ -2239,7 +2239,7 @@ static int bch2_btree_node_rewrite_key(struct btree_trans *trans, bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, BTREE_MAX_DEPTH, level, 0); - struct btree *b = bch2_btree_iter_peek_node(&iter); + struct btree *b = bch2_btree_iter_peek_node(trans, &iter); int ret = PTR_ERR_OR_ZERO(b); if (ret) goto out; @@ -2262,7 +2262,7 @@ int bch2_btree_node_rewrite_pos(struct btree_trans *trans, /* Traverse one depth lower to get a pointer to the node itself: */ struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0); - struct btree *b = bch2_btree_iter_peek_node(&iter); + struct btree *b = bch2_btree_iter_peek_node(trans, &iter); int ret = PTR_ERR_OR_ZERO(b); if 
(ret) goto err; @@ -2406,7 +2406,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, bool skip_triggers) { struct bch_fs *c = trans->c; - struct btree_iter iter2 = { NULL }; + struct btree_iter iter2 = {}; struct btree *parent; int ret; @@ -2430,7 +2430,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, parent = btree_node_parent(btree_iter_path(trans, iter), b); if (parent) { - bch2_trans_copy_iter(&iter2, iter); + bch2_trans_copy_iter(trans, &iter2, iter); iter2.path = bch2_btree_path_make_mut(trans, iter2.path, iter2.flags & BTREE_ITER_intent, @@ -2444,7 +2444,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, trans->paths_sorted = false; - ret = bch2_btree_iter_traverse(&iter2) ?: + ret = bch2_btree_iter_traverse(trans, &iter2) ?: bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun); if (ret) goto err; diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 2c09d19dd621..adbe576ec77e 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -144,7 +144,7 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq); EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); - ret = bch2_btree_iter_traverse(iter); + ret = bch2_btree_iter_traverse(trans, iter); if (ret) return ret; @@ -208,7 +208,7 @@ btree_write_buffered_insert(struct btree_trans *trans, trans->journal_res.seq = wb->journal_seq; - ret = bch2_btree_iter_traverse(&iter) ?: + ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_trans_update(trans, &iter, &wb->k, BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &iter); @@ -285,7 +285,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) struct bch_fs *c = trans->c; struct journal *j = &c->journal; struct btree_write_buffer *wb = &c->btree_write_buffer; - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0; bool write_locked = false; bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags); @@ -368,7 +368,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) write_locked = false; ret = lockrestart_do(trans, - bch2_btree_iter_traverse(&iter) ?: + bch2_btree_iter_traverse(trans, &iter) ?: bch2_foreground_maybe_merge(trans, iter.path, 0, BCH_WATERMARK_reclaim| BCH_TRANS_COMMIT_journal_reclaim| @@ -385,7 +385,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) BTREE_ITER_intent|BTREE_ITER_all_snapshots); } - bch2_btree_iter_set_pos(&iter, k->k.k.p); + bch2_btree_iter_set_pos(trans, &iter, k->k.k.p); btree_iter_path(trans, &iter)->preserve = false; bool accounting_accumulated = false; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 0903311cc71e..fea61e60a9ee 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -30,6 +30,12 @@ void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) { + for (unsigned i = 0; i < BCH_DATA_NR; i++) + usage->buckets[i] = percpu_u64_get(&ca->usage->d[i].buckets); +} + +void bch2_dev_usage_full_read_fast(struct bch_dev *ca, struct bch_dev_usage_full *usage) +{ memset(usage, 0, sizeof(*usage)); acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s()); } @@ -75,7 +81,7 @@ bch2_fs_usage_read_short(struct bch_fs *c) void bch2_dev_usage_to_text(struct printbuf *out, 
struct bch_dev *ca, - struct bch_dev_usage *usage) + struct bch_dev_usage_full *usage) { if (out->nr_tabstops < 5) { printbuf_tabstops_reset(out); } @@ -365,7 +371,7 @@ found: struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, BTREE_ITER_intent|BTREE_ITER_all_snapshots); - ret = bch2_btree_iter_traverse(&iter) ?: + ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node| BTREE_TRIGGER_norun); @@ -707,7 +713,7 @@ err: struct disk_accounting_pos acc; memset(&acc, 0, sizeof(acc)); acc.type = BCH_DISK_ACCOUNTING_replicas; - memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e)); + unsafe_memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e), "VLA"); gc_stripe_unlock(m); acc.replicas.data_type = data_type; @@ -1132,7 +1138,7 @@ int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c, for_each_online_member(c, ca) { int ret = bch2_trans_mark_dev_sb(c, ca, flags); if (ret) { - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); return ret; } } @@ -1331,7 +1337,7 @@ void bch2_dev_buckets_free(struct bch_dev *ca) int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { - ca->usage = alloc_percpu(struct bch_dev_usage); + ca->usage = alloc_percpu(struct bch_dev_usage_full); if (!ca->usage) return -BCH_ERR_ENOMEM_usage_init; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index c5363256e363..1c38b165f48b 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -172,7 +172,16 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) return ret; } -void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage *); +void bch2_dev_usage_full_read_fast(struct bch_dev *, struct bch_dev_usage_full *); +static inline struct bch_dev_usage_full bch2_dev_usage_full_read(struct bch_dev *ca) +{ + struct bch_dev_usage_full ret; + + bch2_dev_usage_full_read_fast(ca, &ret); + return ret; +} + +void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage_full *); static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) { @@ -207,7 +216,7 @@ static inline u64 dev_buckets_free(struct bch_dev *ca, enum bch_watermark watermark) { return max_t(s64, 0, - usage.d[BCH_DATA_free].buckets - + usage.buckets[BCH_DATA_free] - ca->nr_open_buckets - bch2_dev_buckets_reserved(ca, watermark)); } @@ -217,10 +226,10 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca, enum bch_watermark watermark) { return max_t(s64, 0, - usage.d[BCH_DATA_free].buckets - + usage.d[BCH_DATA_cached].buckets - + usage.d[BCH_DATA_need_gc_gens].buckets - + usage.d[BCH_DATA_need_discard].buckets + usage.buckets[BCH_DATA_free] + + usage.buckets[BCH_DATA_cached] + + usage.buckets[BCH_DATA_need_gc_gens] + + usage.buckets[BCH_DATA_need_discard] - ca->nr_open_buckets - bch2_dev_buckets_reserved(ca, watermark)); } diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 900b8680c8b5..0aed2500ade3 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -54,7 +54,12 @@ struct bucket_gens { u8 b[] __counted_by(nbuckets); }; +/* Only info on bucket counts: */ struct bch_dev_usage { + u64 buckets[BCH_DATA_NR]; +}; + +struct bch_dev_usage_full { struct bch_dev_usage_type { u64 buckets; u64 sectors; /* _compressed_ sectors: */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 584f4a3eb670..5891b3a1e61c 100644 --- a/fs/bcachefs/chardev.c +++
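The buckets.h and buckets_types.h hunks above split device usage in two: bch_dev_usage now carries only per-data-type bucket counts, while bch_dev_usage_full keeps the full per-type counters (sectors, fragmented), so fast paths copy BCH_DATA_NR u64s instead of summing the whole percpu accounting array. How the two read sides look to a caller, per the declarations above (a sketch; the field choices are illustrative):

struct bch_dev_usage u = bch2_dev_usage_read(ca);            /* cheap */
u64 free_buckets = u.buckets[BCH_DATA_free];

struct bch_dev_usage_full f = bch2_dev_usage_full_read(ca);  /* full */
u64 btree_sectors = f.d[BCH_DATA_btree].sectors;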
b/fs/bcachefs/chardev.c @@ -350,8 +350,8 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, if (ctx->arg.op == BCH_DATA_OP_scrub) { struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev); if (ca) { - struct bch_dev_usage u; - bch2_dev_usage_read_fast(ca, &u); + struct bch_dev_usage_full u; + bch2_dev_usage_full_read_fast(ca, &u); for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++) if (ctx->arg.scrub.data_types & BIT(i)) e.p.sectors_total += u.d[i].sectors; @@ -473,7 +473,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, struct bch_ioctl_dev_usage __user *user_arg) { struct bch_ioctl_dev_usage arg; - struct bch_dev_usage src; + struct bch_dev_usage_full src; struct bch_dev *ca; unsigned i; @@ -493,7 +493,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, if (IS_ERR(ca)) return PTR_ERR(ca); - src = bch2_dev_usage_read(ca); + src = bch2_dev_usage_full_read(ca); arg.state = ca->mi.state; arg.bucket_size = ca->mi.bucket_size; @@ -514,7 +514,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, struct bch_ioctl_dev_usage_v2 __user *user_arg) { struct bch_ioctl_dev_usage_v2 arg; - struct bch_dev_usage src; + struct bch_dev_usage_full src; struct bch_dev *ca; int ret = 0; @@ -534,7 +534,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, if (IS_ERR(ca)) return PTR_ERR(ca); - src = bch2_dev_usage_read(ca); + src = bch2_dev_usage_full_read(ca); arg.state = ca->mi.state; arg.bucket_size = ca->mi.bucket_size; @@ -615,7 +615,7 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, for_each_online_member(c, ca) if (ca->dev == dev) { - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); return ca->dev_idx; } diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index 1f8e035d7119..d6dd12d74d4f 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -121,7 +121,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, } while (0); __set_current_state(TASK_RUNNING); - del_timer_sync(&wait.cpu_timer); + timer_delete_sync(&wait.cpu_timer); destroy_timer_on_stack(&wait.cpu_timer); bch2_io_timer_del(clock, &wait.io_timer); } diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 85fc90342492..28ed32449913 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -371,13 +371,14 @@ static int attempt_compress(struct bch_fs *c, }; zlib_set_workspace(&strm, workspace); - zlib_deflateInit2(&strm, + if (zlib_deflateInit2(&strm, compression.level ? 
clamp_t(unsigned, compression.level, Z_BEST_SPEED, Z_BEST_COMPRESSION) : Z_DEFAULT_COMPRESSION, Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, - Z_DEFAULT_STRATEGY); + Z_DEFAULT_STRATEGY) != Z_OK) + return 0; if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) return 0; diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index fe400dfc5d76..de02ebf847ec 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -216,7 +216,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, bch2_trans_begin(trans); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; @@ -398,7 +398,7 @@ restart_drop_extra_replicas: BCH_TRANS_COMMIT_no_enospc| m->data_opts.btree_insert_flags); if (!ret) { - bch2_btree_iter_set_pos(&iter, next_pos); + bch2_btree_iter_set_pos(trans, &iter, next_pos); this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); if (trace_io_move_finish_enabled()) @@ -426,7 +426,7 @@ nowork: count_event(c, io_move_fail); - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); goto next; } out: @@ -497,7 +497,7 @@ static int bch2_update_unwritten_extent(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos, BTREE_ITER_slots); ret = lockrestart_do(trans, ({ - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); bkey_err(k); })); bch2_trans_iter_exit(trans, &iter); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 788af88f6979..5a8bc7013512 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -57,7 +57,7 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, submit_bio_wait(bio); bio_put(bio); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); memcpy(n_ondisk, n_sorted, btree_buf_bytes(b)); @@ -297,7 +297,7 @@ out: if (bio) bio_put(bio); kvfree(n_ondisk); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); } #ifdef CONFIG_DEBUG_FS diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index d7f9f79318a2..bf53a029f356 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -417,8 +417,8 @@ int bch2_dirent_rename(struct btree_trans *trans, enum bch_rename_mode mode) { struct qstr src_name_lookup, dst_name_lookup; - struct btree_iter src_iter = { NULL }; - struct btree_iter dst_iter = { NULL }; + struct btree_iter src_iter = {}; + struct btree_iter dst_iter = {}; struct bkey_s_c old_src, old_dst = bkey_s_c_null; struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; struct bpos dst_pos = @@ -586,16 +586,16 @@ out_set_src: } if (delete_src) { - bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); - ret = bch2_btree_iter_traverse(&src_iter) ?: + bch2_btree_iter_set_snapshot(trans, &src_iter, old_src.k->p.snapshot); + ret = bch2_btree_iter_traverse(trans, &src_iter) ?: bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node); if (ret) goto out; } if (delete_dst) { - bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot); - ret = bch2_btree_iter_traverse(&dst_iter) ?: + bch2_btree_iter_set_snapshot(trans, &dst_iter, old_dst.k->p.snapshot); + ret = bch2_btree_iter_traverse(trans, &dst_iter) ?: bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node); if (ret) goto out; @@ -642,7 +642,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, const struct qstr *name, subvol_inum *inum) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter 
iter = { NULL }; + struct btree_iter iter = {}; int ret = lockrestart_do(trans, bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); @@ -771,7 +771,7 @@ int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(&iter) ?: + ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_hash_delete_at(trans, bch2_dirent_hash_desc, &dir_hash_info, &iter, BTREE_UPDATE_internal_snapshot_node); diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index a59f6c12529b..b007319b72e9 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -739,7 +739,7 @@ int bch2_accounting_read(struct bch_fs *c) struct disk_accounting_pos next; memset(&next, 0, sizeof(next)); next.type = acc_k.type + 1; - bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); + bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next)); continue; } @@ -930,7 +930,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c) struct disk_accounting_pos next; memset(&next, 0, sizeof(next)); next.type = acc_k.type + 1; - bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); + bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next)); continue; } diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 5df8de0b8c02..1186280b29e9 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -555,9 +555,9 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) ? rcu_dereference(c->devs[t.dev]) : NULL; - if (ca && percpu_ref_tryget(&ca->io_ref)) { + if (ca && percpu_ref_tryget(&ca->io_ref[READ])) { prt_printf(out, "/dev/%s", ca->name); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); } else if (ca) { prt_printf(out, "offline device %u", t.dev); } else { diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 6faeda7ad03d..a396865e8b17 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -105,6 +105,7 @@ struct ec_bio { struct bch_dev *ca; struct ec_stripe_buf *buf; size_t idx; + int rw; u64 submit_time; struct bio bio; }; @@ -462,7 +463,8 @@ int bch2_trigger_stripe(struct btree_trans *trans, return ret; if (gc) - memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas)); + unsafe_memcpy(&gc->r.e, &acc.replicas, + replicas_entry_bytes(&acc.replicas), "VLA"); } if (old_s) { @@ -703,6 +705,7 @@ static void ec_block_endio(struct bio *bio) struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; + int rw = ec_bio->rw; bch2_account_io_completion(ca, bio_data_dir(bio), ec_bio->submit_time, !bio->bi_status); @@ -724,7 +727,7 @@ static void ec_block_endio(struct bio *bio) } bio_put(&ec_bio->bio); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[rw]); closure_put(cl); } @@ -775,6 +778,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ec_bio->ca = ca; ec_bio->buf = buf; ec_bio->idx = idx; + ec_bio->rw = rw; ec_bio->submit_time = local_clock(); ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); @@ -784,14 +788,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); closure_get(cl); - percpu_ref_get(&ca->io_ref); + percpu_ref_get(&ca->io_ref[rw]); submit_bio(&ec_bio->bio); offset += b; } - percpu_ref_put(&ca->io_ref); + 
percpu_ref_put(&ca->io_ref[rw]); } static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, @@ -1264,7 +1268,7 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, ob->sectors_free, GFP_KERNEL, 0); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[WRITE]); if (ret) s->err = ret; @@ -1836,7 +1840,7 @@ static int __get_existing_stripe(struct btree_trans *trans, ret = 1; } out: - bch2_set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(trans, &iter); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -1949,7 +1953,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st if (bkey_gt(k.k->p, POS(0, U32_MAX))) { if (start_pos.offset) { start_pos = min_pos; - bch2_btree_iter_set_pos(&iter, start_pos); + bch2_btree_iter_set_pos(trans, &iter, start_pos); continue; } diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index d4dfd13a8076..baf5dfb32298 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -34,7 +34,7 @@ bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out) journal_cur_seq(&c->journal)); return true; case BCH_ON_ERROR_panic: - bch2_print_string_as_lines(KERN_ERR, out->buf); + bch2_print_string_as_lines_nonblocking(KERN_ERR, out->buf); panic(bch2_fmt(c, "panic after error")); return true; default: @@ -45,6 +45,8 @@ bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out) bool bch2_inconsistent_error(struct bch_fs *c) { struct printbuf buf = PRINTBUF; + buf.atomic++; + printbuf_indent_add_nextline(&buf, 2); bool ret = __bch2_inconsistent_error(c, &buf); @@ -59,6 +61,7 @@ static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *tra const char *fmt, va_list args) { struct printbuf buf = PRINTBUF; + buf.atomic++; bch2_log_msg_start(c, &buf); @@ -68,7 +71,7 @@ static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *tra if (trans) bch2_trans_updates_to_text(&buf, trans); bool ret = __bch2_inconsistent_error(c, &buf); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf); printbuf_exit(&buf); return ret; diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 6aac579a692a..6bb42985306e 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -112,7 +112,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans, unsigned nr_iters = 0; int ret; - ret = bch2_btree_iter_traverse(iter); + ret = bch2_btree_iter_traverse(trans, iter); if (ret) return ret; @@ -126,9 +126,9 @@ int bch2_extent_atomic_end(struct btree_trans *trans, if (ret < 0) return ret; - bch2_trans_copy_iter(&copy, iter); + bch2_trans_copy_iter(trans, &copy, iter); - for_each_btree_key_max_continue_norestart(copy, insert->k.p, 0, k, ret) { + for_each_btree_key_max_continue_norestart(trans, copy, insert->k.p, 0, k, ret) { unsigned offset = 0; if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index a03e2c780cba..19d4599918dc 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -183,12 +183,12 @@ static void bchfs_read(struct btree_trans *trans, if (ret) goto err; - bch2_btree_iter_set_snapshot(&iter, snapshot); + bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - bch2_btree_iter_set_pos(&iter, + bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, rbio->bio.bi_iter.bi_sector)); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret =
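A second theme running through these files: the per-device io_ref becomes a two-element array indexed by READ/WRITE, so the two directions can be drained independently. That is why ec_bio now records rw: the completion handler must put the same direction's ref that was taken at submit time. A hedged sketch of the acquire/release pairing, using only helpers visible in this diff:

struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
if (!ca)
        return;                 /* device offline for writes */

submit_bio(bio);
/* ...and in the endio path, drop the matching ref: */
percpu_ref_put(&ca->io_ref[WRITE]);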
bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index c80ed3a54e70..65c2c33d253d 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -48,7 +48,7 @@ static void nocow_flush_endio(struct bio *_bio) struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); closure_put(bio->cl); - percpu_ref_put(&bio->ca->io_ref); + percpu_ref_put(&bio->ca->io_ref[WRITE]); bio_put(&bio->bio); } @@ -71,7 +71,7 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { rcu_read_lock(); ca = rcu_dereference(c->devs[dev]); - if (ca && !percpu_ref_tryget(&ca->io_ref)) + if (ca && !percpu_ref_tryget(&ca->io_ref[WRITE])) ca = NULL; rcu_read_unlock(); @@ -636,9 +636,9 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, if (ret) goto bkey_err; - bch2_btree_iter_set_snapshot(&iter, snapshot); + bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); if ((ret = bkey_err(k))) goto bkey_err; @@ -649,13 +649,13 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, /* already reserved */ if (bkey_extent_is_reservation(k) && bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); continue; } if (bkey_extent_is_data(k.k) && !(mode & FALLOC_FL_ZERO_RANGE)) { - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); continue; } @@ -676,7 +676,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, if (ret) goto bkey_err; } - bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); + bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, hole_start)); if (ret) goto bkey_err; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index fc834bdf1f52..5a41b1a8e54f 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -88,7 +88,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, void *p, unsigned fields) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct bch_inode_unpacked inode_u; int ret; retry: @@ -1075,7 +1075,7 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_qid qid; struct btree_trans *trans; - struct btree_iter inode_iter = { NULL }; + struct btree_iter inode_iter = {}; struct bch_inode_unpacked inode_u; struct posix_acl *acl = NULL; kuid_t kuid; @@ -1330,9 +1330,9 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (ret) continue; - bch2_btree_iter_set_snapshot(&iter, snapshot); + bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - k = bch2_btree_iter_peek_max(&iter, end); + k = bch2_btree_iter_peek_max(trans, &iter, end); ret = bkey_err(k); if (ret) continue; @@ -1342,7 +1342,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (!bkey_extent_is_data(k.k) && k.k->type != KEY_TYPE_reservation) { - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); continue; } @@ -1380,7 +1380,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bkey_copy(prev.k, cur.k); have_extent = true; - bch2_btree_iter_set_pos(&iter, + bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, iter.pos.offset + sectors)); } bch2_trans_iter_exit(trans, &iter); @@ -1697,17 +1697,17 @@ retry: if (ret) goto err; - 
bch2_btree_iter_set_snapshot(&iter1, snapshot); - bch2_btree_iter_set_snapshot(&iter2, snapshot); + bch2_btree_iter_set_snapshot(trans, &iter1, snapshot); + bch2_btree_iter_set_snapshot(trans, &iter2, snapshot); ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u); if (ret) goto err; if (inode_u.bi_dir == dir->ei_inode.bi_inum) { - bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); + bch2_btree_iter_set_pos(trans, &iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); - k = bch2_btree_iter_peek_slot(&iter1); + k = bch2_btree_iter_peek_slot(trans, &iter1); ret = bkey_err(k); if (ret) goto err; @@ -1731,7 +1731,7 @@ retry: * File with multiple hardlinks and our backref is to the wrong * directory - linear search: */ - for_each_btree_key_continue_norestart(iter2, 0, k, ret) { + for_each_btree_key_continue_norestart(trans, iter2, 0, k, ret) { if (k.k->p.inode > dir->ei_inode.bi_inum) break; @@ -2237,7 +2237,7 @@ got_sb: /* XXX: create an anonymous device for multi device filesystems */ sb->s_bdev = bdev; sb->s_dev = bdev->bd_dev; - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); break; } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 52320295dcf6..18308f3d64a1 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -186,7 +186,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, { struct bch_fs *c = trans->c; struct qstr lostfound_str = QSTR("lost+found"); - struct btree_iter lostfound_iter = { NULL }; + struct btree_iter lostfound_iter = {}; u64 inum = 0; unsigned d_type = 0; int ret; @@ -295,8 +295,8 @@ create_lostfound: if (ret) goto err; - bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot); - ret = bch2_btree_iter_traverse(&lostfound_iter); + bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot); + ret = bch2_btree_iter_traverse(trans, &lostfound_iter); if (ret) goto err; @@ -544,7 +544,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub new_inode.bi_subvol = subvolid; int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?: - bch2_btree_iter_traverse(&inode_iter) ?: + bch2_btree_iter_traverse(trans, &inode_iter) ?: bch2_inode_write(trans, &inode_iter, &new_inode); bch2_trans_iter_exit(trans, &inode_iter); if (ret) @@ -609,7 +609,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 struct btree_iter iter = {}; bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); - struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0)); + struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0)); bch2_trans_iter_exit(trans, &iter); int ret = bkey_err(k); if (ret) @@ -1557,7 +1557,7 @@ static int overlapping_extents_found(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; - struct btree_iter iter1, iter2 = { NULL }; + struct btree_iter iter1, iter2 = {}; struct bkey_s_c k1, k2; int ret; @@ -1566,7 +1566,7 @@ static int overlapping_extents_found(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter1, btree, pos1, BTREE_ITER_all_snapshots| BTREE_ITER_not_extents); - k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX)); + k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX)); ret = bkey_err(k1); if (ret) goto err; @@ -1586,12 +1586,12 @@ static int overlapping_extents_found(struct btree_trans *trans, goto err; } - bch2_trans_copy_iter(&iter2, &iter1); + 
bch2_trans_copy_iter(trans, &iter2, &iter1); while (1) { - bch2_btree_iter_advance(&iter2); + bch2_btree_iter_advance(trans, &iter2); - k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX)); + k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX)); ret = bkey_err(k2); if (ret) goto err; @@ -1791,9 +1791,9 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { struct btree_iter iter2; - bch2_trans_copy_iter(&iter2, iter); - bch2_btree_iter_set_snapshot(&iter2, i->snapshot); - ret = bch2_btree_iter_traverse(&iter2) ?: + bch2_trans_copy_iter(trans, &iter2, iter); + bch2_btree_iter_set_snapshot(trans, &iter2, i->snapshot); + ret = bch2_btree_iter_traverse(trans, &iter2) ?: bch2_btree_delete_at(trans, &iter2, BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &iter2); @@ -2185,7 +2185,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, BTREE_ID_dirents, SPOS(k.k->p.inode, k.k->p.offset, *i), BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(&delete_iter) ?: + ret = bch2_btree_iter_traverse(trans, &delete_iter) ?: bch2_hash_delete_at(trans, bch2_dirent_hash_desc, hash_info, &delete_iter, @@ -2412,7 +2412,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, bch2_trans_iter_exit(trans, &parent_iter); bch2_trans_iter_init(trans, &parent_iter, BTREE_ID_subvolumes, POS(0, parent), 0); - k = bch2_btree_iter_peek_slot(&parent_iter); + k = bch2_btree_iter_peek_slot(trans, &parent_iter); ret = bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 80051073f613..b51d98cf8a80 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -940,7 +940,7 @@ int bch2_inode_create(struct btree_trans *trans, BTREE_ITER_intent); struct bkey_s_c k; again: - while ((k = bch2_btree_iter_peek(iter)).k && + while ((k = bch2_btree_iter_peek(trans, iter)).k && !(ret = bkey_err(k)) && bkey_lt(k.k->p, POS(0, max))) { if (pos < iter->pos.offset) @@ -951,7 +951,7 @@ again: * we've found just one: */ pos = iter->pos.offset + 1; - bch2_btree_iter_set_pos(iter, POS(0, pos)); + bch2_btree_iter_set_pos(trans, iter, POS(0, pos)); } if (!ret && pos < max) @@ -967,12 +967,12 @@ again: /* Retry from start */ pos = start = min; - bch2_btree_iter_set_pos(iter, POS(0, pos)); + bch2_btree_iter_set_pos(trans, iter, POS(0, pos)); le32_add_cpu(&cursor->v.gen, 1); goto again; found_slot: - bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); - k = bch2_btree_iter_peek_slot(iter); + bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, snapshot)); + k = bch2_btree_iter_peek_slot(trans, iter); ret = bkey_err(k); if (ret) { bch2_trans_iter_exit(trans, iter); @@ -1009,9 +1009,9 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, if (ret) goto err; - bch2_btree_iter_set_snapshot(&iter, snapshot); + bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - k = bch2_btree_iter_peek_max(&iter, end); + k = bch2_btree_iter_peek_max(trans, &iter, end); ret = bkey_err(k); if (ret) goto err; @@ -1042,7 +1042,7 @@ err: int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct bkey_s_c k; u32 snapshot; int ret; @@ -1207,7 +1207,7 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) { struct 
bch_fs *c = trans->c; - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct bkey_i_inode_generation delete; struct bch_inode_unpacked inode_u; struct bkey_s_c k; diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 6b842c8d21be..cc07729a4b62 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -43,7 +43,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, bch2_bkey_buf_init(&new); closure_init_stack(&cl); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(trans, iter); ret = bkey_err(k); if (ret) return ret; @@ -164,12 +164,12 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, if (ret) continue; - bch2_btree_iter_set_snapshot(iter, snapshot); + bch2_btree_iter_set_snapshot(trans, iter, snapshot); /* * peek_max() doesn't have ideal semantics for extents: */ - k = bch2_btree_iter_peek_max(iter, end_pos); + k = bch2_btree_iter_peek_max(trans, iter, end_pos); if (!k.k) break; @@ -230,7 +230,7 @@ static int truncate_set_isize(struct btree_trans *trans, u64 new_i_size, bool warn) { - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct bch_inode_unpacked inode_u; int ret; @@ -399,7 +399,7 @@ case LOGGED_OP_FINSERT_start: if (ret) goto err; } else { - bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset)); + bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, src_offset)); ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -425,12 +425,12 @@ case LOGGED_OP_FINSERT_shift_extents: if (ret) goto btree_err; - bch2_btree_iter_set_snapshot(&iter, snapshot); - bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot)); + bch2_btree_iter_set_snapshot(trans, &iter, snapshot); + bch2_btree_iter_set_pos(trans, &iter, SPOS(inum.inum, pos, snapshot)); k = insert - ? bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0)) - : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX)); + ? 
bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum.inum, 0)) + : bch2_btree_iter_peek_max(trans, &iter, POS(inum.inum, U64_MAX)); if ((ret = bkey_err(k))) goto btree_err; diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index fd01e67b3e84..417bb0c7bbfa 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -394,7 +394,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) if (rbio->have_ioref) { struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); } if (rbio->split) { @@ -909,7 +909,7 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, prt_printf(&buf, "memory gen: %u", gen); - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter))); if (!ret) { prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, k); @@ -1003,7 +1003,7 @@ retry_pick: unlikely(dev_ptr_stale(ca, &pick.ptr))) { read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); bch2_mark_io_failure(failed, &pick, false); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); goto retry_pick; } @@ -1036,7 +1036,7 @@ retry_pick: */ if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { if (ca) - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); rbio->ret = -BCH_ERR_data_read_buffer_too_small; goto out_read_done; } @@ -1285,12 +1285,12 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, if (ret) goto err; - bch2_btree_iter_set_snapshot(&iter, snapshot); + bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - bch2_btree_iter_set_pos(&iter, + bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, bvec_iter.bi_sector)); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 07b55839768e..a418fa62f09d 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -168,9 +168,9 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, *i_sectors_delta = 0; *disk_sectors_delta = 0; - bch2_trans_copy_iter(&iter, extent_iter); + bch2_trans_copy_iter(trans, &iter, extent_iter); - for_each_btree_key_max_continue_norestart(iter, + for_each_btree_key_max_continue_norestart(trans, iter, new->k.p, BTREE_ITER_slots, old, ret) { s64 sectors = min(new->k.p.offset, old.k->p.offset) - max(bkey_start_offset(&new->k), @@ -292,7 +292,7 @@ int bch2_extent_update(struct btree_trans *trans, * path already traversed at iter->pos because * bch2_trans_extent_update() will use it to attempt extent merging */ - ret = __bch2_btree_iter_traverse(iter); + ret = __bch2_btree_iter_traverse(trans, iter); if (ret) return ret; @@ -337,7 +337,7 @@ int bch2_extent_update(struct btree_trans *trans, if (i_sectors_delta_total) *i_sectors_delta_total += i_sectors_delta; - bch2_btree_iter_set_pos(iter, next_pos); + bch2_btree_iter_set_pos(trans, iter, next_pos); return 0; } @@ -445,6 +445,11 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, BUG_ON(c->opts.nochanges); bkey_for_each_ptr(ptrs, ptr) { + /* + * XXX: btree writes should be using io_ref[WRITE], but we + * aren't retrying failed btree writes yet (due to device + * removal/ro): + */ struct bch_dev *ca = nocow ? bch2_dev_have_ref(c, ptr->dev) : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? 
READ : WRITE); @@ -697,12 +702,19 @@ static void bch2_write_endio(struct bio *bio) bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, wbio->submit_time, !bio->bi_status); - if (bio->bi_status) { - bch_err_inum_offset_ratelimited(ca, - op->pos.inode, - wbio->inode_offset << 9, - "data write error: %s", - bch2_blk_status_to_str(bio->bi_status)); + if (unlikely(bio->bi_status)) { + if (ca) + bch_err_inum_offset_ratelimited(ca, + op->pos.inode, + wbio->inode_offset << 9, + "data write error: %s", + bch2_blk_status_to_str(bio->bi_status)); + else + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + wbio->inode_offset << 9, + "data write error: %s", + bch2_blk_status_to_str(bio->bi_status)); set_bit(wbio->dev, op->failed.d); op->flags |= BCH_WRITE_io_error; } @@ -715,7 +727,7 @@ static void bch2_write_endio(struct bio *bio) } if (wbio->have_ioref) - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[WRITE]); if (wbio->bounce) bch2_bio_free_pages_pool(c, bio); @@ -1293,7 +1305,7 @@ retry: if (ret) break; - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) break; @@ -1377,7 +1389,7 @@ retry: bch2_keylist_push(&op->insert_keys); if (op->flags & BCH_WRITE_submitted) break; - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); } out: bch2_trans_iter_exit(trans, &iter); @@ -1414,7 +1426,7 @@ err: return; err_get_ioref: darray_for_each(buckets, i) - percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref); + percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE]); /* Fall back to COW path: */ goto out; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 8a36d5536668..d8f74b6d0a75 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1315,7 +1315,7 @@ int bch2_fs_journal_alloc(struct bch_fs *c) int ret = bch2_dev_journal_alloc(ca, true); if (ret) { - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); return ret; } } @@ -1404,6 +1404,14 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) nr = cur_seq - last_seq; + /* + * Extra fudge factor, in case we crashed when the journal pin fifo was + * nearly or completely full. 
We'll need to be able to open additional + * journal entries (at least a few) in order for journal replay to get + * going: + */ + nr += nr / 4; + if (nr + 1 > j->pin.size) { free_fifo(&j->pin); init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); @@ -1461,11 +1469,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) j->reservations.idx = journal_cur_seq(j); c->last_bucket_seq_cleanup = journal_cur_seq(j); - - bch2_journal_space_available(j); spin_unlock(&j->lock); - return bch2_journal_reclaim_start(j); + return 0; } /* init/exit: */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 2debc213e47c..1b7961f4f609 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1218,7 +1218,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) out: bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); kvfree(buf.data); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); closure_return(cl); return; err: @@ -1253,7 +1253,7 @@ int bch2_journal_read(struct bch_fs *c, if ((ca->mi.state == BCH_MEMBER_STATE_rw || ca->mi.state == BCH_MEMBER_STATE_ro) && - percpu_ref_tryget(&ca->io_ref)) + percpu_ref_tryget(&ca->io_ref[READ])) closure_call(&ca->journal.read, bch2_journal_read_device, system_unbound_wq, @@ -1768,7 +1768,7 @@ static void journal_write_endio(struct bio *bio) } closure_put(&w->io); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[WRITE]); } static CLOSURE_CALLBACK(journal_write_submit) @@ -1843,7 +1843,7 @@ static CLOSURE_CALLBACK(journal_write_preflush) if (w->separate_flush) { for_each_rw_member(c, ca) { - percpu_ref_get(&ca->io_ref); + percpu_ref_get(&ca->io_ref[WRITE]); struct journal_device *ja = &ca->journal; struct bio *bio = &ja->bio[w->idx]->bio; diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 57ad662871ba..90dcf80bd64a 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -130,7 +130,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, retry: ret = 0; while (bch2_trans_begin(trans), - (b = bch2_btree_iter_peek_node(&iter)) && + (b = bch2_btree_iter_peek_node(trans, &iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { bch2_progress_update_iter(trans, progress, &iter, "dropping metadata"); @@ -154,7 +154,7 @@ retry: if (ret) break; next: - bch2_btree_iter_next_node(&iter); + bch2_btree_iter_next_node(trans, &iter); } if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 5d41260e10da..fc396b9fa754 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -545,7 +545,7 @@ static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans * BTREE_ID_reflink, reflink_pos, BTREE_ITER_not_extents); - struct bkey_s_c k = bch2_btree_iter_peek(iter); + struct bkey_s_c k = bch2_btree_iter_peek(trans, iter); if (!k.k || bkey_err(k)) { bch2_trans_iter_exit(trans, iter); return k; @@ -603,7 +603,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_trans_begin(trans); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek(trans, &iter); if (!k.k) break; @@ -681,7 +681,7 @@ next: if (ctxt->stats) atomic64_add(k.k->size, &ctxt->stats->sectors_seen); next_nondata: - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); } bch2_trans_iter_exit(trans, &reflink_iter); @@ -794,7 +794,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, bch2_trans_begin(trans); - k = bch2_btree_iter_peek(&bp_iter); + k = bch2_btree_iter_peek(trans, &bp_iter); ret = 
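The fudge factor in the journal.c hunk above is simple headroom arithmetic: if nr journal entries were open between last_seq and cur_seq at crash time, the pin fifo is sized for 25% more before rounding up, so replay can still open new entries even when we crashed with the fifo full. With hypothetical numbers:

u64 nr = cur_seq - last_seq;    /* e.g. 4096 entries open at crash */
nr += nr / 4;                   /* 25% headroom: 4096 -> 5120 */
/* init_fifo() then allocates roundup_pow_of_two(nr + 1) == 8192 */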
bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -876,7 +876,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (ctxt->stats) atomic64_add(sectors, &ctxt->stats->sectors_seen); next: - bch2_btree_iter_advance(&bp_iter); + bch2_btree_iter_advance(trans, &bp_iter); } err: bch2_trans_iter_exit(trans, &bp_iter); @@ -991,7 +991,7 @@ static int bch2_move_btree(struct bch_fs *c, retry: ret = 0; while (bch2_trans_begin(trans), - (b = bch2_btree_iter_peek_node(&iter)) && + (b = bch2_btree_iter_peek_node(trans, &iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { if (kthread && kthread_should_stop()) break; @@ -1011,7 +1011,7 @@ retry: if (ret) break; next: - bch2_btree_iter_next_node(&iter); + bch2_btree_iter_next_node(trans, &iter); } if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 5126c870ce5b..159410c50861 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -280,7 +280,11 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) s64 wait = S64_MAX, fragmented_allowed, fragmented; for_each_rw_member(c, ca) { - struct bch_dev_usage usage = bch2_dev_usage_read(ca); + struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca); + struct bch_dev_usage usage; + + for (unsigned i = 0; i < BCH_DATA_NR; i++) + usage.buckets[i] = usage_full.d[i].buckets; fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * ca->mi.bucket_size) >> 1); @@ -288,7 +292,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) for (unsigned i = 0; i < BCH_DATA_NR; i++) if (data_type_movable(i)) - fragmented += usage.d[i].fragmented; + fragmented += usage_full.d[i].fragmented; wait = min(wait, max(0LL, fragmented_allowed - fragmented)); } diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index ee7251709fb9..0d65ea96f7a2 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -28,8 +28,8 @@ int bch2_create_trans(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct btree_iter dir_iter = { NULL }; - struct btree_iter inode_iter = { NULL }; + struct btree_iter dir_iter = {}; + struct btree_iter inode_iter = {}; subvol_inum new_inum = dir; u64 now = bch2_current_time(c); u64 cpu = raw_smp_processor_id(); @@ -127,8 +127,8 @@ int bch2_create_trans(struct btree_trans *trans, if (ret) goto err; - bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot); - ret = bch2_btree_iter_traverse(&dir_iter); + bch2_btree_iter_set_snapshot(trans, &dir_iter, dir_snapshot); + ret = bch2_btree_iter_traverse(trans, &dir_iter); if (ret) goto err; } @@ -177,9 +177,9 @@ int bch2_create_trans(struct btree_trans *trans, new_inode->bi_depth = dir_u->bi_depth + 1; inode_iter.flags &= ~BTREE_ITER_all_snapshots; - bch2_btree_iter_set_snapshot(&inode_iter, snapshot); + bch2_btree_iter_set_snapshot(trans, &inode_iter, snapshot); - ret = bch2_btree_iter_traverse(&inode_iter) ?: + ret = bch2_btree_iter_traverse(trans, &inode_iter) ?: bch2_inode_write(trans, &inode_iter, new_inode); err: bch2_trans_iter_exit(trans, &inode_iter); @@ -193,8 +193,8 @@ int bch2_link_trans(struct btree_trans *trans, const struct qstr *name) { struct bch_fs *c = trans->c; - struct btree_iter dir_iter = { NULL }; - struct btree_iter inode_iter = { NULL }; + struct btree_iter dir_iter = {}; + struct btree_iter inode_iter = {}; struct bch_hash_info dir_hash; u64 now = bch2_current_time(c); u64 dir_offset = 0; @@ -253,9 +253,9 @@ int bch2_unlink_trans(struct btree_trans 
*trans, bool deleting_subvol) { struct bch_fs *c = trans->c; - struct btree_iter dir_iter = { NULL }; - struct btree_iter dirent_iter = { NULL }; - struct btree_iter inode_iter = { NULL }; + struct btree_iter dir_iter = {}; + struct btree_iter dirent_iter = {}; + struct btree_iter inode_iter = {}; struct bch_hash_info dir_hash; subvol_inum inum; u64 now = bch2_current_time(c); @@ -301,7 +301,7 @@ int bch2_unlink_trans(struct btree_trans *trans, if (ret) goto err; - k = bch2_btree_iter_peek_slot(&dirent_iter); + k = bch2_btree_iter_peek_slot(trans, &dirent_iter); ret = bkey_err(k); if (ret) goto err; @@ -310,8 +310,8 @@ int bch2_unlink_trans(struct btree_trans *trans, * If we're deleting a subvolume, we need to really delete the * dirent, not just emit a whiteout in the current snapshot: */ - bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot); - ret = bch2_btree_iter_traverse(&dirent_iter); + bch2_btree_iter_set_snapshot(trans, &dirent_iter, k.k->p.snapshot); + ret = bch2_btree_iter_traverse(trans, &dirent_iter); if (ret) goto err; } else { @@ -390,10 +390,10 @@ int bch2_rename_trans(struct btree_trans *trans, enum bch_rename_mode mode) { struct bch_fs *c = trans->c; - struct btree_iter src_dir_iter = { NULL }; - struct btree_iter dst_dir_iter = { NULL }; - struct btree_iter src_inode_iter = { NULL }; - struct btree_iter dst_inode_iter = { NULL }; + struct btree_iter src_dir_iter = {}; + struct btree_iter dst_dir_iter = {}; + struct btree_iter src_inode_iter = {}; + struct btree_iter dst_inode_iter = {}; struct bch_hash_info src_hash, dst_hash; subvol_inum src_inum, dst_inum; u64 src_offset, dst_offset; @@ -666,7 +666,7 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; - struct btree_iter bp_iter = { NULL }; + struct btree_iter bp_iter = {}; int ret = 0; if (inode_points_to_dirent(target, d)) diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 8b857fc33244..3d4755d73af7 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -516,7 +516,7 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans, bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, KEY_TYPE_QUOTA_NOCHECK); advance: - bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); + bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos)); return 0; } diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index b9bde04b66c0..c63fa53f30d2 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -233,7 +233,7 @@ int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum) bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), BTREE_ITER_intent); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; @@ -281,7 +281,7 @@ static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), BTREE_ITER_intent); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; @@ -301,7 +301,7 @@ static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans, struct btree_iter *work_iter) { return !kthread_should_stop() - ? bch2_btree_iter_peek(work_iter) + ? 
bch2_btree_iter_peek(trans, work_iter) : bkey_s_c_null; } @@ -335,7 +335,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, work_pos, BTREE_ITER_all_snapshots); - struct bkey_s_c k = bch2_btree_iter_peek_slot(extent_iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter); if (bkey_err(k)) return k; @@ -511,7 +511,7 @@ static int do_rebalance(struct moving_context *ctxt) struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; struct bch_fs_rebalance *r = &c->rebalance; - struct btree_iter rebalance_work_iter, extent_iter = { NULL }; + struct btree_iter rebalance_work_iter, extent_iter = {}; struct bkey_s_c k; int ret = 0; @@ -552,7 +552,7 @@ static int do_rebalance(struct moving_context *ctxt) if (ret) break; - bch2_btree_iter_advance(&rebalance_work_iter); + bch2_btree_iter_advance(trans, &rebalance_work_iter); } bch2_trans_iter_exit(trans, &extent_iter); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 266c5770c824..79fd18a5a07c 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -198,7 +198,7 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans, bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, k->level, BTREE_ITER_intent); - int ret = bch2_btree_iter_traverse(&iter); + int ret = bch2_btree_iter_traverse(trans, &iter); if (ret) goto out; @@ -261,7 +261,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans, bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, k->level, iter_flags); - ret = bch2_btree_iter_traverse(&iter); + ret = bch2_btree_iter_traverse(trans, &iter); if (ret) goto out; @@ -270,7 +270,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans, bch2_trans_iter_exit(trans, &iter); bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, 0, iter_flags); - ret = bch2_btree_iter_traverse(&iter) ?: + ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_btree_increase_depth(trans, iter.path, 0) ?: -BCH_ERR_transaction_restart_nested; goto out; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index ee23f1f93acc..710178e3da4c 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -495,7 +495,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, bool reflink_p_may_update_opts_field) { struct bch_fs *c = trans->c; - struct btree_iter reflink_iter = { NULL }; + struct btree_iter reflink_iter = {}; struct bkey_s_c k; struct bkey_i *r_v; struct bkey_i_reflink_p *r_p; @@ -507,7 +507,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX, BTREE_ITER_intent); - k = bch2_btree_iter_peek_prev(&reflink_iter); + k = bch2_btree_iter_peek_prev(trans, &reflink_iter); ret = bkey_err(k); if (ret) goto err; @@ -569,12 +569,13 @@ err: return ret; } -static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) +static struct bkey_s_c get_next_src(struct btree_trans *trans, + struct btree_iter *iter, struct bpos end) { struct bkey_s_c k; int ret; - for_each_btree_key_max_continue_norestart(*iter, end, 0, k, ret) { + for_each_btree_key_max_continue_norestart(trans, *iter, end, 0, k, ret) { if (bkey_extent_is_unwritten(k)) continue; @@ -583,7 +584,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) } if (bkey_ge(iter->pos, end)) - bch2_btree_iter_set_pos(iter, end); + 
bch2_btree_iter_set_pos(trans, iter, end); return ret ? bkey_s_c_err(ret) : bkey_s_c_null; } @@ -647,27 +648,27 @@ s64 bch2_remap_range(struct bch_fs *c, if (ret) continue; - bch2_btree_iter_set_snapshot(&src_iter, src_snapshot); + bch2_btree_iter_set_snapshot(trans, &src_iter, src_snapshot); ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol, &dst_snapshot); if (ret) continue; - bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot); + bch2_btree_iter_set_snapshot(trans, &dst_iter, dst_snapshot); if (dst_inum.inum < src_inum.inum) { /* Avoid some lock cycle transaction restarts */ - ret = bch2_btree_iter_traverse(&dst_iter); + ret = bch2_btree_iter_traverse(trans, &dst_iter); if (ret) continue; } dst_done = dst_iter.pos.offset - dst_start.offset; src_want = POS(src_start.inode, src_start.offset + dst_done); - bch2_btree_iter_set_pos(&src_iter, src_want); + bch2_btree_iter_set_pos(trans, &src_iter, src_want); - src_k = get_next_src(&src_iter, src_end); + src_k = get_next_src(trans, &src_iter, src_end); ret = bkey_err(src_k); if (ret) continue; @@ -738,7 +739,7 @@ s64 bch2_remap_range(struct bch_fs *c, do { struct bch_inode_unpacked inode_u; - struct btree_iter inode_iter = { NULL }; + struct btree_iter inode_iter = {}; bch2_trans_begin(trans); diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 38261638a611..06bb41a3f360 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -20,7 +20,7 @@ struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i); static inline bool bch2_dev_is_online(struct bch_dev *ca) { - return !percpu_ref_is_zero(&ca->io_ref); + return !percpu_ref_is_zero(&ca->io_ref[READ]); } static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned); @@ -156,33 +156,34 @@ static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, struct bch_dev *ca, - unsigned state_mask) + unsigned state_mask, + int rw) { rcu_read_lock(); if (ca) - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[rw]); while ((ca = __bch2_next_dev(c, ca, NULL)) && (!((1 << ca->mi.state) & state_mask) || - !percpu_ref_tryget(&ca->io_ref))) + !percpu_ref_tryget(&ca->io_ref[rw]))) ; rcu_read_unlock(); return ca; } -#define __for_each_online_member(_c, _ca, state_mask) \ +#define __for_each_online_member(_c, _ca, state_mask, rw) \ for (struct bch_dev *_ca = NULL; \ - (_ca = bch2_get_next_online_dev(_c, _ca, state_mask));) + (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw));) #define for_each_online_member(c, ca) \ - __for_each_online_member(c, ca, ~0) + __for_each_online_member(c, ca, ~0, READ) #define for_each_rw_member(c, ca) \ - __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw)) + __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE) #define for_each_readable_member(c, ca) \ - __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro)) + __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ) static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev) { @@ -287,7 +288,7 @@ static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu(c, dev); - if (ca && !percpu_ref_tryget(&ca->io_ref)) + if (ca && !percpu_ref_tryget(&ca->io_ref[rw])) ca = NULL; rcu_read_unlock(); @@ -297,7 +298,7 @@ static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, return ca; if (ca) - 
percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[rw]); return NULL; } diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 0c65065b08ec..b7de29aed839 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -843,9 +843,6 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) { struct bch_fs *c = trans->c; - if (bch2_snapshot_exists(c, id)) - return 0; - /* Do we need to reconstruct the snapshot_tree entry as well? */ struct btree_iter iter; struct bkey_s_c k; @@ -1074,9 +1071,9 @@ static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) { struct bch_fs *c = trans->c; - struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; - struct btree_iter c_iter = (struct btree_iter) { NULL }; - struct btree_iter tree_iter = (struct btree_iter) { NULL }; + struct btree_iter iter, p_iter = {}; + struct btree_iter c_iter = {}; + struct btree_iter tree_iter = {}; struct bkey_s_c_snapshot s; u32 parent_id, child_id; unsigned i; @@ -1193,13 +1190,13 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_intent); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek(trans, &iter); ret = bkey_err(k); if (ret) goto err; for (i = 0; i < nr_snapids; i++) { - k = bch2_btree_iter_prev_slot(&iter); + k = bch2_btree_iter_prev_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index 602afca2f5ef..a90bf7b8a2b4 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -195,7 +195,7 @@ int __bch2_str_hash_check_key(struct btree_trans *trans, struct btree_iter *k_iter, struct bkey_s_c hash_k) { struct bch_fs *c = trans->c; - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct printbuf buf = PRINTBUF; struct bkey_s_c k; int ret = 0; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 575ad1e03904..09a354a26c3b 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -231,11 +231,11 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, struct bkey_s_c k; int ret; - bch2_trans_copy_iter(&iter, start); + bch2_trans_copy_iter(trans, &iter, start); - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); - for_each_btree_key_continue_norestart(iter, BTREE_ITER_slots, k, ret) { + for_each_btree_key_continue_norestart(trans, iter, BTREE_ITER_slots, k, ret) { if (k.k->type != desc.key_type && k.k->type != KEY_TYPE_hash_whiteout) break; @@ -280,7 +280,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, } if (!slot.path && !(flags & STR_HASH_must_replace)) - bch2_trans_copy_iter(&slot, iter); + bch2_trans_copy_iter(trans, &slot, iter); if (k.k->type != KEY_TYPE_hash_whiteout) goto not_found; diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index cd0d8e5e44e7..5537283d0bea 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -275,7 +275,7 @@ int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) struct btree_iter iter; bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0); - struct bkey_s_c k = bch2_btree_iter_peek(&iter); + struct bkey_s_c k = bch2_btree_iter_peek(trans, &iter); bch2_trans_iter_exit(trans, &iter); return bkey_err(k) ?: k.k && k.k->p.inode == subvol @@ -574,7 +574,7 @@ int bch2_subvolume_create(struct btree_trans 
*trans, u64 inode, bool ro) { struct bch_fs *c = trans->c; - struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL }; + struct btree_iter dst_iter, src_iter = {}; struct bkey_i_subvolume *new_subvol = NULL; struct bkey_i_subvolume *src_subvol = NULL; u32 parent = 0, new_nodes[2], snapshot_subvols[2]; diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 910f6196700e..f640c1e3d639 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -33,16 +33,16 @@ int bch2_subvol_is_ro_trans(struct btree_trans *, u32); int bch2_subvol_is_ro(struct bch_fs *, u32); static inline struct bkey_s_c -bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos end, - u32 subvolid, unsigned flags) +bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end, u32 subvolid, unsigned flags) { u32 snapshot; - int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot); + int ret = bch2_subvolume_get_snapshot(trans, subvolid, &snapshot); if (ret) return bkey_s_c_err(ret); - bch2_btree_iter_set_snapshot(iter, snapshot); - return bch2_btree_iter_peek_max_type(iter, end, flags); + bch2_btree_iter_set_snapshot(trans, iter, snapshot); + return bch2_btree_iter_peek_max_type(trans, iter, end, flags); } #define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ @@ -53,14 +53,14 @@ bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos \ do { \ _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_in_subvolume_max_type(&(_iter), \ + (_k) = bch2_btree_iter_peek_in_subvolume_max_type(trans, &(_iter),\ _end, _subvolid, (_flags)); \ if (!(_k).k) \ break; \ \ bkey_err(_k) ?: (_do); \ })); \ - } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \ + } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ _ret3; \ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 572b06bfa0b8..e27422b6d9c6 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -248,7 +248,7 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, struct bch_sb_handle *dev_sb = &ca->disk_sb; if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) { - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); return NULL; } } @@ -945,7 +945,7 @@ static void write_super_endio(struct bio *bio) } closure_put(&ca->fs->sb_write); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); } static void read_back_super(struct bch_fs *c, struct bch_dev *ca) @@ -963,7 +963,7 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca) this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio)); - percpu_ref_get(&ca->io_ref); + percpu_ref_get(&ca->io_ref[READ]); closure_bio_submit(bio, &c->sb_write); } @@ -989,7 +989,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], bio_sectors(bio)); - percpu_ref_get(&ca->io_ref); + percpu_ref_get(&ca->io_ref[READ]); closure_bio_submit(bio, &c->sb_write); } @@ -1014,13 +1014,20 @@ int bch2_write_super(struct bch_fs *c) closure_init_stack(cl); memset(&sb_written, 0, sizeof(sb_written)); + /* + * Note: we do writes to RO devices here, and we might want to change + * that in the future. 
+ * + * For now, we expect to be able to call write_super() when we're not + * yet RW: + */ for_each_online_member(c, ca) { ret = darray_push(&online_devices, ca); if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) { - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); goto out; } - percpu_ref_get(&ca->io_ref); + percpu_ref_get(&ca->io_ref[READ]); } /* Make sure we're using the new magic numbers: */ @@ -1186,7 +1193,7 @@ out: /* Make new options visible after they're persistent: */ bch2_sb_update(c); darray_for_each(online_devices, ca) - percpu_ref_put(&(*ca)->io_ref); + percpu_ref_put(&(*ca)->io_ref[READ]); darray_exit(&online_devices); printbuf_exit(&err); return ret; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 20208f3c5d8b..a58edde43bee 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -185,6 +185,7 @@ static void bch2_dev_unlink(struct bch_dev *); static void bch2_dev_free(struct bch_dev *); static int bch2_dev_alloc(struct bch_fs *, unsigned); static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); +static void bch2_dev_io_ref_stop(struct bch_dev *, int); static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); struct bch_fs *bch2_dev_to_fs(dev_t dev) @@ -294,8 +295,10 @@ static void __bch2_fs_read_only(struct bch_fs *c) /* * After stopping journal: */ - for_each_member_device(c, ca) + for_each_member_device(c, ca) { + bch2_dev_io_ref_stop(ca, WRITE); bch2_dev_allocator_remove(c, ca); + } } #ifndef BCH_WRITE_REF_DEBUG @@ -465,10 +468,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) if (ret) goto err; - ret = bch2_fs_mark_dirty(c); - if (ret) - goto err; - clear_bit(BCH_FS_clean_shutdown, &c->flags); /* @@ -480,10 +479,24 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) set_bit(JOURNAL_need_flush_write, &c->journal.flags); set_bit(JOURNAL_running, &c->journal.flags); - for_each_rw_member(c, ca) + __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) { bch2_dev_allocator_add(c, ca); + percpu_ref_reinit(&ca->io_ref[WRITE]); + } bch2_recalc_capacity(c); + ret = bch2_fs_mark_dirty(c); + if (ret) + goto err; + + spin_lock(&c->journal.lock); + bch2_journal_space_available(&c->journal); + spin_unlock(&c->journal.lock); + + ret = bch2_journal_reclaim_start(&c->journal); + if (ret) + goto err; + set_bit(BCH_FS_rw, &c->flags); set_bit(BCH_FS_was_rw, &c->flags); @@ -495,11 +508,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) atomic_long_inc(&c->writes[i]); } #endif - - ret = bch2_journal_reclaim_start(&c->journal); - if (ret) - goto err; - if (!early) { ret = bch2_fs_read_write_late(c); if (ret) @@ -675,6 +683,7 @@ void bch2_fs_free(struct bch_fs *c) if (ca) { EBUG_ON(atomic_long_read(&ca->ref) != 1); + bch2_dev_io_ref_stop(ca, READ); bch2_free_super(&ca->disk_sb); bch2_dev_free(ca); } @@ -1199,6 +1208,15 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, /* Device startup/shutdown: */ +static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw) +{ + if (!percpu_ref_is_zero(&ca->io_ref[rw])) { + reinit_completion(&ca->io_ref_completion[rw]); + percpu_ref_kill(&ca->io_ref[rw]); + wait_for_completion(&ca->io_ref_completion[rw]); + } +} + static void bch2_dev_release(struct kobject *kobj) { struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); @@ -1208,6 +1226,9 @@ static void bch2_dev_release(struct kobject *kobj) static void bch2_dev_free(struct bch_dev *ca) { + WARN_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE])); + 
WARN_ON(!percpu_ref_is_zero(&ca->io_ref[READ])); + cancel_work_sync(&ca->io_error_work); bch2_dev_unlink(ca); @@ -1226,7 +1247,8 @@ static void bch2_dev_free(struct bch_dev *ca) bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); - percpu_ref_exit(&ca->io_ref); + percpu_ref_exit(&ca->io_ref[WRITE]); + percpu_ref_exit(&ca->io_ref[READ]); #ifndef CONFIG_BCACHEFS_DEBUG percpu_ref_exit(&ca->ref); #endif @@ -1238,14 +1260,12 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) lockdep_assert_held(&c->state_lock); - if (percpu_ref_is_zero(&ca->io_ref)) + if (percpu_ref_is_zero(&ca->io_ref[READ])) return; __bch2_dev_read_only(c, ca); - reinit_completion(&ca->io_ref_completion); - percpu_ref_kill(&ca->io_ref); - wait_for_completion(&ca->io_ref_completion); + bch2_dev_io_ref_stop(ca, READ); bch2_dev_unlink(ca); @@ -1262,11 +1282,18 @@ static void bch2_dev_ref_complete(struct percpu_ref *ref) } #endif -static void bch2_dev_io_ref_complete(struct percpu_ref *ref) +static void bch2_dev_io_ref_read_complete(struct percpu_ref *ref) +{ + struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[READ]); + + complete(&ca->io_ref_completion[READ]); +} + +static void bch2_dev_io_ref_write_complete(struct percpu_ref *ref) { - struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); + struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[WRITE]); - complete(&ca->io_ref_completion); + complete(&ca->io_ref_completion[WRITE]); } static void bch2_dev_unlink(struct bch_dev *ca) @@ -1330,7 +1357,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, kobject_init(&ca->kobj, &bch2_dev_ktype); init_completion(&ca->ref_completion); - init_completion(&ca->io_ref_completion); + init_completion(&ca->io_ref_completion[READ]); + init_completion(&ca->io_ref_completion[WRITE]); INIT_WORK(&ca->io_error_work, bch2_io_error_work); @@ -1356,7 +1384,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, bch2_dev_allocator_background_init(ca); - if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, + if (percpu_ref_init(&ca->io_ref[READ], bch2_dev_io_ref_read_complete, + PERCPU_REF_INIT_DEAD, GFP_KERNEL) || + percpu_ref_init(&ca->io_ref[WRITE], bch2_dev_io_ref_write_complete, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) || bch2_dev_buckets_alloc(c, ca) || @@ -1419,7 +1449,8 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) return -BCH_ERR_device_size_too_small; } - BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); + BUG_ON(!percpu_ref_is_zero(&ca->io_ref[READ])); + BUG_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE])); ret = bch2_dev_journal_init(ca, sb->sb); if (ret) @@ -1438,7 +1469,7 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ca->dev = ca->disk_sb.bdev->bd_dev; - percpu_ref_reinit(&ca->io_ref); + percpu_ref_reinit(&ca->io_ref[READ]); return 0; } @@ -1568,6 +1599,8 @@ static bool bch2_fs_may_start(struct bch_fs *c) static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) { + bch2_dev_io_ref_stop(ca, WRITE); + /* * The allocator thread itself allocates btree nodes, so stop it first: */ @@ -1584,6 +1617,10 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + + if (percpu_ref_is_zero(&ca->io_ref[WRITE])) + percpu_ref_reinit(&ca->io_ref[WRITE]); + bch2_dev_do_discards(ca); } @@ -1731,7 +1768,7 @@ int 
bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) return 0; err: if (ca->mi.state == BCH_MEMBER_STATE_rw && - !percpu_ref_is_zero(&ca->io_ref)) + !percpu_ref_is_zero(&ca->io_ref[READ])) __bch2_dev_read_write(c, ca); up_write(&c->state_lock); return ret; diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 6c6469814637..c265b102267a 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -43,7 +43,7 @@ static int test_delete(struct bch_fs *c, u64 nr) BTREE_ITER_intent); ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(&iter) ?: + bch2_btree_iter_traverse(trans, &iter) ?: bch2_trans_update(trans, &iter, &k.k_i, 0)); bch_err_msg(c, ret, "update error"); if (ret) @@ -51,7 +51,7 @@ static int test_delete(struct bch_fs *c, u64 nr) pr_info("deleting once"); ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(&iter) ?: + bch2_btree_iter_traverse(trans, &iter) ?: bch2_btree_delete_at(trans, &iter, 0)); bch_err_msg(c, ret, "delete error (first)"); if (ret) @@ -59,7 +59,7 @@ static int test_delete(struct bch_fs *c, u64 nr) pr_info("deleting twice"); ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(&iter) ?: + bch2_btree_iter_traverse(trans, &iter) ?: bch2_btree_delete_at(trans, &iter, 0)); bch_err_msg(c, ret, "delete error (second)"); if (ret) @@ -84,7 +84,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) BTREE_ITER_intent); ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(&iter) ?: + bch2_btree_iter_traverse(trans, &iter) ?: bch2_trans_update(trans, &iter, &k.k_i, 0)); bch_err_msg(c, ret, "update error"); if (ret) @@ -94,7 +94,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bch2_journal_flush_all_pins(&c->journal); ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(&iter) ?: + bch2_btree_iter_traverse(trans, &iter) ?: bch2_btree_delete_at(trans, &iter, 0)); bch_err_msg(c, ret, "delete error"); if (ret) @@ -349,10 +349,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr) bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); BUG_ON(k.k); bch2_trans_iter_exit(trans, &iter); @@ -369,10 +369,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); BUG_ON(k.k); bch2_trans_iter_exit(trans, &iter); @@ -488,7 +488,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) trans = bch2_trans_get(c); bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, snapid_lo), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); BUG_ON(k.k->p.snapshot != U32_MAX); 
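A pattern note on the bcachefs hunks above and below: they all apply the same mechanical transformation. The btree iterator helpers (peek, peek_slot, peek_prev, traverse, advance, set_pos, set_snapshot, trans_copy_iter) gain the btree_trans as an explicit first argument instead of dereferencing a pointer cached inside the iterator, and iterator initializers shrink from "{ NULL }" to "{}". A minimal standalone sketch of that call-shape change, using stand-in types rather than the real bcachefs API:

/*
 * Illustrative only: demo_trans/demo_iter are stand-ins, not bcachefs types.
 */
#include <stddef.h>

struct demo_trans { int restart_count; };

struct demo_iter {
	struct demo_trans *trans;	/* old style: cached back-pointer */
	unsigned long pos;
};

/* old call shape: the transaction is recovered from the iterator */
static void peek_old(struct demo_iter *iter)
{
	struct demo_trans *trans = iter->trans;
	(void)trans;
}

/* new call shape: the caller passes the transaction it already holds */
static void peek_new(struct demo_trans *trans, struct demo_iter *iter)
{
	(void)trans;
	(void)iter;
}

int main(void)
{
	struct demo_trans trans = { 0 };
	struct demo_iter iter = {};	/* the "{}" spelling the patch adopts */

	iter.trans = &trans;
	peek_old(&iter);
	peek_new(&trans, &iter);
	return 0;
}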
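Alongside the iterator change, the sb-members.h and super.c hunks split each device's io_ref into a two-element array indexed by READ and WRITE, with per-direction completions, so bch2_dev_io_ref_stop() can drain writers (for instance when a member goes read-only) while readers keep their references. A rough userspace analogue of that drain pattern, using C11 atomics where the kernel uses percpu_ref plus wait_for_completion(); all names here are illustrative:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum { DEMO_READ, DEMO_WRITE, DEMO_NR };

struct demo_dev {
	atomic_long io_ref[DEMO_NR];	/* kernel: percpu_ref io_ref[2] */
	atomic_bool dead[DEMO_NR];
};

static bool demo_ref_tryget(struct demo_dev *ca, int rw)
{
	if (atomic_load(&ca->dead[rw]))
		return false;
	atomic_fetch_add(&ca->io_ref[rw], 1);
	return true;
}

static void demo_ref_put(struct demo_dev *ca, int rw)
{
	atomic_fetch_sub(&ca->io_ref[rw], 1);
}

/* analogue of bch2_dev_io_ref_stop(): forbid new refs, drain old ones */
static void demo_ref_stop(struct demo_dev *ca, int rw)
{
	atomic_store(&ca->dead[rw], true);
	while (atomic_load(&ca->io_ref[rw]))
		;	/* the kernel sleeps on io_ref_completion[rw] instead */
}

int main(void)
{
	struct demo_dev ca = { 0 };

	if (demo_ref_tryget(&ca, DEMO_WRITE))
		demo_ref_put(&ca, DEMO_WRITE);

	demo_ref_stop(&ca, DEMO_WRITE);	/* device is now effectively RO */
	printf("write tryget after stop: %d\n", demo_ref_tryget(&ca, DEMO_WRITE));
	printf("read tryget still works: %d\n", demo_ref_tryget(&ca, DEMO_READ));
	return 0;
}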
@@ -602,9 +602,9 @@ static int rand_lookup(struct bch_fs *c, u64 nr) SPOS(0, 0, U32_MAX), 0); for (i = 0; i < nr; i++) { - bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); + bch2_btree_iter_set_pos(trans, &iter, SPOS(0, test_rand(), U32_MAX)); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(trans, &iter))); ret = bkey_err(k); if (ret) break; @@ -623,9 +623,9 @@ static int rand_mixed_trans(struct btree_trans *trans, struct bkey_s_c k; int ret; - bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX)); + bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, U32_MAX)); - k = bch2_btree_iter_peek(iter); + k = bch2_btree_iter_peek(trans, iter); ret = bkey_err(k); bch_err_msg(trans->c, ret, "lookup error"); if (ret) @@ -672,7 +672,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, BTREE_ITER_intent); - k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)); + k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)); ret = bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index f9667b944c0d..651da52b2cbc 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -168,7 +168,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, int type, int flags) { struct bch_fs *c = trans->c; - struct btree_iter inode_iter = { NULL }; + struct btree_iter inode_iter = {}; int ret; ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?: diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index cd5f38d6fbaa..3541efa765c7 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -225,7 +225,7 @@ void zstd_cleanup_workspace_manager(void) } spin_unlock_bh(&wsm.lock); - del_timer_sync(&wsm.timer); + timer_delete_sync(&wsm.timer); } /* diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 83a60126de0f..14d0cc894000 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -128,10 +128,11 @@ retry: ret = security_path_mkdir(&path, subdir, 0700); if (ret < 0) goto mkdir_error; - subdir = ERR_PTR(cachefiles_inject_write_error()); - if (!IS_ERR(subdir)) + ret = cachefiles_inject_write_error(); + if (ret == 0) subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700); - ret = PTR_ERR(subdir); + else + subdir = ERR_PTR(ret); if (IS_ERR(subdir)) { trace_cachefiles_vfs_error(NULL, d_inode(dir), ret, cachefiles_trace_mkdir_error); diff --git a/fs/exec.c b/fs/exec.c index f45859ad13ac..8e4ea5f1e64c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1227,13 +1227,12 @@ int begin_new_exec(struct linux_binprm * bprm) */ bprm->point_of_no_return = true; - /* - * Make this the only thread in the thread group. - */ + /* Make this the only thread in the thread group */ retval = de_thread(me); if (retval) goto out; - + /* see the comment in check_unsafe_exec() */ + current->fs->in_exec = 0; /* * Cancel any io_uring activity across execve */ @@ -1495,6 +1494,8 @@ static void free_bprm(struct linux_binprm *bprm) } free_arg_pages(bprm); if (bprm->cred) { + /* in case exec fails before de_thread() succeeds */ + current->fs->in_exec = 0; mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } @@ -1616,6 +1617,10 @@ static void check_unsafe_exec(struct linux_binprm *bprm) * suid exec because the differently privileged task * will be able to manipulate the current directory, etc. * It would be nice to force an unshare instead... 
+ * + * Otherwise we set fs->in_exec = 1 to deny clone(CLONE_FS) + * from another sub-thread until de_thread() succeeds, this + * state is protected by cred_guard_mutex we hold. */ n_fs = 1; spin_lock(&p->fs->lock); @@ -1859,10 +1864,9 @@ static int bprm_execve(struct linux_binprm *bprm) goto out; sched_mm_cid_after_execve(current); + rseq_execve(current); /* execve succeeded */ - current->fs->in_exec = 0; current->in_execve = 0; - rseq_execve(current); user_events_execve(current); acct_update_integrals(current); task_numa_free(current, false); @@ -1879,7 +1883,7 @@ out: force_fatal_sig(SIGSEGV); sched_mm_cid_after_execve(current); - current->fs->in_exec = 0; + rseq_set_notify_resume(current); current->in_execve = 0; return retval; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index b5845c4846b8..128dd092916b 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -608,4 +608,5 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, } EXPORT_SYMBOL_GPL(exportfs_decode_fh); +MODULE_DESCRIPTION("Code mapping from inodes to file handles"); MODULE_LICENSE("GPL"); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8122d4ffb3b5..181934499624 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5680,7 +5680,7 @@ failed_mount3: /* flush s_sb_upd_work before sbi destroy */ flush_work(&sbi->s_sb_upd_work); ext4_stop_mmpd(sbi); - del_timer_sync(&sbi->s_err_report); + timer_delete_sync(&sbi->s_err_report); ext4_group_desc_free(sbi); failed_mount: #if IS_ENABLED(CONFIG_UNICODE) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 51e31df4c546..6dcbaa218b7a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -32,6 +32,100 @@ MODULE_ALIAS("devname:fuse"); static struct kmem_cache *fuse_req_cachep; +const unsigned long fuse_timeout_timer_freq = + secs_to_jiffies(FUSE_TIMEOUT_TIMER_FREQ); + +bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list) +{ + struct fuse_req *req; + + req = list_first_entry_or_null(list, struct fuse_req, list); + if (!req) + return false; + return time_is_before_jiffies(req->create_time + fc->timeout.req_timeout); +} + +bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing) +{ + int i; + + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + if (fuse_request_expired(fc, &processing[i])) + return true; + + return false; +} + +/* + * Check if any requests aren't being completed by the time the request timeout + * elapses. To do so, we: + * - check the fiq pending list + * - check the bg queue + * - check the fpq io and processing lists + * + * To make this fast, we only check against the head request on each list since + * these are generally queued in order of creation time (eg newer requests get + * queued to the tail). We might miss a few edge cases (eg requests transitioning + * between lists, re-sent requests at the head of the pending list having a + * later creation time than other requests on that list, etc.) but that is fine + * since if the request never gets fulfilled, it will eventually be caught. 
+ */ +void fuse_check_timeout(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct fuse_conn *fc = container_of(dwork, struct fuse_conn, + timeout.work); + struct fuse_iqueue *fiq = &fc->iq; + struct fuse_dev *fud; + struct fuse_pqueue *fpq; + bool expired = false; + + if (!atomic_read(&fc->num_waiting)) + goto out; + + spin_lock(&fiq->lock); + expired = fuse_request_expired(fc, &fiq->pending); + spin_unlock(&fiq->lock); + if (expired) + goto abort_conn; + + spin_lock(&fc->bg_lock); + expired = fuse_request_expired(fc, &fc->bg_queue); + spin_unlock(&fc->bg_lock); + if (expired) + goto abort_conn; + + spin_lock(&fc->lock); + if (!fc->connected) { + spin_unlock(&fc->lock); + return; + } + list_for_each_entry(fud, &fc->devices, entry) { + fpq = &fud->pq; + spin_lock(&fpq->lock); + if (fuse_request_expired(fc, &fpq->io) || + fuse_fpq_processing_expired(fc, fpq->processing)) { + spin_unlock(&fpq->lock); + spin_unlock(&fc->lock); + goto abort_conn; + } + + spin_unlock(&fpq->lock); + } + spin_unlock(&fc->lock); + + if (fuse_uring_request_expired(fc)) + goto abort_conn; + +out: + queue_delayed_work(system_wq, &fc->timeout.work, + fuse_timeout_timer_freq); + return; + +abort_conn: + fuse_abort_conn(fc); +} + static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req) { INIT_LIST_HEAD(&req->list); @@ -40,6 +134,7 @@ static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req) refcount_set(&req->count, 1); __set_bit(FR_PENDING, &req->flags); req->fm = fm; + req->create_time = jiffies; } static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags) @@ -407,6 +502,24 @@ static int queue_interrupt(struct fuse_req *req) return 0; } +bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock) +{ + spin_lock(lock); + if (test_bit(FR_PENDING, &req->flags)) { + /* + * FR_PENDING does not get cleared as the request will end + * up in destruction anyway. 
+ */ + list_del(&req->list); + spin_unlock(lock); + __fuse_put_request(req); + req->out.h.error = -EINTR; + return true; + } + spin_unlock(lock); + return false; +} + static void request_wait_answer(struct fuse_req *req) { struct fuse_conn *fc = req->fm->fc; @@ -428,22 +541,20 @@ static void request_wait_answer(struct fuse_req *req) } if (!test_bit(FR_FORCE, &req->flags)) { + bool removed; + /* Only fatal signals may interrupt this */ err = wait_event_killable(req->waitq, test_bit(FR_FINISHED, &req->flags)); if (!err) return; - spin_lock(&fiq->lock); - /* Request is not yet in userspace, bail out */ - if (test_bit(FR_PENDING, &req->flags)) { - list_del(&req->list); - spin_unlock(&fiq->lock); - __fuse_put_request(req); - req->out.h.error = -EINTR; + if (test_bit(FR_URING, &req->flags)) + removed = fuse_uring_remove_pending_req(req); + else + removed = fuse_remove_pending_req(req, &fiq->lock); + if (removed) return; - } - spin_unlock(&fiq->lock); } /* @@ -1533,14 +1644,10 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_inval_entry_out outarg; - int err = -ENOMEM; - char *buf; + int err; + char *buf = NULL; struct qstr name; - buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); - if (!buf) - goto err; - err = -EINVAL; if (size < sizeof(outarg)) goto err; @@ -1550,13 +1657,18 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, goto err; err = -ENAMETOOLONG; - if (outarg.namelen > FUSE_NAME_MAX) + if (outarg.namelen > fc->name_max) goto err; err = -EINVAL; if (size != sizeof(outarg) + outarg.namelen + 1) goto err; + err = -ENOMEM; + buf = kzalloc(outarg.namelen + 1, GFP_KERNEL); + if (!buf) + goto err; + name.name = buf; name.len = outarg.namelen; err = fuse_copy_one(cs, buf, outarg.namelen + 1); @@ -1581,14 +1693,10 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_delete_out outarg; - int err = -ENOMEM; - char *buf; + int err; + char *buf = NULL; struct qstr name; - buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); - if (!buf) - goto err; - err = -EINVAL; if (size < sizeof(outarg)) goto err; @@ -1598,13 +1706,18 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, goto err; err = -ENAMETOOLONG; - if (outarg.namelen > FUSE_NAME_MAX) + if (outarg.namelen > fc->name_max) goto err; err = -EINVAL; if (size != sizeof(outarg) + outarg.namelen + 1) goto err; + err = -ENOMEM; + buf = kzalloc(outarg.namelen + 1, GFP_KERNEL); + if (!buf) + goto err; + name.name = buf; name.len = outarg.namelen; err = fuse_copy_one(cs, buf, outarg.namelen + 1); @@ -2275,6 +2388,9 @@ void fuse_abort_conn(struct fuse_conn *fc) LIST_HEAD(to_end); unsigned int i; + if (fc->timeout.req_timeout) + cancel_delayed_work(&fc->timeout.work); + /* Background queuing checks fc->connected under bg_lock */ spin_lock(&fc->bg_lock); fc->connected = 0; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 82bf458fa9db..accdce2977c5 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -140,6 +140,33 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring) } } +bool fuse_uring_request_expired(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + int qid; + + if (!ring) + return false; + + for (qid = 0; qid < ring->nr_queues; qid++) { + queue = READ_ONCE(ring->queues[qid]); + if (!queue) + continue; + + spin_lock(&queue->lock); + if (fuse_request_expired(fc, &queue->fuse_req_queue) || + 
fuse_request_expired(fc, &queue->fuse_req_bg_queue) || + fuse_fpq_processing_expired(fc, queue->fpq.processing)) { + spin_unlock(&queue->lock); + return true; + } + spin_unlock(&queue->lock); + } + + return false; +} + void fuse_uring_destruct(struct fuse_conn *fc) { struct fuse_ring *ring = fc->ring; @@ -211,7 +238,6 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) ring->nr_queues = nr_queues; ring->fc = fc; ring->max_payload_sz = max_payload_size; - atomic_set(&ring->queue_refs, 0); smp_store_release(&fc->ring, ring); spin_unlock(&fc->lock); @@ -726,8 +752,6 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, struct fuse_req *req) { struct fuse_ring_queue *queue = ent->queue; - struct fuse_conn *fc = req->fm->fc; - struct fuse_iqueue *fiq = &fc->iq; lockdep_assert_held(&queue->lock); @@ -737,9 +761,7 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, ent->state); } - spin_lock(&fiq->lock); clear_bit(FR_PENDING, &req->flags); - spin_unlock(&fiq->lock); ent->fuse_req = req; ent->state = FRRS_FUSE_REQ; list_move(&ent->list, &queue->ent_w_req_queue); @@ -1238,6 +1260,8 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) if (unlikely(queue->stopped)) goto err_unlock; + set_bit(FR_URING, &req->flags); + req->ring_queue = queue; ent = list_first_entry_or_null(&queue->ent_avail_queue, struct fuse_ring_ent, list); if (ent) @@ -1276,6 +1300,8 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) return false; } + set_bit(FR_URING, &req->flags); + req->ring_queue = queue; list_add_tail(&req->list, &queue->fuse_req_bg_queue); ent = list_first_entry_or_null(&queue->ent_avail_queue, @@ -1306,6 +1332,13 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) return true; } +bool fuse_uring_remove_pending_req(struct fuse_req *req) +{ + struct fuse_ring_queue *queue = req->ring_queue; + + return fuse_remove_pending_req(req, &queue->lock); +} + static const struct fuse_iqueue_ops fuse_io_uring_ops = { /* should be send over io-uring as enhancement */ .send_forget = fuse_dev_queue_forget, diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 2102b3d0c1ae..51a563922ce1 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -142,6 +142,8 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring); int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req); bool fuse_uring_queue_bq_req(struct fuse_req *req); +bool fuse_uring_remove_pending_req(struct fuse_req *req); +bool fuse_uring_request_expired(struct fuse_conn *fc); static inline void fuse_uring_abort(struct fuse_conn *fc) { @@ -172,12 +174,6 @@ static inline bool fuse_uring_ready(struct fuse_conn *fc) #else /* CONFIG_FUSE_IO_URING */ -struct fuse_ring; - -static inline void fuse_uring_create(struct fuse_conn *fc) -{ -} - static inline void fuse_uring_destruct(struct fuse_conn *fc) { } @@ -200,6 +196,16 @@ static inline bool fuse_uring_ready(struct fuse_conn *fc) return false; } +static inline bool fuse_uring_remove_pending_req(struct fuse_req *req) +{ + return false; +} + +static inline bool fuse_uring_request_expired(struct fuse_conn *fc) +{ + return false; +} + #endif /* CONFIG_FUSE_IO_URING */ #endif /* _FS_FUSE_DEV_URING_I_H */ diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 85e4f894a59f..83ac192e7fdd 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -370,7 +370,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name 
*inode = NULL; err = -ENAMETOOLONG; - if (name->len > FUSE_NAME_MAX) + if (name->len > fm->fc->name_max) goto out; @@ -1137,6 +1137,9 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); + if (fm->fc->no_link) + goto out; + memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); args.opcode = FUSE_LINK; @@ -1151,6 +1154,12 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, else if (err == -EINTR) fuse_invalidate_attr(inode); + if (err == -ENOSYS) + fm->fc->no_link = 1; +out: + if (fm->fc->no_link) + return -EPERM; + return err; } diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 3b2bfe1248d3..b3c2e32254ba 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -61,6 +61,10 @@ int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *forget); void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req); +bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock); + +bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list); +bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing); #endif diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index fee96fe7887b..d56d4fd956db 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -38,14 +38,34 @@ /** Bias for fi->writectr, meaning new writepages must not be sent */ #define FUSE_NOWRITE INT_MIN -/** It could be as large as PATH_MAX, but would that have any uses? */ -#define FUSE_NAME_MAX 1024 +/** Maximum length of a filename, not including terminating null */ + +/* maximum, small enough for FUSE_MIN_READ_BUFFER*/ +#define FUSE_NAME_LOW_MAX 1024 +/* maximum, but needs a request buffer > FUSE_MIN_READ_BUFFER */ +#define FUSE_NAME_MAX (PATH_MAX - 1) /** Number of dentries for each connection in the control filesystem */ #define FUSE_CTL_NUM_DENTRIES 5 +/* Frequency (in seconds) of request timeout checks, if opted into */ +#define FUSE_TIMEOUT_TIMER_FREQ 15 + +/** Frequency (in jiffies) of request timeout checks, if opted into */ +extern const unsigned long fuse_timeout_timer_freq; + /** Maximum of max_pages received in init_out */ extern unsigned int fuse_max_pages_limit; +/* + * Default timeout (in seconds) for the server to reply to a request + * before the connection is aborted, if no timeout was specified on mount. + */ +extern unsigned int fuse_default_req_timeout; +/* + * Max timeout (in seconds) for the server to reply to a request before + * the connection is aborted. + */ +extern unsigned int fuse_max_req_timeout; /** List of active connections */ extern struct list_head fuse_conn_list; @@ -378,6 +398,7 @@ struct fuse_io_priv { * FR_FINISHED: request is finished * FR_PRIVATE: request is on private list * FR_ASYNC: request is asynchronous + * FR_URING: request is handled through fuse-io-uring */ enum fuse_req_flag { FR_ISREPLY, @@ -392,6 +413,7 @@ enum fuse_req_flag { FR_FINISHED, FR_PRIVATE, FR_ASYNC, + FR_URING, }; /** @@ -441,7 +463,10 @@ struct fuse_req { #ifdef CONFIG_FUSE_IO_URING void *ring_entry; + void *ring_queue; #endif + /** When (in jiffies) the request was created */ + unsigned long create_time; }; struct fuse_iqueue; @@ -867,6 +892,9 @@ struct fuse_conn { /* Use pages instead of pointer for kernel I/O */ unsigned int use_pages_for_kvec_io:1; + /* Is link not implemented by fs? 
*/ + unsigned int no_link:1; + /* Use io_uring for communication */ unsigned int io_uring; @@ -900,6 +928,9 @@ struct fuse_conn { /** Version counter for evict inode */ atomic64_t evict_ctr; + /* maximum file name length */ + u32 name_max; + /** Called on final put */ void (*release)(struct fuse_conn *); @@ -935,6 +966,15 @@ struct fuse_conn { /** uring connection information*/ struct fuse_ring *ring; #endif + + /** Only used if the connection opts into request timeouts */ + struct { + /* Worker for checking if any requests have timed out */ + struct delayed_work work; + + /* Request timeout (in jiffies). 0 = no timeout */ + unsigned int req_timeout; + } timeout; }; /* @@ -1216,6 +1256,9 @@ void fuse_request_end(struct fuse_req *req); void fuse_abort_conn(struct fuse_conn *fc); void fuse_wait_aborted(struct fuse_conn *fc); +/* Check if any requests timed out */ +void fuse_check_timeout(struct work_struct *work); + /** * Invalidate inode attributes */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e9db2cb8c150..fd48e8d37f2e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -37,6 +37,9 @@ DEFINE_MUTEX(fuse_mutex); static int set_global_limit(const char *val, const struct kernel_param *kp); unsigned int fuse_max_pages_limit = 256; +/* default is no timeout */ +unsigned int fuse_default_req_timeout; +unsigned int fuse_max_req_timeout; unsigned max_user_bgreq; module_param_call(max_user_bgreq, set_global_limit, param_get_uint, @@ -979,6 +982,8 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->user_ns = get_user_ns(user_ns); fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; fc->max_pages_limit = fuse_max_pages_limit; + fc->name_max = FUSE_NAME_LOW_MAX; + fc->timeout.req_timeout = 0; if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) fuse_backing_files_init(fc); @@ -1007,6 +1012,8 @@ void fuse_conn_put(struct fuse_conn *fc) if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_conn_free(fc); + if (fc->timeout.req_timeout) + cancel_delayed_work_sync(&fc->timeout.work); if (fiq->ops->release) fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); @@ -1257,6 +1264,34 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) spin_unlock(&fc->bg_lock); } +static void set_request_timeout(struct fuse_conn *fc, unsigned int timeout) +{ + fc->timeout.req_timeout = secs_to_jiffies(timeout); + INIT_DELAYED_WORK(&fc->timeout.work, fuse_check_timeout); + queue_delayed_work(system_wq, &fc->timeout.work, + fuse_timeout_timer_freq); +} + +static void init_server_timeout(struct fuse_conn *fc, unsigned int timeout) +{ + if (!timeout && !fuse_max_req_timeout && !fuse_default_req_timeout) + return; + + if (!timeout) + timeout = fuse_default_req_timeout; + + if (fuse_max_req_timeout) { + if (timeout) + timeout = min(fuse_max_req_timeout, timeout); + else + timeout = fuse_max_req_timeout; + } + + timeout = max(FUSE_TIMEOUT_TIMER_FREQ, timeout); + + set_request_timeout(fc, timeout); +} + struct fuse_init_args { struct fuse_args args; struct fuse_init_in in; @@ -1275,6 +1310,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, ok = false; else { unsigned long ra_pages; + unsigned int timeout = 0; process_init_limits(fc, arg); @@ -1338,6 +1374,13 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->max_pages = min_t(unsigned int, fc->max_pages_limit, max_t(unsigned int, arg->max_pages, 1)); + + /* + * PATH_MAX file names might need two pages for + * ops like rename + */ + if (fc->max_pages > 1) + fc->name_max = FUSE_NAME_MAX; } if 
(IS_ENABLED(CONFIG_FUSE_DAX)) { if (flags & FUSE_MAP_ALIGNMENT && @@ -1392,12 +1435,17 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, } if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled()) fc->io_uring = 1; + + if (flags & FUSE_REQUEST_TIMEOUT) + timeout = arg->request_timeout; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; fc->no_flock = 1; } + init_server_timeout(fc, timeout); + fm->sb->s_bdi->ra_pages = min(fm->sb->s_bdi->ra_pages, ra_pages); fc->minor = arg->minor; @@ -1439,7 +1487,8 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | - FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP; + FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP | + FUSE_REQUEST_TIMEOUT; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; diff --git a/fs/fuse/sysctl.c b/fs/fuse/sysctl.c index 63fb1e5bee30..e2d921abcb88 100644 --- a/fs/fuse/sysctl.c +++ b/fs/fuse/sysctl.c @@ -13,6 +13,12 @@ static struct ctl_table_header *fuse_table_header; /* Bound by fuse_init_out max_pages, which is a u16 */ static unsigned int sysctl_fuse_max_pages_limit = 65535; +/* + * fuse_init_out request timeouts are u16. + * This goes up to ~18 hours, which is plenty for a timeout. + */ +static unsigned int sysctl_fuse_req_timeout_limit = 65535; + static const struct ctl_table fuse_sysctl_table[] = { { .procname = "max_pages_limit", @@ -23,6 +29,24 @@ static const struct ctl_table fuse_sysctl_table[] = { .extra1 = SYSCTL_ONE, .extra2 = &sysctl_fuse_max_pages_limit, }, + { + .procname = "default_request_timeout", + .data = &fuse_default_req_timeout, + .maxlen = sizeof(fuse_default_req_timeout), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &sysctl_fuse_req_timeout_limit, + }, + { + .procname = "max_request_timeout", + .data = &fuse_max_req_timeout, + .maxlen = sizeof(fuse_max_req_timeout), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &sysctl_fuse_req_timeout_limit, + }, }; int fuse_sysctl_register(void) diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 8b39c15c408c..15b2f094d36e 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -60,7 +60,7 @@ struct hostfs_stat { unsigned int uid; unsigned int gid; unsigned long long size; - struct hostfs_timespec atime, mtime, ctime; + struct hostfs_timespec atime, mtime, ctime, btime; unsigned int blksize; unsigned long long blocks; struct { diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index a2c6b9051c5b..702c41317589 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -33,6 +33,7 @@ struct hostfs_inode_info { struct inode vfs_inode; struct mutex open_mutex; dev_t dev; + struct hostfs_timespec btime; }; static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) @@ -547,6 +548,7 @@ static int hostfs_inode_set(struct inode *ino, void *data) } HOSTFS_I(ino)->dev = dev; + HOSTFS_I(ino)->btime = st->btime; ino->i_ino = st->ino; ino->i_mode = st->mode; return hostfs_inode_update(ino, st); @@ -557,7 +559,10 @@ static int hostfs_inode_test(struct inode *inode, void *data) const struct hostfs_stat *st = data; dev_t dev = MKDEV(st->dev.maj, st->dev.min); - return inode->i_ino == st->ino && HOSTFS_I(inode)->dev == dev; + return inode->i_ino == st->ino && HOSTFS_I(inode)->dev == dev && + (inode->i_mode & S_IFMT) == 
(st->mode & S_IFMT) && + HOSTFS_I(inode)->btime.tv_sec == st->btime.tv_sec && + HOSTFS_I(inode)->btime.tv_nsec == st->btime.tv_nsec; } static struct inode *hostfs_iget(struct super_block *sb, char *name) diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 97e9c40a9448..3bcd9f35e70b 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -18,39 +18,48 @@ #include "hostfs.h" #include <utime.h> -static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p) +static void statx_to_hostfs(const struct statx *buf, struct hostfs_stat *p) { - p->ino = buf->st_ino; - p->mode = buf->st_mode; - p->nlink = buf->st_nlink; - p->uid = buf->st_uid; - p->gid = buf->st_gid; - p->size = buf->st_size; - p->atime.tv_sec = buf->st_atime; - p->atime.tv_nsec = 0; - p->ctime.tv_sec = buf->st_ctime; - p->ctime.tv_nsec = 0; - p->mtime.tv_sec = buf->st_mtime; - p->mtime.tv_nsec = 0; - p->blksize = buf->st_blksize; - p->blocks = buf->st_blocks; - p->rdev.maj = os_major(buf->st_rdev); - p->rdev.min = os_minor(buf->st_rdev); - p->dev.maj = os_major(buf->st_dev); - p->dev.min = os_minor(buf->st_dev); + p->ino = buf->stx_ino; + p->mode = buf->stx_mode; + p->nlink = buf->stx_nlink; + p->uid = buf->stx_uid; + p->gid = buf->stx_gid; + p->size = buf->stx_size; + p->atime.tv_sec = buf->stx_atime.tv_sec; + p->atime.tv_nsec = buf->stx_atime.tv_nsec; + p->ctime.tv_sec = buf->stx_ctime.tv_sec; + p->ctime.tv_nsec = buf->stx_ctime.tv_nsec; + p->mtime.tv_sec = buf->stx_mtime.tv_sec; + p->mtime.tv_nsec = buf->stx_mtime.tv_nsec; + if (buf->stx_mask & STATX_BTIME) { + p->btime.tv_sec = buf->stx_btime.tv_sec; + p->btime.tv_nsec = buf->stx_btime.tv_nsec; + } else { + memset(&p->btime, 0, sizeof(p->btime)); + } + p->blksize = buf->stx_blksize; + p->blocks = buf->stx_blocks; + p->rdev.maj = buf->stx_rdev_major; + p->rdev.min = buf->stx_rdev_minor; + p->dev.maj = buf->stx_dev_major; + p->dev.min = buf->stx_dev_minor; } int stat_file(const char *path, struct hostfs_stat *p, int fd) { - struct stat64 buf; + struct statx buf; + int flags = AT_SYMLINK_NOFOLLOW; if (fd >= 0) { - if (fstat64(fd, &buf) < 0) - return -errno; - } else if (lstat64(path, &buf) < 0) { - return -errno; + flags |= AT_EMPTY_PATH; + path = ""; } - stat64_to_hostfs(&buf, p); + + if ((statx(fd, path, flags, STATX_BASIC_STATS | STATX_BTIME, &buf)) < 0) + return -errno; + + statx_to_hostfs(&buf, p); return 0; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index a5ccba25ff47..743a1d7633cd 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -197,7 +197,7 @@ loop: if (journal->j_commit_sequence != journal->j_commit_request) { jbd2_debug(1, "OK, requests differ\n"); write_unlock(&journal->j_state_lock); - del_timer_sync(&journal->j_commit_timer); + timer_delete_sync(&journal->j_commit_timer); jbd2_journal_commit_transaction(journal); write_lock(&journal->j_state_lock); goto loop; @@ -246,7 +246,7 @@ loop: goto loop; end_loop: - del_timer_sync(&journal->j_commit_timer); + timer_delete_sync(&journal->j_commit_timer); journal->j_task = NULL; wake_up(&journal->j_wait_done_commit); jbd2_debug(1, "Journal thread exiting.\n"); diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 4061e0ba7010..bb815a002984 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -584,7 +584,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) size_t retlen; /* Nothing to do if not write-buffering the flash. In particular, we shouldn't - del_timer() the timer we never initialised. 
*/ + call timer_delete() on the timer we never initialised. */ if (!jffs2_is_writebuffered(c)) return 0; diff --git a/fs/namespace.c b/fs/namespace.c index 6100e5b962a6..14935a0500a2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2478,7 +2478,8 @@ struct vfsmount *clone_private_mount(const struct path *path) struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; - scoped_guard(rwsem_read, &namespace_sem) + guard(rwsem_read)(&namespace_sem); + if (IS_MNT_UNBINDABLE(old_mnt)) return ERR_PTR(-EINVAL); @@ -5326,8 +5327,10 @@ struct kstatmount { struct mnt_idmap *idmap; u64 mask; struct path root; - struct statmount sm; struct seq_file seq; + + /* Must be last --ends in a flexible-array member. */ + struct statmount sm; }; static u64 mnt_to_attr_flags(struct vfsmount *mnt) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 3b0918ade53c..02c916a55020 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -546,6 +546,8 @@ int nfs_create_rpc_client(struct nfs_client *clp, args.flags |= RPC_CLNT_CREATE_NOPING; if (test_bit(NFS_CS_REUSEPORT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_REUSEPORT; + if (test_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_NETUNREACH_FATAL; if (!IS_ERR(clp->cl_rpcclient)) return 0; @@ -709,6 +711,9 @@ static int nfs_init_server(struct nfs_server *server, if (ctx->flags & NFS_MOUNT_NORESVPORT) set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + if (ctx->flags & NFS_MOUNT_NETUNREACH_FATAL) + __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags); + /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); if (IS_ERR(clp)) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 4db912f56230..8bdbc4dca89c 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -79,6 +79,7 @@ static void nfs_mark_return_delegation(struct nfs_server *server, struct nfs_delegation *delegation) { set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + set_bit(NFS4SERV_DELEGRETURN, &server->delegation_flags); set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); } @@ -306,7 +307,8 @@ nfs_start_delegation_return_locked(struct nfs_inode *nfsi) if (delegation == NULL) goto out; spin_lock(&delegation->lock); - if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + if (delegation->inode && + !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { clear_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags); /* Refcount matched in nfs_end_delegation_return() */ ret = nfs_get_delegation(delegation); @@ -330,14 +332,16 @@ nfs_start_delegation_return(struct nfs_inode *nfsi) } static void nfs_abort_delegation_return(struct nfs_delegation *delegation, - struct nfs_client *clp, int err) + struct nfs_server *server, int err) { - spin_lock(&delegation->lock); clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); if (err == -EAGAIN) { set_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags); - set_bit(NFS4CLNT_DELEGRETURN_DELAYED, &clp->cl_state); + set_bit(NFS4SERV_DELEGRETURN_DELAYED, + &server->delegation_flags); + set_bit(NFS4CLNT_DELEGRETURN_DELAYED, + &server->nfs_client->cl_state); } spin_unlock(&delegation->lock); } @@ -547,7 +551,7 @@ out: */ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_server *server = NFS_SERVER(inode); unsigned int mode = O_WRONLY | O_RDWR; int err = 0; @@ -569,11 +573,11 @@ static int nfs_end_delegation_return(struct 
inode *inode, struct nfs_delegation /* * Guard against state recovery */ - err = nfs4_wait_clnt_recover(clp); + err = nfs4_wait_clnt_recover(server->nfs_client); } if (err) { - nfs_abort_delegation_return(delegation, clp, err); + nfs_abort_delegation_return(delegation, server, err); goto out; } @@ -590,17 +594,6 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) ret = true; - else if (test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags)) { - struct inode *inode; - - spin_lock(&delegation->lock); - inode = delegation->inode; - if (inode && list_empty(&NFS_I(inode)->open_files)) - ret = true; - spin_unlock(&delegation->lock); - } - if (ret) - clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags) || test_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags) || test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) @@ -619,6 +612,9 @@ static int nfs_server_return_marked_delegations(struct nfs_server *server, struct nfs_delegation *place_holder_deleg = NULL; int err = 0; + if (!test_and_clear_bit(NFS4SERV_DELEGRETURN, + &server->delegation_flags)) + return 0; restart: /* * To avoid quadratic looping we hold a reference @@ -670,6 +666,7 @@ restart: cond_resched(); if (!err) goto restart; + set_bit(NFS4SERV_DELEGRETURN, &server->delegation_flags); set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); goto out; } @@ -684,6 +681,9 @@ static bool nfs_server_clear_delayed_delegations(struct nfs_server *server) struct nfs_delegation *d; bool ret = false; + if (!test_and_clear_bit(NFS4SERV_DELEGRETURN_DELAYED, + &server->delegation_flags)) + goto out; list_for_each_entry_rcu (d, &server->delegations, super_list) { if (!test_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags)) continue; @@ -691,6 +691,7 @@ static bool nfs_server_clear_delayed_delegations(struct nfs_server *server) clear_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags); ret = true; } +out: return ret; } @@ -878,11 +879,25 @@ int nfs4_inode_make_writeable(struct inode *inode) return nfs4_inode_return_delegation(inode); } -static void nfs_mark_return_if_closed_delegation(struct nfs_server *server, - struct nfs_delegation *delegation) +static void +nfs_mark_return_if_closed_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) { - set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); - set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); + struct inode *inode; + + if (test_bit(NFS_DELEGATION_RETURN, &delegation->flags) || + test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags)) + return; + spin_lock(&delegation->lock); + inode = delegation->inode; + if (!inode) + goto out; + if (list_empty(&NFS_I(inode)->open_files)) + nfs_mark_return_delegation(server, delegation); + else + set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); +out: + spin_unlock(&delegation->lock); } static bool nfs_server_mark_return_all_delegations(struct nfs_server *server) @@ -1276,6 +1291,7 @@ static void nfs_mark_test_expired_delegation(struct nfs_server *server, return; clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); set_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags); + set_bit(NFS4SERV_DELEGATION_EXPIRED, &server->delegation_flags); set_bit(NFS4CLNT_DELEGATION_EXPIRED, &server->nfs_client->cl_state); } @@ -1354,6 +1370,9 @@ static int nfs_server_reap_expired_delegations(struct nfs_server *server, nfs4_stateid stateid; unsigned 
long gen = ++server->delegation_gen; + if (!test_and_clear_bit(NFS4SERV_DELEGATION_EXPIRED, + &server->delegation_flags)) + return 0; restart: rcu_read_lock(); list_for_each_entry_rcu(delegation, &server->delegations, super_list) { @@ -1383,6 +1402,9 @@ restart: goto restart; } nfs_inode_mark_test_expired_delegation(server,inode); + set_bit(NFS4SERV_DELEGATION_EXPIRED, &server->delegation_flags); + set_bit(NFS4CLNT_DELEGATION_EXPIRED, + &server->nfs_client->cl_state); iput(inode); return -EAGAIN; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index bc957487f6ec..bd23fc736b39 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -666,6 +666,8 @@ static bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx, { if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) return false; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_FORCE_RDIRPLUS) + return true; if (ctx->pos == 0 || cache_hits + cache_misses > NFS_READDIR_CACHE_USAGE_THRESHOLD) return true; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 98b45b636be3..61ad269c825f 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1154,10 +1154,14 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, rpc_wake_up(&tbl->slot_tbl_waitq); goto reset; /* RPC connection errors */ + case -ENETDOWN: + case -ENETUNREACH: + if (test_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags)) + return -NFS4ERR_FATAL_IOERROR; + fallthrough; case -ECONNREFUSED: case -EHOSTDOWN: case -EHOSTUNREACH: - case -ENETUNREACH: case -EIO: case -ETIMEDOUT: case -EPIPE: @@ -1183,6 +1187,7 @@ reset: /* Retry all errors through either pNFS or MDS except for -EJUKEBOX */ static int ff_layout_async_handle_error_v3(struct rpc_task *task, + struct nfs_client *clp, struct pnfs_layout_segment *lseg, u32 idx) { @@ -1200,6 +1205,11 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, case -EJUKEBOX: nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); goto out_retry; + case -ENETDOWN: + case -ENETUNREACH: + if (test_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags)) + return -NFS4ERR_FATAL_IOERROR; + fallthrough; default: dprintk("%s DS connection error %d\n", __func__, task->tk_status); @@ -1234,7 +1244,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task, switch (vers) { case 3: - return ff_layout_async_handle_error_v3(task, lseg, idx); + return ff_layout_async_handle_error_v3(task, clp, lseg, idx); case 4: return ff_layout_async_handle_error_v4(task, state, clp, lseg, idx); @@ -1264,6 +1274,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, case -ECONNRESET: case -EHOSTDOWN: case -EHOSTUNREACH: + case -ENETDOWN: case -ENETUNREACH: case -EADDRINUSE: case -ENOBUFS: @@ -1337,6 +1348,9 @@ static int ff_layout_read_done_cb(struct rpc_task *task, return task->tk_status; case -EAGAIN: goto out_eagain; + case -NFS4ERR_FATAL_IOERROR: + task->tk_status = -EIO; + return 0; } return 0; @@ -1507,6 +1521,9 @@ static int ff_layout_write_done_cb(struct rpc_task *task, return task->tk_status; case -EAGAIN: return -EAGAIN; + case -NFS4ERR_FATAL_IOERROR: + task->tk_status = -EIO; + return 0; } if (hdr->res.verf->committed == NFS_FILE_SYNC || @@ -1551,6 +1568,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, case -EAGAIN: rpc_restart_call_prepare(task); return -EAGAIN; + case -NFS4ERR_FATAL_IOERROR: + task->tk_status = -EIO; + return 0; } ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb); diff --git a/fs/nfs/fs_context.c 
b/fs/nfs/fs_context.c index b069385eea17..13f71ca8c974 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -50,6 +50,7 @@ enum nfs_param { Opt_clientaddr, Opt_cto, Opt_alignwrite, + Opt_fatal_neterrors, Opt_fg, Opt_fscache, Opt_fscache_flag, @@ -72,6 +73,8 @@ enum nfs_param { Opt_posix, Opt_proto, Opt_rdirplus, + Opt_rdirplus_none, + Opt_rdirplus_force, Opt_rdma, Opt_resvport, Opt_retrans, @@ -96,6 +99,20 @@ enum nfs_param { }; enum { + Opt_fatal_neterrors_default, + Opt_fatal_neterrors_enetunreach, + Opt_fatal_neterrors_none, +}; + +static const struct constant_table nfs_param_enums_fatal_neterrors[] = { + { "default", Opt_fatal_neterrors_default }, + { "ENETDOWN:ENETUNREACH", Opt_fatal_neterrors_enetunreach }, + { "ENETUNREACH:ENETDOWN", Opt_fatal_neterrors_enetunreach }, + { "none", Opt_fatal_neterrors_none }, + {} +}; + +enum { Opt_local_lock_all, Opt_local_lock_flock, Opt_local_lock_none, @@ -151,6 +168,8 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_string("clientaddr", Opt_clientaddr), fsparam_flag_no("cto", Opt_cto), fsparam_flag_no("alignwrite", Opt_alignwrite), + fsparam_enum("fatal_neterrors", Opt_fatal_neterrors, + nfs_param_enums_fatal_neterrors), fsparam_flag ("fg", Opt_fg), fsparam_flag_no("fsc", Opt_fscache_flag), fsparam_string("fsc", Opt_fscache), @@ -174,7 +193,8 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_u32 ("port", Opt_port), fsparam_flag_no("posix", Opt_posix), fsparam_string("proto", Opt_proto), - fsparam_flag_no("rdirplus", Opt_rdirplus), + fsparam_flag_no("rdirplus", Opt_rdirplus), // rdirplus|nordirplus + fsparam_string("rdirplus", Opt_rdirplus), // rdirplus=... fsparam_flag ("rdma", Opt_rdma), fsparam_flag_no("resvport", Opt_resvport), fsparam_u32 ("retrans", Opt_retrans), @@ -288,6 +308,12 @@ static const struct constant_table nfs_xprtsec_policies[] = { {} }; +static const struct constant_table nfs_rdirplus_tokens[] = { + { "none", Opt_rdirplus_none }, + { "force", Opt_rdirplus_force }, + {} +}; + /* * Sanity-check a server address provided by the mount command. 
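
A quick map of the new rdirplus spellings may help here; this is a sketch, not patch text, inferred from the Opt_rdirplus case in the next hunk and from nfs_use_readdirplus() in the fs/nfs/dir.c hunk above:

/*
 * Assumed flag outcomes per option form (sketch only):
 *
 *   rdirplus        -> NORDIRPLUS=0, FORCE_RDIRPLUS=0   heuristic READDIRPLUS (default)
 *   nordirplus      -> NORDIRPLUS=1, FORCE_RDIRPLUS=0   never send READDIRPLUS
 *   rdirplus=none   -> NORDIRPLUS=1, FORCE_RDIRPLUS=0   same as nordirplus
 *   rdirplus=force  -> NORDIRPLUS=0, FORCE_RDIRPLUS=1   always send READDIRPLUS
 */
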
* @@ -636,10 +662,25 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, ctx->flags &= ~NFS_MOUNT_NOACL; break; case Opt_rdirplus: - if (result.negated) + if (result.negated) { + ctx->flags &= ~NFS_MOUNT_FORCE_RDIRPLUS; ctx->flags |= NFS_MOUNT_NORDIRPLUS; - else - ctx->flags &= ~NFS_MOUNT_NORDIRPLUS; + } else if (!param->string) { + ctx->flags &= ~(NFS_MOUNT_NORDIRPLUS | NFS_MOUNT_FORCE_RDIRPLUS); + } else { + switch (lookup_constant(nfs_rdirplus_tokens, param->string, -1)) { + case Opt_rdirplus_none: + ctx->flags &= ~NFS_MOUNT_FORCE_RDIRPLUS; + ctx->flags |= NFS_MOUNT_NORDIRPLUS; + break; + case Opt_rdirplus_force: + ctx->flags &= ~NFS_MOUNT_NORDIRPLUS; + ctx->flags |= NFS_MOUNT_FORCE_RDIRPLUS; + break; + default: + goto out_invalid_value; + } + } break; case Opt_sharecache: if (result.negated) @@ -872,6 +913,25 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, goto out_of_bounds; ctx->nfs_server.max_connect = result.uint_32; break; + case Opt_fatal_neterrors: + trace_nfs_mount_assign(param->key, param->string); + switch (result.uint_32) { + case Opt_fatal_neterrors_default: + if (fc->net_ns != &init_net) + ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL; + else + ctx->flags &= ~NFS_MOUNT_NETUNREACH_FATAL; + break; + case Opt_fatal_neterrors_enetunreach: + ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL; + break; + case Opt_fatal_neterrors_none: + ctx->flags &= ~NFS_MOUNT_NETUNREACH_FATAL; + break; + default: + goto out_invalid_value; + } + break; case Opt_lookupcache: trace_nfs_mount_assign(param->key, param->string); switch (result.uint_32) { @@ -1651,6 +1711,9 @@ static int nfs_init_fs_context(struct fs_context *fc) ctx->xprtsec.cert_serial = TLS_NO_CERT; ctx->xprtsec.privkey_serial = TLS_NO_PRIVKEY; + if (fc->net_ns != &init_net) + ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL; + fc->s_iflags |= SB_I_STABLE_WRITES; } fc->fs_private = ctx; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 1aa67fca69b2..119e447758b9 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -74,6 +74,8 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) { + if (unlikely(nfs_current_task_exiting())) + return -EINTR; schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 1ac1d3eec517..ec8d32d0e2e9 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -912,6 +912,11 @@ static inline u32 nfs_stateid_hash(nfs4_stateid *stateid) } #endif +static inline bool nfs_current_task_exiting(void) +{ + return (current->flags & PF_EXITING) != 0; +} + static inline bool nfs_error_is_fatal(int err) { switch (err) { diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index b0c8a39c2bbd..0d7310c1ee0c 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -120,6 +120,8 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, if (mds_srv->flags & NFS_MOUNT_NORESVPORT) __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + if (test_bit(NFS_CS_NETUNREACH_FATAL, &mds_clp->cl_flags)) + __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags); __set_bit(NFS_CS_DS, &cl_init.init_flags); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 755ed3c37051..a4cb67573aa7 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -39,7 +39,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); schedule_timeout(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; - } while (!fatal_signal_pending(current)); + 
} while (!fatal_signal_pending(current) && !nfs_current_task_exiting()); return res; } diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 1924c4a2077b..5cf52ece96ac 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -21,6 +21,8 @@ #define NFSDBG_FACILITY NFSDBG_PROC static int nfs42_do_offload_cancel_async(struct file *dst, nfs4_stateid *std); +static int nfs42_proc_offload_status(struct file *file, nfs4_stateid *stateid, + u64 *copied); static void nfs42_set_netaddr(struct file *filep, struct nfs42_netaddr *naddr) { @@ -173,6 +175,20 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) return err; } +static void nfs4_copy_dequeue_callback(struct nfs_server *dst_server, + struct nfs_server *src_server, + struct nfs4_copy_state *copy) +{ + spin_lock(&dst_server->nfs_client->cl_lock); + list_del_init(©->copies); + spin_unlock(&dst_server->nfs_client->cl_lock); + if (dst_server != src_server) { + spin_lock(&src_server->nfs_client->cl_lock); + list_del_init(©->src_copies); + spin_unlock(&src_server->nfs_client->cl_lock); + } +} + static int handle_async_copy(struct nfs42_copy_res *res, struct nfs_server *dst_server, struct nfs_server *src_server, @@ -182,9 +198,12 @@ static int handle_async_copy(struct nfs42_copy_res *res, bool *restart) { struct nfs4_copy_state *copy, *tmp_copy = NULL, *iter; - int status = NFS4_OK; struct nfs_open_context *dst_ctx = nfs_file_open_context(dst); struct nfs_open_context *src_ctx = nfs_file_open_context(src); + struct nfs_client *clp = dst_server->nfs_client; + unsigned long timeout = 3 * HZ; + int status = NFS4_OK; + u64 copied; copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL); if (!copy) @@ -222,15 +241,12 @@ static int handle_async_copy(struct nfs42_copy_res *res, spin_unlock(&src_server->nfs_client->cl_lock); } - status = wait_for_completion_interruptible(©->completion); - spin_lock(&dst_server->nfs_client->cl_lock); - list_del_init(©->copies); - spin_unlock(&dst_server->nfs_client->cl_lock); - if (dst_server != src_server) { - spin_lock(&src_server->nfs_client->cl_lock); - list_del_init(©->src_copies); - spin_unlock(&src_server->nfs_client->cl_lock); - } +wait: + status = wait_for_completion_interruptible_timeout(©->completion, + timeout); + if (!status) + goto timeout; + nfs4_copy_dequeue_callback(dst_server, src_server, copy); if (status == -ERESTARTSYS) { goto out_cancel; } else if (copy->flags || copy->error == NFS4ERR_PARTNER_NO_AUTH) { @@ -240,6 +256,7 @@ static int handle_async_copy(struct nfs42_copy_res *res, } out: res->write_res.count = copy->count; + /* Copy out the updated write verifier provided by CB_OFFLOAD. */ memcpy(&res->write_res.verifier, ©->verf, sizeof(copy->verf)); status = -copy->error; @@ -251,6 +268,39 @@ out_cancel: if (!nfs42_files_from_same_server(src, dst)) nfs42_do_offload_cancel_async(src, src_stateid); goto out_free; +timeout: + timeout <<= 1; + if (timeout > (clp->cl_lease_time >> 1)) + timeout = clp->cl_lease_time >> 1; + status = nfs42_proc_offload_status(dst, ©->stateid, &copied); + if (status == -EINPROGRESS) + goto wait; + nfs4_copy_dequeue_callback(dst_server, src_server, copy); + switch (status) { + case 0: + /* The server recognized the copy stateid, so it hasn't + * rebooted. Don't overwrite the verifier returned in the + * COPY result. */ + res->write_res.count = copied; + goto out_free; + case -EREMOTEIO: + /* COPY operation failed on the server. 
*/ + status = -EOPNOTSUPP; + res->write_res.count = copied; + goto out_free; + case -EBADF: + /* Server did not recognize the copy stateid. It has + * probably restarted and lost the plot. */ + res->write_res.count = 0; + status = -EOPNOTSUPP; + break; + case -EOPNOTSUPP: + /* RFC 7862 REQUIREs server to support OFFLOAD_STATUS when + * it has signed up for an async COPY, so server is not + * spec-compliant. */ + res->write_res.count = 0; + } + goto out_free; } static int process_copy_commit(struct file *dst, loff_t pos_dst, @@ -582,6 +632,108 @@ static int nfs42_do_offload_cancel_async(struct file *dst, return status; } +static int +_nfs42_proc_offload_status(struct nfs_server *server, struct file *file, + struct nfs42_offload_data *data) +{ + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OFFLOAD_STATUS], + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = ctx->cred, + }; + int status; + + status = nfs4_call_sync(server->client, server, &msg, + &data->args.osa_seq_args, + &data->res.osr_seq_res, 1); + trace_nfs4_offload_status(&data->args, status); + switch (status) { + case 0: + break; + + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: + /* + * Server does not recognize the COPY stateid. CB_OFFLOAD + * could have purged it, or server might have rebooted. + * Since COPY stateids don't have an associated inode, + * avoid triggering state recovery. + */ + status = -EBADF; + break; + case -NFS4ERR_NOTSUPP: + case -ENOTSUPP: + case -EOPNOTSUPP: + server->caps &= ~NFS_CAP_OFFLOAD_STATUS; + status = -EOPNOTSUPP; + break; + } + + return status; +} + +/** + * nfs42_proc_offload_status - Poll completion status of an async copy operation + * @dst: handle of file being copied into + * @stateid: copy stateid (from async COPY result) + * @copied: OUT: number of bytes copied so far + * + * Return values: + * %0: Server returned an NFS4_OK completion status + * %-EINPROGRESS: Server returned no completion status + * %-EREMOTEIO: Server returned an error completion status + * %-EBADF: Server did not recognize the copy stateid + * %-EOPNOTSUPP: Server does not support OFFLOAD_STATUS + * %-ERESTARTSYS: Wait interrupted by signal + * + * Other negative errnos indicate the client could not complete the + * request. 
+ */ +static int +nfs42_proc_offload_status(struct file *dst, nfs4_stateid *stateid, u64 *copied) +{ + struct inode *inode = file_inode(dst); + struct nfs_server *server = NFS_SERVER(inode); + struct nfs4_exception exception = { + .inode = inode, + }; + struct nfs42_offload_data *data; + int status; + + if (!(server->caps & NFS_CAP_OFFLOAD_STATUS)) + return -EOPNOTSUPP; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + data->seq_server = server; + data->args.osa_src_fh = NFS_FH(inode); + memcpy(&data->args.osa_stateid, stateid, + sizeof(data->args.osa_stateid)); + exception.stateid = &data->args.osa_stateid; + do { + status = _nfs42_proc_offload_status(server, dst, data); + if (status == -EOPNOTSUPP) + goto out; + status = nfs4_handle_exception(server, status, &exception); + } while (exception.retry); + if (status) + goto out; + + *copied = data->res.osr_count; + if (!data->res.complete_count) + status = -EINPROGRESS; + else if (data->res.osr_complete != NFS_OK) + status = -EREMOTEIO; + +out: + kfree(data); + return status; +} + static int _nfs42_proc_copy_notify(struct file *src, struct file *dst, struct nfs42_copy_notify_args *args, struct nfs42_copy_notify_res *res) diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 5072d7ea72e9..b1b663468249 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -35,6 +35,11 @@ #define encode_offload_cancel_maxsz (op_encode_hdr_maxsz + \ XDR_QUADLEN(NFS4_STATEID_SIZE)) #define decode_offload_cancel_maxsz (op_decode_hdr_maxsz) +#define encode_offload_status_maxsz (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_offload_status_maxsz (op_decode_hdr_maxsz + \ + 2 /* osr_count */ + \ + 2 /* osr_complete */) #define encode_copy_notify_maxsz (op_encode_hdr_maxsz + \ XDR_QUADLEN(NFS4_STATEID_SIZE) + \ 1 + /* nl4_type */ \ @@ -143,6 +148,14 @@ decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_offload_cancel_maxsz) +#define NFS4_enc_offload_status_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_offload_status_maxsz) +#define NFS4_dec_offload_status_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_offload_status_maxsz) #define NFS4_enc_copy_notify_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -345,6 +358,14 @@ static void encode_offload_cancel(struct xdr_stream *xdr, encode_nfs4_stateid(xdr, &args->osa_stateid); } +static void encode_offload_status(struct xdr_stream *xdr, + const struct nfs42_offload_status_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_OFFLOAD_STATUS, decode_offload_status_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->osa_stateid); +} + static void encode_copy_notify(struct xdr_stream *xdr, const struct nfs42_copy_notify_args *args, struct compound_hdr *hdr) @@ -570,6 +591,25 @@ static void nfs4_xdr_enc_offload_cancel(struct rpc_rqst *req, } /* + * Encode OFFLOAD_STATUS request + */ +static void nfs4_xdr_enc_offload_status(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_offload_status_args *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->osa_seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->osa_seq_args, &hdr); + encode_putfh(xdr, args->osa_src_fh, &hdr); + encode_offload_status(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* * Encode COPY_NOTIFY request */ static void nfs4_xdr_enc_copy_notify(struct 
rpc_rqst *req, @@ -921,6 +961,26 @@ static int decode_offload_cancel(struct xdr_stream *xdr, return decode_op_hdr(xdr, OP_OFFLOAD_CANCEL); } +static int decode_offload_status(struct xdr_stream *xdr, + struct nfs42_offload_status_res *res) +{ + ssize_t result; + int status; + + status = decode_op_hdr(xdr, OP_OFFLOAD_STATUS); + if (status) + return status; + /* osr_count */ + if (xdr_stream_decode_u64(xdr, &res->osr_count) < 0) + return -EIO; + /* osr_complete<1> */ + result = xdr_stream_decode_uint32_array(xdr, &res->osr_complete, 1); + if (result < 0) + return -EIO; + res->complete_count = result; + return 0; +} + static int decode_copy_notify(struct xdr_stream *xdr, struct nfs42_copy_notify_res *res) { @@ -1371,6 +1431,32 @@ out: } /* + * Decode OFFLOAD_STATUS response + */ +static int nfs4_xdr_dec_offload_status(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfs42_offload_status_res *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->osr_seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_offload_status(xdr, res); + +out: + return status; +} + +/* * Decode COPY_NOTIFY response */ static int nfs4_xdr_dec_copy_notify(struct rpc_rqst *rqstp, struct xdr_stream *xdr, diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 83378f69b35e..162c85a83a14 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -233,6 +233,8 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); if (test_bit(NFS_CS_PNFS, &cl_init->init_flags)) __set_bit(NFS_CS_PNFS, &clp->cl_flags); + if (test_bit(NFS_CS_NETUNREACH_FATAL, &cl_init->init_flags)) + __set_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags); /* * Set up the connection to the server before we add it to the * global list. 
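
For orientation, a sketch (not patch text) of how the fatal_neterrors plumbing fits together; the mount syntax comes from the fs/nfs/fs_context.c hunk earlier, the rest from the hunks above:

/*
 * Assumed example mount:
 *   mount -t nfs4 -o fatal_neterrors=ENETDOWN:ENETUNREACH srv:/export /mnt
 * (fatal_neterrors=default enables the bit automatically for mounts in a
 * non-init network namespace.)
 *
 * Propagation path, all steps visible in this series:
 *   ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL;           fs_context.c parsing
 *   -> NFS_CS_NETUNREACH_FATAL in cl_init.init_flags    nfs4_set_client()
 *   -> NFS_CS_NETUNREACH_FATAL in clp->cl_flags         nfs4_alloc_client() above
 *   -> copied again for data-server clients             nfs3/nfs4_set_ds_client()
 *
 * With the bit set, -ENETDOWN/-ENETUNREACH in the flexfiles error
 * handlers become -NFS4ERR_FATAL_IOERROR, which the read/write/commit
 * done callbacks turn into -EIO rather than retrying.
 */
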
@@ -937,6 +939,9 @@ static int nfs4_set_client(struct nfs_server *server, __set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags); server->port = rpc_get_port((struct sockaddr *)addr); + if (server->flags & NFS_MOUNT_NETUNREACH_FATAL) + __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags); + /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); if (IS_ERR(clp)) @@ -1011,6 +1016,8 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, if (mds_srv->flags & NFS_MOUNT_NORESVPORT) __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + if (test_bit(NFS_CS_NETUNREACH_FATAL, &mds_clp->cl_flags)) + __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags); __set_bit(NFS_CS_PNFS, &cl_init.init_flags); cl_init.max_connect = NFS_MAX_TRANSPORTS; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 70c8ea943019..970f28dbf253 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -195,6 +195,9 @@ static int nfs4_map_errors(int err) return -EBUSY; case -NFS4ERR_NOT_SAME: return -ENOTSYNC; + case -ENETDOWN: + case -ENETUNREACH: + break; default: dprintk("%s could not handle NFSv4 error %d\n", __func__, -err); @@ -443,6 +446,8 @@ static int nfs4_delay_killable(long *timeout) { might_sleep(); + if (unlikely(nfs_current_task_exiting())) + return -EINTR; __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); schedule_timeout(nfs4_update_delay(timeout)); if (!__fatal_signal_pending(current)) @@ -454,6 +459,8 @@ static int nfs4_delay_interruptible(long *timeout) { might_sleep(); + if (unlikely(nfs_current_task_exiting())) + return -EINTR; __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE); schedule_timeout(nfs4_update_delay(timeout)); if (!signal_pending(current)) @@ -1774,7 +1781,8 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, rcu_read_unlock(); trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0); - if (!fatal_signal_pending(current)) { + if (!fatal_signal_pending(current) && + !nfs_current_task_exiting()) { if (schedule_timeout(5*HZ) == 0) status = -EAGAIN; else @@ -3576,7 +3584,7 @@ static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst, write_sequnlock(&state->seqlock); trace_nfs4_close_stateid_update_wait(state->inode, dst, 0); - if (fatal_signal_pending(current)) + if (fatal_signal_pending(current) || nfs_current_task_exiting()) status = -EINTR; else if (schedule_timeout(5*HZ) != 0) @@ -9594,7 +9602,7 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data) return; trace_nfs4_sequence(clp, task->tk_status); - if (task->tk_status < 0 && !task->tk_client->cl_shutdown) { + if (task->tk_status < 0 && clp->cl_cons_state >= 0) { dprintk("%s ERROR %d\n", __func__, task->tk_status); if (refcount_read(&clp->cl_count) == 1) return; @@ -10798,7 +10806,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_CLONE | NFS_CAP_LAYOUTERROR | NFS_CAP_READ_PLUS - | NFS_CAP_MOVEABLE, + | NFS_CAP_MOVEABLE + | NFS_CAP_OFFLOAD_STATUS, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 542cdf71229f..7612e977e80b 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1198,7 +1198,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) struct rpc_clnt *clnt = clp->cl_rpcclient; bool swapon = false; - if (clnt->cl_shutdown) + if (clp->cl_cons_state < 0) return; set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); @@ -1403,7 +1403,7 @@ int 
nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_ dprintk("%s: scheduling stateid recovery for server %s\n", __func__, clp->cl_hostname); nfs4_schedule_state_manager(clp); - return 0; + return clp->cl_cons_state < 0 ? clp->cl_cons_state : 0; } EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery); @@ -2739,7 +2739,15 @@ out_error: pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s" " with error %d\n", section_sep, section, clp->cl_hostname, -status); - ssleep(1); + switch (status) { + case -ENETDOWN: + case -ENETUNREACH: + nfs_mark_client_ready(clp, -EIO); + break; + default: + ssleep(1); + break; + } out_drain: memalloc_nofs_restore(memflags); nfs4_end_drain_session(clp); diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 22c973316f0b..bc67fe6801b1 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -2608,7 +2608,7 @@ TRACE_EVENT(nfs4_copy_notify, ) ); -TRACE_EVENT(nfs4_offload_cancel, +DECLARE_EVENT_CLASS(nfs4_offload_class, TP_PROTO( const struct nfs42_offload_status_args *args, int error @@ -2640,6 +2640,15 @@ TRACE_EVENT(nfs4_offload_cancel, __entry->stateid_seq, __entry->stateid_hash ) ); +#define DEFINE_NFS4_OFFLOAD_EVENT(name) \ + DEFINE_EVENT(nfs4_offload_class, name, \ + TP_PROTO( \ + const struct nfs42_offload_status_args *args, \ + int error \ + ), \ + TP_ARGS(args, error)) +DEFINE_NFS4_OFFLOAD_EVENT(nfs4_offload_cancel); +DEFINE_NFS4_OFFLOAD_EVENT(nfs4_offload_status); DECLARE_EVENT_CLASS(nfs4_xattr_event, TP_PROTO( diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e8ac3f615f93..55bef5fbfa47 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -82,9 +82,8 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) */ #define pagepad_maxsz (1) -#define open_owner_id_maxsz (1 + 2 + 1 + 1 + 2) -#define lock_owner_id_maxsz (1 + 1 + 4) -#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +#define open_owner_id_maxsz (2 + 1 + 2 + 2) +#define lock_owner_id_maxsz (2 + 1 + 2) #define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) #define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) #define op_encode_hdr_maxsz (1) @@ -185,7 +184,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, #define encode_claim_null_maxsz (1 + nfs4_name_maxsz) #define encode_open_maxsz (op_encode_hdr_maxsz + \ 2 + encode_share_access_maxsz + 2 + \ - open_owner_id_maxsz + \ + 1 + open_owner_id_maxsz + \ encode_opentype_maxsz + \ encode_claim_null_maxsz) #define decode_space_limit_maxsz (3) @@ -255,13 +254,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, #define encode_link_maxsz (op_encode_hdr_maxsz + \ nfs4_name_maxsz) #define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) -#define encode_lockowner_maxsz (7) +#define encode_lockowner_maxsz (2 + 1 + lock_owner_id_maxsz) + #define encode_lock_maxsz (op_encode_hdr_maxsz + \ 7 + \ 1 + encode_stateid_maxsz + 1 + \ encode_lockowner_maxsz) #define decode_lock_denied_maxsz \ - (8 + decode_lockowner_maxsz) + (2 + 2 + 1 + 2 + 1 + lock_owner_id_maxsz) #define decode_lock_maxsz (op_decode_hdr_maxsz + \ decode_lock_denied_maxsz) #define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \ @@ -617,7 +617,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, encode_lockowner_maxsz) #define NFS4_dec_release_lockowner_sz \ (compound_decode_hdr_maxsz + \ - decode_lockowner_maxsz) + decode_release_lockowner_maxsz) 
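
One point the owner-size hunk above leaves implicit: the *_maxsz constants count 32-bit XDR words, not bytes (XDR_QUADLEN() does the bytes-to-words rounding seen in the OFFLOAD_STATUS macros earlier). A worked example, assuming the client's usual fixed 20-byte lock-owner body of "lock id:" + s_dev + id:

/*
 * Sketch of the arithmetic (1 XDR word = 4 bytes on the wire):
 *
 *   lock_owner_id_maxsz    = 2 + 1 + 2     = 5 words = 20 bytes
 *                            "lock id:"(8) + s_dev(4) + id(8)
 *   encode_lockowner_maxsz = 2 + 1 + 5     = 8 words = 32 bytes
 *                            clientid(8) + opaque length(4) + 20-byte owner
 *   open_owner_id_maxsz    = 2 + 1 + 2 + 2 = 7 words = 28 bytes of owner body,
 *                            the "owner 28" noted in encode_openhdr() below
 */
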
#define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -1412,7 +1412,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena __be32 *p; /* * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, - * owner 4 = 32 + * owner 28 */ encode_nfs4_seqid(xdr, arg->seqid); encode_share_access(xdr, arg->share_access); @@ -5077,7 +5077,7 @@ static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) /* * We create the owner, so we know a proper owner.id length is 4. */ -static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) +static int decode_lock_denied(struct xdr_stream *xdr, struct file_lock *fl) { uint64_t offset, length, clientid; __be32 *p; @@ -7702,6 +7702,7 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC42(CLONE, enc_clone, dec_clone), PROC42(COPY, enc_copy, dec_copy), PROC42(OFFLOAD_CANCEL, enc_offload_cancel, dec_offload_cancel), + PROC42(OFFLOAD_STATUS, enc_offload_status, dec_offload_status), PROC42(COPY_NOTIFY, enc_copy_notify, dec_copy_notify), PROC(LOOKUPP, enc_lookupp, dec_lookupp), PROC42(LAYOUTERROR, enc_layouterror, dec_layouterror), diff --git a/fs/nfs/super.c b/fs/nfs/super.c index aeb715b4a690..9eea9e62afc9 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -454,8 +454,12 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, { NFS_MOUNT_NONLM, ",nolock", "" }, { NFS_MOUNT_NOACL, ",noacl", "" }, { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" }, + { NFS_MOUNT_FORCE_RDIRPLUS, ",rdirplus=force", "" }, { NFS_MOUNT_UNSHARED, ",nosharecache", "" }, { NFS_MOUNT_NORESVPORT, ",noresvport", "" }, + { NFS_MOUNT_NETUNREACH_FATAL, + ",fatal_neterrors=ENETDOWN:ENETUNREACH", + ",fatal_neterrors=none" }, { 0, NULL, NULL } }; const struct proc_nfs_info *nfs_infop; diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 7b59a40d40c0..37cb2b776435 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -14,6 +14,7 @@ #include <linux/rcupdate.h> #include <linux/lockd/lockd.h> +#include "internal.h" #include "nfs4_fs.h" #include "netns.h" #include "sysfs.h" @@ -228,6 +229,25 @@ static void shutdown_client(struct rpc_clnt *clnt) rpc_cancel_tasks(clnt, -EIO, shutdown_match_client, NULL); } +/* + * Shut down the nfs_client only once all the superblocks + * have been shut down. 
+ */ +static void shutdown_nfs_client(struct nfs_client *clp) +{ + struct nfs_server *server; + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if (!(server->flags & NFS_MOUNT_SHUTDOWN)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + nfs_mark_client_ready(clp, -EIO); + shutdown_client(clp->cl_rpcclient); +} + static ssize_t shutdown_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -259,7 +279,6 @@ shutdown_store(struct kobject *kobj, struct kobj_attribute *attr, server->flags |= NFS_MOUNT_SHUTDOWN; shutdown_client(server->client); - shutdown_client(server->nfs_client->cl_rpcclient); if (!IS_ERR(server->client_acl)) shutdown_client(server->client_acl); @@ -267,11 +286,44 @@ shutdown_store(struct kobject *kobj, struct kobj_attribute *attr, if (server->nlm_host) shutdown_client(server->nlm_host->h_rpcclnt); out: + shutdown_nfs_client(server->nfs_client); return count; } static struct kobj_attribute nfs_sysfs_attr_shutdown = __ATTR_RW(shutdown); +#if IS_ENABLED(CONFIG_NFS_V4_1) +static ssize_t +implid_domain_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct nfs_server *server = container_of(kobj, struct nfs_server, kobj); + struct nfs41_impl_id *impl_id = server->nfs_client->cl_implid; + + if (!impl_id || strlen(impl_id->domain) == 0) + return 0; //sysfs_emit(buf, ""); + return sysfs_emit(buf, "%s\n", impl_id->domain); +} + +static struct kobj_attribute nfs_sysfs_attr_implid_domain = __ATTR_RO(implid_domain); + + +static ssize_t +implid_name_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct nfs_server *server = container_of(kobj, struct nfs_server, kobj); + struct nfs41_impl_id *impl_id = server->nfs_client->cl_implid; + + if (!impl_id || strlen(impl_id->name) == 0) + return 0; //sysfs_emit(buf, ""); + return sysfs_emit(buf, "%s\n", impl_id->name); +} + +static struct kobj_attribute nfs_sysfs_attr_implid_name = __ATTR_RO(implid_name); + +#endif /* IS_ENABLED(CONFIG_NFS_V4_1) */ + #define RPC_CLIENT_NAME_SIZE 64 void nfs_sysfs_link_rpc_client(struct nfs_server *server, @@ -309,6 +361,32 @@ static struct kobj_type nfs_sb_ktype = { .child_ns_type = nfs_netns_object_child_ns_type, }; +#if IS_ENABLED(CONFIG_NFS_V4_1) +static void nfs_sysfs_add_nfsv41_server(struct nfs_server *server) +{ + int ret; + + if (!server->nfs_client->cl_implid) + return; + + ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_implid_domain.attr, + nfs_netns_server_namespace(&server->kobj)); + if (ret < 0) + pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n", + server->s_sysfs_id, ret); + + ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_implid_name.attr, + nfs_netns_server_namespace(&server->kobj)); + if (ret < 0) + pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n", + server->s_sysfs_id, ret); +} +#else /* CONFIG_NFS_V4_1 */ +static inline void nfs_sysfs_add_nfsv41_server(struct nfs_server *server) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + void nfs_sysfs_add_server(struct nfs_server *server) { int ret; @@ -325,6 +403,8 @@ void nfs_sysfs_add_server(struct nfs_server *server) if (ret < 0) pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n", server->s_sysfs_id, ret); + + nfs_sysfs_add_nfsv41_server(server); } EXPORT_SYMBOL_GPL(nfs_sysfs_add_server); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index aa3d8bea3ec0..23df8b214474 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -579,8 +579,10 @@ retry: while (!nfs_lock_request(head)) { 
ret = nfs_wait_on_request(head); - if (ret < 0) + if (ret < 0) { + nfs_release_request(head); return ERR_PTR(ret); + } } /* Ensure that nobody removed the request before we locked it */ diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 3a202e51b360..83970d97840b 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2424,7 +2424,7 @@ static void nilfs_segctor_accept(struct nilfs_sc_info *sci) * the area protected by sc_state_lock. */ if (thread_is_alive) - del_timer_sync(&sci->sc_timer); + timer_delete_sync(&sci->sc_timer); } /** diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index af94e3737470..e946f75eb540 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -2664,8 +2664,9 @@ int attr_set_compress(struct ntfs_inode *ni, bool compr) attr->nres.run_off = cpu_to_le16(run_off); } - /* Update data attribute flags. */ + /* Update attribute flags. */ if (compr) { + attr->flags &= ~ATTR_FLAG_SPARSED; attr->flags |= ATTR_FLAG_COMPRESSED; attr->nres.c_unit = NTFS_LZNT_CUNIT; } else { diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 3f96a11804c9..9b6a3f8d2e7c 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -101,8 +101,26 @@ int ntfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, /* Allowed to change compression for empty files and for directories only. */ if (!is_dedup(ni) && !is_encrypted(ni) && (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { - /* Change compress state. */ - int err = ni_set_compress(inode, flags & FS_COMPR_FL); + int err = 0; + struct address_space *mapping = inode->i_mapping; + + /* write out all data and wait. */ + filemap_invalidate_lock(mapping); + err = filemap_write_and_wait(mapping); + + if (err >= 0) { + /* Change compress state. */ + bool compr = flags & FS_COMPR_FL; + err = ni_set_compress(inode, compr); + + /* For files change a_ops too. */ + if (!err) + mapping->a_ops = compr ? &ntfs_aops_cmpr : + &ntfs_aops; + } + + filemap_invalidate_unlock(mapping); + if (err) return err; } @@ -412,6 +430,7 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count, } if (extend_init && !is_compressed(ni)) { + WARN_ON(ni->i_valid >= pos); err = ntfs_extend_initialized_size(file, ni, ni->i_valid, pos); if (err) goto out; @@ -1228,21 +1247,22 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t ret; int err; - err = check_write_restriction(inode); - if (err) - return err; - - if (is_compressed(ni) && (iocb->ki_flags & IOCB_DIRECT)) { - ntfs_inode_warn(inode, "direct i/o + compressed not supported"); - return -EOPNOTSUPP; - } - if (!inode_trylock(inode)) { if (iocb->ki_flags & IOCB_NOWAIT) return -EAGAIN; inode_lock(inode); } + ret = check_write_restriction(inode); + if (ret) + goto out; + + if (is_compressed(ni) && (iocb->ki_flags & IOCB_DIRECT)) { + ntfs_inode_warn(inode, "direct i/o + compressed not supported"); + ret = -EOPNOTSUPP; + goto out; + } + ret = generic_write_checks(iocb, from); if (ret <= 0) goto out; diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 5df6a0b5add9..b7a83200f2cc 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -281,63 +281,6 @@ struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr, } /* - * ni_load_attr - Load attribute that contains given VCN. 
- */ -struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, - const __le16 *name, u8 name_len, CLST vcn, - struct mft_inode **pmi) -{ - struct ATTR_LIST_ENTRY *le; - struct ATTRIB *attr; - struct mft_inode *mi; - struct ATTR_LIST_ENTRY *next; - - if (!ni->attr_list.size) { - if (pmi) - *pmi = &ni->mi; - return mi_find_attr(ni, &ni->mi, NULL, type, name, name_len, - NULL); - } - - le = al_find_ex(ni, NULL, type, name, name_len, NULL); - if (!le) - return NULL; - - /* - * Unfortunately ATTR_LIST_ENTRY contains only start VCN. - * So to find the ATTRIB segment that contains 'vcn' we should - * enumerate some entries. - */ - if (vcn) { - for (;; le = next) { - next = al_find_ex(ni, le, type, name, name_len, NULL); - if (!next || le64_to_cpu(next->vcn) > vcn) - break; - } - } - - if (ni_load_mi(ni, le, &mi)) - return NULL; - - if (pmi) - *pmi = mi; - - attr = mi_find_attr(ni, mi, NULL, type, name, name_len, &le->id); - if (!attr) - return NULL; - - if (!attr->non_res) - return attr; - - if (le64_to_cpu(attr->nres.svcn) <= vcn && - vcn <= le64_to_cpu(attr->nres.evcn)) - return attr; - - _ntfs_bad_inode(&ni->vfs_inode); - return NULL; -} - -/* * ni_load_all_mi - Load all subrecords. */ int ni_load_all_mi(struct ntfs_inode *ni) @@ -3434,10 +3377,12 @@ int ni_set_compress(struct inode *inode, bool compr) } ni->std_fa = std->fa; - if (compr) + if (compr) { + std->fa &= ~FILE_ATTRIBUTE_SPARSE_FILE; std->fa |= FILE_ATTRIBUTE_COMPRESSED; - else + } else { std->fa &= ~FILE_ATTRIBUTE_COMPRESSED; + } if (ni->std_fa != std->fa) { ni->std_fa = std->fa; diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 938d351ebac7..df81f1f7330c 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -1035,34 +1035,6 @@ struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block) return NULL; } -int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer) -{ - struct block_device *bdev = sb->s_bdev; - u32 blocksize = sb->s_blocksize; - u64 block = lbo >> sb->s_blocksize_bits; - u32 off = lbo & (blocksize - 1); - u32 op = blocksize - off; - - for (; bytes; block += 1, off = 0, op = blocksize) { - struct buffer_head *bh = __bread(bdev, block, blocksize); - - if (!bh) - return -EIO; - - if (op > bytes) - op = bytes; - - memcpy(buffer, bh->b_data + off, op); - - put_bh(bh); - - bytes -= op; - buffer = Add2Ptr(buffer, op); - } - - return 0; -} - int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes, const void *buf, int wait) { diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 7eb9fae22f8d..78d20e4baa2c 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -618,7 +618,7 @@ static bool index_hdr_check(const struct INDEX_HDR *hdr, u32 bytes) u32 off = le32_to_cpu(hdr->de_off); if (!IS_ALIGNED(off, 8) || tot > bytes || end > tot || - off + sizeof(struct NTFS_DE) > end) { + size_add(off, sizeof(struct NTFS_DE)) > end) { /* incorrect index buffer. */ return false; } @@ -736,7 +736,7 @@ fill_table: if (end > total) return NULL; - if (off + sizeof(struct NTFS_DE) > end) + if (size_add(off, sizeof(struct NTFS_DE)) > end) return NULL; e = Add2Ptr(hdr, off); diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index a1e11228dafd..3e2957a1e360 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -1025,46 +1025,6 @@ int ntfs_sync_inode(struct inode *inode) } /* - * writeback_inode - Helper function for ntfs_flush_inodes(). - * - * This writes both the inode and the file data blocks, waiting - * for in flight data blocks before the start of the call. 
It - * does not wait for any io started during the call. - */ -static int writeback_inode(struct inode *inode) -{ - int ret = sync_inode_metadata(inode, 0); - - if (!ret) - ret = filemap_fdatawrite(inode->i_mapping); - return ret; -} - -/* - * ntfs_flush_inodes - * - * Write data and metadata corresponding to i1 and i2. The io is - * started but we do not wait for any of it to finish. - * - * filemap_flush() is used for the block device, so if there is a dirty - * page for a block already in flight, we will not wait and start the - * io over again. - */ -int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, - struct inode *i2) -{ - int ret = 0; - - if (i1) - ret = writeback_inode(i1); - if (!ret && i2) - ret = writeback_inode(i2); - if (!ret) - ret = filemap_flush(sb->s_bdev_file->f_mapping); - return ret; -} - -/* * Helper function to read file. */ int inode_read_data(struct inode *inode, void *data, size_t bytes) diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 241f2ffdd920..1ff13b6f9613 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -717,7 +717,7 @@ static inline struct NTFS_DE *hdr_first_de(const struct INDEX_HDR *hdr) struct NTFS_DE *e; u16 esize; - if (de_off >= used || de_off + sizeof(struct NTFS_DE) > used ) + if (de_off >= used || size_add(de_off, sizeof(struct NTFS_DE)) > used) return NULL; e = Add2Ptr(hdr, de_off); diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 382820464dee..d628977e2556 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -530,9 +530,6 @@ struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr, struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr, struct ATTR_LIST_ENTRY **le, struct mft_inode **mi); -struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, - const __le16 *name, u8 name_len, CLST vcn, - struct mft_inode **pmi); int ni_load_all_mi(struct ntfs_inode *ni); bool ni_add_subrecord(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi); int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, @@ -619,7 +616,6 @@ enum NTFS_DIRTY_FLAGS { NTFS_DIRTY_ERROR = 2, }; int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty); -int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer); int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes, const void *buffer, int wait); int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, @@ -717,8 +713,6 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, u32 len, u32 copied, struct folio *folio, void *fsdata); int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc); int ntfs_sync_inode(struct inode *inode); -int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, - struct inode *i2); int inode_read_data(struct inode *inode, void *data, size_t bytes); int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const struct cpu_str *uni, diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 6a0f6b0a3ab2..920a1ab47b63 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -555,6 +555,55 @@ static const struct proc_ops ntfs3_label_fops = { .proc_write = ntfs3_label_write, }; +static void ntfs_create_procdir(struct super_block *sb) +{ + struct proc_dir_entry *e; + + if (!proc_info_root) + return; + + e = proc_mkdir(sb->s_id, proc_info_root); + if (e) { + struct ntfs_sb_info *sbi = sb->s_fs_info; + + proc_create_data("volinfo", 0444, e, + &ntfs3_volinfo_fops, sb); + 
proc_create_data("label", 0644, e, + &ntfs3_label_fops, sb); + sbi->procdir = e; + } +} + +static void ntfs_remove_procdir(struct super_block *sb) +{ + struct ntfs_sb_info *sbi = sb->s_fs_info; + + if (!sbi->procdir) + return; + + remove_proc_entry("label", sbi->procdir); + remove_proc_entry("volinfo", sbi->procdir); + remove_proc_entry(sb->s_id, proc_info_root); + sbi->procdir = NULL; +} + +static void ntfs_create_proc_root(void) +{ + proc_info_root = proc_mkdir("fs/ntfs3", NULL); +} + +static void ntfs_remove_proc_root(void) +{ + if (proc_info_root) { + remove_proc_entry("fs/ntfs3", NULL); + proc_info_root = NULL; + } +} +#else +static void ntfs_create_procdir(struct super_block *sb) {} +static void ntfs_remove_procdir(struct super_block *sb) {} +static void ntfs_create_proc_root(void) {} +static void ntfs_remove_proc_root(void) {} #endif static struct kmem_cache *ntfs_inode_cachep; @@ -644,15 +693,7 @@ static void ntfs_put_super(struct super_block *sb) { struct ntfs_sb_info *sbi = sb->s_fs_info; -#ifdef CONFIG_PROC_FS - // Remove /proc/fs/ntfs3/.. - if (sbi->procdir) { - remove_proc_entry("label", sbi->procdir); - remove_proc_entry("volinfo", sbi->procdir); - remove_proc_entry(sb->s_id, proc_info_root); - sbi->procdir = NULL; - } -#endif + ntfs_remove_procdir(sb); /* Mark rw ntfs as clear, if possible. */ ntfs_set_state(sbi, NTFS_DIRTY_CLEAR); @@ -1590,20 +1631,7 @@ load_root: kfree(boot2); } -#ifdef CONFIG_PROC_FS - /* Create /proc/fs/ntfs3/.. */ - if (proc_info_root) { - struct proc_dir_entry *e = proc_mkdir(sb->s_id, proc_info_root); - static_assert((S_IRUGO | S_IWUSR) == 0644); - if (e) { - proc_create_data("volinfo", S_IRUGO, e, - &ntfs3_volinfo_fops, sb); - proc_create_data("label", S_IRUGO | S_IWUSR, e, - &ntfs3_label_fops, sb); - sbi->procdir = e; - } - } -#endif + ntfs_create_procdir(sb); if (is_legacy_ntfs(sb)) sb->s_flags |= SB_RDONLY; @@ -1853,14 +1881,11 @@ static int __init init_ntfs_fs(void) if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS)) pr_info("ntfs3: Read-only LZX/Xpress compression included\n"); -#ifdef CONFIG_PROC_FS - /* Create "/proc/fs/ntfs3" */ - proc_info_root = proc_mkdir("fs/ntfs3", NULL); -#endif + ntfs_create_proc_root(); err = ntfs3_init_bitmap(); if (err) - return err; + goto out2; ntfs_inode_cachep = kmem_cache_create( "ntfs_inode_cache", sizeof(struct ntfs_inode), 0, @@ -1880,6 +1905,8 @@ out: kmem_cache_destroy(ntfs_inode_cachep); out1: ntfs3_exit_bitmap(); +out2: + ntfs_remove_proc_root(); return err; } @@ -1890,11 +1917,7 @@ static void __exit exit_ntfs_fs(void) unregister_filesystem(&ntfs_fs_type); unregister_as_ntfs_legacy(); ntfs3_exit_bitmap(); - -#ifdef CONFIG_PROC_FS - if (proc_info_root) - remove_proc_entry("fs/ntfs3", NULL); -#endif + ntfs_remove_proc_root(); } MODULE_LICENSE("GPL"); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 0f46b22561d6..fce9beb214f0 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -724,7 +724,7 @@ static void o2net_shutdown_sc(struct work_struct *work) if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) { /* we shouldn't flush as we're in the thread, the * races with pending sc work structs are harmless */ - del_timer_sync(&sc->sc_idle_timeout); + timer_delete_sync(&sc->sc_idle_timeout); o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); sc_put(sc); kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 557cf9d40177..f8b9c9c73997 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -563,7 +563,7 @@ void 
pstore_unregister(struct pstore_info *psi) pstore_unregister_kmsg(); /* Stop timer and make sure all work has finished. */ - del_timer_sync(&pstore_timer); + timer_delete_sync(&pstore_timer); flush_work(&pstore_work); /* Remove all backend records from filesystem tree. */ diff --git a/fs/smb/client/cifs_fs_sb.h b/fs/smb/client/cifs_fs_sb.h index 651759192280..5e8d163cb5f8 100644 --- a/fs/smb/client/cifs_fs_sb.h +++ b/fs/smb/client/cifs_fs_sb.h @@ -49,6 +49,7 @@ struct cifs_sb_info { struct rb_root tlink_tree; + struct list_head tcon_sb_link; spinlock_t tlink_tree_lock; struct tcon_link *master_tlink; struct nls_table *local_nls; diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index 8dea0cf3a8de..ca435a3841b8 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -135,7 +135,6 @@ extern ssize_t cifs_file_copychunk_range(unsigned int xid, extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern void cifs_setsize(struct inode *inode, loff_t offset); -extern int cifs_truncate_page(struct address_space *mapping, loff_t from); struct smb3_fs_context; extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type, @@ -146,6 +145,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 53 -#define CIFS_VERSION "2.53" +#define SMB3_PRODUCT_BUILD 54 +#define CIFS_VERSION "2.54" #endif /* _CIFSFS_H */ diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 6ae170a2a042..07c4688ec4c9 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -714,6 +714,8 @@ struct TCP_Server_Info { spinlock_t srv_lock; /* protect anything here that is not protected */ __u64 conn_id; /* connection identifier (useful for debugging) */ int srv_count; /* reference counter */ + int rfc1001_sessinit; /* whether to establish netbios session */ + bool with_rfc1001; /* if netbios session is used */ /* 15 character server name + 0x20 16th byte indicating type = srv */ char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; struct smb_version_operations *ops; @@ -1321,7 +1323,8 @@ struct cifs_tcon { #endif struct list_head pending_opens; /* list of incomplete opens */ struct cached_fids *cfids; - /* BB add field for back pointer to sb struct(s)? 
*/ + struct list_head cifs_sb_list; + spinlock_t sb_list_lock; #ifdef CONFIG_CIFS_DFS_UPCALL struct delayed_work dfs_cache_work; struct list_head dfs_ses_list; @@ -1718,6 +1721,7 @@ struct mid_q_entry { void *resp_buf; /* pointer to received SMB header */ unsigned int resp_buf_size; int mid_state; /* wish this were enum but can not pass to wait_event */ + int mid_rc; /* rc for MID_RC */ unsigned int mid_flags; __le16 command; /* smb command code */ unsigned int optype; /* operation type */ @@ -1880,6 +1884,7 @@ static inline bool is_replayable_error(int error) #define MID_RESPONSE_MALFORMED 0x10 #define MID_SHUTDOWN 0x20 #define MID_RESPONSE_READY 0x40 /* ready for other process handle the rsp */ +#define MID_RC 0x80 /* mid_rc contains custom rc */ /* Flags */ #define MID_WAIT_CANCELLED 1 /* Cancelled while waiting for response */ diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 29dcb88392e5..60cb264a01e5 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1041,15 +1041,31 @@ static __u16 convert_disposition(int disposition) static int access_flags_to_smbopen_mode(const int access_flags) { - int masked_flags = access_flags & (GENERIC_READ | GENERIC_WRITE); - - if (masked_flags == GENERIC_READ) - return SMBOPEN_READ; - else if (masked_flags == GENERIC_WRITE) + /* + * SYSTEM_SECURITY grants both read and write access to SACL, treat it as read/write. + * MAXIMUM_ALLOWED grants as much access as possible, so treat it as read/write too. + * SYNCHRONIZE by itself does not grant any specific access, so do not check its mask. + * If only the SYNCHRONIZE bit is specified then fall back to read access. + */ + bool with_write_flags = access_flags & (FILE_WRITE_DATA | FILE_APPEND_DATA | FILE_WRITE_EA | + FILE_DELETE_CHILD | FILE_WRITE_ATTRIBUTES | DELETE | + WRITE_DAC | WRITE_OWNER | SYSTEM_SECURITY | + MAXIMUM_ALLOWED | GENERIC_WRITE | GENERIC_ALL); + bool with_read_flags = access_flags & (FILE_READ_DATA | FILE_READ_EA | FILE_EXECUTE | + FILE_READ_ATTRIBUTES | READ_CONTROL | + SYSTEM_SECURITY | MAXIMUM_ALLOWED | GENERIC_ALL | + GENERIC_EXECUTE | GENERIC_READ); + bool with_execute_flags = access_flags & (FILE_EXECUTE | MAXIMUM_ALLOWED | GENERIC_ALL | + GENERIC_EXECUTE); + + if (with_write_flags && with_read_flags) + return SMBOPEN_READWRITE; + else if (with_write_flags) return SMBOPEN_WRITE; - - /* just go for read/write */ - return SMBOPEN_READWRITE; + else if (with_execute_flags) + return SMBOPEN_EXECUTE; + else + return SMBOPEN_READ; } int diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index d7bad2c3af37..f298e86a3c1f 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -371,7 +371,7 @@ static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num * */ static int __cifs_reconnect(struct TCP_Server_Info *server, - bool mark_smb_session) + bool mark_smb_session, bool once) { int rc = 0; @@ -399,6 +399,9 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, if (rc) { cifs_server_unlock(server); cifs_dbg(FYI, "%s: reconnect error %d\n", __func__, rc); + /* If we were asked to reconnect only once, do not try again */ + if (once) + break; msleep(3000); } else { atomic_inc(&tcpSesReconnectCount); @@ -564,19 +567,33 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) return rc; } -int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) +static int +_cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session, bool once) { if (!server->leaf_fullpath) - return 
__cifs_reconnect(server, mark_smb_session, once); return reconnect_dfs_server(server); } #else -int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) +static int +_cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session, bool once) { - return __cifs_reconnect(server, mark_smb_session); + return __cifs_reconnect(server, mark_smb_session, once); } #endif +int +cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) +{ + return _cifs_reconnect(server, mark_smb_session, false); +} + +static int +cifs_reconnect_once(struct TCP_Server_Info *server) +{ + return _cifs_reconnect(server, true, true); +} + static void cifs_echo_request(struct work_struct *work) { @@ -803,26 +820,110 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type) /* Regular SMB response */ return true; case RFC1002_SESSION_KEEP_ALIVE: + /* + * RFC 1002 session keep alive can be sent by the server only when + * we have established an RFC 1002 session. But Samba servers send + * RFC 1002 session keep alives also over port 445, on which no + * RFC 1002 session is established. + */ cifs_dbg(FYI, "RFC 1002 session keep alive\n"); break; case RFC1002_POSITIVE_SESSION_RESPONSE: - cifs_dbg(FYI, "RFC 1002 positive session response\n"); + /* + * RFC 1002 positive session response cannot be returned + * for an SMB request. RFC 1002 session responses are handled + * exclusively in the ip_rfc1001_connect() function. + */ + cifs_server_dbg(VFS, "RFC 1002 positive session response (unexpected)\n"); + cifs_reconnect(server, true); break; case RFC1002_NEGATIVE_SESSION_RESPONSE: /* * We get this from Windows 98 instead of an error on - * SMB negprot response. + * SMB negprot response, when we have not established + * an RFC 1002 session (which means ip_rfc1001_connect() + * was skipped). Note that the same still happens with + * Windows Server 2022 when connecting via port 139. + * So in this case, when the mount option -o nonbsessinit + * was not specified, try to reconnect and establish an + * RFC 1002 session. If the new socket establishment with + * an RFC 1002 session succeeds, return -EAGAIN to the + * mid's caller, so it can retry the request. */ - cifs_dbg(FYI, "RFC 1002 negative session response\n"); - /* give server a second to clean up */ - msleep(1000); - /* - * Always try 445 first on reconnect since we get NACK - * on some if we ever connected to port 139 (the NACK - * is since we do not begin with RFC1001 session - * initialize frame). - */ - cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT); + if (!cifs_rdma_enabled(server) && + server->tcpStatus == CifsInNegotiate && + !server->with_rfc1001 && + server->rfc1001_sessinit != 0) { + int rc, mid_rc; + struct mid_q_entry *mid, *nmid; + LIST_HEAD(dispose_list); + + cifs_dbg(FYI, "RFC 1002 negative session response during SMB Negotiate, retrying with NetBIOS session\n"); + + /* + * Before reconnecting, delete all pending mids for this + * server, so the reconnect does not signal a connection + * aborted error to the mids' callbacks. Note that for this + * server there should be exactly one pending mid, + * corresponding to the SMB1/SMB2 Negotiate packet. + */ + spin_lock(&server->mid_lock); + list_for_each_entry_safe(mid, nmid, &server->pending_mid_q, qhead) { + kref_get(&mid->refcount); + list_move(&mid->qhead, &dispose_list); + mid->mid_flags |= MID_DELETED; + } + spin_unlock(&server->mid_lock); + + /* Now try to reconnect once with NetBIOS session. 
*/ + server->with_rfc1001 = true; + rc = cifs_reconnect_once(server); + + /* + * If the reconnect was successful then indicate -EAGAIN + * to the mid's caller. If the reconnect failed with -EAGAIN + * then mask it as -EHOSTDOWN, so the mid's caller knows + * that it failed. + */ + if (rc == 0) + mid_rc = -EAGAIN; + else if (rc == -EAGAIN) + mid_rc = -EHOSTDOWN; + else + mid_rc = rc; + + /* + * After the reconnect (successful or not), deliver the + * reconnect status to the mid's caller via the mid's + * callback. Use the MID_RC state, which indicates that the + * return code should be read from the mid_rc member. + */ + list_for_each_entry_safe(mid, nmid, &dispose_list, qhead) { + list_del_init(&mid->qhead); + mid->mid_rc = mid_rc; + mid->mid_state = MID_RC; + mid->callback(mid); + release_mid(mid); + } + + /* + * If the reconnect failed then wait two seconds. In most + * cases we have been called from the mount context, and the + * failure delivered to the mid's callback will stop this + * receiver task thread and fail the mount process. So wait + * two seconds to prevent another reconnect in this task + * thread, which would be useless as the mount is going to + * fail anyway. + */ + if (rc != 0) + msleep(2000); + } else { + cifs_server_dbg(VFS, "RFC 1002 negative session response (unexpected)\n"); + cifs_reconnect(server, true); + } + break; + case RFC1002_RETARGET_SESSION_RESPONSE: + cifs_server_dbg(VFS, "RFC 1002 retarget session response (unexpected)\n"); + cifs_reconnect(server, true); break; default: @@ -1701,6 +1802,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, ctx->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); memcpy(tcp_ses->server_RFC1001_name, ctx->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); + tcp_ses->rfc1001_sessinit = ctx->rfc1001_sessinit; + tcp_ses->with_rfc1001 = false; tcp_ses->session_estab = false; tcp_ses->sequence_number = 0; tcp_ses->channel_sequence_num = 0; /* only tracked for primary channel */ @@ -1731,12 +1834,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, */ tcp_ses->tcpStatus = CifsNew; ++tcp_ses->srv_count; + tcp_ses->echo_interval = ctx->echo_interval * HZ; - if (ctx->echo_interval >= SMB_ECHO_INTERVAL_MIN && - ctx->echo_interval <= SMB_ECHO_INTERVAL_MAX) - tcp_ses->echo_interval = ctx->echo_interval * HZ; - else - tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ; if (tcp_ses->rdma) { #ifndef CONFIG_CIFS_SMB_DIRECT cifs_dbg(VFS, "CONFIG_CIFS_SMB_DIRECT is not enabled\n"); @@ -3221,6 +3320,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) return -EIO; } + server->with_rfc1001 = true; return 0; } @@ -3332,7 +3432,16 @@ generic_ip_connect(struct TCP_Server_Info *server) return rc; } trace_smb3_connect_done(server->hostname, server->conn_id, &server->dstaddr); - if (sport == htons(RFC1001_PORT)) + + /* + * Establish an RFC1001 NetBIOS session when it was explicitly requested + * by the mount option -o nbsessinit, or when connecting to the default + * RFC1001 server port (139) and it was not explicitly disabled by the + * mount option -o nonbsessinit. 
@@ -1701,6 +1802,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, ctx->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); memcpy(tcp_ses->server_RFC1001_name, ctx->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); + tcp_ses->rfc1001_sessinit = ctx->rfc1001_sessinit; + tcp_ses->with_rfc1001 = false; tcp_ses->session_estab = false; tcp_ses->sequence_number = 0; tcp_ses->channel_sequence_num = 0; /* only tracked for primary channel */ @@ -1731,12 +1834,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, */ tcp_ses->tcpStatus = CifsNew; ++tcp_ses->srv_count; + tcp_ses->echo_interval = ctx->echo_interval * HZ; - if (ctx->echo_interval >= SMB_ECHO_INTERVAL_MIN && - ctx->echo_interval <= SMB_ECHO_INTERVAL_MAX) - tcp_ses->echo_interval = ctx->echo_interval * HZ; - else - tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ; if (tcp_ses->rdma) { #ifndef CONFIG_CIFS_SMB_DIRECT cifs_dbg(VFS, "CONFIG_CIFS_SMB_DIRECT is not enabled\n"); @@ -3221,6 +3320,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) return -EIO; } + server->with_rfc1001 = true; return 0; } @@ -3332,7 +3432,16 @@ generic_ip_connect(struct TCP_Server_Info *server) return rc; } trace_smb3_connect_done(server->hostname, server->conn_id, &server->dstaddr); - if (sport == htons(RFC1001_PORT)) + + /* + * Establish an RFC1001 NetBIOS session when it was explicitly requested + * by the mount option -o nbsessinit, or when connecting to the default + * RFC1001 server port (139) and it was not explicitly disabled by the + * mount option -o nonbsessinit. + */ + if (server->with_rfc1001 || + server->rfc1001_sessinit == 1 || + (server->rfc1001_sessinit == -1 && sport == htons(RFC1001_PORT))) rc = ip_rfc1001_connect(server); return rc; } @@ -3481,6 +3590,7 @@ int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb) struct smb3_fs_context *ctx = cifs_sb->ctx; INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks); + INIT_LIST_HEAD(&cifs_sb->tcon_sb_link); spin_lock_init(&cifs_sb->tlink_tree_lock); cifs_sb->tlink_tree = RB_ROOT; @@ -3713,6 +3823,10 @@ static int mount_setup_tlink(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, tlink_rb_insert(&cifs_sb->tlink_tree, tlink); spin_unlock(&cifs_sb->tlink_tree_lock); + spin_lock(&tcon->sb_list_lock); + list_add(&cifs_sb->tcon_sb_link, &tcon->cifs_sb_list); + spin_unlock(&tcon->sb_list_lock); + queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks, TLINK_IDLE_EXPIRE); return 0; @@ -4054,9 +4168,19 @@ cifs_umount(struct cifs_sb_info *cifs_sb) struct rb_root *root = &cifs_sb->tlink_tree; struct rb_node *node; struct tcon_link *tlink; + struct cifs_tcon *tcon = NULL; cancel_delayed_work_sync(&cifs_sb->prune_tlinks); + if (cifs_sb->master_tlink) { + tcon = cifs_sb->master_tlink->tl_tcon; + if (tcon) { + spin_lock(&tcon->sb_list_lock); + list_del_init(&cifs_sb->tcon_sb_link); + spin_unlock(&tcon->sb_list_lock); + } + } + spin_lock(&cifs_sb->tlink_tree_lock); while ((node = rb_first(root))) { tlink = rb_entry(node, struct tcon_link, tl_rbnode); @@ -4078,11 +4202,13 @@ int cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, struct TCP_Server_Info *server) { + bool in_retry = false; int rc = 0; if (!server->ops->need_neg || !server->ops->negotiate) return -ENOSYS; +retry: /* only send once per connect */ spin_lock(&server->srv_lock); if (server->tcpStatus != CifsGood && @@ -4102,6 +4228,14 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, spin_unlock(&server->srv_lock); rc = server->ops->negotiate(xid, ses, server); + if (rc == -EAGAIN) { + /* Allow one retry attempt */ + if (!in_retry) { + in_retry = true; + goto retry; + } + rc = -EHOSTDOWN; + } if (rc == 0) { spin_lock(&server->srv_lock); if (server->tcpStatus == CifsInNegotiate) diff --git a/fs/smb/client/fs_context.c index bdb762d398af..2980941b9667 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -135,6 +135,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_flag("witness", Opt_witness), fsparam_flag_no("nativesocket", Opt_nativesocket), fsparam_flag_no("unicode", Opt_unicode), + fsparam_flag_no("nbsessinit", Opt_nbsessinit), /* Mount options which take uid or gid */ fsparam_uid("backupuid", Opt_backupuid), @@ -968,6 +969,10 @@ static int smb3_verify_reconfigure_ctx(struct fs_context *fc, cifs_errorf(fc, "can not change unicode during remount\n"); return -EINVAL; } + if (new_ctx->rfc1001_sessinit != old_ctx->rfc1001_sessinit) { + cifs_errorf(fc, "can not change nbsessinit during remount\n"); + return -EINVAL; + } return 0; } @@ -1333,6 +1338,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case Opt_rsize: ctx->rsize = result.uint_32; ctx->got_rsize = true; + ctx->vol_rsize = ctx->rsize; break; case Opt_wsize: ctx->wsize = result.uint_32; @@ -1348,6 +1354,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->wsize, PAGE_SIZE); } } + ctx->vol_wsize = ctx->wsize; break; case Opt_acregmax: if (result.uint_32 > CIFS_MAX_ACTIMEO / HZ) { @@ -1383,6 +1390,11 @@ static int smb3_fs_context_parse_param(struct 
fs_context *fc, ctx->closetimeo = HZ * result.uint_32; break; case Opt_echo_interval: + if (result.uint_32 < SMB_ECHO_INTERVAL_MIN || + result.uint_32 > SMB_ECHO_INTERVAL_MAX) { + cifs_errorf(fc, "echo interval is out of bounds\n"); + goto cifs_parse_mount_err; + } ctx->echo_interval = result.uint_32; break; case Opt_snapshot: @@ -1602,6 +1614,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, if (i == RFC1001_NAME_LEN && param->string[i] != 0) pr_warn("server netbiosname longer than 15 truncated\n"); break; + case Opt_nbsessinit: + ctx->rfc1001_sessinit = !result.negated; + cifs_dbg(FYI, "rfc1001_sessinit set to %d\n", ctx->rfc1001_sessinit); + break; case Opt_ver: /* version of mount userspace tools, not dialect */ /* If interface changes in mount.cifs bump to new ver */ @@ -1889,13 +1905,16 @@ int smb3_init_fs_context(struct fs_context *fc) memset(ctx->source_rfc1001_name, 0x20, RFC1001_NAME_LEN); for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++) ctx->source_rfc1001_name[i] = toupper(nodename[i]); - ctx->source_rfc1001_name[RFC1001_NAME_LEN] = 0; + /* * null target name indicates to use *SMBSERVR default called name * if we end up sending RFC1001 session initialize */ ctx->target_rfc1001_name[0] = 0; + + ctx->rfc1001_sessinit = -1; /* autodetect based on port number */ + ctx->cred_uid = current_uid(); ctx->linux_uid = current_uid(); ctx->linux_gid = current_gid(); diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index 42c6b66c2c1a..d1d29249bcdb 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -174,6 +174,7 @@ enum cifs_param { Opt_iocharset, Opt_netbiosname, Opt_servern, + Opt_nbsessinit, Opt_ver, Opt_vers, Opt_sec, @@ -216,6 +217,7 @@ struct smb3_fs_context { char *iocharset; /* local code page for mapping to and from Unicode */ char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */ char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */ + int rfc1001_sessinit; kuid_t cred_uid; kuid_t linux_uid; kgid_t linux_gid; @@ -280,6 +282,9 @@ struct smb3_fs_context { bool use_client_guid:1; /* reuse existing guid for multichannel */ u8 client_guid[SMB2_CLIENT_GUID_SIZE]; + /* User-specified original r/wsize value */ + unsigned int vol_rsize; + unsigned int vol_wsize; unsigned int bsize; unsigned int rasize; unsigned int rsize; diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 3bb21aa58474..a00a9d91d0da 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -2901,23 +2901,6 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start, return -EOPNOTSUPP; } -int cifs_truncate_page(struct address_space *mapping, loff_t from) -{ - pgoff_t index = from >> PAGE_SHIFT; - unsigned offset = from & (PAGE_SIZE - 1); - struct page *page; - int rc = 0; - - page = grab_cache_page(mapping, index); - if (!page) - return -ENOMEM; - - zero_user_segment(page, offset, PAGE_SIZE); - unlock_page(page); - put_page(page); - return rc; -} - void cifs_setsize(struct inode *inode, loff_t offset) { struct cifsInodeInfo *cifs_i = CIFS_I(inode); @@ -3012,8 +2995,6 @@ set_size_out: */ attrs->ia_ctime = attrs->ia_mtime = current_time(inode); attrs->ia_valid |= ATTR_CTIME | ATTR_MTIME; - - cifs_truncate_page(inode->i_mapping, inode->i_size); } return rc; diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index a88253668286..769752ad2c5c 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -258,7 +258,7 @@ cifs_query_mf_symlink(unsigned int xid, struct 
cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_io_parms io_parms = {0}; int buf_type = CIFS_NO_BUFFER; - FILE_ALL_INFO file_info; + struct cifs_open_info_data query_data; oparms = (struct cifs_open_parms) { .tcon = tcon, @@ -270,11 +270,11 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, .fid = &fid, }; - rc = CIFS_open(xid, &oparms, &oplock, &file_info); + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &query_data); if (rc) return rc; - if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { + if (query_data.fi.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { rc = -ENOENT; /* it's not a symlink */ goto out; @@ -313,7 +313,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, .fid = &fid, }; - rc = CIFS_open(xid, &oparms, &oplock, NULL); + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, NULL); if (rc) return rc; diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index b328dc5c7988..7b6ed9b23e71 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -137,8 +137,10 @@ tcon_info_alloc(bool dir_leases_enabled, enum smb3_tcon_ref_trace trace) spin_lock_init(&ret_buf->tc_lock); INIT_LIST_HEAD(&ret_buf->openFileList); INIT_LIST_HEAD(&ret_buf->tcon_list); + INIT_LIST_HEAD(&ret_buf->cifs_sb_list); spin_lock_init(&ret_buf->open_file_lock); spin_lock_init(&ret_buf->stat_lock); + spin_lock_init(&ret_buf->sb_list_lock); atomic_set(&ret_buf->num_local_opens, 0); atomic_set(&ret_buf->num_remote_opens, 0); ret_buf->stats_from_time = ktime_get_real_seconds(); diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index 8701484805cd..26df807fbe7a 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -14,6 +14,8 @@ #include "cifspdu.h" #include "cifs_unicode.h" #include "fs_context.h" +#include "nterr.h" +#include "smberr.h" /* * An NT cancel request header looks just like the original request except: @@ -426,13 +428,6 @@ cifs_negotiate(const unsigned int xid, { int rc; rc = CIFSSMBNegotiate(xid, ses, server); - if (rc == -EAGAIN) { - /* retry only once on 1st time connection */ - set_credits(server, 1); - rc = CIFSSMBNegotiate(xid, ses, server); - if (rc == -EAGAIN) - rc = -EHOSTDOWN; - } return rc; } @@ -444,8 +439,8 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int wsize; /* start with specified wsize, or default */ - if (ctx->wsize) - wsize = ctx->wsize; + if (ctx->got_wsize) + wsize = ctx->vol_wsize; else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) wsize = CIFS_DEFAULT_IOSIZE; else @@ -497,7 +492,7 @@ cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) else defsize = server->maxBuf - sizeof(READ_RSP); - rsize = ctx->rsize ? ctx->rsize : defsize; + rsize = ctx->got_rsize ? ctx->vol_rsize : defsize; /* * no CAP_LARGE_READ_X? 
Then MS-CIFS states that we must limit this to @@ -1069,6 +1064,47 @@ cifs_make_node(unsigned int xid, struct inode *inode, full_path, mode, dev); } +static bool +cifs_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) +{ + struct smb_hdr *shdr = (struct smb_hdr *)buf; + struct TCP_Server_Info *pserver; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + + if (shdr->Flags2 & SMBFLG2_ERR_STATUS) { + if (shdr->Status.CifsError != cpu_to_le32(NT_STATUS_NETWORK_NAME_DELETED)) + return false; + } else { + if (shdr->Status.DosError.ErrorClass != ERRSRV || + shdr->Status.DosError.Error != cpu_to_le16(ERRinvtid)) + return false; + } + + /* If server is a channel, select the primary channel */ + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + if (cifs_ses_exiting(ses)) + continue; + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { + if (tcon->tid == shdr->Tid) { + spin_lock(&tcon->tc_lock); + tcon->need_reconnect = true; + spin_unlock(&tcon->tc_lock); + spin_unlock(&cifs_tcp_ses_lock); + pr_warn_once("Server share %s deleted.\n", + tcon->tree_name); + return true; + } + } + } + spin_unlock(&cifs_tcp_ses_lock); + + return false; +} + struct smb_version_operations smb1_operations = { .send_cancel = send_nt_cancel, .compare_fids = cifs_compare_fids, @@ -1153,6 +1189,7 @@ struct smb_version_operations smb1_operations = { .get_acl_by_fid = get_cifs_acl_by_fid, .set_acl = set_cifs_acl, .make_node = cifs_make_node, + .is_network_name_deleted = cifs_is_network_name_deleted, }; struct smb_version_values smb1_values = { diff --git a/fs/smb/client/smb2file.c index d609a20fb98a..a7f629238830 100644 --- a/fs/smb/client/smb2file.c +++ b/fs/smb/client/smb2file.c @@ -152,16 +152,35 @@ int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 int err_buftype = CIFS_NO_BUFFER; struct cifs_fid *fid = oparms->fid; struct network_resiliency_req nr_ioctl_req; + bool retry_without_read_attributes = false; smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb); if (smb2_path == NULL) return -ENOMEM; - oparms->desired_access |= FILE_READ_ATTRIBUTES; + /* + * GENERIC_READ, GENERIC_EXECUTE, GENERIC_ALL and MAXIMUM_ALLOWED + * also contain the FILE_READ_ATTRIBUTES access right. So do not append + * FILE_READ_ATTRIBUTES when it is not needed, and avoid entering the + * retry_without_read_attributes code path. + */ + if (!(oparms->desired_access & FILE_READ_ATTRIBUTES) && + !(oparms->desired_access & GENERIC_READ) && + !(oparms->desired_access & GENERIC_EXECUTE) && + !(oparms->desired_access & GENERIC_ALL) && + !(oparms->desired_access & MAXIMUM_ALLOWED)) { + oparms->desired_access |= FILE_READ_ATTRIBUTES; + retry_without_read_attributes = true; + } smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH; rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov, &err_buftype); + if (rc == -EACCES && retry_without_read_attributes) { + oparms->desired_access &= ~FILE_READ_ATTRIBUTES; + rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov, + &err_buftype); + } if (rc && data) { struct smb2_hdr *hdr = err_iov.iov_base;
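The five access rights tested above can be expressed as one mask check; a minimal standalone sketch follows (the helper name is illustrative, and the mask values are the usual Windows/MS-SMB2 access-mask bits, shown here only for illustration):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative access-mask bits (Windows/MS-SMB2 values). */
#define FILE_READ_ATTRIBUTES	0x00000080u
#define MAXIMUM_ALLOWED		0x02000000u
#define GENERIC_ALL		0x10000000u
#define GENERIC_EXECUTE		0x20000000u
#define GENERIC_READ		0x80000000u

/* True if the mask already implies FILE_READ_ATTRIBUTES. */
static bool implies_read_attributes(uint32_t desired_access)
{
	return desired_access & (FILE_READ_ATTRIBUTES | MAXIMUM_ALLOWED |
				 GENERIC_ALL | GENERIC_EXECUTE | GENERIC_READ);
}

When such a check is false, the open path above appends FILE_READ_ATTRIBUTES and remembers to retry without it on -EACCES.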
diff --git a/fs/smb/client/smb2glob.h index 2466e6155136..224495322a05 100644 --- a/fs/smb/client/smb2glob.h +++ b/fs/smb/client/smb2glob.h @@ -38,6 +38,7 @@ enum smb2_compound_ops { SMB2_OP_SET_REPARSE, SMB2_OP_GET_REPARSE, SMB2_OP_QUERY_WSL_EA, + SMB2_OP_OPEN_QUERY, }; /* Used when constructing chained read requests. */ diff --git a/fs/smb/client/smb2inode.c index e9fd3e204a6f..57d9bfbadd97 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -176,6 +176,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct kvec *out_iov, int *out_buftype, struct dentry *dentry) { + struct smb2_create_rsp *create_rsp = NULL; struct smb2_query_info_rsp *qi_rsp = NULL; struct smb2_compound_vars *vars = NULL; __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; @@ -265,7 +266,13 @@ replay_again: num_rqst++; rc = 0; - for (i = 0; i < num_cmds; i++) { + i = 0; + + /* Skip the leading explicit OPEN operation */ + if (num_cmds > 0 && cmds[0] == SMB2_OP_OPEN_QUERY) + i++; + + for (; i < num_cmds; i++) { /* Operation */ switch (cmds[i]) { case SMB2_OP_QUERY_INFO: @@ -640,6 +647,27 @@ finished: } tmp_rc = rc; + + if (rc == 0 && num_cmds > 0 && cmds[0] == SMB2_OP_OPEN_QUERY) { + create_rsp = rsp_iov[0].iov_base; + idata = in_iov[0].iov_base; + idata->fi.CreationTime = create_rsp->CreationTime; + idata->fi.LastAccessTime = create_rsp->LastAccessTime; + idata->fi.LastWriteTime = create_rsp->LastWriteTime; + idata->fi.ChangeTime = create_rsp->ChangeTime; + idata->fi.Attributes = create_rsp->FileAttributes; + idata->fi.AllocationSize = create_rsp->AllocationSize; + idata->fi.EndOfFile = create_rsp->EndofFile; + if (le32_to_cpu(idata->fi.NumberOfLinks) == 0) + idata->fi.NumberOfLinks = cpu_to_le32(1); /* dummy value */ + idata->fi.DeletePending = 0; + idata->fi.Directory = !!(le32_to_cpu(create_rsp->FileAttributes) & ATTR_DIRECTORY); + + /* smb2_parse_contexts() fills idata->fi.IndexNumber */ + rc = smb2_parse_contexts(server, &rsp_iov[0], &oparms->fid->epoch, + oparms->fid->lease_key, &oplock, &idata->fi, NULL); + } + for (i = 0; i < num_cmds; i++) { char *buf = rsp_iov[i + 1].iov_base; @@ -978,6 +1006,43 @@ int smb2_query_path_info(const unsigned int xid, case 0: rc = parse_create_response(data, cifs_sb, full_path, &out_iov[0]); break; + case -EACCES: + /* + * If SMB2_OP_QUERY_INFO (called when POSIX extensions are not used) failed with + * STATUS_ACCESS_DENIED, the caller does not have permission to open the path + * with FILE_READ_ATTRIBUTES access and therefore cannot issue the + * SMB2_OP_QUERY_INFO command. + * + * There is an alternative way to query limited information about the path that + * is still suitable for the stat() syscall: a successful SMB2 OPEN/CREATE + * operation returns a subset of the query information in its response. + * + * So try to open the path without FILE_READ_ATTRIBUTES but with MAXIMUM_ALLOWED + * access, which will grant the maximum possible access to the file, and the + * response will contain the query information required for the stat() syscall. + */ + + if (tcon->posix_extensions) + break; + + num_cmds = 1; + cmds[0] = SMB2_OP_OPEN_QUERY; + in_iov[0].iov_base = data; + in_iov[0].iov_len = sizeof(*data); + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, MAXIMUM_ALLOWED, + FILE_OPEN, create_options, ACL_NO_MODE); + free_rsp_iov(out_iov, out_buftype, ARRAY_SIZE(out_iov)); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, + &oparms, in_iov, cmds, num_cmds, + cfile, out_iov, out_buftype, NULL); + + hdr = out_iov[0].iov_base; + if (!hdr || out_buftype[0] == CIFS_NO_BUFFER) + goto out; + + if (!rc) + rc = parse_create_response(data, cifs_sb, full_path, &out_iov[0]); + break; case -EOPNOTSUPP: /* * BB TODO: When support for special files added to Samba diff --git a/fs/smb/client/smb2ops.c index a700e5921961..41d8cd20b25f 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -464,12 +464,20 @@ smb2_negotiate(const unsigned int xid, server->CurrentMid = 0; spin_unlock(&server->mid_lock); rc = SMB2_negotiate(xid, ses, server); - /* BB we probably don't need to retry with modern servers */ - if (rc == -EAGAIN) - rc = -EHOSTDOWN; return rc; } +static inline unsigned int +prevent_zero_iosize(unsigned int size, const char *type) +{ + if (size == 0) { + cifs_dbg(VFS, "SMB: Zero %ssize calculated, using minimum value %u\n", + type, CIFS_MIN_DEFAULT_IOSIZE); + return CIFS_MIN_DEFAULT_IOSIZE; + } + return size; +} + static unsigned int smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) { @@ -477,12 +485,12 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int wsize; /* start with specified wsize, or default */ - wsize = ctx->wsize ? ctx->wsize : CIFS_DEFAULT_IOSIZE; + wsize = ctx->got_wsize ? ctx->vol_wsize : CIFS_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); - return wsize; + return prevent_zero_iosize(wsize, "w"); } static unsigned int @@ -492,7 +500,7 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int wsize; /* start with specified wsize, or default */ - wsize = ctx->wsize ? ctx->wsize : SMB3_DEFAULT_IOSIZE; + wsize = ctx->got_wsize ? ctx->vol_wsize : SMB3_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); #ifdef CONFIG_CIFS_SMB_DIRECT if (server->rdma) { @@ -514,7 +522,7 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); - return wsize; + return prevent_zero_iosize(wsize, "w"); } static unsigned int @@ -524,13 +532,13 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int rsize; /* start with specified rsize, or default */ - rsize = ctx->rsize ? ctx->rsize : CIFS_DEFAULT_IOSIZE; + rsize = ctx->got_rsize ? 
ctx->vol_rsize : CIFS_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); - return rsize; + return prevent_zero_iosize(rsize, "r"); } static unsigned int @@ -540,7 +548,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int rsize; /* start with specified rsize, or default */ - rsize = ctx->rsize ? ctx->rsize : SMB3_DEFAULT_IOSIZE; + rsize = ctx->got_rsize ? ctx->vol_rsize : SMB3_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); #ifdef CONFIG_CIFS_SMB_DIRECT if (server->rdma) { @@ -563,7 +571,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); - return rsize; + return prevent_zero_iosize(rsize, "r"); } /* @@ -3526,8 +3534,6 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, if (rc == 0) { netfs_resize_file(&cifsi->netfs, new_eof, true); cifs_setsize(inode, new_eof); - cifs_truncate_page(inode->i_mapping, inode->i_size); - truncate_setsize(inode, new_eof); } goto out; } diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 4f69a1825e42..81e05db8e4d5 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -43,6 +43,7 @@ #endif #include "cached_dir.h" #include "compress.h" +#include "fs_context.h" /* * The following table defines the expected "StructureSize" of SMB2 requests @@ -4089,6 +4090,24 @@ smb2_echo_callback(struct mid_q_entry *mid) add_credits(server, &credits, CIFS_ECHO_OP); } +static void cifs_renegotiate_iosize(struct TCP_Server_Info *server, + struct cifs_tcon *tcon) +{ + struct cifs_sb_info *cifs_sb; + + if (server == NULL || tcon == NULL) + return; + + spin_lock(&tcon->sb_list_lock); + list_for_each_entry(cifs_sb, &tcon->cifs_sb_list, tcon_sb_link) { + cifs_sb->ctx->rsize = + server->ops->negotiate_rsize(tcon, cifs_sb->ctx); + cifs_sb->ctx->wsize = + server->ops->negotiate_wsize(tcon, cifs_sb->ctx); + } + spin_unlock(&tcon->sb_list_lock); +} + void smb2_reconnect_server(struct work_struct *work) { struct TCP_Server_Info *server = container_of(work, @@ -4174,9 +4193,10 @@ void smb2_reconnect_server(struct work_struct *work) list_for_each_entry_safe(tcon, tcon2, &tmp_list, rlist) { rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server, true); - if (!rc) + if (!rc) { + cifs_renegotiate_iosize(server, tcon); cifs_reopen_persistent_handles(tcon); - else + } else resched = true; list_del_init(&tcon->rlist); if (tcon->ipc) diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 03434dbe9374..266af17aa7d9 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -894,6 +894,9 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) case MID_SHUTDOWN: rc = -EHOSTDOWN; break; + case MID_RC: + rc = mid->mid_rc; + break; default: if (!(mid->mid_flags & MID_DELETED)) { list_del_init(&mid->qhead); diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c index 7d49f38f01f3..b88fa04f5792 100644 --- a/fs/smb/client/xattr.c +++ b/fs/smb/client/xattr.c @@ -31,6 +31,8 @@ * secure, replaced by SMB2 (then even more highly secure SMB3) many years ago */ #define SMB3_XATTR_CIFS_ACL "system.smb3_acl" /* DACL only */ +#define SMB3_XATTR_CIFS_NTSD_SACL "system.smb3_ntsd_sacl" /* SACL only */ +#define SMB3_XATTR_CIFS_NTSD_OWNER "system.smb3_ntsd_owner" /* owner only */ 
#define SMB3_XATTR_CIFS_NTSD "system.smb3_ntsd" /* owner plus DACL */ #define SMB3_XATTR_CIFS_NTSD_FULL "system.smb3_ntsd_full" /* owner/DACL/SACL */ #define SMB3_XATTR_ATTRIB "smb3.dosattrib" /* full name: user.smb3.dosattrib */ @@ -38,6 +40,7 @@ /* BB need to add server (Samba e.g) support for security and trusted prefix */ enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT, + XATTR_CIFS_NTSD_SACL, XATTR_CIFS_NTSD_OWNER, XATTR_CIFS_NTSD, XATTR_CIFS_NTSD_FULL }; static int cifs_attrib_set(unsigned int xid, struct cifs_tcon *pTcon, @@ -160,6 +163,8 @@ static int cifs_xattr_set(const struct xattr_handler *handler, break; case XATTR_CIFS_ACL: + case XATTR_CIFS_NTSD_SACL: + case XATTR_CIFS_NTSD_OWNER: case XATTR_CIFS_NTSD: case XATTR_CIFS_NTSD_FULL: { struct smb_ntsd *pacl; @@ -187,6 +192,13 @@ static int cifs_xattr_set(const struct xattr_handler *handler, CIFS_ACL_GROUP | CIFS_ACL_DACL); break; + case XATTR_CIFS_NTSD_OWNER: + aclflags = (CIFS_ACL_OWNER | + CIFS_ACL_GROUP); + break; + case XATTR_CIFS_NTSD_SACL: + aclflags = CIFS_ACL_SACL; + break; case XATTR_CIFS_ACL: default: aclflags = CIFS_ACL_DACL; @@ -308,6 +320,8 @@ static int cifs_xattr_get(const struct xattr_handler *handler, break; case XATTR_CIFS_ACL: + case XATTR_CIFS_NTSD_SACL: + case XATTR_CIFS_NTSD_OWNER: case XATTR_CIFS_NTSD: case XATTR_CIFS_NTSD_FULL: { /* @@ -327,6 +341,12 @@ static int cifs_xattr_get(const struct xattr_handler *handler, case XATTR_CIFS_NTSD: extra_info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO; break; + case XATTR_CIFS_NTSD_OWNER: + extra_info = OWNER_SECINFO | GROUP_SECINFO; + break; + case XATTR_CIFS_NTSD_SACL: + extra_info = SACL_SECINFO; + break; case XATTR_CIFS_ACL: default: extra_info = DACL_SECINFO; @@ -448,6 +468,20 @@ static const struct xattr_handler smb3_acl_xattr_handler = { .set = cifs_xattr_set, }; +static const struct xattr_handler smb3_ntsd_sacl_xattr_handler = { + .name = SMB3_XATTR_CIFS_NTSD_SACL, + .flags = XATTR_CIFS_NTSD_SACL, + .get = cifs_xattr_get, + .set = cifs_xattr_set, +}; + +static const struct xattr_handler smb3_ntsd_owner_xattr_handler = { + .name = SMB3_XATTR_CIFS_NTSD_OWNER, + .flags = XATTR_CIFS_NTSD_OWNER, + .get = cifs_xattr_get, + .set = cifs_xattr_set, +}; + static const struct xattr_handler cifs_cifs_ntsd_xattr_handler = { .name = CIFS_XATTR_CIFS_NTSD, .flags = XATTR_CIFS_NTSD, @@ -493,6 +527,8 @@ const struct xattr_handler * const cifs_xattr_handlers[] = { &cifs_os2_xattr_handler, &cifs_cifs_acl_xattr_handler, &smb3_acl_xattr_handler, /* alias for above since avoiding "cifs" */ + &smb3_ntsd_sacl_xattr_handler, + &smb3_ntsd_owner_xattr_handler, &cifs_cifs_ntsd_xattr_handler, &smb3_ntsd_xattr_handler, /* alias for above since avoiding "cifs" */ &cifs_cifs_ntsd_full_xattr_handler,
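The new SACL and owner aliases can be exercised from userspace with the generic xattr syscalls; a minimal sketch (the mount path is hypothetical, and error handling is abbreviated; reading the SACL typically also requires auditing rights on the server):

/* Sketch: read security-descriptor pieces via the new SMB3 xattr aliases. */
#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	char buf[4096];
	ssize_t n;

	/* Owner and group SIDs only (OWNER_SECINFO | GROUP_SECINFO) */
	n = getxattr("/mnt/share/file", "system.smb3_ntsd_owner", buf, sizeof(buf));
	if (n >= 0)
		printf("owner descriptor: %zd bytes\n", n);

	/* SACL only (SACL_SECINFO) */
	n = getxattr("/mnt/share/file", "system.smb3_ntsd_sacl", buf, sizeof(buf));
	if (n >= 0)
		printf("SACL descriptor: %zd bytes\n", n);

	return 0;
}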
diff --git a/fs/smb/common/smb2pdu.h index c7a0efda4403..764dca80c15c 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -95,6 +95,9 @@ */ #define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024) +/* According to the MS-SMB2 specification, the minimum recommended value is 65536. */ +#define CIFS_MIN_DEFAULT_IOSIZE (65536) /* * SMB2 Header Definition * diff --git a/fs/smb/server/auth.c index 00b31cf86462..83caa3849749 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -1016,9 +1016,9 @@ static int ksmbd_get_encryption_key(struct ksmbd_work *work, __u64 ses_id, ses_enc_key = enc ? sess->smb3encryptionkey : sess->smb3decryptionkey; - if (enc) - ksmbd_user_session_get(sess); memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE); + if (!enc) + ksmbd_user_session_put(sess); return 0; } diff --git a/fs/smb/server/connection.h index 91c2318639e7..14620e147dda 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -27,6 +27,7 @@ enum { KSMBD_SESS_EXITING, KSMBD_SESS_NEED_RECONNECT, KSMBD_SESS_NEED_NEGOTIATE, + KSMBD_SESS_NEED_SETUP, KSMBD_SESS_RELEASING }; @@ -187,6 +188,11 @@ static inline bool ksmbd_conn_need_negotiate(struct ksmbd_conn *conn) return READ_ONCE(conn->status) == KSMBD_SESS_NEED_NEGOTIATE; } +static inline bool ksmbd_conn_need_setup(struct ksmbd_conn *conn) +{ + return READ_ONCE(conn->status) == KSMBD_SESS_NEED_SETUP; +} + static inline bool ksmbd_conn_need_reconnect(struct ksmbd_conn *conn) { return READ_ONCE(conn->status) == KSMBD_SESS_NEED_RECONNECT; @@ -217,6 +223,11 @@ static inline void ksmbd_conn_set_need_negotiate(struct ksmbd_conn *conn) WRITE_ONCE(conn->status, KSMBD_SESS_NEED_NEGOTIATE); } +static inline void ksmbd_conn_set_need_setup(struct ksmbd_conn *conn) +{ + WRITE_ONCE(conn->status, KSMBD_SESS_NEED_SETUP); +} + static inline void ksmbd_conn_set_need_reconnect(struct ksmbd_conn *conn) { WRITE_ONCE(conn->status, KSMBD_SESS_NEED_RECONNECT); diff --git a/fs/smb/server/mgmt/user_session.c index 53d308f331af..3f45f28f6f0f 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -181,7 +181,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn) down_write(&sessions_table_lock); down_write(&conn->session_lock); xa_for_each(&conn->sessions, id, sess) { - if (atomic_read(&sess->refcnt) == 0 && + if (atomic_read(&sess->refcnt) <= 1 && (sess->state != SMB2_SESSION_VALID || time_after(jiffies, sess->last_active + SMB2_SESSION_TIMEOUT))) { @@ -233,7 +233,8 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) down_write(&conn->session_lock); xa_erase(&conn->sessions, sess->id); up_write(&conn->session_lock); - ksmbd_session_destroy(sess); + if (atomic_dec_and_test(&sess->refcnt)) + ksmbd_session_destroy(sess); } } } @@ -252,7 +253,8 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) if (xa_empty(&sess->ksmbd_chann_list)) { xa_erase(&conn->sessions, sess->id); hash_del(&sess->hlist); - ksmbd_session_destroy(sess); + if (atomic_dec_and_test(&sess->refcnt)) + ksmbd_session_destroy(sess); } } up_write(&conn->session_lock); @@ -328,8 +330,8 @@ void ksmbd_user_session_put(struct ksmbd_session *sess) if (atomic_read(&sess->refcnt) <= 0) WARN_ON(1); - else - atomic_dec(&sess->refcnt); + else if (atomic_dec_and_test(&sess->refcnt)) + ksmbd_session_destroy(sess); } struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn, @@ -372,13 +374,13 @@ void destroy_previous_session(struct ksmbd_conn *conn, ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_RECONNECT); err = ksmbd_conn_wait_idle_sess_id(conn, id); if (err) { - ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE); + ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_SETUP); goto out; } ksmbd_destroy_file_table(&prev_sess->file_table); prev_sess->state = SMB2_SESSION_EXPIRED; - ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE); + ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_SETUP); ksmbd_launch_ksmbd_durable_scavenger(); out: up_write(&conn->session_lock); @@ -436,7 +438,7 @@ static struct ksmbd_session *__session_create(int protocol)
xa_init(&sess->rpc_handle_list); sess->sequence_number = 1; rwlock_init(&sess->tree_conns_lock); - atomic_set(&sess->refcnt, 1); + atomic_set(&sess->refcnt, 2); ret = __init_smb2_session(sess); if (ret) diff --git a/fs/smb/server/smb2pdu.c index 4ddf4300371b..d24d95d15d87 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1249,7 +1249,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) } conn->srv_sec_mode = le16_to_cpu(rsp->SecurityMode); - ksmbd_conn_set_need_negotiate(conn); + ksmbd_conn_set_need_setup(conn); err_out: ksmbd_conn_unlock(conn); @@ -1271,6 +1271,9 @@ static int alloc_preauth_hash(struct ksmbd_session *sess, if (sess->Preauth_HashValue) return 0; + if (!conn->preauth_info) + return -ENOMEM; + sess->Preauth_HashValue = kmemdup(conn->preauth_info->Preauth_HashValue, PREAUTH_HASHVALUE_SIZE, KSMBD_DEFAULT_GFP); if (!sess->Preauth_HashValue) @@ -1674,6 +1677,11 @@ int smb2_sess_setup(struct ksmbd_work *work) ksmbd_debug(SMB, "Received smb2 session setup request\n"); + if (!ksmbd_conn_need_setup(conn) && !ksmbd_conn_good(conn)) { + work->send_no_response = 1; + return rc; + } + WORK_BUFFERS(work, req, rsp); rsp->StructureSize = cpu_to_le16(9); @@ -1909,7 +1917,7 @@ out_err: if (try_delay) { ksmbd_conn_set_need_reconnect(conn); ssleep(5); - ksmbd_conn_set_need_negotiate(conn); + ksmbd_conn_set_need_setup(conn); } } smb2_set_err_rsp(work); @@ -2235,14 +2243,15 @@ int smb2_session_logoff(struct ksmbd_work *work) return -ENOENT; } - ksmbd_destroy_file_table(&sess->file_table); down_write(&conn->session_lock); sess->state = SMB2_SESSION_EXPIRED; up_write(&conn->session_lock); - ksmbd_free_user(sess->user); - sess->user = NULL; - ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_NEGOTIATE); + if (sess->user) { + ksmbd_free_user(sess->user); + sess->user = NULL; + } + ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_SETUP); rsp->StructureSize = cpu_to_le16(4); err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_logoff_rsp)); diff --git a/fs/smb/server/smbacl.c index 49b128698670..5aa7a66334d9 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -270,6 +270,11 @@ static int sid_to_id(struct mnt_idmap *idmap, return -EIO; } + if (psid->num_subauth == 0) { + pr_err("%s: zero subauthorities!\n", __func__); + return -EIO; + } + if (sidtype == SIDOWNER) { kuid_t uid; uid_t id; @@ -1026,7 +1031,9 @@ int smb_inherit_dacl(struct ksmbd_conn *conn, struct dentry *parent = path->dentry->d_parent; struct mnt_idmap *idmap = mnt_idmap(path->mnt); int inherited_flags = 0, flags = 0, i, nt_size = 0, pdacl_size; - int rc = 0, dacloffset, pntsd_type, pntsd_size, acl_len, aces_size; + int rc = 0, pntsd_type, pntsd_size, acl_len, aces_size; + unsigned int dacloffset; + size_t dacl_struct_end; u16 num_aces, ace_cnt = 0; char *aces_base; bool is_dir = S_ISDIR(d_inode(path->dentry)->i_mode); @@ -1035,8 +1042,11 @@ int smb_inherit_dacl(struct ksmbd_conn *conn, parent, &parent_pntsd); if (pntsd_size <= 0) return -ENOENT; + dacloffset = le32_to_cpu(parent_pntsd->dacloffset); - if (!dacloffset || (dacloffset + sizeof(struct smb_acl) > pntsd_size)) { + if (!dacloffset || + check_add_overflow(dacloffset, sizeof(struct smb_acl), &dacl_struct_end) || + dacl_struct_end > (size_t)pntsd_size) { rc = -EINVAL; goto free_parent_pntsd; }
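The DACL bounds checks in smb_inherit_dacl() above and smb_check_perm_dacl() below use the same overflow-safe idiom; a standalone sketch of it (kernel context assumed; the helper name and the acl_size parameter are illustrative stand-ins for sizeof(struct smb_acl)):

/*
 * Sketch: validate that a header-relative offset plus a structure size
 * fits inside the buffer. check_add_overflow() is the kernel macro from
 * <linux/overflow.h>; it returns true when the addition wraps.
 */
#include <linux/errno.h>
#include <linux/overflow.h>

static int dacl_offset_in_bounds(unsigned int dacloffset,
				 size_t acl_size, size_t pntsd_size)
{
	size_t dacl_struct_end;

	if (!dacloffset ||
	    check_add_overflow(dacloffset, acl_size, &dacl_struct_end) ||
	    dacl_struct_end > pntsd_size)
		return -EINVAL;	/* offset missing, wrapped, or past the buffer */
	return 0;
}

The earlier open-coded form, dacloffset + sizeof(struct smb_acl) > pntsd_size, could wrap around for a large attacker-controlled offset and pass the check; the rewritten form rejects that case explicitly.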
@@ -1240,7 +1250,9 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, struct smb_ntsd *pntsd = NULL; struct smb_acl *pdacl; struct posix_acl *posix_acls; - int rc = 0, pntsd_size, acl_size, aces_size, pdacl_size, dacl_offset; + int rc = 0, pntsd_size, acl_size, aces_size, pdacl_size; + unsigned int dacl_offset; + size_t dacl_struct_end; struct smb_sid sid; int granted = le32_to_cpu(*pdaccess & ~FILE_MAXIMAL_ACCESS_LE); struct smb_ace *ace; @@ -1259,7 +1271,8 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, dacl_offset = le32_to_cpu(pntsd->dacloffset); if (!dacl_offset || - (dacl_offset + sizeof(struct smb_acl) > pntsd_size) + check_add_overflow(dacl_offset, sizeof(struct smb_acl), &dacl_struct_end) || + dacl_struct_end > (size_t)pntsd_size) goto err_out; pdacl = (struct smb_acl *)((char *)pntsd + le32_to_cpu(pntsd->dacloffset)); diff --git a/fs/userfaultfd.c index 97c4d71115d8..d80f94346199 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -396,32 +396,6 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) goto out; /* - * If it's already released don't get it. This avoids to loop - * in __get_user_pages if userfaultfd_release waits on the - * caller of handle_userfault to release the mmap_lock. - */ - if (unlikely(READ_ONCE(ctx->released))) { - /* - * Don't return VM_FAULT_SIGBUS in this case, so a non - * cooperative manager can close the uffd after the - * last UFFDIO_COPY, without risking to trigger an - * involuntary SIGBUS if the process was starting the - * userfaultfd while the userfaultfd was still armed - * (but after the last UFFDIO_COPY). If the uffd - * wasn't already closed when the userfault reached - * this point, that would normally be solved by - * userfaultfd_must_wait returning 'false'. - * - * If we were to return VM_FAULT_SIGBUS here, the non - * cooperative manager would be instead forced to - * always call UFFDIO_UNREGISTER before it can safely - * close the uffd. - */ - ret = VM_FAULT_NOPAGE; - goto out; - } - - /* * Check that we can return VM_FAULT_RETRY. * * NOTE: it should become possible to return VM_FAULT_RETRY @@ -457,6 +431,31 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) goto out; + if (unlikely(READ_ONCE(ctx->released))) { + /* + * If a concurrent release is detected, do not return + * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always + * return VM_FAULT_RETRY with the lock released proactively. + * + * If we were to return VM_FAULT_SIGBUS here, the + * non-cooperative manager would instead be forced to + * always call UFFDIO_UNREGISTER before it can safely + * close the uffd, to avoid triggering an involuntary + * SIGBUS. + * + * If we were to return VM_FAULT_NOPAGE, it would work for + * the fault path, in which the lock will be released + * later. However for GUP, faultin_page() does nothing + * special on NOPAGE, so GUP would spin retrying without + * releasing the mmap read lock, causing a possible livelock. + * + * Here only VM_FAULT_RETRY makes sure the mmap lock is + * released immediately, so that the thread concurrently + * releasing the userfaultfd can always make progress. + */ + release_fault_lock(vmf); + goto out; + } + /* take the reference before dropping the mmap_lock */ userfaultfd_ctx_get(ctx);
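The release path hardened above is reached whenever a registered userfaultfd is closed while faults may still be in flight. A minimal userspace sketch of that lifecycle (error handling abbreviated; on recent kernels creating a uffd may require CAP_SYS_PTRACE or vm.unprivileged_userfaultfd=1):

/* Sketch: minimal userfaultfd register/close lifecycle. */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int uffd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = { .api = UFFD_API };
	void *mem;

	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api))
		return 1;

	mem = mmap(NULL, page, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED)
		return 1;

	struct uffdio_register reg = {
		.range = { .start = (unsigned long)mem, .len = page },
		.mode = UFFDIO_REGISTER_MODE_MISSING,
	};
	if (ioctl(uffd, UFFDIO_REGISTER, &reg))
		return 1;

	/*
	 * Another thread touching 'mem' now would fault into
	 * handle_userfault(). Closing the fd concurrently with such a
	 * fault is the race the VM_FAULT_RETRY change above addresses:
	 * the faulting thread releases the mmap lock and retries instead
	 * of spinning in GUP or raising an involuntary SIGBUS.
	 */
	close(uffd);
	munmap(mem, page);
	return 0;
}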