From 59b35d41942732bfd1973373459c959c53e8099e Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 6 Apr 2025 16:16:32 -0400
Subject: [PATCH] Update bcachefs sources to 8e5380376586 bcachefs: Improve
 opts.degraded

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 .bcachefs_revision                  |   2 +-
 bch_bindgen/src/btree.rs            |  26 +-
 c_src/cmd_device.c                  |   8 +-
 c_src/cmd_dump.c                    |   3 +-
 c_src/cmd_list_journal.c            |   3 +-
 include/linux/moduleparam.h         |   1 +
 include/linux/string_helpers.h      |  11 +
 libbcachefs/acl.c                   |   4 +-
 libbcachefs/alloc_background.c      |  95 ++---
 libbcachefs/alloc_background.h      |   6 +-
 libbcachefs/alloc_foreground.c      | 563 +++++++++++++---------
 libbcachefs/alloc_foreground.h      |  54 ++-
 libbcachefs/alloc_types.h           |  16 -
 libbcachefs/backpointers.c          |  24 +-
 libbcachefs/bcachefs.h              |  13 +-
 libbcachefs/bcachefs_format.h       |  16 +-
 libbcachefs/bkey.h                  |   1 +
 libbcachefs/btree_gc.c              |  11 +-
 libbcachefs/btree_gc.h              |   3 +-
 libbcachefs/btree_io.c              |  19 +-
 libbcachefs/btree_iter.c            | 235 +++++++-----
 libbcachefs/btree_iter.h            | 190 ++++++----
 libbcachefs/btree_key_cache.c       |  32 +-
 libbcachefs/btree_node_scan.c       |   8 +-
 libbcachefs/btree_trans_commit.c    |   4 +-
 libbcachefs/btree_types.h           |  10 +-
 libbcachefs/btree_update.c          |  34 +-
 libbcachefs/btree_update.h          |   4 +-
 libbcachefs/btree_update_interior.c |  12 +-
 libbcachefs/btree_write_buffer.c    |  17 +-
 libbcachefs/btree_write_buffer.h    |   1 +
 libbcachefs/buckets.c               |  16 +-
 libbcachefs/buckets.h               |  21 +-
 libbcachefs/buckets_types.h         |   5 +
 libbcachefs/chardev.c               |  14 +-
 libbcachefs/compress.c              |   9 +-
 libbcachefs/darray.h                |  12 +-
 libbcachefs/data_update.c           |  10 +-
 libbcachefs/debug.c                 |  16 +-
 libbcachefs/dirent.c                |  20 +-
 libbcachefs/disk_accounting.c       |   4 +-
 libbcachefs/disk_groups.c           |   4 +-
 libbcachefs/ec.c                    | 119 +++---
 libbcachefs/ec.h                    |   5 +-
 libbcachefs/errcode.h               |   3 +
 libbcachefs/error.c                 |   7 +-
 libbcachefs/extent_update.c         |   6 +-
 libbcachefs/extents.c               |   6 +-
 libbcachefs/fs-io-buffered.c        |  23 +-
 libbcachefs/fs-io.c                 |  15 +-
 libbcachefs/fs.c                    |  24 +-
 libbcachefs/fsck.c                  |  32 +-
 libbcachefs/inode.c                 |  18 +-
 libbcachefs/io_misc.c               |  18 +-
 libbcachefs/io_read.c               | 152 +++++++-
 libbcachefs/io_read.h               |   5 +-
 libbcachefs/io_write.c              |  48 +--
 libbcachefs/io_write.h              |  28 --
 libbcachefs/io_write_types.h        |  28 ++
 libbcachefs/journal.c               |  37 +-
 libbcachefs/journal.h               |   1 +
 libbcachefs/journal_io.c            |  10 +-
 libbcachefs/journal_types.h         |   2 -
 libbcachefs/migrate.c               |   4 +-
 libbcachefs/move.c                  |  40 +-
 libbcachefs/movinggc.c              |   8 +-
 libbcachefs/namei.c                 |  38 +-
 libbcachefs/nocow_locking.c         |   4 +-
 libbcachefs/nocow_locking.h         |   2 +-
 libbcachefs/opts.c                  |  81 +++-
 libbcachefs/opts.h                  |  21 +-
 libbcachefs/quota.c                 |   2 +-
 libbcachefs/rebalance.c             |  18 +-
 libbcachefs/recovery.c              |   6 +-
 libbcachefs/reflink.c               |  23 +-
 libbcachefs/sb-counters_format.h    |   1 +
 libbcachefs/sb-members.h            |  23 +-
 libbcachefs/snapshot.c              |  28 +-
 libbcachefs/str_hash.c              |   2 +-
 libbcachefs/str_hash.h              |   8 +-
 libbcachefs/subvolume.c             |  13 +-
 libbcachefs/subvolume.h             |  16 +-
 libbcachefs/super-io.c              |  23 +-
 libbcachefs/super.c                 | 252 ++++++------
 libbcachefs/tests.c                 |  30 +-
 libbcachefs/trace.h                 |   5 +
 libbcachefs/util.h                  |   7 +-
 libbcachefs/xattr.c                 |   2 +-
 src/commands/list.rs                |   4 +-
 89 files changed, 1635 insertions(+), 1170 deletions(-)
 create mode 100644 include/linux/moduleparam.h

diff --git a/.bcachefs_revision b/.bcachefs_revision
index d69fbd4d..92176371 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-7fdc3fa3cb5fb561f5945b4de418d48d1a726a8d
+8e5380376586b54782ffc7a4d9cf04eaf5976f85
diff --git
a/bch_bindgen/src/btree.rs b/bch_bindgen/src/btree.rs index 81699429..5c154f4c 100644 --- a/bch_bindgen/src/btree.rs +++ b/bch_bindgen/src/btree.rs @@ -53,7 +53,7 @@ bitflags! { pub struct BtreeIter<'t> { raw: c::btree_iter, - trans: PhantomData<&'t BtreeTrans<'t>>, + trans: &'t BtreeTrans<'t>, } impl<'t> BtreeIter<'t> { @@ -76,14 +76,14 @@ impl<'t> BtreeIter<'t> { BtreeIter { raw: iter.assume_init(), - trans: PhantomData, + trans: trans, } } } pub fn peek_max(&mut self, end: c::bpos) -> Result<Option<BkeySC<'_>>, bch_errcode> { unsafe { - let k = c::bch2_btree_iter_peek_max(&mut self.raw, end); + let k = c::bch2_btree_iter_peek_max(self.trans.raw, &mut self.raw, end); errptr_to_result_c(k.k).map(|_| { if !k.k.is_null() { Some(BkeySC { @@ -104,7 +104,7 @@ impl<'t> BtreeIter<'t> { pub fn peek_and_restart(&mut self) -> Result<Option<BkeySC>, bch_errcode> { unsafe { - let k = c::bch2_btree_iter_peek_and_restart_outlined(&mut self.raw); + let k = c::bch2_btree_iter_peek_and_restart_outlined(self.trans.raw, &mut self.raw); errptr_to_result_c(k.k).map(|_| { if !k.k.is_null() { @@ -122,20 +122,20 @@ impl<'t> BtreeIter<'t> { pub fn advance(&mut self) { unsafe { - c::bch2_btree_iter_advance(&mut self.raw); + c::bch2_btree_iter_advance(self.trans.raw, &mut self.raw); } } } impl Drop for BtreeIter<'_> { fn drop(&mut self) { - unsafe { c::bch2_trans_iter_exit(self.raw.trans, &mut self.raw) } + unsafe { c::bch2_trans_iter_exit(self.trans.raw, &mut self.raw) } } } pub struct BtreeNodeIter<'t> { raw: c::btree_iter, - trans: PhantomData<&'t BtreeTrans<'t>>, + trans: &'t BtreeTrans<'t>, } impl<'t> BtreeNodeIter<'t> { @@ -161,35 +161,35 @@ impl<'t> BtreeNodeIter<'t> { BtreeNodeIter { raw: iter.assume_init(), - trans: PhantomData, + trans: trans, } } } pub fn peek(&mut self) -> Result<Option<&c::btree>, bch_errcode> { unsafe { - let b = c::bch2_btree_iter_peek_node(&mut self.raw); + let b = c::bch2_btree_iter_peek_node(self.trans.raw, &mut self.raw); errptr_to_result_c(b).map(|b| if !b.is_null() { Some(&*b) } else { None }) } } pub fn peek_and_restart(&mut self) -> Result<Option<&c::btree>, bch_errcode> { unsafe { - let b = c::bch2_btree_iter_peek_node_and_restart(&mut self.raw); + let b = c::bch2_btree_iter_peek_node_and_restart(self.trans.raw, &mut self.raw); errptr_to_result_c(b).map(|b| if !b.is_null() { Some(&*b) } else { None }) } } pub fn advance(&mut self) { unsafe { - c::bch2_btree_iter_next_node(&mut self.raw); + c::bch2_btree_iter_next_node(self.trans.raw, &mut self.raw); } } #[allow(clippy::should_implement_trait)] pub fn next(&mut self) -> Result<Option<&c::btree>, bch_errcode> { unsafe { - let b = c::bch2_btree_iter_next_node(&mut self.raw); + let b = c::bch2_btree_iter_next_node(self.trans.raw, &mut self.raw); errptr_to_result_c(b).map(|b| if !b.is_null() { Some(&*b) } else { None }) } } @@ -197,7 +197,7 @@ impl<'t> BtreeNodeIter<'t> { impl Drop for BtreeNodeIter<'_> { fn drop(&mut self) { - unsafe { c::bch2_trans_iter_exit(self.raw.trans, &mut self.raw) } + unsafe { c::bch2_trans_iter_exit(self.trans.raw, &mut self.raw) } } } diff --git a/c_src/cmd_device.c b/c_src/cmd_device.c index d953055c..e6aeb815 100644 --- a/c_src/cmd_device.c +++ b/c_src/cmd_device.c @@ -544,7 +544,7 @@ int cmd_device_resize(int argc, char *argv[]) if (resize) die("confused: more than one online device?"); resize = ca; - percpu_ref_get(&resize->io_ref); + percpu_ref_get(&resize->io_ref[READ]); } u64 nbuckets = size / le16_to_cpu(resize->mi.bucket_size); @@ -557,7 +557,7 @@ int cmd_device_resize(int argc, char *argv[]) 
if (ret) fprintf(stderr, "resize error: %s\n", bch2_err_str(ret)); - percpu_ref_put(&resize->io_ref); + percpu_ref_put(&resize->io_ref[READ]); bch2_fs_stop(c); } return 0; @@ -641,7 +641,7 @@ int cmd_device_resize_journal(int argc, char *argv[]) if (resize) die("confused: more than one online device?"); resize = ca; - percpu_ref_get(&resize->io_ref); + percpu_ref_get(&resize->io_ref[READ]); } u64 nbuckets = size / le16_to_cpu(resize->mi.bucket_size); @@ -651,7 +651,7 @@ int cmd_device_resize_journal(int argc, char *argv[]) if (ret) fprintf(stderr, "resize error: %s\n", bch2_err_str(ret)); - percpu_ref_put(&resize->io_ref); + percpu_ref_put(&resize->io_ref[READ]); bch2_fs_stop(c); } return 0; diff --git a/c_src/cmd_dump.c b/c_src/cmd_dump.c index c9e417f2..65cca53e 100644 --- a/c_src/cmd_dump.c +++ b/c_src/cmd_dump.c @@ -116,8 +116,7 @@ int cmd_dump(int argc, char *argv[]) opt_set(opts, read_only, true); opt_set(opts, nochanges, true); opt_set(opts, norecovery, true); - opt_set(opts, degraded, true); - opt_set(opts, very_degraded, true); + opt_set(opts, degraded, BCH_DEGRADED_very); opt_set(opts, errors, BCH_ON_ERROR_continue); opt_set(opts, fix_errors, FSCK_FIX_no); diff --git a/c_src/cmd_list_journal.c b/c_src/cmd_list_journal.c index 2d364224..3cdf4846 100644 --- a/c_src/cmd_list_journal.c +++ b/c_src/cmd_list_journal.c @@ -281,8 +281,7 @@ int cmd_list_journal(int argc, char *argv[]) opt_set(opts, nochanges, true); opt_set(opts, norecovery, true); opt_set(opts, read_only, true); - opt_set(opts, degraded, true); - opt_set(opts, very_degraded, true); + opt_set(opts, degraded, BCH_DEGRADED_very); opt_set(opts, errors, BCH_ON_ERROR_continue); opt_set(opts, fix_errors, FSCK_FIX_yes); opt_set(opts, retain_recovery_info ,true); diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h new file mode 100644 index 00000000..493b1819 --- /dev/null +++ b/include/linux/moduleparam.h @@ -0,0 +1 @@ +#include <linux/module.h> diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index af587706..d4b2f4d3 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -17,4 +17,15 @@ enum string_size_units { int string_get_size(u64 size, u64 blk_size, enum string_size_units units, char *buf, int len); +static inline void memcpy_and_pad(void *dest, size_t dest_len, const void *src, + size_t count, int pad) +{ + if (dest_len > count) { + memcpy(dest, src, count); + memset(dest + count, pad, dest_len - count); + } else { + memcpy(dest, src, dest_len); + } +} + #endif diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c index 99487727..d03adc36 100644 --- a/libbcachefs/acl.c +++ b/libbcachefs/acl.c @@ -273,7 +273,7 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct posix_acl *acl = NULL; if (rcu) @@ -344,7 +344,7 @@ int bch2_set_acl(struct mnt_idmap *idmap, { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_iter inode_iter = { NULL }; + struct btree_iter inode_iter = {}; struct bch_inode_unpacked inode_u; struct posix_acl *acl; umode_t mode; diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index c12ca753..94ea9e49 100644 --- a/libbcachefs/alloc_background.c +++ 
b/libbcachefs/alloc_background.c @@ -610,7 +610,7 @@ int bch2_alloc_read(struct bch_fs *c) * bch2_check_alloc_key() which runs later: */ if (!ca) { - bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); continue; } @@ -631,17 +631,17 @@ int bch2_alloc_read(struct bch_fs *c) * bch2_check_alloc_key() which runs later: */ if (!ca) { - bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); continue; } if (k.k->p.offset < ca->mi.first_bucket) { - bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket)); + bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode, ca->mi.first_bucket)); continue; } if (k.k->p.offset >= ca->mi.nbuckets) { - bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); continue; } @@ -1039,9 +1039,10 @@ invalid_bucket: * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for * extents style btrees, but works on non-extents btrees: */ -static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) +static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end, struct bkey *hole) { - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); if (bkey_err(k)) return k; @@ -1052,9 +1053,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos struct btree_iter iter2; struct bpos next; - bch2_trans_copy_iter(&iter2, iter); + bch2_trans_copy_iter(trans, &iter2, iter); - struct btree_path *path = btree_iter_path(iter->trans, iter); + struct btree_path *path = btree_iter_path(trans, iter); if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX)) end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p)); @@ -1064,9 +1065,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos * btree node min/max is a closed interval, upto takes a half * open interval: */ - k = bch2_btree_iter_peek_max(&iter2, end); + k = bch2_btree_iter_peek_max(trans, &iter2, end); next = iter2.pos; - bch2_trans_iter_exit(iter->trans, &iter2); + bch2_trans_iter_exit(trans, &iter2); BUG_ON(next.offset >= iter->pos.offset + U32_MAX); @@ -1107,13 +1108,14 @@ static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *buck return *ca != NULL; } -static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, - struct bch_dev **ca, struct bkey *hole) +static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_dev **ca, struct bkey *hole) { - struct bch_fs *c = iter->trans->c; + struct bch_fs *c = trans->c; struct bkey_s_c k; again: - k = bch2_get_key_or_hole(iter, POS_MAX, hole); + k = bch2_get_key_or_hole(trans, iter, POS_MAX, hole); if (bkey_err(k)) return k; @@ -1126,7 +1128,7 @@ again: if (!next_bucket(c, ca, &hole_start)) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, hole_start); + bch2_btree_iter_set_pos(trans, iter, hole_start); goto again; } @@ -1167,8 +1169,8 @@ int bch2_check_alloc_key(struct btree_trans *trans, a = bch2_alloc_to_v4(alloc_k, &a_convert); - bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); - k = bch2_btree_iter_peek_slot(discard_iter); + bch2_btree_iter_set_pos(trans, discard_iter, alloc_k.k->p); + k = bch2_btree_iter_peek_slot(trans, 
discard_iter); ret = bkey_err(k); if (ret) goto err; @@ -1181,8 +1183,8 @@ int bch2_check_alloc_key(struct btree_trans *trans, goto err; } - bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); - k = bch2_btree_iter_peek_slot(freespace_iter); + bch2_btree_iter_set_pos(trans, freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); + k = bch2_btree_iter_peek_slot(trans, freespace_iter); ret = bkey_err(k); if (ret) goto err; @@ -1195,8 +1197,8 @@ int bch2_check_alloc_key(struct btree_trans *trans, goto err; } - bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); - k = bch2_btree_iter_peek_slot(bucket_gens_iter); + bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); + k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter); ret = bkey_err(k); if (ret) goto err; @@ -1249,9 +1251,9 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, if (!ca->mi.freespace_initialized) return 0; - bch2_btree_iter_set_pos(freespace_iter, start); + bch2_btree_iter_set_pos(trans, freespace_iter, start); - k = bch2_btree_iter_peek_slot(freespace_iter); + k = bch2_btree_iter_peek_slot(trans, freespace_iter); ret = bkey_err(k); if (ret) goto err; @@ -1300,9 +1302,9 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, unsigned i, gens_offset, gens_end_offset; int ret; - bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); + bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); - k = bch2_btree_iter_peek_slot(bucket_gens_iter); + k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter); ret = bkey_err(k); if (ret) goto err; @@ -1435,7 +1437,7 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite *gen = a->gen; out: fsck_err: - bch2_set_btree_iter_dontneed(&alloc_iter); + bch2_set_btree_iter_dontneed(trans, &alloc_iter); bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; @@ -1572,7 +1574,7 @@ int bch2_check_alloc_info(struct bch_fs *c) bch2_trans_begin(trans); - k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole); + k = bch2_get_key_or_real_bucket_hole(trans, &iter, &ca, &hole); ret = bkey_err(k); if (ret) goto bkey_err; @@ -1610,7 +1612,7 @@ int bch2_check_alloc_info(struct bch_fs *c) if (ret) goto bkey_err; - bch2_btree_iter_set_pos(&iter, next); + bch2_btree_iter_set_pos(trans, &iter, next); bkey_err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -1638,7 +1640,7 @@ bkey_err: BTREE_ITER_prefetch); while (1) { bch2_trans_begin(trans); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek(trans, &iter); if (!k.k) break; @@ -1657,7 +1659,7 @@ bkey_err: break; } - bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); + bch2_btree_iter_set_pos(trans, &iter, bpos_nosnap_successor(iter.pos)); } bch2_trans_iter_exit(trans, &iter); if (ret) @@ -1685,7 +1687,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret; - alloc_k = bch2_btree_iter_peek(alloc_iter); + alloc_k = bch2_btree_iter_peek(trans, alloc_iter); if (!alloc_k.k) return 0; @@ -1826,7 +1828,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bpos pos = need_discard_iter->pos; - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct bkey_s_c k; struct bkey_i_alloc_v4 *a; struct printbuf buf = PRINTBUF; @@ -1950,7 +1952,7 @@ static void 
bch2_do_discards_work(struct work_struct *work) trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[WRITE]); bch2_write_ref_put(c, BCH_WRITE_REF_discard); } @@ -1967,7 +1969,7 @@ void bch2_dev_do_discards(struct bch_dev *ca) if (queue_work(c->write_ref_wq, &ca->discard_work)) return; - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[WRITE]); put_write_ref: bch2_write_ref_put(c, BCH_WRITE_REF_discard); } @@ -2045,7 +2047,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work) trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); bch2_trans_put(trans); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[WRITE]); bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } @@ -2065,7 +2067,7 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) return; - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[WRITE]); put_ref: bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } @@ -2082,6 +2084,9 @@ static int invalidate_one_bp(struct btree_trans *trans, if (ret) return ret; + if (!extent_k.k) + return 0; + struct bkey_i *n = bch2_bkey_make_mut(trans, &extent_iter, &extent_k, BTREE_UPDATE_internal_snapshot_node); @@ -2199,9 +2204,9 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter { struct bkey_s_c k; again: - k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); + k = bch2_btree_iter_peek_max(trans, iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); if (!k.k && !*wrapped) { - bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0)); + bch2_btree_iter_set_pos(trans, iter, lru_pos(ca->dev_idx, 0, 0)); *wrapped = true; goto again; } @@ -2251,12 +2256,12 @@ restart_err: if (ret) break; - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); } bch2_trans_iter_exit(trans, &iter); err: bch2_trans_put(trans); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[WRITE]); bch2_bkey_buf_exit(&last_flushed, c); bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } @@ -2274,7 +2279,7 @@ void bch2_dev_do_invalidates(struct bch_dev *ca) if (queue_work(c->write_ref_wq, &ca->invalidate_work)) return; - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[WRITE]); put_ref: bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } @@ -2321,7 +2326,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, break; } - k = bch2_get_key_or_hole(&iter, end, &hole); + k = bch2_get_key_or_hole(trans, &iter, end, &hole); ret = bkey_err(k); if (ret) goto bkey_err; @@ -2340,7 +2345,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, if (ret) goto bkey_err; - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); } else { struct bkey_i *freespace; @@ -2360,7 +2365,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, if (ret) goto bkey_err; - bch2_btree_iter_set_pos(&iter, k.k->p); + bch2_btree_iter_set_pos(trans, &iter, k.k->p); } bkey_err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -2506,7 +2511,7 @@ void bch2_recalc_capacity(struct bch_fs *c) bch2_set_ra_pages(c, ra_pages); - for_each_rw_member(c, ca) { + __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) { u64 dev_reserve = 0; /* diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index c556ccaf..34b3d6ac 100644 
--- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -321,11 +321,11 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca, { u64 want_free = ca->mi.nbuckets >> 7; u64 free = max_t(s64, 0, - u.d[BCH_DATA_free].buckets - + u.d[BCH_DATA_need_discard].buckets + u.buckets[BCH_DATA_free] + + u.buckets[BCH_DATA_need_discard] - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe)); - return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); + return clamp_t(s64, want_free - free, 0, u.buckets[BCH_DATA_cached]); } void bch2_dev_do_invalidates(struct bch_dev *); diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index 0cac6534..ae7eb523 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -180,11 +180,11 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) } static inline bool may_alloc_bucket(struct bch_fs *c, - struct bpos bucket, - struct bucket_alloc_state *s) + struct alloc_request *req, + struct bpos bucket) { if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) { - s->skipped_open++; + req->counters.skipped_open++; return false; } @@ -193,36 +193,37 @@ static inline bool may_alloc_bucket(struct bch_fs *c, bucket.inode, bucket.offset); if (journal_seq_ready > c->journal.flushed_seq_ondisk) { if (journal_seq_ready > c->journal.flushing_seq) - s->need_journal_commit++; - s->skipped_need_journal_commit++; + req->counters.need_journal_commit++; + req->counters.skipped_need_journal_commit++; return false; } if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) { - s->skipped_nocow++; + req->counters.skipped_nocow++; return false; } return true; } -static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, +static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, + struct alloc_request *req, u64 bucket, u8 gen, - enum bch_watermark watermark, - struct bucket_alloc_state *s, struct closure *cl) { + struct bch_dev *ca = req->ca; + if (unlikely(is_superblock_bucket(c, ca, bucket))) return NULL; if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { - s->skipped_nouse++; + req->counters.skipped_nouse++; return NULL; } spin_lock(&c->freelist_lock); - if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) { + if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) { if (cl) closure_wait(&c->open_buckets_wait, cl); @@ -234,7 +235,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * /* Recheck under lock: */ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { spin_unlock(&c->freelist_lock); - s->skipped_open++; + req->counters.skipped_open++; return NULL; } @@ -258,16 +259,15 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * return ob; } -static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, - enum bch_watermark watermark, - struct bucket_alloc_state *s, +static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, + struct alloc_request *req, struct btree_iter *freespace_iter, struct closure *cl) { struct bch_fs *c = trans->c; u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); - if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s)) + if (!may_alloc_bucket(c, req, POS(req->ca->dev_idx, b))) return NULL; u8 gen; @@ -277,7 +277,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc if (ret) return NULL; - return 
__try_alloc_bucket(c, ca, b, gen, watermark, s, cl); + return __try_alloc_bucket(c, req, b, gen, cl); } /* @@ -285,17 +285,16 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc */ static noinline struct open_bucket * bch2_bucket_alloc_early(struct btree_trans *trans, - struct bch_dev *ca, - enum bch_watermark watermark, - struct bucket_alloc_state *s, + struct alloc_request *req, struct closure *cl) { struct bch_fs *c = trans->c; + struct bch_dev *ca = req->ca; struct btree_iter iter, citer; struct bkey_s_c k, ck; struct open_bucket *ob = NULL; u64 first_bucket = ca->mi.first_bucket; - u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; + u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap]; u64 alloc_start = max(first_bucket, *dev_alloc_cursor); u64 alloc_cursor = alloc_start; int ret; @@ -317,19 +316,19 @@ again: if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) break; - if (s->btree_bitmap != BTREE_BITMAP_ANY && - s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + if (req->btree_bitmap != BTREE_BITMAP_ANY && + req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { - if (s->btree_bitmap == BTREE_BITMAP_YES && + if (req->btree_bitmap == BTREE_BITMAP_YES && bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) break; bucket = sector_to_bucket(ca, round_up(bucket_to_sector(ca, bucket) + 1, 1ULL << ca->mi.btree_bitmap_shift)); - bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket)); - s->buckets_seen++; - s->skipped_mi_btree_bitmap++; + bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket)); + req->counters.buckets_seen++; + req->counters.skipped_mi_btree_bitmap++; continue; } @@ -348,14 +347,13 @@ again: if (a->data_type != BCH_DATA_free) goto next; - s->buckets_seen++; + req->counters.buckets_seen++; - ob = may_alloc_bucket(c, k.k->p, s) - ? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen, - watermark, s, cl) + ob = may_alloc_bucket(c, req, k.k->p) + ? 
__try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl) : NULL; next: - bch2_set_btree_iter_dontneed(&citer); + bch2_set_btree_iter_dontneed(trans, &citer); bch2_trans_iter_exit(trans, &citer); if (ob) break; @@ -378,15 +376,14 @@ next: } static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, - struct bch_dev *ca, - enum bch_watermark watermark, - struct bucket_alloc_state *s, - struct closure *cl) + struct alloc_request *req, + struct closure *cl) { + struct bch_dev *ca = req->ca; struct btree_iter iter; struct bkey_s_c k; struct open_bucket *ob = NULL; - u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; + u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap]; u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); u64 alloc_cursor = alloc_start; int ret; @@ -402,13 +399,13 @@ again: iter.k.size = iter.k.p.offset - iter.pos.offset; while (iter.k.size) { - s->buckets_seen++; + req->counters.buckets_seen++; u64 bucket = iter.pos.offset & ~(~0ULL << 56); - if (s->btree_bitmap != BTREE_BITMAP_ANY && - s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + if (req->btree_bitmap != BTREE_BITMAP_ANY && + req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { - if (s->btree_bitmap == BTREE_BITMAP_YES && + if (req->btree_bitmap == BTREE_BITMAP_YES && bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) goto fail; @@ -417,16 +414,16 @@ again: 1ULL << ca->mi.btree_bitmap_shift)); alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56)); - bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor)); - s->skipped_mi_btree_bitmap++; + bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor)); + req->counters.skipped_mi_btree_bitmap++; goto next; } - ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl); + ob = try_alloc_bucket(trans, req, &iter, cl); if (ob) { if (!IS_ERR(ob)) *dev_alloc_cursor = iter.pos.offset; - bch2_set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(trans, &iter); break; } @@ -453,33 +450,30 @@ fail: return ob; } -static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, - enum bch_watermark watermark, - enum bch_data_type data_type, +static noinline void trace_bucket_alloc2(struct bch_fs *c, + struct alloc_request *req, struct closure *cl, - struct bch_dev_usage *usage, - struct bucket_alloc_state *s, struct open_bucket *ob) { struct printbuf buf = PRINTBUF; printbuf_tabstop_push(&buf, 24); - prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx); - prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]); - prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]); + prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx); + prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]); + prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]); prt_printf(&buf, "blocking\t%u\n", cl != NULL); - prt_printf(&buf, "free\t%llu\n", usage->d[BCH_DATA_free].buckets); - prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark)); + prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]); + prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark)); prt_printf(&buf, "copygc_wait\t%lu/%lli\n", bch2_copygc_wait_amount(c), c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)); - prt_printf(&buf, "seen\t%llu\n", s->buckets_seen); - prt_printf(&buf, "open\t%llu\n", s->skipped_open); - 
prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit); - prt_printf(&buf, "nocow\t%llu\n", s->skipped_nocow); - prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse); - prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap); + prt_printf(&buf, "seen\t%llu\n", req->counters.buckets_seen); + prt_printf(&buf, "open\t%llu\n", req->counters.skipped_open); + prt_printf(&buf, "need journal commit\t%llu\n", req->counters.skipped_need_journal_commit); + prt_printf(&buf, "nocow\t%llu\n", req->counters.skipped_nocow); + prt_printf(&buf, "nouse\t%llu\n", req->counters.skipped_nouse); + prt_printf(&buf, "mi_btree_bitmap\t%llu\n", req->counters.skipped_mi_btree_bitmap); if (!IS_ERR(ob)) { prt_printf(&buf, "allocated\t%llu\n", ob->bucket); @@ -495,46 +489,41 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, /** * bch2_bucket_alloc_trans - allocate a single bucket from a specific device * @trans: transaction object - * @ca: device to allocate from - * @watermark: how important is this allocation? - * @data_type: BCH_DATA_journal, btree, user... + * @req: state for the entire allocation * @cl: if not NULL, closure to be used to wait if buckets not available * @nowait: if true, do not wait for buckets to become available - * @usage: for secondarily also returning the current device usage * * Returns: an open_bucket on success, or an ERR_PTR() on failure. */ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, - struct bch_dev *ca, - enum bch_watermark watermark, - enum bch_data_type data_type, - struct closure *cl, - bool nowait, - struct bch_dev_usage *usage) + struct alloc_request *req, + struct closure *cl, + bool nowait) { struct bch_fs *c = trans->c; + struct bch_dev *ca = req->ca; struct open_bucket *ob = NULL; bool freespace = READ_ONCE(ca->mi.freespace_initialized); u64 avail; - struct bucket_alloc_state s = { - .btree_bitmap = data_type == BCH_DATA_btree, - }; bool waiting = nowait; -again: - bch2_dev_usage_read_fast(ca, usage); - avail = dev_buckets_free(ca, *usage, watermark); - if (usage->d[BCH_DATA_need_discard].buckets > avail) + req->btree_bitmap = req->data_type == BCH_DATA_btree; + memset(&req->counters, 0, sizeof(req->counters)); +again: + bch2_dev_usage_read_fast(ca, &req->usage); + avail = dev_buckets_free(ca, req->usage, req->watermark); + + if (req->usage.buckets[BCH_DATA_need_discard] > avail) bch2_dev_do_discards(ca); - if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) + if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail) bch2_gc_gens_async(c); - if (should_invalidate_buckets(ca, *usage)) + if (should_invalidate_buckets(ca, req->usage)) bch2_dev_do_invalidates(ca); if (!avail) { - if (watermark > BCH_WATERMARK_normal && + if (req->watermark > BCH_WATERMARK_normal && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) goto alloc; @@ -554,14 +543,14 @@ again: closure_wake_up(&c->freelist_wait); alloc: ob = likely(freespace) - ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl) - : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl); + ? 
bch2_bucket_alloc_freelist(trans, req, cl) + : bch2_bucket_alloc_early(trans, req, cl); - if (s.need_journal_commit * 2 > avail) + if (req->counters.need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); - if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) { - s.btree_bitmap = BTREE_BITMAP_ANY; + if (!ob && req->btree_bitmap != BTREE_BITMAP_ANY) { + req->btree_bitmap = BTREE_BITMAP_ANY; goto alloc; } @@ -574,7 +563,7 @@ err: ob = ERR_PTR(-BCH_ERR_no_buckets_found); if (!IS_ERR(ob)) - ob->data_type = data_type; + ob->data_type = req->data_type; if (!IS_ERR(ob)) count_event(c, bucket_alloc); @@ -584,7 +573,7 @@ err: if (!IS_ERR(ob) ? trace_bucket_alloc_enabled() : trace_bucket_alloc_fail_enabled()) - trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob); + trace_bucket_alloc2(c, req, cl, ob); return ob; } @@ -594,20 +583,22 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, enum bch_data_type data_type, struct closure *cl) { - struct bch_dev_usage usage; struct open_bucket *ob; + struct alloc_request req = { + .watermark = watermark, + .data_type = data_type, + .ca = ca, + }; bch2_trans_do(c, - PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark, - data_type, cl, false, &usage))); + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false))); return ob; } static int __dev_stripe_cmp(struct dev_stripe_state *stripe, unsigned l, unsigned r) { - return ((stripe->next_alloc[l] > stripe->next_alloc[r]) - - (stripe->next_alloc[l] < stripe->next_alloc[r])); + return cmp_int(stripe->next_alloc[l], stripe->next_alloc[r]); } #define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) @@ -626,25 +617,62 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, return ret; } +static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */ +static const u64 stripe_clock_hand_max = 1ULL << 56; /* max after rescale */ +static const u64 stripe_clock_hand_inv = 1ULL << 52; /* max increment, if a device is empty */ + +static noinline void bch2_stripe_state_rescale(struct dev_stripe_state *stripe) +{ + /* + * Avoid underflowing clock hands if at all possible, if clock hands go + * to 0 then we lose information - clock hands can be in a wide range if + * we have devices we rarely try to allocate from, if we generally + * allocate from a specified target but only sometimes have to fall back + * to the whole filesystem. + */ + u64 scale_max = U64_MAX; /* maximum we can subtract without underflow */ + u64 scale_min = 0; /* minimum we must subtract to avoid overflow */ + + for (u64 *v = stripe->next_alloc; + v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) { + if (*v) + scale_max = min(scale_max, *v); + if (*v > stripe_clock_hand_max) + scale_min = max(scale_min, *v - stripe_clock_hand_max); + } + + u64 scale = max(scale_min, scale_max); + + for (u64 *v = stripe->next_alloc; + v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) + *v = *v < scale ? 0 : *v - scale; +} + static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca, struct dev_stripe_state *stripe, struct bch_dev_usage *usage) { + /* + * Stripe state has a per device clock hand: we allocate from the device + * with the smallest clock hand. + * + * When we allocate, we don't do a simple increment; we add the inverse + * of the device's free space. This results in round robin behavior that + * biases in favor of the device(s) with more free space.
+ */ + u64 *v = stripe->next_alloc + ca->dev_idx; u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal); u64 free_space_inv = free_space - ? div64_u64(1ULL << 48, free_space) - : 1ULL << 48; - u64 scale = *v / 4; + ? div64_u64(stripe_clock_hand_inv, free_space) + : stripe_clock_hand_inv; - if (*v + free_space_inv >= *v) - *v += free_space_inv; - else - *v = U64_MAX; + /* Saturating add, avoid overflow: */ + u64 sum = *v + free_space_inv; + *v = sum >= *v ? sum : U64_MAX; - for (v = stripe->next_alloc; - v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) - *v = *v < scale ? 0 : *v - scale; + if (unlikely(*v > stripe_clock_hand_rescale)) + bch2_stripe_state_rescale(stripe); } void bch2_dev_stripe_increment(struct bch_dev *ca, @@ -657,24 +685,20 @@ void bch2_dev_stripe_increment(struct bch_dev *ca, } static int add_new_bucket(struct bch_fs *c, - struct open_buckets *ptrs, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - struct open_bucket *ob) + struct alloc_request *req, + struct open_bucket *ob) { unsigned durability = ob_dev(c, ob)->mi.durability; - BUG_ON(*nr_effective >= nr_replicas); + BUG_ON(req->nr_effective >= req->nr_replicas); - __clear_bit(ob->dev, devs_may_alloc->d); - *nr_effective += durability; - *have_cache |= !durability; + __clear_bit(ob->dev, req->devs_may_alloc.d); + req->nr_effective += durability; + req->have_cache |= !durability; - ob_push(c, ptrs, ob); + ob_push(c, &req->ptrs, ob); - if (*nr_effective >= nr_replicas) + if (req->nr_effective >= req->nr_replicas) return 1; if (ob->ec) return 1; @@ -682,39 +706,31 @@ static int add_new_bucket(struct bch_fs *c, } int bch2_bucket_alloc_set_trans(struct btree_trans *trans, - struct open_buckets *ptrs, - struct dev_stripe_state *stripe, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_write_flags flags, - enum bch_data_type data_type, - enum bch_watermark watermark, - struct closure *cl) + struct alloc_request *req, + struct dev_stripe_state *stripe, + struct closure *cl) { struct bch_fs *c = trans->c; int ret = -BCH_ERR_insufficient_devices; - BUG_ON(*nr_effective >= nr_replicas); + BUG_ON(req->nr_effective >= req->nr_replicas); - struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); + struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc); darray_for_each(devs_sorted, i) { - struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i); - if (!ca) + req->ca = bch2_dev_tryget_noerror(c, *i); + if (!req->ca) continue; - if (!ca->mi.durability && *have_cache) { - bch2_dev_put(ca); + if (!req->ca->mi.durability && req->have_cache) { + bch2_dev_put(req->ca); continue; } - struct bch_dev_usage usage; - struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, - cl, flags & BCH_WRITE_alloc_nowait, &usage); + struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl, + req->flags & BCH_WRITE_alloc_nowait); if (!IS_ERR(ob)) - bch2_dev_stripe_increment_inlined(ca, stripe, &usage); - bch2_dev_put(ca); + bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage); + bch2_dev_put(req->ca); if (IS_ERR(ob)) { ret = PTR_ERR(ob); @@ -723,9 +739,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - if (add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob)) { + if (add_new_bucket(c, req, ob)) { ret = 0; break; } @@ -743,34 +757,27 @@ 
int bch2_bucket_alloc_set_trans(struct btree_trans *trans, */ static int bucket_alloc_from_stripe(struct btree_trans *trans, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - u16 target, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *cl) + struct alloc_request *req, + struct closure *cl) { struct bch_fs *c = trans->c; int ret = 0; - if (nr_replicas < 2) + if (req->nr_replicas < 2) return 0; - if (ec_open_bucket(c, ptrs)) + if (ec_open_bucket(c, &req->ptrs)) return 0; struct ec_stripe_head *h = - bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); + bch2_ec_stripe_head_get(trans, req, 0, cl); if (IS_ERR(h)) return PTR_ERR(h); if (!h) return 0; - struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); + struct dev_alloc_list devs_sorted = + bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc); darray_for_each(devs_sorted, i) for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { if (!h->s->blocks[ec_idx]) @@ -782,9 +789,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, ob->ec = h->s; ec_stripe_new_get(h->s, STRIPE_REF_io); - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); + ret = add_new_bucket(c, req, ob); goto out; } } @@ -796,65 +801,49 @@ out: /* Sector allocator */ static bool want_bucket(struct bch_fs *c, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - bool *have_cache, bool ec, + struct alloc_request *req, struct open_bucket *ob) { struct bch_dev *ca = ob_dev(c, ob); - if (!test_bit(ob->dev, devs_may_alloc->d)) + if (!test_bit(ob->dev, req->devs_may_alloc.d)) return false; - if (ob->data_type != wp->data_type) + if (ob->data_type != req->wp->data_type) return false; if (!ca->mi.durability && - (wp->data_type == BCH_DATA_btree || ec || *have_cache)) + (req->wp->data_type == BCH_DATA_btree || req->ec || req->have_cache)) return false; - if (ec != (ob->ec != NULL)) + if (req->ec != (ob->ec != NULL)) return false; return true; } static int bucket_alloc_set_writepoint(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - bool ec) + struct alloc_request *req) { - struct open_buckets ptrs_skip = { .nr = 0 }; struct open_bucket *ob; unsigned i; int ret = 0; - open_bucket_for_each(c, &wp->ptrs, ob, i) { - if (!ret && want_bucket(c, wp, devs_may_alloc, - have_cache, ec, ob)) - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); + req->scratch_ptrs.nr = 0; + + open_bucket_for_each(c, &req->wp->ptrs, ob, i) { + if (!ret && want_bucket(c, req, ob)) + ret = add_new_bucket(c, req, ob); else - ob_push(c, &ptrs_skip, ob); + ob_push(c, &req->scratch_ptrs, ob); } - wp->ptrs = ptrs_skip; + req->wp->ptrs = req->scratch_ptrs; return ret; } static int bucket_alloc_set_partial(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, bool ec, - enum bch_watermark watermark) + struct alloc_request *req) { int i, ret = 0; @@ -869,13 +858,12 @@ static int bucket_alloc_set_partial(struct bch_fs *c, for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) { struct open_bucket *ob = c->open_buckets + 
c->open_buckets_partial[i]; - if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) { + if (want_bucket(c, req, ob)) { struct bch_dev *ca = ob_dev(c, ob); - struct bch_dev_usage usage; u64 avail; - bch2_dev_usage_read_fast(ca, &usage); - avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets; + bch2_dev_usage_read_fast(ca, &req->usage); + avail = dev_buckets_free(ca, req->usage, req->watermark) + ca->nr_partial_buckets; if (!avail) continue; @@ -888,9 +876,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c, bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; rcu_read_unlock(); - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); + ret = add_new_bucket(c, req, ob); if (ret) break; } @@ -901,61 +887,41 @@ unlock: } static int __open_bucket_add_buckets(struct btree_trans *trans, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_list *devs_have, - u16 target, - bool erasure_code, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *_cl) + struct alloc_request *req, + struct closure *_cl) { struct bch_fs *c = trans->c; - struct bch_devs_mask devs; struct open_bucket *ob; struct closure *cl = NULL; unsigned i; int ret; - devs = target_rw_devs(c, wp->data_type, target); + req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target); /* Don't allocate from devices we already have pointers to: */ - darray_for_each(*devs_have, i) - __clear_bit(*i, devs.d); + darray_for_each(*req->devs_have, i) + __clear_bit(*i, req->devs_may_alloc.d); - open_bucket_for_each(c, ptrs, ob, i) - __clear_bit(ob->dev, devs.d); + open_bucket_for_each(c, &req->ptrs, ob, i) + __clear_bit(ob->dev, req->devs_may_alloc.d); - ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs, - nr_replicas, nr_effective, - have_cache, erasure_code); + ret = bucket_alloc_set_writepoint(c, req); if (ret) return ret; - ret = bucket_alloc_set_partial(c, ptrs, wp, &devs, - nr_replicas, nr_effective, - have_cache, erasure_code, watermark); + ret = bucket_alloc_set_partial(c, req); if (ret) return ret; - if (erasure_code) { - ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs, - target, - nr_replicas, nr_effective, - have_cache, - watermark, flags, _cl); + if (req->ec) { + ret = bucket_alloc_from_stripe(trans, req, _cl); } else { retry_blocking: /* * Try nonblocking first, so that if one device is full we'll try from * other devices: */ - ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, - nr_replicas, nr_effective, have_cache, - flags, wp->data_type, watermark, cl); + ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && @@ -969,38 +935,27 @@ retry_blocking: } static int open_bucket_add_buckets(struct btree_trans *trans, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_list *devs_have, - u16 target, - unsigned erasure_code, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *cl) + struct alloc_request *req, + struct closure *cl) { int ret; - if (erasure_code && !ec_open_bucket(trans->c, ptrs)) { - ret = __open_bucket_add_buckets(trans, ptrs, wp, - devs_have, target, erasure_code, - nr_replicas, nr_effective, have_cache, - watermark, flags, cl); + if (req->ec && 
!ec_open_bucket(trans->c, &req->ptrs)) { + ret = __open_bucket_add_buckets(trans, req, cl); if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || bch2_err_matches(ret, BCH_ERR_operation_blocked) || bch2_err_matches(ret, BCH_ERR_freelist_empty) || bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) return ret; - if (*nr_effective >= nr_replicas) + if (req->nr_effective >= req->nr_replicas) return 0; } - ret = __open_bucket_add_buckets(trans, ptrs, wp, - devs_have, target, false, - nr_replicas, nr_effective, have_cache, - watermark, flags, cl); + bool ec = false; + swap(ec, req->ec); + ret = __open_bucket_add_buckets(trans, req, cl); + swap(ec, req->ec); + return ret < 0 ? ret : 0; } @@ -1253,26 +1208,26 @@ out: static noinline void deallocate_extra_replicas(struct bch_fs *c, - struct open_buckets *ptrs, - struct open_buckets *ptrs_no_use, - unsigned extra_replicas) + struct alloc_request *req) { - struct open_buckets ptrs2 = { 0 }; struct open_bucket *ob; + unsigned extra_replicas = req->nr_effective - req->nr_replicas; unsigned i; - open_bucket_for_each(c, ptrs, ob, i) { + req->scratch_ptrs.nr = 0; + + open_bucket_for_each(c, &req->ptrs, ob, i) { unsigned d = ob_dev(c, ob)->mi.durability; if (d && d <= extra_replicas) { extra_replicas -= d; - ob_push(c, ptrs_no_use, ob); + ob_push(c, &req->wp->ptrs, ob); } else { - ob_push(c, &ptrs2, ob); + ob_push(c, &req->scratch_ptrs, ob); } } - *ptrs = ptrs2; + req->ptrs = req->scratch_ptrs; } /* @@ -1291,51 +1246,53 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, struct write_point **wp_ret) { struct bch_fs *c = trans->c; - struct write_point *wp; struct open_bucket *ob; - struct open_buckets ptrs; - unsigned nr_effective, write_points_nr; - bool have_cache; - int ret; + unsigned write_points_nr; int i; + struct alloc_request *req = bch2_trans_kmalloc_nomemzero(trans, sizeof(*req)); + int ret = PTR_ERR_OR_ZERO(req); + if (unlikely(ret)) + return ret; + + req->nr_replicas = nr_replicas; + req->target = target; + req->ec = erasure_code; + req->watermark = watermark; + req->flags = flags; + req->devs_have = devs_have; + if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) erasure_code = false; BUG_ON(!nr_replicas || !nr_replicas_required); retry: - ptrs.nr = 0; - nr_effective = 0; - write_points_nr = c->write_points_nr; - have_cache = false; + req->ptrs.nr = 0; + req->nr_effective = 0; + req->have_cache = false; + write_points_nr = c->write_points_nr; - *wp_ret = wp = writepoint_find(trans, write_point.v); + *wp_ret = req->wp = writepoint_find(trans, write_point.v); + + req->data_type = req->wp->data_type; ret = bch2_trans_relock(trans); if (ret) goto err; /* metadata may not allocate on cache devices: */ - if (wp->data_type != BCH_DATA_user) - have_cache = true; + if (req->data_type != BCH_DATA_user) + req->have_cache = true; if (target && !(flags & BCH_WRITE_only_specified_devs)) { - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, NULL); + ret = open_bucket_add_buckets(trans, req, NULL); if (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto alloc_done; /* Don't retry from all devices if we're out of open buckets: */ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) { - int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, cl); + int ret2 = open_bucket_add_buckets(trans, req, cl); if (!ret2 || bch2_err_matches(ret2, 
BCH_ERR_transaction_restart) || bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) { @@ -1348,59 +1305,52 @@ retry: * Only try to allocate cache (durability = 0 devices) from the * specified target: */ - have_cache = true; + req->have_cache = true; + req->target = 0; - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - 0, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, cl); + ret = open_bucket_add_buckets(trans, req, cl); } else { - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, cl); + ret = open_bucket_add_buckets(trans, req, cl); } alloc_done: - BUG_ON(!ret && nr_effective < nr_replicas); + BUG_ON(!ret && req->nr_effective < req->nr_replicas); - if (erasure_code && !ec_open_bucket(c, &ptrs)) + if (erasure_code && !ec_open_bucket(c, &req->ptrs)) pr_debug("failed to get ec bucket: ret %u", ret); if (ret == -BCH_ERR_insufficient_devices && - nr_effective >= nr_replicas_required) + req->nr_effective >= nr_replicas_required) ret = 0; if (ret) goto err; - if (nr_effective > nr_replicas) - deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas); + if (req->nr_effective > req->nr_replicas) + deallocate_extra_replicas(c, req); /* Free buckets we didn't use: */ - open_bucket_for_each(c, &wp->ptrs, ob, i) + open_bucket_for_each(c, &req->wp->ptrs, ob, i) open_bucket_free_unused(c, ob); - wp->ptrs = ptrs; + req->wp->ptrs = req->ptrs; - wp->sectors_free = UINT_MAX; + req->wp->sectors_free = UINT_MAX; - open_bucket_for_each(c, &wp->ptrs, ob, i) - wp->sectors_free = min(wp->sectors_free, ob->sectors_free); + open_bucket_for_each(c, &req->wp->ptrs, ob, i) + req->wp->sectors_free = min(req->wp->sectors_free, ob->sectors_free); - BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); + BUG_ON(!req->wp->sectors_free || req->wp->sectors_free == UINT_MAX); return 0; err: - open_bucket_for_each(c, &wp->ptrs, ob, i) - if (ptrs.nr < ARRAY_SIZE(ptrs.v)) - ob_push(c, &ptrs, ob); + open_bucket_for_each(c, &req->wp->ptrs, ob, i) + if (req->ptrs.nr < ARRAY_SIZE(req->ptrs.v)) + ob_push(c, &req->ptrs, ob); else open_bucket_free_unused(c, ob); - wp->ptrs = ptrs; + req->wp->ptrs = req->ptrs; - mutex_unlock(&wp->lock); + mutex_unlock(&req->wp->lock); if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && try_decrease_writepoints(trans, write_points_nr)) @@ -1560,7 +1510,7 @@ static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, unsigned i; prt_printf(out, "%lu: ", wp->write_point); - prt_human_readable_u64(out, wp->sectors_allocated); + prt_human_readable_u64(out, wp->sectors_allocated << 9); prt_printf(out, " last wrote: "); bch2_pr_time_units(out, sched_clock() - wp->last_used); @@ -1633,7 +1583,7 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) { struct bch_fs *c = ca->fs; - struct bch_dev_usage stats = bch2_dev_usage_read(ca); + struct bch_dev_usage_full stats = bch2_dev_usage_full_read(ca); unsigned nr[BCH_DATA_NR]; memset(nr, 0, sizeof(nr)); @@ -1656,7 +1606,8 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) printbuf_tabstop_push(out, 16); prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets); - prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats)); + prt_printf(out, "buckets to invalidate\t%llu\r\n", + should_invalidate_buckets(ca, bch2_dev_usage_read(ca))); } 
static noinline void bch2_print_allocator_stuck(struct bch_fs *c) diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h index 69ec6a01..64e1f1ef 100644 --- a/libbcachefs/alloc_foreground.h +++ b/libbcachefs/alloc_foreground.h @@ -5,6 +5,7 @@ #include "bcachefs.h" #include "alloc_types.h" #include "extents.h" +#include "io_write_types.h" #include "sb-members.h" #include <linux/hash.h> @@ -23,6 +24,52 @@ struct dev_alloc_list { u8 data[BCH_SB_MEMBERS_MAX]; }; +struct alloc_request { + unsigned nr_replicas; + unsigned target; + bool ec; + enum bch_watermark watermark; + enum bch_write_flags flags; + enum bch_data_type data_type; + struct bch_devs_list *devs_have; + struct write_point *wp; + + /* These fields are used primarily by open_bucket_add_buckets */ + struct open_buckets ptrs; + unsigned nr_effective; /* sum of @ptrs durability */ + bool have_cache; /* have we allocated from a 0 durability dev */ + struct bch_devs_mask devs_may_alloc; + + /* bch2_bucket_alloc_set_trans(): */ + struct bch_dev_usage usage; + + /* bch2_bucket_alloc_trans(): */ + struct bch_dev *ca; + + enum { + BTREE_BITMAP_NO, + BTREE_BITMAP_YES, + BTREE_BITMAP_ANY, + } btree_bitmap; + + struct { + u64 buckets_seen; + u64 skipped_open; + u64 skipped_need_journal_commit; + u64 need_journal_commit; + u64 skipped_nocow; + u64 skipped_nouse; + u64 skipped_mi_btree_bitmap; + } counters; + + unsigned scratch_nr_replicas; + unsigned scratch_nr_effective; + bool scratch_have_cache; + enum bch_data_type scratch_data_type; + struct open_buckets scratch_ptrs; + struct bch_devs_mask scratch_devs_may_alloc; +}; + struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, struct dev_stripe_state *, struct bch_devs_mask *); @@ -171,11 +218,8 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 } enum bch_write_flags; -int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *, - struct dev_stripe_state *, struct bch_devs_mask *, - unsigned, unsigned *, bool *, enum bch_write_flags, - enum bch_data_type, enum bch_watermark, - struct closure *); +int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *, + struct dev_stripe_state *, struct closure *); int bch2_alloc_sectors_start_trans(struct btree_trans *, unsigned, unsigned, diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index 8f79f46c..e7becdf2 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -8,22 +8,6 @@ #include "clock_types.h" #include "fifo.h" -struct bucket_alloc_state { - enum { - BTREE_BITMAP_NO, - BTREE_BITMAP_YES, - BTREE_BITMAP_ANY, - } btree_bitmap; - - u64 buckets_seen; - u64 skipped_open; - u64 skipped_need_journal_commit; - u64 need_journal_commit; - u64 skipped_nocow; - u64 skipped_nouse; - u64 skipped_mi_btree_bitmap; -}; - #define BCH_WATERMARKS() \ x(stripe) \ x(normal) \ diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index 21d1d86d..ff26bb51 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -252,12 +252,24 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, 0, bp.v->level, iter_flags); - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); if (bkey_err(k)) { bch2_trans_iter_exit(trans, iter); return k; } + /* + * peek_slot() doesn't normally return NULL - except when we ask for a + * key at a btree level that doesn't exist. 
+ * + * We may want to revisit this and change peek_slot(): + */ + if (!k.k) { + bkey_init(&iter->k); + iter->k.p = bp.v->pos; + k.k = &iter->k; + } + if (k.k && extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) return k; @@ -293,7 +305,7 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, 0, bp.v->level - 1, 0); - struct btree *b = bch2_btree_iter_peek_node(iter); + struct btree *b = bch2_btree_iter_peek_node(trans, iter); if (IS_ERR_OR_NULL(b)) goto err; @@ -321,7 +333,7 @@ static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, st return 0; struct bch_fs *c = trans->c; - struct btree_iter alloc_iter = { NULL }; + struct btree_iter alloc_iter = {}; struct bkey_s_c alloc_k; struct printbuf buf = PRINTBUF; int ret = 0; @@ -462,7 +474,7 @@ err: if (bio) bio_put(bio); kvfree(data_buf); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); printbuf_exit(&buf); return ret; } @@ -650,7 +662,7 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, retry: bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0); - b = bch2_btree_iter_peek_node(&iter); + b = bch2_btree_iter_peek_node(trans, &iter); ret = PTR_ERR_OR_ZERO(b); if (ret) goto err; @@ -934,7 +946,7 @@ static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k, { struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0); - struct btree *b = bch2_btree_iter_peek_node(&iter); + struct btree *b = bch2_btree_iter_peek_node(trans, &iter); int ret = PTR_ERR_OR_ZERO(b); if (ret) goto err; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index f5231101..04ce43d7 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -524,8 +524,8 @@ struct bch_dev { struct percpu_ref ref; #endif struct completion ref_completion; - struct percpu_ref io_ref; - struct completion io_ref_completion; + struct percpu_ref io_ref[2]; + struct completion io_ref_completion[2]; struct bch_fs *fs; @@ -562,7 +562,8 @@ struct bch_dev { unsigned long *bucket_backpointer_mismatches; unsigned long *bucket_backpointer_empty; - struct bch_dev_usage __percpu *usage; + struct bch_dev_usage_full __percpu + *usage; /* Allocator: */ u64 alloc_cursor[3]; @@ -613,6 +614,7 @@ struct bch_dev { x(accounting_replay_done) \ x(may_go_rw) \ x(rw) \ + x(rw_init_done) \ x(was_rw) \ x(stopping) \ x(emergency_ro) \ @@ -649,6 +651,9 @@ struct btree_transaction_stats { unsigned nr_max_paths; unsigned journal_entries_size; unsigned max_mem; +#ifdef CONFIG_BCACHEFS_DEBUG + darray_trans_kmalloc_trace trans_kmalloc_trace; +#endif char *max_paths_text; }; @@ -871,7 +876,7 @@ struct bch_fs { struct btree_write_buffer btree_write_buffer; struct workqueue_struct *btree_update_wq; - struct workqueue_struct *btree_io_complete_wq; + struct workqueue_struct *btree_write_complete_wq; /* copygc needs its own workqueue for index updates.. 
*/ struct workqueue_struct *copygc_wq; /* diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index a3db328d..f9bfb434 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -842,7 +842,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); -/* one free bit */ +LE64_BITMASK(BCH_SB_SINGLE_DEVICE, struct bch_sb, flags[3], 63, 64); LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); @@ -863,6 +863,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14); LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20); +LE64_BITMASK(BCH_SB_DEGRADED_ACTION, struct bch_sb, flags[6], 20, 22); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { @@ -984,6 +985,19 @@ enum bch_error_actions { BCH_ON_ERROR_NR }; +#define BCH_DEGRADED_ACTIONS() \ + x(ask, 0) \ + x(yes, 1) \ + x(very, 2) \ + x(no, 3) + +enum bch_degraded_actions { +#define x(t, n) BCH_DEGRADED_##t = n, + BCH_DEGRADED_ACTIONS() +#undef x + BCH_DEGRADED_ACTIONS_NR +}; + #define BCH_STR_HASH_TYPES() \ x(crc32c, 0) \ x(crc64, 1) \ diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 054e2d5e..08263290 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -191,6 +191,7 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r) static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) { return bpos_eq(l.k->p, r.k->p) && + l.k->size == r.k->size && bkey_bytes(l.k) == bkey_bytes(r.k) && !memcmp(l.v, r.v, bkey_val_bytes(l.k)); } diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 2025d408..2824a6e8 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -691,7 +691,7 @@ retry_root: struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, bch2_btree_id_root(c, btree)->b->c.level, 0); - struct btree *b = bch2_btree_iter_peek_node(&iter); + struct btree *b = bch2_btree_iter_peek_node(trans, &iter); ret = PTR_ERR_OR_ZERO(b); if (ret) goto err_root; @@ -1199,7 +1199,7 @@ int bch2_gc_gens(struct bch_fs *c) BCH_TRANS_COMMIT_no_enospc, ({ ca = bch2_dev_iterate(c, ca, k.k->p.inode); if (!ca) { - bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); continue; } bch2_alloc_write_oldest_gen(trans, ca, &iter, k); @@ -1243,16 +1243,11 @@ void bch2_gc_gens_async(struct bch_fs *c) bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); } -void bch2_fs_btree_gc_exit(struct bch_fs *c) -{ -} - -int bch2_fs_btree_gc_init(struct bch_fs *c) +void bch2_fs_btree_gc_init_early(struct bch_fs *c) { seqcount_init(&c->gc_pos_lock); INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work); init_rwsem(&c->gc_lock); mutex_init(&c->gc_gens_lock); - return 0; } diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index 9693a90a..ec776623 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -83,7 +83,6 @@ void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *); int bch2_gc_gens(struct bch_fs *); void 
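/*
 * For illustration, the BCH_DEGRADED_ACTIONS() x-macro above expands to:
 *
 *	enum bch_degraded_actions {
 *		BCH_DEGRADED_ask	= 0,
 *		BCH_DEGRADED_yes	= 1,
 *		BCH_DEGRADED_very	= 2,
 *		BCH_DEGRADED_no		= 3,
 *		BCH_DEGRADED_ACTIONS_NR
 *	};
 *
 * matching the new two-bit BCH_SB_DEGRADED_ACTION superblock field.
 */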
bch2_gc_gens_async(struct bch_fs *); -void bch2_fs_btree_gc_exit(struct bch_fs *); -int bch2_fs_btree_gc_init(struct bch_fs *); +void bch2_fs_btree_gc_init_early(struct bch_fs *); #endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 1d94a2bf..14e3329b 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1353,7 +1353,7 @@ start: "btree read error %s for %s", bch2_blk_status_to_str(bio->bi_status), buf.buf); if (rb->have_ioref) - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); rb->have_ioref = false; bch2_mark_io_failure(&failed, &rb->pick, false); @@ -1609,6 +1609,7 @@ static void btree_node_read_all_replicas_endio(struct bio *bio) struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); bch2_latency_acct(ca, rb->start_time, READ); + percpu_ref_put(&ca->io_ref[READ]); } ra->err[rb->idx] = bio->bi_status; @@ -1908,7 +1909,8 @@ static void btree_node_scrub_work(struct work_struct *work) scrub->key.k->k.p, 0, scrub->level - 1, 0); struct btree *b; - int ret = lockrestart_do(trans, PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(&iter))); + int ret = lockrestart_do(trans, + PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(trans, &iter))); if (ret) goto err; @@ -1927,7 +1929,7 @@ err: printbuf_exit(&err); bch2_bkey_buf_exit(&scrub->key, c);; btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf); - percpu_ref_put(&scrub->ca->io_ref); + percpu_ref_put(&scrub->ca->io_ref[READ]); kfree(scrub); bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); } @@ -1996,7 +1998,7 @@ int bch2_btree_node_scrub(struct btree_trans *trans, return 0; err_free: btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); err: bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); return ret; @@ -2144,6 +2146,7 @@ static void btree_node_write_endio(struct bio *bio) if (ca && bio->bi_status) { struct printbuf buf = PRINTBUF; + buf.atomic++; prt_printf(&buf, "btree write error: %s\n ", bch2_blk_status_to_str(bio->bi_status)); bch2_btree_pos_to_text(&buf, c, b); @@ -2158,8 +2161,12 @@ static void btree_node_write_endio(struct bio *bio) spin_unlock_irqrestore(&c->btree_write_error_lock, flags); } + /* + * XXX: we should be using io_ref[WRITE], but we aren't retrying failed + * btree writes yet (due to device removal/ro): + */ if (wbio->have_ioref) - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); if (parent) { bio_put(bio); @@ -2170,7 +2177,7 @@ static void btree_node_write_endio(struct bio *bio) clear_btree_node_write_in_flight_inner(b); wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner); INIT_WORK(&wb->work, btree_node_write_work); - queue_work(c->btree_io_complete_wq, &wb->work); + queue_work(c->btree_write_complete_wq, &wb->work); } static int validate_bset_for_write(struct bch_fs *c, struct btree *b, diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index a9c110b8..db7d6e19 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -244,10 +244,8 @@ void bch2_trans_verify_paths(struct btree_trans *trans) bch2_btree_path_verify(trans, path); } -static void bch2_btree_iter_verify(struct btree_iter *iter) +static void bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter) { - struct btree_trans *trans = iter->trans; - BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached); BUG_ON((iter->flags & BTREE_ITER_is_extents) && @@ -276,9 +274,9 @@ static 
void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) bkey_gt(iter->pos, iter->k.p))); } -static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) +static int bch2_btree_iter_verify_ret(struct btree_trans *trans, + struct btree_iter *iter, struct bkey_s_c k) { - struct btree_trans *trans = iter->trans; struct btree_iter copy; struct bkey_s_c prev; int ret = 0; @@ -299,7 +297,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos, BTREE_ITER_nopreserve| BTREE_ITER_all_snapshots); - prev = bch2_btree_iter_prev(&copy); + prev = bch2_btree_iter_prev(trans, &copy); if (!prev.k) goto out; @@ -365,9 +363,11 @@ static inline void bch2_btree_path_verify_level(struct btree_trans *trans, struct btree_path *path, unsigned l) {} static inline void bch2_btree_path_verify(struct btree_trans *trans, struct btree_path *path) {} -static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} +static inline void bch2_btree_iter_verify(struct btree_trans *trans, + struct btree_iter *iter) {} static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} -static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; } +static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) { return 0; } #endif @@ -1855,10 +1855,8 @@ hole: return (struct bkey_s_c) { u, NULL }; } -void bch2_set_btree_iter_dontneed(struct btree_iter *iter) +void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter) { - struct btree_trans *trans = iter->trans; - if (!iter->path || trans->restarted) return; @@ -1870,17 +1868,14 @@ void bch2_set_btree_iter_dontneed(struct btree_iter *iter) /* Btree iterators: */ int __must_check -__bch2_btree_iter_traverse(struct btree_iter *iter) +__bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter) { - return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); + return bch2_btree_path_traverse(trans, iter->path, iter->flags); } int __must_check -bch2_btree_iter_traverse(struct btree_iter *iter) +bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter) { - struct btree_trans *trans = iter->trans; - int ret; - bch2_trans_verify_not_unlocked_or_in_restart(trans); iter->path = bch2_btree_path_set_pos(trans, iter->path, @@ -1888,7 +1883,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); - ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); + int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) return ret; @@ -1900,14 +1895,14 @@ bch2_btree_iter_traverse(struct btree_iter *iter) /* Iterate across nodes (leaf and interior nodes) */ -struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) +struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans, + struct btree_iter *iter) { - struct btree_trans *trans = iter->trans; struct btree *b = NULL; int ret; EBUG_ON(trans->paths[iter->path].cached); - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) @@ -1929,7 +1924,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); out: bch2_btree_iter_verify_entry_exit(iter); +
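/*
 * The recurring pattern in this patch: struct btree_iter no longer stores a
 * trans backpointer, so every iterator helper takes the transaction as an
 * explicit first argument, e.g.:
 *
 *	- k = bch2_btree_iter_peek_slot(&iter);
 *	+ k = bch2_btree_iter_peek_slot(trans, &iter);
 */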
bch2_btree_iter_verify(trans, iter); return b; err: @@ -1938,26 +1933,26 @@ err: } /* Only kept for -tools */ -struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) +struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *trans, + struct btree_iter *iter) { struct btree *b; - while (b = bch2_btree_iter_peek_node(iter), + while (b = bch2_btree_iter_peek_node(trans, iter), bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) - bch2_trans_begin(iter->trans); + bch2_trans_begin(trans); return b; } -struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) +struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_iter *iter) { - struct btree_trans *trans = iter->trans; struct btree *b = NULL; int ret; EBUG_ON(trans->paths[iter->path].cached); bch2_trans_verify_not_unlocked_or_in_restart(trans); - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) @@ -2024,7 +2019,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) EBUG_ON(btree_iter_path(trans, iter)->uptodate); out: bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); return b; err: @@ -2034,7 +2029,7 @@ err: /* Iterate across keys (in leaf nodes only) */ -inline bool bch2_btree_iter_advance(struct btree_iter *iter) +inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter *iter) { struct bpos pos = iter->k.p; bool ret = !(iter->flags & BTREE_ITER_all_snapshots @@ -2043,11 +2038,11 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) if (ret && !(iter->flags & BTREE_ITER_is_extents)) pos = bkey_successor(iter, pos); - bch2_btree_iter_set_pos(iter, pos); + bch2_btree_iter_set_pos(trans, iter, pos); return ret; } -inline bool bch2_btree_iter_rewind(struct btree_iter *iter) +inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter *iter) { struct bpos pos = bkey_start_pos(&iter->k); bool ret = !(iter->flags & BTREE_ITER_all_snapshots @@ -2056,7 +2051,7 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) if (ret && !(iter->flags & BTREE_ITER_is_extents)) pos = bkey_predecessor(iter, pos); - bch2_btree_iter_set_pos(iter, pos); + bch2_btree_iter_set_pos(trans, iter, pos); return ret; } @@ -2183,9 +2178,9 @@ void btree_trans_peek_prev_journal(struct btree_trans *trans, * bkey_s_c_null: */ static noinline -struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) +struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btree_iter *iter, + struct bpos pos) { - struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; struct bkey u; struct bkey_s_c k; @@ -2231,14 +2226,14 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos return k; } -static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) +static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_key) { - struct btree_trans *trans = iter->trans; struct bkey_s_c k, k2; int ret; EBUG_ON(btree_iter_path(trans, iter)->cached); - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); while (1) { iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, @@ -2248,7 +2243,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp ret = 
bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { /* ensure that iter->k is consistent with iter->pos: */ - bch2_btree_iter_set_pos(iter, iter->pos); + bch2_btree_iter_set_pos(trans, iter, iter->pos); k = bkey_s_c_err(ret); break; } @@ -2258,7 +2253,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp if (unlikely(!l->b)) { /* No btree nodes at requested level: */ - bch2_btree_iter_set_pos(iter, SPOS_MAX); + bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); k = bkey_s_c_null; break; } @@ -2269,10 +2264,10 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && k.k && - (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { + (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) { k = k2; if (bkey_err(k)) { - bch2_btree_iter_set_pos(iter, iter->pos); + bch2_btree_iter_set_pos(trans, iter, iter->pos); break; } } @@ -2305,27 +2300,28 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp search_key = bpos_successor(l->b->key.k.p); } else { /* End of btree: */ - bch2_btree_iter_set_pos(iter, SPOS_MAX); + bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); k = bkey_s_c_null; break; } } - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); return k; } /** * bch2_btree_iter_peek_max() - returns first key greater than or equal to * iterator's current position + * @trans: btree transaction object * @iter: iterator to peek from * @end: search limit: returns keys less than or equal to @end * * Returns: key if found, or an error extractable with bkey_err(). */ -struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end) +struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end) { - struct btree_trans *trans = iter->trans; struct bpos search_key = btree_iter_search_key(iter); struct bkey_s_c k; struct bpos iter_pos = iter->pos; @@ -2348,7 +2344,7 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en } while (1) { - k = __bch2_btree_iter_peek(iter, search_key); + k = __bch2_btree_iter_peek(trans, iter, search_key); if (unlikely(!k.k)) goto end; if (unlikely(bkey_err(k))) @@ -2462,9 +2458,9 @@ out_no_locked: if (!(iter->flags & BTREE_ITER_all_snapshots)) iter->pos.snapshot = iter->snapshot; - ret = bch2_btree_iter_verify_ret(iter, k); + ret = bch2_btree_iter_verify_ret(trans, iter, k); if (unlikely(ret)) { - bch2_btree_iter_set_pos(iter, iter->pos); + bch2_btree_iter_set_pos(trans, iter, iter->pos); k = bkey_s_c_err(ret); } @@ -2472,7 +2468,7 @@ out_no_locked: return k; end: - bch2_btree_iter_set_pos(iter, end); + bch2_btree_iter_set_pos(trans, iter, end); k = bkey_s_c_null; goto out_no_locked; } @@ -2480,24 +2476,25 @@ end: /** * bch2_btree_iter_next() - returns first key greater than iterator's current * position + * @trans: btree transaction object * @iter: iterator to peek from * * Returns: key if found, or an error extractable with bkey_err(). 
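 *
 * A minimal usage sketch (illustrative, assuming an initialized iterator):
 *
 *	for (k = bch2_btree_iter_peek(trans, &iter);
 *	     k.k && !bkey_err(k);
 *	     k = bch2_btree_iter_next(trans, &iter))
 *		;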
*/ -struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_next(struct btree_trans *trans, struct btree_iter *iter) { - if (!bch2_btree_iter_advance(iter)) + if (!bch2_btree_iter_advance(trans, iter)) return bkey_s_c_null; - return bch2_btree_iter_peek(iter); + return bch2_btree_iter_peek(trans, iter); } -static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key) +static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_key) { - struct btree_trans *trans = iter->trans; struct bkey_s_c k, k2; - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); while (1) { iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, @@ -2507,7 +2504,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { /* ensure that iter->k is consistent with iter->pos: */ - bch2_btree_iter_set_pos(iter, iter->pos); + bch2_btree_iter_set_pos(trans, iter, iter->pos); k = bkey_s_c_err(ret); break; } @@ -2517,7 +2514,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru if (unlikely(!l->b)) { /* No btree nodes at requested level: */ - bch2_btree_iter_set_pos(iter, SPOS_MAX); + bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); k = bkey_s_c_null; break; } @@ -2533,10 +2530,10 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && k.k && - (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { + (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) { k = k2; if (bkey_err(k2)) { - bch2_btree_iter_set_pos(iter, iter->pos); + bch2_btree_iter_set_pos(trans, iter, iter->pos); break; } } @@ -2557,25 +2554,27 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru search_key = bpos_predecessor(path->l[0].b->data->min_key); } else { /* Start of btree: */ - bch2_btree_iter_set_pos(iter, POS_MIN); + bch2_btree_iter_set_pos(trans, iter, POS_MIN); k = bkey_s_c_null; break; } } - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); return k; } /** * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to * iterator's current position + * @trans: btree transaction object * @iter: iterator to peek from * @end: search limit: returns keys greater than or equal to @end * * Returns: key if found, or an error extractable with bkey_err(). 
*/ -struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end) +struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end) { if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) && !bkey_eq(iter->pos, POS_MAX)) { @@ -2587,7 +2586,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp * real visible extents - easiest to just use peek_slot() (which * internally uses peek() for extents) */ - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); if (bkey_err(k)) return k; @@ -2597,7 +2596,6 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp return k; } - struct btree_trans *trans = iter->trans; struct bpos search_key = iter->pos; struct bkey_s_c k; btree_path_idx_t saved_path = 0; @@ -2613,7 +2611,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp } while (1) { - k = __bch2_btree_iter_peek_prev(iter, search_key); + k = __bch2_btree_iter_peek_prev(trans, iter, search_key); if (unlikely(!k.k)) goto end; if (unlikely(bkey_err(k))) @@ -2704,10 +2702,10 @@ out_no_locked: bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent); bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); return k; end: - bch2_btree_iter_set_pos(iter, end); + bch2_btree_iter_set_pos(trans, iter, end); k = bkey_s_c_null; goto out_no_locked; } @@ -2715,27 +2713,27 @@ end: /** * bch2_btree_iter_prev() - returns first key less than iterator's current * position + * @trans: btree transaction object * @iter: iterator to peek from * * Returns: key if found, or an error extractable with bkey_err(). 
*/ -struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *trans, struct btree_iter *iter) { - if (!bch2_btree_iter_rewind(iter)) + if (!bch2_btree_iter_rewind(trans, iter)) return bkey_s_c_null; - return bch2_btree_iter_peek_prev(iter); + return bch2_btree_iter_peek_prev(trans, iter); } -struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter) { - struct btree_trans *trans = iter->trans; struct bpos search_key; struct bkey_s_c k; int ret; bch2_trans_verify_not_unlocked_or_in_restart(trans); - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); bch2_btree_iter_verify_entry_exit(iter); EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); @@ -2751,7 +2749,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (iter->pos.inode == KEY_INODE_MAX) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); + bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos)); } search_key = btree_iter_search_key(iter); @@ -2785,7 +2783,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) goto out; if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && - (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { + (k = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) { if (!bkey_err(k)) iter->k = *k.k; /* We're not returning a key from iter->path: */ @@ -2812,8 +2810,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (iter->flags & BTREE_ITER_intent) { struct btree_iter iter2; - bch2_trans_copy_iter(&iter2, iter); - k = bch2_btree_iter_peek_max(&iter2, end); + bch2_trans_copy_iter(trans, &iter2, iter); + k = bch2_btree_iter_peek_max(trans, &iter2, end); if (k.k && !bkey_err(k)) { swap(iter->key_cache_path, iter2.key_cache_path); @@ -2824,9 +2822,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } else { struct bpos pos = iter->pos; - k = bch2_btree_iter_peek_max(iter, end); + k = bch2_btree_iter_peek_max(trans, iter, end); if (unlikely(bkey_err(k))) - bch2_btree_iter_set_pos(iter, pos); + bch2_btree_iter_set_pos(trans, iter, pos); else iter->pos = pos; } @@ -2857,39 +2855,39 @@ out: btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); out_no_locked: bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(iter); - ret = bch2_btree_iter_verify_ret(iter, k); + bch2_btree_iter_verify(trans, iter); + ret = bch2_btree_iter_verify_ret(trans, iter, k); if (unlikely(ret)) return bkey_s_c_err(ret); return k; } -struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *trans, struct btree_iter *iter) { - if (!bch2_btree_iter_advance(iter)) + if (!bch2_btree_iter_advance(trans, iter)) return bkey_s_c_null; - return bch2_btree_iter_peek_slot(iter); + return bch2_btree_iter_peek_slot(trans, iter); } -struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *trans, struct btree_iter *iter) { - if (!bch2_btree_iter_rewind(iter)) + if (!bch2_btree_iter_rewind(trans, iter)) return bkey_s_c_null; - return bch2_btree_iter_peek_slot(iter); + return bch2_btree_iter_peek_slot(trans, iter); } /* Obsolete, but still used by rust wrapper in -tools */ -struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct 
btree_iter *iter) +struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *trans, struct btree_iter *iter) { struct bkey_s_c k; - while (btree_trans_too_many_iters(iter->trans) || - (k = bch2_btree_iter_peek_type(iter, iter->flags), + while (btree_trans_too_many_iters(trans) || + (k = bch2_btree_iter_peek_type(trans, iter, iter->flags), bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) - bch2_trans_begin(iter->trans); + bch2_trans_begin(trans); return k; } @@ -3035,7 +3033,6 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) iter->path = 0; iter->update_path = 0; iter->key_cache_path = 0; - iter->trans = NULL; } void bch2_trans_iter_init_outlined(struct btree_trans *trans, @@ -3075,10 +3072,9 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, BUG_ON(iter->min_depth != depth); } -void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) +void bch2_trans_copy_iter(struct btree_trans *trans, + struct btree_iter *dst, struct btree_iter *src) { - struct btree_trans *trans = src->trans; - *dst = *src; #ifdef TRACK_PATH_ALLOCATED dst->ip_allocated = _RET_IP_; @@ -3090,7 +3086,19 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) dst->key_cache_path = 0; } -void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_trans_kmalloc_trace_to_text(struct printbuf *out, + darray_trans_kmalloc_trace *trace) +{ + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 60); + + darray_for_each(*trace, i) + prt_printf(out, "%pS\t%zu\n", (void *) i->ip, i->bytes); +} +#endif + +void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long ip) { struct bch_fs *c = trans->c; unsigned new_top = trans->mem_top + size; @@ -3100,14 +3108,33 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) void *new_mem; void *p; - WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) { +#ifdef CONFIG_BCACHEFS_DEBUG + struct printbuf buf = PRINTBUF; + bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace); + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); +#endif + } ret = trans_maybe_inject_restart(trans, _RET_IP_); if (ret) return ERR_PTR(ret); struct btree_transaction_stats *s = btree_trans_stats(trans); - s->max_mem = max(s->max_mem, new_bytes); + if (new_bytes > s->max_mem) { +#ifdef CONFIG_BCACHEFS_DEBUG + darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr); + s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size, + trans->trans_kmalloc_trace.nr); + + memcpy(s->trans_kmalloc_trace.data, + trans->trans_kmalloc_trace.data, + sizeof(s->trans_kmalloc_trace.data[0]) * + s->trans_kmalloc_trace.nr); +#endif + s->max_mem = new_bytes; + } if (trans->used_mempool) { if (trans->mem_bytes >= new_bytes) @@ -3167,6 +3194,8 @@ out_new_mem: BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); } out_change_top: + bch2_trans_kmalloc_trace(trans, size, ip); + p = trans->mem + trans->mem_top; trans->mem_top += size; memset(p, 0, size); @@ -3280,6 +3309,10 @@ u32 bch2_trans_begin(struct btree_trans *trans) } #endif +#ifdef CONFIG_BCACHEFS_DEBUG + trans->trans_kmalloc_trace.nr = 0; +#endif + trans_set_locked(trans, false); if (trans->restarted) { @@ -3448,6 +3481,7 @@ void bch2_trans_put(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG darray_exit(&trans->last_restarted_trace); + darray_exit(&trans->trans_kmalloc_trace); #endif 
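/*
 * (Sketch: with CONFIG_BCACHEFS_DEBUG, the per-call-site allocation log kept
 * in trans->trans_kmalloc_trace can be dumped the same way the overflow
 * warning above does it:
 *
 *	struct printbuf buf = PRINTBUF;
 *	bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace);
 *	bch2_print_string_as_lines(KERN_ERR, buf.buf);
 *	printbuf_exit(&buf);
 * )
 */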
unsigned long *paths_allocated = trans->paths_allocated; @@ -3603,6 +3637,9 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { +#ifdef CONFIG_BCACHEFS_DEBUG + darray_exit(&s->trans_kmalloc_trace); +#endif kfree(s->max_paths_text); bch2_time_stats_exit(&s->lock_hold_times); } diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index e6f51a3b..7d00d2ff 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -393,36 +393,37 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct void bch2_trans_node_drop(struct btree_trans *trans, struct btree *); void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); -int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); -int __must_check bch2_btree_iter_traverse(struct btree_iter *); +int __must_check __bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *); +int __must_check bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *); -struct btree *bch2_btree_iter_peek_node(struct btree_iter *); -struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *); -struct btree *bch2_btree_iter_next_node(struct btree_iter *); +struct btree *bch2_btree_iter_peek_node(struct btree_trans *, struct btree_iter *); +struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *, struct btree_iter *); +struct btree *bch2_btree_iter_next_node(struct btree_trans *, struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos); -struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *, struct btree_iter *, struct bpos); +struct bkey_s_c bch2_btree_iter_next(struct btree_trans *, struct btree_iter *); -static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_trans *trans, + struct btree_iter *iter) { - return bch2_btree_iter_peek_max(iter, SPOS_MAX); + return bch2_btree_iter_peek_max(trans, iter, SPOS_MAX); } -struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos); +struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *, struct btree_iter *, struct bpos); -static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) +static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter) { - return bch2_btree_iter_peek_prev_min(iter, POS_MIN); + return bch2_btree_iter_peek_prev_min(trans, iter, POS_MIN); } -struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *, struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *, struct btree_iter *); +struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *, struct btree_iter *); +struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *, struct btree_iter *); -bool bch2_btree_iter_advance(struct btree_iter *); -bool bch2_btree_iter_rewind(struct btree_iter *); +bool bch2_btree_iter_advance(struct btree_trans *, struct btree_iter *); +bool bch2_btree_iter_rewind(struct btree_trans *, struct btree_iter *); static inline void 
__bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { @@ -433,10 +434,9 @@ static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpo iter->k.size = 0; } -static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +static inline void bch2_btree_iter_set_pos(struct btree_trans *trans, + struct btree_iter *iter, struct bpos new_pos) { - struct btree_trans *trans = iter->trans; - if (unlikely(iter->update_path)) bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent); @@ -454,13 +454,14 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it iter->pos = bkey_start_pos(&iter->k); } -static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot) +static inline void bch2_btree_iter_set_snapshot(struct btree_trans *trans, + struct btree_iter *iter, u32 snapshot) { struct bpos pos = iter->pos; iter->snapshot = snapshot; pos.snapshot = snapshot; - bch2_btree_iter_set_pos(iter, pos); + bch2_btree_iter_set_pos(trans, iter, pos); } void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); @@ -502,7 +503,6 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans, unsigned flags, unsigned long ip) { - iter->trans = trans; iter->update_path = 0; iter->key_cache_path = 0; iter->btree_id = btree_id; @@ -539,22 +539,50 @@ static inline void bch2_trans_iter_init(struct btree_trans *trans, void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, enum btree_id, struct bpos, unsigned, unsigned, unsigned); -void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); +void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btree_iter *); -void bch2_set_btree_iter_dontneed(struct btree_iter *); +void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *); -void *__bch2_trans_kmalloc(struct btree_trans *, size_t); +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_trans_kmalloc_trace_to_text(struct printbuf *, + darray_trans_kmalloc_trace *); +#endif -/** - * bch2_trans_kmalloc - allocate memory for use by the current transaction - * - * Must be called after bch2_trans_begin, which on second and further calls - * frees all memory allocated in this transaction - */ -static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +void *__bch2_trans_kmalloc(struct btree_trans *, size_t, unsigned long); + +static inline void bch2_trans_kmalloc_trace(struct btree_trans *trans, size_t size, + unsigned long ip) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + darray_push(&trans->trans_kmalloc_trace, + ((struct trans_kmalloc_trace) { .ip = ip, .bytes = size })); +#endif +} + +static __always_inline void *bch2_trans_kmalloc_nomemzero_ip(struct btree_trans *trans, size_t size, + unsigned long ip) { size = roundup(size, 8); + bch2_trans_kmalloc_trace(trans, size, ip); + + if (likely(trans->mem_top + size <= trans->mem_bytes)) { + void *p = trans->mem + trans->mem_top; + + trans->mem_top += size; + return p; + } else { + return __bch2_trans_kmalloc(trans, size, ip); + } +} + +static __always_inline void *bch2_trans_kmalloc_ip(struct btree_trans *trans, size_t size, + unsigned long ip) +{ + size = roundup(size, 8); + + bch2_trans_kmalloc_trace(trans, size, ip); + if (likely(trans->mem_top + size <= trans->mem_bytes)) { void *p = trans->mem + trans->mem_top; @@ -562,22 +590,24 @@ static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) memset(p, 0, size); return p; } else { - return 
__bch2_trans_kmalloc(trans, size); + return __bch2_trans_kmalloc(trans, size, ip); } } -static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) +/** + * bch2_trans_kmalloc - allocate memory for use by the current transaction + * + * Must be called after bch2_trans_begin, which on second and further calls + * frees all memory allocated in this transaction + */ +static __always_inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) { - size = round_up(size, 8); + return bch2_trans_kmalloc_ip(trans, size, _THIS_IP_); +} - if (likely(trans->mem_top + size <= trans->mem_bytes)) { - void *p = trans->mem + trans->mem_top; - - trans->mem_top += size; - return p; - } else { - return __bch2_trans_kmalloc(trans, size); - } +static __always_inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) +{ + return bch2_trans_kmalloc_nomemzero_ip(trans, size, _THIS_IP_); } static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans, @@ -588,7 +618,7 @@ static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans, struct bkey_s_c k; bch2_trans_iter_init(trans, iter, btree_id, pos, flags); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(trans, iter); if (!bkey_err(k) && type && k.k->type != type) k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch); @@ -658,14 +688,14 @@ u32 bch2_trans_begin(struct btree_trans *); int _ret3 = 0; \ do { \ _ret3 = lockrestart_do((_trans), ({ \ - struct btree *_b = bch2_btree_iter_peek_node(&_iter); \ + struct btree *_b = bch2_btree_iter_peek_node(_trans, &_iter);\ if (!_b) \ break; \ \ PTR_ERR_OR_ZERO(_b) ?: (_do); \ })) ?: \ lockrestart_do((_trans), \ - PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(&_iter))); \ + PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(_trans, &_iter)));\ } while (!_ret3); \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ @@ -677,31 +707,34 @@ u32 bch2_trans_begin(struct btree_trans *); __for_each_btree_node(_trans, _iter, _btree_id, _start, \ 0, 0, _flags, _b, _do) -static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, +static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) : - bch2_btree_iter_peek_prev(iter); + return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) : + bch2_btree_iter_peek_prev(trans, iter); } -static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, +static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) : - bch2_btree_iter_peek(iter); + return flags & BTREE_ITER_slots ? 
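/*
 * (Usage sketch for bch2_trans_kmalloc(): per-transaction scratch memory,
 * implicitly freed on the next bch2_trans_begin():
 *
 *	struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
 *	if (IS_ERR(k))
 *		return PTR_ERR(k);
 *
 * The common case is a bump allocation out of trans->mem; only overflow or
 * an injected restart takes the outlined __bch2_trans_kmalloc() path.)
 */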
bch2_btree_iter_peek_slot(trans, iter) : + bch2_btree_iter_peek(trans, iter); } -static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_iter *iter, - struct bpos end, - unsigned flags) +static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end, + unsigned flags) { if (!(flags & BTREE_ITER_slots)) - return bch2_btree_iter_peek_max(iter, end); + return bch2_btree_iter_peek_max(trans, iter, end); if (bkey_gt(iter->pos, end)) return bkey_s_c_null; - return bch2_btree_iter_peek_slot(iter); + return bch2_btree_iter_peek_slot(trans, iter); } int __bch2_btree_trans_too_many_iters(struct btree_trans *); @@ -768,14 +801,14 @@ transaction_restart: \ \ do { \ _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_max_type(&(_iter), \ + (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), \ _end, (_flags)); \ if (!(_k).k) \ break; \ \ bkey_err(_k) ?: (_do); \ })); \ - } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \ + } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ _ret3; \ @@ -813,14 +846,14 @@ transaction_restart: \ \ do { \ _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_prev_type(&(_iter), \ + (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), \ (_flags)); \ if (!(_k).k) \ break; \ \ bkey_err(_k) ?: (_do); \ })); \ - } while (!_ret3 && bch2_btree_iter_rewind(&(_iter))); \ + } while (!_ret3 && bch2_btree_iter_rewind(_trans, &(_iter))); \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ _ret3; \ @@ -850,37 +883,38 @@ transaction_restart: \ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) -struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *, + struct btree_iter *); #define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \ _start, _end, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags),\ + (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags),\ !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) + bch2_btree_iter_advance(_trans, &(_iter))) -#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret)\ +#define for_each_btree_key_max_continue_norestart(_trans, _iter, _end, _flags, _k, _ret)\ for (; \ - (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \ + (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags), \ !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) + bch2_btree_iter_advance(_trans, &(_iter))) #define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\ SPOS_MAX, _flags, _k, _ret) -#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_rewind(&(_iter))) +#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = 
bch2_btree_iter_peek_prev_type(_trans, &(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_rewind(_trans, &(_iter))) -#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ - for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) +#define for_each_btree_key_continue_norestart(_trans, _iter, _flags, _k, _ret) \ + for_each_btree_key_max_continue_norestart(_trans, _iter, SPOS_MAX, _flags, _k, _ret) /* * This should not be used in a fastpath, without first trying _do in diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index edce5943..2b186584 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -287,6 +287,19 @@ err: return ret; } +static noinline_for_stack void do_trace_key_cache_fill(struct btree_trans *trans, + struct btree_path *ck_path, + struct bkey_s_c k) +{ + struct printbuf buf = PRINTBUF; + + bch2_bpos_to_text(&buf, ck_path->pos); + prt_char(&buf, ' '); + bch2_bkey_val_to_text(&buf, trans->c, k); + trace_key_cache_fill(trans, buf.buf); + printbuf_exit(&buf); +} + static noinline int btree_key_cache_fill(struct btree_trans *trans, struct btree_path *ck_path, unsigned flags) @@ -306,7 +319,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, BTREE_ITER_key_cache_fill| BTREE_ITER_cached_nofill); iter.flags &= ~BTREE_ITER_with_journal; - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; @@ -320,18 +333,11 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, if (ret) goto err; - if (trace_key_cache_fill_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bpos_to_text(&buf, ck_path->pos); - prt_char(&buf, ' '); - bch2_bkey_val_to_text(&buf, trans->c, k); - trace_key_cache_fill(trans, buf.buf); - printbuf_exit(&buf); - } + if (trace_key_cache_fill_enabled()) + do_trace_key_cache_fill(trans, ck_path, k); out: /* We're not likely to need this iterator again: */ - bch2_set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(trans, &iter); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -412,7 +418,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, BTREE_ITER_intent); b_iter.flags &= ~BTREE_ITER_with_key_cache; - ret = bch2_btree_iter_traverse(&c_iter); + ret = bch2_btree_iter_traverse(trans, &c_iter); if (ret) goto out; @@ -444,7 +450,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, !test_bit(JOURNAL_space_low, &c->journal.flags)) commit_flags |= BCH_TRANS_COMMIT_no_journal_res; - struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter); + struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(trans, &b_iter); ret = bkey_err(btree_k); if (ret) goto err; diff --git a/libbcachefs/btree_node_scan.c b/libbcachefs/btree_node_scan.c index 25d54b77..8c9fdb72 100644 --- a/libbcachefs/btree_node_scan.c +++ b/libbcachefs/btree_node_scan.c @@ -271,7 +271,7 @@ static int read_btree_nodes_worker(void *p) err: bio_put(bio); free_page((unsigned long) buf); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); closure_put(w->cl); kfree(w); return 0; @@ -291,7 +291,7 @@ static int read_btree_nodes(struct find_btree_nodes *f) struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); if (!w) { - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); ret = -ENOMEM; goto err; } @@ -303,14 +303,14 @@ static int read_btree_nodes(struct find_btree_nodes *f) struct task_struct *t = 
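/*
 * (Note the pattern: struct bch_dev's io_ref is now a two-element array
 * indexed by READ/WRITE, so read-side code pins the device explicitly:
 *
 *	percpu_ref_get(&ca->io_ref[READ]);
 *	...submit I/O...
 *	percpu_ref_put(&ca->io_ref[READ]);
 *
 * Btree write completion still drops io_ref[READ] for now; see the XXX
 * comment in btree_node_write_endio() above.)
 */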
kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); ret = PTR_ERR_OR_ZERO(t); if (ret) { - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); kfree(w); bch_err_msg(c, ret, "starting kthread"); break; } closure_get(&cl); - percpu_ref_get(&ca->io_ref); + percpu_ref_get(&ca->io_ref[READ]); wake_up_process(t); } err: diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c index 7d7e52dd..4297d8b5 100644 --- a/libbcachefs/btree_trans_commit.c +++ b/libbcachefs/btree_trans_commit.c @@ -20,6 +20,7 @@ #include "snapshot.h" #include <linux/prefetch.h> +#include <linux/string_helpers.h> static const char * const trans_commit_flags_strs[] = { #define x(n, ...) #n, @@ -366,7 +367,8 @@ static noinline void journal_transaction_name(struct btree_trans *trans) struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); - strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); + memcpy_and_pad(l->d, JSET_ENTRY_LOG_U64s * sizeof(u64), + trans->fn, strlen(trans->fn), 0); } static inline int btree_key_can_insert(struct btree_trans *trans, diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 77578da2..dd109dea 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -367,7 +367,6 @@ static inline unsigned long btree_path_ip_allocated(struct btree_path *path) * @nodes_intent_locked - bitmask indicating which locks are intent locks */ struct btree_iter { - struct btree_trans *trans; btree_path_idx_t path; btree_path_idx_t update_path; btree_path_idx_t key_cache_path; @@ -478,6 +477,12 @@ struct btree_trans_paths { struct btree_path paths[]; }; +struct trans_kmalloc_trace { + unsigned long ip; + size_t bytes; +}; +typedef DARRAY(struct trans_kmalloc_trace) darray_trans_kmalloc_trace; + struct btree_trans { struct bch_fs *c; @@ -489,6 +494,9 @@ struct btree_trans { void *mem; unsigned mem_top; unsigned mem_bytes; +#ifdef CONFIG_BCACHEFS_DEBUG + darray_trans_kmalloc_trace trans_kmalloc_trace; +#endif btree_path_idx_t nr_sorted; btree_path_idx_t nr_paths; diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c index c05394f5..2bffd512 100644 --- a/libbcachefs/btree_update.c +++ b/libbcachefs/btree_update.c @@ -14,6 +14,8 @@ #include "snapshot.h" #include "trace.h" +#include <linux/string_helpers.h> + static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, const struct btree_insert_entry *r) { @@ -126,7 +128,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, struct bpos new_pos) { struct bch_fs *c = trans->c; - struct btree_iter old_iter, new_iter = { NULL }; + struct btree_iter old_iter, new_iter = {}; struct bkey_s_c old_k, new_k; snapshot_id_list s; struct bkey_i *update; @@ -140,7 +142,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, bch2_trans_iter_init(trans, &old_iter, id, old_pos, BTREE_ITER_not_extents| BTREE_ITER_all_snapshots); - while ((old_k = bch2_btree_iter_prev(&old_iter)).k && + while ((old_k = bch2_btree_iter_prev(trans, &old_iter)).k && !(ret = bkey_err(old_k)) && bkey_eq(old_pos, old_k.k->p)) { struct bpos whiteout_pos = @@ -296,7 +298,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, BTREE_ITER_intent| BTREE_ITER_with_updates| BTREE_ITER_not_extents); - k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); + k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -322,8 +324,8 @@ static int 
bch2_trans_update_extent(struct btree_trans *trans, if (done) goto out; next: - bch2_btree_iter_advance(&iter); - k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); + bch2_btree_iter_advance(trans, &iter); + k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -592,13 +594,13 @@ int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, enum btree_id btree, struct bpos end) { bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent); - struct bkey_s_c k = bch2_btree_iter_peek_prev(iter); + struct bkey_s_c k = bch2_btree_iter_peek_prev(trans, iter); int ret = bkey_err(k); if (ret) goto err; - bch2_btree_iter_advance(iter); - k = bch2_btree_iter_peek_slot(iter); + bch2_btree_iter_advance(trans, iter); + k = bch2_btree_iter_peek_slot(trans, iter); ret = bkey_err(k); if (ret) goto err; @@ -634,7 +636,7 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans, BTREE_ITER_cached| BTREE_ITER_not_extents| BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(&iter) ?: + ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_trans_update(trans, &iter, k, flags); bch2_trans_iter_exit(trans, &iter); return ret; @@ -646,7 +648,7 @@ int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, struct btree_iter iter; bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), BTREE_ITER_intent|flags); - int ret = bch2_btree_iter_traverse(&iter) ?: + int ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_trans_update(trans, &iter, k, flags); bch2_trans_iter_exit(trans, &iter); return ret; @@ -695,7 +697,7 @@ int bch2_btree_delete(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_cached| BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(&iter) ?: + ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_btree_delete_at(trans, &iter, update_flags); bch2_trans_iter_exit(trans, &iter); @@ -713,7 +715,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, int ret = 0; bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent); - while ((k = bch2_btree_iter_peek_max(&iter, end)).k) { + while ((k = bch2_btree_iter_peek_max(trans, &iter, end)).k) { struct disk_reservation disk_res = bch2_disk_reservation_init(trans->c, 0); struct bkey_i delete; @@ -808,7 +810,7 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, struct btree_iter iter; bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); - int ret = bch2_btree_iter_traverse(&iter) ?: + int ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_btree_bit_mod_iter(trans, &iter, set); bch2_trans_iter_exit(trans, &iter); return ret; @@ -829,7 +831,6 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) { unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64)); - prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos); int ret = buf->allocation_failure ? 
-BCH_ERR_ENOMEM_trans_log_msg : 0; if (ret) @@ -842,7 +843,7 @@ int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry); journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s); - memcpy(l->d, buf->buf, buf->pos); + memcpy_and_pad(l->d, u64s * sizeof(u64), buf->buf, buf->pos, 0); return 0; } @@ -868,7 +869,6 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, prt_vprintf(&buf, fmt, args); unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); - prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos); int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; if (ret) @@ -881,7 +881,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries); journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s); - memcpy(l->d, buf.buf, buf.pos); + memcpy_and_pad(l->d, u64s * sizeof(u64), buf.buf, buf.pos, 0); c->journal.early_journal_entries.nr += jset_u64s(u64s); } else { ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags, diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 568e56c9..e674419c 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -222,7 +222,7 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans) trans->extra_disk_res = 0; } -static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, +static __always_inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, unsigned type, unsigned min_bytes) { unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k)); @@ -245,7 +245,7 @@ static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *t return mut; } -static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) +static __always_inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) { return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0); } diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index bf7e1dac..55fbeeb8 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -2147,7 +2147,7 @@ static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter, bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p, BTREE_MAX_DEPTH, b->c.level, BTREE_ITER_intent); - int ret = bch2_btree_iter_traverse(iter); + int ret = bch2_btree_iter_traverse(trans, iter); if (ret) goto err; @@ -2239,7 +2239,7 @@ static int bch2_btree_node_rewrite_key(struct btree_trans *trans, bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, BTREE_MAX_DEPTH, level, 0); - struct btree *b = bch2_btree_iter_peek_node(&iter); + struct btree *b = bch2_btree_iter_peek_node(trans, &iter); int ret = PTR_ERR_OR_ZERO(b); if (ret) goto out; @@ -2262,7 +2262,7 @@ int bch2_btree_node_rewrite_pos(struct btree_trans *trans, /* Traverse one depth lower to get a pointer to the node itself: */ struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0); - struct btree *b = bch2_btree_iter_peek_node(&iter); + struct btree *b = bch2_btree_iter_peek_node(trans, &iter); int ret = PTR_ERR_OR_ZERO(b); if (ret) goto err; @@ -2406,7 +2406,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, bool skip_triggers) { struct bch_fs *c 
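/*
 * (The strncpy()/prt_chars() NUL-padding is replaced by
 * memcpy_and_pad(dst, dst_len, src, src_len, pad) -- pulled in via the new
 * <linux/string_helpers.h> include -- which copies src_len bytes and fills
 * the remainder of dst with the pad byte in one call:
 *
 *	memcpy_and_pad(l->d, u64s * sizeof(u64), buf.buf, buf.pos, 0);
 * )
 */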
= trans->c; - struct btree_iter iter2 = { NULL }; + struct btree_iter iter2 = {}; struct btree *parent; int ret; @@ -2430,7 +2430,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, parent = btree_node_parent(btree_iter_path(trans, iter), b); if (parent) { - bch2_trans_copy_iter(&iter2, iter); + bch2_trans_copy_iter(trans, &iter2, iter); iter2.path = bch2_btree_path_make_mut(trans, iter2.path, iter2.flags & BTREE_ITER_intent, @@ -2444,7 +2444,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, trans->paths_sorted = false; - ret = bch2_btree_iter_traverse(&iter2) ?: + ret = bch2_btree_iter_traverse(trans, &iter2) ?: bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun); if (ret) goto err; diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c index 2c09d19d..52db2d91 100644 --- a/libbcachefs/btree_write_buffer.c +++ b/libbcachefs/btree_write_buffer.c @@ -144,7 +144,7 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq); EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); - ret = bch2_btree_iter_traverse(iter); + ret = bch2_btree_iter_traverse(trans, iter); if (ret) return ret; @@ -208,7 +208,7 @@ btree_write_buffered_insert(struct btree_trans *trans, trans->journal_res.seq = wb->journal_seq; - ret = bch2_btree_iter_traverse(&iter) ?: + ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_trans_update(trans, &iter, &wb->k, BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &iter); @@ -285,7 +285,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) struct bch_fs *c = trans->c; struct journal *j = &c->journal; struct btree_write_buffer *wb = &c->btree_write_buffer; - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0; bool write_locked = false; bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags); @@ -368,7 +368,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) write_locked = false; ret = lockrestart_do(trans, - bch2_btree_iter_traverse(&iter) ?: + bch2_btree_iter_traverse(trans, &iter) ?: bch2_foreground_maybe_merge(trans, iter.path, 0, BCH_WATERMARK_reclaim| BCH_TRANS_COMMIT_journal_reclaim| @@ -385,7 +385,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) BTREE_ITER_intent|BTREE_ITER_all_snapshots); } - bch2_btree_iter_set_pos(&iter, k->k.k.p); + bch2_btree_iter_set_pos(trans, &iter, k->k.k.p); btree_iter_path(trans, &iter)->preserve = false; bool accounting_accumulated = false; @@ -866,13 +866,18 @@ void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) darray_exit(&wb->inc.keys); } -int bch2_fs_btree_write_buffer_init(struct bch_fs *c) +void bch2_fs_btree_write_buffer_init_early(struct bch_fs *c) { struct btree_write_buffer *wb = &c->btree_write_buffer; mutex_init(&wb->inc.lock); mutex_init(&wb->flushing.lock); INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work); +} + +int bch2_fs_btree_write_buffer_init(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; /* Will be resized by journal as needed: */ unsigned initial_size = 1 << 16; diff --git a/libbcachefs/btree_write_buffer.h b/libbcachefs/btree_write_buffer.h index d535cea2..05f56fd1 100644 --- a/libbcachefs/btree_write_buffer.h +++ b/libbcachefs/btree_write_buffer.h @@ -101,6 +101,7 @@ int 
bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_t int bch2_btree_write_buffer_resize(struct bch_fs *, size_t); void bch2_fs_btree_write_buffer_exit(struct bch_fs *); +void bch2_fs_btree_write_buffer_init_early(struct bch_fs *); int bch2_fs_btree_write_buffer_init(struct bch_fs *); #endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */ diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 0903311c..fea61e60 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -29,6 +29,12 @@ #include <linux/preempt.h> void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) +{ + for (unsigned i = 0; i < BCH_DATA_NR; i++) + usage->buckets[i] = percpu_u64_get(&ca->usage->d[i].buckets); +} + +void bch2_dev_usage_full_read_fast(struct bch_dev *ca, struct bch_dev_usage_full *usage) { memset(usage, 0, sizeof(*usage)); acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s()); @@ -75,7 +81,7 @@ bch2_fs_usage_read_short(struct bch_fs *c) void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev *ca, - struct bch_dev_usage *usage) + struct bch_dev_usage_full *usage) { if (out->nr_tabstops < 5) { printbuf_tabstops_reset(out); @@ -365,7 +371,7 @@ found: struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, BTREE_ITER_intent|BTREE_ITER_all_snapshots); - ret = bch2_btree_iter_traverse(&iter) ?: + ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node| BTREE_TRIGGER_norun); @@ -707,7 +713,7 @@ err: struct disk_accounting_pos acc; memset(&acc, 0, sizeof(acc)); acc.type = BCH_DISK_ACCOUNTING_replicas; - memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e)); + unsafe_memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e), "VLA"); gc_stripe_unlock(m); acc.replicas.data_type = data_type; @@ -1132,7 +1138,7 @@ int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c, for_each_online_member(c, ca) { int ret = bch2_trans_mark_dev_sb(c, ca, flags); if (ret) { - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); return ret; } } @@ -1331,7 +1337,7 @@ void bch2_dev_buckets_free(struct bch_dev *ca) int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { - ca->usage = alloc_percpu(struct bch_dev_usage); + ca->usage = alloc_percpu(struct bch_dev_usage_full); if (!ca->usage) return -BCH_ERR_ENOMEM_usage_init; diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index c5363256..1c38b165 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -172,7 +172,16 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) return ret; } -void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage *); +void bch2_dev_usage_full_read_fast(struct bch_dev *, struct bch_dev_usage_full *); +static inline struct bch_dev_usage_full bch2_dev_usage_full_read(struct bch_dev *ca) +{ + struct bch_dev_usage_full ret; + + bch2_dev_usage_full_read_fast(ca, &ret); + return ret; +} + +void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage_full *); static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) { @@ -207,7 +216,7 @@ static inline u64 dev_buckets_free(struct bch_dev *ca, enum bch_watermark watermark) { return max_t(s64, 0, - usage.d[BCH_DATA_free].buckets - + usage.buckets[BCH_DATA_free]- ca->nr_open_buckets - bch2_dev_buckets_reserved(ca, watermark)); } @@ -217,10 +226,10 @@ static inline u64 
__dev_buckets_available(struct bch_dev *ca, enum bch_watermark watermark) { return max_t(s64, 0, - usage.d[BCH_DATA_free].buckets - + usage.d[BCH_DATA_cached].buckets - + usage.d[BCH_DATA_need_gc_gens].buckets - + usage.d[BCH_DATA_need_discard].buckets + usage.buckets[BCH_DATA_free] + + usage.buckets[BCH_DATA_cached] + + usage.buckets[BCH_DATA_need_gc_gens] + + usage.buckets[BCH_DATA_need_discard] - ca->nr_open_buckets - bch2_dev_buckets_reserved(ca, watermark)); } diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 900b8680..0aed2500 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -54,7 +54,12 @@ struct bucket_gens { u8 b[] __counted_by(nbuckets); }; +/* Only info on bucket counts: */ struct bch_dev_usage { + u64 buckets[BCH_DATA_NR]; +}; + +struct bch_dev_usage_full { struct bch_dev_usage_type { u64 buckets; u64 sectors; /* _compressed_ sectors: */ diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 584f4a3e..5891b3a1 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -350,8 +350,8 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, if (ctx->arg.op == BCH_DATA_OP_scrub) { struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev); if (ca) { - struct bch_dev_usage u; - bch2_dev_usage_read_fast(ca, &u); + struct bch_dev_usage_full u; + bch2_dev_usage_full_read_fast(ca, &u); for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++) if (ctx->arg.scrub.data_types & BIT(i)) e.p.sectors_total += u.d[i].sectors; @@ -473,7 +473,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, struct bch_ioctl_dev_usage __user *user_arg) { struct bch_ioctl_dev_usage arg; - struct bch_dev_usage src; + struct bch_dev_usage_full src; struct bch_dev *ca; unsigned i; @@ -493,7 +493,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, if (IS_ERR(ca)) return PTR_ERR(ca); - src = bch2_dev_usage_read(ca); + src = bch2_dev_usage_full_read(ca); arg.state = ca->mi.state; arg.bucket_size = ca->mi.bucket_size; @@ -514,7 +514,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, struct bch_ioctl_dev_usage_v2 __user *user_arg) { struct bch_ioctl_dev_usage_v2 arg; - struct bch_dev_usage src; + struct bch_dev_usage_full src; struct bch_dev *ca; int ret = 0; @@ -534,7 +534,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, if (IS_ERR(ca)) return PTR_ERR(ca); - src = bch2_dev_usage_read(ca); + src = bch2_dev_usage_full_read(ca); arg.state = ca->mi.state; arg.bucket_size = ca->mi.bucket_size; @@ -615,7 +615,7 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, for_each_online_member(c, ca) if (ca->dev == dev) { - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); return ca->dev_idx; } diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c index 85fc9034..d68c3c78 100644 --- a/libbcachefs/compress.c +++ b/libbcachefs/compress.c @@ -371,13 +371,14 @@ static int attempt_compress(struct bch_fs *c, }; zlib_set_workspace(&strm, workspace); - zlib_deflateInit2(&strm, + if (zlib_deflateInit2(&strm, compression.level ?
clamp_t(unsigned, compression.level, Z_BEST_SPEED, Z_BEST_COMPRESSION) : Z_DEFAULT_COMPRESSION, Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, - Z_DEFAULT_STRATEGY); + Z_DEFAULT_STRATEGY) != Z_OK) + return 0; if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) return 0; @@ -713,7 +714,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, ret = match_string(bch2_compression_opts, -1, type_str); if (ret < 0 && err) - prt_str(err, "invalid compression type"); + prt_str(err, "invalid compression type\n"); if (ret < 0) goto err; @@ -728,7 +729,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, if (!ret && level > 15) ret = -EINVAL; if (ret < 0 && err) - prt_str(err, "invalid compression level"); + prt_str(err, "invalid compression level\n"); if (ret < 0) goto err; diff --git a/libbcachefs/darray.h b/libbcachefs/darray.h index c6151495..88f0ca3f 100644 --- a/libbcachefs/darray.h +++ b/libbcachefs/darray.h @@ -20,7 +20,17 @@ struct { \ #define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) typedef DARRAY(char) darray_char; -typedef DARRAY(char *) darray_str; +typedef DARRAY(char *) darray_str; + +typedef DARRAY(u8) darray_u8; +typedef DARRAY(u16) darray_u16; +typedef DARRAY(u32) darray_u32; +typedef DARRAY(u64) darray_u64; + +typedef DARRAY(s8) darray_s8; +typedef DARRAY(s16) darray_s16; +typedef DARRAY(s32) darray_s32; +typedef DARRAY(s64) darray_s64; int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index fe400dfc..b211c972 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -216,7 +216,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, bch2_trans_begin(trans); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; @@ -398,7 +398,7 @@ restart_drop_extra_replicas: BCH_TRANS_COMMIT_no_enospc| m->data_opts.btree_insert_flags); if (!ret) { - bch2_btree_iter_set_pos(&iter, next_pos); + bch2_btree_iter_set_pos(trans, &iter, next_pos); this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); if (trace_io_move_finish_enabled()) @@ -426,7 +426,7 @@ nowork: count_event(c, io_move_fail); - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); goto next; } out: @@ -497,7 +497,7 @@ static int bch2_update_unwritten_extent(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos, BTREE_ITER_slots); ret = lockrestart_do(trans, ({ - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); bkey_err(k); })); bch2_trans_iter_exit(trans, &iter); @@ -607,7 +607,7 @@ void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update prt_newline(out); printbuf_indent_add(out, 2); bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); - prt_printf(out, "read_done:\t\%u\n", m->read_done); + prt_printf(out, "read_done:\t%u\n", m->read_done); bch2_write_op_to_text(out, &m->op); printbuf_indent_sub(out, 2); } diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index 788af88f..09b67279 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -57,7 +57,7 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, submit_bio_wait(bio); bio_put(bio); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); memcpy(n_ondisk, n_sorted, btree_buf_bytes(b)); @@ -297,7 +297,7 @@ out: if (bio) bio_put(bio); kvfree(n_ondisk); - 
percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); } #ifdef CONFIG_DEBUG_FS @@ -770,6 +770,12 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, mutex_lock(&s->lock); prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem); +#ifdef CONFIG_BCACHEFS_DEBUG + printbuf_indent_add(&i->buf, 2); + bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace); + printbuf_indent_sub(&i->buf, 2); +#endif + prt_printf(&i->buf, "Transaction duration:\n"); printbuf_indent_add(&i->buf, 2); @@ -927,7 +933,11 @@ void bch2_fs_debug_init(struct bch_fs *c) if (IS_ERR_OR_NULL(bch_debug)) return; - snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); + if (!c->opts.single_device) + snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); + else + strscpy(name, c->name, sizeof(name)); + c->fs_debug_dir = debugfs_create_dir(name, bch_debug); if (IS_ERR_OR_NULL(c->fs_debug_dir)) return; diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index d7f9f793..8488a757 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -287,8 +287,8 @@ static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent, EBUG_ON(!dirent->v.d_casefold); EBUG_ON(!cf_name->len); - dirent->v.d_cf_name_block.d_name_len = name->len; - dirent->v.d_cf_name_block.d_cf_name_len = cf_name->len; + dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len); + dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_name->len); memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len); memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0, @@ -417,8 +417,8 @@ int bch2_dirent_rename(struct btree_trans *trans, enum bch_rename_mode mode) { struct qstr src_name_lookup, dst_name_lookup; - struct btree_iter src_iter = { NULL }; - struct btree_iter dst_iter = { NULL }; + struct btree_iter src_iter = {}; + struct btree_iter dst_iter = {}; struct bkey_s_c old_src, old_dst = bkey_s_c_null; struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; struct bpos dst_pos = @@ -586,16 +586,16 @@ out_set_src: } if (delete_src) { - bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); - ret = bch2_btree_iter_traverse(&src_iter) ?: + bch2_btree_iter_set_snapshot(trans, &src_iter, old_src.k->p.snapshot); + ret = bch2_btree_iter_traverse(trans, &src_iter) ?: bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node); if (ret) goto out; } if (delete_dst) { - bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot); - ret = bch2_btree_iter_traverse(&dst_iter) ?: + bch2_btree_iter_set_snapshot(trans, &dst_iter, old_dst.k->p.snapshot); + ret = bch2_btree_iter_traverse(trans, &dst_iter) ?: bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node); if (ret) goto out; @@ -642,7 +642,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, const struct qstr *name, subvol_inum *inum) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; int ret = lockrestart_do(trans, bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); @@ -771,7 +771,7 @@ int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(&iter) ?: + ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_hash_delete_at(trans, bch2_dirent_hash_desc, &dir_hash_info, &iter, 
BTREE_UPDATE_internal_snapshot_node); diff --git a/libbcachefs/disk_accounting.c b/libbcachefs/disk_accounting.c index a59f6c12..b007319b 100644 --- a/libbcachefs/disk_accounting.c +++ b/libbcachefs/disk_accounting.c @@ -739,7 +739,7 @@ int bch2_accounting_read(struct bch_fs *c) struct disk_accounting_pos next; memset(&next, 0, sizeof(next)); next.type = acc_k.type + 1; - bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); + bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next)); continue; } @@ -930,7 +930,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c) struct disk_accounting_pos next; memset(&next, 0, sizeof(next)); next.type = acc_k.type + 1; - bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); + bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next)); continue; } diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c index 5df8de0b..1186280b 100644 --- a/libbcachefs/disk_groups.c +++ b/libbcachefs/disk_groups.c @@ -555,9 +555,9 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) ? rcu_dereference(c->devs[t.dev]) : NULL; - if (ca && percpu_ref_tryget(&ca->io_ref)) { + if (ca && percpu_ref_tryget(&ca->io_ref[READ])) { prt_printf(out, "/dev/%s", ca->name); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); } else if (ca) { prt_printf(out, "offline device %u", t.dev); } else { diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 6faeda7a..42600370 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -105,6 +105,7 @@ struct ec_bio { struct bch_dev *ca; struct ec_stripe_buf *buf; size_t idx; + int rw; u64 submit_time; struct bio bio; }; @@ -462,7 +463,8 @@ int bch2_trigger_stripe(struct btree_trans *trans, return ret; if (gc) - memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas)); + unsafe_memcpy(&gc->r.e, &acc.replicas, + replicas_entry_bytes(&acc.replicas), "VLA"); } if (old_s) { @@ -703,6 +705,7 @@ static void ec_block_endio(struct bio *bio) struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; + int rw = ec_bio->rw; bch2_account_io_completion(ca, bio_data_dir(bio), ec_bio->submit_time, !bio->bi_status); @@ -724,7 +727,7 @@ static void ec_block_endio(struct bio *bio) } bio_put(&ec_bio->bio); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[rw]); closure_put(cl); } @@ -775,6 +778,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ec_bio->ca = ca; ec_bio->buf = buf; ec_bio->idx = idx; + ec_bio->rw = rw; ec_bio->submit_time = local_clock(); ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); @@ -784,14 +788,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); closure_get(cl); - percpu_ref_get(&ca->io_ref); + percpu_ref_get(&ca->io_ref[rw]); submit_bio(&ec_bio->bio); offset += b; } - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[rw]); } static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, @@ -1264,7 +1268,7 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, ob->sectors_free, GFP_KERNEL, 0); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[WRITE]); if (ret) s->err = ret; @@ -1712,23 +1716,32 @@ err: } static int new_stripe_alloc_buckets(struct btree_trans *trans, + struct alloc_request *req, struct ec_stripe_head *h, struct ec_stripe_new *s, - enum bch_watermark watermark, struct closure *cl) + 
struct closure *cl) { struct bch_fs *c = trans->c; - struct bch_devs_mask devs = h->devs; struct open_bucket *ob; - struct open_buckets buckets; struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; unsigned i, j, nr_have_parity = 0, nr_have_data = 0; - bool have_cache = true; int ret = 0; + req->scratch_data_type = req->data_type; + req->scratch_ptrs = req->ptrs; + req->scratch_nr_replicas = req->nr_replicas; + req->scratch_nr_effective = req->nr_effective; + req->scratch_have_cache = req->have_cache; + req->scratch_devs_may_alloc = req->devs_may_alloc; + + req->devs_may_alloc = h->devs; + req->have_cache = true; + BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity); BUG_ON(v->nr_redundant != s->nr_parity); /* * We bypass the sector allocator which normally does this: */ - bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); + bitmap_and(req->devs_may_alloc.d, req->devs_may_alloc.d, + c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) { /* @@ -1738,7 +1751,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, * block when updating the stripe */ if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) - __clear_bit(v->ptrs[i].dev, devs.d); + __clear_bit(v->ptrs[i].dev, req->devs_may_alloc.d); if (i < s->nr_data) nr_have_data++; @@ -1749,60 +1762,58 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, BUG_ON(nr_have_data > s->nr_data); BUG_ON(nr_have_parity > s->nr_parity); - buckets.nr = 0; + req->ptrs.nr = 0; if (nr_have_parity < s->nr_parity) { - ret = bch2_bucket_alloc_set_trans(trans, &buckets, - &h->parity_stripe, - &devs, - s->nr_parity, - &nr_have_parity, - &have_cache, 0, - BCH_DATA_parity, - watermark, - cl); + req->nr_replicas = s->nr_parity; + req->nr_effective = nr_have_parity; + req->data_type = BCH_DATA_parity; - open_bucket_for_each(c, &buckets, ob, i) { + ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl); + + open_bucket_for_each(c, &req->ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, s->nr_data + s->nr_parity, s->nr_data); BUG_ON(j >= s->nr_data + s->nr_parity); - s->blocks[j] = buckets.v[i]; + s->blocks[j] = req->ptrs.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, s->blocks_gotten); } if (ret) - return ret; + goto err; } - buckets.nr = 0; + req->ptrs.nr = 0; if (nr_have_data < s->nr_data) { - ret = bch2_bucket_alloc_set_trans(trans, &buckets, - &h->block_stripe, - &devs, - s->nr_data, - &nr_have_data, - &have_cache, 0, - BCH_DATA_user, - watermark, - cl); + req->nr_replicas = s->nr_data; + req->nr_effective = nr_have_data; + req->data_type = BCH_DATA_user; - open_bucket_for_each(c, &buckets, ob, i) { + ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl); + + open_bucket_for_each(c, &req->ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, s->nr_data, 0); BUG_ON(j >= s->nr_data); - s->blocks[j] = buckets.v[i]; + s->blocks[j] = req->ptrs.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, s->blocks_gotten); } if (ret) - return ret; + goto err; } - - return 0; +err: + req->data_type = req->scratch_data_type; + req->ptrs = req->scratch_ptrs; + req->nr_replicas = req->scratch_nr_replicas; + req->nr_effective = req->scratch_nr_effective; + req->have_cache = req->scratch_have_cache; + req->devs_may_alloc = req->scratch_devs_may_alloc; + return ret; } static int __get_existing_stripe(struct btree_trans *trans, @@ -1836,7 +1847,7 @@ static int __get_existing_stripe(struct btree_trans *trans, ret = 1; } out: - 
bch2_set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(trans, &iter); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -1949,7 +1960,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st if (bkey_gt(k.k->p, POS(0, U32_MAX))) { if (start_pos.offset) { start_pos = min_pos; - bch2_btree_iter_set_pos(&iter, start_pos); + bch2_btree_iter_set_pos(trans, &iter, start_pos); continue; } @@ -1983,17 +1994,15 @@ err: } struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, - unsigned target, + struct alloc_request *req, unsigned algo, - unsigned redundancy, - enum bch_watermark watermark, struct closure *cl) { struct bch_fs *c = trans->c; - struct ec_stripe_head *h; - bool waiting = false; + unsigned redundancy = req->nr_replicas - 1; unsigned disk_label = 0; - struct target t = target_decode(target); + struct target t = target_decode(req->target); + bool waiting = false; int ret; if (t.type == TARGET_GROUP) { @@ -2004,7 +2013,9 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, disk_label = t.group + 1; /* 0 == no label */ } - h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark); + struct ec_stripe_head *h = + __bch2_ec_stripe_head_get(trans, disk_label, algo, + redundancy, req->watermark); if (IS_ERR_OR_NULL(h)) return h; @@ -2028,8 +2039,12 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, goto alloc_existing; /* First, try to allocate a full stripe: */ - ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?: + enum bch_watermark saved_watermark = BCH_WATERMARK_stripe; + swap(req->watermark, saved_watermark); + ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: __bch2_ec_stripe_head_reserve(trans, h, s); + swap(req->watermark, saved_watermark); + if (!ret) goto allocate_buf; if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || @@ -2047,8 +2062,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) goto err; - if (watermark == BCH_WATERMARK_copygc) { - ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?: + if (req->watermark == BCH_WATERMARK_copygc) { + ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: __bch2_ec_stripe_head_reserve(trans, h, s); if (ret) goto err; @@ -2067,7 +2082,7 @@ alloc_existing: * Retry allocating buckets, with the watermark for this * particular write: */ - ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl); + ret = new_stripe_alloc_buckets(trans, req, h, s, cl); if (ret) goto err; diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h index 62d27e04..6780292d 100644 --- a/libbcachefs/ec.h +++ b/libbcachefs/ec.h @@ -254,9 +254,10 @@ void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int); int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); + +struct alloc_request; struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, - unsigned, unsigned, unsigned, - enum bch_watermark, struct closure *); + struct alloc_request *, unsigned, struct closure *); void bch2_do_stripe_deletes(struct bch_fs *); void bch2_ec_do_stripe_creates(struct bch_fs *); diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index c8696f01..e3c85288 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -192,6 +192,8 @@ x(BCH_ERR_data_update_done, data_update_done_no_rw_devs) \ x(EINVAL, 
device_state_not_allowed) \ x(EINVAL, member_info_missing) \ + x(EINVAL, single_device_filesystem) \ + x(EINVAL, not_single_device_filesystem) \ x(EINVAL, mismatched_block_size) \ x(EINVAL, block_size_too_small) \ x(EINVAL, bucket_size_too_small) \ @@ -211,6 +213,7 @@ x(EINVAL, inode_unpack_error) \ x(EINVAL, varint_decode_error) \ x(EINVAL, erasure_coding_found_btree_node) \ + x(EINVAL, option_negative) \ x(EOPNOTSUPP, may_not_use_incompat_feature) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ diff --git a/libbcachefs/error.c b/libbcachefs/error.c index d4dfd13a..baf5dfb3 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -34,7 +34,7 @@ bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out) journal_cur_seq(&c->journal)); return true; case BCH_ON_ERROR_panic: - bch2_print_string_as_lines(KERN_ERR, out->buf); + bch2_print_string_as_lines_nonblocking(KERN_ERR, out->buf); panic(bch2_fmt(c, "panic after error")); return true; default: @@ -45,6 +45,8 @@ bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out) bool bch2_inconsistent_error(struct bch_fs *c) { struct printbuf buf = PRINTBUF; + buf.atomic++; + printbuf_indent_add_nextline(&buf, 2); bool ret = __bch2_inconsistent_error(c, &buf); @@ -59,6 +61,7 @@ static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *tra const char *fmt, va_list args) { struct printbuf buf = PRINTBUF; + buf.atomic++; bch2_log_msg_start(c, &buf); @@ -68,7 +71,7 @@ static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *tra if (trans) bch2_trans_updates_to_text(&buf, trans); bool ret = __bch2_inconsistent_error(c, &buf); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf); printbuf_exit(&buf); return ret; diff --git a/libbcachefs/extent_update.c b/libbcachefs/extent_update.c index 6aac579a..6bb42985 100644 --- a/libbcachefs/extent_update.c +++ b/libbcachefs/extent_update.c @@ -112,7 +112,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans, unsigned nr_iters = 0; int ret; - ret = bch2_btree_iter_traverse(iter); + ret = bch2_btree_iter_traverse(trans, iter); if (ret) return ret; @@ -126,9 +126,9 @@ int bch2_extent_atomic_end(struct btree_trans *trans, if (ret < 0) return ret; - bch2_trans_copy_iter(©, iter); + bch2_trans_copy_iter(trans, ©, iter); - for_each_btree_key_max_continue_norestart(copy, insert->k.p, 0, k, ret) { + for_each_btree_key_max_continue_norestart(trans, copy, insert->k.p, 0, k, ret) { unsigned offset = 0; if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index ae7c7a17..98de81bd 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -136,12 +136,8 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (k.k->type == KEY_TYPE_error) return -BCH_ERR_key_type_error; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return -BCH_ERR_extent_poisened; - rcu_read_lock(); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; u64 pick_latency; diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c index a03e2c78..e3a75dcc 100644 --- a/libbcachefs/fs-io-buffered.c +++ b/libbcachefs/fs-io-buffered.c @@ -183,12 +183,12 @@ static void bchfs_read(struct btree_trans *trans, if (ret) goto err; - bch2_btree_iter_set_snapshot(&iter, snapshot); + 
bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - bch2_btree_iter_set_pos(&iter, + bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, rbio->bio.bi_iter.bi_sector)); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; @@ -225,11 +225,26 @@ static void bchfs_read(struct btree_trans *trans, bch2_read_extent(trans, rbio, iter.pos, data_btree, k, offset_into_extent, flags); - swap(rbio->bio.bi_iter.bi_size, bytes); + /* + * Careful: there's a landmine here if bch2_read_extent() ever + * starts returning transaction restarts. + * + * We've changed rbio->bi_iter.bi_size to be "bytes we can read + * from this extent" with the swap call, and we restore it + * below. That restore needs to come before checking for + * errors. + * + * But unlike __bch2_read(), we use the rbio bvec iter, not one + * on the stack, so we can't do the restore right after the + * bch2_read_extent() call: we don't own that iterator anymore + * if BCH_READ_last_fragment is set, since we may have submitted + * that rbio instead of cloning it. + */ if (flags & BCH_READ_last_fragment) break; + swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); err: if (ret && diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index c80ed3a5..409bba39 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -48,7 +48,7 @@ static void nocow_flush_endio(struct bio *_bio) struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); closure_put(bio->cl); - percpu_ref_put(&bio->ca->io_ref); + percpu_ref_put(&bio->ca->io_ref[WRITE]); bio_put(&bio->bio); } @@ -71,7 +71,7 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { rcu_read_lock(); ca = rcu_dereference(c->devs[dev]); - if (ca && !percpu_ref_tryget(&ca->io_ref)) + if (ca && !percpu_ref_tryget(&ca->io_ref[WRITE])) ca = NULL; rcu_read_unlock(); @@ -241,6 +241,7 @@ out: if (!ret) ret = err; + bch_err_fn(c, ret); return ret; } @@ -636,9 +637,9 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, if (ret) goto bkey_err; - bch2_btree_iter_set_snapshot(&iter, snapshot); + bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); if ((ret = bkey_err(k))) goto bkey_err; @@ -649,13 +650,13 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, /* already reserved */ if (bkey_extent_is_reservation(k) && bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); continue; } if (bkey_extent_is_data(k.k) && !(mode & FALLOC_FL_ZERO_RANGE)) { - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); continue; } @@ -676,7 +677,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, if (ret) goto bkey_err; } - bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); + bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, hole_start)); if (ret) goto bkey_err; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index bb303791..5796844f 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -88,7 +88,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, void *p, unsigned fields) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct bch_inode_unpacked inode_u; int ret; retry: @@ -1075,7 +1075,7 @@
int bch2_setattr_nonsize(struct mnt_idmap *idmap, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_qid qid; struct btree_trans *trans; - struct btree_iter inode_iter = { NULL }; + struct btree_iter inode_iter = {}; struct bch_inode_unpacked inode_u; struct posix_acl *acl = NULL; kuid_t kuid; @@ -1330,9 +1330,9 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (ret) continue; - bch2_btree_iter_set_snapshot(&iter, snapshot); + bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - k = bch2_btree_iter_peek_max(&iter, end); + k = bch2_btree_iter_peek_max(trans, &iter, end); ret = bkey_err(k); if (ret) continue; @@ -1342,7 +1342,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (!bkey_extent_is_data(k.k) && k.k->type != KEY_TYPE_reservation) { - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); continue; } @@ -1380,7 +1380,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bkey_copy(prev.k, cur.k); have_extent = true; - bch2_btree_iter_set_pos(&iter, + bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, iter.pos.offset + sectors)); } bch2_trans_iter_exit(trans, &iter); @@ -1697,17 +1697,17 @@ retry: if (ret) goto err; - bch2_btree_iter_set_snapshot(&iter1, snapshot); - bch2_btree_iter_set_snapshot(&iter2, snapshot); + bch2_btree_iter_set_snapshot(trans, &iter1, snapshot); + bch2_btree_iter_set_snapshot(trans, &iter2, snapshot); ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u); if (ret) goto err; if (inode_u.bi_dir == dir->ei_inode.bi_inum) { - bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); + bch2_btree_iter_set_pos(trans, &iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); - k = bch2_btree_iter_peek_slot(&iter1); + k = bch2_btree_iter_peek_slot(trans, &iter1); ret = bkey_err(k); if (ret) goto err; @@ -1731,7 +1731,7 @@ retry: * File with multiple hardlinks and our backref is to the wrong * directory - linear search: */ - for_each_btree_key_continue_norestart(iter2, 0, k, ret) { + for_each_btree_key_continue_norestart(trans, iter2, 0, k, ret) { if (k.k->p.inode > dir->ei_inode.bi_inum) break; @@ -2237,7 +2237,7 @@ got_sb: /* XXX: create an anonymous device for multi device filesystems */ sb->s_bdev = bdev; sb->s_dev = bdev->bd_dev; - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); break; } diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 52320295..8e95cc7f 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -186,7 +186,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, { struct bch_fs *c = trans->c; struct qstr lostfound_str = QSTR("lost+found"); - struct btree_iter lostfound_iter = { NULL }; + struct btree_iter lostfound_iter = {}; u64 inum = 0; unsigned d_type = 0; int ret; @@ -295,8 +295,8 @@ create_lostfound: if (ret) goto err; - bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot); - ret = bch2_btree_iter_traverse(&lostfound_iter); + bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot); + ret = bch2_btree_iter_traverse(trans, &lostfound_iter); if (ret) goto err; @@ -544,7 +544,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub new_inode.bi_subvol = subvolid; int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?: - bch2_btree_iter_traverse(&inode_iter) ?: + bch2_btree_iter_traverse(trans, &inode_iter) ?: bch2_inode_write(trans, &inode_iter, &new_inode); 
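Nearly every hunk in this patch applies the same mechanical conversion: the btree iterator entry points (peek, peek_slot, advance, traverse, set_pos, set_snapshot, copy_iter) now take the btree_trans as an explicit first argument instead of reaching through iter->trans, and empty iterators are zero-initialized with {} rather than { NULL }. A minimal sketch of the resulting calling convention, using only functions visible in this patch; the helper name is hypothetical:

static int peek_one_slot(struct btree_trans *trans, struct bpos pos)
{
	struct btree_iter iter;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, pos,
			     BTREE_ITER_slots);

	/* iterator calls now thread trans explicitly: */
	struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, &iter);
	int ret = bkey_err(k);
	if (!ret)
		bch2_btree_iter_advance(trans, &iter);

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}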
bch2_trans_iter_exit(trans, &inode_iter); if (ret) @@ -609,7 +609,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 struct btree_iter iter = {}; bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); - struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0)); + struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0)); bch2_trans_iter_exit(trans, &iter); int ret = bkey_err(k); if (ret) @@ -1557,7 +1557,7 @@ static int overlapping_extents_found(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; - struct btree_iter iter1, iter2 = { NULL }; + struct btree_iter iter1, iter2 = {}; struct bkey_s_c k1, k2; int ret; @@ -1566,7 +1566,7 @@ static int overlapping_extents_found(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter1, btree, pos1, BTREE_ITER_all_snapshots| BTREE_ITER_not_extents); - k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX)); + k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX)); ret = bkey_err(k1); if (ret) goto err; @@ -1586,12 +1586,12 @@ static int overlapping_extents_found(struct btree_trans *trans, goto err; } - bch2_trans_copy_iter(&iter2, &iter1); + bch2_trans_copy_iter(trans, &iter2, &iter1); while (1) { - bch2_btree_iter_advance(&iter2); + bch2_btree_iter_advance(trans, &iter2); - k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX)); + k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX)); ret = bkey_err(k2); if (ret) goto err; @@ -1791,9 +1791,9 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { struct btree_iter iter2; - bch2_trans_copy_iter(&iter2, iter); - bch2_btree_iter_set_snapshot(&iter2, i->snapshot); - ret = bch2_btree_iter_traverse(&iter2) ?: + bch2_trans_copy_iter(trans, &iter2, iter); + bch2_btree_iter_set_snapshot(trans, &iter2, i->snapshot); + ret = bch2_btree_iter_traverse(trans, &iter2) ?: bch2_btree_delete_at(trans, &iter2, BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &iter2); @@ -2185,7 +2185,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, BTREE_ID_dirents, SPOS(k.k->p.inode, k.k->p.offset, *i), BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(&delete_iter) ?: + ret = bch2_btree_iter_traverse(trans, &delete_iter) ?: bch2_hash_delete_at(trans, bch2_dirent_hash_desc, hash_info, &delete_iter, @@ -2366,8 +2366,6 @@ int bch2_check_root(struct bch_fs *c) return ret; } -typedef DARRAY(u32) darray_u32; - static bool darray_u32_has(darray_u32 *d, u32 v) { darray_for_each(*d, i) @@ -2412,7 +2410,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, bch2_trans_iter_exit(trans, &parent_iter); bch2_trans_iter_init(trans, &parent_iter, BTREE_ID_subvolumes, POS(0, parent), 0); - k = bch2_btree_iter_peek_slot(&parent_iter); + k = bch2_btree_iter_peek_slot(trans, &parent_iter); ret = bkey_err(k); if (ret) goto err; diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 80051073..b51d98cf 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -940,7 +940,7 @@ int bch2_inode_create(struct btree_trans *trans, BTREE_ITER_intent); struct bkey_s_c k; again: - while ((k = bch2_btree_iter_peek(iter)).k && + while ((k = bch2_btree_iter_peek(trans, iter)).k && !(ret = bkey_err(k)) && bkey_lt(k.k->p, POS(0, max))) { if (pos < iter->pos.offset) @@ -951,7 +951,7 @@ again: * we've found just one: */ 
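The local darray_u32 typedef removed from fsck.c above pairs with the darray.h hunk earlier in this patch, which now supplies the fixed-width typedefs (darray_u8 through darray_s64) centrally. A short sketch of the usual darray pattern, assuming only the darray.h API already in this tree; both helper names are made up for illustration:

/* collect unique u32 ids in a darray_u32 */
static bool ids_has(darray_u32 *d, u32 v)
{
	darray_for_each(*d, i)
		if (*i == v)
			return true;
	return false;
}

static int ids_add(darray_u32 *d, u32 v)
{
	/* darray_push() grows the array as needed, returning 0 or -ENOMEM */
	return ids_has(d, v) ? 0 : darray_push(d, v);
}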
pos = iter->pos.offset + 1; - bch2_btree_iter_set_pos(iter, POS(0, pos)); + bch2_btree_iter_set_pos(trans, iter, POS(0, pos)); } if (!ret && pos < max) @@ -967,12 +967,12 @@ again: /* Retry from start */ pos = start = min; - bch2_btree_iter_set_pos(iter, POS(0, pos)); + bch2_btree_iter_set_pos(trans, iter, POS(0, pos)); le32_add_cpu(&cursor->v.gen, 1); goto again; found_slot: - bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); - k = bch2_btree_iter_peek_slot(iter); + bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, snapshot)); + k = bch2_btree_iter_peek_slot(trans, iter); ret = bkey_err(k); if (ret) { bch2_trans_iter_exit(trans, iter); @@ -1009,9 +1009,9 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, if (ret) goto err; - bch2_btree_iter_set_snapshot(&iter, snapshot); + bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - k = bch2_btree_iter_peek_max(&iter, end); + k = bch2_btree_iter_peek_max(trans, &iter, end); ret = bkey_err(k); if (ret) goto err; @@ -1042,7 +1042,7 @@ err: int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct bkey_s_c k; u32 snapshot; int ret; @@ -1207,7 +1207,7 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) { struct bch_fs *c = trans->c; - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct bkey_i_inode_generation delete; struct bch_inode_unpacked inode_u; struct bkey_s_c k; diff --git a/libbcachefs/io_misc.c b/libbcachefs/io_misc.c index 6b842c8d..cc07729a 100644 --- a/libbcachefs/io_misc.c +++ b/libbcachefs/io_misc.c @@ -43,7 +43,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, bch2_bkey_buf_init(&new); closure_init_stack(&cl); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(trans, iter); ret = bkey_err(k); if (ret) return ret; @@ -164,12 +164,12 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, if (ret) continue; - bch2_btree_iter_set_snapshot(iter, snapshot); + bch2_btree_iter_set_snapshot(trans, iter, snapshot); /* * peek_max() doesn't have ideal semantics for extents: */ - k = bch2_btree_iter_peek_max(iter, end_pos); + k = bch2_btree_iter_peek_max(trans, iter, end_pos); if (!k.k) break; @@ -230,7 +230,7 @@ static int truncate_set_isize(struct btree_trans *trans, u64 new_i_size, bool warn) { - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct bch_inode_unpacked inode_u; int ret; @@ -399,7 +399,7 @@ case LOGGED_OP_FINSERT_start: if (ret) goto err; } else { - bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset)); + bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, src_offset)); ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -425,12 +425,12 @@ case LOGGED_OP_FINSERT_shift_extents: if (ret) goto btree_err; - bch2_btree_iter_set_snapshot(&iter, snapshot); - bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot)); + bch2_btree_iter_set_snapshot(trans, &iter, snapshot); + bch2_btree_iter_set_pos(trans, &iter, SPOS(inum.inum, pos, snapshot)); k = insert - ? bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0)) - : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX)); + ? 
bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum.inum, 0)) + : bch2_btree_iter_peek_max(trans, &iter, POS(inum.inum, U64_MAX)); if ((ret = bkey_err(k))) goto btree_err; diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c index fd01e67b..81430372 100644 --- a/libbcachefs/io_read.c +++ b/libbcachefs/io_read.c @@ -25,6 +25,7 @@ #include "subvolume.h" #include "trace.h" +#include <linux/moduleparam.h> #include <linux/random.h> #include <linux/sched/mm.h> @@ -34,6 +35,12 @@ module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644); MODULE_PARM_DESC(read_corrupt_ratio, ""); #endif +static bool bch2_poison_extents_on_checksum_error; +module_param_named(poison_extents_on_checksum_error, + bch2_poison_extents_on_checksum_error, bool, 0644); +MODULE_PARM_DESC(poison_extents_on_checksum_error, + "Extents with checksum errors are marked as poisoned - unsafe without read fua support"); + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static bool bch2_target_congested(struct bch_fs *c, u16 target) @@ -296,6 +303,13 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans, bool *read_full, struct bch_io_failures *failed) { + /* + * We're in the retry path, but we don't know what to repair yet, and we + * don't want to do a promote here: + */ + if (failed && !failed->nr) + return NULL; + struct bch_fs *c = trans->c; /* * if failed != NULL we're not actually doing a promote, we're @@ -394,7 +408,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) if (rbio->have_ioref) { struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); } if (rbio->split) { @@ -430,6 +444,74 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) bio_endio(&rbio->bio); } +static void get_rbio_extent(struct btree_trans *trans, + struct bch_read_bio *rbio, + struct bkey_buf *sk) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = lockrestart_do(trans, + bkey_err(k = bch2_bkey_get_iter(trans, &iter, + rbio->data_btree, rbio->data_pos, 0))); + if (ret) + return; + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr) + if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) { + bch2_bkey_buf_reassemble(sk, trans->c, k); + break; + } + + bch2_trans_iter_exit(trans, &iter); +} + +static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio, + enum btree_id btree, struct bkey_s_c read_k) +{ + if (!bch2_poison_extents_on_checksum_error) + return 0; + + struct bch_fs *c = trans->c; + + struct data_update *u = rbio_data_update(rbio); + if (u) + read_k = bkey_i_to_s_c(u->k.k); + + u64 flags = bch2_bkey_extent_flags(read_k); + if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 0; + + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k), + BTREE_ITER_intent); + int ret = bkey_err(k); + if (ret) + return ret; + + if (!bkey_and_val_eq(k, read_k)) + goto out; + + struct bkey_i *new = bch2_trans_kmalloc(trans, + bkey_bytes(k.k) + sizeof(struct bch_extent_flags)); + ret = PTR_ERR_OR_ZERO(new) ?: + (bkey_reassemble(new, k), 0) ?: + bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?: + bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + + /* + * Propagate key change back to data update path, in particular so it + * knows the extent has been poisoned and it's safe to change the + * checksum + */ 
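The poisoning flow introduced in this hunk reduces to a check-then-set pattern on the new extent flags. Distilled into a standalone sketch, with a hypothetical function name and only helpers that appear in the hunk itself:

static int poison_key(struct btree_trans *trans, struct btree_iter *iter,
		      struct bkey_s_c k)
{
	u64 flags = bch2_bkey_extent_flags(k);

	if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
		return 0;	/* already poisoned */

	struct bkey_i *new = bch2_trans_kmalloc(trans,
			bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
	int ret = PTR_ERR_OR_ZERO(new);
	if (ret)
		return ret;

	bkey_reassemble(new, k);
	return bch2_bkey_extent_flags_set(trans->c, new,
				flags | BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
	       bch2_trans_update(trans, iter, new,
				 BTREE_UPDATE_internal_snapshot_node);
}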
+ if (u && !ret) + bch2_bkey_buf_copy(&u->k, c, new); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, @@ -463,7 +545,8 @@ retry: err: bch2_trans_iter_exit(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, BCH_ERR_data_read_retry)) goto retry; if (ret) { @@ -487,13 +570,21 @@ static void bch2_rbio_retry(struct work_struct *work) .inum = rbio->read_pos.inode, }; struct bch_io_failures failed = { .nr = 0 }; + struct btree_trans *trans = bch2_trans_get(c); + struct bkey_buf sk; + bch2_bkey_buf_init(&sk); + bkey_init(&sk.k->k); + trace_io_read_retry(&rbio->bio); this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], bvec_iter_sectors(rbio->bvec_iter)); - if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) + get_rbio_extent(trans, rbio, &sk); + + if (!bkey_deleted(&sk.k->k) && + bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) bch2_mark_io_failure(&failed, &rbio->pick, rbio->ret == -BCH_ERR_data_read_retry_csum_err); @@ -514,7 +605,7 @@ static void bch2_rbio_retry(struct work_struct *work) int ret = rbio->data_update ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags) - : __bch2_read(trans, rbio, iter, inum, &failed, flags); + : __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags); if (ret) { rbio->ret = ret; @@ -535,6 +626,7 @@ static void bch2_rbio_retry(struct work_struct *work) } bch2_rbio_done(rbio); + bch2_bkey_buf_exit(&sk, c); bch2_trans_put(trans); } @@ -909,7 +1001,7 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, prt_printf(&buf, "memory gen: %u", gen); - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter))); if (!ret) { prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, k); @@ -959,6 +1051,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, bvec_iter_sectors(iter)); goto out_read_done; } + + if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) && + !orig->data_update) + return -BCH_ERR_extent_poisened; retry_pick: ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); @@ -967,6 +1063,16 @@ retry_pick: goto hole; if (unlikely(ret < 0)) { + if (ret == -BCH_ERR_data_read_csum_err) { + int ret2 = maybe_poison_extent(trans, orig, data_btree, k); + if (ret2) { + ret = ret2; + goto err; + } + + trace_and_count(c, io_read_fail_and_poison, &orig->bio); + } + struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, orig, read_pos); prt_printf(&buf, "%s\n ", bch2_err_str(ret)); @@ -1003,7 +1109,7 @@ retry_pick: unlikely(dev_ptr_stale(ca, &pick.ptr))) { read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); bch2_mark_io_failure(failed, &pick, false); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); goto retry_pick; } @@ -1036,7 +1142,7 @@ retry_pick: */ if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { if (ca) - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); rbio->ret = -BCH_ERR_data_read_buffer_too_small; goto out_read_done; } @@ -1260,12 +1366,15 @@ out_read_done: int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, subvol_inum inum, - struct bch_io_failures *failed, unsigned flags) + struct bch_io_failures *failed, + 
struct bkey_buf *prev_read, + unsigned flags) { struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; + enum btree_id data_btree; int ret; EBUG_ON(rbio->data_update); @@ -1276,7 +1385,7 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, BTREE_ITER_slots); while (1) { - enum btree_id data_btree = BTREE_ID_extents; + data_btree = BTREE_ID_extents; bch2_trans_begin(trans); @@ -1285,12 +1394,12 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, if (ret) goto err; - bch2_btree_iter_set_snapshot(&iter, snapshot); + bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - bch2_btree_iter_set_pos(&iter, + bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, bvec_iter.bi_sector)); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; @@ -1308,6 +1417,12 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, k = bkey_i_to_s_c(sk.k); + if (unlikely(flags & BCH_READ_in_retry)) { + if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k))) + failed->nr = 0; + bch2_bkey_buf_copy(prev_read, c, sk.k); + } + /* * With indirect extents, the amount of data to read is the min * of the original extent and the indirect extent: @@ -1342,9 +1457,7 @@ err: break; } - bch2_trans_iter_exit(trans, &iter); - - if (ret) { + if (unlikely(ret)) { struct printbuf buf = PRINTBUF; lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, &buf, inum, @@ -1360,6 +1473,7 @@ err: bch2_rbio_done(rbio); } + bch2_trans_iter_exit(trans, &iter); bch2_bkey_buf_exit(&sk, c); return ret; } @@ -1370,10 +1484,18 @@ void bch2_fs_io_read_exit(struct bch_fs *c) rhashtable_destroy(&c->promote_table); bioset_exit(&c->bio_read_split); bioset_exit(&c->bio_read); + mempool_exit(&c->bio_bounce_pages); } int bch2_fs_io_read_init(struct bch_fs *c) { + if (mempool_init_page_pool(&c->bio_bounce_pages, + max_t(unsigned, + c->opts.btree_node_size, + c->opts.encoded_extent_max) / + PAGE_SIZE, 0)) + return -BCH_ERR_ENOMEM_bio_bounce_pages_init; + if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), BIOSET_NEED_BVECS)) return -BCH_ERR_ENOMEM_bio_read_init; diff --git a/libbcachefs/io_read.h b/libbcachefs/io_read.h index c78025d8..1a85b092 100644 --- a/libbcachefs/io_read.h +++ b/libbcachefs/io_read.h @@ -144,7 +144,8 @@ static inline void bch2_read_extent(struct btree_trans *trans, } int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, - subvol_inum, struct bch_io_failures *, unsigned flags); + subvol_inum, + struct bch_io_failures *, struct bkey_buf *, unsigned flags); static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, subvol_inum inum) @@ -154,7 +155,7 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, rbio->subvol = inum.subvol; bch2_trans_run(c, - __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, + __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, NULL, BCH_READ_retry_if_stale| BCH_READ_may_promote| BCH_READ_user_mapped)); diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c index 07b55839..4dabff3a 100644 --- a/libbcachefs/io_write.c +++ b/libbcachefs/io_write.c @@ -168,9 +168,9 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, *i_sectors_delta = 0; *disk_sectors_delta = 0; - bch2_trans_copy_iter(&iter, extent_iter); + bch2_trans_copy_iter(trans, &iter, extent_iter); - for_each_btree_key_max_continue_norestart(iter, + for_each_btree_key_max_continue_norestart(trans, 
iter, new->k.p, BTREE_ITER_slots, old, ret) { s64 sectors = min(new->k.p.offset, old.k->p.offset) - max(bkey_start_offset(&new->k), @@ -292,7 +292,7 @@ int bch2_extent_update(struct btree_trans *trans, * path already traversed at iter->pos because * bch2_trans_extent_update() will use it to attempt extent merging */ - ret = __bch2_btree_iter_traverse(iter); + ret = __bch2_btree_iter_traverse(trans, iter); if (ret) return ret; @@ -337,7 +337,7 @@ int bch2_extent_update(struct btree_trans *trans, if (i_sectors_delta_total) *i_sectors_delta_total += i_sectors_delta; - bch2_btree_iter_set_pos(iter, next_pos); + bch2_btree_iter_set_pos(trans, iter, next_pos); return 0; } @@ -445,6 +445,11 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, BUG_ON(c->opts.nochanges); bkey_for_each_ptr(ptrs, ptr) { + /* + * XXX: btree writes should be using io_ref[WRITE], but we + * aren't retrying failed btree writes yet (due to device + * removal/ro): + */ struct bch_dev *ca = nocow ? bch2_dev_have_ref(c, ptr->dev) : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE); @@ -697,12 +702,19 @@ static void bch2_write_endio(struct bio *bio) bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, wbio->submit_time, !bio->bi_status); - if (bio->bi_status) { - bch_err_inum_offset_ratelimited(ca, - op->pos.inode, - wbio->inode_offset << 9, - "data write error: %s", - bch2_blk_status_to_str(bio->bi_status)); + if (unlikely(bio->bi_status)) { + if (ca) + bch_err_inum_offset_ratelimited(ca, + op->pos.inode, + wbio->inode_offset << 9, + "data write error: %s", + bch2_blk_status_to_str(bio->bi_status)); + else + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + wbio->inode_offset << 9, + "data write error: %s", + bch2_blk_status_to_str(bio->bi_status)); set_bit(wbio->dev, op->failed.d); op->flags |= BCH_WRITE_io_error; } @@ -715,7 +727,7 @@ static void bch2_write_endio(struct bio *bio) } if (wbio->have_ioref) - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[WRITE]); if (wbio->bounce) bch2_bio_free_pages_pool(c, bio); @@ -1293,7 +1305,7 @@ retry: if (ret) break; - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) break; @@ -1377,7 +1389,7 @@ retry: bch2_keylist_push(&op->insert_keys); if (op->flags & BCH_WRITE_submitted) break; - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); } out: bch2_trans_iter_exit(trans, &iter); @@ -1414,7 +1426,7 @@ err: return; err_get_ioref: darray_for_each(buckets, i) - percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref); + percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE]); /* Fall back to COW path: */ goto out; @@ -1711,7 +1723,6 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) void bch2_fs_io_write_exit(struct bch_fs *c) { - mempool_exit(&c->bio_bounce_pages); bioset_exit(&c->replica_set); bioset_exit(&c->bio_write); } @@ -1722,12 +1733,5 @@ int bch2_fs_io_write_init(struct bch_fs *c) bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0)) return -BCH_ERR_ENOMEM_bio_write_init; - if (mempool_init_page_pool(&c->bio_bounce_pages, - max_t(unsigned, - c->opts.btree_node_size, - c->opts.encoded_extent_max) / - PAGE_SIZE, 0)) - return -BCH_ERR_ENOMEM_bio_bounce_pages_init; - return 0; } diff --git a/libbcachefs/io_write.h b/libbcachefs/io_write.h index b8ab19a1..2c0a8f35 100644 --- a/libbcachefs/io_write.h +++ b/libbcachefs/io_write.h @@ -17,34 +17,6 @@ void bch2_submit_wbio_replicas(struct 
bch_write_bio *, struct bch_fs *, __printf(3, 4) void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...); -#define BCH_WRITE_FLAGS() \ - x(alloc_nowait) \ - x(cached) \ - x(data_encoded) \ - x(pages_stable) \ - x(pages_owned) \ - x(only_specified_devs) \ - x(wrote_data_inline) \ - x(check_enospc) \ - x(sync) \ - x(move) \ - x(in_worker) \ - x(submitted) \ - x(io_error) \ - x(convert_unwritten) - -enum __bch_write_flags { -#define x(f) __BCH_WRITE_##f, - BCH_WRITE_FLAGS() -#undef x -}; - -enum bch_write_flags { -#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), - BCH_WRITE_FLAGS() -#undef x -}; - static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { return op->watermark == BCH_WATERMARK_copygc diff --git a/libbcachefs/io_write_types.h b/libbcachefs/io_write_types.h index 3ef6df91..b4a6a44a 100644 --- a/libbcachefs/io_write_types.h +++ b/libbcachefs/io_write_types.h @@ -13,6 +13,34 @@ #include <linux/llist.h> #include <linux/workqueue.h> +#define BCH_WRITE_FLAGS() \ + x(alloc_nowait) \ + x(cached) \ + x(data_encoded) \ + x(pages_stable) \ + x(pages_owned) \ + x(only_specified_devs) \ + x(wrote_data_inline) \ + x(check_enospc) \ + x(sync) \ + x(move) \ + x(in_worker) \ + x(submitted) \ + x(io_error) \ + x(convert_unwritten) + +enum __bch_write_flags { +#define x(f) __BCH_WRITE_##f, + BCH_WRITE_FLAGS() +#undef x +}; + +enum bch_write_flags { +#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), + BCH_WRITE_FLAGS() +#undef x +}; + struct bch_write_bio { struct_group(wbio, struct bch_fs *c; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 8a36d553..55ed5704 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -1315,7 +1315,7 @@ int bch2_fs_journal_alloc(struct bch_fs *c) int ret = bch2_dev_journal_alloc(ca, true); if (ret) { - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); return ret; } } @@ -1404,13 +1404,19 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) nr = cur_seq - last_seq; - if (nr + 1 > j->pin.size) { - free_fifo(&j->pin); - init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); - if (!j->pin.data) { - bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); - return -BCH_ERR_ENOMEM_journal_pin_fifo; - } + /* + * Extra fudge factor, in case we crashed when the journal pin fifo was + * nearly or completely full. 
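 * (Illustrative sizing, assuming e.g. cur_seq - last_seq == 40000
 *  open entries at crash time: the 25% fudge factor below gives
 *  nr = 50000, the max() keeps nr at or above JOURNAL_PIN, and
 *  roundup_pow_of_two(50000) allocates a 65536-entry pin fifo.)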
We'll need to be able to open additional + * journal entries (at least a few) in order for journal replay to get + * going: + */ + nr += nr / 4; + + nr = max(nr, JOURNAL_PIN); + init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); + if (!j->pin.data) { + bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); + return -BCH_ERR_ENOMEM_journal_pin_fifo; } j->replay_journal_seq = last_seq; @@ -1454,18 +1460,15 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) j->last_empty_seq = cur_seq - 1; /* to match j->seq */ spin_lock(&j->lock); - set_bit(JOURNAL_running, &j->flags); j->last_flush_write = jiffies; j->reservations.idx = journal_cur_seq(j); c->last_bucket_seq_cleanup = journal_cur_seq(j); - - bch2_journal_space_available(j); spin_unlock(&j->lock); - return bch2_journal_reclaim_start(j); + return 0; } /* init/exit: */ @@ -1554,7 +1557,7 @@ void bch2_fs_journal_exit(struct journal *j) free_fifo(&j->pin); } -int bch2_fs_journal_init(struct journal *j) +void bch2_fs_journal_init_early(struct journal *j) { static struct lock_class_key res_key; @@ -1573,10 +1576,10 @@ int bch2_fs_journal_init(struct journal *j) atomic64_set(&j->reservations.counter, ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); +} - if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) - return -BCH_ERR_ENOMEM_journal_pin_fifo; - +int bch2_fs_journal_init(struct journal *j) +{ j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN; j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL); if (!j->free_buf) @@ -1585,8 +1588,6 @@ int bch2_fs_journal_init(struct journal *j) for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) j->buf[i].idx = i; - j->pin.front = j->pin.back = 1; - j->wq = alloc_workqueue("bcachefs_journal", WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512); if (!j->wq) diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 47828771..0b92a8b5 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -463,6 +463,7 @@ int bch2_fs_journal_start(struct journal *, u64); void bch2_dev_journal_exit(struct bch_dev *); int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); void bch2_fs_journal_exit(struct journal *); +void bch2_fs_journal_init_early(struct journal *); int bch2_fs_journal_init(struct journal *); #endif /* _BCACHEFS_JOURNAL_H */ diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 2debc213..2a54ac79 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1218,7 +1218,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) out: bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); kvfree(buf.data); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); closure_return(cl); return; err: @@ -1253,7 +1253,7 @@ int bch2_journal_read(struct bch_fs *c, if ((ca->mi.state == BCH_MEMBER_STATE_rw || ca->mi.state == BCH_MEMBER_STATE_ro) && - percpu_ref_tryget(&ca->io_ref)) + percpu_ref_tryget(&ca->io_ref[READ])) closure_call(&ca->journal.read, bch2_journal_read_device, system_unbound_wq, @@ -1460,7 +1460,7 @@ fsck_err: static void journal_advance_devs_to_next_bucket(struct journal *j, struct dev_alloc_list *devs, - unsigned sectors, u64 seq) + unsigned sectors, __le64 seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); @@ -1768,7 +1768,7 @@ static void journal_write_endio(struct bio *bio) } closure_put(&w->io); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[WRITE]); } static CLOSURE_CALLBACK(journal_write_submit) @@ -1843,7 +1843,7 @@ static 
CLOSURE_CALLBACK(journal_write_preflush) if (w->separate_flush) { for_each_rw_member(c, ca) { - percpu_ref_get(&ca->io_ref); + percpu_ref_get(&ca->io_ref[WRITE]); struct journal_device *ja = &ca->journal; struct bio *bio = &ja->bio[w->idx]->bio; diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 8e0eba77..51104bbb 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -151,8 +151,6 @@ enum journal_flags { #undef x }; -typedef DARRAY(u64) darray_u64; - struct journal_bio { struct bch_dev *ca; unsigned buf_idx; diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index 57ad6628..90dcf80b 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -130,7 +130,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, retry: ret = 0; while (bch2_trans_begin(trans), - (b = bch2_btree_iter_peek_node(&iter)) && + (b = bch2_btree_iter_peek_node(trans, &iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { bch2_progress_update_iter(trans, progress, &iter, "dropping metadata"); @@ -154,7 +154,7 @@ retry: if (ret) break; next: - bch2_btree_iter_next_node(&iter); + bch2_btree_iter_next_node(trans, &iter); } if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 5d41260e..29a5c708 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -126,26 +126,40 @@ static void move_write_done(struct bch_write_op *op) static void move_write(struct moving_io *io) { + struct bch_fs *c = io->write.op.c; struct moving_context *ctxt = io->write.ctxt; + struct bch_read_bio *rbio = &io->write.rbio; if (ctxt->stats) { - if (io->write.rbio.bio.bi_status) + if (rbio->bio.bi_status) atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, &ctxt->stats->sectors_error_uncorrected); - else if (io->write.rbio.saw_error) + else if (rbio->saw_error) atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, &ctxt->stats->sectors_error_corrected); } - if (unlikely(io->write.rbio.ret || - io->write.rbio.bio.bi_status || - io->write.data_opts.scrub)) { + /* + * If the extent has been bitrotted, we're going to have to give it a + * new checksum in order to move it - but the poison bit will ensure + * that userspace still gets the appropriate error. 
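 * (Sketch of the branch below: a read that failed with
 *  -BCH_ERR_data_read_csum_err on an extent already flagged
 *  BCH_EXTENT_FLAG_poisoned recomputes rbio->pick.crc.csum over the
 *  data as actually read and clears rbio->ret, so the move can
 *  proceed while userspace reads keep failing via the poison bit
 *  rather than the now-matching checksum.)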
+ */ + if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err && + (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) { + struct bch_extent_crc_unpacked crc = rbio->pick.crc; + struct nonce nonce = extent_nonce(rbio->version, crc); + + rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type, + nonce, &rbio->bio); + rbio->ret = 0; + } + + if (unlikely(rbio->ret || io->write.data_opts.scrub)) { move_free(io); return; } if (trace_io_move_write_enabled()) { - struct bch_fs *c = io->write.op.c; struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); @@ -545,7 +559,7 @@ static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans * BTREE_ID_reflink, reflink_pos, BTREE_ITER_not_extents); - struct bkey_s_c k = bch2_btree_iter_peek(iter); + struct bkey_s_c k = bch2_btree_iter_peek(trans, iter); if (!k.k || bkey_err(k)) { bch2_trans_iter_exit(trans, iter); return k; @@ -603,7 +617,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_trans_begin(trans); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek(trans, &iter); if (!k.k) break; @@ -681,7 +695,7 @@ next: if (ctxt->stats) atomic64_add(k.k->size, &ctxt->stats->sectors_seen); next_nondata: - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); } bch2_trans_iter_exit(trans, &reflink_iter); @@ -794,7 +808,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, bch2_trans_begin(trans); - k = bch2_btree_iter_peek(&bp_iter); + k = bch2_btree_iter_peek(trans, &bp_iter); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -876,7 +890,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (ctxt->stats) atomic64_add(sectors, &ctxt->stats->sectors_seen); next: - bch2_btree_iter_advance(&bp_iter); + bch2_btree_iter_advance(trans, &bp_iter); } err: bch2_trans_iter_exit(trans, &bp_iter); @@ -991,7 +1005,7 @@ static int bch2_move_btree(struct bch_fs *c, retry: ret = 0; while (bch2_trans_begin(trans), - (b = bch2_btree_iter_peek_node(&iter)) && + (b = bch2_btree_iter_peek_node(trans, &iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { if (kthread && kthread_should_stop()) break; @@ -1011,7 +1025,7 @@ retry: if (ret) break; next: - bch2_btree_iter_next_node(&iter); + bch2_btree_iter_next_node(trans, &iter); } if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 5126c870..159410c5 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -280,7 +280,11 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) s64 wait = S64_MAX, fragmented_allowed, fragmented; for_each_rw_member(c, ca) { - struct bch_dev_usage usage = bch2_dev_usage_read(ca); + struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca); + struct bch_dev_usage usage; + + for (unsigned i = 0; i < BCH_DATA_NR; i++) + usage.buckets[i] = usage_full.d[i].buckets; fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * ca->mi.bucket_size) >> 1); @@ -288,7 +292,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) for (unsigned i = 0; i < BCH_DATA_NR; i++) if (data_type_movable(i)) - fragmented += usage.d[i].fragmented; + fragmented += usage_full.d[i].fragmented; wait = min(wait, max(0LL, fragmented_allowed - fragmented)); } diff --git a/libbcachefs/namei.c b/libbcachefs/namei.c index ee725170..0d65ea96 100644 --- a/libbcachefs/namei.c +++ b/libbcachefs/namei.c @@ 
-28,8 +28,8 @@ int bch2_create_trans(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct btree_iter dir_iter = { NULL }; - struct btree_iter inode_iter = { NULL }; + struct btree_iter dir_iter = {}; + struct btree_iter inode_iter = {}; subvol_inum new_inum = dir; u64 now = bch2_current_time(c); u64 cpu = raw_smp_processor_id(); @@ -127,8 +127,8 @@ int bch2_create_trans(struct btree_trans *trans, if (ret) goto err; - bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot); - ret = bch2_btree_iter_traverse(&dir_iter); + bch2_btree_iter_set_snapshot(trans, &dir_iter, dir_snapshot); + ret = bch2_btree_iter_traverse(trans, &dir_iter); if (ret) goto err; } @@ -177,9 +177,9 @@ int bch2_create_trans(struct btree_trans *trans, new_inode->bi_depth = dir_u->bi_depth + 1; inode_iter.flags &= ~BTREE_ITER_all_snapshots; - bch2_btree_iter_set_snapshot(&inode_iter, snapshot); + bch2_btree_iter_set_snapshot(trans, &inode_iter, snapshot); - ret = bch2_btree_iter_traverse(&inode_iter) ?: + ret = bch2_btree_iter_traverse(trans, &inode_iter) ?: bch2_inode_write(trans, &inode_iter, new_inode); err: bch2_trans_iter_exit(trans, &inode_iter); @@ -193,8 +193,8 @@ int bch2_link_trans(struct btree_trans *trans, const struct qstr *name) { struct bch_fs *c = trans->c; - struct btree_iter dir_iter = { NULL }; - struct btree_iter inode_iter = { NULL }; + struct btree_iter dir_iter = {}; + struct btree_iter inode_iter = {}; struct bch_hash_info dir_hash; u64 now = bch2_current_time(c); u64 dir_offset = 0; @@ -253,9 +253,9 @@ int bch2_unlink_trans(struct btree_trans *trans, bool deleting_subvol) { struct bch_fs *c = trans->c; - struct btree_iter dir_iter = { NULL }; - struct btree_iter dirent_iter = { NULL }; - struct btree_iter inode_iter = { NULL }; + struct btree_iter dir_iter = {}; + struct btree_iter dirent_iter = {}; + struct btree_iter inode_iter = {}; struct bch_hash_info dir_hash; subvol_inum inum; u64 now = bch2_current_time(c); @@ -301,7 +301,7 @@ int bch2_unlink_trans(struct btree_trans *trans, if (ret) goto err; - k = bch2_btree_iter_peek_slot(&dirent_iter); + k = bch2_btree_iter_peek_slot(trans, &dirent_iter); ret = bkey_err(k); if (ret) goto err; @@ -310,8 +310,8 @@ int bch2_unlink_trans(struct btree_trans *trans, * If we're deleting a subvolume, we need to really delete the * dirent, not just emit a whiteout in the current snapshot: */ - bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot); - ret = bch2_btree_iter_traverse(&dirent_iter); + bch2_btree_iter_set_snapshot(trans, &dirent_iter, k.k->p.snapshot); + ret = bch2_btree_iter_traverse(trans, &dirent_iter); if (ret) goto err; } else { @@ -390,10 +390,10 @@ int bch2_rename_trans(struct btree_trans *trans, enum bch_rename_mode mode) { struct bch_fs *c = trans->c; - struct btree_iter src_dir_iter = { NULL }; - struct btree_iter dst_dir_iter = { NULL }; - struct btree_iter src_inode_iter = { NULL }; - struct btree_iter dst_inode_iter = { NULL }; + struct btree_iter src_dir_iter = {}; + struct btree_iter dst_dir_iter = {}; + struct btree_iter src_inode_iter = {}; + struct btree_iter dst_inode_iter = {}; struct bch_hash_info src_hash, dst_hash; subvol_inum src_inum, dst_inum; u64 src_offset, dst_offset; @@ -666,7 +666,7 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; - struct btree_iter bp_iter = { NULL }; + struct btree_iter bp_iter = {}; int ret = 0; if (inode_points_to_dirent(target, d)) diff --git a/libbcachefs/nocow_locking.c 
b/libbcachefs/nocow_locking.c index 3c21981a..962218fa 100644 --- a/libbcachefs/nocow_locking.c +++ b/libbcachefs/nocow_locking.c @@ -133,12 +133,10 @@ void bch2_fs_nocow_locking_exit(struct bch_fs *c) BUG_ON(atomic_read(&l->l[j])); } -int bch2_fs_nocow_locking_init(struct bch_fs *c) +void bch2_fs_nocow_locking_init_early(struct bch_fs *c) { struct bucket_nocow_lock_table *t = &c->nocow_locks; for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) spin_lock_init(&l->lock); - - return 0; } diff --git a/libbcachefs/nocow_locking.h b/libbcachefs/nocow_locking.h index f9d6a426..48b8a003 100644 --- a/libbcachefs/nocow_locking.h +++ b/libbcachefs/nocow_locking.h @@ -45,6 +45,6 @@ static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t, void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *); void bch2_fs_nocow_locking_exit(struct bch_fs *); -int bch2_fs_nocow_locking_init(struct bch_fs *); +void bch2_fs_nocow_locking_init_early(struct bch_fs *); #endif /* _BCACHEFS_NOCOW_LOCKING_H */ diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index af325881..a07119d8 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -19,6 +19,11 @@ const char * const bch2_error_actions[] = { NULL }; +const char * const bch2_degraded_actions[] = { + BCH_DEGRADED_ACTIONS() + NULL +}; + const char * const bch2_fsck_fix_opts[] = { BCH_FIX_ERRORS_OPTS() NULL @@ -273,20 +278,20 @@ int bch2_opt_lookup(const char *name) return -1; } -struct synonym { +struct opt_synonym { const char *s1, *s2; }; -static const struct synonym bch_opt_synonyms[] = { +static const struct opt_synonym bch2_opt_synonyms[] = { { "quota", "usrquota" }, }; static int bch2_mount_opt_lookup(const char *name) { - const struct synonym *i; + const struct opt_synonym *i; - for (i = bch_opt_synonyms; - i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); + for (i = bch2_opt_synonyms; + i < bch2_opt_synonyms + ARRAY_SIZE(bch2_opt_synonyms); i++) if (!strcmp(name, i->s1)) name = i->s2; @@ -294,6 +299,30 @@ static int bch2_mount_opt_lookup(const char *name) return bch2_opt_lookup(name); } +struct opt_val_synonym { + const char *opt, *v1, *v2; +}; + +static const struct opt_val_synonym bch2_opt_val_synonyms[] = { + { "degraded", "true", "yes" }, + { "degraded", "false", "no" }, + { "degraded", "1", "yes" }, + { "degraded", "0", "no" }, +}; + +static const char *bch2_opt_val_synonym_lookup(const char *opt, const char *val) +{ + const struct opt_val_synonym *i; + + for (i = bch2_opt_val_synonyms; + i < bch2_opt_val_synonyms + ARRAY_SIZE(bch2_opt_val_synonyms); + i++) + if (!strcmp(opt, i->opt) && !strcmp(val, i->v1)) + return i->v2; + + return val; +} + int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) { if (v < opt->min) { @@ -339,19 +368,14 @@ int bch2_opt_parse(struct bch_fs *c, switch (opt->type) { case BCH_OPT_BOOL: - if (val) { - ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); - if (ret != -BCH_ERR_option_not_bool) { - *res = ret; - } else { - if (err) - prt_printf(err, "%s: must be bool", opt->attr.name); - return ret; - } + ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); + if (ret != -BCH_ERR_option_not_bool) { + *res = ret; } else { - *res = 1; + if (err) + prt_printf(err, "%s: must be bool", opt->attr.name); + return ret; } - break; case BCH_OPT_UINT: if (!val) { @@ -360,9 +384,15 @@ int bch2_opt_parse(struct bch_fs *c, return -EINVAL; } - ret = opt->flags & OPT_HUMAN_READABLE - ? 
bch2_strtou64_h(val, res) - : kstrtou64(val, 10, res); + if (*val != '-') { + ret = opt->flags & OPT_HUMAN_READABLE + ? bch2_strtou64_h(val, res) + : kstrtou64(val, 10, res); + } else { + prt_printf(err, "%s: must be a non-negative number", opt->attr.name); + return -BCH_ERR_option_negative; + } + if (ret < 0) { if (err) prt_printf(err, "%s: must be a number", @@ -498,6 +528,14 @@ int bch2_opt_check_may_set(struct bch_fs *c, struct bch_dev *ca, int id, u64 v) if (v) bch2_check_set_feature(c, BCH_FEATURE_ec); break; + case Opt_single_device: + if (v) { + mutex_lock(&c->sb_lock); + if (bch2_sb_nr_devices(c->disk_sb.sb) > 1) + ret = -BCH_ERR_not_single_device_filesystem; + mutex_unlock(&c->sb_lock); + } + break; } return ret; @@ -536,6 +574,11 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, if (id < 0) return 0; + if (!val) + val = "1"; + + val = bch2_opt_val_synonym_lookup(name, val); + if (!(bch2_opt_table[id].flags & OPT_MOUNT)) goto bad_opt; diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 4d063130..ae24a2f0 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -11,6 +11,7 @@ struct bch_fs; extern const char * const bch2_error_actions[]; +extern const char * const bch2_degraded_actions[]; extern const char * const bch2_fsck_fix_opts[]; extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; @@ -302,14 +303,9 @@ enum fsck_err_opts { NULL, "Enable project quotas") \ x(degraded, u8, \ OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + OPT_STR(bch2_degraded_actions), \ + BCH_SB_DEGRADED_ACTION, BCH_DEGRADED_ask, \ NULL, "Allow mounting in degraded mode") \ - x(very_degraded, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Allow mounting in when data will be missing") \ x(no_splitbrain_check, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ @@ -517,7 +513,7 @@ enum fsck_err_opts { BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ "types", "Allowed data types for this device: journal, btree, and/or user")\ x(discard, u8, \ - OPT_MOUNT|OPT_DEVICE|OPT_RUNTIME, \ + OPT_MOUNT|OPT_FS|OPT_DEVICE|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_MEMBER_DISCARD, true, \ NULL, "Enable discard/TRIM support") \ @@ -525,8 +521,13 @@ enum fsck_err_opts { OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ - NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\ - " prefetched sequentially") + NULL, "BTREE_ITER_prefetch causes btree nodes to be\n"\ + " prefetched sequentially") \ + x(single_device, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_SINGLE_DEVICE, false, \ + NULL, "Devices with the same UUID may be mounted simultaneously") struct bch_opts { #define x(_name, _bits, ...) 
unsigned _name##_defined:1; diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c index 8b857fc3..3d4755d7 100644 --- a/libbcachefs/quota.c +++ b/libbcachefs/quota.c @@ -516,7 +516,7 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans, bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, KEY_TYPE_QUOTA_NOCHECK); advance: - bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); + bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos)); return 0; } diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index b9bde04b..63f01349 100644 --- a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -95,6 +95,9 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 0; + return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | bch2_bkey_ptrs_need_move(c, opts, ptrs); } @@ -107,6 +110,9 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) if (!opts) return 0; + if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 0; + const union bch_extent_entry *entry; struct extent_ptr_decoded p; u64 sectors = 0; @@ -233,7 +239,7 @@ int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum) bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), BTREE_ITER_intent); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; @@ -281,7 +287,7 @@ static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), BTREE_ITER_intent); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; @@ -301,7 +307,7 @@ static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans, struct btree_iter *work_iter) { return !kthread_should_stop() - ? bch2_btree_iter_peek(work_iter) + ? bch2_btree_iter_peek(trans, work_iter) : bkey_s_c_null; } @@ -335,7 +341,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, work_pos.inode ? 
BTREE_ID_extents : BTREE_ID_reflink, work_pos, BTREE_ITER_all_snapshots); - struct bkey_s_c k = bch2_btree_iter_peek_slot(extent_iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter); if (bkey_err(k)) return k; @@ -511,7 +517,7 @@ static int do_rebalance(struct moving_context *ctxt) struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; struct bch_fs_rebalance *r = &c->rebalance; - struct btree_iter rebalance_work_iter, extent_iter = { NULL }; + struct btree_iter rebalance_work_iter, extent_iter = {}; struct bkey_s_c k; int ret = 0; @@ -552,7 +558,7 @@ static int do_rebalance(struct moving_context *ctxt) if (ret) break; - bch2_btree_iter_advance(&rebalance_work_iter); + bch2_btree_iter_advance(trans, &rebalance_work_iter); } bch2_trans_iter_exit(trans, &extent_iter); diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 266c5770..79fd18a5 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -198,7 +198,7 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans, bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, k->level, BTREE_ITER_intent); - int ret = bch2_btree_iter_traverse(&iter); + int ret = bch2_btree_iter_traverse(trans, &iter); if (ret) goto out; @@ -261,7 +261,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans, bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, k->level, iter_flags); - ret = bch2_btree_iter_traverse(&iter); + ret = bch2_btree_iter_traverse(trans, &iter); if (ret) goto out; @@ -270,7 +270,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans, bch2_trans_iter_exit(trans, &iter); bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, 0, iter_flags); - ret = bch2_btree_iter_traverse(&iter) ?: + ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_btree_increase_depth(trans, iter.path, 0) ?: -BCH_ERR_transaction_restart_nested; goto out; diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index ee23f1f9..710178e3 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -495,7 +495,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, bool reflink_p_may_update_opts_field) { struct bch_fs *c = trans->c; - struct btree_iter reflink_iter = { NULL }; + struct btree_iter reflink_iter = {}; struct bkey_s_c k; struct bkey_i *r_v; struct bkey_i_reflink_p *r_p; @@ -507,7 +507,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX, BTREE_ITER_intent); - k = bch2_btree_iter_peek_prev(&reflink_iter); + k = bch2_btree_iter_peek_prev(trans, &reflink_iter); ret = bkey_err(k); if (ret) goto err; @@ -569,12 +569,13 @@ err: return ret; } -static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) +static struct bkey_s_c get_next_src(struct btree_trans *trans, + struct btree_iter *iter, struct bpos end) { struct bkey_s_c k; int ret; - for_each_btree_key_max_continue_norestart(*iter, end, 0, k, ret) { + for_each_btree_key_max_continue_norestart(trans, *iter, end, 0, k, ret) { if (bkey_extent_is_unwritten(k)) continue; @@ -583,7 +584,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) } if (bkey_ge(iter->pos, end)) - bch2_btree_iter_set_pos(iter, end); + bch2_btree_iter_set_pos(trans, iter, end); return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; } @@ -647,27 +648,27 @@ s64 bch2_remap_range(struct bch_fs *c, if (ret) continue; - bch2_btree_iter_set_snapshot(&src_iter, src_snapshot); + bch2_btree_iter_set_snapshot(trans, &src_iter, src_snapshot); ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol, &dst_snapshot); if (ret) continue; - bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot); + bch2_btree_iter_set_snapshot(trans, &dst_iter, dst_snapshot); if (dst_inum.inum < src_inum.inum) { /* Avoid some lock cycle transaction restarts */ - ret = bch2_btree_iter_traverse(&dst_iter); + ret = bch2_btree_iter_traverse(trans, &dst_iter); if (ret) continue; } dst_done = dst_iter.pos.offset - dst_start.offset; src_want = POS(src_start.inode, src_start.offset + dst_done); - bch2_btree_iter_set_pos(&src_iter, src_want); + bch2_btree_iter_set_pos(trans, &src_iter, src_want); - src_k = get_next_src(&src_iter, src_end); + src_k = get_next_src(trans, &src_iter, src_end); ret = bkey_err(src_k); if (ret) continue; @@ -738,7 +739,7 @@ s64 bch2_remap_range(struct bch_fs *c, do { struct bch_inode_unpacked inode_u; - struct btree_iter inode_iter = { NULL }; + struct btree_iter inode_iter = {}; bch2_trans_begin(trans); diff --git a/libbcachefs/sb-counters_format.h b/libbcachefs/sb-counters_format.h index fa27ec59..5c4e5de7 100644 --- a/libbcachefs/sb-counters_format.h +++ b/libbcachefs/sb-counters_format.h @@ -16,6 +16,7 @@ enum counters_flags { x(io_read_split, 33, TYPE_COUNTER) \ x(io_read_reuse_race, 34, TYPE_COUNTER) \ x(io_read_retry, 32, TYPE_COUNTER) \ + x(io_read_fail_and_poison, 82, TYPE_COUNTER) \ x(io_write, 1, TYPE_SECTORS) \ x(io_move, 2, TYPE_SECTORS) \ x(io_move_read, 35, TYPE_SECTORS) \ diff --git a/libbcachefs/sb-members.h b/libbcachefs/sb-members.h index 38261638..06bb41a3 100644 --- a/libbcachefs/sb-members.h +++ b/libbcachefs/sb-members.h @@ -20,7 +20,7 @@ struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i); static inline bool bch2_dev_is_online(struct bch_dev *ca) { - return !percpu_ref_is_zero(&ca->io_ref); + return !percpu_ref_is_zero(&ca->io_ref[READ]); } static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned); @@ -156,33 +156,34 @@ static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, struct bch_dev *ca, - unsigned state_mask) + unsigned state_mask, + int rw) { rcu_read_lock(); if (ca) - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[rw]); while ((ca = __bch2_next_dev(c, ca, NULL)) && (!((1 << ca->mi.state) & state_mask) || - !percpu_ref_tryget(&ca->io_ref))) + !percpu_ref_tryget(&ca->io_ref[rw]))) ; rcu_read_unlock(); return ca; } -#define __for_each_online_member(_c, _ca, state_mask) \ +#define __for_each_online_member(_c, _ca, state_mask, rw) \ for (struct bch_dev *_ca = NULL; \ - (_ca = bch2_get_next_online_dev(_c, _ca, state_mask));) + (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw));) #define for_each_online_member(c, ca) \ - __for_each_online_member(c, ca, ~0) + __for_each_online_member(c, ca, ~0, READ) #define for_each_rw_member(c, ca) \ - __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw)) + __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE) #define for_each_readable_member(c, ca) \ - __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro)) + __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ) static inline bool bch2_dev_exists(const struct bch_fs *c, 
unsigned dev) { @@ -287,7 +288,7 @@ static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu(c, dev); - if (ca && !percpu_ref_tryget(&ca->io_ref)) + if (ca && !percpu_ref_tryget(&ca->io_ref[rw])) ca = NULL; rcu_read_unlock(); @@ -297,7 +298,7 @@ static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, return ca; if (ca) - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[rw]); return NULL; } diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c index 0c65065b..df14237a 100644 --- a/libbcachefs/snapshot.c +++ b/libbcachefs/snapshot.c @@ -281,6 +281,16 @@ fsck_err: return ret; } +static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id) +{ + mutex_lock(&c->snapshot_table_lock); + int ret = snapshot_t_mut(c, id) + ? 0 + : -BCH_ERR_ENOMEM_mark_snapshot; + mutex_unlock(&c->snapshot_table_lock); + return ret; +} + static int __bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, @@ -843,9 +853,6 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) { struct bch_fs *c = trans->c; - if (bch2_snapshot_exists(c, id)) - return 0; - /* Do we need to reconstruct the snapshot_tree entry as well? */ struct btree_iter iter; struct bkey_s_c k; @@ -890,9 +897,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) } bch2_trans_iter_exit(trans, &iter); - return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: - bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, - bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0); + return bch2_snapshot_table_make_room(c, id) ?: + bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0); } /* Figure out which snapshot nodes belong in the same tree: */ @@ -1074,9 +1080,9 @@ static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) { struct bch_fs *c = trans->c; - struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; - struct btree_iter c_iter = (struct btree_iter) { NULL }; - struct btree_iter tree_iter = (struct btree_iter) { NULL }; + struct btree_iter iter, p_iter = {}; + struct btree_iter c_iter = {}; + struct btree_iter tree_iter = {}; struct bkey_s_c_snapshot s; u32 parent_id, child_id; unsigned i; @@ -1193,13 +1199,13 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_intent); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek(trans, &iter); ret = bkey_err(k); if (ret) goto err; for (i = 0; i < nr_snapids; i++) { - k = bch2_btree_iter_prev_slot(&iter); + k = bch2_btree_iter_prev_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; diff --git a/libbcachefs/str_hash.c b/libbcachefs/str_hash.c index 602afca2..a90bf7b8 100644 --- a/libbcachefs/str_hash.c +++ b/libbcachefs/str_hash.c @@ -195,7 +195,7 @@ int __bch2_str_hash_check_key(struct btree_trans *trans, struct btree_iter *k_iter, struct bkey_s_c hash_k) { struct bch_fs *c = trans->c; - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct printbuf buf = PRINTBUF; struct bkey_s_c k; int ret = 0; diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index 575ad1e0..09a354a2 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -231,11 +231,11 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, struct 
bkey_s_c k; int ret; - bch2_trans_copy_iter(&iter, start); + bch2_trans_copy_iter(trans, &iter, start); - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); - for_each_btree_key_continue_norestart(iter, BTREE_ITER_slots, k, ret) { + for_each_btree_key_continue_norestart(trans, iter, BTREE_ITER_slots, k, ret) { if (k.k->type != desc.key_type && k.k->type != KEY_TYPE_hash_whiteout) break; @@ -280,7 +280,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, } if (!slot.path && !(flags & STR_HASH_must_replace)) - bch2_trans_copy_iter(&slot, iter); + bch2_trans_copy_iter(trans, &slot, iter); if (k.k->type != KEY_TYPE_hash_whiteout) goto not_found; diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c index cd0d8e5e..c9acaf13 100644 --- a/libbcachefs/subvolume.c +++ b/libbcachefs/subvolume.c @@ -275,7 +275,7 @@ int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) struct btree_iter iter; bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0); - struct bkey_s_c k = bch2_btree_iter_peek(&iter); + struct bkey_s_c k = bch2_btree_iter_peek(trans, &iter); bch2_trans_iter_exit(trans, &iter); return bkey_err(k) ?: k.k && k.k->p.inode == subvol @@ -478,13 +478,11 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_wait_for_pagecache_and_delete_work); - snapshot_id_list s; - u32 *id; int ret = 0; while (!ret) { mutex_lock(&c->snapshots_unlinked_lock); - s = c->snapshots_unlinked; + snapshot_id_list s = c->snapshots_unlinked; darray_init(&c->snapshots_unlinked); mutex_unlock(&c->snapshots_unlinked_lock); @@ -493,7 +491,7 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor bch2_evict_subvolume_inodes(c, &s); - for (id = s.data; id < s.data + s.nr; id++) { + darray_for_each(s, id) { ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id)); bch_err_msg(c, ret, "deleting subvolume %u", *id); if (ret) @@ -574,7 +572,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, bool ro) { struct bch_fs *c = trans->c; - struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL }; + struct btree_iter dst_iter, src_iter = {}; struct bkey_i_subvolume *new_subvol = NULL; struct bkey_i_subvolume *src_subvol = NULL; u32 parent = 0, new_nodes[2], snapshot_subvols[2]; @@ -715,11 +713,10 @@ int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) return ret; } -int bch2_fs_subvolumes_init(struct bch_fs *c) +void bch2_fs_subvolumes_init_early(struct bch_fs *c) { INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, bch2_subvolume_wait_for_pagecache_and_delete); mutex_init(&c->snapshots_unlinked_lock); - return 0; } diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h index 910f6196..ee5e4e5a 100644 --- a/libbcachefs/subvolume.h +++ b/libbcachefs/subvolume.h @@ -33,16 +33,16 @@ int bch2_subvol_is_ro_trans(struct btree_trans *, u32); int bch2_subvol_is_ro(struct bch_fs *, u32); static inline struct bkey_s_c -bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos end, - u32 subvolid, unsigned flags) +bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end, u32 subvolid, unsigned flags) { u32 snapshot; - int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot); + int ret = bch2_subvolume_get_snapshot(trans, 
subvolid, &snapshot); if (ret) return bkey_s_c_err(ret); - bch2_btree_iter_set_snapshot(iter, snapshot); - return bch2_btree_iter_peek_max_type(iter, end, flags); + bch2_btree_iter_set_snapshot(trans, iter, snapshot); + return bch2_btree_iter_peek_max_type(trans, iter, end, flags); } #define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ @@ -53,14 +53,14 @@ bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos \ do { \ _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_in_subvolume_max_type(&(_iter), \ + (_k) = bch2_btree_iter_peek_in_subvolume_max_type(trans, &(_iter),\ _end, _subvolid, (_flags)); \ if (!(_k).k) \ break; \ \ bkey_err(_k) ?: (_do); \ })); \ - } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \ + } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ _ret3; \ @@ -86,6 +86,6 @@ int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, boo int bch2_initialize_subvolumes(struct bch_fs *); int bch2_fs_upgrade_for_subvolumes(struct bch_fs *); -int bch2_fs_subvolumes_init(struct bch_fs *); +void bch2_fs_subvolumes_init_early(struct bch_fs *); #endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 572b06bf..7d0c5cb6 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -248,7 +248,7 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, struct bch_sb_handle *dev_sb = &ca->disk_sb; if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) { - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); return NULL; } } @@ -945,7 +945,7 @@ static void write_super_endio(struct bio *bio) } closure_put(&ca->fs->sb_write); - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); } static void read_back_super(struct bch_fs *c, struct bch_dev *ca) @@ -963,7 +963,7 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca) this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio)); - percpu_ref_get(&ca->io_ref); + percpu_ref_get(&ca->io_ref[READ]); closure_bio_submit(bio, &c->sb_write); } @@ -989,7 +989,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], bio_sectors(bio)); - percpu_ref_get(&ca->io_ref); + percpu_ref_get(&ca->io_ref[READ]); closure_bio_submit(bio, &c->sb_write); } @@ -1006,7 +1006,7 @@ int bch2_write_super(struct bch_fs *c) trace_and_count(c, write_super, c, _RET_IP_); - if (c->opts.very_degraded) + if (c->opts.degraded == BCH_DEGRADED_very) degraded_flags |= BCH_FORCE_IF_LOST; lockdep_assert_held(&c->sb_lock); @@ -1014,13 +1014,20 @@ int bch2_write_super(struct bch_fs *c) closure_init_stack(cl); memset(&sb_written, 0, sizeof(sb_written)); + /* + * Note: we do writes to RO devices here, and we might want to change + * that in the future. 
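 * (Illustrative consequence: the loop below takes io_ref[READ], not
 *  io_ref[WRITE], precisely so that superblock writes can reach
 *  devices that are online but not - or not yet - writable.)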
+ * + * For now, we expect to be able to call write_super() when we're not + * yet RW: + */ for_each_online_member(c, ca) { ret = darray_push(&online_devices, ca); if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) { - percpu_ref_put(&ca->io_ref); + percpu_ref_put(&ca->io_ref[READ]); goto out; } - percpu_ref_get(&ca->io_ref); + percpu_ref_get(&ca->io_ref[READ]); } /* Make sure we're using the new magic numbers: */ @@ -1186,7 +1193,7 @@ out: /* Make new options visible after they're persistent: */ bch2_sb_update(c); darray_for_each(online_devices, ca) - percpu_ref_put(&(*ca)->io_ref); + percpu_ref_put(&(*ca)->io_ref[READ]); darray_exit(&online_devices); printbuf_exit(&err); return ret; diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 20208f3c..521f5570 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -79,6 +79,8 @@ MODULE_SOFTDEP("pre: chacha20"); MODULE_SOFTDEP("pre: poly1305"); MODULE_SOFTDEP("pre: xxhash"); +typedef DARRAY(struct bch_sb_handle) bch_sb_handles; + const char * const bch2_fs_flag_strs[] = { #define x(n) #n, BCH_FS_FLAGS() @@ -185,7 +187,9 @@ static void bch2_dev_unlink(struct bch_dev *); static void bch2_dev_free(struct bch_dev *); static int bch2_dev_alloc(struct bch_fs *, unsigned); static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); +static void bch2_dev_io_ref_stop(struct bch_dev *, int); static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); +static int bch2_fs_init_rw(struct bch_fs *); struct bch_fs *bch2_dev_to_fs(dev_t dev) { @@ -294,8 +298,10 @@ static void __bch2_fs_read_only(struct bch_fs *c) /* * After stopping journal: */ - for_each_member_device(c, ca) + for_each_member_device(c, ca) { + bch2_dev_io_ref_stop(ca, WRITE); bch2_dev_allocator_remove(c, ca); + } } #ifndef BCH_WRITE_REF_DEBUG @@ -461,11 +467,11 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch_info(c, "going read-write"); - ret = bch2_sb_members_v2_init(c); + ret = bch2_fs_init_rw(c); if (ret) goto err; - ret = bch2_fs_mark_dirty(c); + ret = bch2_sb_members_v2_init(c); if (ret) goto err; @@ -480,10 +486,24 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) set_bit(JOURNAL_need_flush_write, &c->journal.flags); set_bit(JOURNAL_running, &c->journal.flags); - for_each_rw_member(c, ca) + __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) { bch2_dev_allocator_add(c, ca); + percpu_ref_reinit(&ca->io_ref[WRITE]); + } bch2_recalc_capacity(c); + ret = bch2_fs_mark_dirty(c); + if (ret) + goto err; + + spin_lock(&c->journal.lock); + bch2_journal_space_available(&c->journal); + spin_unlock(&c->journal.lock); + + ret = bch2_journal_reclaim_start(&c->journal); + if (ret) + goto err; + set_bit(BCH_FS_rw, &c->flags); set_bit(BCH_FS_was_rw, &c->flags); @@ -495,11 +515,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) atomic_long_inc(&c->writes[i]); } #endif - - ret = bch2_journal_reclaim_start(&c->journal); - if (ret) - goto err; - if (!early) { ret = bch2_fs_read_write_late(c); if (ret) @@ -573,7 +588,6 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[WRITE]); bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); - bch2_fs_btree_gc_exit(c); bch2_journal_keys_put_initial(c); bch2_find_btree_nodes_exit(&c->found_btree_nodes); BUG_ON(atomic_read(&c->journal_keys.ref)); @@ -606,8 +620,8 @@ static void __bch2_fs_free(struct bch_fs *c) destroy_workqueue(c->btree_read_complete_wq); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); 
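Taken together, the super.c hunks above split each device's io_ref into per-direction READ/WRITE percpu refs: the WRITE ref is (re)initialized when a device or the whole filesystem goes rw and is killed and drained on the way back to ro, while the READ ref stays up for as long as the device is online. A minimal sketch of that lifecycle, using only helpers visible in this patch (illustrative ordering, not part of the diff):

	/* going rw: allow new writers on this device */
	if (percpu_ref_is_zero(&ca->io_ref[WRITE]))
		percpu_ref_reinit(&ca->io_ref[WRITE]);

	/* going ro: kill the WRITE ref, wait out in-flight writers */
	bch2_dev_io_ref_stop(ca, WRITE);

	/* taking the device offline: now drain the READ ref too */
	bch2_dev_io_ref_stop(ca, READ);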
- if (c->btree_io_complete_wq) - destroy_workqueue(c->btree_io_complete_wq); + if (c->btree_write_complete_wq) + destroy_workqueue(c->btree_write_complete_wq); if (c->btree_update_wq) destroy_workqueue(c->btree_update_wq); @@ -675,6 +689,7 @@ void bch2_fs_free(struct bch_fs *c) if (ca) { EBUG_ON(atomic_long_read(&ca->ref) != 1); + bch2_dev_io_ref_stop(ca, READ); bch2_free_super(&ca->disk_sb); bch2_dev_free(ca); } @@ -697,7 +712,8 @@ static int bch2_fs_online(struct bch_fs *c) lockdep_assert_held(&bch_fs_list_lock); - if (__bch2_uuid_to_fs(c->sb.uuid)) { + if (!c->opts.single_device && + __bch2_uuid_to_fs(c->sb.uuid)) { bch_err(c, "filesystem UUID already open"); return -EINVAL; } @@ -710,7 +726,9 @@ static int bch2_fs_online(struct bch_fs *c) bch2_fs_debug_init(c); - ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: + ret = (!c->opts.single_device + ? kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) + : kobject_add(&c->kobj, NULL, "%s", c->name)) ?: kobject_add(&c->internal, &c->kobj, "internal") ?: kobject_add(&c->opts_dir, &c->kobj, "options") ?: #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT @@ -741,7 +759,37 @@ err: return ret; } -static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) +static int bch2_fs_init_rw(struct bch_fs *c) +{ + if (test_bit(BCH_FS_rw_init_done, &c->flags)) + return 0; + + if (!(c->btree_update_wq = alloc_workqueue("bcachefs", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || + !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", + WQ_FREEZABLE, 0))) + return -BCH_ERR_ENOMEM_fs_other_alloc; + + int ret = bch2_fs_btree_interior_update_init(c) ?: + bch2_fs_btree_write_buffer_init(c) ?: + bch2_fs_fs_io_buffered_init(c) ?: + bch2_fs_io_write_init(c) ?: + bch2_fs_journal_init(&c->journal); + if (ret) + return ret; + + set_bit(BCH_FS_rw_init_done, &c->flags); + return 0; +} + +static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts, + bch_sb_handles *sbs) { struct bch_fs *c; struct printbuf name = PRINTBUF; @@ -784,18 +832,24 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); - bch2_fs_copygc_init(c); - bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); - bch2_fs_btree_iter_init_early(c); - bch2_fs_btree_interior_update_init_early(c); - bch2_fs_journal_keys_init(c); bch2_fs_allocator_background_init(c); bch2_fs_allocator_foreground_init(c); - bch2_fs_rebalance_init(c); - bch2_fs_quota_init(c); + bch2_fs_btree_cache_init_early(&c->btree_cache); + bch2_fs_btree_gc_init_early(c); + bch2_fs_btree_interior_update_init_early(c); + bch2_fs_btree_iter_init_early(c); + bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); + bch2_fs_btree_write_buffer_init_early(c); + bch2_fs_copygc_init(c); bch2_fs_ec_init_early(c); + bch2_fs_journal_init_early(&c->journal); + bch2_fs_journal_keys_init(c); bch2_fs_move_init(c); + bch2_fs_nocow_locking_init_early(c); + bch2_fs_quota_init(c); + bch2_fs_rebalance_init(c); bch2_fs_sb_errors_init_early(c); + bch2_fs_subvolumes_init_early(c); INIT_LIST_HEAD(&c->list); @@ -821,8 +875,6 @@ static struct 
bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; - bch2_fs_btree_cache_init_early(&c->btree_cache); - mutex_init(&c->sectors_available_lock); ret = percpu_init_rwsem(&c->mark_lock); @@ -855,14 +907,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) } #endif - pr_uuid(&name, c->sb.user_uuid.b); - ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; - if (ret) - goto err; - - strscpy(c->name, name.buf, sizeof(c->name)); - printbuf_exit(&name); - /* Compat: */ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) @@ -893,22 +937,24 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; } + if (sbs->nr != 1 && !c->opts.single_device) + pr_uuid(&name, c->sb.user_uuid.b); + else + prt_bdevname(&name, sbs->data[0].bdev); + + ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; + if (ret) + goto err; + + strscpy(c->name, name.buf, sizeof(c->name)); + printbuf_exit(&name); + iter_size = sizeof(struct sort_iter) + (btree_blocks(c) + 1) * 2 * sizeof(struct sort_iter_set); - if (!(c->btree_update_wq = alloc_workqueue("bcachefs", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || - !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || - !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", + if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || - !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || - !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", - WQ_FREEZABLE, 0)) || #ifndef BCH_WRITE_REF_DEBUG percpu_ref_init(&c->writes, bch2_writes_disabled, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || @@ -928,29 +974,22 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; } - ret = bch2_fs_counters_init(c) ?: - bch2_fs_sb_errors_init(c) ?: + ret = + bch2_fs_btree_cache_init(c) ?: + bch2_fs_btree_iter_init(c) ?: + bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: + bch2_fs_buckets_waiting_for_journal_init(c) ?: bch2_io_clock_init(&c->io_clock[READ]) ?: bch2_io_clock_init(&c->io_clock[WRITE]) ?: - bch2_fs_journal_init(&c->journal) ?: - bch2_fs_btree_iter_init(c) ?: - bch2_fs_btree_cache_init(c) ?: - bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: - bch2_fs_btree_interior_update_init(c) ?: - bch2_fs_btree_gc_init(c) ?: - bch2_fs_buckets_waiting_for_journal_init(c) ?: - bch2_fs_btree_write_buffer_init(c) ?: - bch2_fs_subvolumes_init(c) ?: - bch2_fs_io_read_init(c) ?: - bch2_fs_io_write_init(c) ?: - bch2_fs_nocow_locking_init(c) ?: - bch2_fs_encryption_init(c) ?: bch2_fs_compress_init(c) ?: + bch2_fs_counters_init(c) ?: bch2_fs_ec_init(c) ?: - bch2_fs_vfs_init(c) ?: + bch2_fs_encryption_init(c) ?: bch2_fs_fsio_init(c) ?: - bch2_fs_fs_io_buffered_init(c) ?: - bch2_fs_fs_io_direct_init(c); + bch2_fs_fs_io_direct_init(c) ?: + bch2_fs_io_read_init(c) ?: + bch2_fs_sb_errors_init(c) ?: + bch2_fs_vfs_init(c); if (ret) goto err; @@ -993,12 +1032,6 @@ static void print_mount_opts(struct bch_fs *c) prt_str(&p, "starting version "); 
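The reshuffled init chains above lean on the GNU ?: extension: a() ?: b() evaluates to a()'s result when it is nonzero, so the first subsystem init to return an error short-circuits everything after it. Spelled out with hypothetical init_a/init_b helpers (a sketch, not code from the patch):

	int ret = init_a(c) ?: init_b(c);

	/* behaves like: */
	int ret2 = init_a(c);
	if (!ret2)
		ret2 = init_b(c);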
bch2_version_to_text(&p, c->sb.version); - if (c->opts.read_only) { - prt_str(&p, " opts="); - first = false; - prt_printf(&p, "ro"); - } - for (i = 0; i < bch2_opts_nr; i++) { const struct bch_option *opt = &bch2_opt_table[i]; u64 v = bch2_opt_get_by_id(&c->opts, i); @@ -1098,6 +1131,9 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) { struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); + if (c->opts.single_device) + return -BCH_ERR_single_device_filesystem; + if (le16_to_cpu(sb->block_size) != block_sectors(c)) return -BCH_ERR_mismatched_block_size; @@ -1199,6 +1235,15 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, /* Device startup/shutdown: */ +static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw) +{ + if (!percpu_ref_is_zero(&ca->io_ref[rw])) { + reinit_completion(&ca->io_ref_completion[rw]); + percpu_ref_kill(&ca->io_ref[rw]); + wait_for_completion(&ca->io_ref_completion[rw]); + } +} + static void bch2_dev_release(struct kobject *kobj) { struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); @@ -1208,6 +1253,9 @@ static void bch2_dev_release(struct kobject *kobj) static void bch2_dev_free(struct bch_dev *ca) { + WARN_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE])); + WARN_ON(!percpu_ref_is_zero(&ca->io_ref[READ])); + cancel_work_sync(&ca->io_error_work); bch2_dev_unlink(ca); @@ -1226,7 +1274,8 @@ static void bch2_dev_free(struct bch_dev *ca) bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); - percpu_ref_exit(&ca->io_ref); + percpu_ref_exit(&ca->io_ref[WRITE]); + percpu_ref_exit(&ca->io_ref[READ]); #ifndef CONFIG_BCACHEFS_DEBUG percpu_ref_exit(&ca->ref); #endif @@ -1238,14 +1287,12 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) lockdep_assert_held(&c->state_lock); - if (percpu_ref_is_zero(&ca->io_ref)) + if (percpu_ref_is_zero(&ca->io_ref[READ])) return; __bch2_dev_read_only(c, ca); - reinit_completion(&ca->io_ref_completion); - percpu_ref_kill(&ca->io_ref); - wait_for_completion(&ca->io_ref_completion); + bch2_dev_io_ref_stop(ca, READ); bch2_dev_unlink(ca); @@ -1262,11 +1309,18 @@ static void bch2_dev_ref_complete(struct percpu_ref *ref) } #endif -static void bch2_dev_io_ref_complete(struct percpu_ref *ref) +static void bch2_dev_io_ref_read_complete(struct percpu_ref *ref) { - struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); + struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[READ]); - complete(&ca->io_ref_completion); + complete(&ca->io_ref_completion[READ]); +} + +static void bch2_dev_io_ref_write_complete(struct percpu_ref *ref) +{ + struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[WRITE]); + + complete(&ca->io_ref_completion[WRITE]); } static void bch2_dev_unlink(struct bch_dev *ca) @@ -1330,7 +1384,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, kobject_init(&ca->kobj, &bch2_dev_ktype); init_completion(&ca->ref_completion); - init_completion(&ca->io_ref_completion); + init_completion(&ca->io_ref_completion[READ]); + init_completion(&ca->io_ref_completion[WRITE]); INIT_WORK(&ca->io_error_work, bch2_io_error_work); @@ -1356,7 +1411,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, bch2_dev_allocator_background_init(ca); - if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, + if (percpu_ref_init(&ca->io_ref[READ], bch2_dev_io_ref_read_complete, + PERCPU_REF_INIT_DEAD, GFP_KERNEL) || + percpu_ref_init(&ca->io_ref[WRITE], bch2_dev_io_ref_write_complete, PERCPU_REF_INIT_DEAD, 
GFP_KERNEL) || !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) || bch2_dev_buckets_alloc(c, ca) || @@ -1419,7 +1476,8 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) return -BCH_ERR_device_size_too_small; } - BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); + BUG_ON(!percpu_ref_is_zero(&ca->io_ref[READ])); + BUG_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE])); ret = bch2_dev_journal_init(ca, sb->sb); if (ret) @@ -1438,7 +1496,7 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ca->dev = ca->disk_sb.bdev->bd_dev; - percpu_ref_reinit(&ca->io_ref); + percpu_ref_reinit(&ca->io_ref[READ]); return 0; } @@ -1466,11 +1524,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) struct printbuf name = PRINTBUF; prt_bdevname(&name, ca->disk_sb.bdev); - - if (c->sb.nr_devices == 1) - strscpy(c->name, name.buf, sizeof(c->name)); strscpy(ca->name, name.buf, sizeof(ca->name)); - printbuf_exit(&name); rebalance_wakeup(c); @@ -1535,19 +1589,18 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, static bool bch2_fs_may_start(struct bch_fs *c) { struct bch_dev *ca; - unsigned i, flags = 0; + unsigned flags = 0; - if (c->opts.very_degraded) + switch (c->opts.degraded) { + case BCH_DEGRADED_very: flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; - - if (c->opts.degraded) + break; + case BCH_DEGRADED_yes: flags |= BCH_FORCE_IF_DEGRADED; - - if (!c->opts.degraded && - !c->opts.very_degraded) { + break; + default: mutex_lock(&c->sb_lock); - - for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { + for (unsigned i = 0; i < c->disk_sb.sb->nr_devices; i++) { if (!bch2_member_exists(c->disk_sb.sb, i)) continue; @@ -1561,6 +1614,7 @@ static bool bch2_fs_may_start(struct bch_fs *c) } } mutex_unlock(&c->sb_lock); + break; } return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); @@ -1568,6 +1622,8 @@ static bool bch2_fs_may_start(struct bch_fs *c) static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) { + bch2_dev_io_ref_stop(ca, WRITE); + /* * The allocator thread itself allocates btree nodes, so stop it first: */ @@ -1584,6 +1640,10 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + + if (percpu_ref_is_zero(&ca->io_ref[WRITE])) + percpu_ref_reinit(&ca->io_ref[WRITE]); + bch2_dev_do_discards(ca); } @@ -1731,7 +1791,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) return 0; err: if (ca->mi.state == BCH_MEMBER_STATE_rw && - !percpu_ref_is_zero(&ca->io_ref)) + !percpu_ref_is_zero(&ca->io_ref[READ])) __bch2_dev_read_write(c, ca); up_write(&c->state_lock); return ret; @@ -1741,7 +1801,7 @@ err: int bch2_dev_add(struct bch_fs *c, const char *path) { struct bch_opts opts = bch2_opts_empty(); - struct bch_sb_handle sb; + struct bch_sb_handle sb = {}; struct bch_dev *ca = NULL; struct printbuf errbuf = PRINTBUF; struct printbuf label = PRINTBUF; @@ -2126,7 +2186,7 @@ static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, struct bch_opts opts) { - DARRAY(struct bch_sb_handle) sbs = { 0 }; + bch_sb_handles sbs = {}; struct bch_fs *c = NULL; struct bch_sb_handle *best = NULL; struct printbuf errbuf = PRINTBUF; @@ -2179,7 +2239,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, goto err_print; } - c = bch2_fs_alloc(best->sb, opts); + c = bch2_fs_alloc(best->sb, opts, &sbs); ret 
diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c
index 6c646981..c265b102 100644
--- a/libbcachefs/tests.c
+++ b/libbcachefs/tests.c
@@ -43,7 +43,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
 			     BTREE_ITER_intent);
 
 	ret = commit_do(trans, NULL, NULL, 0,
-			bch2_btree_iter_traverse(&iter) ?:
+			bch2_btree_iter_traverse(trans, &iter) ?:
 			bch2_trans_update(trans, &iter, &k.k_i, 0));
 	bch_err_msg(c, ret, "update error");
 	if (ret)
@@ -51,7 +51,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
 
 	pr_info("deleting once");
 	ret = commit_do(trans, NULL, NULL, 0,
-			bch2_btree_iter_traverse(&iter) ?:
+			bch2_btree_iter_traverse(trans, &iter) ?:
 			bch2_btree_delete_at(trans, &iter, 0));
 	bch_err_msg(c, ret, "delete error (first)");
 	if (ret)
@@ -59,7 +59,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
 
 	pr_info("deleting twice");
 	ret = commit_do(trans, NULL, NULL, 0,
-			bch2_btree_iter_traverse(&iter) ?:
+			bch2_btree_iter_traverse(trans, &iter) ?:
 			bch2_btree_delete_at(trans, &iter, 0));
 	bch_err_msg(c, ret, "delete error (second)");
 	if (ret)
@@ -84,7 +84,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
 			     BTREE_ITER_intent);
 
 	ret = commit_do(trans, NULL, NULL, 0,
-			bch2_btree_iter_traverse(&iter) ?:
+			bch2_btree_iter_traverse(trans, &iter) ?:
 			bch2_trans_update(trans, &iter, &k.k_i, 0));
 	bch_err_msg(c, ret, "update error");
 	if (ret)
@@ -94,7 +94,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
 	bch2_journal_flush_all_pins(&c->journal);
 
 	ret = commit_do(trans, NULL, NULL, 0,
-			bch2_btree_iter_traverse(&iter) ?:
+			bch2_btree_iter_traverse(trans, &iter) ?:
 			bch2_btree_delete_at(trans, &iter, 0));
 	bch_err_msg(c, ret, "delete error");
 	if (ret)
@@ -349,10 +349,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
 	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
 			     SPOS(0, 0, U32_MAX), 0);
 
-	lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+	lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
 	BUG_ON(k.k);
 
-	lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+	lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
 	BUG_ON(k.k);
 
 	bch2_trans_iter_exit(trans, &iter);
@@ -369,10 +369,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr)
 	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
 			     SPOS(0, 0, U32_MAX), 0);
 
-	lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+	lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
 	BUG_ON(k.k);
 
-	lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+	lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
 	BUG_ON(k.k);
 
 	bch2_trans_iter_exit(trans, &iter);
@@ -488,7 +488,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
 	trans = bch2_trans_get(c);
 	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
 			     SPOS(0, 0, snapid_lo), 0);
-	lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+	lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
 
 	BUG_ON(k.k->p.snapshot != U32_MAX);
 
@@ -602,9 +602,9 @@ static int rand_lookup(struct bch_fs *c, u64 nr)
 			     SPOS(0, 0, U32_MAX), 0);
 
 	for (i = 0; i < nr; i++) {
-		bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX));
+		bch2_btree_iter_set_pos(trans, &iter, SPOS(0, test_rand(), U32_MAX));
 
-		lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
+		lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(trans, &iter)));
 		ret = bkey_err(k);
 		if (ret)
 			break;
@@ -623,9 +623,9 @@ static int rand_mixed_trans(struct btree_trans *trans,
 	struct bkey_s_c k;
 	int ret;
 
-	bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX));
+	bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, U32_MAX));
 
-	k = bch2_btree_iter_peek(iter);
+	k = bch2_btree_iter_peek(trans, iter);
 	ret = bkey_err(k);
 	bch_err_msg(trans->c, ret, "lookup error");
 	if (ret)
@@ -672,7 +672,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
 	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
 			     BTREE_ITER_intent);
-	k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX));
+	k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX));
 	ret = bkey_err(k);
 	if (ret)
 		goto err;
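The tests.c churn above is mechanical: every btree_iter operation — traverse, peek, peek_max, set_pos — now takes the owning btree_trans as an explicit first argument rather than deriving it from the iterator. A minimal sketch of the resulting calling convention, assuming bcachefs' internal headers for the types and calls; the helper itself is illustrative, not bcachefs code:

	/* illustrative only: the post-change lookup pattern */
	static int lookup_at(struct btree_trans *trans, struct btree_iter *iter,
			     u64 offset)
	{
		struct bkey_s_c k;

		/* was: bch2_btree_iter_set_pos(iter, ...) */
		bch2_btree_iter_set_pos(trans, iter, SPOS(0, offset, U32_MAX));

		/* was: k = bch2_btree_iter_peek(iter); */
		k = bch2_btree_iter_peek(trans, iter);

		return bkey_err(k);
	}

Callers such as the lockrestart_do()/commit_do() loops in these tests already have the trans in hand, so the conversion is a one-line textual change at each call site.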
diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h
index 519d00d6..8c07189a 100644
--- a/libbcachefs/trace.h
+++ b/libbcachefs/trace.h
@@ -339,6 +339,11 @@ DEFINE_EVENT(bio, io_read_reuse_race,
 	TP_ARGS(bio)
 );
 
+DEFINE_EVENT(bio, io_read_fail_and_poison,
+	TP_PROTO(struct bio *bio),
+	TP_ARGS(bio)
+);
+
 /* ec.c */
 
 TRACE_EVENT(stripe_create,
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index 1e94f89a..b2a29aae 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -55,15 +55,16 @@ static inline size_t buf_pages(void *p, size_t len)
 			    PAGE_SIZE);
 }
 
-static inline void *bch2_kvmalloc(size_t n, gfp_t flags)
+static inline void *bch2_kvmalloc_noprof(size_t n, gfp_t flags)
 {
 	void *p = unlikely(n >= INT_MAX)
-		? vmalloc(n)
-		: kvmalloc(n, flags & ~__GFP_ZERO);
+		? vmalloc_noprof(n)
+		: kvmalloc_noprof(n, flags & ~__GFP_ZERO);
 	if (p && (flags & __GFP_ZERO))
 		memset(p, 0, n);
 	return p;
 }
+#define bch2_kvmalloc(...)	alloc_hooks(bch2_kvmalloc_noprof(__VA_ARGS__))
 
 #define init_heap(heap, _size, gfp)					\
 ({									\
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index f9667b94..651da52b 100644
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -168,7 +168,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
 		   int type, int flags)
 {
 	struct bch_fs *c = trans->c;
-	struct btree_iter inode_iter = { NULL };
+	struct btree_iter inode_iter = {};
 	int ret;
 
 	ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?:
diff --git a/src/commands/list.rs b/src/commands/list.rs
index 757d6063..d9a1b460 100644
--- a/src/commands/list.rs
+++ b/src/commands/list.rs
@@ -7,6 +7,7 @@ use bch_bindgen::btree::BtreeNodeIter;
 use bch_bindgen::btree::BtreeTrans;
 use bch_bindgen::fs::Fs;
 use bch_bindgen::opt_set;
+use bch_bindgen::c::bch_degraded_actions;
 use clap::Parser;
 use std::io::{stdout, IsTerminal};
 
@@ -167,8 +168,7 @@ fn cmd_list_inner(opt: &Cli) -> anyhow::Result<()> {
     opt_set!(fs_opts, nochanges, 1);
     opt_set!(fs_opts, read_only, 1);
     opt_set!(fs_opts, norecovery, 1);
-    opt_set!(fs_opts, degraded, 1);
-    opt_set!(fs_opts, very_degraded, 1);
+    opt_set!(fs_opts, degraded, bch_degraded_actions::BCH_DEGRADED_very as u8);
     opt_set!(
         fs_opts,
         errors,