From 62f17daf8197fca8fed0545b78a06891df3ce90d Mon Sep 17 00:00:00 2001
From: Alexander Miroshnichenko
Date: Sun, 3 Aug 2025 20:16:04 +0300
Subject: [PATCH] bcachefs: patches from master branch on 03-Aug-2025
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: 8bit

Signed-off-by: Alexander Miroshnichenko
---
 .../filesystems/bcachefs/casefolding.rst      | 18 +
 .../filesystems/bcachefs/future/idle_work.rst | 78 +
 Documentation/filesystems/bcachefs/index.rst  | 7 +
 fs/bcachefs/Kconfig                           | 8 +
 fs/bcachefs/Makefile                          | 4 +
 fs/bcachefs/acl.c                             | 29 +-
 fs/bcachefs/alloc_background.c                | 763 +++++----
 fs/bcachefs/alloc_background.h                | 19 +-
 fs/bcachefs/alloc_foreground.c                | 835 +++++-----
 fs/bcachefs/alloc_foreground.h                | 86 +-
 fs/bcachefs/alloc_types.h                     | 16 -
 fs/bcachefs/async_objs.c                      | 141 ++
 fs/bcachefs/async_objs.h                      | 45 +
 fs/bcachefs/async_objs_types.h                | 25 +
 fs/bcachefs/backpointers.c                    | 439 +++---
 fs/bcachefs/backpointers.h                    | 17 +-
 fs/bcachefs/bcachefs.h                        | 360 +++--
 fs/bcachefs/bcachefs_format.h                 | 30 +-
 fs/bcachefs/bkey.c                            | 51 +-
 fs/bcachefs/bkey.h                            | 4 +-
 fs/bcachefs/bkey_methods.c                    | 2 +-
 fs/bcachefs/bset.c                            | 124 +-
 fs/bcachefs/bset.h                            | 22 +-
 fs/bcachefs/btree_cache.c                     | 280 ++--
 fs/bcachefs/btree_cache.h                     | 20 +
 fs/bcachefs/btree_gc.c                        | 346 +++--
 fs/bcachefs/btree_gc.h                        | 3 +-
 fs/bcachefs/btree_io.c                        | 567 ++++---
 fs/bcachefs/btree_io.h                        | 12 +-
 fs/bcachefs/btree_iter.c                      | 818 +++++-----
 fs/bcachefs/btree_iter.h                      | 456 +++---
 fs/bcachefs/btree_journal_iter.c              | 119 +-
 fs/bcachefs/btree_journal_iter_types.h        | 5 +-
 fs/bcachefs/btree_key_cache.c                 | 147 +-
 fs/bcachefs/btree_locking.c                   | 267 ++--
 fs/bcachefs/btree_locking.h                   | 78 +-
 fs/bcachefs/btree_node_scan.c                 | 142 +-
 fs/bcachefs/btree_node_scan.h                 | 2 +-
 fs/bcachefs/btree_trans_commit.c              | 202 ++-
 fs/bcachefs/btree_types.h                     | 102 +-
 fs/bcachefs/btree_update.c                    | 405 +++--
 fs/bcachefs/btree_update.h                    | 198 ++-
 fs/bcachefs/btree_update_interior.c           | 560 ++++---
 fs/bcachefs/btree_update_interior.h           | 22 +-
 fs/bcachefs/btree_write_buffer.c              | 100 +-
 fs/bcachefs/btree_write_buffer.h              | 5 +
 fs/bcachefs/buckets.c                         | 426 +++---
 fs/bcachefs/buckets.h                         | 12 +-
 fs/bcachefs/buckets_waiting_for_journal.c     | 31 +-
 fs/bcachefs/chardev.c                         | 162 +-
 fs/bcachefs/checksum.c                        | 66 +-
 fs/bcachefs/checksum.h                        | 2 +
 fs/bcachefs/clock.c                           | 64 +-
 fs/bcachefs/clock.h                           | 1 +
 fs/bcachefs/compress.c                        | 53 +-
 fs/bcachefs/compress.h                        | 36 +-
 fs/bcachefs/darray.h                          | 59 +-
 fs/bcachefs/data_update.c                     | 411 +++--
 fs/bcachefs/data_update.h                     | 15 +
 fs/bcachefs/debug.c                           | 212 +--
 fs/bcachefs/debug.h                           | 20 +-
 fs/bcachefs/dirent.c                          | 253 ++-
 fs/bcachefs/dirent.h                          | 25 +-
 fs/bcachefs/disk_accounting.c                 | 382 ++---
 fs/bcachefs/disk_accounting.h                 | 27 +-
 fs/bcachefs/disk_groups.c                     | 167 +-
 fs/bcachefs/ec.c                              | 630 ++++----
 fs/bcachefs/ec.h                              | 11 +-
 fs/bcachefs/ec_types.h                        | 7 +-
 fs/bcachefs/enumerated_ref.c                  | 142 ++
 fs/bcachefs/enumerated_ref.h                  | 66 +
 fs/bcachefs/enumerated_ref_types.h            | 19 +
 fs/bcachefs/errcode.c                         | 7 +-
 fs/bcachefs/errcode.h                         | 42 +-
 fs/bcachefs/error.c                           | 240 +--
 fs/bcachefs/error.h                           | 27 +-
 fs/bcachefs/extent_update.c                   | 87 +-
 fs/bcachefs/extent_update.h                   | 2 +-
 fs/bcachefs/extents.c                         | 217 ++-
 fs/bcachefs/extents.h                         | 6 +
 fs/bcachefs/extents_types.h                   | 1 +
 fs/bcachefs/fast_list.c                       | 168 ++
 fs/bcachefs/fast_list.h                       | 41 +
 fs/bcachefs/fs-io-buffered.c                  | 119 +-
 fs/bcachefs/fs-io-direct.c                    | 33 +-
 fs/bcachefs/fs-io-pagecache.c                 | 57 +-
 fs/bcachefs/fs-io.c                           | 177 +--
 fs/bcachefs/fs-io.h                           | 19 +-
 fs/bcachefs/fs-ioctl.c                        | 43 +-
 fs/bcachefs/fs.c                              | 316 ++--
 fs/bcachefs/fsck.c                            | 1361 +++++++++--------
 fs/bcachefs/fsck.h                            | 6 +
 fs/bcachefs/inode.c                           | 389 +++--
 fs/bcachefs/inode.h                           | 49 +-
 fs/bcachefs/inode_format.h                    | 7 +-
 fs/bcachefs/io_misc.c                         | 112 +-
 fs/bcachefs/io_misc.h                         | 2 +
 fs/bcachefs/io_read.c                         | 558 ++++---
 fs/bcachefs/io_read.h                         | 44 +-
 fs/bcachefs/io_write.c                        | 177 ++-
 fs/bcachefs/io_write.h                        | 28 -
 fs/bcachefs/io_write_types.h                  | 32 +
 fs/bcachefs/journal.c                         | 455 +++---
 fs/bcachefs/journal.h                         | 13 +-
 fs/bcachefs/journal_io.c                      | 603 +++++---
 fs/bcachefs/journal_io.h                      | 8 +
 fs/bcachefs/journal_reclaim.c                 | 293 ++--
 fs/bcachefs/journal_sb.c                      | 2 +-
 fs/bcachefs/journal_seq_blacklist.c           | 68 +-
 fs/bcachefs/journal_seq_blacklist.h           | 4 +
 fs/bcachefs/journal_types.h                   | 2 -
 fs/bcachefs/logged_ops.c                      | 16 +-
 fs/bcachefs/logged_ops.h                      | 2 +-
 fs/bcachefs/lru.c                             | 54 +-
 fs/bcachefs/migrate.c                         | 144 +-
 fs/bcachefs/migrate.h                         | 3 +-
 fs/bcachefs/move.c                            | 589 ++++---
 fs/bcachefs/move.h                            | 31 +-
 fs/bcachefs/move_types.h                      | 8 +-
 fs/bcachefs/movinggc.c                        | 251 ++-
 fs/bcachefs/movinggc.h                        | 5 +-
 fs/bcachefs/namei.c                           | 439 ++++--
 fs/bcachefs/namei.h                           | 7 +
 fs/bcachefs/nocow_locking.c                   | 14 +-
 fs/bcachefs/nocow_locking.h                   | 2 +-
 fs/bcachefs/opts.c                            | 199 ++-
 fs/bcachefs/opts.h                            | 56 +-
 fs/bcachefs/printbuf.h                        | 12 +
 fs/bcachefs/progress.c                        | 6 +-
 fs/bcachefs/progress.h                        | 3 +
 fs/bcachefs/quota.c                           | 103 +-
 fs/bcachefs/rcu_pending.c                     | 22 +-
 fs/bcachefs/rebalance.c                       | 330 ++--
 fs/bcachefs/rebalance.h                       | 14 +-
 fs/bcachefs/rebalance_types.h                 | 6 +
 fs/bcachefs/recovery.c                        | 373 +++--
 fs/bcachefs/recovery.h                        | 3 +-
 fs/bcachefs/recovery_passes.c                 | 666 ++++++--
 fs/bcachefs/recovery_passes.h                 | 47 +-
 fs/bcachefs/recovery_passes_format.h          | 106 ++
 fs/bcachefs/recovery_passes_types.h           | 93 +-
 fs/bcachefs/reflink.c                         | 163 +-
 fs/bcachefs/replicas.c                        | 182 +--
 fs/bcachefs/sb-clean.c                        | 36 +-
 fs/bcachefs/sb-counters_format.h              | 11 +
 fs/bcachefs/sb-downgrade.c                    | 33 +-
 fs/bcachefs/sb-errors.c                       | 67 +-
 fs/bcachefs/sb-errors.h                       | 1 +
 fs/bcachefs/sb-errors_format.h                | 46 +-
 fs/bcachefs/sb-members.c                      | 276 ++--
 fs/bcachefs/sb-members.h                      | 134 +-
 fs/bcachefs/sb-members_format.h               | 8 +-
 fs/bcachefs/sb-members_types.h                | 1 +
 fs/bcachefs/six.c                             | 28 +-
 fs/bcachefs/snapshot.c                        | 982 +++++++-----
 fs/bcachefs/snapshot.h                        | 140 +-
 fs/bcachefs/snapshot_format.h                 | 4 +-
 fs/bcachefs/snapshot_types.h                  | 57 +
 fs/bcachefs/str_hash.c                        | 390 +++--
 fs/bcachefs/str_hash.h                        | 83 +-
 fs/bcachefs/subvolume.c                       | 338 ++--
 fs/bcachefs/subvolume.h                       | 25 +-
 fs/bcachefs/subvolume_types.h                 | 27 -
 fs/bcachefs/super-io.c                        | 169 +-
 fs/bcachefs/super-io.h                        | 1 +
 fs/bcachefs/super.c                           | 1160 ++++++++------
 fs/bcachefs/super.h                           | 10 +-
 fs/bcachefs/sysfs.c                           | 273 +++-
 fs/bcachefs/tests.c                           | 340 ++--
 fs/bcachefs/thread_with_file.c                | 52 +-
 fs/bcachefs/time_stats.c                      | 7 +-
 fs/bcachefs/trace.h                           | 398 ++---
 fs/bcachefs/util.c                            | 75 +-
 fs/bcachefs/util.h                            | 27 +-
 fs/bcachefs/xattr.c                           | 81 +-
 fs/bcachefs/xattr.h                           | 4 +-
 fs/bcachefs/xattr_format.h                    | 4 +-
 177 files changed, 15223 insertions(+), 11107 deletions(-)
 create mode 100644 Documentation/filesystems/bcachefs/future/idle_work.rst
 create mode 100644 fs/bcachefs/async_objs.c
 create mode 100644 fs/bcachefs/async_objs.h
 create mode 100644 fs/bcachefs/async_objs_types.h
 create mode 100644 fs/bcachefs/enumerated_ref.c
 create mode 100644 fs/bcachefs/enumerated_ref.h
 create mode 100644 fs/bcachefs/enumerated_ref_types.h
 create mode 100644 fs/bcachefs/fast_list.c
 create mode 100644 fs/bcachefs/fast_list.h
 create mode 100644 fs/bcachefs/recovery_passes_format.h
 create mode 100644 fs/bcachefs/snapshot_types.h

diff --git a/Documentation/filesystems/bcachefs/casefolding.rst b/Documentation/filesystems/bcachefs/casefolding.rst
index ba5de97d155f..871a38f557e8 100644
--- a/Documentation/filesystems/bcachefs/casefolding.rst
+++ b/Documentation/filesystems/bcachefs/casefolding.rst
@@ -88,3 +88,21 @@
 This would fail if negative dentry's were cached.
 This is slightly suboptimal, but could be fixed in future
 with some vfs work.
+
+References
+----------
+
+(from Peter Anvin, on the list)
+
+It is worth noting that Microsoft has basically declared their
+"recommended" case folding (upcase) table to be permanently frozen (for
+new filesystem instances in the case where they use an on-disk
+translation table created at format time.) As far as I know they have
+never supported anything other than 1:1 conversion of BMP code points,
+nor normalization.
+
+The exFAT specification enumerates the full recommended upcase table,
+although in a somewhat annoying format (basically a hex dump of
+compressed data):
+
+https://learn.microsoft.com/en-us/windows/win32/fileio/exfat-specification
diff --git a/Documentation/filesystems/bcachefs/future/idle_work.rst b/Documentation/filesystems/bcachefs/future/idle_work.rst
new file mode 100644
index 000000000000..59a332509dcd
--- /dev/null
+++ b/Documentation/filesystems/bcachefs/future/idle_work.rst
@@ -0,0 +1,78 @@
+Idle/background work classes design doc:
+
+Right now, our behaviour at idle isn't ideal; it was designed for servers that
+would be under sustained load, to keep pending work at a "medium" level, to
+let work build up so we can process it in more efficient batches, while also
+giving headroom for bursts in load.
+
+But for desktops or mobile - scenarios where work is less sustained and power
+usage is more important - we want to operate differently, with a "rush to
+idle" so the system can go to sleep. We don't want to be dribbling out
+background work while the system should be idle.
+
+The complicating factor is that there are a number of background tasks, which
+form a hierarchy (or a digraph, depending on how you divide it up) - one
+background task may generate work for another.
+
+Thus proper idle detection needs to model this hierarchy.
+
+- Foreground writes
+- Page cache writeback
+- Copygc, rebalance
+- Journal reclaim
+
+When we implement idle detection and rush to idle, we need to be careful not
+to disturb too much the existing behaviour that works reasonably well when the
+system is under sustained load (or perhaps improve it in the case of
+rebalance, which currently does not actively attempt to let work batch up).
+
+SUSTAINED LOAD REGIME
+---------------------
+
+When the system is under continuous load, we want these jobs to run
+continuously - this is perhaps best modelled with a P/D controller, where
+they'll be trying to keep a target value (i.e. fragmented disk space,
+available journal space) roughly in the middle of some range.
+
+The goal under sustained load is to balance our ability to handle load spikes
+without running out of x resource (free disk space, free space in the
+journal), while also letting some work accumulate to be batched (or become
+unnecessary).
+
+For example, we don't want to run copygc too aggressively, because then it
+will be evacuating buckets that would have become empty (been overwritten or
+deleted) anyways, and we don't want to wait until we're almost out of free
+space because then the system will behave unpredictably - suddenly we're doing
+a lot more work to service each write and the system becomes much slower.
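+
+To make this concrete, the sketch below shows roughly what such a
+controller could look like. The structure, field names and gains are
+illustrative assumptions for this design doc, not existing bcachefs code::
+
+    /*
+     * Toy P/D controller: regulate a measured value (fragmented disk
+     * space, used journal space, ...) toward the middle of an acceptable
+     * range, returning how much work to issue this interval.
+     */
+    struct pd_controller {
+        s64 lo, hi;         /* acceptable range for the measured value */
+        s64 last_actual;    /* previous measurement, for the D term */
+        s64 p_inverse;      /* proportional gain, as a divisor (nonzero) */
+        s64 d_gain;         /* derivative gain */
+    };
+
+    static s64 pd_controller_update(struct pd_controller *pd, s64 actual)
+    {
+        s64 target = (pd->lo + pd->hi) / 2;
+        s64 error  = actual - target;           /* P: distance from target */
+        s64 change = actual - pd->last_actual;  /* D: rate of change */
+
+        pd->last_actual = actual;
+
+        /* positive result: issue that much work this interval */
+        return error / pd->p_inverse + change * pd->d_gain;
+    }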
+
+IDLE REGIME
+-----------
+
+When the system becomes idle, we should start flushing our pending work
+quicker so the system can go to sleep.
+
+Note that the definition of "idle" depends on where in the hierarchy a task
+is - a task should start flushing work more quickly when the task above it has
+stopped generating new work.
+
+e.g. rebalance should start flushing more quickly when page cache writeback is
+idle, and journal reclaim should only start flushing more quickly when both
+copygc and rebalance are idle.
+
+It's important to let work accumulate when more work is still incoming and we
+still have room, because flushing is always more efficient if we let it batch
+up. New writes may overwrite data before rebalance moves it, and tasks may be
+generating more updates for the btree nodes that journal reclaim needs to flush.
+
+On idle, how much work we do at each interval should be proportional to the
+length of time we have been idle for. If we're idle only for a short duration,
+we shouldn't flush everything right away; the system might wake up and start
+generating new work soon, and flushing immediately might end up doing a lot of
+work that would have been unnecessary if we'd allowed things to batch more.
+
+To summarize, we will need:
+
+ - A list of classes for background tasks that generate work, which will
+   include one "foreground" class.
+ - Tracking for each class - "Am I doing work, or have I gone to sleep?"
+ - And each class should check the class above it when deciding how much work
+   to issue (see the sketch below).
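+
+A sketch of those three pieces - a class list, per-class activity
+tracking, and each class consulting the class above it. The names, the
+linear ordering and the 30 second ramp are illustrative assumptions,
+not existing bcachefs code::
+
+    enum work_class {
+        WORK_CLASS_foreground,          /* foreground writes */
+        WORK_CLASS_writeback,           /* page cache writeback */
+        WORK_CLASS_copygc_rebalance,
+        WORK_CLASS_journal_reclaim,
+        WORK_CLASS_NR,
+    };
+
+    struct work_class_state {
+        atomic64_t last_active;         /* jiffies when work was last issued */
+    };
+
+    static struct work_class_state work_classes[WORK_CLASS_NR];
+
+    static void work_class_touch(enum work_class c)
+    {
+        atomic64_set(&work_classes[c].last_active, jiffies);
+    }
+
+    /*
+     * How much pending work should class @c flush this interval?
+     * Nothing while the class feeding it is still active, then an amount
+     * that ramps up with how long that class has been idle.
+     *
+     * Assumes @c != WORK_CLASS_foreground, which has no class above it.
+     */
+    static u64 work_to_issue(enum work_class c, u64 pending)
+    {
+        u64 last = atomic64_read(&work_classes[c - 1].last_active);
+        u64 idle = jiffies - min_t(u64, jiffies, last);
+
+        if (idle < HZ)          /* class above us still busy: let work batch */
+            return 0;
+
+        /* flush everything once the class above has been idle 30 seconds */
+        return min(pending, pending * idle / (30 * HZ));
+    }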
diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst
index 3864d0ae89c1..e5c4c2120b93 100644
--- a/Documentation/filesystems/bcachefs/index.rst
+++ b/Documentation/filesystems/bcachefs/index.rst
@@ -29,3 +29,10 @@ At this moment, only a few of these are described here.
 
    casefolding
    errorcodes
+
+Future design
+-------------
+.. toctree::
+   :maxdepth: 1
+
+   future/idle_work
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 07709b0d7688..8cb2b9d5da96 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -103,6 +103,14 @@ config BCACHEFS_PATH_TRACEPOINTS
 	  Enable extra tracepoints for debugging btree_path operations; we don't
 	  normally want these enabled because they happen at very high rates.
 
+config BCACHEFS_TRANS_KMALLOC_TRACE
+	bool "Trace bch2_trans_kmalloc() calls"
+	depends on BCACHEFS_FS
+
+config BCACHEFS_ASYNC_OBJECT_LISTS
+	bool "Keep async objects on fast_lists for debugfs visibility"
+	depends on BCACHEFS_FS && DEBUG_FS
+
 config MEAN_AND_VARIANCE_UNIT_TEST
 	tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
 	depends on KUNIT
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 9af65079374f..93c8ee5425c8 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -35,11 +35,13 @@ bcachefs-y := \
 	disk_accounting.o \
 	disk_groups.o \
 	ec.o \
+	enumerated_ref.o \
 	errcode.o \
 	error.o \
 	extents.o \
 	extent_update.o \
 	eytzinger.o \
+	fast_list.o \
 	fs.o \
 	fs-ioctl.o \
 	fs-io.o \
@@ -97,6 +99,8 @@ bcachefs-y := \
 	varint.o \
 	xattr.o
 
+bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += async_objs.o
+
 obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
 
 # Silence "note: xyz changed in GCC X.X" messages
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index d03adc36100e..3befa1f36e72 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -138,8 +138,8 @@ static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans,
 	acl = allocate_dropping_locks(trans, ret,
 			posix_acl_alloc(count, _gfp));
-	if (!acl)
-		return ERR_PTR(-ENOMEM);
+	if (!acl && !ret)
+		ret = bch_err_throw(trans->c, ENOMEM_acl);
 
 	if (ret) {
 		kfree(acl);
 		return ERR_PTR(ret);
@@ -273,13 +273,13 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu)
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
 	struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
-	struct btree_iter iter = {};
+	struct btree_iter iter = { NULL };
 	struct posix_acl *acl = NULL;
 
 	if (rcu)
 		return ERR_PTR(-ECHILD);
 
-	struct btree_trans *trans = bch2_trans_get(c);
+	CLASS(btree_trans, trans)(c);
 retry:
 	bch2_trans_begin(trans);
 
@@ -303,8 +303,7 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu)
 	if (!IS_ERR_OR_NULL(acl))
 		set_cached_acl(&inode->v, type, acl);
 
-	bch2_trans_iter_exit(trans, &iter);
-	bch2_trans_put(trans);
+	bch2_trans_iter_exit(&iter);
 
 	return acl;
 }
@@ -344,14 +343,14 @@ int bch2_set_acl(struct mnt_idmap *idmap,
 {
 	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct btree_iter inode_iter = {};
+	struct btree_iter inode_iter = { NULL };
 	struct bch_inode_unpacked inode_u;
 	struct posix_acl *acl;
 	umode_t mode;
 	int ret;
 
-	mutex_lock(&inode->ei_update_lock);
-	struct btree_trans *trans = bch2_trans_get(c);
+	guard(mutex)(&inode->ei_update_lock);
+	CLASS(btree_trans, trans)(c);
 retry:
 	bch2_trans_begin(trans);
 	acl = _acl;
@@ -380,22 +379,18 @@ int bch2_set_acl(struct mnt_idmap *idmap,
 	ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
 		bch2_trans_commit(trans, NULL, NULL, 0);
 btree_err:
-	bch2_trans_iter_exit(trans, &inode_iter);
+	bch2_trans_iter_exit(&inode_iter);
 
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 	if (unlikely(ret))
-		goto err;
+		return ret;
 
 	bch2_inode_update_after_write(trans, inode,
 				      &inode_u, ATTR_CTIME|ATTR_MODE);
 
 	set_cached_acl(&inode->v, type, acl);
-err:
-	bch2_trans_put(trans);
-	mutex_unlock(&inode->ei_update_lock);
-
-	return ret;
+	return 0;
 }
 
 int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
@@ -436,7 +431,7 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
 	*new_acl = acl;
 	acl = NULL;
 err:
-	bch2_trans_iter_exit(trans, &iter);
+
bch2_trans_iter_exit(&iter); if (!IS_ERR_OR_NULL(acl)) kfree(acl); return ret; diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 94ea9e49aec4..3fc728efbf5c 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -17,10 +17,11 @@ #include "debug.h" #include "disk_accounting.h" #include "ec.h" +#include "enumerated_ref.h" #include "error.h" #include "lru.h" +#include "progress.h" #include "recovery.h" -#include "trace.h" #include "varint.h" #include @@ -308,7 +309,8 @@ int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, "data type inconsistency"); bkey_fsck_err_on(!a.io_time[READ] && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, + !(c->recovery.passes_to_run & + BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs)), c, alloc_key_cached_but_read_time_zero, "cached bucket with read_time == 0"); break; @@ -335,11 +337,11 @@ void bch2_alloc_v4_swab(struct bkey_s k) a->stripe_sectors = swab32(a->stripe_sectors); } -void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k, + const struct bch_alloc_v4 *a) { - struct bch_alloc_v4 _a; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); - struct bch_dev *ca = c ? bch2_dev_bucket_tryget_noerror(c, k.k->p) : NULL; + struct bch_dev *ca = c ? bch2_dev_tryget_noerror(c, k.k->p.inode) : NULL; prt_newline(out); printbuf_indent_add(out, 2); @@ -348,11 +350,14 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c bch2_prt_data_type(out, a->data_type); prt_newline(out); prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty); - prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty); + if (bkey_val_bytes(k.k) > offsetof(struct bch_alloc_v4, journal_seq_empty)) + prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty); + prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); - prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); + if (bkey_val_bytes(k.k) > offsetof(struct bch_alloc_v4, stripe_sectors)) + prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); prt_printf(out, "cached_sectors %u\n", a->cached_sectors); prt_printf(out, "stripe %u\n", a->stripe); prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); @@ -367,12 +372,25 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c bch2_dev_put(ca); } +void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bch_alloc_v4 _a; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); + + __bch2_alloc_v4_to_text(out, c, k, a); +} + +void bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + __bch2_alloc_v4_to_text(out, c, k, bkey_s_c_to_alloc_v4(k).v); +} + void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) { if (k.k->type == KEY_TYPE_alloc_v4) { void *src, *dst; - *out = *bkey_s_c_to_alloc_v4(k).v; + bkey_val_copy(out, bkey_s_c_to_alloc_v4(k)); src = alloc_v4_backpointers(out); SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); @@ -455,13 +473,14 @@ struct bkey_i_alloc_v4 * bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter, struct bpos pos) { - struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, 
BTREE_ID_alloc, pos, - BTREE_ITER_with_updates| - BTREE_ITER_cached| - BTREE_ITER_intent); + bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, + BTREE_ITER_with_updates| + BTREE_ITER_cached| + BTREE_ITER_intent); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); int ret = bkey_err(k); if (unlikely(ret)) - return ERR_PTR(ret); + goto err; struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); ret = PTR_ERR_OR_ZERO(a); @@ -469,7 +488,7 @@ bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_i goto err; return a; err: - bch2_trans_iter_exit(trans, iter); + bch2_trans_iter_exit(iter); return ERR_PTR(ret); } @@ -477,14 +496,24 @@ __flatten struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos, enum btree_iter_update_trigger_flags flags) { - struct btree_iter iter; - struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos); - int ret = PTR_ERR_OR_ZERO(a); - if (ret) + CLASS(btree_iter, iter)(trans, BTREE_ID_alloc, pos, + BTREE_ITER_with_updates| + BTREE_ITER_cached| + BTREE_ITER_intent); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (unlikely(ret)) return ERR_PTR(ret); - ret = bch2_trans_update(trans, &iter, &a->k_i, flags); - bch2_trans_iter_exit(trans, &iter); + if ((void *) k.v >= trans->mem && + (void *) k.v < trans->mem + trans->mem_top) + return container_of(bkey_s_c_to_alloc_v4(k).v, struct bkey_i_alloc_v4, v); + + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); + if (IS_ERR(a)) + return a; + + ret = bch2_trans_update_ip(trans, &iter, &a->k_i, flags, _RET_IP_); return unlikely(ret) ? ERR_PTR(ret) : a; } @@ -537,11 +566,11 @@ void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bke int bch2_bucket_gens_init(struct bch_fs *c) { - struct btree_trans *trans = bch2_trans_get(c); struct bkey_i_bucket_gens g; bool have_bucket_gens_key = false; int ret; + CLASS(btree_trans, trans)(c); ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_prefetch, k, ({ /* @@ -581,17 +610,14 @@ int bch2_bucket_gens_init(struct bch_fs *c) BCH_TRANS_COMMIT_no_enospc, bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); - bch2_trans_put(trans); - - bch_err_fn(c, ret); return ret; } int bch2_alloc_read(struct bch_fs *c) { - down_read(&c->state_lock); + guard(rwsem_read)(&c->state_lock); - struct btree_trans *trans = bch2_trans_get(c); + CLASS(btree_trans, trans)(c); struct bch_dev *ca = NULL; int ret; @@ -610,7 +636,7 @@ int bch2_alloc_read(struct bch_fs *c) * bch2_check_alloc_key() which runs later: */ if (!ca) { - bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); continue; } @@ -631,17 +657,17 @@ int bch2_alloc_read(struct bch_fs *c) * bch2_check_alloc_key() which runs later: */ if (!ca) { - bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); continue; } if (k.k->p.offset < ca->mi.first_bucket) { - bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode, ca->mi.first_bucket)); + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket)); continue; } if (k.k->p.offset >= ca->mi.nbuckets) { - bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); continue; } @@ -652,10 +678,6 @@ int bch2_alloc_read(struct bch_fs *c) } bch2_dev_put(ca); - 
bch2_trans_put(trans); - - up_read(&c->state_lock); - bch_err_fn(c, ret); return ret; } @@ -671,7 +693,7 @@ static int __need_discard_or_freespace_err(struct btree_trans *trans, ? BCH_FSCK_ERR_need_discard_key_wrong : BCH_FSCK_ERR_freespace_key_wrong; enum btree_id btree = discard ? BTREE_ID_need_discard : BTREE_ID_freespace; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_bkey_val_to_text(&buf, c, alloc_k); @@ -680,11 +702,9 @@ static int __need_discard_or_freespace_err(struct btree_trans *trans, set ? "" : "un", bch2_btree_id_str(btree), buf.buf); - if (ret == -BCH_ERR_fsck_ignore || - ret == -BCH_ERR_fsck_errors_not_fixed) + if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) || + bch2_err_matches(ret, BCH_ERR_fsck_errors_not_fixed)) ret = 0; - - printbuf_exit(&buf); return ret; } @@ -720,8 +740,8 @@ static int bch2_bucket_do_index(struct btree_trans *trans, return 0; } - struct btree_iter iter; - struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent); + CLASS(btree_iter, iter)(trans, btree, pos, BTREE_ITER_intent); + struct bkey_s_c old = bch2_btree_iter_peek_slot(&iter); int ret = bkey_err(old); if (ret) return ret; @@ -731,30 +751,25 @@ static int bch2_bucket_do_index(struct btree_trans *trans, trans, alloc_k, set, btree == BTREE_ID_need_discard, false); - ret = bch2_btree_bit_mod_iter(trans, &iter, set); + return bch2_btree_bit_mod_iter(trans, &iter, set); fsck_err: - bch2_trans_iter_exit(trans, &iter); return ret; } static noinline int bch2_bucket_gen_update(struct btree_trans *trans, struct bpos bucket, u8 gen) { - struct btree_iter iter; - unsigned offset; - struct bpos pos = alloc_gens_pos(bucket, &offset); - struct bkey_i_bucket_gens *g; - struct bkey_s_c k; - int ret; - - g = bch2_trans_kmalloc(trans, sizeof(*g)); - ret = PTR_ERR_OR_ZERO(g); + struct bkey_i_bucket_gens *g = bch2_trans_kmalloc(trans, sizeof(*g)); + int ret = PTR_ERR_OR_ZERO(g); if (ret) return ret; - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos, - BTREE_ITER_intent| - BTREE_ITER_with_updates); + unsigned offset; + struct bpos pos = alloc_gens_pos(bucket, &offset); + + CLASS(btree_iter, iter)(trans, BTREE_ID_bucket_gens, pos, + BTREE_ITER_intent|BTREE_ITER_with_updates); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) return ret; @@ -769,7 +784,7 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans, g->v.gens[offset] = gen; ret = bch2_trans_update(trans, &iter, &g->k_i, 0); - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } @@ -832,12 +847,12 @@ int bch2_trigger_alloc(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = 0; - struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); + CLASS(bch2_dev_bucket_tryget, ca)(c, new.k->p); if (!ca) - return -BCH_ERR_trigger_alloc; + return bch_err_throw(c, trigger_alloc); struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); @@ -851,7 +866,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c); ret = PTR_ERR_OR_ZERO(new_ka); if (unlikely(ret)) - goto err; + return ret; new_a = &new_ka->v; } @@ -885,7 +900,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?: bch2_bucket_do_index(trans, ca, new.s_c, new_a, true); if 
(ret) - goto err; + return ret; } if (new_a->data_type == BCH_DATA_cached && @@ -897,7 +912,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, alloc_lru_idx_read(*old_a), alloc_lru_idx_read(*new_a)); if (ret) - goto err; + return ret; ret = bch2_lru_change(trans, BCH_LRU_BUCKET_FRAGMENTATION, @@ -905,26 +920,17 @@ int bch2_trigger_alloc(struct btree_trans *trans, alloc_lru_idx_fragmentation(*old_a, ca), alloc_lru_idx_fragmentation(*new_a, ca)); if (ret) - goto err; + return ret; if (old_a->gen != new_a->gen) { ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen); if (ret) - goto err; - } - - if ((flags & BTREE_TRIGGER_bucket_invalidate) && - old_a->cached_sectors) { - ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx, - -((s64) old_a->cached_sectors), - flags & BTREE_TRIGGER_gc); - if (ret) - goto err; + return ret; } ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags); if (ret) - goto err; + return ret; } if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { @@ -975,19 +981,16 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (bch2_fs_fatal_err_on(ret, c, "setting bucket_needs_journal_commit: %s", bch2_err_str(ret))) - goto err; + return ret; } } if (new_a->gen != old_a->gen) { - rcu_read_lock(); + guard(rcu)(); u8 *gen = bucket_gen(ca, new.k->p.offset); - if (unlikely(!gen)) { - rcu_read_unlock(); + if (unlikely(!gen)) goto invalid_bucket; - } *gen = new_a->gen; - rcu_read_unlock(); } #define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) @@ -1013,36 +1016,28 @@ int bch2_trigger_alloc(struct btree_trans *trans, } if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) { - rcu_read_lock(); + guard(rcu)(); struct bucket *g = gc_bucket(ca, new.k->p.offset); - if (unlikely(!g)) { - rcu_read_unlock(); + if (unlikely(!g)) goto invalid_bucket; - } g->gen_valid = 1; g->gen = new_a->gen; - rcu_read_unlock(); } -err: fsck_err: - printbuf_exit(&buf); - bch2_dev_put(ca); return ret; invalid_bucket: bch2_fs_inconsistent(c, "reference to invalid bucket\n%s", (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)); - ret = -BCH_ERR_trigger_alloc; - goto err; + return bch_err_throw(c, trigger_alloc); } /* * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for * extents style btrees, but works on non-extents btrees: */ -static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct btree_iter *iter, - struct bpos end, struct bkey *hole) +static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) { - struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); if (bkey_err(k)) return k; @@ -1053,9 +1048,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct bt struct btree_iter iter2; struct bpos next; - bch2_trans_copy_iter(trans, &iter2, iter); + bch2_trans_copy_iter(&iter2, iter); - struct btree_path *path = btree_iter_path(trans, iter); + struct btree_path *path = btree_iter_path(iter->trans, iter); if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX)) end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p)); @@ -1065,9 +1060,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct bt * btree node min/max is a closed interval, upto takes a half * open interval: */ - k = bch2_btree_iter_peek_max(trans, &iter2, end); + k = bch2_btree_iter_peek_max(&iter2, end); next = iter2.pos; - bch2_trans_iter_exit(trans, &iter2); + 
bch2_trans_iter_exit(&iter2); BUG_ON(next.offset >= iter->pos.offset + U32_MAX); @@ -1097,25 +1092,23 @@ static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *buck bucket->offset = 0; } - rcu_read_lock(); + guard(rcu)(); *ca = __bch2_next_dev_idx(c, bucket->inode, NULL); if (*ca) { *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket); bch2_dev_get(*ca); } - rcu_read_unlock(); return *ca != NULL; } -static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_dev **ca, struct bkey *hole) +static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, + struct bch_dev **ca, struct bkey *hole) { - struct bch_fs *c = trans->c; + struct bch_fs *c = iter->trans->c; struct bkey_s_c k; again: - k = bch2_get_key_or_hole(trans, iter, POS_MAX, hole); + k = bch2_get_key_or_hole(iter, POS_MAX, hole); if (bkey_err(k)) return k; @@ -1128,7 +1121,7 @@ static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_trans *tran if (!next_bucket(c, ca, &hole_start)) return bkey_s_c_null; - bch2_btree_iter_set_pos(trans, iter, hole_start); + bch2_btree_iter_set_pos(iter, hole_start); goto again; } @@ -1152,10 +1145,10 @@ int bch2_check_alloc_key(struct btree_trans *trans, const struct bch_alloc_v4 *a; unsigned gens_offset; struct bkey_s_c k; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = 0; - struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p); + CLASS(bch2_dev_bucket_tryget_noerror, ca)(c, alloc_k.k->p); if (fsck_err_on(!ca, trans, alloc_key_to_missing_dev_bucket, "alloc key for invalid device:bucket %llu:%llu", @@ -1165,43 +1158,43 @@ int bch2_check_alloc_key(struct btree_trans *trans, return ret; if (!ca->mi.freespace_initialized) - goto out; + return 0; a = bch2_alloc_to_v4(alloc_k, &a_convert); - bch2_btree_iter_set_pos(trans, discard_iter, alloc_k.k->p); - k = bch2_btree_iter_peek_slot(trans, discard_iter); + bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); + k = bch2_btree_iter_peek_slot(discard_iter); ret = bkey_err(k); if (ret) - goto err; + return ret; bool is_discarded = a->data_type == BCH_DATA_need_discard; if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded, trans, alloc_k, !is_discarded, true, true)) { ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded); if (ret) - goto err; + return ret; } - bch2_btree_iter_set_pos(trans, freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); - k = bch2_btree_iter_peek_slot(trans, freespace_iter); + bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); + k = bch2_btree_iter_peek_slot(freespace_iter); ret = bkey_err(k); if (ret) - goto err; + return ret; bool is_free = a->data_type == BCH_DATA_free; if (need_discard_or_freespace_err_on(!!k.k->type != is_free, trans, alloc_k, !is_free, false, true)) { ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free); if (ret) - goto err; + return ret; } - bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); - k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter); + bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); + k = bch2_btree_iter_peek_slot(bucket_gens_iter); ret = bkey_err(k); if (ret) - goto err; + return ret; if (fsck_err_on(a->gen != alloc_gen(k, gens_offset), trans, bucket_gens_key_wrong, @@ -1214,7 +1207,7 @@ int bch2_check_alloc_key(struct btree_trans *trans, ret = PTR_ERR_OR_ZERO(g); if (ret) - goto err; + return ret; if 
(k.k->type == KEY_TYPE_bucket_gens) { bkey_reassemble(&g->k_i, k); @@ -1227,13 +1220,9 @@ int bch2_check_alloc_key(struct btree_trans *trans, ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0); if (ret) - goto err; + return ret; } -out: -err: fsck_err: - bch2_dev_put(ca); - printbuf_exit(&buf); return ret; } @@ -1245,18 +1234,18 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, struct btree_iter *freespace_iter) { struct bkey_s_c k; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret; if (!ca->mi.freespace_initialized) return 0; - bch2_btree_iter_set_pos(trans, freespace_iter, start); + bch2_btree_iter_set_pos(freespace_iter, start); - k = bch2_btree_iter_peek_slot(trans, freespace_iter); + k = bch2_btree_iter_peek_slot(freespace_iter); ret = bkey_err(k); if (ret) - goto err; + return ret; *end = bkey_min(k.k->p, *end); @@ -1269,10 +1258,9 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, end->offset)) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); - ret = PTR_ERR_OR_ZERO(update); if (ret) - goto err; + return ret; bkey_init(&update->k); update->k.type = KEY_TYPE_set; @@ -1283,11 +1271,9 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, ret = bch2_trans_update(trans, freespace_iter, update, 0); if (ret) - goto err; + return ret; } -err: fsck_err: - printbuf_exit(&buf); return ret; } @@ -1298,16 +1284,16 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, struct btree_iter *bucket_gens_iter) { struct bkey_s_c k; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); unsigned i, gens_offset, gens_end_offset; int ret; - bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); + bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); - k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter); + k = bch2_btree_iter_peek_slot(bucket_gens_iter); ret = bkey_err(k); if (ret) - goto err; + return ret; if (bkey_cmp(alloc_gens_pos(start, &gens_offset), alloc_gens_pos(*end, &gens_end_offset))) @@ -1333,23 +1319,20 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, if (need_update) { struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); - ret = PTR_ERR_OR_ZERO(u); if (ret) - goto err; + return ret; memcpy(u, &g, sizeof(g)); ret = bch2_trans_update(trans, bucket_gens_iter, u, 0); if (ret) - goto err; + return ret; } } *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0)); -err: fsck_err: - printbuf_exit(&buf); return ret; } @@ -1361,17 +1344,17 @@ struct check_discard_freespace_key_async { static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos) { - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0); + CLASS(btree_iter, iter)(trans, pos.btree, pos.pos, 0); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); int ret = bkey_err(k); if (ret) return ret; u8 gen; ret = k.k->type != KEY_TYPE_set - ? bch2_check_discard_freespace_key(trans, &iter, &gen, false) + ? 
__bch2_check_discard_freespace_key(trans, &iter, &gen, FSCK_ERR_SILENT) : 0; - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } @@ -1381,18 +1364,21 @@ static void check_discard_freespace_key_work(struct work_struct *work) container_of(work, struct check_discard_freespace_key_async, work); bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos)); - bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key); + enumerated_ref_put(&w->c->writes, BCH_WRITE_REF_check_discard_freespace_key); kfree(w); } -int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen, - bool async_repair) +int __bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen, + enum bch_fsck_flags fsck_flags) { struct bch_fs *c = trans->c; enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard ? BCH_DATA_need_discard : BCH_DATA_free; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); + + bool async_repair = fsck_flags & FSCK_ERR_NO_LOG; + fsck_flags |= FSCK_CAN_FIX|FSCK_CAN_IGNORE; struct bpos bucket = iter->pos; bucket.offset &= ~(~0ULL << 56); @@ -1407,9 +1393,10 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite return ret; if (!bch2_dev_bucket_exists(c, bucket)) { - if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket, - "entry in %s btree for nonexistant dev:bucket %llu:%llu", - bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset)) + if (__fsck_err(trans, fsck_flags, + need_discard_freespace_key_to_invalid_dev_bucket, + "entry in %s btree for nonexistant dev:bucket %llu:%llu", + bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset)) goto delete; ret = 1; goto out; @@ -1421,7 +1408,8 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite if (a->data_type != state || (state == BCH_DATA_free && genbits != alloc_freespace_genbits(*a))) { - if (fsck_err(trans, need_discard_freespace_key_bad, + if (__fsck_err(trans, fsck_flags, + need_discard_freespace_key_bad, "%s\nincorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), bch2_btree_id_str(iter->btree_id), @@ -1437,16 +1425,15 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite *gen = a->gen; out: fsck_err: - bch2_set_btree_iter_dontneed(trans, &alloc_iter); - bch2_trans_iter_exit(trans, &alloc_iter); - printbuf_exit(&buf); + bch2_set_btree_iter_dontneed(&alloc_iter); + bch2_trans_iter_exit(&alloc_iter); return ret; delete: if (!async_repair) { ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_commit; + bch_err_throw(c, transaction_restart_commit); goto out; } else { /* @@ -1458,7 +1445,7 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite if (!w) goto out; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) { + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_check_discard_freespace_key)) { kfree(w); goto out; } @@ -1467,14 +1454,16 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite w->c = c; w->pos = BBPOS(iter->btree_id, iter->pos); queue_work(c->write_ref_wq, &w->work); + + ret = 1; /* don't allocate from this bucket */ goto out; } } -static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter) 
+static int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter) { u8 gen; - int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false); + int ret = __bch2_check_discard_freespace_key(trans, iter, &gen, 0); return ret < 0 ? ret : 0; } @@ -1494,19 +1483,19 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; u64 b; bool need_update = false; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = 0; BUG_ON(k.k->type != KEY_TYPE_bucket_gens); bkey_reassemble(&g.k_i, k); - struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode); + CLASS(bch2_dev_tryget_noerror, ca)(c, k.k->p.inode); if (!ca) { if (fsck_err(trans, bucket_gens_to_invalid_dev, "bucket_gens key for invalid device:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, iter, 0); - goto out; + return bch2_btree_delete_at(trans, iter, 0); + return 0; } if (fsck_err_on(end <= ca->mi.first_bucket || @@ -1514,8 +1503,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, trans, bucket_gens_to_invalid_buckets, "bucket_gens key for invalid buckets:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, 0); - goto out; + return bch2_btree_delete_at(trans, iter, 0); } for (b = start; b < ca->mi.first_bucket; b++) @@ -1536,30 +1524,29 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, if (need_update) { struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); - ret = PTR_ERR_OR_ZERO(u); if (ret) - goto out; + return ret; memcpy(u, &g, sizeof(g)); - ret = bch2_trans_update(trans, iter, u, 0); + return bch2_trans_update(trans, iter, u, 0); } -out: fsck_err: - bch2_dev_put(ca); - printbuf_exit(&buf); return ret; } int bch2_check_alloc_info(struct bch_fs *c) { - struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; struct bch_dev *ca = NULL; struct bkey hole; struct bkey_s_c k; int ret = 0; + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_alloc)); + + CLASS(btree_trans, trans)(c); bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_prefetch); bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, @@ -1574,7 +1561,7 @@ int bch2_check_alloc_info(struct bch_fs *c) bch2_trans_begin(trans); - k = bch2_get_key_or_real_bucket_hole(trans, &iter, &ca, &hole); + k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole); ret = bkey_err(k); if (ret) goto bkey_err; @@ -1582,6 +1569,8 @@ int bch2_check_alloc_info(struct bch_fs *c) if (!k.k) break; + progress_update_iter(trans, &progress, &iter); + if (k.k->type) { next = bpos_nosnap_successor(k.k->p); @@ -1612,67 +1601,63 @@ int bch2_check_alloc_info(struct bch_fs *c) if (ret) goto bkey_err; - bch2_btree_iter_set_pos(trans, &iter, next); + bch2_btree_iter_set_pos(&iter, next); bkey_err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; } - bch2_trans_iter_exit(trans, &bucket_gens_iter); - bch2_trans_iter_exit(trans, &freespace_iter); - bch2_trans_iter_exit(trans, &discard_iter); - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&bucket_gens_iter); + bch2_trans_iter_exit(&freespace_iter); + bch2_trans_iter_exit(&discard_iter); + bch2_trans_iter_exit(&iter); bch2_dev_put(ca); ca = NULL; if (ret < 0) - goto err; + return ret; ret = for_each_btree_key(trans, iter, BTREE_ID_need_discard, 
POS_MIN, BTREE_ITER_prefetch, k, - bch2_check_discard_freespace_key_fsck(trans, &iter)); + bch2_check_discard_freespace_key(trans, &iter)); if (ret) - goto err; + return ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN, BTREE_ITER_prefetch); while (1) { bch2_trans_begin(trans); - k = bch2_btree_iter_peek(trans, &iter); + k = bch2_btree_iter_peek(&iter); if (!k.k) break; ret = bkey_err(k) ?: - bch2_check_discard_freespace_key_fsck(trans, &iter); + bch2_check_discard_freespace_key(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; } if (ret) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_bkey_val_to_text(&buf, c, k); - bch_err(c, "while checking %s", buf.buf); - printbuf_exit(&buf); break; } - bch2_btree_iter_set_pos(trans, &iter, bpos_nosnap_successor(iter.pos)); + bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); } - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); if (ret) - goto err; + return ret; ret = for_each_btree_key_commit(trans, iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_bucket_gens_key(trans, &iter, k)); -err: - bch2_trans_put(trans); - bch_err_fn(c, ret); + return ret; } @@ -1684,10 +1669,10 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; struct bkey_s_c alloc_k; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret; - alloc_k = bch2_btree_iter_peek(trans, alloc_iter); + alloc_k = bch2_btree_iter_peek(alloc_iter); if (!alloc_k.k) return 0; @@ -1695,7 +1680,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (ret) return ret; - struct bch_dev *ca = bch2_dev_tryget_noerror(c, alloc_k.k->p.inode); + CLASS(bch2_dev_tryget_noerror, ca)(c, alloc_k.k->p.inode); if (!ca) return 0; @@ -1707,96 +1692,84 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, bucket_to_u64(alloc_k.k->p), lru_idx, alloc_k, last_flushed); if (ret) - goto err; + return ret; } - if (a->data_type != BCH_DATA_cached) - goto err; + if (a->data_type == BCH_DATA_cached) { + if (fsck_err_on(!a->io_time[READ], + trans, alloc_key_cached_but_read_time_zero, + "cached bucket with read_time 0\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + struct bkey_i_alloc_v4 *a_mut = + bch2_alloc_to_v4_mut(trans, alloc_k); + ret = PTR_ERR_OR_ZERO(a_mut); + if (ret) + return ret; - if (fsck_err_on(!a->io_time[READ], - trans, alloc_key_cached_but_read_time_zero, - "cached bucket with read_time 0\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - struct bkey_i_alloc_v4 *a_mut = - bch2_alloc_to_v4_mut(trans, alloc_k); - ret = PTR_ERR_OR_ZERO(a_mut); - if (ret) - goto err; + a_mut->v.io_time[READ] = bch2_current_io_time(c, READ); + ret = bch2_trans_update(trans, alloc_iter, + &a_mut->k_i, BTREE_TRIGGER_norun); + if (ret) + return ret; - a_mut->v.io_time[READ] = bch2_current_io_time(c, READ); - ret = bch2_trans_update(trans, alloc_iter, - &a_mut->k_i, BTREE_TRIGGER_norun); - if (ret) - goto err; + a = &a_mut->v; + } - a = &a_mut->v; + ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, + bucket_to_u64(alloc_k.k->p), + a->io_time[READ], + alloc_k, last_flushed); } - - ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, - bucket_to_u64(alloc_k.k->p), - a->io_time[READ], - alloc_k, last_flushed); - if (ret) - goto err; -err: fsck_err: - bch2_dev_put(ca); - 
printbuf_exit(&buf); return ret; } int bch2_check_alloc_to_lru_refs(struct bch_fs *c) { struct bkey_buf last_flushed; - bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_alloc)); + + CLASS(btree_trans, trans)(c); + int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))) ?: - bch2_check_stripe_to_lru_refs(c); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + progress_update_iter(trans, &progress, &iter); + bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed); + }))?: bch2_check_stripe_to_lru_refs(trans); bch2_bkey_buf_exit(&last_flushed, c); - bch_err_fn(c, ret); return ret; } static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress) { - int ret; + struct bch_fs *c = ca->fs; - mutex_lock(&ca->discard_buckets_in_flight_lock); - darray_for_each(ca->discard_buckets_in_flight, i) - if (i->bucket == bucket) { - ret = -BCH_ERR_EEXIST_discard_in_flight_add; - goto out; - } + guard(mutex)(&ca->discard_buckets_in_flight_lock); + struct discard_in_flight *i = + darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket); + if (i) + return bch_err_throw(c, EEXIST_discard_in_flight_add); - ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) { + return darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) { .in_progress = in_progress, .bucket = bucket, })); -out: - mutex_unlock(&ca->discard_buckets_in_flight_lock); - return ret; } static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket) { - mutex_lock(&ca->discard_buckets_in_flight_lock); - darray_for_each(ca->discard_buckets_in_flight, i) - if (i->bucket == bucket) { - BUG_ON(!i->in_progress); - darray_remove_item(&ca->discard_buckets_in_flight, i); - goto found; - } - BUG(); -found: - mutex_unlock(&ca->discard_buckets_in_flight_lock); + guard(mutex)(&ca->discard_buckets_in_flight_lock); + struct discard_in_flight *i = + darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket); + BUG_ON(!i || !i->in_progress); + + darray_remove_item(&ca->discard_buckets_in_flight, i); } struct discard_buckets_state { @@ -1806,19 +1779,6 @@ struct discard_buckets_state { u64 discarded; }; -/* - * This is needed because discard is both a filesystem option and a device - * option, and mount options are supposed to apply to that mount and not be - * persisted, i.e. if it's set as a mount option we can't propagate it to the - * device. - */ -static inline bool discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca) -{ - return test_bit(BCH_FS_discard_mount_opt_set, &c->flags) - ? 
c->opts.discard - : ca->mi.discard; -} - static int bch2_discard_one_bucket(struct btree_trans *trans, struct bch_dev *ca, struct btree_iter *need_discard_iter, @@ -1828,16 +1788,12 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bpos pos = need_discard_iter->pos; - struct btree_iter iter = {}; - struct bkey_s_c k; - struct bkey_i_alloc_v4 *a; - struct printbuf buf = PRINTBUF; bool discard_locked = false; int ret = 0; if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { s->open++; - goto out; + return 0; } u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal, @@ -1845,30 +1801,29 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, if (seq_ready > c->journal.flushed_seq_ondisk) { if (seq_ready > c->journal.flushing_seq) s->need_journal_commit++; - goto out; + return 0; } - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, - need_discard_iter->pos, - BTREE_ITER_cached); + CLASS(btree_iter, iter)(trans, BTREE_ID_alloc, need_discard_iter->pos, BTREE_ITER_cached); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) - goto out; + return ret; - a = bch2_alloc_to_v4_mut(trans, k); + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k); ret = PTR_ERR_OR_ZERO(a); if (ret) - goto out; + return ret; if (a->v.data_type != BCH_DATA_need_discard) { if (need_discard_or_freespace_err(trans, k, true, true, true)) { ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false); if (ret) - goto out; + return ret; goto commit; } - goto out; + return 0; } if (!fastpath) { @@ -1882,7 +1837,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, s->discarded++; *discard_pos_done = iter.pos; - if (discard_opt_enabled(c, ca) && !c->opts.nochanges) { + if (bch2_discard_opt_enabled(c, ca) && !c->opts.nochanges) { /* * This works without any other locks because this is the only * thread that removes items from the need_discard tree @@ -1921,8 +1876,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, discard_in_flight_remove(ca, iter.pos.offset); if (!ret) s->seen++; - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); return ret; } @@ -1952,26 +1905,26 @@ static void bch2_do_discards_work(struct work_struct *work) trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); - percpu_ref_put(&ca->io_ref[WRITE]); - bch2_write_ref_put(c, BCH_WRITE_REF_discard); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); } void bch2_dev_do_discards(struct bch_dev *ca) { struct bch_fs *c = ca->fs; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard)) return; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_dev_do_discards)) goto put_write_ref; if (queue_work(c->write_ref_wq, &ca->discard_work)) return; - percpu_ref_put(&ca->io_ref[WRITE]); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards); put_write_ref: - bch2_write_ref_put(c, BCH_WRITE_REF_discard); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); } void bch2_do_discards(struct bch_fs *c) @@ -1986,9 +1939,8 @@ static int bch2_do_discards_fast_one(struct btree_trans *trans, struct bpos *discard_pos_done, struct discard_buckets_state *s) { - struct btree_iter need_discard_iter; - struct bkey_s_c discard_k = 
bch2_bkey_get_iter(trans, &need_discard_iter, - BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0); + CLASS(btree_iter, need_discard_iter)(trans, BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0); + struct bkey_s_c discard_k = bch2_btree_iter_peek_slot(&need_discard_iter); int ret = bkey_err(discard_k); if (ret) return ret; @@ -1997,12 +1949,10 @@ static int bch2_do_discards_fast_one(struct btree_trans *trans, trans, discarding_bucket_not_in_need_discard_btree, "attempting to discard bucket %u:%llu not in need_discard btree", ca->dev_idx, bucket)) - goto out; + return 0; - ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true); -out: + return bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true); fsck_err: - bch2_trans_iter_exit(trans, &need_discard_iter); return ret; } @@ -2019,17 +1969,16 @@ static void bch2_do_discards_fast_work(struct work_struct *work) bool got_bucket = false; u64 bucket; - mutex_lock(&ca->discard_buckets_in_flight_lock); - darray_for_each(ca->discard_buckets_in_flight, i) { - if (i->in_progress) - continue; + scoped_guard(mutex, &ca->discard_buckets_in_flight_lock) + darray_for_each(ca->discard_buckets_in_flight, i) { + if (i->in_progress) + continue; - got_bucket = true; - bucket = i->bucket; - i->in_progress = true; - break; - } - mutex_unlock(&ca->discard_buckets_in_flight_lock); + got_bucket = true; + bucket = i->bucket; + i->in_progress = true; + break; + } if (!got_bucket) break; @@ -2047,8 +1996,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work) trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); bch2_trans_put(trans); - percpu_ref_put(&ca->io_ref[WRITE]); - bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast); } static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) @@ -2058,18 +2007,18 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) if (discard_in_flight_add(ca, bucket, false)) return; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard_fast)) return; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_discard_one_bucket_fast)) goto put_ref; if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) return; - percpu_ref_put(&ca->io_ref[WRITE]); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast); put_ref: - bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast); } static int invalidate_one_bp(struct btree_trans *trans, @@ -2096,7 +2045,7 @@ static int invalidate_one_bp(struct btree_trans *trans, bch2_bkey_drop_device(bkey_i_to_s(n), ca->dev_idx); err: - bch2_trans_iter_exit(trans, &extent_iter); + bch2_trans_iter_exit(&extent_iter); return ret; } @@ -2137,9 +2086,8 @@ static int invalidate_one_bucket(struct btree_trans *trans, s64 *nr_to_invalidate) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); - struct btree_iter alloc_iter = {}; int ret = 0; if (*nr_to_invalidate <= 0) @@ -2150,52 +2098,53 @@ static int invalidate_one_bucket(struct btree_trans *trans, "lru key points to nonexistent device:bucket %llu:%llu", bucket.inode, 
bucket.offset)) return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); - goto out; + return 0; } if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) return 0; - struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, - BTREE_ID_alloc, bucket, - BTREE_ITER_cached); - ret = bkey_err(alloc_k); - if (ret) - return ret; + { + CLASS(btree_iter, alloc_iter)(trans, BTREE_ID_alloc, bucket, BTREE_ITER_cached); + struct bkey_s_c alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); + ret = bkey_err(alloc_k); + if (ret) + return ret; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); - /* We expect harmless races here due to the btree write buffer: */ - if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a)) - goto out; + /* We expect harmless races here due to the btree write buffer: */ + if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a)) + return 0; - /* - * Impossible since alloc_lru_idx_read() only returns nonzero if the - * bucket is supposed to be on the cached bucket LRU (i.e. - * BCH_DATA_cached) - * - * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0 - */ - BUG_ON(a->data_type != BCH_DATA_cached); - BUG_ON(a->dirty_sectors); + /* + * Impossible since alloc_lru_idx_read() only returns nonzero if the + * bucket is supposed to be on the cached bucket LRU (i.e. + * BCH_DATA_cached) + * + * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0 + */ + BUG_ON(a->data_type != BCH_DATA_cached); + BUG_ON(a->dirty_sectors); - if (!a->cached_sectors) - bch_err(c, "invalidating empty bucket, confused"); + if (!a->cached_sectors) { + bch2_check_bucket_backpointer_mismatch(trans, ca, bucket.offset, + true, last_flushed); + return 0; + } - unsigned cached_sectors = a->cached_sectors; - u8 gen = a->gen; + unsigned cached_sectors = a->cached_sectors; + u8 gen = a->gen; - ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed); - if (ret) - goto out; + ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed); + if (ret) + return ret; - trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); - --*nr_to_invalidate; -out: + trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); + --*nr_to_invalidate; + } fsck_err: - bch2_trans_iter_exit(trans, &alloc_iter); - printbuf_exit(&buf); return ret; } @@ -2204,9 +2153,9 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter { struct bkey_s_c k; again: - k = bch2_btree_iter_peek_max(trans, iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); + k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); if (!k.k && !*wrapped) { - bch2_btree_iter_set_pos(trans, iter, lru_pos(ca->dev_idx, 0, 0)); + bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0)); *wrapped = true; goto again; } @@ -2218,7 +2167,7 @@ static void bch2_do_invalidates_work(struct work_struct *work) { struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work); struct bch_fs *c = ca->fs; - struct btree_trans *trans = bch2_trans_get(c); + CLASS(btree_trans, trans)(c); int ret = 0; struct bkey_buf last_flushed; @@ -2256,32 +2205,31 @@ static void bch2_do_invalidates_work(struct work_struct *work) if (ret) break; - bch2_btree_iter_advance(trans, &iter); + bch2_btree_iter_advance(&iter); } - 
bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); err: - bch2_trans_put(trans); - percpu_ref_put(&ca->io_ref[WRITE]); bch2_bkey_buf_exit(&last_flushed, c); - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate); } void bch2_dev_do_invalidates(struct bch_dev *ca) { struct bch_fs *c = ca->fs; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_invalidate)) return; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_do_invalidates)) goto put_ref; if (queue_work(c->write_ref_wq, &ca->invalidate_work)) return; - percpu_ref_put(&ca->io_ref[WRITE]); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates); put_ref: - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate); } void bch2_do_invalidates(struct bch_fs *c) @@ -2293,18 +2241,17 @@ void bch2_do_invalidates(struct bch_fs *c) int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, u64 bucket_start, u64 bucket_end) { - struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bkey hole; struct bpos end = POS(ca->dev_idx, bucket_end); - struct bch_member *m; unsigned long last_updated = jiffies; int ret; BUG_ON(bucket_start > bucket_end); BUG_ON(bucket_end > ca->mi.nbuckets); + CLASS(btree_trans, trans)(c); bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)), BTREE_ITER_prefetch); @@ -2326,7 +2273,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, break; } - k = bch2_get_key_or_hole(trans, &iter, end, &hole); + k = bch2_get_key_or_hole(&iter, end, &hole); ret = bkey_err(k); if (ret) goto bkey_err; @@ -2345,7 +2292,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, if (ret) goto bkey_err; - bch2_btree_iter_advance(trans, &iter); + bch2_btree_iter_advance(&iter); } else { struct bkey_i *freespace; @@ -2365,7 +2312,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, if (ret) goto bkey_err; - bch2_btree_iter_set_pos(trans, &iter, k.k->p); + bch2_btree_iter_set_pos(&iter, k.k->p); } bkey_err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -2374,32 +2321,32 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, break; } - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); + bch2_trans_iter_exit(&iter); if (ret < 0) { bch_err_msg(ca, ret, "initializing free space"); return ret; } - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); - mutex_unlock(&c->sb_lock); + scoped_guard(mutex, &c->sb_lock) { + struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); + } return 0; } int bch2_fs_freespace_init(struct bch_fs *c) { - int ret = 0; - bool doing_init = false; + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) + return 0; /* * We can crash during the device add path, so we need to check this on * every mount: */ + bool doing_init = false; for_each_member_device(c, ca) { if (ca->mi.freespace_initialized) continue; @@ -2409,7 +2356,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) doing_init = true; } - ret = bch2_dev_freespace_init(c, ca, 0, 
ca->mi.nbuckets); + int ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); if (ret) { bch2_dev_put(ca); bch_err_fn(c, ret); @@ -2418,9 +2365,8 @@ int bch2_fs_freespace_init(struct bch_fs *c) } if (doing_init) { - mutex_lock(&c->sb_lock); + guard(mutex)(&c->sb_lock); bch2_write_super(c); - mutex_unlock(&c->sb_lock); bch_verbose(c, "done initializing freespace"); } @@ -2439,8 +2385,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) * We clear the LRU and need_discard btrees first so that we don't race * with bch2_do_invalidates() and bch2_do_discards() */ - ret = bch2_dev_remove_stripes(c, ca->dev_idx) ?: - bch2_btree_delete_range(c, BTREE_ID_lru, start, end, + ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, BTREE_TRIGGER_norun, NULL) ?: @@ -2480,7 +2425,7 @@ static int __bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, 0); out: - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } @@ -2503,15 +2448,15 @@ void bch2_recalc_capacity(struct bch_fs *c) lockdep_assert_held(&c->state_lock); - for_each_online_member(c, ca) { - struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; + guard(rcu)(); + for_each_member_device_rcu(c, ca, NULL) { + struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev); + if (bdev) + ra_pages += bdev->bd_disk->bdi->ra_pages; - ra_pages += bdi->ra_pages; - } - - bch2_set_ra_pages(c, ra_pages); + if (ca->mi.state != BCH_MEMBER_STATE_rw) + continue; - __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) { u64 dev_reserve = 0; /* @@ -2549,6 +2494,8 @@ void bch2_recalc_capacity(struct bch_fs *c) ca->mi.bucket_size); } + bch2_set_ra_pages(c, ra_pages); + gc_reserve = c->opts.gc_reserve_bytes ? 
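/*
 * Illustrative sketch, not part of the patch: bch2_recalc_capacity() now
 * walks all members in a single guard(rcu)() section instead of taking an
 * io ref per online device and looping twice; READ_ONCE(ca->disk_sb.bdev)
 * tolerates a device going away concurrently, and bch2_set_ra_pages()
 * moves after the merged loop so it runs once with the summed value:
 *
 *	guard(rcu)();
 *	for_each_member_device_rcu(c, ca, NULL) {
 *		struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev);
 *		if (bdev)
 *			ra_pages += bdev->bd_disk->bdi->ra_pages;
 *		...
 *	}
 */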
c->opts.gc_reserve_bytes >> 9 : div64_u64(capacity * c->opts.gc_reserve_percent, 100); @@ -2570,7 +2517,8 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c) { u64 ret = U64_MAX; - for_each_rw_member(c, ca) + guard(rcu)(); + for_each_rw_member_rcu(c, ca) ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); return ret; } @@ -2578,19 +2526,31 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c) static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) { struct open_bucket *ob; - bool ret = false; for (ob = c->open_buckets; ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { - spin_lock(&ob->lock); - if (ob->valid && !ob->on_partial_list && - ob->dev == ca->dev_idx) - ret = true; - spin_unlock(&ob->lock); + scoped_guard(spinlock, &ob->lock) { + if (ob->valid && !ob->on_partial_list && + ob->dev == ca->dev_idx) + return true; + } } - return ret; + return false; +} + +void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw) +{ + /* BCH_DATA_free == all rw devs */ + + for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) + if (rw && + (i == BCH_DATA_free || + (ca->mi.data_allowed & BIT(i)))) + set_bit(ca->dev_idx, c->rw_devs[i].d); + else + clear_bit(ca->dev_idx, c->rw_devs[i].d); } /* device goes ro: */ @@ -2599,9 +2559,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) lockdep_assert_held(&c->state_lock); /* First, remove device from allocation groups: */ - - for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) - clear_bit(ca->dev_idx, c->rw_devs[i].d); + bch2_dev_allocator_set_rw(c, ca, false); c->rw_devs_change_count++; @@ -2635,10 +2593,7 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); - for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) - if (ca->mi.data_allowed & (1 << i)) - set_bit(ca->dev_idx, c->rw_devs[i].d); - + bch2_dev_allocator_set_rw(c, ca, true); c->rw_devs_change_count++; } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 34b3d6ac4fbb..c2e8482fbbe6 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -13,11 +13,9 @@ static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) { - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); - bool ret = ca && bucket_valid(ca, pos.offset); - rcu_read_unlock(); - return ret; + return ca && bucket_valid(ca, pos.offset); } static inline u64 bucket_to_u64(struct bpos bucket) @@ -253,6 +251,7 @@ int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_alloc_v4_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc ((struct bkey_ops) { \ .key_validate = bch2_alloc_v1_validate, \ @@ -277,7 +276,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ .key_validate = bch2_alloc_v4_validate, \ - .val_to_text = bch2_alloc_to_text, \ + .val_to_text = bch2_alloc_v4_to_text, \ .swab = bch2_alloc_v4_swab, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 48, \ @@ -310,7 +309,14 @@ int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); -int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool); +int 
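/*
 * Illustrative note, not part of the patch: the open-coded rw_devs bitmap
 * updates in bch2_dev_allocator_remove() and bch2_dev_allocator_add()
 * collapse into the new helper, keeping the two paths symmetric. Per the
 * comment in the helper, the BCH_DATA_free bit is set for every rw device
 * regardless of its data_allowed mask:
 *
 *	bch2_dev_allocator_set_rw(c, ca, true);		device goes rw
 *	bch2_dev_allocator_set_rw(c, ca, false);	device goes ro
 */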
__bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, + enum bch_fsck_flags); + +static inline int bch2_check_discard_freespace_key_async(struct btree_trans *trans, struct btree_iter *iter, u8 *gen) +{ + return __bch2_check_discard_freespace_key(trans, iter, gen, FSCK_ERR_NO_LOG); +} + int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); void bch2_dev_do_discards(struct bch_dev *); @@ -350,6 +356,7 @@ int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *); void bch2_recalc_capacity(struct bch_fs *); u64 bch2_min_rw_member_capacity(struct bch_fs *); +void bch2_dev_allocator_set_rw(struct bch_fs *, struct bch_dev *, bool); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 7ec022e9361a..0a5b3d31d52c 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -69,10 +69,9 @@ const char * const bch2_watermarks[] = { void bch2_reset_alloc_cursors(struct bch_fs *c) { - rcu_read_lock(); + guard(rcu)(); for_each_member_device_rcu(c, ca, NULL) memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor)); - rcu_read_unlock(); } static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) @@ -107,20 +106,20 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) return; } - spin_lock(&ob->lock); - ob->valid = false; - ob->data_type = 0; - spin_unlock(&ob->lock); + scoped_guard(spinlock, &ob->lock) { + ob->valid = false; + ob->data_type = 0; + } - spin_lock(&c->freelist_lock); - bch2_open_bucket_hash_remove(c, ob); + scoped_guard(spinlock, &c->freelist_lock) { + bch2_open_bucket_hash_remove(c, ob); - ob->freelist = c->open_buckets_freelist; - c->open_buckets_freelist = ob - c->open_buckets; + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; - c->open_buckets_nr_free++; - ca->nr_open_buckets--; - spin_unlock(&c->freelist_lock); + c->open_buckets_nr_free++; + ca->nr_open_buckets--; + } closure_wake_up(&c->open_buckets_wait); } @@ -154,7 +153,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) { - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs) + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_trans_mark_dev_sbs)) return false; return bch2_is_superblock_bucket(ca, b); @@ -165,26 +164,25 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) BUG_ON(c->open_buckets_partial_nr >= ARRAY_SIZE(c->open_buckets_partial)); - spin_lock(&c->freelist_lock); - rcu_read_lock(); - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++; - rcu_read_unlock(); + scoped_guard(spinlock, &c->freelist_lock) { + guard(rcu)(); + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++; - ob->on_partial_list = true; - c->open_buckets_partial[c->open_buckets_partial_nr++] = - ob - c->open_buckets; - spin_unlock(&c->freelist_lock); + ob->on_partial_list = true; + c->open_buckets_partial[c->open_buckets_partial_nr++] = + ob - c->open_buckets; + } closure_wake_up(&c->open_buckets_wait); closure_wake_up(&c->freelist_wait); } static inline bool may_alloc_bucket(struct bch_fs *c, - struct bpos bucket, - struct bucket_alloc_state *s) + struct alloc_request *req, + struct bpos bucket) { if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) { - s->skipped_open++; + 
req->counters.skipped_open++; return false; } @@ -193,60 +191,59 @@ static inline bool may_alloc_bucket(struct bch_fs *c, bucket.inode, bucket.offset); if (journal_seq_ready > c->journal.flushed_seq_ondisk) { if (journal_seq_ready > c->journal.flushing_seq) - s->need_journal_commit++; - s->skipped_need_journal_commit++; + req->counters.need_journal_commit++; + req->counters.skipped_need_journal_commit++; return false; } if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) { - s->skipped_nocow++; + req->counters.skipped_nocow++; return false; } return true; } -static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, +static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, + struct alloc_request *req, u64 bucket, u8 gen, - enum bch_watermark watermark, - struct bucket_alloc_state *s, struct closure *cl) { + struct bch_dev *ca = req->ca; + if (unlikely(is_superblock_bucket(c, ca, bucket))) return NULL; if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { - s->skipped_nouse++; + req->counters.skipped_nouse++; return NULL; } - spin_lock(&c->freelist_lock); + guard(spinlock)(&c->freelist_lock); - if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) { + if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) { if (cl) closure_wait(&c->open_buckets_wait, cl); track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true); - spin_unlock(&c->freelist_lock); - return ERR_PTR(-BCH_ERR_open_buckets_empty); + return ERR_PTR(bch_err_throw(c, open_buckets_empty)); } /* Recheck under lock: */ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { - spin_unlock(&c->freelist_lock); - s->skipped_open++; + req->counters.skipped_open++; return NULL; } struct open_bucket *ob = bch2_open_bucket_alloc(c); - spin_lock(&ob->lock); - ob->valid = true; - ob->sectors_free = ca->mi.bucket_size; - ob->dev = ca->dev_idx; - ob->gen = gen; - ob->bucket = bucket; - spin_unlock(&ob->lock); + scoped_guard(spinlock, &ob->lock) { + ob->valid = true; + ob->sectors_free = ca->mi.bucket_size; + ob->dev = ca->dev_idx; + ob->gen = gen; + ob->bucket = bucket; + } ca->nr_open_buckets++; bch2_open_bucket_hash_add(c, ob); @@ -254,30 +251,28 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false); track_event_change(&c->times[BCH_TIME_blocked_allocate], false); - spin_unlock(&c->freelist_lock); return ob; } -static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, - enum bch_watermark watermark, - struct bucket_alloc_state *s, +static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, + struct alloc_request *req, struct btree_iter *freespace_iter, struct closure *cl) { struct bch_fs *c = trans->c; u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); - if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s)) + if (!may_alloc_bucket(c, req, POS(req->ca->dev_idx, b))) return NULL; u8 gen; - int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true); + int ret = bch2_check_discard_freespace_key_async(trans, freespace_iter, &gen); if (ret < 0) return ERR_PTR(ret); if (ret) return NULL; - return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl); + return __try_alloc_bucket(c, req, b, gen, cl); } /* @@ -285,17 +280,15 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc */ static noinline struct open_bucket * 
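/*
 * Illustrative sketch, not part of the patch: with
 * guard(spinlock)(&c->freelist_lock) the error paths in
 * __try_alloc_bucket() can simply return, the unlock being implicit.
 * Note also that bare error constants become bch_err_throw(c, ...),
 * which (judging by its usage here) ties the thrown error to the
 * filesystem instance rather than returning an anonymous errcode:
 *
 *	guard(spinlock)(&c->freelist_lock);
 *	if (unlikely(c->open_buckets_nr_free <=
 *		     bch2_open_buckets_reserved(req->watermark)))
 *		return ERR_PTR(bch_err_throw(c, open_buckets_empty));
 *	(lock is released automatically on every return path)
 */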
bch2_bucket_alloc_early(struct btree_trans *trans, - struct bch_dev *ca, - enum bch_watermark watermark, - struct bucket_alloc_state *s, + struct alloc_request *req, struct closure *cl) { struct bch_fs *c = trans->c; - struct btree_iter iter, citer; - struct bkey_s_c k, ck; + struct bch_dev *ca = req->ca; + struct bkey_s_c k; struct open_bucket *ob = NULL; u64 first_bucket = ca->mi.first_bucket; - u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; + u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap]; u64 alloc_start = max(first_bucket, *dev_alloc_cursor); u64 alloc_cursor = alloc_start; int ret; @@ -312,24 +305,24 @@ bch2_bucket_alloc_early(struct btree_trans *trans, again: for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), BTREE_ITER_slots, k, ret) { - u64 bucket = k.k->p.offset; + u64 bucket = alloc_cursor = k.k->p.offset; if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) break; - if (s->btree_bitmap != BTREE_BITMAP_ANY && - s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + if (req->btree_bitmap != BTREE_BITMAP_ANY && + req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { - if (s->btree_bitmap == BTREE_BITMAP_YES && + if (req->btree_bitmap == BTREE_BITMAP_YES && bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) break; bucket = sector_to_bucket(ca, round_up(bucket_to_sector(ca, bucket) + 1, 1ULL << ca->mi.btree_bitmap_shift)); - bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket)); - s->buckets_seen++; - s->skipped_mi_btree_bitmap++; + bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket)); + req->counters.buckets_seen++; + req->counters.skipped_mi_btree_bitmap++; continue; } @@ -339,30 +332,23 @@ bch2_bucket_alloc_early(struct btree_trans *trans, continue; /* now check the cached key to serialize concurrent allocs of the bucket */ - ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached); + CLASS(btree_iter, citer)(trans, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached|BTREE_ITER_nopreserve); + struct bkey_s_c ck = bch2_btree_iter_peek_slot(&citer); ret = bkey_err(ck); if (ret) break; a = bch2_alloc_to_v4(ck, &a_convert); - if (a->data_type != BCH_DATA_free) - goto next; - - s->buckets_seen++; + if (a->data_type == BCH_DATA_free) { + req->counters.buckets_seen++; - ob = may_alloc_bucket(c, k.k->p, s) - ? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen, - watermark, s, cl) - : NULL; -next: - bch2_set_btree_iter_dontneed(trans, &citer); - bch2_trans_iter_exit(trans, &citer); - if (ob) - break; + ob = may_alloc_bucket(c, req, k.k->p) + ? 
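/*
 * Illustrative sketch, not part of the patch: btree iterators get the same
 * RAII treatment via CLASS(btree_iter, ...) - the declaration initializes
 * the iterator and scope exit replaces bch2_trans_iter_exit(), which is
 * why the explicit exit and dontneed calls disappear from this function:
 *
 *	CLASS(btree_iter, citer)(trans, BTREE_ID_alloc, k.k->p,
 *				 BTREE_ITER_cached|BTREE_ITER_nopreserve);
 *	struct bkey_s_c ck = bch2_btree_iter_peek_slot(&citer);
 *	ret = bkey_err(ck);
 *	(citer is torn down automatically at end of scope)
 */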
__try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl) + : NULL; + if (ob) + break; + } } - bch2_trans_iter_exit(trans, &iter); - - alloc_cursor = iter.pos.offset; if (!ob && ret) ob = ERR_PTR(ret); @@ -378,15 +364,13 @@ bch2_bucket_alloc_early(struct btree_trans *trans, } static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, - struct bch_dev *ca, - enum bch_watermark watermark, - struct bucket_alloc_state *s, - struct closure *cl) + struct alloc_request *req, + struct closure *cl) { - struct btree_iter iter; + struct bch_dev *ca = req->ca; struct bkey_s_c k; struct open_bucket *ob = NULL; - u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; + u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap]; u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); u64 alloc_cursor = alloc_start; int ret; @@ -402,13 +386,13 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, iter.k.size = iter.k.p.offset - iter.pos.offset; while (iter.k.size) { - s->buckets_seen++; + req->counters.buckets_seen++; u64 bucket = iter.pos.offset & ~(~0ULL << 56); - if (s->btree_bitmap != BTREE_BITMAP_ANY && - s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + if (req->btree_bitmap != BTREE_BITMAP_ANY && + req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { - if (s->btree_bitmap == BTREE_BITMAP_YES && + if (req->btree_bitmap == BTREE_BITMAP_YES && bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) goto fail; @@ -417,16 +401,16 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, 1ULL << ca->mi.btree_bitmap_shift)); alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56)); - bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor)); - s->skipped_mi_btree_bitmap++; + bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor)); + req->counters.skipped_mi_btree_bitmap++; goto next; } - ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl); + ob = try_alloc_bucket(trans, req, &iter, cl); if (ob) { if (!IS_ERR(ob)) *dev_alloc_cursor = iter.pos.offset; - bch2_set_btree_iter_dontneed(trans, &iter); + bch2_set_btree_iter_dontneed(&iter); break; } @@ -438,7 +422,6 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, break; } fail: - bch2_trans_iter_exit(trans, &iter); BUG_ON(ob && ret); @@ -453,33 +436,30 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, return ob; } -static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, - enum bch_watermark watermark, - enum bch_data_type data_type, +static noinline void trace_bucket_alloc2(struct bch_fs *c, + struct alloc_request *req, struct closure *cl, - struct bch_dev_usage *usage, - struct bucket_alloc_state *s, struct open_bucket *ob) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); printbuf_tabstop_push(&buf, 24); - prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx); - prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]); - prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]); + prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx); + prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]); + prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]); prt_printf(&buf, "blocking\t%u\n", cl != NULL); - prt_printf(&buf, "free\t%llu\n", usage->buckets[BCH_DATA_free]); - prt_printf(&buf, 
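/*
 * Illustrative note, not part of the patch: struct printbuf gains the same
 * scope-based wrapper - CLASS(printbuf, buf)() replaces the PRINTBUF
 * initializer plus the trailing printbuf_exit(&buf), so the buffer is
 * freed on every return path:
 *
 *	CLASS(printbuf, buf)();
 *	prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx);
 *	...
 *	(no printbuf_exit() needed)
 */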
"avail\t%llu\n", dev_buckets_free(ca, *usage, watermark)); - prt_printf(&buf, "copygc_wait\t%lu/%lli\n", + prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]); + prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark)); + prt_printf(&buf, "copygc_wait\t%llu/%lli\n", bch2_copygc_wait_amount(c), c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)); - prt_printf(&buf, "seen\t%llu\n", s->buckets_seen); - prt_printf(&buf, "open\t%llu\n", s->skipped_open); - prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit); - prt_printf(&buf, "nocow\t%llu\n", s->skipped_nocow); - prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse); - prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap); + prt_printf(&buf, "seen\t%llu\n", req->counters.buckets_seen); + prt_printf(&buf, "open\t%llu\n", req->counters.skipped_open); + prt_printf(&buf, "need journal commit\t%llu\n", req->counters.skipped_need_journal_commit); + prt_printf(&buf, "nocow\t%llu\n", req->counters.skipped_nocow); + prt_printf(&buf, "nouse\t%llu\n", req->counters.skipped_nouse); + prt_printf(&buf, "mi_btree_bitmap\t%llu\n", req->counters.skipped_mi_btree_bitmap); if (!IS_ERR(ob)) { prt_printf(&buf, "allocated\t%llu\n", ob->bucket); @@ -488,54 +468,48 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob))); trace_bucket_alloc_fail(c, buf.buf); } - - printbuf_exit(&buf); } /** * bch2_bucket_alloc_trans - allocate a single bucket from a specific device * @trans: transaction object - * @ca: device to allocate from - * @watermark: how important is this allocation? - * @data_type: BCH_DATA_journal, btree, user... + * @req: state for the entire allocation * @cl: if not NULL, closure to be used to wait if buckets not available * @nowait: if true, do not wait for buckets to become available - * @usage: for secondarily also returning the current device usage * * Returns: an open_bucket on success, or an ERR_PTR() on failure. 
*/ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, - struct bch_dev *ca, - enum bch_watermark watermark, - enum bch_data_type data_type, - struct closure *cl, - bool nowait, - struct bch_dev_usage *usage) + struct alloc_request *req, + struct closure *cl, + bool nowait) { struct bch_fs *c = trans->c; + struct bch_dev *ca = req->ca; struct open_bucket *ob = NULL; bool freespace = READ_ONCE(ca->mi.freespace_initialized); u64 avail; - struct bucket_alloc_state s = { - .btree_bitmap = data_type == BCH_DATA_btree, - }; bool waiting = nowait; + + req->btree_bitmap = req->data_type == BCH_DATA_btree; + memset(&req->counters, 0, sizeof(req->counters)); again: - bch2_dev_usage_read_fast(ca, usage); - avail = dev_buckets_free(ca, *usage, watermark); + bch2_dev_usage_read_fast(ca, &req->usage); + avail = dev_buckets_free(ca, req->usage, req->watermark); - if (usage->buckets[BCH_DATA_need_discard] > avail) + if (req->usage.buckets[BCH_DATA_need_discard] > + min(avail, ca->mi.nbuckets >> 7)) bch2_dev_do_discards(ca); - if (usage->buckets[BCH_DATA_need_gc_gens] > avail) + if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail) bch2_gc_gens_async(c); - if (should_invalidate_buckets(ca, *usage)) + if (should_invalidate_buckets(ca, req->usage)) bch2_dev_do_invalidates(ca); if (!avail) { - if (watermark > BCH_WATERMARK_normal && - c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) + if (req->watermark > BCH_WATERMARK_normal && + c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) goto alloc; if (cl && !waiting) { @@ -546,7 +520,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, track_event_change(&c->times[BCH_TIME_blocked_allocate], true); - ob = ERR_PTR(-BCH_ERR_freelist_empty); + ob = ERR_PTR(bch_err_throw(c, freelist_empty)); goto err; } @@ -554,27 +528,27 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, closure_wake_up(&c->freelist_wait); alloc: ob = likely(freespace) - ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl) - : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl); + ? bch2_bucket_alloc_freelist(trans, req, cl) + : bch2_bucket_alloc_early(trans, req, cl); - if (s.need_journal_commit * 2 > avail) + if (req->counters.need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); - if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) { - s.btree_bitmap = BTREE_BITMAP_ANY; + if (!ob && req->btree_bitmap != BTREE_BITMAP_ANY) { + req->btree_bitmap = BTREE_BITMAP_ANY; goto alloc; } - if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { + if (!ob && freespace && c->recovery.pass_done < BCH_RECOVERY_PASS_check_alloc_info) { freespace = false; goto alloc; } err: if (!ob) - ob = ERR_PTR(-BCH_ERR_no_buckets_found); + ob = ERR_PTR(bch_err_throw(c, no_buckets_found)); if (!IS_ERR(ob)) - ob->data_type = data_type; + ob->data_type = req->data_type; if (!IS_ERR(ob)) count_event(c, bucket_alloc); @@ -584,7 +558,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, if (!IS_ERR(ob) ? 
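/*
 * Worked example, not part of the patch: the discard kick above now
 * triggers when the need_discard backlog exceeds min(avail, nbuckets >> 7),
 * i.e. roughly 1/128th of the device, apparently so background discards
 * keep up even while plenty of buckets remain free. On a 2^20-bucket
 * device with 600000 buckets available:
 *
 *	min(600000, 1048576 >> 7) == min(600000, 8192) == 8192
 *
 * so discards start once ~8k buckets await discard, instead of only when
 * the backlog outgrows the entire free pool.
 */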
trace_bucket_alloc_enabled() : trace_bucket_alloc_fail_enabled()) - trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob); + trace_bucket_alloc2(c, req, cl, ob); return ob; } @@ -594,12 +568,16 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, enum bch_data_type data_type, struct closure *cl) { - struct bch_dev_usage usage; struct open_bucket *ob; + struct alloc_request req = { + .watermark = watermark, + .data_type = data_type, + .ca = ca, + }; - bch2_trans_do(c, - PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark, - data_type, cl, false, &usage))); + CLASS(btree_trans, trans)(c); + lockrestart_do(trans, + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false))); return ob; } @@ -611,18 +589,18 @@ static int __dev_stripe_cmp(struct dev_stripe_state *stripe, #define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) -struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, - struct dev_stripe_state *stripe, - struct bch_devs_mask *devs) +void bch2_dev_alloc_list(struct bch_fs *c, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs, + struct dev_alloc_list *ret) { - struct dev_alloc_list ret = { .nr = 0 }; - unsigned i; + ret->nr = 0; + unsigned i; for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) - ret.data[ret.nr++] = i; + ret->data[ret->nr++] = i; - bubble_sort(ret.data, ret.nr, dev_stripe_cmp); - return ret; + bubble_sort(ret->data, ret->nr, dev_stripe_cmp); } static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */ @@ -693,64 +671,53 @@ void bch2_dev_stripe_increment(struct bch_dev *ca, } static int add_new_bucket(struct bch_fs *c, - struct open_buckets *ptrs, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - struct open_bucket *ob) + struct alloc_request *req, + struct open_bucket *ob) { unsigned durability = ob_dev(c, ob)->mi.durability; - BUG_ON(*nr_effective >= nr_replicas); + BUG_ON(req->nr_effective >= req->nr_replicas); - __clear_bit(ob->dev, devs_may_alloc->d); - *nr_effective += durability; - *have_cache |= !durability; + __clear_bit(ob->dev, req->devs_may_alloc.d); + req->nr_effective += durability; + req->have_cache |= !durability; - ob_push(c, ptrs, ob); + ob_push(c, &req->ptrs, ob); - if (*nr_effective >= nr_replicas) + if (req->nr_effective >= req->nr_replicas) return 1; if (ob->ec) return 1; return 0; } -int bch2_bucket_alloc_set_trans(struct btree_trans *trans, - struct open_buckets *ptrs, - struct dev_stripe_state *stripe, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_write_flags flags, - enum bch_data_type data_type, - enum bch_watermark watermark, - struct closure *cl) +inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans, + struct alloc_request *req, + struct dev_stripe_state *stripe, + struct closure *cl) { struct bch_fs *c = trans->c; - int ret = -BCH_ERR_insufficient_devices; + int ret = 0; + + BUG_ON(req->nr_effective >= req->nr_replicas); - BUG_ON(*nr_effective >= nr_replicas); + bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc, &req->devs_sorted); - struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); - darray_for_each(devs_sorted, i) { - struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i); - if (!ca) + darray_for_each(req->devs_sorted, i) { + req->ca = bch2_dev_tryget_noerror(c, *i); + if (!req->ca) continue; - if (!ca->mi.durability && *have_cache) { - 
bch2_dev_put(ca); + if (!req->ca->mi.durability && req->have_cache) { + bch2_dev_put(req->ca); continue; } - struct bch_dev_usage usage; - struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, - cl, flags & BCH_WRITE_alloc_nowait, &usage); + struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl, + req->flags & BCH_WRITE_alloc_nowait); if (!IS_ERR(ob)) - bch2_dev_stripe_increment_inlined(ca, stripe, &usage); - bch2_dev_put(ca); + bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage); + bch2_dev_put(req->ca); if (IS_ERR(ob)) { ret = PTR_ERR(ob); @@ -759,15 +726,16 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - if (add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob)) { - ret = 0; + ret = add_new_bucket(c, req, ob); + if (ret) break; - } } - return ret; + if (ret == 1) + return 0; + if (ret) + return ret; + return bch_err_throw(c, insufficient_devices); } /* Allocate from stripes: */ @@ -779,35 +747,28 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, */ static int bucket_alloc_from_stripe(struct btree_trans *trans, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - u16 target, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *cl) + struct alloc_request *req, + struct closure *cl) { struct bch_fs *c = trans->c; int ret = 0; - if (nr_replicas < 2) + if (req->nr_replicas < 2) return 0; - if (ec_open_bucket(c, ptrs)) + if (ec_open_bucket(c, &req->ptrs)) return 0; struct ec_stripe_head *h = - bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); + bch2_ec_stripe_head_get(trans, req, 0, cl); if (IS_ERR(h)) return PTR_ERR(h); if (!h) return 0; - struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); - darray_for_each(devs_sorted, i) + bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc, &req->devs_sorted); + + darray_for_each(req->devs_sorted, i) for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { if (!h->s->blocks[ec_idx]) continue; @@ -818,9 +779,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, ob->ec = h->s; ec_stripe_new_get(h->s, STRIPE_REF_io); - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); + ret = add_new_bucket(c, req, ob); goto out; } } @@ -832,86 +791,67 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, /* Sector allocator */ static bool want_bucket(struct bch_fs *c, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - bool *have_cache, bool ec, + struct alloc_request *req, struct open_bucket *ob) { struct bch_dev *ca = ob_dev(c, ob); - if (!test_bit(ob->dev, devs_may_alloc->d)) + if (!test_bit(ob->dev, req->devs_may_alloc.d)) return false; - if (ob->data_type != wp->data_type) + if (ob->data_type != req->wp->data_type) return false; if (!ca->mi.durability && - (wp->data_type == BCH_DATA_btree || ec || *have_cache)) + (req->wp->data_type == BCH_DATA_btree || req->ec || req->have_cache)) return false; - if (ec != (ob->ec != NULL)) + if (req->ec != (ob->ec != NULL)) return false; return true; } static int bucket_alloc_set_writepoint(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - bool ec) + struct alloc_request *req) { - 
struct open_buckets ptrs_skip = { .nr = 0 }; struct open_bucket *ob; unsigned i; int ret = 0; - open_bucket_for_each(c, &wp->ptrs, ob, i) { - if (!ret && want_bucket(c, wp, devs_may_alloc, - have_cache, ec, ob)) - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); + req->scratch_ptrs.nr = 0; + + open_bucket_for_each(c, &req->wp->ptrs, ob, i) { + if (!ret && want_bucket(c, req, ob)) + ret = add_new_bucket(c, req, ob); else - ob_push(c, &ptrs_skip, ob); + ob_push(c, &req->scratch_ptrs, ob); } - wp->ptrs = ptrs_skip; + req->wp->ptrs = req->scratch_ptrs; return ret; } static int bucket_alloc_set_partial(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, bool ec, - enum bch_watermark watermark) + struct alloc_request *req) { - int i, ret = 0; - if (!c->open_buckets_partial_nr) return 0; - spin_lock(&c->freelist_lock); + guard(spinlock)(&c->freelist_lock); if (!c->open_buckets_partial_nr) - goto unlock; + return 0; - for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) { + for (int i = c->open_buckets_partial_nr - 1; i >= 0; --i) { struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; - if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) { + if (want_bucket(c, req, ob)) { struct bch_dev *ca = ob_dev(c, ob); - struct bch_dev_usage usage; u64 avail; - bch2_dev_usage_read_fast(ca, &usage); - avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets; + bch2_dev_usage_read_fast(ca, &req->usage); + avail = dev_buckets_free(ca, req->usage, req->watermark) + ca->nr_partial_buckets; if (!avail) continue; @@ -920,78 +860,54 @@ static int bucket_alloc_set_partial(struct bch_fs *c, i); ob->on_partial_list = false; - rcu_read_lock(); - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; - rcu_read_unlock(); + scoped_guard(rcu) + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); + int ret = add_new_bucket(c, req, ob); if (ret) - break; + return ret; } } -unlock: - spin_unlock(&c->freelist_lock); - return ret; + + return 0; } static int __open_bucket_add_buckets(struct btree_trans *trans, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_list *devs_have, - u16 target, - bool erasure_code, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *_cl) + struct alloc_request *req, + struct closure *_cl) { struct bch_fs *c = trans->c; - struct bch_devs_mask devs; struct open_bucket *ob; struct closure *cl = NULL; unsigned i; int ret; - devs = target_rw_devs(c, wp->data_type, target); + req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target); /* Don't allocate from devices we already have pointers to: */ - darray_for_each(*devs_have, i) - __clear_bit(*i, devs.d); + darray_for_each(*req->devs_have, i) + __clear_bit(*i, req->devs_may_alloc.d); - open_bucket_for_each(c, ptrs, ob, i) - __clear_bit(ob->dev, devs.d); + open_bucket_for_each(c, &req->ptrs, ob, i) + __clear_bit(ob->dev, req->devs_may_alloc.d); - ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs, - nr_replicas, nr_effective, - have_cache, erasure_code); + ret = bucket_alloc_set_writepoint(c, req); if (ret) return ret; - ret = bucket_alloc_set_partial(c, ptrs, wp, &devs, - nr_replicas, nr_effective, - have_cache, erasure_code, 
watermark); + ret = bucket_alloc_set_partial(c, req); if (ret) return ret; - if (erasure_code) { - ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs, - target, - nr_replicas, nr_effective, - have_cache, - watermark, flags, _cl); + if (req->ec) { + ret = bucket_alloc_from_stripe(trans, req, _cl); } else { retry_blocking: /* * Try nonblocking first, so that if one device is full we'll try from * other devices: */ - ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, - nr_replicas, nr_effective, have_cache, - flags, wp->data_type, watermark, cl); + ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && @@ -1005,38 +921,27 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, } static int open_bucket_add_buckets(struct btree_trans *trans, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_list *devs_have, - u16 target, - unsigned erasure_code, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *cl) + struct alloc_request *req, + struct closure *cl) { int ret; - if (erasure_code && !ec_open_bucket(trans->c, ptrs)) { - ret = __open_bucket_add_buckets(trans, ptrs, wp, - devs_have, target, erasure_code, - nr_replicas, nr_effective, have_cache, - watermark, flags, cl); + if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) { + ret = __open_bucket_add_buckets(trans, req, cl); if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || bch2_err_matches(ret, BCH_ERR_operation_blocked) || bch2_err_matches(ret, BCH_ERR_freelist_empty) || bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) return ret; - if (*nr_effective >= nr_replicas) + if (req->nr_effective >= req->nr_replicas) return 0; } - ret = __open_bucket_add_buckets(trans, ptrs, wp, - devs_have, target, false, - nr_replicas, nr_effective, have_cache, - watermark, flags, cl); + bool ec = false; + swap(ec, req->ec); + ret = __open_bucket_add_buckets(trans, req, cl); + swap(ec, req->ec); + return ret < 0 ? 
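/*
 * Illustrative note, not part of the patch: the non-EC fallback in
 * open_bucket_add_buckets() no longer takes a separate erasure_code
 * argument; it temporarily clears req->ec with swap() and restores it
 * afterwards, so the request struct stays the single source of truth:
 *
 *	bool ec = false;
 *	swap(ec, req->ec);	req->ec is now false, ec holds the old value
 *	ret = __open_bucket_add_buckets(trans, req, cl);
 *	swap(ec, req->ec);	restore req->ec
 */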
ret : 0; } @@ -1060,23 +965,18 @@ static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c, return ob->ec != NULL; } else if (ca) { bool drop = ob->dev == ca->dev_idx; - struct open_bucket *ob2; - unsigned i; if (!drop && ob->ec) { - unsigned nr_blocks; - - mutex_lock(&ob->ec->lock); - nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks; + guard(mutex)(&ob->ec->lock); + unsigned nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks; - for (i = 0; i < nr_blocks; i++) { + for (unsigned i = 0; i < nr_blocks; i++) { if (!ob->ec->blocks[i]) continue; - ob2 = c->open_buckets + ob->ec->blocks[i]; + struct open_bucket *ob2 = c->open_buckets + ob->ec->blocks[i]; drop |= ob2->dev == ca->dev_idx; } - mutex_unlock(&ob->ec->lock); } return drop; @@ -1092,14 +992,13 @@ static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, struct open_bucket *ob; unsigned i; - mutex_lock(&wp->lock); + guard(mutex)(&wp->lock); open_bucket_for_each(c, &wp->ptrs, ob, i) if (should_drop_bucket(ob, c, ca, ec)) bch2_open_bucket_put(c, ob); else ob_push(c, &ptrs, ob); wp->ptrs = ptrs; - mutex_unlock(&wp->lock); } void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca, @@ -1115,40 +1014,37 @@ void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca, bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point); bch2_writepoint_stop(c, ca, ec, &c->btree_write_point); - mutex_lock(&c->btree_reserve_cache_lock); - while (c->btree_reserve_cache_nr) { - struct btree_alloc *a = - &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; + scoped_guard(mutex, &c->btree_reserve_cache_lock) + while (c->btree_reserve_cache_nr) { + struct btree_alloc *a = + &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - bch2_open_buckets_put(c, &a->ob); - } - mutex_unlock(&c->btree_reserve_cache_lock); + bch2_open_buckets_put(c, &a->ob); + } - spin_lock(&c->freelist_lock); i = 0; - while (i < c->open_buckets_partial_nr) { - struct open_bucket *ob = - c->open_buckets + c->open_buckets_partial[i]; - - if (should_drop_bucket(ob, c, ca, ec)) { - --c->open_buckets_partial_nr; - swap(c->open_buckets_partial[i], - c->open_buckets_partial[c->open_buckets_partial_nr]); - - ob->on_partial_list = false; - - rcu_read_lock(); - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; - rcu_read_unlock(); - - spin_unlock(&c->freelist_lock); - bch2_open_bucket_put(c, ob); - spin_lock(&c->freelist_lock); - } else { - i++; + scoped_guard(spinlock, &c->freelist_lock) + while (i < c->open_buckets_partial_nr) { + struct open_bucket *ob = + c->open_buckets + c->open_buckets_partial[i]; + + if (should_drop_bucket(ob, c, ca, ec)) { + --c->open_buckets_partial_nr; + swap(c->open_buckets_partial[i], + c->open_buckets_partial[c->open_buckets_partial_nr]); + + ob->on_partial_list = false; + + scoped_guard(rcu) + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; + + spin_unlock(&c->freelist_lock); + bch2_open_bucket_put(c, ob); + spin_lock(&c->freelist_lock); + } else { + i++; + } } - } - spin_unlock(&c->freelist_lock); bch2_ec_stop_dev(c, ca); } @@ -1167,14 +1063,11 @@ static struct write_point *__writepoint_find(struct hlist_head *head, { struct write_point *wp; - rcu_read_lock(); + guard(rcu)(); hlist_for_each_entry_rcu(wp, head, node) if (wp->write_point == write_point) - goto out; - wp = NULL; -out: - rcu_read_unlock(); - return wp; + return wp; + return NULL; } static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) @@ -1185,7 +1078,7 @@ static inline bool too_many_writepoints(struct bch_fs 
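/*
 * Illustrative sketch, not part of the patch: guard(rcu)() turns the
 * goto-based exit in __writepoint_find() into plain returns, since
 * rcu_read_unlock() now runs automatically when the function exits:
 *
 *	guard(rcu)();
 *	hlist_for_each_entry_rcu(wp, head, node)
 *		if (wp->write_point == write_point)
 *			return wp;
 *	return NULL;
 */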
*c, unsigned factor) return stranded * factor > free; } -static bool try_increase_writepoints(struct bch_fs *c) +static noinline bool try_increase_writepoints(struct bch_fs *c) { struct write_point *wp; @@ -1198,29 +1091,24 @@ static bool try_increase_writepoints(struct bch_fs *c) return true; } -static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr) +static noinline bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr) { struct bch_fs *c = trans->c; struct write_point *wp; struct open_bucket *ob; unsigned i; - mutex_lock(&c->write_points_hash_lock); - if (c->write_points_nr < old_nr) { - mutex_unlock(&c->write_points_hash_lock); - return true; - } - - if (c->write_points_nr == 1 || - !too_many_writepoints(c, 8)) { - mutex_unlock(&c->write_points_hash_lock); - return false; - } + scoped_guard(mutex, &c->write_points_hash_lock) { + if (c->write_points_nr < old_nr) + return true; - wp = c->write_points + --c->write_points_nr; + if (c->write_points_nr == 1 || + !too_many_writepoints(c, 8)) + return false; - hlist_del_rcu(&wp->node); - mutex_unlock(&c->write_points_hash_lock); + wp = c->write_points + --c->write_points_nr; + hlist_del_rcu(&wp->node); + } bch2_trans_mutex_lock_norelock(trans, &wp->lock); open_bucket_for_each(c, &wp->ptrs, ob, i) @@ -1289,26 +1177,26 @@ static struct write_point *writepoint_find(struct btree_trans *trans, static noinline void deallocate_extra_replicas(struct bch_fs *c, - struct open_buckets *ptrs, - struct open_buckets *ptrs_no_use, - unsigned extra_replicas) + struct alloc_request *req) { - struct open_buckets ptrs2 = { 0 }; struct open_bucket *ob; + unsigned extra_replicas = req->nr_effective - req->nr_replicas; unsigned i; - open_bucket_for_each(c, ptrs, ob, i) { + req->scratch_ptrs.nr = 0; + + open_bucket_for_each(c, &req->ptrs, ob, i) { unsigned d = ob_dev(c, ob)->mi.durability; if (d && d <= extra_replicas) { extra_replicas -= d; - ob_push(c, ptrs_no_use, ob); + ob_push(c, &req->wp->ptrs, ob); } else { - ob_push(c, &ptrs2, ob); + ob_push(c, &req->scratch_ptrs, ob); } } - *ptrs = ptrs2; + req->ptrs = req->scratch_ptrs; } /* @@ -1327,51 +1215,53 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, struct write_point **wp_ret) { struct bch_fs *c = trans->c; - struct write_point *wp; struct open_bucket *ob; - struct open_buckets ptrs; - unsigned nr_effective, write_points_nr; - bool have_cache; - int ret; + unsigned write_points_nr; int i; + struct alloc_request *req = bch2_trans_kmalloc_nomemzero(trans, sizeof(*req)); + int ret = PTR_ERR_OR_ZERO(req); + if (unlikely(ret)) + return ret; + if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) erasure_code = false; + req->nr_replicas = nr_replicas; + req->target = target; + req->ec = erasure_code; + req->watermark = watermark; + req->flags = flags; + req->devs_have = devs_have; + BUG_ON(!nr_replicas || !nr_replicas_required); retry: - ptrs.nr = 0; - nr_effective = 0; - write_points_nr = c->write_points_nr; - have_cache = false; + req->ptrs.nr = 0; + req->nr_effective = 0; + req->have_cache = false; + write_points_nr = c->write_points_nr; - *wp_ret = wp = writepoint_find(trans, write_point.v); + *wp_ret = req->wp = writepoint_find(trans, write_point.v); + + req->data_type = req->wp->data_type; ret = bch2_trans_relock(trans); if (ret) goto err; /* metadata may not allocate on cache devices: */ - if (wp->data_type != BCH_DATA_user) - have_cache = true; + if (req->data_type != BCH_DATA_user) + req->have_cache = true; if (target && !(flags & 
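/*
 * Illustrative note, not part of the patch: in
 * bch2_alloc_sectors_start_trans() the allocation state that used to be
 * spread across roughly a dozen parameters and stack locals now lives in
 * one struct alloc_request, allocated from transaction memory with
 * bch2_trans_kmalloc_nomemzero(), presumably because the struct - two
 * bch_devs_mask bitmaps, a dev_alloc_list, scratch open_buckets - is too
 * large to keep on the stack of a deep call chain:
 *
 *	struct alloc_request *req =
 *		bch2_trans_kmalloc_nomemzero(trans, sizeof(*req));
 *	int ret = PTR_ERR_OR_ZERO(req);
 *	if (unlikely(ret))
 *		return ret;
 */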
BCH_WRITE_only_specified_devs)) { - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, NULL); + ret = open_bucket_add_buckets(trans, req, NULL); if (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto alloc_done; /* Don't retry from all devices if we're out of open buckets: */ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) { - int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, cl); + int ret2 = open_bucket_add_buckets(trans, req, cl); if (!ret2 || bch2_err_matches(ret2, BCH_ERR_transaction_restart) || bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) { @@ -1384,45 +1274,38 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, * Only try to allocate cache (durability = 0 devices) from the * specified target: */ - have_cache = true; + req->have_cache = true; + req->target = 0; - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - 0, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, cl); + ret = open_bucket_add_buckets(trans, req, cl); } else { - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, cl); + ret = open_bucket_add_buckets(trans, req, cl); } alloc_done: - BUG_ON(!ret && nr_effective < nr_replicas); + BUG_ON(!ret && req->nr_effective < req->nr_replicas); - if (erasure_code && !ec_open_bucket(c, &ptrs)) + if (erasure_code && !ec_open_bucket(c, &req->ptrs)) pr_debug("failed to get ec bucket: ret %u", ret); if (ret == -BCH_ERR_insufficient_devices && - nr_effective >= nr_replicas_required) + req->nr_effective >= nr_replicas_required) ret = 0; if (ret) goto err; - if (nr_effective > nr_replicas) - deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas); + if (req->nr_effective > req->nr_replicas) + deallocate_extra_replicas(c, req); /* Free buckets we didn't use: */ - open_bucket_for_each(c, &wp->ptrs, ob, i) + open_bucket_for_each(c, &req->wp->ptrs, ob, i) open_bucket_free_unused(c, ob); - wp->ptrs = ptrs; + req->wp->ptrs = req->ptrs; - wp->sectors_free = UINT_MAX; + req->wp->sectors_free = UINT_MAX; - open_bucket_for_each(c, &wp->ptrs, ob, i) { + open_bucket_for_each(c, &req->wp->ptrs, ob, i) { /* * Ensure proper write alignment - either due to misaligned * bucket sizes (from buggy bcachefs-tools), or writes that mix @@ -1436,58 +1319,44 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, ob->sectors_free = max_t(int, 0, ob->sectors_free - align); - wp->sectors_free = min(wp->sectors_free, ob->sectors_free); + req->wp->sectors_free = min(req->wp->sectors_free, ob->sectors_free); } - wp->sectors_free = rounddown(wp->sectors_free, block_sectors(c)); + req->wp->sectors_free = rounddown(req->wp->sectors_free, block_sectors(c)); /* Did alignment use up space in an open_bucket? 
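 * (editor's sketch, not part of the original comment: if rounding
 * wp->sectors_free down to the filesystem block size leaves zero, the
 * write point is released and the whole allocation is retried from the
 * top -
 *
 *	req->wp->sectors_free = rounddown(req->wp->sectors_free,
 *					  block_sectors(c));
 *	if (unlikely(!req->wp->sectors_free)) {
 *		bch2_alloc_sectors_done(c, req->wp);
 *		goto retry;
 *	}
 * )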
*/ - if (unlikely(!wp->sectors_free)) { - bch2_alloc_sectors_done(c, wp); + if (unlikely(!req->wp->sectors_free)) { + bch2_alloc_sectors_done(c, req->wp); goto retry; } - BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); + BUG_ON(!req->wp->sectors_free || req->wp->sectors_free == UINT_MAX); return 0; err: - open_bucket_for_each(c, &wp->ptrs, ob, i) - if (ptrs.nr < ARRAY_SIZE(ptrs.v)) - ob_push(c, &ptrs, ob); + open_bucket_for_each(c, &req->wp->ptrs, ob, i) + if (req->ptrs.nr < ARRAY_SIZE(req->ptrs.v)) + ob_push(c, &req->ptrs, ob); else open_bucket_free_unused(c, ob); - wp->ptrs = ptrs; + req->wp->ptrs = req->ptrs; - mutex_unlock(&wp->lock); + mutex_unlock(&req->wp->lock); if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && try_decrease_writepoints(trans, write_points_nr)) goto retry; if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) - ret = -BCH_ERR_bucket_alloc_blocked; + ret = bch_err_throw(c, bucket_alloc_blocked); if (cl && !(flags & BCH_WRITE_alloc_nowait) && bch2_err_matches(ret, BCH_ERR_freelist_empty)) - ret = -BCH_ERR_bucket_alloc_blocked; + ret = bch_err_throw(c, bucket_alloc_blocked); return ret; } -struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) -{ - struct bch_dev *ca = ob_dev(c, ob); - - return (struct bch_extent_ptr) { - .type = 1 << BCH_EXTENT_ENTRY_ptr, - .gen = ob->gen, - .dev = ob->dev, - .offset = bucket_to_sector(ca, ob->bucket) + - ca->mi.bucket_size - - ob->sectors_free, - }; -} - void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, struct bkey_i *k, unsigned sectors, bool cached) @@ -1573,35 +1442,25 @@ void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct ope void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c, struct bch_dev *ca) { - struct open_bucket *ob; - - out->atomic++; + guard(printbuf_atomic)(out); - for (ob = c->open_buckets; + for (struct open_bucket *ob = c->open_buckets; ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { - spin_lock(&ob->lock); + guard(spinlock)(&ob->lock); if (ob->valid && (!ca || ob->dev == ca->dev_idx)) bch2_open_bucket_to_text(out, c, ob); - spin_unlock(&ob->lock); } - - --out->atomic; } void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c) { - unsigned i; - - out->atomic++; - spin_lock(&c->freelist_lock); + guard(printbuf_atomic)(out); + guard(spinlock)(&c->freelist_lock); - for (i = 0; i < c->open_buckets_partial_nr; i++) + for (unsigned i = 0; i < c->open_buckets_partial_nr; i++) bch2_open_bucket_to_text(out, c, c->open_buckets + c->open_buckets_partial[i]); - - spin_unlock(&c->freelist_lock); - --out->atomic; } static const char * const bch2_write_point_states[] = { @@ -1617,6 +1476,8 @@ static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob; unsigned i; + guard(mutex)(&wp->lock); + prt_printf(out, "%lu: ", wp->write_point); prt_human_readable_u64(out, wp->sectors_allocated << 9); @@ -1720,7 +1581,7 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) static noinline void bch2_print_allocator_stuck(struct bch_fs *c) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); prt_printf(&buf, "Allocator stuck? 
Waited for %u seconds\n", c->opts.allocator_stuck_timeout); @@ -1731,12 +1592,17 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) printbuf_indent_sub(&buf, 2); prt_newline(&buf); - for_each_online_member(c, ca) { - prt_printf(&buf, "Dev %u:\n", ca->dev_idx); - printbuf_indent_add(&buf, 2); - bch2_dev_alloc_debug_to_text(&buf, ca); - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); + bch2_printbuf_make_room(&buf, 4096); + + scoped_guard(rcu) { + guard(printbuf_atomic)(&buf); + for_each_online_member_rcu(c, ca) { + prt_printf(&buf, "Dev %u:\n", ca->dev_idx); + printbuf_indent_add(&buf, 2); + bch2_dev_alloc_debug_to_text(&buf, ca); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + } } prt_printf(&buf, "Copygc debug:\n"); @@ -1750,8 +1616,7 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) bch2_journal_debug_to_text(&buf, &c->journal); printbuf_indent_sub(&buf, 2); - bch2_print_string_as_lines(KERN_ERR, buf.buf); - printbuf_exit(&buf); + bch2_print_str(c, KERN_ERR, buf.buf); } static inline unsigned allocator_wait_timeout(struct bch_fs *c) diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 4c1e33cf57c0..02aef66859c3 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -3,8 +3,10 @@ #define _BCACHEFS_ALLOC_FOREGROUND_H #include "bcachefs.h" +#include "buckets.h" #include "alloc_types.h" #include "extents.h" +#include "io_write_types.h" #include "sb-members.h" #include @@ -23,9 +25,57 @@ struct dev_alloc_list { u8 data[BCH_SB_MEMBERS_MAX]; }; -struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, - struct dev_stripe_state *, - struct bch_devs_mask *); +struct alloc_request { + unsigned nr_replicas; + unsigned target; + bool ec; + enum bch_watermark watermark; + enum bch_write_flags flags; + enum bch_data_type data_type; + struct bch_devs_list *devs_have; + struct write_point *wp; + + /* These fields are used primarily by open_bucket_add_buckets */ + struct open_buckets ptrs; + unsigned nr_effective; /* sum of @ptrs durability */ + bool have_cache; /* have we allocated from a 0 durability dev */ + struct bch_devs_mask devs_may_alloc; + + /* bch2_bucket_alloc_set_trans(): */ + struct dev_alloc_list devs_sorted; + struct bch_dev_usage usage; + + /* bch2_bucket_alloc_trans(): */ + struct bch_dev *ca; + + enum { + BTREE_BITMAP_NO, + BTREE_BITMAP_YES, + BTREE_BITMAP_ANY, + } btree_bitmap; + + struct { + u64 buckets_seen; + u64 skipped_open; + u64 skipped_need_journal_commit; + u64 need_journal_commit; + u64 skipped_nocow; + u64 skipped_nouse; + u64 skipped_mi_btree_bitmap; + } counters; + + unsigned scratch_nr_replicas; + unsigned scratch_nr_effective; + bool scratch_have_cache; + enum bch_data_type scratch_data_type; + struct open_buckets scratch_ptrs; + struct bch_devs_mask scratch_devs_may_alloc; +}; + +void bch2_dev_alloc_list(struct bch_fs *, + struct dev_stripe_state *, + struct bch_devs_mask *, + struct dev_alloc_list *); void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) @@ -160,24 +210,16 @@ static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucke static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket) { - bool ret; - if (bch2_bucket_is_open(c, dev, bucket)) return true; - spin_lock(&c->freelist_lock); - ret = bch2_bucket_is_open(c, dev, bucket); - spin_unlock(&c->freelist_lock); - - return ret; + 
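The struct alloc_request introduced above threads what used to be a dozen loose parameters (target, watermark, nr_replicas, the devs_have list, plus the ptrs/nr_effective/have_cache output state) through the whole allocation path as one object, which is what lets the retry paths in bch2_alloc_sectors_start_trans() simply tweak a field and call open_bucket_add_buckets(trans, req, cl) again. A minimal sketch of the calling convention; the actual request setup inside bch2_alloc_sectors_start_trans() is outside the hunks shown here:

        struct alloc_request req = {
                .nr_replicas    = nr_replicas,
                .target         = target,
                .ec             = erasure_code,
                .watermark      = watermark,
                .flags          = flags,
                .devs_have      = devs_have,
                .wp             = wp,
        };

        int ret = open_bucket_add_buckets(trans, &req, cl);
        if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
                /* retry paths just adjust the request in place, e.g.: */
                req.have_cache = true;
                req.target     = 0;
                ret = open_bucket_add_buckets(trans, &req, cl);
        }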
guard(spinlock)(&c->freelist_lock); + return bch2_bucket_is_open(c, dev, bucket); } enum bch_write_flags; -int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *, - struct dev_stripe_state *, struct bch_devs_mask *, - unsigned, unsigned *, bool *, enum bch_write_flags, - enum bch_data_type, enum bch_watermark, - struct closure *); +int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *, + struct dev_stripe_state *, struct closure *); int bch2_alloc_sectors_start_trans(struct btree_trans *, unsigned, unsigned, @@ -189,7 +231,19 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *, struct closure *, struct write_point **); -struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); +static inline struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = ob_dev(c, ob); + + return (struct bch_extent_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_ptr, + .gen = ob->gen, + .dev = ob->dev, + .offset = bucket_to_sector(ca, ob->bucket) + + ca->mi.bucket_size - + ob->sectors_free, + }; +} /* * Append pointers to the space we just allocated to @k, and mark @sectors space diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 8f79f46c2a78..e7becdf22cba 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -8,22 +8,6 @@ #include "clock_types.h" #include "fifo.h" -struct bucket_alloc_state { - enum { - BTREE_BITMAP_NO, - BTREE_BITMAP_YES, - BTREE_BITMAP_ANY, - } btree_bitmap; - - u64 buckets_seen; - u64 skipped_open; - u64 skipped_need_journal_commit; - u64 need_journal_commit; - u64 skipped_nocow; - u64 skipped_nouse; - u64 skipped_mi_btree_bitmap; -}; - #define BCH_WATERMARKS() \ x(stripe) \ x(normal) \ diff --git a/fs/bcachefs/async_objs.c b/fs/bcachefs/async_objs.c new file mode 100644 index 000000000000..ad04e5f0f056 --- /dev/null +++ b/fs/bcachefs/async_objs.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Async obj debugging: keep asynchronous objects on (very fast) lists, make + * them visibile in debugfs: + */ + +#include "bcachefs.h" +#include "async_objs.h" +#include "btree_io.h" +#include "debug.h" +#include "io_read.h" +#include "io_write.h" + +#include + +static void promote_obj_to_text(struct printbuf *out, + struct bch_fs *c, + void *obj) +{ + bch2_promote_op_to_text(out, c, obj); +} + +static void rbio_obj_to_text(struct printbuf *out, + struct bch_fs *c, + void *obj) +{ + bch2_read_bio_to_text(out, c, obj); +} + +static void write_op_obj_to_text(struct printbuf *out, + struct bch_fs *c, + void *obj) +{ + bch2_write_op_to_text(out, obj); +} + +static void btree_read_bio_obj_to_text(struct printbuf *out, + struct bch_fs *c, + void *obj) +{ + struct btree_read_bio *rbio = obj; + bch2_btree_read_bio_to_text(out, rbio); +} + +static void btree_write_bio_obj_to_text(struct printbuf *out, + struct bch_fs *c, + void *obj) +{ + struct btree_write_bio *wbio = obj; + bch2_bio_to_text(out, &wbio->wbio.bio); +} + +static int bch2_async_obj_list_open(struct inode *inode, struct file *file) +{ + struct async_obj_list *list = inode->i_private; + struct dump_iter *i; + + i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); + if (!i) + return -ENOMEM; + + file->private_data = i; + i->from = POS_MIN; + i->iter = 0; + i->c = container_of(list, struct bch_fs, async_objs[list->idx]); + i->list = list; + i->buf = PRINTBUF; + return 0; +} + +static ssize_t bch2_async_obj_list_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct 
dump_iter *i = file->private_data; + struct async_obj_list *list = i->list; + ssize_t ret = 0; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + struct genradix_iter iter; + void *obj; + fast_list_for_each_from(&list->list, iter, obj, i->iter) { + ret = bch2_debugfs_flush_buf(i); + if (ret) + return ret; + + if (!i->size) + break; + + list->obj_to_text(&i->buf, i->c, obj); + i->iter = iter.pos; + } + + if (i->buf.allocation_failure) + ret = -ENOMEM; + + if (!ret) + ret = bch2_debugfs_flush_buf(i); + + return ret ?: i->ret; +} + +static const struct file_operations async_obj_ops = { + .owner = THIS_MODULE, + .open = bch2_async_obj_list_open, + .release = bch2_dump_release, + .read = bch2_async_obj_list_read, +}; + +void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) +{ + c->async_obj_dir = debugfs_create_dir("async_objs", c->fs_debug_dir); + +#define x(n) debugfs_create_file(#n, 0400, c->async_obj_dir, \ + &c->async_objs[BCH_ASYNC_OBJ_LIST_##n], &async_obj_ops); + BCH_ASYNC_OBJ_LISTS() +#undef x +} + +void bch2_fs_async_obj_exit(struct bch_fs *c) +{ + for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) + fast_list_exit(&c->async_objs[i].list); +} + +int bch2_fs_async_obj_init(struct bch_fs *c) +{ + for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) { + if (fast_list_init(&c->async_objs[i].list)) + return -BCH_ERR_ENOMEM_async_obj_init; + c->async_objs[i].idx = i; + } + +#define x(n) c->async_objs[BCH_ASYNC_OBJ_LIST_##n].obj_to_text = n##_obj_to_text; + BCH_ASYNC_OBJ_LISTS() +#undef x + + return 0; +} diff --git a/fs/bcachefs/async_objs.h b/fs/bcachefs/async_objs.h new file mode 100644 index 000000000000..451db4c51fb2 --- /dev/null +++ b/fs/bcachefs/async_objs.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ASYNC_OBJS_H +#define _BCACHEFS_ASYNC_OBJS_H + +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS +static inline void __async_object_list_del(struct fast_list *head, unsigned *idx) +{ + fast_list_remove(head, *idx); + *idx = 0; +} + +static inline int __async_object_list_add(struct fast_list *head, void *obj, unsigned *idx) +{ + int ret = fast_list_add(head, obj); + *idx = ret > 0 ? ret : 0; + return ret < 0 ? 
ret : 0; +} + +#define async_object_list_del(_c, _list, idx) \ + __async_object_list_del(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, &idx) + +#define async_object_list_add(_c, _list, obj, idx) \ + __async_object_list_add(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, obj, idx) + +void bch2_fs_async_obj_debugfs_init(struct bch_fs *); +void bch2_fs_async_obj_exit(struct bch_fs *); +int bch2_fs_async_obj_init(struct bch_fs *); + +#else /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */ + +#define async_object_list_del(_c, _n, idx) do {} while (0) + +static inline int __async_object_list_add(void) +{ + return 0; +} +#define async_object_list_add(_c, _n, obj, idx) __async_object_list_add() + +static inline void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) {} +static inline void bch2_fs_async_obj_exit(struct bch_fs *c) {} +static inline int bch2_fs_async_obj_init(struct bch_fs *c) { return 0; } + +#endif /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */ + +#endif /* _BCACHEFS_ASYNC_OBJS_H */ diff --git a/fs/bcachefs/async_objs_types.h b/fs/bcachefs/async_objs_types.h new file mode 100644 index 000000000000..ed262c874ad0 --- /dev/null +++ b/fs/bcachefs/async_objs_types.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ASYNC_OBJS_TYPES_H +#define _BCACHEFS_ASYNC_OBJS_TYPES_H + +#define BCH_ASYNC_OBJ_LISTS() \ + x(promote) \ + x(rbio) \ + x(write_op) \ + x(btree_read_bio) \ + x(btree_write_bio) + +enum bch_async_obj_lists { +#define x(n) BCH_ASYNC_OBJ_LIST_##n, + BCH_ASYNC_OBJ_LISTS() +#undef x + BCH_ASYNC_OBJ_NR +}; + +struct async_obj_list { + struct fast_list list; + void (*obj_to_text)(struct printbuf *, struct bch_fs *, void *); + unsigned idx; +}; + +#endif /* _BCACHEFS_ASYNC_OBJS_TYPES_H */ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 5f195d2280a4..45d3db41225a 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -12,9 +12,20 @@ #include "disk_accounting.h" #include "error.h" #include "progress.h" +#include "recovery_passes.h" #include +static int bch2_bucket_bitmap_set(struct bch_dev *, struct bucket_bitmap *, u64); + +static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) +{ + return (struct bbpos) { + .btree = bp.btree_id, + .pos = bp.pos, + }; +} + int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { @@ -37,17 +48,19 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); - if (ca) { - u32 bucket_offset; - struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); - rcu_read_unlock(); + struct bch_dev *ca; + u32 bucket_offset; + struct bpos bucket; + scoped_guard(rcu) { + ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); + if (ca) + bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); + } + + if (ca) prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset); - } else { - rcu_read_unlock(); + else prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); - } bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); prt_str(out, " data_type="); @@ -95,7 +108,9 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, bool insert) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); + bool will_check = c->recovery.passes_to_run & 
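async_objs gives each in-flight object class a fast_list plus a debugfs file; __async_object_list_add() above shows the fast_list contract: fast_list_add() hands back a positive slot index (kept for the later fast_list_remove()) or a negative error, and the !CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS stubs compile the whole mechanism away. A sketch of the intended caller pattern — the list_idx field on the object is an assumption for illustration, not taken from these hunks:

        /* on submission: make the read bio enumerable via debugfs */
        int ret = async_object_list_add(c, rbio, rbio, &rbio->list_idx);
        if (ret)
                return ret;

        /* ... IO in flight, visible under async_objs/rbio ... */

        /* on completion: */
        async_object_list_del(c, rbio, rbio->list_idx);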
+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); int ret = 0; if (insert) { @@ -110,9 +125,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, prt_printf(&buf, "for "); bch2_bkey_val_to_text(&buf, c, orig_k); - - bch_err(c, "%s", buf.buf); - } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { + } else if (!will_check) { prt_printf(&buf, "backpointer not found when deleting\n"); printbuf_indent_add(&buf, 2); @@ -128,12 +141,11 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, orig_k); } - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers && - __bch2_inconsistent_error(c, &buf)) - ret = -BCH_ERR_erofs_unfixed_errors; + if (!will_check && __bch2_inconsistent_error(c, &buf)) + ret = bch_err_throw(c, erofs_unfixed_errors); - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); + if (buf.buf) + bch_err(c, "%s", buf.buf); return ret; } @@ -142,12 +154,10 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, struct bkey_i_backpointer *bp, bool insert) { - struct btree_iter bp_iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, - bp->k.p, - BTREE_ITER_intent| - BTREE_ITER_slots| - BTREE_ITER_with_updates); + CLASS(btree_iter, bp_iter)(trans, BTREE_ID_backpointers, bp->k.p, + BTREE_ITER_intent| + BTREE_ITER_with_updates); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&bp_iter); int ret = bkey_err(k); if (ret) return ret; @@ -158,7 +168,7 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) { ret = backpointer_mod_err(trans, orig_k, bp, k, insert); if (ret) - goto err; + return ret; } if (!insert) { @@ -166,15 +176,12 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, set_bkey_val_u64s(&bp->k, 0); } - ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0); -err: - bch2_trans_iter_exit(trans, &bp_iter); - return ret; + return bch2_trans_update(trans, &bp_iter, &bp->k_i, 0); } static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) { - return (likely(!bch2_backpointers_no_use_write_buffer) + return (!static_branch_unlikely(&bch2_backpointers_no_use_write_buffer) ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); @@ -184,7 +191,7 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, struct bkey_s_c visiting_k, struct bkey_buf *last_flushed) { - return likely(!bch2_backpointers_no_use_write_buffer) + return !static_branch_unlikely(&bch2_backpointers_no_use_write_buffer) ? 
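The bch2_bkey_get_iter() callers are converted to a CLASS(btree_iter, ...) declaration followed by an explicit peek; the iterator's cleanup handler runs at scope exit, which is why the error paths above turn from "goto err" plus a bch2_trans_iter_exit() label into plain returns. (Note bch2_trans_iter_exit() also loses its trans argument in this series.) The shape of the conversion:

        /* before: manual pairing, every error path must reach the label */
        struct btree_iter bp_iter;
        struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter,
                                BTREE_ID_backpointers, bp->k.p,
                                BTREE_ITER_intent|BTREE_ITER_with_updates);
        ...
err:
        bch2_trans_iter_exit(trans, &bp_iter);

        /* after: iterator destroyed automatically, early returns are safe */
        CLASS(btree_iter, bp_iter)(trans, BTREE_ID_backpointers, bp->k.p,
                                   BTREE_ITER_intent|BTREE_ITER_with_updates);
        struct bkey_s_c k = bch2_btree_iter_peek_slot(&bp_iter);
        int ret = bkey_err(k);
        if (ret)
                return ret;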
bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed) : 0; } @@ -196,7 +203,7 @@ static int backpointer_target_not_found(struct btree_trans *trans, bool commit) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = 0; /* @@ -232,7 +239,7 @@ static int backpointer_target_not_found(struct btree_trans *trans, "%s", buf.buf)) { ret = bch2_backpointer_del(trans, bp.k->p); if (ret || !commit) - goto out; + return ret; /* * Normally, on transaction commit from inside a transaction, @@ -250,9 +257,7 @@ static int backpointer_target_not_found(struct btree_trans *trans, */ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } -out: fsck_err: - printbuf_exit(&buf); return ret; } @@ -272,7 +277,7 @@ static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans, 0, bp.v->level - 1, 0); - struct btree *b = bch2_btree_iter_peek_node(trans, iter); + struct btree *b = bch2_btree_iter_peek_node(iter); if (IS_ERR_OR_NULL(b)) goto err; @@ -283,14 +288,14 @@ static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans, return b; if (btree_node_will_make_reachable(b)) { - b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); + b = ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node)); } else { int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed, commit); b = ret ? ERR_PTR(ret) : NULL; } err: - bch2_trans_iter_exit(trans, iter); + bch2_trans_iter_exit(iter); return b; } @@ -312,9 +317,9 @@ static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans, 0, bp.v->level, iter_flags); - struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); if (bkey_err(k)) { - bch2_trans_iter_exit(trans, iter); + bch2_trans_iter_exit(iter); return k; } @@ -334,7 +339,7 @@ static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans, extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) return k; - bch2_trans_iter_exit(trans, iter); + bch2_trans_iter_exit(iter); if (!bp.v->level) { int ret = backpointer_target_not_found(trans, bp, k, last_flushed, commit); @@ -374,44 +379,42 @@ static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, st return 0; struct bch_fs *c = trans->c; - struct btree_iter alloc_iter = {}; - struct bkey_s_c alloc_k; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = 0; struct bpos bucket; if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); if (ret) - goto out; + return ret; if (fsck_err(trans, backpointer_to_missing_device, "backpointer for missing device:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_backpointer_del(trans, k.k->p); - goto out; + return ret; } - alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0); - ret = bkey_err(alloc_k); - if (ret) - goto out; - - if (alloc_k.k->type != KEY_TYPE_alloc_v4) { - ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); + { + CLASS(btree_iter, alloc_iter)(trans, BTREE_ID_alloc, bucket, 0); + struct bkey_s_c alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); + ret = bkey_err(alloc_k); if (ret) - goto out; + return ret; - if (fsck_err(trans, backpointer_to_missing_alloc, - "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", - alloc_iter.pos.inode, alloc_iter.pos.offset, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_backpointer_del(trans, 
k.k->p); + if (alloc_k.k->type != KEY_TYPE_alloc_v4) { + ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); + if (ret) + return ret; + + if (fsck_err(trans, backpointer_to_missing_alloc, + "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", + alloc_iter.pos.inode, alloc_iter.pos.offset, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_backpointer_del(trans, k.k->p); + } } -out: fsck_err: - bch2_trans_iter_exit(trans, &alloc_iter); - printbuf_exit(&buf); return ret; } @@ -422,14 +425,13 @@ int bch2_check_btree_backpointers(struct bch_fs *c) bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, + CLASS(btree_trans, trans)(c); + int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, 0, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed))); + bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed)); bch2_bkey_buf_exit(&last_flushed, c); - bch_err_fn(c, ret); return ret; } @@ -459,7 +461,7 @@ static int check_extent_checksum(struct btree_trans *trans, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); void *data_buf = NULL; struct bio *bio = NULL; size_t bytes; @@ -478,7 +480,8 @@ static int check_extent_checksum(struct btree_trans *trans, bytes = p.crc.compressed_size << 9; - struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ); + struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ, + BCH_DEV_READ_REF_check_extent_checksums); if (!ca) return false; @@ -515,8 +518,8 @@ static int check_extent_checksum(struct btree_trans *trans, if (bio) bio_put(bio); kvfree(data_buf); - percpu_ref_put(&ca->io_ref[READ]); - printbuf_exit(&buf); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_check_extent_checksums); return ret; } @@ -527,32 +530,30 @@ static int check_bp_exists(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_iter other_extent_iter = {}; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); if (bpos_lt(bp->k.p, s->bp_start) || bpos_gt(bp->k.p, s->bp_end)) return 0; - struct btree_iter bp_iter; - struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0); + CLASS(btree_iter, bp_iter)(trans, BTREE_ID_backpointers, bp->k.p, 0); + struct bkey_s_c bp_k = bch2_btree_iter_peek_slot(&bp_iter); int ret = bkey_err(bp_k); if (ret) - goto err; + return ret; if (bp_k.k->type != KEY_TYPE_backpointer || memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) { ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed); if (ret) - goto err; + return ret; goto check_existing_bp; } out: err: fsck_err: - bch2_trans_iter_exit(trans, &other_extent_iter); - bch2_trans_iter_exit(trans, &bp_iter); - printbuf_exit(&buf); + bch2_trans_iter_exit(&other_extent_iter); return ret; check_existing_bp: /* Do we have a backpointer for a different extent? 
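Device I/O references switch from a bare percpu_ref to enumerated_ref, where every get/put carries an enum naming the code path holding the reference — presumably so debug builds (see ENUMERATED_REF_DEBUG, defined later in bcachefs.h) can attribute a reference that is never dropped to a specific site, instead of reporting an anonymous percpu_ref that won't drain. The pattern, as used in check_extent_checksum() above:

        struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ,
                                BCH_DEV_READ_REF_check_extent_checksums);
        if (!ca)
                return false;

        /* ... read and checksum the extent ... */

        enumerated_ref_put(&ca->io_ref[READ],
                           BCH_DEV_READ_REF_check_extent_checksums);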
*/ @@ -579,6 +580,7 @@ static int check_bp_exists(struct btree_trans *trans, bkey_for_each_ptr(other_extent_ptrs, ptr) if (ptr->dev == bp->k.p.inode && dev_ptr_stale_rcu(ca, ptr)) { + rcu_read_unlock(); ret = drop_dev_and_update(trans, other_bp.v->btree_id, other_extent, bp->k.p.inode); if (ret) @@ -636,7 +638,7 @@ static int check_bp_exists(struct btree_trans *trans, prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, other_extent); bch_err(c, "%s", buf.buf); - ret = -BCH_ERR_fsck_repair_unimplemented; + ret = bch_err_throw(c, fsck_repair_unimplemented); goto err; missing: printbuf_reset(&buf); @@ -667,24 +669,32 @@ static int check_extent_to_backpointers(struct btree_trans *trans, if (p.ptr.dev == BCH_SB_MEMBER_INVALID) continue; - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); - bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches); - bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty); + bool empty; + { + /* scoped_guard() is a loop, so it breaks continue */ + guard(rcu)(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); + if (!ca) + continue; - bool stale = p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)); - rcu_read_unlock(); + if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr)) + continue; - if ((check || empty) && !stale) { - struct bkey_i_backpointer bp; - bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); + u64 b = PTR_BUCKET_NR(ca, &p.ptr); + if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b)) + continue; - int ret = check - ? check_bp_exists(trans, s, &bp, k) - : bch2_bucket_backpointer_mod(trans, k, &bp, true); - if (ret) - return ret; + empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b); } + + struct bkey_i_backpointer bp; + bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); + + int ret = !empty + ? check_bp_exists(trans, s, &bp, k) + : bch2_bucket_backpointer_mod(trans, k, &bp, true); + if (ret) + return ret; } return 0; @@ -703,13 +713,13 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, retry: bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0); - b = bch2_btree_iter_peek_node(trans, &iter); + b = bch2_btree_iter_peek_node(&iter); ret = PTR_ERR_OR_ZERO(b); if (ret) goto err; if (b != btree_node_root(c, b)) { - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); goto retry; } @@ -718,18 +728,10 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, k = bkey_i_to_s_c(&b->key); ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k); err: - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } -static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) -{ - return (struct bbpos) { - .btree = bp.btree_id, - .pos = bp.pos, - }; -} - static u64 mem_may_pin_bytes(struct bch_fs *c) { struct sysinfo i; @@ -788,6 +790,13 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, return ret; } +static inline int bch2_fs_going_ro(struct bch_fs *c) +{ + return test_bit(BCH_FS_going_ro, &c->flags) + ? 
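The "scoped_guard() is a loop, so it breaks continue" comment above deserves unpacking: scoped_guard() expands to a for-loop so that the guard's lifetime covers the attached block, which means a continue inside it terminates the guard's own loop rather than advancing the enclosing bkey_for_each_ptr(). Hence the bare block with guard(rcu)(): the RCU read lock is still released when control leaves the block (cleanup handlers run on continue too), but continue now applies to the outer loop as intended:

        {
                guard(rcu)();
                struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
                if (!ca)
                        continue;       /* continues bkey_for_each_ptr() */
                ...
        }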
-EROFS + : 0; +} + static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, struct extents_to_bp_state *s) { @@ -815,9 +824,11 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, ret = for_each_btree_key_continue(trans, iter, 0, k, ({ bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); + bch2_fs_going_ro(c) ?: check_extent_to_backpointers(trans, s, btree_id, level, k) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); })); + bch2_trans_iter_exit(&iter); if (ret) return ret; @@ -854,6 +865,7 @@ static int data_type_to_alloc_counter(enum bch_data_type t) static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos); static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, + bool *had_mismatch, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; @@ -861,6 +873,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); bool need_commit = false; + *had_mismatch = false; + if (a->data_type == BCH_DATA_sb || a->data_type == BCH_DATA_journal || a->data_type == BCH_DATA_parity) @@ -869,11 +883,10 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b u32 sectors[ALLOC_SECTORS_NR]; memset(sectors, 0, sizeof(sectors)); - struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p); + CLASS(bch2_dev_bucket_tryget_noerror, ca)(trans->c, alloc_k.k->p); if (!ca) return 0; - struct btree_iter iter; struct bkey_s_c bp_k; int ret = 0; for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers, @@ -889,7 +902,7 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b bp.v->pad)) { ret = bch2_backpointer_del(trans, bp_k.k->p); if (ret) - break; + return ret; need_commit = true; continue; @@ -904,14 +917,13 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b sectors[alloc_counter] += bp.v->bucket_len; }; - bch2_trans_iter_exit(trans, &iter); if (ret) - goto err; + return ret; if (need_commit) { ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret) - goto err; + return ret; } if (sectors[ALLOC_dirty] != a->dirty_sectors || @@ -920,27 +932,31 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) { ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); if (ret) - goto err; + return ret; } if (sectors[ALLOC_dirty] > a->dirty_sectors || sectors[ALLOC_cached] > a->cached_sectors || sectors[ALLOC_stripe] > a->stripe_sectors) { - ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: - -BCH_ERR_transaction_restart_nested; - goto err; + return check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: + bch_err_throw(c, transaction_restart_nested); } - if (!sectors[ALLOC_dirty] && - !sectors[ALLOC_stripe] && - !sectors[ALLOC_cached]) - __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty); - else - __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches); + bool empty = (sectors[ALLOC_dirty] + + sectors[ALLOC_stripe] + + sectors[ALLOC_cached]) == 0; + + ret = bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_mismatch, + alloc_k.k->p.offset) ?: + (empty + ? 
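bch2_fs_going_ro() is spliced into the per-key work using the GNU `a ?: b` extension, which evaluates to a when a is nonzero and to b otherwise; chaining int-returning helpers this way short-circuits at the first nonzero error code, so a filesystem headed read-only aborts the scan instead of queueing more work:

        ret = bch2_fs_going_ro(c) ?:    /* -EROFS once BCH_FS_going_ro is set */
              check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
              bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);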
bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_empty, + alloc_k.k->p.offset) + : 0); + + *had_mismatch = true; } -err: - bch2_dev_put(ca); - return ret; + + return 0; } static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) @@ -949,7 +965,7 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) case KEY_TYPE_btree_ptr_v2: { bool ret = false; - rcu_read_lock(); + guard(rcu)(); struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key; while (pos.inode <= k.k->p.inode) { if (pos.inode >= c->sb.nr_devices) @@ -960,8 +976,14 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) goto next; struct bpos bucket = bp_pos_to_bucket(ca, pos); - bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches, - ca->mi.nbuckets, bucket.offset); + u64 next = ca->mi.nbuckets; + + unsigned long *bitmap = READ_ONCE(ca->bucket_backpointer_mismatch.buckets); + if (bitmap) + next = min_t(u64, next, + find_next_bit(bitmap, ca->mi.nbuckets, bucket.offset)); + + bucket.offset = next; if (bucket.offset == ca->mi.nbuckets) goto next; @@ -971,7 +993,6 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) next: pos = SPOS(pos.inode + 1, 0, 0); } - rcu_read_unlock(); return ret; } @@ -987,7 +1008,7 @@ static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k, { struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0); - struct btree *b = bch2_btree_iter_peek_node(trans, &iter); + struct btree *b = bch2_btree_iter_peek_node(&iter); int ret = PTR_ERR_OR_ZERO(b); if (ret) goto err; @@ -995,7 +1016,7 @@ static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k, if (b) bch2_node_pin(trans->c, b); err: - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } @@ -1031,6 +1052,7 @@ static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans, bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1); })); + bch2_trans_iter_exit(&iter); if (ret) return ret; @@ -1060,6 +1082,7 @@ static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans, ret; })); + bch2_trans_iter_exit(&iter); if (ret) return ret; @@ -1070,29 +1093,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) { int ret = 0; - /* - * Can't allow devices to come/go/resize while we have bucket bitmaps - * allocated - */ - down_read(&c->state_lock); - - for_each_member_device(c, ca) { - BUG_ON(ca->bucket_backpointer_mismatches); - ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), - sizeof(unsigned long), - GFP_KERNEL); - ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), - sizeof(unsigned long), - GFP_KERNEL); - if (!ca->bucket_backpointer_mismatches || - !ca->bucket_backpointer_empty) { - bch2_dev_put(ca); - ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; - goto err_free_bitmaps; - } - } - - struct btree_trans *trans = bch2_trans_get(c); + CLASS(btree_trans, trans)(c); struct extents_to_bp_state s = { .bp_start = POS_MIN }; bch2_bkey_buf_init(&s.last_flushed); @@ -1100,23 +1101,24 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_prefetch, k, ({ - check_bucket_backpointer_mismatch(trans, k, &s.last_flushed); + bool had_mismatch; + bch2_fs_going_ro(c) ?: + check_bucket_backpointer_mismatch(trans, k, &had_mismatch, &s.last_flushed); })); if (ret) goto err; - u64 nr_buckets 
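The same scope-based cleanup conversion applies to transactions: bch2_trans_get()/bch2_trans_put() pairs become a CLASS(btree_trans, trans)(c) declaration, and the put runs automatically when the transaction goes out of scope:

        /* before */
        struct btree_trans *trans = bch2_trans_get(c);
        ...
        bch2_trans_put(trans);

        /* after */
        CLASS(btree_trans, trans)(c);
        ...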
= 0, nr_mismatches = 0, nr_empty = 0; + u64 nr_buckets = 0, nr_mismatches = 0; for_each_member_device(c, ca) { nr_buckets += ca->mi.nbuckets; - nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets); - nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets); + nr_mismatches += ca->bucket_backpointer_mismatch.nr; } - if (!nr_mismatches && !nr_empty) + if (!nr_mismatches) goto err; bch_info(c, "scanning for missing backpointers in %llu/%llu buckets", - nr_mismatches + nr_empty, nr_buckets); + nr_mismatches, nr_buckets); while (1) { ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end); @@ -1130,7 +1132,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) if (!bpos_eq(s.bp_start, POS_MIN) || !bpos_eq(s.bp_end, SPOS_MAX)) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); prt_str(&buf, "check_extents_to_backpointers(): "); bch2_bpos_to_text(&buf, s.bp_start); @@ -1138,7 +1140,6 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) bch2_bpos_to_text(&buf, s.bp_end); bch_verbose(c, "%s", buf.buf); - printbuf_exit(&buf); } ret = bch2_check_extents_to_backpointers_pass(trans, &s); @@ -1147,23 +1148,63 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) s.bp_start = bpos_successor(s.bp_end); } + + for_each_member_device(c, ca) { + bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch); + bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty); + } err: - bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); bch2_btree_cache_unpin(c); -err_free_bitmaps: - for_each_member_device(c, ca) { - kvfree(ca->bucket_backpointer_empty); - ca->bucket_backpointer_empty = NULL; - kvfree(ca->bucket_backpointer_mismatches); - ca->bucket_backpointer_mismatches = NULL; - } - - up_read(&c->state_lock); - bch_err_fn(c, ret); return ret; } +static int check_bucket_backpointer_pos_mismatch(struct btree_trans *trans, + struct bpos bucket, + bool *had_mismatch, + struct bkey_buf *last_flushed) +{ + CLASS(btree_iter, alloc_iter)(trans, BTREE_ID_alloc, bucket, BTREE_ITER_cached); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&alloc_iter); + int ret = bkey_err(k); + if (ret) + return ret; + + return check_bucket_backpointer_mismatch(trans, k, had_mismatch, last_flushed); +} + +int bch2_check_bucket_backpointer_mismatch(struct btree_trans *trans, + struct bch_dev *ca, u64 bucket, + bool copygc, + struct bkey_buf *last_flushed) +{ + struct bch_fs *c = trans->c; + bool had_mismatch; + int ret = lockrestart_do(trans, + check_bucket_backpointer_pos_mismatch(trans, POS(ca->dev_idx, bucket), + &had_mismatch, last_flushed)); + if (ret || !had_mismatch) + return ret; + + u64 nr = ca->bucket_backpointer_mismatch.nr; + u64 allowed = copygc ? ca->mi.nbuckets >> 7 : 0; + + CLASS(printbuf, buf)(); + __bch2_log_msg_start(ca->name, &buf); + + prt_printf(&buf, "Detected missing backpointers in bucket %llu, now have %llu/%llu with missing\n", + bucket, nr, ca->mi.nbuckets); + + bch2_run_explicit_recovery_pass(c, &buf, + BCH_RECOVERY_PASS_check_extents_to_backpointers, + nr < allowed ? 
RUN_RECOVERY_PASS_ratelimit : 0); + + bch2_print_str(c, KERN_ERR, buf.buf); + return 0; +} + +/* backpointers -> extents */ + static int check_one_backpointer(struct btree_trans *trans, struct bbpos start, struct bbpos end, @@ -1188,7 +1229,7 @@ static int check_one_backpointer(struct btree_trans *trans, if (ret) return ret; - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } @@ -1235,7 +1276,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, int bch2_check_backpointers_to_extents(struct bch_fs *c) { - struct btree_trans *trans = bch2_trans_get(c); + CLASS(btree_trans, trans)(c); struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end; int ret; @@ -1255,7 +1296,7 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) if (bbpos_cmp(start, BBPOS_MIN) || bbpos_cmp(end, BBPOS_MAX)) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); prt_str(&buf, "check_backpointers_to_extents(): "); bch2_bbpos_to_text(&buf, start); @@ -1263,7 +1304,6 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) bch2_bbpos_to_text(&buf, end); bch_verbose(c, "%s", buf.buf); - printbuf_exit(&buf); } ret = bch2_check_backpointers_to_extents_pass(trans, start, end); @@ -1272,10 +1312,53 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) start = bbpos_successor(end); } - bch2_trans_put(trans); bch2_btree_cache_unpin(c); - - bch_err_fn(c, ret); return ret; } + +static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u64 bit) +{ + scoped_guard(mutex, &b->lock) { + if (!b->buckets) { + b->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), + sizeof(unsigned long), GFP_KERNEL); + if (!b->buckets) + return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap); + } + + b->nr += !__test_and_set_bit(bit, b->buckets); + } + + return 0; +} + +int bch2_bucket_bitmap_resize(struct bch_dev *ca, struct bucket_bitmap *b, + u64 old_size, u64 new_size) +{ + scoped_guard(mutex, &b->lock) { + if (!b->buckets) + return 0; + + unsigned long *n = kvcalloc(BITS_TO_LONGS(new_size), + sizeof(unsigned long), GFP_KERNEL); + if (!n) + return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap); + + memcpy(n, b->buckets, + BITS_TO_LONGS(min(old_size, new_size)) * sizeof(unsigned long)); + kvfree(b->buckets); + b->buckets = n; + } + + return 0; +} + +void bch2_bucket_bitmap_free(struct bucket_bitmap *b) +{ + mutex_lock(&b->lock); + kvfree(b->buckets); + b->buckets = NULL; + b->nr = 0; + mutex_unlock(&b->lock); +} diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 16575dbc5736..7e71afee1ac0 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -53,11 +53,10 @@ static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) { - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode); if (ca) *bucket = bp_pos_to_bucket(ca, bp_pos); - rcu_read_unlock(); return ca != NULL; } @@ -102,7 +101,7 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, struct bkey_i_backpointer *bp, bool insert) { - if (unlikely(bch2_backpointers_no_use_write_buffer)) + if (static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)) return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert); if (!insert) { @@ -182,8 +181,20 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, 
struct bkey_s_c_b struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, struct btree_iter *, struct bkey_buf *); +int bch2_check_bucket_backpointer_mismatch(struct btree_trans *, struct bch_dev *, u64, + bool, struct bkey_buf *); + int bch2_check_btree_backpointers(struct bch_fs *); int bch2_check_extents_to_backpointers(struct bch_fs *); int bch2_check_backpointers_to_extents(struct bch_fs *); +static inline bool bch2_bucket_bitmap_test(struct bucket_bitmap *b, u64 i) +{ + unsigned long *bitmap = READ_ONCE(b->buckets); + return bitmap && test_bit(i, bitmap); +} + +int bch2_bucket_bitmap_resize(struct bch_dev *, struct bucket_bitmap *, u64, u64); +void bch2_bucket_bitmap_free(struct bucket_bitmap *); + #endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 75f7408da173..cdf593c59922 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -183,6 +183,16 @@ #define pr_fmt(fmt) "%s() " fmt "\n", __func__ #endif +#ifdef CONFIG_BCACHEFS_DEBUG +#define ENUMERATED_REF_DEBUG +#endif + +#ifndef dynamic_fault +#define dynamic_fault(...) 0 +#endif + +#define race_fault(...) dynamic_fault("bcachefs:race") + #include #include #include @@ -209,24 +219,40 @@ #include "btree_journal_iter_types.h" #include "disk_accounting_types.h" #include "errcode.h" +#include "fast_list.h" #include "fifo.h" #include "nocow_locking_types.h" #include "opts.h" -#include "recovery_passes_types.h" #include "sb-errors_types.h" #include "seqmutex.h" +#include "snapshot_types.h" #include "time_stats.h" #include "util.h" -#ifdef CONFIG_BCACHEFS_DEBUG -#define BCH_WRITE_REF_DEBUG -#endif - -#ifndef dynamic_fault -#define dynamic_fault(...) 0 -#endif +#include "alloc_types.h" +#include "async_objs_types.h" +#include "btree_gc_types.h" +#include "btree_types.h" +#include "btree_node_scan_types.h" +#include "btree_write_buffer_types.h" +#include "buckets_types.h" +#include "buckets_waiting_for_journal_types.h" +#include "clock_types.h" +#include "disk_groups_types.h" +#include "ec_types.h" +#include "enumerated_ref_types.h" +#include "journal_types.h" +#include "keylist_types.h" +#include "quota_types.h" +#include "rebalance_types.h" +#include "recovery_passes_types.h" +#include "replicas_types.h" +#include "sb-members_types.h" +#include "subvolume_types.h" +#include "super_types.h" +#include "thread_with_file_types.h" -#define race_fault(...) dynamic_fault("bcachefs:race") +#include "trace.h" #define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]) @@ -269,7 +295,7 @@ do { \ #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") -void bch2_print_str(struct bch_fs *, const char *); +void bch2_print_str(struct bch_fs *, const char *, const char *); __printf(2, 3) void bch2_print_opts(struct bch_opts *, const char *, ...); @@ -293,19 +319,31 @@ do { \ bch2_print(_c, __VA_ARGS__); \ } while (0) -#define bch_info(c, fmt, ...) \ - bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_info_ratelimited(c, fmt, ...) \ - bch2_print_ratelimited(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_notice(c, fmt, ...) \ - bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_warn(c, fmt, ...) \ - bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) -#define bch_warn_ratelimited(c, fmt, ...) \ - bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) - -#define bch_err(c, fmt, ...) 
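The raw unsigned long * bitmaps previously hung off bch_dev become struct bucket_bitmap, with three properties worth noting from the implementation above: allocation is lazy (the first bch2_bucket_bitmap_set() kvcallocs the buckets array), mutation and resize take b->lock and maintain the popcount in b->nr, and bch2_bucket_bitmap_test() is lockless — its READ_ONCE() of the pointer pairs with the locked writers so a reader never dereferences a half-published bitmap. Typical usage, per the fsck code above:

        /* writer: allocates on first use, counts newly set bits */
        int ret = bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_mismatch, b);
        if (ret)        /* ENOMEM_backpointer_mismatches_bitmap */
                return ret;

        /* reader: no lock needed */
        if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b))
                ...;

        /* teardown once the pass is done with it */
        bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch);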
\ - bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch2_print_str_ratelimited(_c, ...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + \ + if (__ratelimit(&_rs)) \ + bch2_print_str(_c, __VA_ARGS__); \ +} while (0) + +#define bch_log(c, loglevel, fmt, ...) \ + bch2_print(c, loglevel bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_log_ratelimited(c, loglevel, fmt, ...) \ + bch2_print_ratelimited(c, loglevel bch2_fmt(c, fmt), ##__VA_ARGS__) + +#define bch_err(c, ...) bch_log(c, KERN_ERR, __VA_ARGS__) +#define bch_err_ratelimited(c, ...) bch_log_ratelimited(c, KERN_ERR, __VA_ARGS__) +#define bch_warn(c, ...) bch_log(c, KERN_WARNING, __VA_ARGS__) +#define bch_warn_ratelimited(c, ...) bch_log_ratelimited(c, KERN_WARNING, __VA_ARGS__) +#define bch_notice(c, ...) bch_log(c, KERN_NOTICE, __VA_ARGS__) +#define bch_info(c, ...) bch_log(c, KERN_INFO, __VA_ARGS__) +#define bch_info_ratelimited(c, ...) bch_log_ratelimited(c, KERN_INFO, __VA_ARGS__) +#define bch_verbose(c, ...) bch_log(c, KERN_DEBUG, __VA_ARGS__) +#define bch_verbose_ratelimited(c, ...) bch_log_ratelimited(c, KERN_DEBUG, __VA_ARGS__) + #define bch_err_dev(ca, fmt, ...) \ bch2_print(c, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) #define bch_err_dev_offset(ca, _offset, fmt, ...) \ @@ -315,8 +353,6 @@ do { \ #define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \ bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) -#define bch_err_ratelimited(c, fmt, ...) \ - bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err_dev_ratelimited(ca, fmt, ...) \ bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) #define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \ @@ -350,23 +386,13 @@ do { \ ##__VA_ARGS__, bch2_err_str(_ret)); \ } while (0) -#define bch_verbose(c, fmt, ...) \ -do { \ - if ((c)->opts.verbose) \ - bch_info(c, fmt, ##__VA_ARGS__); \ -} while (0) - -#define bch_verbose_ratelimited(c, fmt, ...) \ -do { \ - if ((c)->opts.verbose) \ - bch_info_ratelimited(c, fmt, ##__VA_ARGS__); \ -} while (0) +static inline int __bch2_err_trace(struct bch_fs *c, int err) +{ + trace_error_throw(c, err, _THIS_IP_); + return err; +} -#define pr_verbose_init(opts, fmt, ...) 
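The per-level logging helpers are rebuilt as one-line specializations of a single bch_log(c, loglevel, ...) base macro, and bch_verbose() becomes a plain KERN_DEBUG message instead of a runtime check of c->opts.verbose — runtime filtering presumably moves to the loglevel field added to bch_fs later in this patch:

        bch_log(c, KERN_ERR, "error %s", bch2_err_str(ret));  /* == bch_err() */
        bch_log(c, KERN_DEBUG, "journal replay done");        /* == bch_verbose() */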
\ -do { \ - if (opt_get(opts, verbose)) \ - pr_info(fmt, ##__VA_ARGS__); \ -} while (0) +#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err) /* Parameters that are useful for debugging, but should always be compiled in: */ #define BCH_DEBUG_PARAMS_ALWAYS() \ @@ -390,17 +416,20 @@ do { \ "compare them") \ BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \ "Don't use the write buffer for backpointers, enabling "\ - "extra runtime checks") - -/* Parameters that should only be compiled in debug mode: */ -#define BCH_DEBUG_PARAMS_DEBUG() \ - BCH_DEBUG_PARAM(expensive_debug_checks, \ - "Enables various runtime debugging checks that " \ - "significantly affect performance") \ + "extra runtime checks") \ + BCH_DEBUG_PARAM(debug_check_btree_locking, \ + "Enable additional asserts for btree locking") \ BCH_DEBUG_PARAM(debug_check_iterators, \ "Enables extra verification for btree iterators") \ + BCH_DEBUG_PARAM(debug_check_bset_lookups, \ + "Enables extra verification for bset lookups") \ BCH_DEBUG_PARAM(debug_check_btree_accounting, \ "Verify btree accounting for keys within a node") \ + BCH_DEBUG_PARAM(debug_check_bkey_unpack, \ + "Enables extra verification for bkey unpack") + +/* Parameters that should only be compiled in debug mode: */ +#define BCH_DEBUG_PARAMS_DEBUG() \ BCH_DEBUG_PARAM(journal_seq_verify, \ "Store the journal sequence number in the version " \ "number of every btree key, and verify that btree " \ @@ -427,15 +456,9 @@ do { \ #define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() #endif -#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; -BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - -#ifndef CONFIG_BCACHEFS_DEBUG -#define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name; -BCH_DEBUG_PARAMS_DEBUG() +#define BCH_DEBUG_PARAM(name, description) extern struct static_key_false bch2_##name; +BCH_DEBUG_PARAMS_ALL() #undef BCH_DEBUG_PARAM -#endif #define BCH_TIME_STATS() \ x(btree_node_mem_alloc) \ @@ -443,6 +466,7 @@ BCH_DEBUG_PARAMS_DEBUG() x(btree_node_compact) \ x(btree_node_merge) \ x(btree_node_sort) \ + x(btree_node_get) \ x(btree_node_read) \ x(btree_node_read_done) \ x(btree_node_write) \ @@ -450,6 +474,10 @@ BCH_DEBUG_PARAMS_DEBUG() x(btree_interior_update_total) \ x(btree_gc) \ x(data_write) \ + x(data_write_to_submit) \ + x(data_write_to_queue) \ + x(data_write_to_btree_update) \ + x(data_write_btree_update) \ x(data_read) \ x(data_promote) \ x(journal_flush_write) \ @@ -472,26 +500,6 @@ enum bch_time_stats { BCH_TIME_STAT_NR }; -#include "alloc_types.h" -#include "btree_gc_types.h" -#include "btree_types.h" -#include "btree_node_scan_types.h" -#include "btree_write_buffer_types.h" -#include "buckets_types.h" -#include "buckets_waiting_for_journal_types.h" -#include "clock_types.h" -#include "disk_groups_types.h" -#include "ec_types.h" -#include "journal_types.h" -#include "keylist_types.h" -#include "quota_types.h" -#include "rebalance_types.h" -#include "replicas_types.h" -#include "sb-members_types.h" -#include "subvolume_types.h" -#include "super_types.h" -#include "thread_with_file_types.h" - /* Number of nodes btree coalesce will try to coalesce at once */ #define GC_MERGE_NODES 4U @@ -514,6 +522,57 @@ struct discard_in_flight { u64 bucket:63; }; +#define BCH_DEV_READ_REFS() \ + x(bch2_online_devs) \ + x(trans_mark_dev_sbs) \ + x(read_fua_test) \ + x(sb_field_resize) \ + x(write_super) \ + x(journal_read) \ + x(fs_journal_alloc) \ + x(fs_resize_on_mount) \ + x(btree_node_read) \ + 
x(btree_node_read_all_replicas) \ + x(btree_node_scrub) \ + x(btree_node_write) \ + x(btree_node_scan) \ + x(btree_verify_replicas) \ + x(btree_node_ondisk_to_text) \ + x(io_read) \ + x(check_extent_checksums) \ + x(ec_block) + +enum bch_dev_read_ref { +#define x(n) BCH_DEV_READ_REF_##n, + BCH_DEV_READ_REFS() +#undef x + BCH_DEV_READ_REF_NR, +}; + +#define BCH_DEV_WRITE_REFS() \ + x(journal_write) \ + x(journal_do_discards) \ + x(dev_do_discards) \ + x(discard_one_bucket_fast) \ + x(do_invalidates) \ + x(nocow_flush) \ + x(io_write) \ + x(ec_block) \ + x(ec_bucket_zero) + +enum bch_dev_write_ref { +#define x(n) BCH_DEV_WRITE_REF_##n, + BCH_DEV_WRITE_REFS() +#undef x + BCH_DEV_WRITE_REF_NR, +}; + +struct bucket_bitmap { + unsigned long *buckets; + u64 nr; + struct mutex lock; +}; + struct bch_dev { struct kobject kobj; #ifdef CONFIG_BCACHEFS_DEBUG @@ -524,8 +583,7 @@ struct bch_dev { struct percpu_ref ref; #endif struct completion ref_completion; - struct percpu_ref io_ref[2]; - struct completion io_ref_completion[2]; + struct enumerated_ref io_ref[2]; struct bch_fs *fs; @@ -559,8 +617,8 @@ struct bch_dev { u8 *oldest_gen; unsigned long *buckets_nouse; - unsigned long *bucket_backpointer_mismatches; - unsigned long *bucket_backpointer_empty; + struct bucket_bitmap bucket_backpointer_mismatch; + struct bucket_bitmap bucket_backpointer_empty; struct bch_dev_usage_full __percpu *usage; @@ -572,10 +630,6 @@ struct bch_dev { unsigned nr_partial_buckets; unsigned nr_btree_reserve; - size_t inc_gen_needs_gc; - size_t inc_gen_really_needs_gc; - size_t buckets_waiting_on_journal; - struct work_struct invalidate_work; struct work_struct discard_work; struct mutex discard_buckets_in_flight_lock; @@ -614,14 +668,15 @@ struct bch_dev { x(accounting_replay_done) \ x(may_go_rw) \ x(rw) \ + x(rw_init_done) \ x(was_rw) \ x(stopping) \ x(emergency_ro) \ x(going_ro) \ x(write_disable_complete) \ x(clean_shutdown) \ - x(recovery_running) \ - x(fsck_running) \ + x(in_recovery) \ + x(in_fsck) \ x(initial_gc_unfixed) \ x(need_delete_dead_snapshots) \ x(error) \ @@ -648,8 +703,10 @@ struct btree_transaction_stats { struct bch2_time_stats lock_hold_times; struct mutex lock; unsigned nr_max_paths; - unsigned journal_entries_size; unsigned max_mem; +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_trans_kmalloc_trace trans_kmalloc_trace; +#endif char *max_paths_text; }; @@ -670,9 +727,6 @@ struct btree_trans_buf { struct btree_trans *trans; }; -#define BCACHEFS_ROOT_SUBVOL_INUM \ - ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) - #define BCH_WRITE_REFS() \ x(journal) \ x(trans) \ @@ -694,7 +748,9 @@ struct btree_trans_buf { x(snapshot_delete_pagecache) \ x(sysfs) \ x(btree_write_buffer) \ - x(btree_node_scrub) + x(btree_node_scrub) \ + x(async_recovery_passes) \ + x(ioctl_data) enum bch_write_ref { #define x(n) BCH_WRITE_REF_##n, @@ -728,11 +784,7 @@ struct bch_fs { struct rw_semaphore state_lock; /* Counts outstanding writes, for clean transition to read-only */ -#ifdef BCH_WRITE_REF_DEBUG - atomic_long_t writes[BCH_WRITE_REF_NR]; -#else - struct percpu_ref writes; -#endif + struct enumerated_ref writes; /* * Certain operations are only allowed in single threaded mode, during * recovery, and we want to assert that this is the case: @@ -749,6 +801,7 @@ struct bch_fs { struct work_struct read_only_work; struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; + struct bch_devs_mask devs_removed; struct bch_accounting_mem accounting; @@ -762,6 +815,8 @@ struct bch_fs { struct bch_disk_groups_cpu __rcu 
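BCH_DEV_READ_REFS()/BCH_DEV_WRITE_REFS() follow the x-macro idiom used throughout bcachefs: a single list that expands once into enum constants and can expand again into name strings for debug reporting. A trimmed, user-space demonstration of the technique (shortened list; the companion string array is the conventional second expansion, not shown in these hunks):

        #include <stdio.h>

        #define BCH_DEV_READ_REFS()     \
                x(btree_node_read)      \
                x(io_read)              \
                x(ec_block)

        enum bch_dev_read_ref {
        #define x(n)    BCH_DEV_READ_REF_##n,
                BCH_DEV_READ_REFS()
        #undef x
                BCH_DEV_READ_REF_NR,
        };

        static const char * const dev_read_ref_strs[] = {
        #define x(n)    #n,
                BCH_DEV_READ_REFS()
        #undef x
        };

        int main(void)
        {
                for (int i = 0; i < BCH_DEV_READ_REF_NR; i++)
                        printf("%2d: %s\n", i, dev_read_ref_strs[i]);
                return 0;
        }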
*disk_groups; struct bch_opts opts; + unsigned loglevel; + unsigned prev_loglevel; /* Updated by bch2_sb_update():*/ struct { @@ -776,6 +831,7 @@ struct bch_fs { u8 nr_devices; u8 clean; + bool multi_device; /* true if we've ever had more than one device */ u8 encryption_type; @@ -785,15 +841,14 @@ struct bch_fs { unsigned nsec_per_time_unit; u64 features; u64 compat; + u64 recovery_passes_required; unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)]; u64 btrees_lost_data; } sb; - DARRAY(enum bcachefs_metadata_version) - incompat_versions_requested; -#ifdef CONFIG_UNICODE + unsigned long incompat_versions_requested[BITS_TO_LONGS(BCH_VERSION_MINOR(bcachefs_metadata_version_current))]; + struct unicode_map *cf_encoding; -#endif struct bch_sb_handle disk_sb; @@ -809,7 +864,7 @@ struct bch_fs { struct mutex snapshot_table_lock; struct rw_semaphore snapshot_create_lock; - struct work_struct snapshot_delete_work; + struct snapshot_delete snapshot_delete; struct work_struct snapshot_wait_for_pagecache_and_delete_work; snapshot_id_list snapshots_unlinked; struct mutex snapshots_unlinked_lock; @@ -874,7 +929,7 @@ struct bch_fs { struct btree_write_buffer btree_write_buffer; struct workqueue_struct *btree_update_wq; - struct workqueue_struct *btree_io_complete_wq; + struct workqueue_struct *btree_write_complete_wq; /* copygc needs its own workqueue for index updates.. */ struct workqueue_struct *copygc_wq; /* @@ -885,6 +940,7 @@ struct bch_fs { struct workqueue_struct *write_ref_wq; /* ALLOCATION */ + struct bch_devs_mask online_devs; struct bch_devs_mask rw_devs[BCH_DATA_NR]; unsigned long rw_devs_change_count; @@ -979,6 +1035,10 @@ struct bch_fs { nocow_locks; struct rhashtable promote_table; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + struct async_obj_list async_objs[BCH_ASYNC_OBJ_NR]; +#endif + mempool_t compression_bounce[2]; mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; size_t zstd_workspace_size; @@ -1048,25 +1108,12 @@ struct bch_fs { /* RECOVERY */ u64 journal_replay_seq_start; u64 journal_replay_seq_end; - /* - * Two different uses: - * "Has this fsck pass?" - i.e. should this type of error be an - * emergency read-only - * And, in certain situations fsck will rewind to an earlier pass: used - * for signaling to the toplevel code which pass we want to run now. 
- */ - enum bch_recovery_pass curr_recovery_pass; - enum bch_recovery_pass next_recovery_pass; - /* bitmask of recovery passes that we actually ran */ - u64 recovery_passes_complete; - /* never rewinds version of curr_recovery_pass */ - enum bch_recovery_pass recovery_pass_done; - spinlock_t recovery_pass_lock; - struct semaphore online_fsck_mutex; + struct bch_fs_recovery recovery; /* DEBUG JUNK */ struct dentry *fs_debug_dir; struct dentry *btree_debug_dir; + struct dentry *async_obj_dir; struct btree_debug btree_debug[BTREE_ID_NR]; struct btree *verify_data; struct btree_node *verify_ondisk; @@ -1108,54 +1155,6 @@ struct bch_fs { extern struct wait_queue_head bch2_read_only_wait; -static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref) -{ -#ifdef BCH_WRITE_REF_DEBUG - atomic_long_inc(&c->writes[ref]); -#else - percpu_ref_get(&c->writes); -#endif -} - -static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) -{ -#ifdef BCH_WRITE_REF_DEBUG - return !test_bit(BCH_FS_going_ro, &c->flags) && - atomic_long_inc_not_zero(&c->writes[ref]); -#else - return percpu_ref_tryget(&c->writes); -#endif -} - -static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) -{ -#ifdef BCH_WRITE_REF_DEBUG - return !test_bit(BCH_FS_going_ro, &c->flags) && - atomic_long_inc_not_zero(&c->writes[ref]); -#else - return percpu_ref_tryget_live(&c->writes); -#endif -} - -static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref) -{ -#ifdef BCH_WRITE_REF_DEBUG - long v = atomic_long_dec_return(&c->writes[ref]); - - BUG_ON(v < 0); - if (v) - return; - for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) - if (atomic_long_read(&c->writes[i])) - return; - - set_bit(BCH_FS_write_disable_complete, &c->flags); - wake_up(&bch2_read_only_wait); -#else - percpu_ref_put(&c->writes); -#endif -} - static inline bool bch2_ro_ref_tryget(struct bch_fs *c) { if (test_bit(BCH_FS_stopping, &c->flags)) @@ -1166,7 +1165,7 @@ static inline bool bch2_ro_ref_tryget(struct bch_fs *c) static inline void bch2_ro_ref_put(struct bch_fs *c) { - if (refcount_dec_and_test(&c->ro_ref)) + if (c && refcount_dec_and_test(&c->ro_ref)) wake_up(&c->ro_ref_wait); } @@ -1256,4 +1255,33 @@ static inline unsigned data_replicas_required(struct bch_fs *c) #define BKEY_PADDED_ONSTACK(key, pad) \ struct { struct bkey_i key; __u64 key ## _pad[pad]; } +/* + * This is needed because discard is both a filesystem option and a device + * option, and mount options are supposed to apply to that mount and not be + * persisted, i.e. if it's set as a mount option we can't propagate it to the + * device. + */ +static inline bool bch2_discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca) +{ + return test_bit(BCH_FS_discard_mount_opt_set, &c->flags) + ? 
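The deleted bch2_write_ref_*() helpers above are exactly what enumerated_ref generalizes: one type that is a plain percpu_ref in production builds and an array of per-enum atomic counters under ENUMERATED_REF_DEBUG, including the "wake waiters once every counter hits zero" shutdown logic that used to be open-coded here. By analogy with the enumerated_ref_put() call seen earlier, callers presumably take the form below — the get/tryget helper names are assumptions, as these hunks only show the put side:

        /* sketch only: helper names assumed, not shown in these hunks */
        if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal))
                return -EROFS;

        /* ... perform the write ... */

        enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal);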
c->opts.discard + : ca->mi.discard; +} + +static inline int bch2_fs_casefold_enabled(struct bch_fs *c) +{ + if (!IS_ENABLED(CONFIG_UNICODE)) + return bch_err_throw(c, no_casefolding_without_utf8); + if (c->opts.casefold_disabled) + return bch_err_throw(c, casefolding_disabled); + return 0; +} + +static inline const char *strip_bch2(const char *msg) +{ + if (!strncmp("bch2_", msg, 5)) + return msg + 5; + return msg; +} + #endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index d6e4a496f02b..b4a04df5ea95 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -497,7 +497,8 @@ struct bch_sb_field { x(members_v2, 11) \ x(errors, 12) \ x(ext, 13) \ - x(downgrade, 14) + x(downgrade, 14) \ + x(recovery_passes, 15) #include "alloc_background_format.h" #include "dirent_format.h" @@ -510,6 +511,7 @@ struct bch_sb_field { #include "logged_ops_format.h" #include "lru_format.h" #include "quota_format.h" +#include "recovery_passes_format.h" #include "reflink_format.h" #include "replicas_format.h" #include "snapshot_format.h" @@ -695,7 +697,10 @@ struct bch_sb_field_ext { x(stripe_backpointers, BCH_VERSION(1, 22)) \ x(stripe_lru, BCH_VERSION(1, 23)) \ x(casefolding, BCH_VERSION(1, 24)) \ - x(extent_flags, BCH_VERSION(1, 25)) + x(extent_flags, BCH_VERSION(1, 25)) \ + x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \ + x(fast_device_removal, BCH_VERSION(1, 27)) \ + x(inode_has_case_insensitive, BCH_VERSION(1, 28)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -846,7 +851,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); -/* one free bit */ +LE64_BITMASK(BCH_SB_MULTI_DEVICE, struct bch_sb, flags[3], 63, 64); LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); @@ -867,7 +872,9 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14); LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20); +LE64_BITMASK(BCH_SB_DEGRADED_ACTION, struct bch_sb, flags[6], 20, 22); LE64_BITMASK(BCH_SB_CASEFOLD, struct bch_sb, flags[6], 22, 23); +LE64_BITMASK(BCH_SB_REBALANCE_AC_ONLY, struct bch_sb, flags[6], 23, 24); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { @@ -922,7 +929,9 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u x(alloc_v2, 17) \ x(extents_across_btree_nodes, 18) \ x(incompat_version_field, 19) \ - x(casefolding, 20) + x(casefolding, 20) \ + x(no_alloc_info, 21) \ + x(small_image, 22) #define BCH_SB_FEATURES_ALWAYS \ (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \ @@ -989,6 +998,19 @@ enum bch_error_actions { BCH_ON_ERROR_NR }; +#define BCH_DEGRADED_ACTIONS() \ + x(ask, 0) \ + x(yes, 1) \ + x(very, 2) \ + x(no, 3) + +enum bch_degraded_actions { +#define x(t, n) BCH_DEGRADED_##t = n, + BCH_DEGRADED_ACTIONS() +#undef x + BCH_DEGRADED_ACTIONS_NR +}; + #define BCH_STR_HASH_TYPES() \ x(crc32c, 0) \ x(crc64, 1) \ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 995ba32e9b6e..67e39f835b96 100644 --- 
a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -47,11 +47,9 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out, } } -#ifdef CONFIG_BCACHEFS_DEBUG - -static void bch2_bkey_pack_verify(const struct bkey_packed *packed, - const struct bkey *unpacked, - const struct bkey_format *format) +static void __bch2_bkey_pack_verify(const struct bkey_packed *packed, + const struct bkey *unpacked, + const struct bkey_format *format) { struct bkey tmp; @@ -95,11 +93,13 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed, } } -#else static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed, - const struct bkey *unpacked, - const struct bkey_format *format) {} -#endif + const struct bkey *unpacked, + const struct bkey_format *format) +{ + if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) + __bch2_bkey_pack_verify(packed, unpacked, format); +} struct pack_state { const struct bkey_format *format; @@ -398,7 +398,6 @@ static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) return ret; } -#ifdef CONFIG_BCACHEFS_DEBUG static bool bkey_packed_successor(struct bkey_packed *out, const struct btree *b, struct bkey_packed k) @@ -455,7 +454,6 @@ static bool bkey_format_has_too_big_fields(const struct bkey_format *f) return false; } -#endif /* * Returns a packed key that compares <= in @@ -472,9 +470,7 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, const struct bkey_format *f = &b->format; struct pack_state state = pack_state_init(f, out); u64 *w = out->_data; -#ifdef CONFIG_BCACHEFS_DEBUG struct bpos orig = in; -#endif bool exact = true; unsigned i; @@ -527,18 +523,18 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, out->format = KEY_FORMAT_LOCAL_BTREE; out->type = KEY_TYPE_deleted; -#ifdef CONFIG_BCACHEFS_DEBUG - if (exact) { - BUG_ON(bkey_cmp_left_packed(b, out, &orig)); - } else { - struct bkey_packed successor; + if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { + if (exact) { + BUG_ON(bkey_cmp_left_packed(b, out, &orig)); + } else { + struct bkey_packed successor; - BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); - BUG_ON(bkey_packed_successor(&successor, b, *out) && - bkey_cmp_left_packed(b, &successor, &orig) < 0 && - !bkey_format_has_too_big_fields(f)); + BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); + BUG_ON(bkey_packed_successor(&successor, b, *out) && + bkey_cmp_left_packed(b, &successor, &orig) < 0 && + !bkey_format_has_too_big_fields(f)); + } } -#endif return exact ? 
BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; } @@ -627,14 +623,11 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) } } -#ifdef CONFIG_BCACHEFS_DEBUG - { - struct printbuf buf = PRINTBUF; - + if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { + CLASS(printbuf, buf)(); BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf)); - printbuf_exit(&buf); } -#endif + return ret; } diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 054e2d5e8448..3ccd521c190a 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -191,6 +191,7 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r) static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) { return bpos_eq(l.k->p, r.k->p) && + l.k->size == r.k->size && bkey_bytes(l.k) == bkey_bytes(r.k) && !memcmp(l.v, r.v, bkey_val_bytes(l.k)); } @@ -397,8 +398,7 @@ __bkey_unpack_key_format_checked(const struct btree *b, compiled_unpack_fn unpack_fn = b->aux_data; unpack_fn(dst, src); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - bch2_expensive_debug_checks) { + if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 00d05ccfaf73..fcd8c82cba4f 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -356,7 +356,7 @@ bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) return ops->key_merge && bch2_bkey_maybe_mergable(l.k, r.k) && (u64) l.k->size + r.k->size <= KEY_SIZE_MAX && - !bch2_key_merging_disabled && + !static_branch_unlikely(&bch2_key_merging_disabled) && ops->key_merge(c, l, r); } diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 9a4a83d6fd2d..72698c0d9f0e 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -58,7 +58,7 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, struct bkey_packed *_k, *_n; struct bkey uk, n; struct bkey_s_c k; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); if (!i->u64s) return; @@ -97,8 +97,6 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p)) printk(KERN_ERR "Duplicate keys\n"); } - - printbuf_exit(&buf); } void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) @@ -113,7 +111,7 @@ void bch2_dump_btree_node_iter(struct btree *b, struct btree_node_iter *iter) { struct btree_node_iter_set *set; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); printk(KERN_ERR "btree node iter with %u/%u sets:\n", __btree_node_iter_used(iter), b->nsets); @@ -128,8 +126,6 @@ void bch2_dump_btree_node_iter(struct btree *b, printk(KERN_ERR "set %zu key %u: %s\n", t - b->set, set->k, buf.buf); } - - printbuf_exit(&buf); } struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b) @@ -144,8 +140,6 @@ struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b) return nr; } -#ifdef CONFIG_BCACHEFS_DEBUG - void __bch2_verify_btree_nr_keys(struct btree *b) { struct btree_nr_keys nr = bch2_btree_node_count_keys(b); @@ -153,7 +147,7 @@ void __bch2_verify_btree_nr_keys(struct btree *b) BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); } -static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, +static void __bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, struct btree *b) { struct btree_node_iter iter = *_iter; @@ -190,8 +184,8 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, } } -void 
bch2_btree_node_iter_verify(struct btree_node_iter *iter, - struct btree *b) +void __bch2_btree_node_iter_verify(struct btree_node_iter *iter, + struct btree *b) { struct btree_node_iter_set *set, *s2; struct bkey_packed *k, *p; @@ -237,8 +231,8 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter, } } -void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, - struct bkey_packed *insert, unsigned clobber_u64s) +static void __bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, + struct bkey_packed *insert, unsigned clobber_u64s) { struct bset_tree *t = bch2_bkey_to_bset(b, where); struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); @@ -285,12 +279,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, #endif } -#else - -static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, - struct btree *b) {} +static inline void bch2_verify_insert_pos(struct btree *b, + struct bkey_packed *where, + struct bkey_packed *insert, + unsigned clobber_u64s) +{ + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) + __bch2_verify_insert_pos(b, where, insert, clobber_u64s); +} -#endif /* Auxiliary search trees */ @@ -361,23 +358,6 @@ static struct bkey_float *bkey_float(const struct btree *b, return ro_aux_tree_base(b, t)->f + idx; } -static void bset_aux_tree_verify(struct btree *b) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - for_each_bset(b, t) { - if (t->aux_data_offset == U16_MAX) - continue; - - BUG_ON(t != b->set && - t[-1].aux_data_offset == U16_MAX); - - BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); - BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); - BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); - } -#endif -} - void bch2_btree_keys_init(struct btree *b) { unsigned i; @@ -495,15 +475,11 @@ static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, }; } -static void bch2_bset_verify_rw_aux_tree(struct btree *b, - struct bset_tree *t) +static void __bch2_bset_verify_rw_aux_tree(struct btree *b, struct bset_tree *t) { struct bkey_packed *k = btree_bkey_first(b, t); unsigned j = 0; - if (!bch2_expensive_debug_checks) - return; - BUG_ON(bset_has_ro_aux_tree(t)); if (!bset_has_rw_aux_tree(t)) @@ -530,6 +506,58 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b, } } +static inline void bch2_bset_verify_rw_aux_tree(struct btree *b, + struct bset_tree *t) +{ + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) + __bch2_bset_verify_rw_aux_tree(b, t); +} + +static void __bset_aux_tree_verify_ro(struct btree *b, struct bset_tree *t) +{ + struct bkey_packed *k = btree_bkey_first(b, t); + + eytzinger1_for_each(j, t->size - 1) { + while (tree_to_bkey(b, t, j) > k && + k != btree_bkey_last(b, t)) + k = bkey_p_next(k); + + BUG_ON(tree_to_bkey(b, t, j) != k); + } +} + +static void __bset_aux_tree_verify(struct btree *b) +{ + for_each_bset(b, t) { + if (t->aux_data_offset == U16_MAX) + continue; + + BUG_ON(t != b->set && + t[-1].aux_data_offset == U16_MAX); + + BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); + BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); + BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); + + switch (bset_aux_tree_type(t)) { + case BSET_RO_AUX_TREE: + __bset_aux_tree_verify_ro(b, t); + break; + case BSET_RW_AUX_TREE: + __bch2_bset_verify_rw_aux_tree(b, t); + break; + default: + break; + } + } +} + +static inline void bset_aux_tree_verify(struct btree *b) +{ + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) + 
__bset_aux_tree_verify(b); +} + /* returns idx of first entry >= offset: */ static unsigned rw_aux_tree_bsearch(struct btree *b, struct bset_tree *t, @@ -869,7 +897,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, k = p; } - if (bch2_expensive_debug_checks) { + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { BUG_ON(ret >= orig_k); for (i = ret @@ -1195,7 +1223,7 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, bkey_iter_pos_cmp(b, m, search) < 0) m = bkey_p_next(m); - if (bch2_expensive_debug_checks) { + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); BUG_ON(prev && @@ -1435,9 +1463,9 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, void bch2_btree_node_iter_advance(struct btree_node_iter *iter, struct btree *b) { - if (bch2_expensive_debug_checks) { - bch2_btree_node_iter_verify(iter, b); - bch2_btree_node_iter_next_check(iter, b); + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { + __bch2_btree_node_iter_verify(iter, b); + __bch2_btree_node_iter_next_check(iter, b); } __bch2_btree_node_iter_advance(iter, b); @@ -1453,8 +1481,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree_node_iter_set *set; unsigned end = 0; - if (bch2_expensive_debug_checks) - bch2_btree_node_iter_verify(iter, b); + bch2_btree_node_iter_verify(iter, b); for_each_bset(b, t) { k = bch2_bkey_prev_all(b, t, @@ -1489,8 +1516,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, iter->data[0].k = __btree_node_key_to_offset(b, prev); iter->data[0].end = end; - if (bch2_expensive_debug_checks) - bch2_btree_node_iter_verify(iter, b); + bch2_btree_node_iter_verify(iter, b); return prev; } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 6953d55b72cc..a15ecf9d006e 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -517,27 +517,19 @@ void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); void bch2_dump_btree_node(struct bch_fs *, struct btree *); void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); -#ifdef CONFIG_BCACHEFS_DEBUG - void __bch2_verify_btree_nr_keys(struct btree *); -void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); -void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, - struct bkey_packed *, unsigned); - -#else +void __bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); -static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, - struct btree *b) {} -static inline void bch2_verify_insert_pos(struct btree *b, - struct bkey_packed *where, - struct bkey_packed *insert, - unsigned clobber_u64s) {} -#endif + struct btree *b) +{ + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) + __bch2_btree_node_iter_verify(iter, b); +} static inline void bch2_verify_btree_nr_keys(struct btree *b) { - if (bch2_debug_check_btree_accounting) + if (static_branch_unlikely(&bch2_debug_check_btree_accounting)) __bch2_verify_btree_nr_keys(b); } diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 899891295797..9261ad043564 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -15,14 +15,9 @@ #include #include +#include #include -#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \ -do { \ - if (shrinker_counter) \ - 
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \ -} while (0) - const char * const bch2_btree_node_flags[] = { "typebit", "typebit", @@ -83,15 +78,14 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) { struct btree_cache *bc = &c->btree_cache; - mutex_lock(&bc->lock); - __bch2_btree_node_to_freelist(bc, b); - mutex_unlock(&bc->lock); + scoped_guard(mutex, &bc->lock) + __bch2_btree_node_to_freelist(bc, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); } -static void __btree_node_data_free(struct btree_cache *bc, struct btree *b) +void __btree_node_data_free(struct btree *b) { BUG_ON(!list_empty(&b->list)); BUG_ON(btree_node_hashed(b)); @@ -118,16 +112,17 @@ static void __btree_node_data_free(struct btree_cache *bc, struct btree *b) munmap(b->aux_data, btree_aux_data_bytes(b)); #endif b->aux_data = NULL; - - btree_node_to_freedlist(bc, b); } static void btree_node_data_free(struct btree_cache *bc, struct btree *b) { BUG_ON(list_empty(&b->list)); list_del_init(&b->list); + + __btree_node_data_free(b); + --bc->nr_freeable; - __btree_node_data_free(bc, b); + btree_node_to_freedlist(bc, b); } static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, @@ -155,7 +150,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->data = kvmalloc(btree_buf_bytes(b), gfp); if (!b->data) - return -BCH_ERR_ENOMEM_btree_node_mem_alloc; + return bch_err_throw(c, ENOMEM_btree_node_mem_alloc); #ifdef __KERNEL__ b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); #else @@ -168,7 +163,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) if (!b->aux_data) { kvfree(b->data); b->data = NULL; - return -BCH_ERR_ENOMEM_btree_node_mem_alloc; + return bch_err_throw(c, ENOMEM_btree_node_mem_alloc); } return 0; @@ -191,10 +186,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) { - struct btree_cache *bc = &c->btree_cache; - struct btree *b; - - b = __btree_node_mem_alloc(c, GFP_KERNEL); + struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL); if (!b) return NULL; @@ -204,8 +196,6 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) } bch2_btree_lock_init(&b->c, 0, GFP_KERNEL); - - __bch2_btree_node_to_freelist(bc, b); return b; } @@ -224,14 +214,13 @@ void bch2_node_pin(struct bch_fs *c, struct btree *b) { struct btree_cache *bc = &c->btree_cache; - mutex_lock(&bc->lock); - if (b != btree_node_root(c, b) && !btree_node_pinned(b)) { + guard(mutex)(&bc->lock); + if (!btree_node_is_root(c, b) && !btree_node_pinned(b)) { set_btree_node_pinned(b); list_move(&b->list, &bc->live[1].list); bc->live[0].nr--; bc->live[1].nr++; } - mutex_unlock(&bc->lock); } void bch2_btree_cache_unpin(struct bch_fs *c) @@ -239,7 +228,7 @@ void bch2_btree_cache_unpin(struct bch_fs *c) struct btree_cache *bc = &c->btree_cache; struct btree *b, *n; - mutex_lock(&bc->lock); + guard(mutex)(&bc->lock); c->btree_cache.pinned_nodes_mask[0] = 0; c->btree_cache.pinned_nodes_mask[1] = 0; @@ -249,8 +238,6 @@ void bch2_btree_cache_unpin(struct bch_fs *c) bc->live[0].nr++; bc->live[1].nr--; } - - mutex_unlock(&bc->lock); } /* Btree in memory cache - hash table */ @@ -305,11 +292,8 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, b->c.level = level; b->c.btree_id = id; - mutex_lock(&bc->lock); - int ret = __bch2_btree_node_hash_insert(bc, b); - mutex_unlock(&bc->lock); - - return ret; + guard(mutex)(&bc->lock); + return 
__bch2_btree_node_hash_insert(bc, b); } void bch2_btree_node_update_key_early(struct btree_trans *trans, @@ -326,7 +310,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *trans, b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true); if (!IS_ERR_OR_NULL(b)) { - mutex_lock(&c->btree_cache.lock); + guard(mutex)(&c->btree_cache.lock); __bch2_btree_node_hash_remove(&c->btree_cache, b); @@ -334,7 +318,6 @@ void bch2_btree_node_update_key_early(struct btree_trans *trans, ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); BUG_ON(ret); - mutex_unlock(&c->btree_cache.lock); six_unlock_read(&b->c.lock); } @@ -350,115 +333,119 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc, return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); } -/* - * this version is for btree nodes that have already been freed (we're not - * reaping a real btree node) - */ -static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter) +static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b, + bool flush, bool locked) { struct btree_cache *bc = &c->btree_cache; - int ret = 0; lockdep_assert_held(&bc->lock); -wait_on_io: - if (b->flags & ((1U << BTREE_NODE_dirty)| - (1U << BTREE_NODE_read_in_flight)| + + if (btree_node_noevict(b)) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); + } + if (btree_node_write_blocked(b)) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); + } + if (btree_node_will_make_reachable(b)) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); + } + + if (btree_node_dirty(b)) { + if (!flush) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); + } + + if (locked) { + /* + * Using the underscore version because we don't want to compact + * bsets after the write, since this node is about to be evicted + * - unless btree verify mode is enabled, since it runs out of + * the post write cleanup: + */ + if (static_branch_unlikely(&bch2_verify_btree_ondisk)) + bch2_btree_node_write(c, b, SIX_LOCK_intent, + BTREE_WRITE_cache_reclaim); + else + __bch2_btree_node_write(c, b, + BTREE_WRITE_cache_reclaim); + } + } + + if (b->flags & ((1U << BTREE_NODE_read_in_flight)| (1U << BTREE_NODE_write_in_flight))) { if (!flush) { - if (btree_node_dirty(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(dirty); - else if (btree_node_read_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); + if (btree_node_read_in_flight(b)) + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++; else if (btree_node_write_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); - return -BCH_ERR_ENOMEM_btree_node_reclaim; + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); } + if (locked) + return -EINTR; + /* XXX: waiting on IO with btree cache lock held */ bch2_btree_node_wait_on_read(b); bch2_btree_node_wait_on_write(b); } + return 0; +} + +/* + * this version is for btree nodes that have already been freed (we're not + * reaping a real btree node) + */ +static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) +{ + struct btree_cache *bc = &c->btree_cache; + int ret = 0; + + lockdep_assert_held(&bc->lock); +retry_unlocked: + ret = __btree_node_reclaim_checks(c, b, flush, false); + if 
(ret) + return ret; + if (!six_trylock_intent(&b->c.lock)) { - BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent); - return -BCH_ERR_ENOMEM_btree_node_reclaim; + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); } if (!six_trylock_write(&b->c.lock)) { - BTREE_CACHE_NOT_FREED_INCREMENT(lock_write); - goto out_unlock_intent; + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++; + six_unlock_intent(&b->c.lock); + return bch_err_throw(c, ENOMEM_btree_node_reclaim); } /* recheck under lock */ - if (b->flags & ((1U << BTREE_NODE_read_in_flight)| - (1U << BTREE_NODE_write_in_flight))) { - if (!flush) { - if (btree_node_read_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); - else if (btree_node_write_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); - goto out_unlock; - } + ret = __btree_node_reclaim_checks(c, b, flush, true); + if (ret) { six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); - goto wait_on_io; - } - - if (btree_node_noevict(b)) { - BTREE_CACHE_NOT_FREED_INCREMENT(noevict); - goto out_unlock; - } - if (btree_node_write_blocked(b)) { - BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked); - goto out_unlock; - } - if (btree_node_will_make_reachable(b)) { - BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable); - goto out_unlock; + if (ret == -EINTR) + goto retry_unlocked; + return ret; } - if (btree_node_dirty(b)) { - if (!flush) { - BTREE_CACHE_NOT_FREED_INCREMENT(dirty); - goto out_unlock; - } - /* - * Using the underscore version because we don't want to compact - * bsets after the write, since this node is about to be evicted - * - unless btree verify mode is enabled, since it runs out of - * the post write cleanup: - */ - if (bch2_verify_btree_ondisk) - bch2_btree_node_write(c, b, SIX_LOCK_intent, - BTREE_WRITE_cache_reclaim); - else - __bch2_btree_node_write(c, b, - BTREE_WRITE_cache_reclaim); - - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - goto wait_on_io; - } -out: if (b->hash_val && !ret) - trace_and_count(c, btree_cache_reap, c, b); - return ret; -out_unlock: - six_unlock_write(&b->c.lock); -out_unlock_intent: - six_unlock_intent(&b->c.lock); - ret = -BCH_ERR_ENOMEM_btree_node_reclaim; - goto out; + trace_btree_node(c, b, btree_cache_reap); + + return 0; } -static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter) +static int btree_node_reclaim(struct bch_fs *c, struct btree *b) { - return __btree_node_reclaim(c, b, false, shrinker_counter); + return __btree_node_reclaim(c, b, false); } static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) { - return __btree_node_reclaim(c, b, true, false); + return __btree_node_reclaim(c, b, true); } static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, @@ -476,7 +463,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, unsigned long ret = SHRINK_STOP; bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4; - if (bch2_btree_shrinker_disabled) + if (static_branch_unlikely(&bch2_btree_shrinker_disabled)) return SHRINK_STOP; mutex_lock(&bc->lock); @@ -490,7 +477,10 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, * IO can always make forward progress: */ can_free = btree_cache_can_free(list); - nr = min_t(unsigned long, nr, can_free); + if (nr > can_free) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_cache_reserve] += nr - can_free; + nr = can_free; + } i = 0; list_for_each_entry_safe(b, t, &bc->freeable, 
list) { @@ -506,7 +496,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, if (touched >= nr) goto out; - if (!btree_node_reclaim(c, b, true)) { + if (!btree_node_reclaim(c, b)) { btree_node_data_free(bc, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -521,10 +511,11 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, if (btree_node_accessed(b)) { clear_btree_node_accessed(b); bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++; - --touched;; - } else if (!btree_node_reclaim(c, b, true)) { + --touched; + } else if (!btree_node_reclaim(c, b)) { __bch2_btree_node_hash_remove(bc, b); - __btree_node_data_free(bc, b); + __btree_node_data_free(b); + btree_node_to_freedlist(bc, b); freed++; bc->nr_freed++; @@ -569,12 +560,25 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, { struct btree_cache_list *list = shrink->private_data; - if (bch2_btree_shrinker_disabled) + if (static_branch_unlikely(&bch2_btree_shrinker_disabled)) return 0; return btree_cache_can_free(list); } +static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) +{ + struct btree_cache_list *list = shrink->private_data; + struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); + + char *cbuf; + size_t buflen = seq_buf_get_buf(s, &cbuf); + struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); + + bch2_btree_cache_to_text(&out, bc); + seq_buf_commit(s, out.pos); +} + void bch2_fs_btree_cache_exit(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; @@ -652,9 +656,12 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bch2_recalc_btree_reserve(c); - for (i = 0; i < bc->nr_reserve; i++) - if (!__bch2_btree_node_mem_alloc(c)) + for (i = 0; i < bc->nr_reserve; i++) { + struct btree *b = __bch2_btree_node_mem_alloc(c); + if (!b) goto err; + __bch2_btree_node_to_freelist(bc, b); + } list_splice_init(&bc->live[0].list, &bc->freeable); @@ -666,6 +673,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bc->live[0].shrink = shrink; shrink->count_objects = bch2_btree_cache_count; shrink->scan_objects = bch2_btree_cache_scan; + shrink->to_text = bch2_btree_cache_shrinker_to_text; shrink->seeks = 2; shrink->private_data = &bc->live[0]; shrinker_register(shrink); @@ -676,13 +684,14 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bc->live[1].shrink = shrink; shrink->count_objects = bch2_btree_cache_count; shrink->scan_objects = bch2_btree_cache_scan; + shrink->to_text = bch2_btree_cache_shrinker_to_text; shrink->seeks = 8; shrink->private_data = &bc->live[1]; shrinker_register(shrink); return 0; err: - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); } void bch2_fs_btree_cache_init_early(struct btree_cache *bc) @@ -727,7 +736,7 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure if (!cl) { trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); - return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock; + return bch_err_throw(c, ENOMEM_btree_cache_cannibalize_lock); } closure_wait(&bc->alloc_wait, cl); @@ -741,7 +750,7 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure } trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); - return -BCH_ERR_btree_cache_cannibalize_lock_blocked; + return bch_err_throw(c, btree_cache_cannibalize_lock_blocked); success: trace_and_count(c, btree_cache_cannibalize_lock, trans); @@ -755,7 +764,7 @@ static struct btree *btree_node_cannibalize(struct 
bch_fs *c) for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) list_for_each_entry_reverse(b, &bc->live[i].list, list) - if (!btree_node_reclaim(c, b, false)) + if (!btree_node_reclaim(c, b)) return b; while (1) { @@ -790,7 +799,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea * disk node. Check the freed list before allocating a new one: */ list_for_each_entry(b, freed, list) - if (!btree_node_reclaim(c, b, false)) { + if (!btree_node_reclaim(c, b)) { list_del_init(&b->list); goto got_node; } @@ -817,7 +826,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea * the list. Check if there's any freed nodes there: */ list_for_each_entry(b2, &bc->freeable, list) - if (!btree_node_reclaim(c, b2, false)) { + if (!btree_node_reclaim(c, b2)) { swap(b->data, b2->data); swap(b->aux_data, b2->aux_data); @@ -913,20 +922,18 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, } if (unlikely(!bkey_is_btree_ptr(&k->k))) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf); - printbuf_exit(&buf); return ERR_PTR(ret); } if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf); - printbuf_exit(&buf); return ERR_PTR(ret); } @@ -977,7 +984,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, /* Unlock before doing IO: */ six_unlock_intent(&b->c.lock); - bch2_trans_unlock_noassert(trans); + bch2_trans_unlock(trans); bch2_btree_node_read(trans, b, sync); @@ -1001,11 +1008,10 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) { - struct printbuf buf = PRINTBUF; - - if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) + if (c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) return; + CLASS(printbuf, buf)(); prt_printf(&buf, "btree node header doesn't match ptr: "); bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); @@ -1021,8 +1027,6 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) bch2_bpos_to_text(&buf, b->data->max_key); bch2_fs_topology_error(c, "%s", buf.buf); - - printbuf_exit(&buf); } static inline void btree_check_header(struct bch_fs *c, struct btree *b) @@ -1492,9 +1496,10 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc prt_btree_cache_line(out, c, "live:", bc->live[0].nr); prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr); - prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable); + prt_btree_cache_line(out, c, "reserve:", bc->nr_reserve); + prt_btree_cache_line(out, c, "freed:", bc->nr_freeable); prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty)); - prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); + prt_printf(out, "cannibalize lock:\t%s\n", bc->alloc_lock ? 
"held" : "not held"); prt_newline(out); for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) { @@ -1505,6 +1510,7 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc } prt_newline(out); + prt_printf(out, "counters since mount:\n"); prt_printf(out, "freed:\t%zu\n", bc->nr_freed); prt_printf(out, "not freed:\n"); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index ca3c1b145330..035b2cb25077 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -30,6 +30,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsig void bch2_btree_cache_cannibalize_unlock(struct btree_trans *); int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *); +void __btree_node_data_free(struct btree *); struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool); @@ -143,6 +144,14 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) return r ? r->b : NULL; } +static inline bool btree_node_is_root(struct bch_fs *c, struct btree *b) +{ + struct btree *root = btree_node_root(c, b); + + BUG_ON(b != root && b->c.level >= root->c.level); + return b == root; +} + const char *bch2_btree_id_str(enum btree_id); /* avoid */ void bch2_btree_id_to_text(struct printbuf *, enum btree_id); void bch2_btree_id_level_to_text(struct printbuf *, enum btree_id, unsigned); @@ -153,4 +162,15 @@ void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btr void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *); +#define trace_btree_node(_c, _b, event) \ +do { \ + if (trace_##event##_enabled()) { \ + CLASS(printbuf, buf)(); \ + printbuf_indent_add(&buf, 2); \ + bch2_btree_pos_to_text(&buf, c, b); \ + trace_##event(c, buf.buf); \ + } \ + count_event(c, event); \ +} while (0); + #endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 37b69d89341f..6b91649688da 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -22,6 +22,7 @@ #include "debug.h" #include "disk_accounting.h" #include "ec.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "journal.h" @@ -43,10 +44,6 @@ #include #include -#define DROP_THIS_NODE 10 -#define DROP_PREV_NODE 11 -#define DID_FILL_FROM_SCAN 12 - /* * Returns true if it's a btree we can easily reconstruct, or otherwise won't * cause data loss if it's missing: @@ -94,11 +91,10 @@ static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) { - preempt_disable(); + guard(preempt)(); write_seqcount_begin(&c->gc_pos_lock); c->gc_pos = new_pos; write_seqcount_end(&c->gc_pos_lock); - preempt_enable(); } static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) @@ -137,19 +133,18 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) int ret; if (c->opts.verbose) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); prt_str(&buf, " -> "); bch2_bpos_to_text(&buf, new_min); bch_info(c, "%s(): %s", __func__, buf.buf); - printbuf_exit(&buf); } new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); if (!new) - return -BCH_ERR_ENOMEM_gc_repair_key; + return bch_err_throw(c, ENOMEM_gc_repair_key); 
btree_ptr_to_v2(b, new); b->data->min_key = new_min; @@ -173,14 +168,13 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) int ret; if (c->opts.verbose) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); prt_str(&buf, " -> "); bch2_bpos_to_text(&buf, new_max); bch_info(c, "%s(): %s", __func__, buf.buf); - printbuf_exit(&buf); } ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p); @@ -189,7 +183,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); if (!new) - return -BCH_ERR_ENOMEM_gc_repair_key; + return bch_err_throw(c, ENOMEM_gc_repair_key); btree_ptr_to_v2(b, new); b->data->max_key = new_max; @@ -204,13 +198,12 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) bch2_btree_node_drop_keys_outside_node(b); - mutex_lock(&c->btree_cache.lock); + guard(mutex)(&c->btree_cache.lock); __bch2_btree_node_hash_remove(&c->btree_cache, b); bkey_copy(&b->key, &new->k_i); ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); BUG_ON(ret); - mutex_unlock(&c->btree_cache.lock); return 0; } @@ -222,7 +215,7 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * struct bpos expected_start = !prev ? b->data->min_key : bpos_successor(prev->key.k.p); - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = 0; BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && @@ -252,10 +245,10 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * expected_start, bpos_predecessor(cur->data->min_key)); if (ret) - goto err; + return ret; *pulled_from_scan = cur->data->min_key; - ret = DID_FILL_FROM_SCAN; + ret = bch_err_throw(c, topology_repair_did_fill_from_scan); } else { if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key, "btree node with incorrect min_key%s", buf.buf)) @@ -266,7 +259,7 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */ if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node, "btree node overwritten by next node%s", buf.buf)) - ret = DROP_PREV_NODE; + ret = bch_err_throw(c, topology_repair_drop_prev_node); } else { if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key, "btree node with incorrect max_key%s", buf.buf)) @@ -277,7 +270,7 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? 
*/ if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node, "btree node overwritten by prev node%s", buf.buf)) - ret = DROP_THIS_NODE; + ret = bch_err_throw(c, topology_repair_drop_this_node); } else { if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key, "btree node with incorrect min_key%s", buf.buf)) @@ -285,6 +278,39 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * } } } +fsck_err: + return ret; +} + +static int btree_check_root_boundaries(struct btree_trans *trans, struct btree *b) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0; + + BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, + b->data->min_key)); + + prt_str(&buf, " at "); + bch2_btree_pos_to_text(&buf, c, b); + + if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), + trans, btree_node_topology_bad_root_min_key, + "btree root with incorrect min_key%s", buf.buf)) { + ret = set_node_min(c, b, POS_MIN); + if (ret) + goto err; + } + + if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), + trans, btree_node_topology_bad_root_max_key, + "btree root with incorrect max_key%s", buf.buf)) { + ret = set_node_max(c, b, SPOS_MAX); + if (ret) + goto err; + } + err: fsck_err: printbuf_exit(&buf); @@ -295,7 +321,7 @@ static int btree_repair_node_end(struct btree_trans *trans, struct btree *b, struct btree *child, struct bpos *pulled_from_scan) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = 0; if (bpos_eq(child->key.k.p, b->key.k.p)) @@ -316,17 +342,15 @@ static int btree_repair_node_end(struct btree_trans *trans, struct btree *b, ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0, bpos_successor(child->key.k.p), b->key.k.p); if (ret) - goto err; + return ret; *pulled_from_scan = b->key.k.p; - ret = DID_FILL_FROM_SCAN; + ret = bch_err_throw(c, topology_repair_did_fill_from_scan); } else { ret = set_node_max(c, child, b->key.k.p); } } -err: fsck_err: - printbuf_exit(&buf); return ret; } @@ -339,7 +363,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct struct bkey_buf prev_k, cur_k; struct btree *prev = NULL, *cur = NULL; bool have_child, new_pass = false; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = 0; if (!b->c.level) @@ -370,20 +394,13 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct prt_char(&buf, ' '); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); - if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), - trans, btree_node_read_error, - "Topology repair: unreadable btree node at\n%s", - buf.buf)) { + if (bch2_err_matches(ret, EIO)) { bch2_btree_node_evict(trans, cur_k.k); cur = NULL; ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); if (ret) break; - - ret = bch2_btree_lost_data(c, b->c.btree_id); - if (ret) - break; continue; } @@ -403,13 +420,17 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct continue; } - ret = btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan); - if (ret == DID_FILL_FROM_SCAN) { + ret = lockrestart_do(trans, + btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan)); + if (ret && !bch2_err_matches(ret, BCH_ERR_topology_repair)) + goto err; + + if (bch2_err_matches(ret, BCH_ERR_topology_repair_did_fill_from_scan)) { new_pass = true; ret = 0; } - if (ret == DROP_THIS_NODE) { + if (bch2_err_matches(ret, 
BCH_ERR_topology_repair_drop_this_node)) { six_unlock_read(&cur->c.lock); bch2_btree_node_evict(trans, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, @@ -424,7 +445,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct six_unlock_read(&prev->c.lock); prev = NULL; - if (ret == DROP_PREV_NODE) { + if (bch2_err_matches(ret, BCH_ERR_topology_repair_drop_prev_node)) { bch_info(c, "dropped prev node"); bch2_btree_node_evict(trans, prev_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, @@ -444,8 +465,9 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct if (!ret && !IS_ERR_OR_NULL(prev)) { BUG_ON(cur); - ret = btree_repair_node_end(trans, b, prev, pulled_from_scan); - if (ret == DID_FILL_FROM_SCAN) { + ret = lockrestart_do(trans, + btree_repair_node_end(trans, b, prev, pulled_from_scan)); + if (bch2_err_matches(ret, BCH_ERR_topology_repair_did_fill_from_scan)) { new_pass = true; ret = 0; } @@ -486,7 +508,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct six_unlock_read(&cur->c.lock); cur = NULL; - if (ret == DROP_THIS_NODE) { + if (bch2_err_matches(ret, BCH_ERR_topology_repair_drop_this_node)) { bch2_btree_node_evict(trans, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); @@ -504,10 +526,16 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + /* + * XXX: we're not passing the trans object here because we're not set up + * to handle a transaction restart - this code needs to be rewritten + * when we start doing online topology repair + */ + bch2_trans_unlock_long(trans); if (mustfix_fsck_err_on(!have_child, - trans, btree_node_topology_interior_node_empty, + c, btree_node_topology_interior_node_empty, "empty interior btree node at %s", buf.buf)) - ret = DROP_THIS_NODE; + ret = bch_err_throw(c, topology_repair_drop_this_node); err: fsck_err: if (!IS_ERR_OR_NULL(prev)) @@ -524,78 +552,99 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct bch2_bkey_buf_exit(&prev_k, c); bch2_bkey_buf_exit(&cur_k, c); - printbuf_exit(&buf); + if (!bch2_err_matches(ret, BCH_ERR_topology_repair)) + bch_err_fn(c, ret); return ret; } -int bch2_check_topology(struct bch_fs *c) +static int bch2_check_root(struct btree_trans *trans, enum btree_id btree, + bool *reconstructed_root) { - struct btree_trans *trans = bch2_trans_get(c); - struct bpos pulled_from_scan = POS_MIN; - struct printbuf buf = PRINTBUF; + struct bch_fs *c = trans->c; + struct btree_root *r = bch2_btree_id_root(c, btree); + CLASS(printbuf, buf)(); int ret = 0; - bch2_trans_srcu_unlock(trans); + bch2_btree_id_to_text(&buf, btree); - for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { - struct btree_root *r = bch2_btree_id_root(c, i); - bool reconstructed_root = false; + if (r->error) { + bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); - printbuf_reset(&buf); - bch2_btree_id_to_text(&buf, i); + ret = bch2_btree_has_scanned_nodes(c, btree); + if (ret < 0) + goto err; - if (r->error) { - ret = bch2_btree_lost_data(c, i); - if (ret) - break; -reconstruct_root: - bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); + if (!ret) { + __fsck_err(trans, + FSCK_CAN_FIX|(!btree_id_important(btree) ? 
FSCK_AUTOFIX : 0), + btree_root_unreadable_and_scan_found_nothing, + "no nodes found for btree %s, continue?", buf.buf); r->alive = false; r->error = 0; + bch2_btree_root_alloc_fake_trans(trans, btree, 0); + } else { + r->alive = false; + r->error = 0; + bch2_btree_root_alloc_fake_trans(trans, btree, 1); - if (!bch2_btree_has_scanned_nodes(c, i)) { - __fsck_err(trans, - FSCK_CAN_FIX|(!btree_id_important(i) ? FSCK_AUTOFIX : 0), - btree_root_unreadable_and_scan_found_nothing, - "no nodes found for btree %s, continue?", buf.buf); - bch2_btree_root_alloc_fake_trans(trans, i, 0); - } else { - bch2_btree_root_alloc_fake_trans(trans, i, 1); - bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); - if (ret) - break; - } - - reconstructed_root = true; + bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX); + if (ret) + return ret; } + *reconstructed_root = true; + } +err: +fsck_err: + bch_err_fn(c, ret); + return ret; +} + +int bch2_check_topology(struct bch_fs *c) +{ + CLASS(btree_trans, trans)(c); + struct bpos pulled_from_scan = POS_MIN; + int ret = 0; + + bch2_trans_srcu_unlock(trans); + + for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { + bool reconstructed_root = false; +recover: + ret = lockrestart_do(trans, bch2_check_root(trans, i, &reconstructed_root)); + if (ret) + break; + + struct btree_root *r = bch2_btree_id_root(c, i); struct btree *b = r->b; btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan); + ret = btree_check_root_boundaries(trans, b) ?: + bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan); six_unlock_read(&b->c.lock); - if (ret == DROP_THIS_NODE) { - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_unlock(&c->btree_cache.lock); + if (bch2_err_matches(ret, BCH_ERR_topology_repair_drop_this_node)) { + scoped_guard(mutex, &c->btree_cache.lock) + bch2_btree_node_hash_remove(&c->btree_cache, b); r->b = NULL; - if (!reconstructed_root) - goto reconstruct_root; + if (!reconstructed_root) { + r->error = -EIO; + goto recover; + } + CLASS(printbuf, buf)(); + bch2_btree_id_to_text(&buf, i); bch_err(c, "empty btree root %s", buf.buf); bch2_btree_root_alloc_fake_trans(trans, i, 0); r->alive = false; ret = 0; } } -fsck_err: - printbuf_exit(&buf); - bch2_trans_put(trans); + return ret; } @@ -622,13 +671,13 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, struct bkey deleted = KEY(0, 0, 0); struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = 0; deleted.p = k.k->p; if (initial) { - BUG_ON(bch2_journal_seq_verify && + BUG_ON(static_branch_unlikely(&bch2_journal_seq_verify) && k.k->bversion.lo > atomic64_read(&c->journal.seq)); if (fsck_err_on(btree_id != BTREE_ID_accounting && @@ -646,10 +695,9 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - mutex_lock(&c->sb_lock); + guard(mutex)(&c->sb_lock); bch2_dev_btree_bitmap_mark(c, k); bch2_write_super(c); - mutex_unlock(&c->sb_lock); } /* @@ -664,7 +712,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, if (ret) goto out; - if (trans->nr_updates) { + if (bch2_trans_has_updates(trans)) { ret 
= bch2_trans_commit(trans, NULL, NULL, 0) ?: -BCH_ERR_transaction_restart_nested; goto out; @@ -674,7 +722,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags); out: fsck_err: - printbuf_exit(&buf); bch_err_fn(c, ret); return ret; } @@ -702,6 +749,7 @@ static int bch2_gc_btree(struct btree_trans *trans, gc_pos_set(c, gc_pos_btree(btree, level, k.k->p)); bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial); })); + bch2_trans_iter_exit(&iter); if (ret) goto err; } @@ -714,13 +762,13 @@ static int bch2_gc_btree(struct btree_trans *trans, struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, bch2_btree_id_root(c, btree)->b->c.level, 0); - struct btree *b = bch2_btree_iter_peek_node(trans, &iter); + struct btree *b = bch2_btree_iter_peek_node(&iter); ret = PTR_ERR_OR_ZERO(b); if (ret) goto err_root; if (b != btree_node_root(c, b)) { - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); goto retry_root; } @@ -728,7 +776,7 @@ static int bch2_gc_btree(struct btree_trans *trans, struct bkey_s_c k = bkey_i_to_s_c(&b->key); ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial); err_root: - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); err: bch_err_fn(c, ret); @@ -742,8 +790,8 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) static int bch2_gc_btrees(struct bch_fs *c) { - struct btree_trans *trans = bch2_trans_get(c); - struct printbuf buf = PRINTBUF; + CLASS(btree_trans, trans)(c); + CLASS(printbuf, buf)(); int ret = 0; struct progress_indicator_state progress; @@ -763,8 +811,6 @@ static int bch2_gc_btrees(struct bch_fs *c) ret = bch2_gc_btree(trans, &progress, btree, true); } - printbuf_exit(&buf); - bch2_trans_put(trans); bch_err_fn(c, ret); return ret; } @@ -916,16 +962,16 @@ static int bch2_alloc_write_key(struct btree_trans *trans, static int bch2_gc_alloc_done(struct bch_fs *c) { + CLASS(btree_trans, trans)(c); int ret = 0; for_each_member_device(c, ca) { - ret = bch2_trans_run(c, - for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, + ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, ca->mi.first_bucket), POS(ca->dev_idx, ca->mi.nbuckets - 1), BTREE_ITER_slots|BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_alloc_write_key(trans, &iter, ca, k))); + bch2_alloc_write_key(trans, &iter, ca, k)); if (ret) { bch2_dev_put(ca); break; @@ -944,7 +990,7 @@ static int bch2_gc_alloc_start(struct bch_fs *c) ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL); if (ret) { bch2_dev_put(ca); - ret = -BCH_ERR_ENOMEM_gc_alloc_start; + ret = bch_err_throw(c, ENOMEM_gc_alloc_start); break; } } @@ -958,7 +1004,7 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); const struct bch_stripe *s; struct gc_stripe *m; bool bad = false; @@ -1003,18 +1049,17 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans, ret = bch2_trans_update(trans, iter, &new->k_i, 0); } fsck_err: - printbuf_exit(&buf); return ret; } static int bch2_gc_stripes_done(struct bch_fs *c) { - return bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, + CLASS(btree_trans, trans)(c); + return for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, POS_MIN, 
BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_gc_write_stripes_key(trans, &iter, k))); + bch2_gc_write_stripes_key(trans, &iter, k)); } /** @@ -1043,8 +1088,8 @@ int bch2_check_allocations(struct bch_fs *c) { int ret; - down_read(&c->state_lock); - down_write(&c->gc_lock); + guard(rwsem_read)(&c->state_lock); + guard(rwsem_write)(&c->gc_lock); bch2_btree_interior_updates_flush(c); @@ -1073,22 +1118,21 @@ int bch2_check_allocations(struct bch_fs *c) bch2_gc_stripes_done(c) ?: bch2_gc_reflink_done(c); out: - percpu_down_write(&c->mark_lock); - /* Indicates that gc is no longer in progress: */ - __gc_pos_set(c, gc_phase(GC_PHASE_not_running)); - - bch2_gc_free(c); - percpu_up_write(&c->mark_lock); - - up_write(&c->gc_lock); - up_read(&c->state_lock); + scoped_guard(percpu_write, &c->mark_lock) { + /* Indicates that gc is no longer in progress: */ + __gc_pos_set(c, gc_phase(GC_PHASE_not_running)); + bch2_gc_free(c); + } /* * At startup, allocations can happen directly instead of via the * allocator thread - issue wakeup in case they blocked on gc_lock: */ closure_wake_up(&c->freelist_wait); - bch_err_fn(c, ret); + + if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags)) + bch2_sb_members_clean_deleted(c); + return ret; } @@ -1098,42 +1142,41 @@ static int gc_btree_gens_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bkey_i *u; - int ret; if (unlikely(test_bit(BCH_FS_going_ro, &c->flags))) return -EROFS; - rcu_read_lock(); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ca) - continue; + bool too_stale = false; + scoped_guard(rcu) { + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) + continue; - if (dev_ptr_stale(ca, ptr) > 16) { - rcu_read_unlock(); - goto update; + too_stale |= dev_ptr_stale(ca, ptr) > 16; } + + if (!too_stale) + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) + continue; + + u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; + if (gen_after(*gen, ptr->gen)) + *gen = ptr->gen; + } } - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ca) - continue; + if (too_stale) { + struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k, 0); + int ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; - u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; - if (gen_after(*gen, ptr->gen)) - *gen = ptr->gen; + bch2_extent_normalize(c, bkey_i_to_s(u)); } - rcu_read_unlock(); - return 0; -update: - u = bch2_bkey_make_mut(trans, iter, &k, 0); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - bch2_extent_normalize(c, bkey_i_to_s(u)); return 0; } @@ -1186,7 +1229,7 @@ int bch2_gc_gens(struct bch_fs *c) ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL); if (!ca->oldest_gen) { bch2_dev_put(ca); - ret = -BCH_ERR_ENOMEM_gc_gens; + ret = bch_err_throw(c, ENOMEM_gc_gens); goto err; } @@ -1222,7 +1265,7 @@ int bch2_gc_gens(struct bch_fs *c) BCH_TRANS_COMMIT_no_enospc, ({ ca = bch2_dev_iterate(c, ca, k.k->p.inode); if (!ca) { - bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); continue; } bch2_alloc_write_oldest_gen(trans, ca, &iter, k); @@ -1256,26 +1299,21 @@ static void bch2_gc_gens_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work); bch2_gc_gens(c); - bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); + 
enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens); } void bch2_gc_gens_async(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) && + if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_gc_gens) && !queue_work(c->write_ref_wq, &c->gc_gens_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens); } -void bch2_fs_btree_gc_exit(struct bch_fs *c) -{ -} - -int bch2_fs_btree_gc_init(struct bch_fs *c) +void bch2_fs_btree_gc_init_early(struct bch_fs *c) { seqcount_init(&c->gc_pos_lock); INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work); init_rwsem(&c->gc_lock); mutex_init(&c->gc_gens_lock); - return 0; } diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 9693a90a48a2..ec77662369a2 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -83,7 +83,6 @@ void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *); int bch2_gc_gens(struct bch_fs *); void bch2_gc_gens_async(struct bch_fs *); -void bch2_fs_btree_gc_exit(struct bch_fs *); -int bch2_fs_btree_gc_init(struct bch_fs *); +void bch2_fs_btree_gc_init_early(struct bch_fs *); #endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 60782f3e5aec..8a03cd75a64f 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "async_objs.h" #include "bkey_buf.h" #include "bkey_methods.h" #include "bkey_sort.h" @@ -13,6 +14,7 @@ #include "buckets.h" #include "checksum.h" #include "debug.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "io_write.h" @@ -22,8 +24,15 @@ #include "super-io.h" #include "trace.h" +#include #include +#ifdef CONFIG_BCACHEFS_DEBUG +static unsigned bch2_btree_read_corrupt_ratio; +module_param_named(btree_read_corrupt_ratio, bch2_btree_read_corrupt_ratio, uint, 0644); +MODULE_PARM_DESC(btree_read_corrupt_ratio, ""); +#endif + static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn) { bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn)); @@ -514,19 +523,23 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct bch_dev *ca, + bool print_pos, struct btree *b, struct bset *i, struct bkey_packed *k, - unsigned offset, int write) + unsigned offset, int rw) { - prt_printf(out, bch2_log_msg(c, "%s"), - write == READ - ? "error validating btree node " - : "corrupt btree node before write "); + if (print_pos) { + prt_str(out, rw == READ + ? 
"error validating btree node " + : "corrupt btree node before write "); + prt_printf(out, "at btree "); + bch2_btree_pos_to_text(out, c, b); + prt_newline(out); + } + if (ca) - prt_printf(out, "on %s ", ca->name); - prt_printf(out, "at btree "); - bch2_btree_pos_to_text(out, c, b); + prt_printf(out, "%s ", ca->name); - prt_printf(out, "\nnode offset %u/%u", + prt_printf(out, "node offset %u/%u", b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key))); if (i) prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); @@ -537,93 +550,127 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, prt_str(out, ": "); } -__printf(10, 11) +__printf(11, 12) static int __btree_err(int ret, struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, struct bkey_packed *k, - int write, - bool have_retry, + int rw, enum bch_sb_error_id err_type, + struct bch_io_failures *failed, + struct printbuf *err_msg, const char *fmt, ...) { - bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes; + if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) + return ret == -BCH_ERR_btree_node_read_err_fixable + ? bch_err_throw(c, fsck_fix) + : ret; + + bool have_retry = false; + int ret2; + + if (ca) { + bch2_mark_btree_validate_failure(failed, ca->dev_idx); + + struct extent_ptr_decoded pick; + have_retry = bch2_bkey_pick_read_device(c, + bkey_i_to_s_c(&b->key), + failed, &pick, -1) == 1; + } if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) - ret = -BCH_ERR_btree_node_read_err_fixable; + ret = bch_err_throw(c, btree_node_read_err_fixable); if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) - ret = -BCH_ERR_btree_node_read_err_bad_node; + ret = bch_err_throw(c, btree_node_read_err_bad_node); - if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable) - bch2_sb_error_count(c, err_type); + bch2_sb_error_count(c, err_type); - struct printbuf out = PRINTBUF; - if (write != WRITE && ret != -BCH_ERR_btree_node_read_err_fixable) { - printbuf_indent_add_nextline(&out, 2); -#ifdef BCACHEFS_LOG_PREFIX - prt_printf(&out, bch2_log_msg(c, "")); -#endif - } + bool print_deferred = err_msg && + rw == READ && + !(test_bit(BCH_FS_in_fsck, &c->flags) && + c->opts.fix_errors == FSCK_FIX_ask); + + CLASS(printbuf, out)(); + bch2_log_msg_start(c, &out); - btree_err_msg(&out, c, ca, b, i, k, b->written, write); + if (!print_deferred) + err_msg = &out; + + btree_err_msg(err_msg, c, ca, !print_deferred, b, i, k, b->written, rw); va_list args; va_start(args, fmt); - prt_vprintf(&out, fmt, args); + prt_vprintf(err_msg, fmt, args); va_end(args); - if (write == WRITE) { + if (print_deferred) { + prt_newline(err_msg); + + switch (ret) { + case -BCH_ERR_btree_node_read_err_fixable: + ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type); + if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) && + !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) { + ret = ret2; + goto fsck_err; + } + + if (!have_retry) + ret = bch_err_throw(c, fsck_fix); + return ret; + case -BCH_ERR_btree_node_read_err_bad_node: + prt_str(&out, ", "); + break; + } + + return ret; + } + + if (rw == WRITE) { prt_str(&out, ", "); ret = __bch2_inconsistent_error(c, &out) ? -BCH_ERR_fsck_errors_not_fixed : 0; - silent = false; + goto print; } switch (ret) { case -BCH_ERR_btree_node_read_err_fixable: - ret = !silent - ? 
__bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf) - : -BCH_ERR_fsck_fix; - if (ret != -BCH_ERR_fsck_fix && - ret != -BCH_ERR_fsck_ignore) + ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf); + if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) && + !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) { + ret = ret2; goto fsck_err; - ret = -BCH_ERR_fsck_fix; - goto out; + } + + if (!have_retry) + ret = bch_err_throw(c, fsck_fix); + return ret; case -BCH_ERR_btree_node_read_err_bad_node: prt_str(&out, ", "); - ret = __bch2_topology_error(c, &out); - if (ret) - silent = false; - break; - case -BCH_ERR_btree_node_read_err_incompatible: - ret = -BCH_ERR_fsck_errors_not_fixed; - silent = false; break; } - - if (!silent) - bch2_print_string_as_lines(KERN_ERR, out.buf); -out: +print: + bch2_print_str(c, KERN_ERR, out.buf); fsck_err: - printbuf_exit(&out); return ret; } #define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \ ({ \ - int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \ + int _ret = __btree_err(type, c, ca, b, i, k, write, \ BCH_FSCK_ERR_##_err_type, \ + failed, err_msg, \ msg, ##__VA_ARGS__); \ \ - if (_ret != -BCH_ERR_fsck_fix) { \ + if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix)) { \ ret = _ret; \ goto fsck_err; \ } \ \ - *saw_error = true; \ + true; \ }) #define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false) @@ -681,13 +728,13 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) static int validate_bset(struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, - unsigned offset, unsigned sectors, - int write, bool have_retry, bool *saw_error) + unsigned offset, int write, + struct bch_io_failures *failed, + struct printbuf *err_msg) { unsigned version = le16_to_cpu(i->version); - unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; + CLASS(printbuf, buf1)(); + CLASS(printbuf, buf2)(); int ret = 0; btree_err_on(!bch2_version_compatible(version), @@ -698,16 +745,21 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BCH_VERSION_MAJOR(version), BCH_VERSION_MINOR(version)); - if (btree_err_on(version < c->sb.version_min, + if (c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes && + btree_err_on(version < c->sb.version_min, -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, NULL, btree_node_bset_older_than_sb_min, "bset version %u older than superblock version_min %u", version, c->sb.version_min)) { - mutex_lock(&c->sb_lock); - c->disk_sb.sb->version_min = cpu_to_le16(version); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); + if (bch2_version_compatible(version)) { + guard(mutex)(&c->sb_lock); + c->disk_sb.sb->version_min = cpu_to_le16(version); + bch2_write_super(c); + } else { + /* We have no idea what's going on: */ + i->version = cpu_to_le16(c->sb.version); + } } if (btree_err_on(BCH_VERSION_MAJOR(version) > @@ -717,10 +769,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_node_bset_newer_than_sb, "bset version %u newer than superblock version %u", version, c->sb.version)) { - mutex_lock(&c->sb_lock); + guard(mutex)(&c->sb_lock); c->disk_sb.sb->version = cpu_to_le16(version); bch2_write_super(c); - mutex_unlock(&c->sb_lock); } btree_err_on(BSET_SEPARATE_WHITEOUTS(i), @@ -729,15 +780,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_node_unsupported_version, "BSET_SEPARATE_WHITEOUTS no longer supported"); - if (!write && - 
btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)), - -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, NULL, - bset_past_end_of_btree_node, - "bset past end of btree node (offset %u len %u but written %zu)", - offset, sectors, ptr_written ?: btree_sectors(c))) - i->u64s = 0; - btree_err_on(offset && !i->u64s, -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, NULL, @@ -829,8 +871,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, &bn->format); } fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); return ret; } @@ -895,11 +935,12 @@ static inline int btree_node_read_bkey_cmp(const struct btree *b, static int validate_bset_keys(struct bch_fs *c, struct btree *b, struct bset *i, int write, - bool have_retry, bool *saw_error) + struct bch_io_failures *failed, + struct printbuf *err_msg) { unsigned version = le16_to_cpu(i->version); struct bkey_packed *k, *prev = NULL; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); int ret = 0; @@ -1001,14 +1042,16 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, le16_add_cpu(&i->u64s, -next_good_key); memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k); set_btree_node_need_rewrite(b); + set_btree_node_need_rewrite_error(b); } fsck_err: - printbuf_exit(&buf); return ret; } int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - struct btree *b, bool have_retry, bool *saw_error) + struct btree *b, + struct bch_io_failures *failed, + struct printbuf *err_msg) { struct btree_node_entry *bne; struct sort_iter *iter; @@ -1018,11 +1061,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bool used_mempool, blacklisted; bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); - unsigned u64s; unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); u64 max_journal_seq = 0; - struct printbuf buf = PRINTBUF; - int ret = 0, retry_read = 0, write = READ; + CLASS(printbuf, buf)(); + int ret = 0, write = READ; u64 start_time = local_clock(); b->version_ondisk = U16_MAX; @@ -1099,6 +1141,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); if (first) { + sectors = vstruct_sectors(b->data, c->block_bits); + if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)), + -BCH_ERR_btree_node_read_err_fixable, + c, ca, b, i, NULL, + bset_past_end_of_btree_node, + "bset past end of btree node (offset %u len %u but written %zu)", + b->written, sectors, ptr_written ?: btree_sectors(c))) + i->u64s = 0; if (good_csum_type) { struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); bool csum_bad = bch2_crc_cmp(b->data->csum, csum); @@ -1126,9 +1176,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, c, NULL, b, NULL, NULL, btree_node_unsupported_version, "btree node does not have NEW_EXTENT_OVERWRITE set"); - - sectors = vstruct_sectors(b->data, c->block_bits); } else { + sectors = vstruct_sectors(bne, c->block_bits); + if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)), + -BCH_ERR_btree_node_read_err_fixable, + c, ca, b, i, NULL, + bset_past_end_of_btree_node, + "bset past end of btree node (offset %u len %u but written %zu)", + b->written, sectors, ptr_written ?: btree_sectors(c))) + i->u64s = 
0; if (good_csum_type) { struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); bool csum_bad = bch2_crc_cmp(bne->csum, csum); if (csum_bad) bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); btree_err_on(csum_bad, -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, NULL, bset_bad_csum, "%s", (printbuf_reset(&buf), bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), buf.buf)); ret = bset_encrypt(c, i, b->written << 9); if (bch2_fs_fatal_err_on(ret, c, @@ -1149,22 +1205,19 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, "decrypting btree node: %s", bch2_err_str(ret))) goto fsck_err; } - - sectors = vstruct_sectors(bne, c->block_bits); } b->version_ondisk = min(b->version_ondisk, le16_to_cpu(i->version)); - ret = validate_bset(c, ca, b, i, b->written, sectors, - READ, have_retry, saw_error); + ret = validate_bset(c, ca, b, i, b->written, READ, failed, err_msg); if (ret) goto fsck_err; if (!b->written) btree_node_set_format(b, b->data->format); - ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error); + ret = validate_bset_keys(c, b, i, READ, failed, err_msg); if (ret) goto fsck_err; @@ -1225,29 +1278,23 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool); sorted->keys.u64s = 0; - set_btree_bset(b, b->set, &b->data->keys); - b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0, btree_buf_bytes(b) - sizeof(struct btree_node) - b->nr.live_u64s * sizeof(u64)); - u64s = le16_to_cpu(sorted->keys.u64s); + b->data->keys.u64s = sorted->keys.u64s; *sorted = *b->data; - sorted->keys.u64s = cpu_to_le16(u64s); swap(sorted, b->data); set_btree_bset(b, b->set, &b->data->keys); b->nsets = 1; b->data->keys.journal_seq = cpu_to_le64(max_journal_seq); - BUG_ON(b->nr.live_u64s != u64s); + BUG_ON(b->nr.live_u64s != le16_to_cpu(b->data->keys.u64s)); btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted); - if (updated_range) - bch2_btree_node_drop_keys_outside_node(b); - i = &b->data->keys; for (k = i->start; k != vstruct_last(i);) { struct bkey tmp; @@ -1255,7 +1302,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, ret = btree_node_bkey_val_validate(c, b, u.s_c, READ); if (ret == -BCH_ERR_fsck_delete_bkey || - (bch2_inject_invalid_keys && + (static_branch_unlikely(&bch2_inject_invalid_keys) && !bversion_cmp(u.k->bversion, MAX_VERSION))) { btree_keys_account_key_drop(&b->nr, 0, k); @@ -1264,6 +1311,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, (u64 *) vstruct_end(i) - (u64 *) k); set_btree_bset_end(b, b->set); set_btree_node_need_rewrite(b); + set_btree_node_need_rewrite_error(b); continue; } if (ret) @@ -1284,31 +1332,54 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_node_reset_sib_u64s(b); - rcu_read_lock(); - bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { - struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); + if (updated_range) + bch2_btree_node_drop_keys_outside_node(b); - if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) - set_btree_node_need_rewrite(b); + /* + * XXX: + * + * We deadlock if too many btree updates require node rewrites while + * we're still in journal replay. + * + * This is because btree node rewrites generate more updates for the + * interior updates (alloc, backpointers), and if those updates touch + * new nodes and generate more rewrites - well, you see the problem. + * + * The biggest cause is that we don't use the btree write buffer (for + * the backpointer updates) - this needs some real thought on locking in + * order to fix. 
+ * + * The problem with this workaround (not doing the rewrite for degraded + * nodes in journal replay) is that those degraded nodes persist, and we + * don't want that (this is a real bug when a btree node write completes + * with fewer replicas than we wanted and leaves a degraded node due to + * device _removal_, i.e. the device went away mid write). + * + * It's less of a bug here, but still a problem because we don't yet + * have a way of tracking degraded data - we need another index (all + * extents/btree nodes, by replicas entry) in order to fix properly + * (re-replicate degraded data at the earliest possible time). + */ + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) { + scoped_guard(rcu) + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { + struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); + + if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) { + set_btree_node_need_rewrite(b); + set_btree_node_need_rewrite_degraded(b); + } + } } - rcu_read_unlock(); - if (!ptr_written) + if (!ptr_written) { set_btree_node_need_rewrite(b); -out: + set_btree_node_need_rewrite_ptr_written_zero(b); + } +fsck_err: mempool_free(iter, &c->fill_iter); - printbuf_exit(&buf); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time); - return retry_read; -fsck_err: - if (ret == -BCH_ERR_btree_node_read_err_want_retry || - ret == -BCH_ERR_btree_node_read_err_must_retry) { - retry_read = 1; - } else { - set_btree_node_read_error(b); - bch2_btree_lost_data(c, b->c.btree_id); - } - goto out; + return ret; } static void btree_node_read_work(struct work_struct *work) @@ -1320,16 +1391,24 @@ static void btree_node_read_work(struct work_struct *work) struct btree *b = rb->b; struct bio *bio = &rb->bio; struct bch_io_failures failed = { .nr = 0 }; - struct printbuf buf = PRINTBUF; - bool saw_error = false; - bool retry = false; - bool can_retry; + int ret = 0; + + CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "btree node read error at btree "); + bch2_btree_pos_to_text(&buf, c, b); + prt_newline(&buf); goto start; while (1) { - retry = true; - bch_info(c, "retrying read"); - ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ); + ret = bch2_bkey_pick_read_device(c, + bkey_i_to_s_c(&b->key), + &failed, &rb->pick, -1); + if (ret <= 0) + break; + + ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read); rb->have_ioref = ca != NULL; rb->start_time = local_clock(); bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); @@ -1346,60 +1425,58 @@ static void btree_node_read_work(struct work_struct *work) bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, rb->start_time, !bio->bi_status); start: - printbuf_reset(&buf); - bch2_btree_pos_to_text(&buf, c, b); - - if (ca && bio->bi_status) - bch_err_dev_ratelimited(ca, - "btree read error %s for %s", - bch2_blk_status_to_str(bio->bi_status), buf.buf); if (rb->have_ioref) - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_read); rb->have_ioref = false; - bch2_mark_io_failure(&failed, &rb->pick, false); - - can_retry = bch2_bkey_pick_read_device(c, - bkey_i_to_s_c(&b->key), - &failed, &rb->pick, -1) > 0; - - if (!bio->bi_status && - !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) { - if (retry) - bch_info(c, "retry success"); - break; + if (bio->bi_status) { + bch2_mark_io_failure(&failed, &rb->pick, false); + continue; } - saw_error = true; + memset(&bio->bi_iter, 0, sizeof(bio->bi_iter)); +
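/*
 * Several hunks above replace bch2_write_ref_tryget()/bch2_write_ref_put() and
 * percpu_ref operations with enumerated_ref_tryget()/enumerated_ref_put(),
 * where every get/put names its call site via an enum tag
 * (BCH_WRITE_REF_gc_gens, BCH_DEV_READ_REF_btree_node_read, ...).  The point
 * is debuggability: when a shutdown hangs waiting for a ref, the holder can be
 * reported by name.  The sketch below mirrors the idea only; the real
 * implementation is the new fs/bcachefs/enumerated_ref.[ch], which is more
 * careful (per-tag counters in debug builds, percpu operation):
 */
#include <linux/atomic.h>

struct example_enumerated_ref {
	atomic_long_t	count;
	atomic_long_t	by_tag[8];	/* one counter per enum tag */
};

static inline void example_ref_get(struct example_enumerated_ref *r, unsigned tag)
{
	atomic_long_inc(&r->by_tag[tag]);
	atomic_long_inc(&r->count);
}

static inline void example_ref_put(struct example_enumerated_ref *r, unsigned tag)
{
	atomic_long_dec(&r->by_tag[tag]);
	atomic_long_dec(&r->count);
	/* a stuck shutdown can dump by_tag[] to name the leaked holder */
}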
bio->bi_iter.bi_size = btree_buf_bytes(b); + + bch2_maybe_corrupt_bio(bio, bch2_btree_read_corrupt_ratio); - if (!can_retry) { - set_btree_node_read_error(b); - bch2_btree_lost_data(c, b->c.btree_id); + ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf); + if (ret != -BCH_ERR_btree_node_read_err_want_retry && + ret != -BCH_ERR_btree_node_read_err_must_retry) break; - } } - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], - rb->start_time); - bio_put(&rb->bio); + bch2_io_failures_to_text(&buf, c, &failed); + + /* + * only print retry success if we read from a replica with no errors + */ + if (ret) { + set_btree_node_read_error(b); + bch2_btree_lost_data(c, &buf, b->c.btree_id); + prt_printf(&buf, "ret %s", bch2_err_str(ret)); + } else if (failed.nr) { + if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev)) + prt_printf(&buf, "retry success"); + else + prt_printf(&buf, "repair success"); + } - if ((saw_error || + if ((failed.nr || btree_node_need_rewrite(b)) && !btree_node_read_error(b) && - c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { - if (saw_error) { - printbuf_reset(&buf); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s", - __func__, buf.buf); - } - + c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { + prt_printf(&buf, " (rewriting node)"); bch2_btree_node_rewrite_async(c, b); } + prt_newline(&buf); + + if (failed.nr) + bch2_print_str_ratelimited(c, KERN_ERR, buf.buf); - printbuf_exit(&buf); + async_object_list_del(c, btree_read_bio, rb->list_idx); + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], + rb->start_time); + bio_put(&rb->bio); clear_btree_node_read_in_flight(b); smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); @@ -1419,6 +1496,11 @@ static void btree_node_read_endio(struct bio *bio) queue_work(c->btree_read_complete_wq, &rb->work); } +void bch2_btree_read_bio_to_text(struct printbuf *out, struct btree_read_bio *rbio) +{ + bch2_bio_to_text(out, &rbio->bio); +} + struct btree_node_read_all { struct closure cl; struct bch_fs *c; @@ -1476,14 +1558,15 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) closure_type(ra, struct btree_node_read_all, cl); struct bch_fs *c = ra->c; struct btree *b = ra->b; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bool dump_bset_maps = false; - bool have_retry = false; int ret = 0, best = -1, write = READ; unsigned i, written = 0, written2 = 0; __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 ? 
bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; bool _saw_error = false, *saw_error = &_saw_error; + struct printbuf *err_msg = NULL; + struct bch_io_failures *failed = NULL; for (i = 0; i < ra->nr; i++) { struct btree_node *bn = ra->buf[i]; @@ -1576,14 +1659,18 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) if (best >= 0) { memcpy(b->data, ra->buf[best], btree_buf_bytes(b)); - ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error); + ret = bch2_btree_node_read_done(c, NULL, b, NULL, NULL); } else { ret = -1; } if (ret) { set_btree_node_read_error(b); - bch2_btree_lost_data(c, b->c.btree_id); + + CLASS(printbuf, buf)(); + bch2_btree_lost_data(c, &buf, b->c.btree_id); + if (buf.pos) + bch_err(c, "%s", buf.buf); } else if (*saw_error) bch2_btree_node_rewrite_async(c, b); @@ -1594,7 +1681,6 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) closure_debug_destroy(&ra->cl); kfree(ra); - printbuf_exit(&buf); clear_btree_node_read_in_flight(b); smp_mb__after_atomic(); @@ -1612,7 +1698,8 @@ static void btree_node_read_all_replicas_endio(struct bio *bio) struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); bch2_latency_acct(ca, rb->start_time, READ); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_btree_node_read_all_replicas); } ra->err[rb->idx] = bio->bi_status; @@ -1634,7 +1721,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool ra = kzalloc(sizeof(*ra), GFP_NOFS); if (!ra) - return -BCH_ERR_ENOMEM_btree_node_read_all_replicas; + return bch_err_throw(c, ENOMEM_btree_node_read_all_replicas); closure_init(&ra->cl, NULL); ra->c = c; @@ -1652,7 +1739,8 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool i = 0; bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, + BCH_DEV_READ_REF_btree_node_read_all_replicas); struct btree_read_bio *rb = container_of(ra->bio[i], struct btree_read_bio, bio); rb->c = c; @@ -1701,9 +1789,9 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, struct bio *bio; int ret; - trace_and_count(c, btree_node_read, trans, b); + trace_btree_node(c, b, btree_node_read); - if (bch2_verify_all_btree_replicas && + if (static_branch_unlikely(&bch2_verify_all_btree_replicas) && !btree_node_read_all_replicas(c, b, sync)) return; @@ -1711,26 +1799,33 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, NULL, &pick, -1); if (ret <= 0) { - struct printbuf buf = PRINTBUF; + bool ratelimit = true; + CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); prt_str(&buf, "btree node read error: no device to read from\n at "); bch2_btree_pos_to_text(&buf, c, b); - bch_err_ratelimited(c, "%s", buf.buf); + prt_newline(&buf); + bch2_btree_lost_data(c, &buf, b->c.btree_id); - if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) - bch2_fatal_error(c); + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && + bch2_fs_emergency_read_only2(c, &buf)) + ratelimit = false; + + static DEFINE_RATELIMIT_STATE(rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + if (!ratelimit || __ratelimit(&rs)) + bch2_print_str(c, KERN_ERR, buf.buf); set_btree_node_read_error(b); - bch2_btree_lost_data(c, b->c.btree_id); clear_btree_node_read_in_flight(b); smp_mb__after_atomic(); 
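/*
 * Usage sketch for the ratelimit pattern introduced above: a static
 * DEFINE_RATELIMIT_STATE() from <linux/ratelimit.h> allows roughly `burst`
 * messages per `interval`, and __ratelimit() returns true while that quota
 * lasts, so a flood of identical read errors cannot drown the log.
 * example_report() is an illustrative name, not part of this patch:
 */
#include <linux/printk.h>
#include <linux/ratelimit.h>

static void example_report(const char *msg)
{
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	if (__ratelimit(&rs))		/* true while under the burst quota */
		pr_err("%s\n", msg);
}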
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); - printbuf_exit(&buf); return; } - ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read); bio = bio_alloc_bioset(NULL, buf_pages(b->data, btree_buf_bytes(b)), @@ -1749,6 +1844,8 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, bio->bi_end_io = btree_node_read_endio; bch2_bio_map(bio, b->data, btree_buf_bytes(b)); + async_object_list_add(c, btree_read_bio, rb, &rb->list_idx); + if (rb->have_ioref) { this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], bio_sectors(bio)); @@ -1801,11 +1898,10 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, bch2_btree_node_read(trans, b, true); if (btree_node_read_error(b)) { - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_unlock(&c->btree_cache.lock); + scoped_guard(mutex, &c->btree_cache.lock) + bch2_btree_node_hash_remove(&c->btree_cache, b); - ret = -BCH_ERR_btree_node_read_error; + ret = bch_err_throw(c, btree_node_read_error); goto err; } @@ -1820,7 +1916,8 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, const struct bkey_i *k, unsigned level) { - return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level)); + CLASS(btree_trans, trans)(c); + return __bch2_btree_root_read(trans, id, k, level); } struct btree_node_scrub { @@ -1899,43 +1996,26 @@ static void btree_node_scrub_work(struct work_struct *work) { struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work); struct bch_fs *c = scrub->c; - struct printbuf err = PRINTBUF; + CLASS(printbuf, err)(); __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level, bkey_i_to_s_c(scrub->key.k)); prt_newline(&err); if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) { - struct btree_trans *trans = bch2_trans_get(c); - - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, scrub->btree, - scrub->key.k->k.p, 0, scrub->level - 1, 0); - - struct btree *b; - int ret = lockrestart_do(trans, - PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(trans, &iter))); - if (ret) - goto err; - - if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) { - bch_err(c, "error validating btree node during scrub on %s at btree %s", - scrub->ca->name, err.buf); - - ret = bch2_btree_node_rewrite(trans, &iter, b, 0); - } -err: - bch2_trans_iter_exit(trans, &iter); - bch2_trans_begin(trans); - bch2_trans_put(trans); + int ret = bch2_trans_do(c, + bch2_btree_node_rewrite_key(trans, scrub->btree, scrub->level - 1, + scrub->key.k, 0)); + if (!bch2_err_matches(ret, ENOENT) && + !bch2_err_matches(ret, EROFS)) + bch_err_fn_ratelimited(c, ret); } - printbuf_exit(&err); - bch2_bkey_buf_exit(&scrub->key, c);; + bch2_bkey_buf_exit(&scrub->key, c); btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf); - percpu_ref_put(&scrub->ca->io_ref[READ]); + enumerated_ref_put(&scrub->ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub); kfree(scrub); - bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub); } static void btree_node_scrub_endio(struct bio *bio) @@ -1954,17 +2034,18 @@ int bch2_btree_node_scrub(struct btree_trans *trans, struct bch_fs *c = trans->c; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_node_scrub)) - return -BCH_ERR_erofs_no_writes; + if 
(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub)) + return bch_err_throw(c, erofs_no_writes); struct extent_ptr_decoded pick; int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev); if (ret <= 0) goto err; - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, + BCH_DEV_READ_REF_btree_node_scrub); if (!ca) { - ret = -BCH_ERR_device_offline; + ret = bch_err_throw(c, device_offline); goto err; } @@ -2002,9 +2083,9 @@ int bch2_btree_node_scrub(struct btree_trans *trans, return 0; err_free: btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub); err: - bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub); return ret; } @@ -2101,7 +2182,7 @@ static void btree_node_write_work(struct work_struct *work) bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) { - ret = -BCH_ERR_btree_node_write_all_failed; + ret = bch_err_throw(c, btree_node_write_all_failed); goto err; } @@ -2110,7 +2191,8 @@ static void btree_node_write_work(struct work_struct *work) } } else { - ret = bch2_trans_do(c, + CLASS(btree_trans, trans)(c); + ret = lockrestart_do(trans, bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, BCH_WATERMARK_interior_updates| BCH_TRANS_COMMIT_journal_reclaim| @@ -2121,6 +2203,7 @@ static void btree_node_write_work(struct work_struct *work) goto err; } out: + async_object_list_del(c, btree_write_bio, wbio->list_idx); bio_put(&wbio->wbio.bio); btree_node_write_done(c, b, start_time); return; @@ -2128,11 +2211,10 @@ static void btree_node_write_work(struct work_struct *work) set_btree_node_noevict(b); if (!bch2_err_matches(ret, EROFS)) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret)); bch2_btree_pos_to_text(&buf, c, b); bch2_fs_fatal_error(c, "%s", buf.buf); - printbuf_exit(&buf); } goto out; } @@ -2151,13 +2233,12 @@ static void btree_node_write_endio(struct bio *bio) wbio->submit_time, !bio->bi_status); if (ca && bio->bi_status) { - struct printbuf buf = PRINTBUF; - buf.atomic++; + CLASS(printbuf, buf)(); + guard(printbuf_atomic)(&buf); prt_printf(&buf, "btree write error: %s\n ", bch2_blk_status_to_str(bio->bi_status)); bch2_btree_pos_to_text(&buf, c, b); bch_err_dev_ratelimited(ca, "%s", buf.buf); - printbuf_exit(&buf); } if (bio->bi_status) { @@ -2172,7 +2253,8 @@ static void btree_node_write_endio(struct bio *bio) * btree writes yet (due to device removal/ro): */ if (wbio->have_ioref) - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_btree_node_write); if (parent) { bio_put(bio); @@ -2184,14 +2266,12 @@ static void btree_node_write_endio(struct bio *bio) smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner); INIT_WORK(&wb->work, btree_node_write_work); - queue_work(c->btree_io_complete_wq, &wb->work); + queue_work(c->btree_write_complete_wq, &wb->work); } static int validate_bset_for_write(struct bch_fs *c, struct btree *b, - struct bset *i, unsigned sectors) + struct bset *i) { - bool saw_error; - int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), (struct bkey_validate_context) { .from = BKEY_VALIDATE_btree_node, @@ -2204,8 +2284,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct 
btree *b, return ret; } - ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?: - validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error); + ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?: + validate_bset(c, NULL, b, i, b->written, WRITE, NULL, NULL); if (ret) { bch2_inconsistent_error(c); dump_stack(); @@ -2398,7 +2478,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) /* if we're going to be encrypting, check metadata validity first: */ if (validate_before_checksum && - validate_bset_for_write(c, b, i, sectors_to_write)) + validate_bset_for_write(c, b, i)) goto err; ret = bset_encrypt(c, i, b->written << 9); @@ -2415,7 +2495,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) /* if we're not encrypting, check metadata after checksumming: */ if (!validate_before_checksum && - validate_bset_for_write(c, b, i, sectors_to_write)) + validate_bset_for_write(c, b, i)) goto err; /* @@ -2440,11 +2520,26 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) c->opts.nochanges) goto err; - trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write); + if (trace_btree_node_write_enabled()) { + CLASS(printbuf, buf)(); + printbuf_indent_add(&buf, 2); + prt_printf(&buf, "offset %u sectors %u bytes %u\n", + b->written, + sectors_to_write, + bytes_to_write); + bch2_btree_pos_to_text(&buf, c, b); + trace_btree_node_write(c, buf.buf); + } + count_event(c, btree_node_write); + + /* + * blk-wbt.c throttles all writes except those that have both REQ_SYNC + * and REQ_IDLE set... + */ wbio = container_of(bio_alloc_bioset(NULL, buf_pages(data, sectors_to_write << 9), - REQ_OP_WRITE|REQ_META, + REQ_OP_WRITE|REQ_META|REQ_SYNC|REQ_IDLE, GFP_NOFS, &c->btree_bio), struct btree_write_bio, wbio.bio); @@ -2472,6 +2567,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) atomic64_inc(&c->btree_write_stats[type].nr); atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); + async_object_list_add(c, btree_write_bio, wbio, &wbio->list_idx); + INIT_WORK(&wbio->work, btree_write_submit); queue_work(c->btree_write_submit_wq, &wbio->work); return; diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index dbf76d22c660..30a5180532c8 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -41,6 +41,9 @@ struct btree_read_bio { u64 start_time; unsigned have_ioref:1; unsigned idx:7; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + unsigned list_idx; +#endif struct extent_ptr_decoded pick; struct work_struct work; struct bio bio; @@ -53,6 +56,9 @@ struct btree_write_bio { unsigned data_bytes; unsigned sector_offset; u64 start_time; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + unsigned list_idx; +#endif struct bch_write_bio wbio; }; @@ -128,11 +134,15 @@ void bch2_btree_build_aux_trees(struct btree *); void bch2_btree_init_next(struct btree_trans *, struct btree *); int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, - struct btree *, bool, bool *); + struct btree *, + struct bch_io_failures *, + struct printbuf *); void bch2_btree_node_read(struct btree_trans *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); +void bch2_btree_read_bio_to_text(struct printbuf *, struct btree_read_bio *); + int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, unsigned); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c 
index ac5f2046550d..a67babf69d39 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -16,6 +16,7 @@ #include "journal_io.h" #include "replicas.h" #include "snapshot.h" +#include "super.h" #include "trace.h" #include @@ -114,11 +115,9 @@ static inline bool btree_path_pos_in_node(struct btree_path *path, !btree_path_pos_after_node(path, b); } -/* Btree iterator: */ +/* Debug: */ -#ifdef CONFIG_BCACHEFS_DEBUG - -static void bch2_btree_path_verify_cached(struct btree_trans *trans, +static void __bch2_btree_path_verify_cached(struct btree_trans *trans, struct btree_path *path) { struct bkey_cached *ck; @@ -135,7 +134,7 @@ static void bch2_btree_path_verify_cached(struct btree_trans *trans, btree_node_unlock(trans, path, 0); } -static void bch2_btree_path_verify_level(struct btree_trans *trans, +static void __bch2_btree_path_verify_level(struct btree_trans *trans, struct btree_path *path, unsigned level) { struct btree_path_level *l; @@ -147,16 +146,13 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, struct printbuf buf3 = PRINTBUF; const char *msg; - if (!bch2_debug_check_iterators) - return; - l = &path->l[level]; tmp = l->iter; locked = btree_node_locked(path, level); if (path->cached) { if (!level) - bch2_btree_path_verify_cached(trans, path); + __bch2_btree_path_verify_cached(trans, path); return; } @@ -217,7 +213,7 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, msg, level, buf1.buf, buf2.buf, buf3.buf); } -static void bch2_btree_path_verify(struct btree_trans *trans, +static void __bch2_btree_path_verify(struct btree_trans *trans, struct btree_path *path) { struct bch_fs *c = trans->c; @@ -229,23 +225,25 @@ static void bch2_btree_path_verify(struct btree_trans *trans, break; } - bch2_btree_path_verify_level(trans, path, i); + __bch2_btree_path_verify_level(trans, path, i); } - bch2_btree_path_verify_locks(path); + bch2_btree_path_verify_locks(trans, path); } -void bch2_trans_verify_paths(struct btree_trans *trans) +void __bch2_trans_verify_paths(struct btree_trans *trans) { struct btree_path *path; unsigned iter; trans_for_each_path(trans, path, iter) - bch2_btree_path_verify(trans, path); + __bch2_btree_path_verify(trans, path); } -static void bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter) +static void __bch2_btree_iter_verify(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; + BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached); BUG_ON((iter->flags & BTREE_ITER_is_extents) && @@ -256,11 +254,11 @@ static void bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter !btree_type_has_snapshot_field(iter->btree_id)); if (iter->update_path) - bch2_btree_path_verify(trans, &trans->paths[iter->update_path]); - bch2_btree_path_verify(trans, btree_iter_path(trans, iter)); + __bch2_btree_path_verify(trans, &trans->paths[iter->update_path]); + __bch2_btree_path_verify(trans, btree_iter_path(trans, iter)); } -static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) +static void __bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) { BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && !iter->pos.snapshot); @@ -274,15 +272,9 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) bkey_gt(iter->pos, iter->k.p))); } -static int bch2_btree_iter_verify_ret(struct btree_trans *trans, - struct btree_iter *iter, struct bkey_s_c k) +static int __bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) 
{ - struct btree_iter copy; - struct bkey_s_c prev; - int ret = 0; - - if (!bch2_debug_check_iterators) - return 0; + struct btree_trans *trans = iter->trans; if (!(iter->flags & BTREE_ITER_filter_snapshots)) return 0; @@ -294,16 +286,16 @@ static int bch2_btree_iter_verify_ret(struct btree_trans *trans, iter->snapshot, k.k->p.snapshot)); - bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos, - BTREE_ITER_nopreserve| - BTREE_ITER_all_snapshots); - prev = bch2_btree_iter_prev(trans, &copy); + CLASS(btree_iter, copy)(trans, iter->btree_id, iter->pos, + BTREE_ITER_nopreserve| + BTREE_ITER_all_snapshots); + struct bkey_s_c prev = bch2_btree_iter_prev(&copy); if (!prev.k) - goto out; + return 0; - ret = bkey_err(prev); + int ret = bkey_err(prev); if (ret) - goto out; + return ret; if (bkey_eq(prev.k->p, k.k->p) && bch2_snapshot_is_ancestor(trans->c, iter->snapshot, @@ -319,12 +311,11 @@ static int bch2_btree_iter_verify_ret(struct btree_trans *trans, iter->snapshot, buf1.buf, buf2.buf); } -out: - bch2_trans_iter_exit(trans, &copy); - return ret; + + return 0; } -void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, +void __bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, struct bpos pos) { bch2_trans_verify_not_unlocked_or_in_restart(trans); @@ -357,19 +348,39 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf); } -#else - static inline void bch2_btree_path_verify_level(struct btree_trans *trans, - struct btree_path *path, unsigned l) {} + struct btree_path *path, unsigned l) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_btree_path_verify_level(trans, path, l); +} + static inline void bch2_btree_path_verify(struct btree_trans *trans, - struct btree_path *path) {} -static inline void bch2_btree_iter_verify(struct btree_trans *trans, - struct btree_iter *iter) {} -static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} -static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) { return 0; } + struct btree_path *path) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_btree_path_verify(trans, path); +} -#endif +static inline void bch2_btree_iter_verify(struct btree_iter *iter) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_btree_iter_verify(iter); +} + +static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_btree_iter_verify_entry_exit(iter); +} + +static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, + struct bkey_s_c k) +{ + return static_branch_unlikely(&bch2_debug_check_iterators) + ? 
__bch2_btree_iter_verify_ret(iter, k) + : 0; +} /* Btree path: fixups after btree updates */ @@ -523,7 +534,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, __bch2_btree_node_iter_fix(path, b, node_iter, t, where, clobber_u64s, new_u64s); - if (bch2_debug_check_iterators) + if (static_branch_unlikely(&bch2_debug_check_iterators)) bch2_btree_node_iter_verify(node_iter, b); } @@ -631,6 +642,7 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str trans_for_each_update(trans, i) if (!i->cached && + !i->key_cache_flushing && i->level == b->c.level && i->btree_id == b->c.btree_id && bpos_cmp(i->k->k.p, b->data->min_key) >= 0 && @@ -876,8 +888,7 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, struct btree_path *path, - unsigned flags, - struct bkey_buf *out) + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; struct btree_path_level *l = path_l(path); @@ -889,7 +900,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, k = bch2_btree_and_journal_iter_peek(&jiter); if (!k.k) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); prt_str(&buf, "node not found at pos "); bch2_bpos_to_text(&buf, path->pos); @@ -897,11 +908,10 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, bch2_btree_pos_to_text(&buf, c, l->b); ret = bch2_fs_topology_error(c, "%s", buf.buf); - printbuf_exit(&buf); goto err; } - bch2_bkey_buf_reassemble(out, c, k); + bkey_reassemble(&trans->btree_path_down, k); if ((flags & BTREE_ITER_prefetch) && c->opts.btree_node_prefetch) @@ -912,9 +922,25 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, return ret; } +static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans, + struct btree_path *path) +{ + struct bch_fs *c = trans->c; + CLASS(printbuf, buf)(); + + prt_str(&buf, "node not found at pos "); + bch2_bpos_to_text(&buf, path->pos); + prt_str(&buf, " within parent node "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key)); + + bch2_fs_fatal_error(c, "%s", buf.buf); + return bch_err_throw(c, btree_need_topology_repair); +} + static __always_inline int btree_path_down(struct btree_trans *trans, struct btree_path *path, - unsigned flags, + enum btree_iter_update_trigger_flags flags, unsigned long trace_ip) { struct bch_fs *c = trans->c; @@ -922,51 +948,38 @@ static __always_inline int btree_path_down(struct btree_trans *trans, struct btree *b; unsigned level = path->level - 1; enum six_lock_type lock_type = __btree_lock_want(path, level); - struct bkey_buf tmp; int ret; EBUG_ON(!btree_node_locked(path, path->level)); - bch2_bkey_buf_init(&tmp); - if (unlikely(trans->journal_replay_not_finished)) { - ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp); + ret = btree_node_iter_and_journal_peek(trans, path, flags); if (ret) - goto err; + return ret; } else { struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b); - if (!k) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "node not found at pos "); - bch2_bpos_to_text(&buf, path->pos); - prt_str(&buf, " within parent node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key)); - - bch2_fs_fatal_error(c, "%s", buf.buf); - printbuf_exit(&buf); - ret = -BCH_ERR_btree_need_topology_repair; - goto err; - } + if (unlikely(!k)) + return btree_node_missing_err(trans, path); 
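/*
 * The debug-check wrappers above (bch2_btree_path_verify() and friends) gate
 * the expensive __bch2_*_verify() calls behind static_branch_unlikely().
 * That is the kernel's jump-label machinery from <linux/jump_label.h>: the
 * branch compiles to a no-op that is patched into a jump only when the key is
 * enabled, so production builds pay essentially nothing.  Minimal usage
 * sketch -- the key and function names here are illustrative:
 */
#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(example_debug_checks);

static void example_verify(void);	/* stand-in for an expensive check */

static void example_op(void)
{
	if (static_branch_unlikely(&example_debug_checks))
		example_verify();	/* skipped unless the key is enabled */
}

static void example_enable_debug(void)
{
	static_branch_enable(&example_debug_checks);	/* patches the branch in */
}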
- bch2_bkey_buf_unpack(&tmp, c, l->b, k); + bch2_bkey_unpack(l->b, &trans->btree_path_down, k); - if ((flags & BTREE_ITER_prefetch) && + if (unlikely((flags & BTREE_ITER_prefetch)) && c->opts.btree_node_prefetch) { ret = btree_path_prefetch(trans, path); if (ret) - goto err; + return ret; } } - b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); + b = bch2_btree_node_get(trans, path, &trans->btree_path_down, + level, lock_type, trace_ip); ret = PTR_ERR_OR_ZERO(b); if (unlikely(ret)) - goto err; + return ret; - if (likely(!trans->journal_replay_not_finished && - tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && - unlikely(b != btree_node_mem_ptr(tmp.k))) + if (unlikely(b != btree_node_mem_ptr(&trans->btree_path_down)) && + likely(!trans->journal_replay_not_finished && + trans->btree_path_down.k.type == KEY_TYPE_btree_ptr_v2)) btree_node_mem_ptr_set(trans, path, level + 1, b); if (btree_node_read_locked(path, level + 1)) @@ -977,10 +990,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans, path->level = level; bch2_btree_path_level_init(trans, path, b); - bch2_btree_path_verify_locks(path); -err: - bch2_bkey_buf_exit(&tmp, c); - return ret; + bch2_btree_path_verify_locks(trans, path); + return 0; } static int bch2_btree_path_traverse_all(struct btree_trans *trans) @@ -992,7 +1003,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) int ret = 0; if (trans->in_traverse_all) - return -BCH_ERR_transaction_restart_in_traverse_all; + return bch_err_throw(c, transaction_restart_in_traverse_all); trans->in_traverse_all = true; retry_all: @@ -1089,7 +1100,7 @@ static void btree_path_set_level_down(struct btree_trans *trans, if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) btree_node_unlock(trans, path, l); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); bch2_btree_path_verify(trans, path); } @@ -1137,7 +1148,7 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, */ int bch2_btree_path_traverse_one(struct btree_trans *trans, btree_path_idx_t path_idx, - unsigned flags, + enum btree_iter_update_trigger_flags flags, unsigned long trace_ip) { struct btree_path *path = &trans->paths[path_idx]; @@ -1287,7 +1298,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, if (unlikely(path->cached)) { btree_node_unlock(trans, path, 0); path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); goto out; } @@ -1316,7 +1327,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, } if (unlikely(level != path->level)) { - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); __bch2_btree_path_unlock(trans, path); } out: @@ -1385,45 +1396,45 @@ static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_p void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent) { - struct btree_path *path = trans->paths + path_idx, *dup; + struct btree_path *path = trans->paths + path_idx, *dup = NULL; if (!__btree_path_put(trans, path, intent)) return; + if (!path->preserve && !path->should_be_locked) + goto free; + dup = path->preserve ? 
have_path_at_pos(trans, path) : have_node_at_pos(trans, path); - - trace_btree_path_free(trans, path_idx, dup); - - if (!dup && !(!path->preserve && !is_btree_node(path, path->level))) + if (!dup) return; - if (path->should_be_locked && !trans->restarted) { - if (!dup) - return; - + /* + * If we need this path locked, the duplicate also has to be locked + * before we free this one: + */ + if (path->should_be_locked && + !dup->should_be_locked && + !trans->restarted) { if (!(trans->locked ? bch2_btree_path_relock_norestart(trans, dup) : bch2_btree_path_can_relock(trans, dup))) return; - } - if (dup) { - dup->preserve |= path->preserve; - dup->should_be_locked |= path->should_be_locked; + dup->should_be_locked = true; } - __bch2_path_free(trans, path_idx); -} - -static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path, - bool intent) -{ - if (!__btree_path_put(trans, trans->paths + path, intent)) - return; + BUG_ON(path->should_be_locked && + !trans->restarted && + trans->locked && + !btree_node_locked(dup, dup->level)); - __bch2_path_free(trans, path); + path->should_be_locked = false; + dup->preserve |= path->preserve; +free: + trace_btree_path_free(trans, path_idx, dup); + __bch2_path_free(trans, path_idx); } void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) @@ -1436,7 +1447,7 @@ void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_ static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) { #ifdef CONFIG_BCACHEFS_DEBUG - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_prt_backtrace(&buf, &trans->last_restarted_trace); panic("in transaction restart: %s, last restarted by\n%s", bch2_err_str(trans->restarted), @@ -1485,7 +1496,7 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) prt_newline(buf); } - for (struct jset_entry *e = trans->journal_entries; + for (struct jset_entry *e = btree_trans_journal_entries_start(trans); e != btree_trans_journal_entries_top(trans); e = vstruct_next(e)) { bch2_journal_entry_to_text(buf, trans->c, e); @@ -1586,13 +1597,13 @@ void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans) static noinline __cold void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); + bch2_log_msg_start(trans->c, &buf); __bch2_trans_paths_to_text(&buf, trans, nosort); bch2_trans_updates_to_text(&buf, trans); - bch2_print_str(trans->c, buf.buf); - printbuf_exit(&buf); + bch2_print_str(trans->c, KERN_ERR, buf.buf); } noinline __cold @@ -1605,22 +1616,19 @@ noinline __cold static void bch2_trans_update_max_paths(struct btree_trans *trans) { struct btree_transaction_stats *s = btree_trans_stats(trans); - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths); bch2_trans_paths_to_text(&buf, trans); if (!buf.allocation_failure) { - mutex_lock(&s->lock); + guard(mutex)(&s->lock); if (nr > s->nr_max_paths) { s->nr_max_paths = nr; swap(s->max_paths_text, buf.buf); } - mutex_unlock(&s->lock); } - printbuf_exit(&buf); - trans->nr_paths_max = nr; } @@ -1628,11 +1636,10 @@ noinline __cold int __bch2_btree_trans_too_many_iters(struct btree_trans *trans) { if (trace_trans_restart_too_many_iters_enabled()) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_trans_paths_to_text(&buf, trans); trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf); - 
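/*
 * The CLASS(printbuf, buf)() conversions above (and the deleted
 * printbuf_exit() calls, like the one in the next hunk) rely on
 * DEFINE_CLASS() from <linux/cleanup.h>: the object is constructed at its
 * declaration and its destructor runs automatically at end of scope.  A
 * sketch of the underlying pattern, assuming a simple buffer type --
 * struct my_buf and its helpers are illustrative, not the actual printbuf
 * class definition:
 */
#include <linux/cleanup.h>
#include <linux/slab.h>

struct my_buf {
	char	*data;
	size_t	size;
};

static void my_buf_exit(struct my_buf *b)
{
	kfree(b->data);
}

static void my_buf_append(struct my_buf *b, const char *s);

DEFINE_CLASS(my_buf, struct my_buf,
	     my_buf_exit(&_T),			/* destructor, runs at scope exit */
	     ((struct my_buf) {}),		/* constructor/initial value */
	     void)

static void example(void)
{
	CLASS(my_buf, buf)();	/* declares buf; my_buf_exit() runs automatically */

	my_buf_append(&buf, "hello");
	/* no explicit my_buf_exit(): the __cleanup attribute handles it */
}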
printbuf_exit(&buf); } count_event(trans->c, trans_restart_too_many_iters); @@ -1722,7 +1729,8 @@ static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans, btree_path_idx_t bch2_path_get(struct btree_trans *trans, enum btree_id btree_id, struct bpos pos, unsigned locks_want, unsigned level, - unsigned flags, unsigned long ip) + enum btree_iter_update_trigger_flags flags, + unsigned long ip) { struct btree_path *path; bool cached = flags & BTREE_ITER_cached; @@ -1735,6 +1743,10 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, btree_trans_sort_paths(trans); + if (intent) + locks_want = max(locks_want, level + 1); + locks_want = min(locks_want, BTREE_MAX_DEPTH); + trans_for_each_path_inorder(trans, path, iter) { if (__btree_path_cmp(path, btree_id, @@ -1749,7 +1761,8 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, if (path_pos && trans->paths[path_pos].cached == cached && trans->paths[path_pos].btree_id == btree_id && - trans->paths[path_pos].level == level) { + trans->paths[path_pos].level == level && + bch2_btree_path_upgrade_norestart(trans, trans->paths + path_pos, locks_want)) { trace_btree_path_get(trans, trans->paths + path_pos, &pos); __btree_path_get(trans, trans->paths + path_pos, intent); @@ -1781,9 +1794,6 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, if (!(flags & BTREE_ITER_nopreserve)) path->preserve = true; - if (path->intent_ref) - locks_want = max(locks_want, level + 1); - /* * If the path has locks_want greater than requested, we don't downgrade * it here - on transaction restart because btree node split needs to @@ -1792,10 +1802,6 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, * a successful transaction commit. */ - locks_want = min(locks_want, BTREE_MAX_DEPTH); - if (locks_want > path->locks_want) - bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL); - return path_idx; } @@ -1855,8 +1861,10 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * return (struct bkey_s_c) { u, NULL }; } -void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter) +void bch2_set_btree_iter_dontneed(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; + if (!iter->path || trans->restarted) return; @@ -1868,14 +1876,17 @@ void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter * /* Btree iterators: */ int __must_check -__bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter) +__bch2_btree_iter_traverse(struct btree_iter *iter) { - return bch2_btree_path_traverse(trans, iter->path, iter->flags); + return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); } int __must_check -bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter) +bch2_btree_iter_traverse(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; + int ret; + bch2_trans_verify_not_unlocked_or_in_restart(trans); iter->path = bch2_btree_path_set_pos(trans, iter->path, @@ -1883,7 +1894,7 @@ bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter) iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); - int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); if (ret) return ret; @@ -1895,14 +1906,14 @@ bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter) /* Iterate across nodes (leaf and interior nodes) */ -struct btree 
*bch2_btree_iter_peek_node(struct btree_trans *trans, - struct btree_iter *iter) +struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; struct btree *b = NULL; int ret; EBUG_ON(trans->paths[iter->path].cached); - bch2_btree_iter_verify(trans, iter); + bch2_btree_iter_verify(iter); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) @@ -1924,7 +1935,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans, btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); out: bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(trans, iter); + bch2_btree_iter_verify(iter); return b; err: @@ -1933,26 +1944,26 @@ struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans, } /* Only kept for -tools */ -struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *trans, - struct btree_iter *iter) +struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) { struct btree *b; - while (b = bch2_btree_iter_peek_node(trans, iter), + while (b = bch2_btree_iter_peek_node(iter), bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) - bch2_trans_begin(trans); + bch2_trans_begin(iter->trans); return b; } -struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_iter *iter) +struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; struct btree *b = NULL; int ret; EBUG_ON(trans->paths[iter->path].cached); bch2_trans_verify_not_unlocked_or_in_restart(trans); - bch2_btree_iter_verify(trans, iter); + bch2_btree_iter_verify(iter); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) @@ -1967,6 +1978,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_ /* got to end? 
*/ if (!btree_path_node(path, path->level + 1)) { + path->should_be_locked = false; btree_path_set_level_up(trans, path); return NULL; } @@ -1978,12 +1990,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_ bch2_btree_path_downgrade(trans, path); if (!bch2_btree_node_relock(trans, path, path->level + 1)) { + trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); __bch2_btree_path_unlock(trans, path); path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); goto err; } @@ -2025,7 +2037,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_ EBUG_ON(btree_iter_path(trans, iter)->uptodate); out: bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(trans, iter); + bch2_btree_iter_verify(iter); return b; err: @@ -2035,7 +2047,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_ /* Iterate across keys (in leaf nodes only) */ -inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter *iter) +inline bool bch2_btree_iter_advance(struct btree_iter *iter) { struct bpos pos = iter->k.p; bool ret = !(iter->flags & BTREE_ITER_all_snapshots @@ -2044,11 +2056,11 @@ inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter if (ret && !(iter->flags & BTREE_ITER_is_extents)) pos = bkey_successor(iter, pos); - bch2_btree_iter_set_pos(trans, iter, pos); + bch2_btree_iter_set_pos(iter, pos); return ret; } -inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter *iter) +inline bool bch2_btree_iter_rewind(struct btree_iter *iter) { struct bpos pos = bkey_start_pos(&iter->k); bool ret = !(iter->flags & BTREE_ITER_all_snapshots @@ -2057,20 +2069,20 @@ inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter if (ret && !(iter->flags & BTREE_ITER_is_extents)) pos = bkey_predecessor(iter, pos); - bch2_btree_iter_set_pos(trans, iter, pos); + bch2_btree_iter_set_pos(iter, pos); return ret; } static noinline void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c *k) + struct bpos search_key, struct bkey_s_c *k) { struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key; trans_for_each_update(trans, i) if (!i->key_cache_already_flushed && i->btree_id == iter->btree_id && - bpos_le(i->k->k.p, iter->pos) && + bpos_le(i->k->k.p, search_key) && bpos_ge(i->k->k.p, k->k ? 
k->k->p : end)) { iter->k = i->k->k; *k = bkey_i_to_s_c(i->k); @@ -2079,6 +2091,7 @@ void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_ static noinline void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_key, struct bkey_s_c *k) { struct btree_path *path = btree_iter_path(trans, iter); @@ -2087,7 +2100,7 @@ void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter trans_for_each_update(trans, i) if (!i->key_cache_already_flushed && i->btree_id == iter->btree_id && - bpos_ge(i->k->k.p, path->pos) && + bpos_ge(i->k->k.p, search_key) && bpos_le(i->k->k.p, k->k ? k->k->p : end)) { iter->k = i->k->k; *k = bkey_i_to_s_c(i->k); @@ -2109,13 +2122,14 @@ void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_ static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_pos, struct bpos end_pos) { struct btree_path *path = btree_iter_path(trans, iter); return bch2_journal_keys_peek_max(trans->c, iter->btree_id, path->level, - path->pos, + search_pos, end_pos, &iter->journal_idx); } @@ -2125,7 +2139,7 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, struct btree_iter *iter) { struct btree_path *path = btree_iter_path(trans, iter); - struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos); + struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos); if (k) { iter->k = k->k; @@ -2138,11 +2152,12 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, static noinline void btree_trans_peek_journal(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_key, struct bkey_s_c *k) { struct btree_path *path = btree_iter_path(trans, iter); struct bkey_i *next_journal = - bch2_btree_journal_peek(trans, iter, + bch2_btree_journal_peek(trans, iter, search_key, k->k ? k->k->p : path_l(path)->b->key.k.p); if (next_journal) { iter->k = next_journal->k; @@ -2152,13 +2167,14 @@ void btree_trans_peek_journal(struct btree_trans *trans, static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_key, struct bpos end_pos) { struct btree_path *path = btree_iter_path(trans, iter); return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id, path->level, - path->pos, + search_key, end_pos, &iter->journal_idx); } @@ -2166,12 +2182,13 @@ static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, static noinline void btree_trans_peek_prev_journal(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_key, struct bkey_s_c *k) { struct btree_path *path = btree_iter_path(trans, iter); struct bkey_i *next_journal = - bch2_btree_journal_peek_prev(trans, iter, - k->k ? k->k->p : path_l(path)->b->key.k.p); + bch2_btree_journal_peek_prev(trans, iter, search_key, + k->k ? 
k->k->p : path_l(path)->b->data->min_key); if (next_journal) { iter->k = next_journal->k; @@ -2184,9 +2201,9 @@ void btree_trans_peek_prev_journal(struct btree_trans *trans, * bkey_s_c_null: */ static noinline -struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btree_iter *iter, - struct bpos pos) +struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) { + struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; struct bkey u; struct bkey_s_c k; @@ -2232,14 +2249,14 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btr return k; } -static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct btree_iter *iter, - struct bpos search_key) +static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) { + struct btree_trans *trans = iter->trans; struct bkey_s_c k, k2; int ret; EBUG_ON(btree_iter_path(trans, iter)->cached); - bch2_btree_iter_verify(trans, iter); + bch2_btree_iter_verify(iter); while (1) { iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, @@ -2249,7 +2266,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { /* ensure that iter->k is consistent with iter->pos: */ - bch2_btree_iter_set_pos(trans, iter, iter->pos); + bch2_btree_iter_set_pos(iter, iter->pos); k = bkey_s_c_err(ret); break; } @@ -2259,7 +2276,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct if (unlikely(!l->b)) { /* No btree nodes at requested level: */ - bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); + bch2_btree_iter_set_pos(iter, SPOS_MAX); k = bkey_s_c_null; break; } @@ -2270,20 +2287,21 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && k.k && - (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) { + !bkey_deleted(k.k) && + (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { k = k2; if (bkey_err(k)) { - bch2_btree_iter_set_pos(trans, iter, iter->pos); + bch2_btree_iter_set_pos(iter, iter->pos); break; } } if (unlikely(iter->flags & BTREE_ITER_with_journal)) - btree_trans_peek_journal(trans, iter, &k); + btree_trans_peek_journal(trans, iter, search_key, &k); if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) - bch2_btree_trans_peek_updates(trans, iter, &k); + bch2_btree_trans_peek_updates(trans, iter, search_key, &k); if (k.k && bkey_deleted(k.k)) { /* @@ -2306,28 +2324,41 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct search_key = bpos_successor(l->b->key.k.p); } else { /* End of btree: */ - bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); + bch2_btree_iter_set_pos(iter, SPOS_MAX); k = bkey_s_c_null; break; } } - bch2_btree_iter_verify(trans, iter); + bch2_btree_iter_verify(iter); + + if (trace___btree_iter_peek_enabled()) { + CLASS(printbuf, buf)(); + + int ret = bkey_err(k); + if (ret) + prt_str(&buf, bch2_err_str(ret)); + else if (k.k) + bch2_bkey_val_to_text(&buf, trans->c, k); + else + prt_str(&buf, "(null)"); + trace___btree_iter_peek(trans->c, buf.buf); + } + return k; } /** * bch2_btree_iter_peek_max() - returns first key greater than or equal to * iterator's current position - * @trans: btree transaction object * @iter: iterator to peek from * @end: search limit: returns keys less than or equal to @end * * Returns: key if found, or an 
error extractable with bkey_err(). */ -struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree_iter *iter, - struct bpos end) +struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end) { + struct btree_trans *trans = iter->trans; struct bpos search_key = btree_iter_search_key(iter); struct bkey_s_c k; struct bpos iter_pos = iter->pos; @@ -2344,13 +2375,12 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree } if (iter->update_path) { - bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_intent); + bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent); iter->update_path = 0; } while (1) { - k = __bch2_btree_iter_peek(trans, iter, search_key); + k = __bch2_btree_iter_peek(iter, search_key); if (unlikely(!k.k)) goto end; if (unlikely(bkey_err(k))) @@ -2374,8 +2404,8 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree if (iter->update_path && !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { - bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_intent); + bch2_path_put(trans, iter->update_path, + iter->flags & BTREE_ITER_intent); iter->update_path = 0; } @@ -2421,7 +2451,7 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree } if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_key_cache_fill)) { + !(iter->flags & BTREE_ITER_nofilter_whiteouts)) { search_key = bkey_successor(iter, k.k->p); continue; } @@ -2464,17 +2494,30 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree if (!(iter->flags & BTREE_ITER_all_snapshots)) iter->pos.snapshot = iter->snapshot; - ret = bch2_btree_iter_verify_ret(trans, iter, k); + ret = bch2_btree_iter_verify_ret(iter, k); if (unlikely(ret)) { - bch2_btree_iter_set_pos(trans, iter, iter->pos); + bch2_btree_iter_set_pos(iter, iter->pos); k = bkey_s_c_err(ret); } bch2_btree_iter_verify_entry_exit(iter); + if (trace_btree_iter_peek_max_enabled()) { + CLASS(printbuf, buf)(); + + int ret = bkey_err(k); + if (ret) + prt_str(&buf, bch2_err_str(ret)); + else if (k.k) + bch2_bkey_val_to_text(&buf, trans->c, k); + else + prt_str(&buf, "(null)"); + trace_btree_iter_peek_max(trans->c, buf.buf); + } + return k; end: - bch2_btree_iter_set_pos(trans, iter, end); + bch2_btree_iter_set_pos(iter, end); k = bkey_s_c_null; goto out_no_locked; } @@ -2482,25 +2525,24 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree /** * bch2_btree_iter_next() - returns first key greater than iterator's current * position - * @trans: btree transaction object * @iter: iterator to peek from * * Returns: key if found, or an error extractable with bkey_err(). 
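+ *
+ * Example: iterate forward using the iterator-only calling convention, where
+ * the transaction is taken from iter->trans (a sketch; assumes @iter was set
+ * up with bch2_trans_iter_init(), and process() stands in for caller code):
+ *
+ *	struct bkey_s_c k;
+ *
+ *	for (k = bch2_btree_iter_peek(iter);
+ *	     k.k && !bkey_err(k);
+ *	     k = bch2_btree_iter_next(iter))
+ *		process(k);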
*/ -struct bkey_s_c bch2_btree_iter_next(struct btree_trans *trans, struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) { - if (!bch2_btree_iter_advance(trans, iter)) + if (!bch2_btree_iter_advance(iter)) return bkey_s_c_null; - return bch2_btree_iter_peek(trans, iter); + return bch2_btree_iter_peek(iter); } -static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter, - struct bpos search_key) +static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key) { + struct btree_trans *trans = iter->trans; struct bkey_s_c k, k2; - bch2_btree_iter_verify(trans, iter); + bch2_btree_iter_verify(iter); while (1) { iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, @@ -2510,7 +2552,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, st int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { /* ensure that iter->k is consistent with iter->pos: */ - bch2_btree_iter_set_pos(trans, iter, iter->pos); + bch2_btree_iter_set_pos(iter, iter->pos); k = bkey_s_c_err(ret); break; } @@ -2520,7 +2562,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, st if (unlikely(!l->b)) { /* No btree nodes at requested level: */ - bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); + bch2_btree_iter_set_pos(iter, SPOS_MAX); k = bkey_s_c_null; break; } @@ -2536,20 +2578,21 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, st if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && k.k && - (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) { + !bkey_deleted(k.k) && + (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { k = k2; if (bkey_err(k2)) { - bch2_btree_iter_set_pos(trans, iter, iter->pos); + bch2_btree_iter_set_pos(iter, iter->pos); break; } } if (unlikely(iter->flags & BTREE_ITER_with_journal)) - btree_trans_peek_prev_journal(trans, iter, &k); + btree_trans_peek_prev_journal(trans, iter, search_key, &k); if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) - bch2_btree_trans_peek_prev_updates(trans, iter, &k); + bch2_btree_trans_peek_prev_updates(trans, iter, search_key, &k); if (likely(k.k && !bkey_deleted(k.k))) { break; @@ -2560,27 +2603,25 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, st search_key = bpos_predecessor(path->l[0].b->data->min_key); } else { /* Start of btree: */ - bch2_btree_iter_set_pos(trans, iter, POS_MIN); + bch2_btree_iter_set_pos(iter, POS_MIN); k = bkey_s_c_null; break; } } - bch2_btree_iter_verify(trans, iter); + bch2_btree_iter_verify(iter); return k; } /** * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to * iterator's current position - * @trans: btree transaction object * @iter: iterator to peek from * @end: search limit: returns keys greater than or equal to @end * * Returns: key if found, or an error extractable with bkey_err(). 
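+ *
+ * Example: fetch the first key at or before the iterator's position, bounded
+ * below by @end (a sketch; iterator setup and transaction restart handling
+ * are elided):
+ *
+ *	struct bkey_s_c k = bch2_btree_iter_peek_prev_min(iter, end);
+ *	int ret = bkey_err(k);
+ *
+ *	if (!ret && k.k)
+ *		...use k, which lies between @end and the starting position...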
*/ -struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct btree_iter *iter, - struct bpos end) +struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end) { if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) && !bkey_eq(iter->pos, POS_MAX) && @@ -2595,7 +2636,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct * real visible extents - easiest to just use peek_slot() (which * internally uses peek() for extents) */ - struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); if (bkey_err(k)) return k; @@ -2605,6 +2646,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct return k; } + struct btree_trans *trans = iter->trans; struct bpos search_key = iter->pos; struct bkey_s_c k; btree_path_idx_t saved_path = 0; @@ -2620,7 +2662,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct } while (1) { - k = __bch2_btree_iter_peek_prev(trans, iter, search_key); + k = __bch2_btree_iter_peek_prev(iter, search_key); if (unlikely(!k.k)) goto end; if (unlikely(bkey_err(k))) @@ -2634,7 +2676,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct * the last possible snapshot overwrite, return * it: */ - bch2_path_put_nokeep(trans, iter->path, + bch2_path_put(trans, iter->path, iter->flags & BTREE_ITER_intent); iter->path = saved_path; saved_path = 0; @@ -2664,8 +2706,8 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct * our previous saved candidate: */ if (saved_path) { - bch2_path_put_nokeep(trans, saved_path, - iter->flags & BTREE_ITER_intent); + bch2_path_put(trans, saved_path, + iter->flags & BTREE_ITER_intent); saved_path = 0; } @@ -2702,19 +2744,32 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct } /* Extents can straddle iter->pos: */ - iter->pos = bpos_min(iter->pos, k.k->p);; + iter->pos = bpos_min(iter->pos, k.k->p); if (iter->flags & BTREE_ITER_filter_snapshots) iter->pos.snapshot = iter->snapshot; out_no_locked: if (saved_path) - bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent); + bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_intent); bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(trans, iter); + bch2_btree_iter_verify(iter); + + if (trace_btree_iter_peek_prev_min_enabled()) { + CLASS(printbuf, buf)(); + + int ret = bkey_err(k); + if (ret) + prt_str(&buf, bch2_err_str(ret)); + else if (k.k) + bch2_bkey_val_to_text(&buf, trans->c, k); + else + prt_str(&buf, "(null)"); + trace_btree_iter_peek_prev_min(trans->c, buf.buf); + } return k; end: - bch2_btree_iter_set_pos(trans, iter, end); + bch2_btree_iter_set_pos(iter, end); k = bkey_s_c_null; goto out_no_locked; } @@ -2722,27 +2777,27 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct /** * bch2_btree_iter_prev() - returns first key less than iterator's current * position - * @trans: btree transaction object * @iter: iterator to peek from * * Returns: key if found, or an error extractable with bkey_err(). 
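+ *
+ * Example: walk keys in reverse, in the style of
+ * for_each_btree_key_reverse_norestart() (a sketch; transaction restart
+ * handling is elided):
+ *
+ *	int ret = 0;
+ *	struct bkey_s_c k;
+ *
+ *	for (k = bch2_btree_iter_peek_prev(iter);
+ *	     !(ret = bkey_err(k)) && k.k;
+ *	     k = bch2_btree_iter_prev(iter))
+ *		...visit k...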
*/ -struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *trans, struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) { - if (!bch2_btree_iter_rewind(trans, iter)) + if (!bch2_btree_iter_rewind(iter)) return bkey_s_c_null; - return bch2_btree_iter_peek_prev(trans, iter); + return bch2_btree_iter_peek_prev(iter); } -struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; struct bpos search_key; - struct bkey_s_c k; + struct bkey_s_c k, k2; int ret; bch2_trans_verify_not_unlocked_or_in_restart(trans); - bch2_btree_iter_verify(trans, iter); + bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); @@ -2755,10 +2810,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre /* extents can't span inode numbers: */ if ((iter->flags & BTREE_ITER_is_extents) && unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { - if (iter->pos.inode == KEY_INODE_MAX) - return bkey_s_c_null; + if (iter->pos.inode == KEY_INODE_MAX) { + k = bkey_s_c_null; + goto out2; + } - bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos)); + bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); } search_key = btree_iter_search_key(iter); @@ -2773,8 +2830,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre } struct btree_path *path = btree_iter_path(trans, iter); - if (unlikely(!btree_path_node(path, path->level))) - return bkey_s_c_null; + if (unlikely(!btree_path_node(path, path->level))) { + k = bkey_s_c_null; + goto out2; + } btree_path_set_should_be_locked(trans, path); @@ -2793,21 +2852,22 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre (k = btree_trans_peek_slot_journal(trans, iter)).k) goto out; - if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && - (k = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) { - if (!bkey_err(k)) - iter->k = *k.k; - /* We're not returning a key from iter->path: */ - goto out; - } - k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k); if (unlikely(!k.k)) goto out; + if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && + !bkey_deleted(k.k) && + (k2 = btree_trans_peek_key_cache(iter, iter->pos)).k) { + k = k2; + if (bkey_err(k)) + goto out; + iter->k = *k.k; + } + if (unlikely(k.k->type == KEY_TYPE_whiteout && (iter->flags & BTREE_ITER_filter_snapshots) && - !(iter->flags & BTREE_ITER_key_cache_fill))) + !(iter->flags & BTREE_ITER_nofilter_whiteouts))) iter->k.type = KEY_TYPE_deleted; } else { struct bpos next; @@ -2821,21 +2881,21 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre if (iter->flags & BTREE_ITER_intent) { struct btree_iter iter2; - bch2_trans_copy_iter(trans, &iter2, iter); - k = bch2_btree_iter_peek_max(trans, &iter2, end); + bch2_trans_copy_iter(&iter2, iter); + k = bch2_btree_iter_peek_max(&iter2, end); if (k.k && !bkey_err(k)) { swap(iter->key_cache_path, iter2.key_cache_path); iter->k = iter2.k; k.k = &iter->k; } - bch2_trans_iter_exit(trans, &iter2); + bch2_trans_iter_exit(&iter2); } else { struct bpos pos = iter->pos; - k = bch2_btree_iter_peek_max(trans, iter, end); + k = bch2_btree_iter_peek_max(iter, end); if (unlikely(bkey_err(k))) - bch2_btree_iter_set_pos(trans, iter, pos); + 
bch2_btree_iter_set_pos(iter, pos); else iter->pos = pos; } @@ -2864,39 +2924,52 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre } out: bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(trans, iter); - ret = bch2_btree_iter_verify_ret(trans, iter, k); + bch2_btree_iter_verify(iter); + ret = bch2_btree_iter_verify_ret(iter, k); if (unlikely(ret)) - return bkey_s_c_err(ret); + k = bkey_s_c_err(ret); +out2: + if (trace_btree_iter_peek_slot_enabled()) { + CLASS(printbuf, buf)(); + + int ret = bkey_err(k); + if (ret) + prt_str(&buf, bch2_err_str(ret)); + else if (k.k) + bch2_bkey_val_to_text(&buf, trans->c, k); + else + prt_str(&buf, "(null)"); + trace_btree_iter_peek_slot(trans->c, buf.buf); + } return k; } -struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *trans, struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) { - if (!bch2_btree_iter_advance(trans, iter)) + if (!bch2_btree_iter_advance(iter)) return bkey_s_c_null; - return bch2_btree_iter_peek_slot(trans, iter); + return bch2_btree_iter_peek_slot(iter); } -struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *trans, struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) { - if (!bch2_btree_iter_rewind(trans, iter)) + if (!bch2_btree_iter_rewind(iter)) return bkey_s_c_null; - return bch2_btree_iter_peek_slot(trans, iter); + return bch2_btree_iter_peek_slot(iter); } /* Obsolete, but still used by rust wrapper in -tools */ -struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *trans, struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter) { struct bkey_s_c k; - while (btree_trans_too_many_iters(trans) || - (k = bch2_btree_iter_peek_type(trans, iter, iter->flags), + while (btree_trans_too_many_iters(iter->trans) || + (k = bch2_btree_iter_peek_type(iter, iter->flags), bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) - bch2_trans_begin(trans); + bch2_trans_begin(iter->trans); return k; } @@ -2929,7 +3002,7 @@ static void btree_trans_verify_sorted(struct btree_trans *trans) struct btree_path *path, *prev = NULL; struct trans_for_each_path_inorder_iter iter; - if (!bch2_debug_check_iterators) + if (!static_branch_unlikely(&bch2_debug_check_iterators)) return; trans_for_each_path_inorder(trans, path, iter) { @@ -3028,10 +3101,12 @@ static inline void btree_path_list_add(struct btree_trans *trans, btree_trans_verify_sorted_refs(trans); } -void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) +void bch2_trans_iter_exit(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; + if (iter->update_path) - bch2_path_put_nokeep(trans, iter->update_path, + bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent); if (iter->path) bch2_path_put(trans, iter->path, @@ -3042,16 +3117,18 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) iter->path = 0; iter->update_path = 0; iter->key_cache_path = 0; + iter->trans = NULL; } void bch2_trans_iter_init_outlined(struct btree_trans *trans, struct btree_iter *iter, enum btree_id btree_id, struct bpos pos, - unsigned flags) + enum btree_iter_update_trigger_flags flags, + unsigned long ip) { bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, bch2_btree_iter_flags(trans, btree_id, 0, flags), - _RET_IP_); + ip); } void bch2_trans_node_iter_init(struct btree_trans *trans, @@ -3060,7 +3137,7 @@ 
void bch2_trans_node_iter_init(struct btree_trans *trans, struct bpos pos, unsigned locks_want, unsigned depth, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { flags |= BTREE_ITER_not_extents; flags |= BTREE_ITER_snapshot_field; @@ -3081,9 +3158,10 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, BUG_ON(iter->min_depth != depth); } -void bch2_trans_copy_iter(struct btree_trans *trans, - struct btree_iter *dst, struct btree_iter *src) +void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) { + struct btree_trans *trans = src->trans; + *dst = *src; #ifdef TRACK_PATH_ALLOCATED dst->ip_allocated = _RET_IP_; @@ -3095,7 +3173,19 @@ void bch2_trans_copy_iter(struct btree_trans *trans, dst->key_cache_path = 0; } -void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE +void bch2_trans_kmalloc_trace_to_text(struct printbuf *out, + darray_trans_kmalloc_trace *trace) +{ + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 60); + + darray_for_each(*trace, i) + prt_printf(out, "%pS\t%zu\n", (void *) i->ip, i->bytes); +} +#endif + +void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long ip) { struct bch_fs *c = trans->c; unsigned new_top = trans->mem_top + size; @@ -3105,74 +3195,75 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) void *new_mem; void *p; - WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) { +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "bump allocator exceeded BTREE_TRANS_MEM_MAX (%u)\n", + BTREE_TRANS_MEM_MAX); + + bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace); + bch2_print_str(c, KERN_ERR, buf.buf); +#endif + } ret = trans_maybe_inject_restart(trans, _RET_IP_); if (ret) return ERR_PTR(ret); struct btree_transaction_stats *s = btree_trans_stats(trans); - s->max_mem = max(s->max_mem, new_bytes); - - if (trans->used_mempool) { - if (trans->mem_bytes >= new_bytes) - goto out_change_top; - - /* No more space from mempool item, need malloc new one */ - new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN); - if (unlikely(!new_mem)) { - bch2_trans_unlock(trans); - - new_mem = kmalloc(new_bytes, GFP_KERNEL); - if (!new_mem) - return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); + if (new_bytes > s->max_mem) { + guard(mutex)(&s->lock); +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr); + s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size, + trans->trans_kmalloc_trace.nr); + + memcpy(s->trans_kmalloc_trace.data, + trans->trans_kmalloc_trace.data, + sizeof(s->trans_kmalloc_trace.data[0]) * + s->trans_kmalloc_trace.nr); +#endif + s->max_mem = new_bytes; + } - ret = bch2_trans_relock(trans); - if (ret) { - kfree(new_mem); - return ERR_PTR(ret); - } - } - memcpy(new_mem, trans->mem, trans->mem_top); - trans->used_mempool = false; - mempool_free(trans->mem, &c->btree_trans_mem_pool); - goto out_new_mem; + if (trans->used_mempool || new_bytes > BTREE_TRANS_MEM_MAX) { + EBUG_ON(trans->mem_bytes >= new_bytes); + return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); } - new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); - if (unlikely(!new_mem)) { - bch2_trans_unlock(trans); + if (old_bytes) { + trans->realloc_bytes_required = new_bytes; + trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); + return 
ERR_PTR(btree_trans_restart_ip(trans, + BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); + } - new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); - if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { - new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); - new_bytes = BTREE_TRANS_MEM_MAX; - memcpy(new_mem, trans->mem, trans->mem_top); - trans->used_mempool = true; - kfree(trans->mem); - } + EBUG_ON(trans->mem); + EBUG_ON(trans->mem_bytes); + EBUG_ON(trans->mem_top); + EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX); + + bool lock_dropped = false; + new_mem = allocate_dropping_locks_norelock(trans, lock_dropped, kmalloc(new_bytes, _gfp)); + if (!new_mem) { + new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); + new_bytes = BTREE_TRANS_MEM_MAX; + trans->used_mempool = true; + } - if (!new_mem) - return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); + EBUG_ON(!new_mem); - trans->mem = new_mem; - trans->mem_bytes = new_bytes; + trans->mem = new_mem; + trans->mem_bytes = new_bytes; + if (unlikely(lock_dropped)) { ret = bch2_trans_relock(trans); if (ret) return ERR_PTR(ret); } -out_new_mem: - trans->mem = new_mem; - trans->mem_bytes = new_bytes; - if (old_bytes) { - trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); - return ERR_PTR(btree_trans_restart_ip(trans, - BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); - } -out_change_top: - p = trans->mem + trans->mem_top; + p = trans->mem; trans->mem_top += size; memset(p, 0, size); return p; @@ -3231,7 +3322,30 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->restart_count++; trans->mem_top = 0; - trans->journal_entries = NULL; + + if (unlikely(trans->restarted == BCH_ERR_transaction_restart_mem_realloced)) { + unsigned new_bytes = trans->realloc_bytes_required; + EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX); + EBUG_ON(!trans->mem); + EBUG_ON(!trans->mem_bytes); + + bool lock_dropped = false; + void *new_mem = allocate_dropping_locks_norelock(trans, lock_dropped, + krealloc(trans->mem, new_bytes, _gfp)); + (void)lock_dropped; + + if (!new_mem) { + new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); + new_bytes = BTREE_TRANS_MEM_MAX; + trans->used_mempool = true; + kfree(trans->mem); + } + + EBUG_ON(!new_mem); + + trans->mem = new_mem; + trans->mem_bytes = new_bytes; + } trans_for_each_path(trans, path, i) { path->should_be_locked = false; @@ -3285,6 +3399,10 @@ u32 bch2_trans_begin(struct btree_trans *trans) } #endif +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + trans->trans_kmalloc_trace.nr = 0; +#endif + trans_set_locked(trans, false); if (trans->restarted) { @@ -3385,7 +3503,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) } trans->nr_paths_max = s->nr_max_paths; - trans->journal_entries_size = s->journal_entries_size; } trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); @@ -3397,29 +3514,44 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) return trans; } -static void check_btree_paths_leaked(struct btree_trans *trans) -{ #ifdef CONFIG_BCACHEFS_DEBUG - struct bch_fs *c = trans->c; + +static bool btree_paths_leaked(struct btree_trans *trans) +{ struct btree_path *path; unsigned i; trans_for_each_path(trans, path, i) if (path->ref) - goto leaked; - return; -leaked: - bch_err(c, "btree paths leaked from %s!", trans->fn); - trans_for_each_path(trans, path, i) - if (path->ref) - printk(KERN_ERR " btree %s %pS\n", - bch2_btree_id_str(path->btree_id), - (void *) path->ip_allocated); - /* Be noisy about this: */ - 
bch2_fatal_error(c); -#endif + return true; + return false; } +static void check_btree_paths_leaked(struct btree_trans *trans) +{ + if (btree_paths_leaked(trans)) { + struct bch_fs *c = trans->c; + struct btree_path *path; + unsigned i; + + CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "btree paths leaked from %s!\n", trans->fn); + trans_for_each_path(trans, path, i) + if (path->ref) + prt_printf(&buf, "btree %s %pS\n", + bch2_btree_id_str(path->btree_id), + (void *) path->ip_allocated); + + bch2_fs_emergency_read_only2(c, &buf); + bch2_print_str(c, KERN_ERR, buf.buf); + } +} +#else +static inline void check_btree_paths_leaked(struct btree_trans *trans) {} +#endif + void bch2_trans_put(struct btree_trans *trans) __releases(&c->btree_trans_barrier) { @@ -3454,6 +3586,9 @@ void bch2_trans_put(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG darray_exit(&trans->last_restarted_trace); #endif +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_exit(&trans->trans_kmalloc_trace); +#endif unsigned long *paths_allocated = trans->paths_allocated; trans->paths_allocated = NULL; @@ -3500,13 +3635,12 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, struct btree_bkey_cached_common *b) { struct six_lock_count c = six_lock_counts(&b->lock); - struct task_struct *owner; pid_t pid; - rcu_read_lock(); - owner = READ_ONCE(b->lock.owner); - pid = owner ? owner->pid : 0; - rcu_read_unlock(); + scoped_guard(rcu) { + struct task_struct *owner = READ_ONCE(b->lock.owner); + pid = owner ? owner->pid : 0; + } prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b'); bch2_btree_id_to_text(out, b->btree_id); @@ -3535,12 +3669,12 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn); /* trans->paths is rcu protected vs. 
freeing */ - rcu_read_lock(); - out->atomic++; + guard(rcu)(); + guard(printbuf_atomic)(out); struct btree_path *paths = rcu_dereference(trans->paths); if (!paths) - goto out; + return; unsigned long *paths_allocated = trans_paths_allocated(paths); @@ -3576,9 +3710,6 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) bch2_btree_bkey_cached_common_to_text(out, b); prt_newline(out); } -out: - --out->atomic; - rcu_read_unlock(); } void bch2_fs_btree_iter_exit(struct bch_fs *c) @@ -3608,6 +3739,9 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_exit(&s->trans_kmalloc_trace); +#endif kfree(s->max_paths_text); bch2_time_stats_exit(&s->lock_hold_times); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 9d2cccf5d21a..b117cb5d7f94 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -46,9 +46,11 @@ static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path return --path->ref == 0; } -static inline void btree_path_set_dirty(struct btree_path *path, +static inline void btree_path_set_dirty(struct btree_trans *trans, + struct btree_path *path, enum btree_path_uptodate u) { + BUG_ON(path->should_be_locked && trans->locked && !trans->restarted); path->uptodate = max_t(unsigned, path->uptodate, u); } @@ -233,12 +235,14 @@ bch2_btree_path_set_pos(struct btree_trans *trans, int __must_check bch2_btree_path_traverse_one(struct btree_trans *, btree_path_idx_t, - unsigned, unsigned long); + enum btree_iter_update_trigger_flags, + unsigned long); static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *); static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, - btree_path_idx_t path, unsigned flags) + btree_path_idx_t path, + enum btree_iter_update_trigger_flags flags) { bch2_trans_verify_not_unlocked_or_in_restart(trans); @@ -249,7 +253,9 @@ static inline int __must_check bch2_btree_path_traverse(struct btree_trans *tran } btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, - unsigned, unsigned, unsigned, unsigned long); + unsigned, unsigned, + enum btree_iter_update_trigger_flags, + unsigned long); btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *, enum btree_id, unsigned, struct bpos); @@ -285,14 +291,23 @@ static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex : __bch2_trans_mutex_lock(trans, lock); } -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_trans_verify_paths(struct btree_trans *); -void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos); -#else -static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} -static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, - struct bpos pos) {} -#endif +/* Debug: */ + +void __bch2_trans_verify_paths(struct btree_trans *); +void __bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos); + +static inline void bch2_trans_verify_paths(struct btree_trans *trans) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_trans_verify_paths(trans); +} + +static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id btree, + struct bpos pos) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_assert_pos_locked(trans, btree, pos); +} void 
bch2_btree_path_fix_key_modified(struct btree_trans *trans, struct btree *, struct bkey_packed *); @@ -393,37 +408,36 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct void bch2_trans_node_drop(struct btree_trans *trans, struct btree *); void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); -int __must_check __bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *); -int __must_check bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *); +int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); +int __must_check bch2_btree_iter_traverse(struct btree_iter *); -struct btree *bch2_btree_iter_peek_node(struct btree_trans *, struct btree_iter *); -struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *, struct btree_iter *); -struct btree *bch2_btree_iter_next_node(struct btree_trans *, struct btree_iter *); +struct btree *bch2_btree_iter_peek_node(struct btree_iter *); +struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *); +struct btree *bch2_btree_iter_next_node(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *, struct btree_iter *, struct bpos); -struct bkey_s_c bch2_btree_iter_next(struct btree_trans *, struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos); +struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); -static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_trans *trans, - struct btree_iter *iter) +static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { - return bch2_btree_iter_peek_max(trans, iter, SPOS_MAX); + return bch2_btree_iter_peek_max(iter, SPOS_MAX); } -struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *, struct btree_iter *, struct bpos); +struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos); -static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter) +static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { - return bch2_btree_iter_peek_prev_min(trans, iter, POS_MIN); + return bch2_btree_iter_peek_prev_min(iter, POS_MIN); } -struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *, struct btree_iter *); +struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *, struct btree_iter *); -struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *, struct btree_iter *); -struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *, struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); -bool bch2_btree_iter_advance(struct btree_trans *, struct btree_iter *); -bool bch2_btree_iter_rewind(struct btree_trans *, struct btree_iter *); +bool bch2_btree_iter_advance(struct btree_iter *); +bool bch2_btree_iter_rewind(struct btree_iter *); static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { @@ -434,9 +448,10 @@ static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpo iter->k.size = 0; } -static inline void bch2_btree_iter_set_pos(struct btree_trans *trans, - struct btree_iter *iter, struct bpos new_pos) +static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { + struct btree_trans *trans = iter->trans; + 
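+	/*
+	 * The update path was taken for the iterator's old position; it no
+	 * longer matches, so drop it before moving to the new position:
+	 */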
if (unlikely(iter->update_path)) bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent); @@ -454,22 +469,21 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it iter->pos = bkey_start_pos(&iter->k); } -static inline void bch2_btree_iter_set_snapshot(struct btree_trans *trans, - struct btree_iter *iter, u32 snapshot) +static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot) { struct bpos pos = iter->pos; iter->snapshot = snapshot; pos.snapshot = snapshot; - bch2_btree_iter_set_pos(trans, iter, pos); + bch2_btree_iter_set_pos(iter, pos); } -void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); +void bch2_trans_iter_exit(struct btree_iter *); -static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, - unsigned btree_id, - unsigned level, - unsigned flags) +static inline enum btree_iter_update_trigger_flags +bch2_btree_iter_flags(struct btree_trans *trans, + unsigned btree_id, unsigned level, + enum btree_iter_update_trigger_flags flags) { if (level || !btree_id_cached(trans->c, btree_id)) { flags &= ~BTREE_ITER_cached; @@ -497,15 +511,16 @@ static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, static inline void bch2_trans_iter_init_common(struct btree_trans *trans, struct btree_iter *iter, - unsigned btree_id, struct bpos pos, + enum btree_id btree, struct bpos pos, unsigned locks_want, unsigned depth, - unsigned flags, + enum btree_iter_update_trigger_flags flags, unsigned long ip) { + iter->trans = trans; iter->update_path = 0; iter->key_cache_path = 0; - iter->btree_id = btree_id; + iter->btree_id = btree; iter->min_depth = 0; iter->flags = flags; iter->snapshot = pos.snapshot; @@ -515,99 +530,156 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans, #ifdef CONFIG_BCACHEFS_DEBUG iter->ip_allocated = ip; #endif - iter->path = bch2_path_get(trans, btree_id, iter->pos, - locks_want, depth, flags, ip); + iter->path = bch2_path_get(trans, btree, iter->pos, locks_want, depth, flags, ip); } void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *, - enum btree_id, struct bpos, unsigned); + enum btree_id, struct bpos, + enum btree_iter_update_trigger_flags, + unsigned long ip); static inline void bch2_trans_iter_init(struct btree_trans *trans, struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - unsigned flags) + enum btree_id btree, struct bpos pos, + enum btree_iter_update_trigger_flags flags) { - if (__builtin_constant_p(btree_id) && + if (__builtin_constant_p(btree) && __builtin_constant_p(flags)) - bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, - bch2_btree_iter_flags(trans, btree_id, 0, flags), - _THIS_IP_); + bch2_trans_iter_init_common(trans, iter, btree, pos, 0, 0, + bch2_btree_iter_flags(trans, btree, 0, flags), + _RET_IP_); else - bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags); + bch2_trans_iter_init_outlined(trans, iter, btree, pos, flags, _RET_IP_); } +#define bch2_trans_iter_class_init(_trans, _btree, _pos, _flags) \ +({ \ + struct btree_iter iter; \ + bch2_trans_iter_init(_trans, &iter, (_btree), (_pos), (_flags)); \ + iter; \ +}) + +DEFINE_CLASS(btree_iter, struct btree_iter, + bch2_trans_iter_exit(&_T), + bch2_trans_iter_class_init(trans, btree, pos, flags), + struct btree_trans *trans, + enum btree_id btree, struct bpos pos, + enum btree_iter_update_trigger_flags flags); + void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, enum btree_id, struct 
bpos, - unsigned, unsigned, unsigned); -void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btree_iter *); + unsigned, unsigned, + enum btree_iter_update_trigger_flags); -void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *); +void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); -void *__bch2_trans_kmalloc(struct btree_trans *, size_t); +void bch2_set_btree_iter_dontneed(struct btree_iter *); -/** - * bch2_trans_kmalloc - allocate memory for use by the current transaction - * - * Must be called after bch2_trans_begin, which on second and further calls - * frees all memory allocated in this transaction - */ -static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE +void bch2_trans_kmalloc_trace_to_text(struct printbuf *, + darray_trans_kmalloc_trace *); +#endif + +void *__bch2_trans_kmalloc(struct btree_trans *, size_t, unsigned long); + +static inline void bch2_trans_kmalloc_trace(struct btree_trans *trans, size_t size, + unsigned long ip) +{ +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_push(&trans->trans_kmalloc_trace, + ((struct trans_kmalloc_trace) { .ip = ip, .bytes = size })); +#endif +} + +static __always_inline void *bch2_trans_kmalloc_nomemzero_ip(struct btree_trans *trans, size_t size, + unsigned long ip) { size = roundup(size, 8); + bch2_trans_kmalloc_trace(trans, size, ip); + if (likely(trans->mem_top + size <= trans->mem_bytes)) { void *p = trans->mem + trans->mem_top; trans->mem_top += size; - memset(p, 0, size); return p; } else { - return __bch2_trans_kmalloc(trans, size); + return __bch2_trans_kmalloc(trans, size, ip); } } -static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) +static __always_inline void *bch2_trans_kmalloc_ip(struct btree_trans *trans, size_t size, + unsigned long ip) { - size = round_up(size, 8); + size = roundup(size, 8); + + bch2_trans_kmalloc_trace(trans, size, ip); if (likely(trans->mem_top + size <= trans->mem_bytes)) { void *p = trans->mem + trans->mem_top; trans->mem_top += size; + memset(p, 0, size); return p; } else { - return __bch2_trans_kmalloc(trans, size); + return __bch2_trans_kmalloc(trans, size, ip); } } +/** + * bch2_trans_kmalloc - allocate memory for use by the current transaction + * + * Must be called after bch2_trans_begin, which on second and further calls + * frees all memory allocated in this transaction + */ +static __always_inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +{ + return bch2_trans_kmalloc_ip(trans, size, _THIS_IP_); +} + +static __always_inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) +{ + return bch2_trans_kmalloc_nomemzero_ip(trans, size, _THIS_IP_); +} + static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans, struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - unsigned flags, unsigned type) + enum btree_id btree, struct bpos pos, + enum btree_iter_update_trigger_flags flags, + enum bch_bkey_type type) { struct bkey_s_c k; - bch2_trans_iter_init(trans, iter, btree_id, pos, flags); - k = bch2_btree_iter_peek_slot(trans, iter); + bch2_trans_iter_init(trans, iter, btree, pos, flags); + k = bch2_btree_iter_peek_slot(iter); if (!bkey_err(k) && type && k.k->type != type) - k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch); + k = bkey_s_c_err(bch_err_throw(trans->c, ENOENT_bkey_type_mismatch)); if (unlikely(bkey_err(k))) - bch2_trans_iter_exit(trans, 
iter); + bch2_trans_iter_exit(iter); return k; } static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans, struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - unsigned flags) + enum btree_id btree, struct bpos pos, + enum btree_iter_update_trigger_flags flags) { - return __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags, 0); + return __bch2_bkey_get_iter(trans, iter, btree, pos, flags, 0); } -#define bch2_bkey_get_iter_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\ - bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \ - _btree_id, _pos, _flags, KEY_TYPE_##_type)) +static inline struct bkey_s_c __bch2_bkey_get_typed(struct btree_iter *iter, + enum bch_bkey_type type) +{ + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + + if (!bkey_err(k) && type && k.k->type != type) + k = bkey_s_c_err(bch_err_throw(iter->trans->c, ENOENT_bkey_type_mismatch)); + return k; +} + +#define bch2_bkey_get_typed(_iter, _type) \ + bkey_s_c_to_##_type(__bch2_bkey_get_typed(_iter, KEY_TYPE_##_type)) static inline void __bkey_val_copy(void *dst_v, unsigned dst_size, struct bkey_s_c src_k) { @@ -624,18 +696,16 @@ do { \ } while (0) static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, - unsigned btree_id, struct bpos pos, - unsigned flags, unsigned type, + enum btree_id btree, struct bpos pos, + enum btree_iter_update_trigger_flags flags, + enum bch_bkey_type type, unsigned val_size, void *val) { - struct btree_iter iter; - struct bkey_s_c k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); + CLASS(btree_iter, iter)(trans, btree, pos, flags); + struct bkey_s_c k = __bch2_bkey_get_typed(&iter, type); int ret = bkey_err(k); - if (!ret) { + if (!ret) __bkey_val_copy(val, val_size, k); - bch2_trans_iter_exit(trans, &iter); - } - return ret; } @@ -658,17 +728,17 @@ u32 bch2_trans_begin(struct btree_trans *); int _ret3 = 0; \ do { \ _ret3 = lockrestart_do((_trans), ({ \ - struct btree *_b = bch2_btree_iter_peek_node(_trans, &_iter);\ + struct btree *_b = bch2_btree_iter_peek_node(&_iter); \ if (!_b) \ break; \ \ PTR_ERR_OR_ZERO(_b) ?: (_do); \ })) ?: \ lockrestart_do((_trans), \ - PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(_trans, &_iter)));\ + PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(&_iter))); \ } while (!_ret3); \ \ - bch2_trans_iter_exit((_trans), &(_iter)); \ + bch2_trans_iter_exit(&(_iter)); \ _ret3; \ }) @@ -677,34 +747,31 @@ u32 bch2_trans_begin(struct btree_trans *); __for_each_btree_node(_trans, _iter, _btree_id, _start, \ 0, 0, _flags, _b, _do) -static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_trans *trans, - struct btree_iter *iter, - unsigned flags) +static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, + enum btree_iter_update_trigger_flags flags) { - return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) : - bch2_btree_iter_peek_prev(trans, iter); + return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) : + bch2_btree_iter_peek_prev(iter); } -static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_trans *trans, - struct btree_iter *iter, - unsigned flags) +static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, + enum btree_iter_update_trigger_flags flags) { - return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) : - bch2_btree_iter_peek(trans, iter); + return flags & BTREE_ITER_slots ? 
bch2_btree_iter_peek_slot(iter) : + bch2_btree_iter_peek(iter); } -static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_trans *trans, - struct btree_iter *iter, +static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_iter *iter, struct bpos end, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { if (!(flags & BTREE_ITER_slots)) - return bch2_btree_iter_peek_max(trans, iter, end); + return bch2_btree_iter_peek_max(iter, end); if (bkey_gt(iter->pos, end)) return bkey_s_c_null; - return bch2_btree_iter_peek_slot(trans, iter); + return bch2_btree_iter_peek_slot(iter); } int __bch2_btree_trans_too_many_iters(struct btree_trans *); @@ -760,7 +827,7 @@ transaction_restart: \ if (!_ret2) \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ \ - _ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \ + _ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \ }) #define for_each_btree_key_max_continue(_trans, _iter, \ @@ -771,62 +838,52 @@ transaction_restart: \ \ do { \ _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), \ + (_k) = bch2_btree_iter_peek_max_type(&(_iter), \ _end, (_flags)); \ if (!(_k).k) \ break; \ \ bkey_err(_k) ?: (_do); \ })); \ - } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \ + } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \ \ - bch2_trans_iter_exit((_trans), &(_iter)); \ _ret3; \ }) #define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \ for_each_btree_key_max_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do) -#define for_each_btree_key_max(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _do) \ -({ \ - bch2_trans_begin(trans); \ - \ - struct btree_iter _iter; \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ - for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do);\ +#define for_each_btree_key_max(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _do) \ +({ \ + bch2_trans_begin(trans); \ + \ + CLASS(btree_iter, _iter)((_trans), (_btree_id), (_start), (_flags)); \ + for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do); \ }) -#define for_each_btree_key(_trans, _iter, _btree_id, \ - _start, _flags, _k, _do) \ - for_each_btree_key_max(_trans, _iter, _btree_id, _start, \ - SPOS_MAX, _flags, _k, _do) +#define for_each_btree_key(_trans, _iter, _btree_id, _start, _flags, _k, _do) \ + for_each_btree_key_max(_trans, _iter, _btree_id, _start, SPOS_MAX, _flags, _k, _do) -#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ - _start, _flags, _k, _do) \ -({ \ - struct btree_iter _iter; \ - struct bkey_s_c _k; \ - int _ret3 = 0; \ - \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ - do { \ - _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), \ - (_flags)); \ - if (!(_k).k) \ - break; \ - \ - bkey_err(_k) ?: (_do); \ - })); \ - } while (!_ret3 && bch2_btree_iter_rewind(_trans, &(_iter))); \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret3; \ +#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ + _start, _flags, _k, _do) \ +({ \ + int _ret3 = 0; \ + \ + CLASS(btree_iter, iter)((_trans), (_btree_id), (_start), (_flags)); \ + \ + do { \ + _ret3 = lockrestart_do(_trans, ({ \ + struct bkey_s_c _k = \ + bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\ + if (!(_k).k) \ + break; \ + \ + bkey_err(_k) ?: (_do); \ + })); \ + } while (!_ret3 && 
bch2_btree_iter_rewind(&(_iter))); \ + \ + _ret3; \ }) #define for_each_btree_key_commit(_trans, _iter, _btree_id, \ @@ -853,38 +910,36 @@ transaction_restart: \ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) -struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *, - struct btree_iter *); - -#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags),\ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(_trans, &(_iter))) - -#define for_each_btree_key_max_continue_norestart(_trans, _iter, _end, _flags, _k, _ret)\ - for (; \ - (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags), \ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(_trans, &(_iter))) - -#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ - for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\ +struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); + +#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _ret) \ + for (CLASS(btree_iter, _iter)((_trans), (_btree_id), (_start), (_flags)); \ + (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start, \ SPOS_MAX, _flags, _k, _ret) -#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), _flags), \ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_rewind(_trans, &(_iter))) +#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret) \ + for (; \ + (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ + for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) -#define for_each_btree_key_continue_norestart(_trans, _iter, _flags, _k, _ret) \ - for_each_btree_key_max_continue_norestart(_trans, _iter, SPOS_MAX, _flags, _k, _ret) +#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for (CLASS(btree_iter, _iter)((_trans), (_btree_id), \ + (_start), (_flags)); \ + (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_rewind(&(_iter))) /* * This should not be used in a fastpath, without first trying _do in @@ -922,16 +977,20 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *, _p; \ }) -#define bch2_trans_run(_c, _do) \ +#define allocate_dropping_locks_norelock(_trans, _lock_dropped, _do) \ ({ \ - struct btree_trans *trans = bch2_trans_get(_c); \ - int _ret = (_do); \ - bch2_trans_put(trans); \ - _ret; \ + gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ + typeof(_do) _p = _do; \ + _lock_dropped = false; \ + if (unlikely(!_p)) { \ + bch2_trans_unlock(_trans); \ + _lock_dropped = true; \ 
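/*
 * Illustrative sketch, not part of the patch: how
 * allocate_dropping_locks_norelock() above is meant to be used. The _do
 * expression must allocate with the macro-local _gfp; it runs GFP_NOWAIT
 * under the transaction locks first, then GFP_KERNEL after unlocking, and
 * _lock_dropped tells the caller it must relock and revalidate itself.
 */
static int example_alloc(struct btree_trans *trans, unsigned u64s)
{
	bool lock_dropped;
	struct bkey_i *k = allocate_dropping_locks_norelock(trans, lock_dropped,
				kmalloc(u64s * sizeof(u64), _gfp));
	if (!k)
		return -ENOMEM;

	if (lock_dropped) {
		int ret = bch2_trans_relock(trans);
		if (ret) {
			kfree(k);
			return ret;
		}
	}

	/* ... use k ... */
	kfree(k);
	return 0;
}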
+ _gfp = GFP_KERNEL; \ + _p = _do; \ + } \ + _p; \ }) -#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do)) - struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned); void bch2_trans_put(struct btree_trans *); @@ -949,6 +1008,33 @@ unsigned bch2_trans_get_fn_idx(const char *); __bch2_trans_get(_c, trans_fn_idx); \ }) +/* + * We don't use DEFINE_CLASS() because using a function for the constructor + * breaks bch2_trans_get()'s use of __func__ + */ +typedef struct btree_trans * class_btree_trans_t; +static inline void class_btree_trans_destructor(struct btree_trans **p) +{ + struct btree_trans *trans = *p; + bch2_trans_put(trans); +} + +#define class_btree_trans_constructor(_c) bch2_trans_get(_c) + +/* deprecated, prefer CLASS(btree_trans) */ +#define bch2_trans_run(_c, _do) \ +({ \ + CLASS(btree_trans, trans)(_c); \ + (_do); \ +}) + +/* deprecated, prefer CLASS(btree_trans) */ +#define bch2_trans_do(_c, _do) \ +({ \ + CLASS(btree_trans, trans)(_c); \ + lockrestart_do(trans, _do); \ +}) + void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); void bch2_fs_btree_iter_exit(struct bch_fs *); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index ade3b5addd75..24f2fbe84ad7 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -5,6 +5,7 @@ #include "bset.h" #include "btree_cache.h" #include "btree_journal_iter.h" +#include "disk_accounting.h" #include "journal_io.h" #include @@ -137,12 +138,15 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b struct journal_key *k; BUG_ON(*idx > keys->nr); + + if (!keys->nr) + return NULL; search: if (!*idx) *idx = __bch2_journal_key_search(keys, btree_id, level, pos); - while (*idx && - __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) { + while (*idx < keys->nr && + __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) { (*idx)++; iters++; if (iters == 10) { @@ -151,18 +155,23 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b } } + if (*idx == keys->nr) + --(*idx); + struct bkey_i *ret = NULL; rcu_read_lock(); /* for overwritten_ranges */ - while ((k = *idx < keys->nr ? 
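/*
 * Illustrative sketch, not part of the patch: what new callers look like
 * with the class above instead of the deprecated bch2_trans_run()/
 * bch2_trans_do() wrappers; bch2_trans_put() runs at scope exit.
 * example_op() is a hypothetical transaction helper.
 */
static int example_with_class(struct bch_fs *c)
{
	CLASS(btree_trans, trans)(c);

	return lockrestart_do(trans, example_op(trans));
}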
idx_to_key(keys, *idx) : NULL)) { + while (true) { + k = idx_to_key(keys, *idx); if (__journal_key_cmp(btree_id, level, end_pos, k) > 0) break; if (k->overwritten) { if (k->overwritten_range) - *idx = rcu_dereference(k->overwritten_range)->start - 1; - else - *idx -= 1; + *idx = rcu_dereference(k->overwritten_range)->start; + if (!*idx) + break; + --(*idx); continue; } @@ -171,6 +180,8 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b break; } + if (!*idx) + break; --(*idx); iters++; if (iters == 10) { @@ -268,12 +279,23 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, if (idx < keys->size && journal_key_cmp(&n, &keys->data[idx]) == 0) { + struct bkey_i *o = keys->data[idx].k; + + if (k->k.type == KEY_TYPE_accounting && + o->k.type == KEY_TYPE_accounting) { + if (!keys->data[idx].allocated) + goto insert; + + bch2_accounting_accumulate(bkey_i_to_accounting(k), + bkey_i_to_s_c_accounting(o)); + } + if (keys->data[idx].allocated) kfree(keys->data[idx].k); keys->data[idx] = n; return 0; } - +insert: if (idx > keys->gap) idx -= keys->size - keys->nr; @@ -292,7 +314,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, if (!new_keys.data) { bch_err(c, "%s: error allocating new key array (size %zu)", __func__, new_keys.size); - return -BCH_ERR_ENOMEM_journal_key_insert; + return bch_err_throw(c, ENOMEM_journal_key_insert); } /* Since @keys was full, there was no gap: */ @@ -331,7 +353,7 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); if (!n) - return -BCH_ERR_ENOMEM_journal_key_insert; + return bch_err_throw(c, ENOMEM_journal_key_insert); bkey_copy(n, k); ret = bch2_journal_key_insert_take(c, id, level, n); @@ -440,9 +462,8 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, keys->data[idx].level == level && bpos_eq(keys->data[idx].k->k.p, pos) && !keys->data[idx].overwritten) { - mutex_lock(&keys->overwrite_lock); + guard(mutex)(&keys->overwrite_lock); __bch2_journal_key_overwritten(keys, idx); - mutex_unlock(&keys->overwrite_lock); } } @@ -457,11 +478,9 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { - struct bkey_s_c ret = bkey_s_c_null; - journal_iter_verify(iter); - rcu_read_lock(); + guard(rcu)(); while (iter->idx < iter->keys->size) { struct journal_key *k = iter->keys->data + iter->idx; @@ -470,19 +489,16 @@ static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) break; BUG_ON(cmp); - if (!k->overwritten) { - ret = bkey_i_to_s_c(k->k); - break; - } + if (!k->overwritten) + return bkey_i_to_s_c(k->k); if (k->overwritten_range) iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end); else bch2_journal_iter_advance(iter); } - rcu_read_unlock(); - return ret; + return bkey_s_c_null; } static void bch2_journal_iter_exit(struct journal_iter *iter) @@ -646,10 +662,11 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) { const struct journal_key *l = _l; const struct journal_key *r = _r; + int rewind = l->rewind && r->rewind ? 
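/*
 * Illustrative sketch, not part of the patch: the bch_err_throw() conversion
 * applied throughout this series. The error loses its -BCH_ERR_ prefix and
 * gains the filesystem, so the throw site is accounted against that
 * filesystem instead of being returned as a bare constant.
 */
static struct bkey_i *example_dup_key(struct bch_fs *c, struct bkey_i *k, int *ret)
{
	struct bkey_i *n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);

	if (!n) {
		/* was: *ret = -BCH_ERR_ENOMEM_journal_key_insert; */
		*ret = bch_err_throw(c, ENOMEM_journal_key_insert);
		return NULL;
	}

	bkey_copy(n, k);
	return n;
}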
-1 : 1; return journal_key_cmp(l, r) ?: - cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->journal_offset, r->journal_offset); + ((cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset)) * rewind); } void bch2_journal_keys_put(struct bch_fs *c) @@ -718,6 +735,8 @@ int bch2_journal_keys_sort(struct bch_fs *c) struct journal_keys *keys = &c->journal_keys; size_t nr_read = 0; + u64 rewind_seq = c->opts.journal_rewind ?: U64_MAX; + genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; @@ -726,28 +745,43 @@ int bch2_journal_keys_sort(struct bch_fs *c) cond_resched(); - for_each_jset_key(k, entry, &i->j) { - struct journal_key n = (struct journal_key) { - .btree_id = entry->btree_id, - .level = entry->level, - .k = k, - .journal_seq = le64_to_cpu(i->j.seq), - .journal_offset = k->_data - i->j._data, - }; - - if (darray_push(keys, n)) { - __journal_keys_sort(keys); - - if (keys->nr * 8 > keys->size * 7) { - bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", - keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); - return -BCH_ERR_ENOMEM_journal_keys_sort; + vstruct_for_each(&i->j, entry) { + bool rewind = !entry->level && + !btree_id_is_alloc(entry->btree_id) && + le64_to_cpu(i->j.seq) >= rewind_seq; + + if (entry->type != (rewind + ? BCH_JSET_ENTRY_overwrite + : BCH_JSET_ENTRY_btree_keys)) + continue; + + if (!rewind && le64_to_cpu(i->j.seq) < c->journal_replay_seq_start) + continue; + + jset_entry_for_each_key(entry, k) { + struct journal_key n = (struct journal_key) { + .btree_id = entry->btree_id, + .level = entry->level, + .rewind = rewind, + .k = k, + .journal_seq = le64_to_cpu(i->j.seq), + .journal_offset = k->_data - i->j._data, + }; + + if (darray_push(keys, n)) { + __journal_keys_sort(keys); + + if (keys->nr * 8 > keys->size * 7) { + bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", + keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); + return bch_err_throw(c, ENOMEM_journal_keys_sort); + } + + BUG_ON(darray_push(keys, n)); } - BUG_ON(darray_push(keys, n)); + nr_read++; } - - nr_read++; } } @@ -780,7 +814,7 @@ void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree, void bch2_journal_keys_dump(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); pr_info("%zu keys:", keys->nr); @@ -794,7 +828,6 @@ void bch2_journal_keys_dump(struct bch_fs *c) bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); pr_err("%s", buf.buf); } - printbuf_exit(&buf); } void bch2_fs_journal_keys_init(struct bch_fs *c) diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h index 8b773823704f..86aacb254fb2 100644 --- a/fs/bcachefs/btree_journal_iter_types.h +++ b/fs/bcachefs/btree_journal_iter_types.h @@ -11,8 +11,9 @@ struct journal_key { u32 journal_offset; enum btree_id btree_id:8; unsigned level:8; - bool allocated; - bool overwritten; + bool allocated:1; + bool overwritten:1; + bool rewind:1; struct journal_key_range_overwritten __rcu * overwritten_range; struct bkey_i *k; diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 669825f89cdd..e3336ab27ccc 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -13,6 +13,7 @@ #include "trace.h" #include +#include static inline bool btree_uses_pcpu_readers(enum btree_id id) { @@ -101,8 +102,8 @@ static void 
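/*
 * Illustrative sketch, not part of the patch: the CLASS(printbuf) pattern
 * used in bch2_journal_keys_dump() above; the PRINTBUF initializer and the
 * printbuf_exit() on every return path collapse into one declaration.
 */
static void example_log_key(struct bch_fs *c, struct bkey_s_c k)
{
	CLASS(printbuf, buf)();

	bch2_bkey_val_to_text(&buf, c, k);
	pr_info("%s", buf.buf);
	/* printbuf_exit(&buf) runs automatically here */
}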
__bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu kmem_cache_free(bch2_key_cache, ck); } -static void bkey_cached_free(struct btree_key_cache *bc, - struct bkey_cached *ck) +static inline void bkey_cached_free_noassert(struct btree_key_cache *bc, + struct bkey_cached *ck) { kfree(ck->k); ck->k = NULL; @@ -116,6 +117,19 @@ static void bkey_cached_free(struct btree_key_cache *bc, this_cpu_inc(*bc->nr_pending); } +static void bkey_cached_free(struct btree_trans *trans, + struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + /* + * we'll hit strange issues in the SRCU code if we aren't holding an + * SRCU read lock... + */ + EBUG_ON(!trans->srcu_held); + + bkey_cached_free_noassert(bc, ck); +} + static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) { gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; @@ -174,27 +188,23 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k static struct bkey_cached * bkey_cached_reuse(struct btree_key_cache *c) { - struct bucket_table *tbl; + + guard(rcu)(); + struct bucket_table *tbl = rht_dereference_rcu(c->table.tbl, &c->table); struct rhash_head *pos; struct bkey_cached *ck; - unsigned i; - rcu_read_lock(); - tbl = rht_dereference_rcu(c->table.tbl, &c->table); - for (i = 0; i < tbl->size; i++) + for (unsigned i = 0; i < tbl->size; i++) rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bkey_cached_lock_for_evict(ck)) { if (bkey_cached_evict(c, ck)) - goto out; + return ck; six_unlock_write(&ck->c.lock); six_unlock_intent(&ck->c.lock); } } - ck = NULL; -out: - rcu_read_unlock(); - return ck; + return NULL; } static int btree_key_cache_create(struct btree_trans *trans, @@ -229,7 +239,7 @@ static int btree_key_cache_create(struct btree_trans *trans, if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", bch2_btree_id_str(ck_path->btree_id)); - return -BCH_ERR_ENOMEM_btree_key_cache_create; + return bch_err_throw(c, ENOMEM_btree_key_cache_create); } } @@ -244,11 +254,13 @@ static int btree_key_cache_create(struct btree_trans *trans, struct bkey_i *new_k = allocate_dropping_locks(trans, ret, kmalloc(key_u64s * sizeof(u64), _gfp)); - if (unlikely(!new_k)) { + if (unlikely(!new_k && !ret)) { bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", bch2_btree_id_str(ck->key.btree_id), key_u64s); - ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; - } else if (ret) { + ret = bch_err_throw(c, ENOMEM_btree_key_cache_fill); + } + + if (unlikely(ret)) { kfree(new_k); goto err; } @@ -281,7 +293,7 @@ static int btree_key_cache_create(struct btree_trans *trans, ck_path->uptodate = BTREE_ITER_UPTODATE; return 0; err: - bkey_cached_free(bc, ck); + bkey_cached_free(trans, bc, ck); mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); return ret; @@ -291,13 +303,12 @@ static noinline_for_stack void do_trace_key_cache_fill(struct btree_trans *trans struct btree_path *ck_path, struct bkey_s_c k) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_bpos_to_text(&buf, ck_path->pos); prt_char(&buf, ' '); bch2_bkey_val_to_text(&buf, trans->c, k); trace_key_cache_fill(trans, buf.buf); - printbuf_exit(&buf); } static noinline int btree_key_cache_fill(struct btree_trans *trans, @@ -312,19 +323,17 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, } struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - bch2_trans_iter_init(trans, &iter, 
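/*
 * Illustrative sketch, not part of the patch: the guard(rcu)() conversion
 * seen in bkey_cached_reuse() above. The read-side section is tied to the
 * scope, so every return path implicitly drops the RCU read lock.
 */
static bool example_table_nonempty(struct btree_key_cache *bc)
{
	guard(rcu)();

	struct bucket_table *tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);

	return tbl && tbl->size;	/* rcu read lock dropped on return */
}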
ck_path->btree_id, ck_path->pos, - BTREE_ITER_intent| - BTREE_ITER_key_cache_fill| - BTREE_ITER_cached_nofill); + CLASS(btree_iter, iter)(trans, ck_path->btree_id, ck_path->pos, + BTREE_ITER_intent| + BTREE_ITER_nofilter_whiteouts| + BTREE_ITER_key_cache_fill| + BTREE_ITER_cached_nofill); iter.flags &= ~BTREE_ITER_with_journal; - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); if (ret) - goto err; + return ret; /* Recheck after btree lookup, before allocating: */ ck_path = trans->paths + ck_path_idx; @@ -334,15 +343,13 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k); if (ret) - goto err; + return ret; if (trace_key_cache_fill_enabled()) do_trace_key_cache_fill(trans, ck_path, k); out: /* We're not likely to need this iterator again: */ - bch2_set_btree_iter_dontneed(trans, &iter); -err: - bch2_trans_iter_exit(trans, &iter); + bch2_set_btree_iter_dontneed(&iter); return ret; } @@ -398,7 +405,7 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, btree_node_unlock(trans, path, 0); path->l[0].b = ERR_PTR(ret); } - } else { + } else if (!(flags & BTREE_ITER_cached_nofill)) { BUG_ON(path->uptodate); BUG_ON(!path->nodes_locked); } @@ -414,35 +421,34 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct journal *j = &c->journal; - struct btree_iter c_iter, b_iter; struct bkey_cached *ck = NULL; int ret; - bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, - BTREE_ITER_slots| - BTREE_ITER_intent| - BTREE_ITER_all_snapshots); - bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, - BTREE_ITER_cached| - BTREE_ITER_intent); + CLASS(btree_iter, b_iter)(trans, key.btree_id, key.pos, + BTREE_ITER_slots| + BTREE_ITER_intent| + BTREE_ITER_all_snapshots); + CLASS(btree_iter, c_iter)(trans, key.btree_id, key.pos, + BTREE_ITER_cached| + BTREE_ITER_intent); b_iter.flags &= ~BTREE_ITER_with_key_cache; - ret = bch2_btree_iter_traverse(trans, &c_iter); + ret = bch2_btree_iter_traverse(&c_iter); if (ret) - goto out; + return ret; ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b; if (!ck) - goto out; + return 0; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { if (evict) goto evict; - goto out; + return 0; } if (journal_seq && ck->journal.seq != journal_seq) - goto out; + return 0; trans->journal_res.seq = ck->journal.seq; @@ -459,7 +465,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, !test_bit(JOURNAL_space_low, &c->journal.flags)) commit_flags |= BCH_TRANS_COMMIT_no_journal_res; - struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(trans, &b_iter); + struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter); ret = bkey_err(btree_k); if (ret) goto err; @@ -511,15 +517,13 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); if (bkey_cached_evict(&c->btree_key_cache, ck)) { - bkey_cached_free(&c->btree_key_cache, ck); + bkey_cached_free(trans, &c->btree_key_cache, ck); } else { six_unlock_write(&ck->c.lock); six_unlock_intent(&ck->c.lock); } } out: - bch2_trans_iter_exit(trans, &b_iter); - bch2_trans_iter_exit(trans, &c_iter); return ret; } @@ -530,10 +534,10 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, struct bkey_cached *ck = container_of(pin, struct bkey_cached, journal); struct bkey_cached_key key; - struct 
btree_trans *trans = bch2_trans_get(c); int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); int ret = 0; + CLASS(btree_trans, trans)(c); btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read); key = ck->key; @@ -556,8 +560,6 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, BCH_TRANS_COMMIT_journal_reclaim, false)); unlock: srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); - - bch2_trans_put(trans); return ret; } @@ -571,6 +573,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, bool kick_reclaim = false; BUG_ON(insert->k.u64s > ck->u64s); + BUG_ON(bkey_deleted(&insert->k)); bkey_copy(ck->k, insert); @@ -625,7 +628,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, } bkey_cached_evict(bc, ck); - bkey_cached_free(bc, ck); + bkey_cached_free(trans, bc, ck); mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); @@ -633,10 +636,17 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, unsigned i; trans_for_each_path(trans, path2, i) if (path2->l[0].b == (void *) ck) { + /* + * It's safe to clear should_be_locked here because + * we're evicting from the key cache, and we still have + * the underlying btree locked: filling into the key + * cache would require taking a write lock on the btree + * node + */ + path2->should_be_locked = false; __bch2_btree_path_unlock(trans, path2); path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop); - path2->should_be_locked = false; - btree_path_set_dirty(path2, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path2, BTREE_ITER_NEED_TRAVERSE); } bch2_trans_verify_locks(trans); @@ -693,7 +703,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, } else if (!bkey_cached_lock_for_evict(ck)) { bc->skipped_lock_fail++; } else if (bkey_cached_evict(bc, ck)) { - bkey_cached_free(bc, ck); + bkey_cached_free_noassert(bc, ck); bc->freed++; freed++; } else { @@ -799,6 +809,18 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) { } +static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) +{ + struct bch_fs *c = shrink->private_data; + struct btree_key_cache *bc = &c->btree_key_cache; + char *cbuf; + size_t buflen = seq_buf_get_buf(s, &cbuf); + struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); + + bch2_btree_key_cache_to_text(&out, bc); + seq_buf_commit(s, out.pos); +} + int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) { struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); @@ -806,23 +828,24 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) bc->nr_pending = alloc_percpu(size_t); if (!bc->nr_pending) - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) || rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free)) - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); bc->table_init_done = true; shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name); if (!shrink) - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); bc->shrink = shrink; shrink->count_objects = bch2_btree_key_cache_count; shrink->scan_objects = bch2_btree_key_cache_scan; + shrink->to_text = 
bch2_btree_key_cache_shrinker_to_text; shrink->batch = 1 << 14; shrink->seeks = 0; shrink->private_data = c; diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 94eb2b73a843..38c5643e8a78 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_cache.h" #include "btree_locking.h" #include "btree_types.h" @@ -158,13 +159,11 @@ static void trace_would_deadlock(struct lock_graph *g, struct btree_trans *trans count_event(c, trans_restart_would_deadlock); if (trace_trans_restart_would_deadlock_enabled()) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); + guard(printbuf_atomic)(&buf); - buf.atomic++; print_cycle(&buf, g); - trace_trans_restart_would_deadlock(trans, buf.buf); - printbuf_exit(&buf); } } @@ -193,6 +192,29 @@ static int btree_trans_abort_preference(struct btree_trans *trans) return 3; } +static noinline __noreturn void break_cycle_fail(struct lock_graph *g) +{ + CLASS(printbuf, buf)(); + guard(printbuf_atomic)(&buf); + + prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); + + for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) { + struct btree_trans *trans = i->trans; + + bch2_btree_trans_to_text(&buf, trans); + + prt_printf(&buf, "backtrace:\n"); + printbuf_indent_add(&buf, 2); + bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + } + + bch2_print_str(g->g->trans->c, KERN_ERR, buf.buf); + BUG(); +} + static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, struct trans_waiting_for_lock *from) { @@ -218,28 +240,8 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, } } - if (unlikely(!best)) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - - prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); - - for (i = g->g; i < g->g + g->nr; i++) { - struct btree_trans *trans = i->trans; - - bch2_btree_trans_to_text(&buf, trans); - - prt_printf(&buf, "backtrace:\n"); - printbuf_indent_add(&buf, 2); - bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); - } - - bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf); - printbuf_exit(&buf); - BUG(); - } + if (unlikely(!best)) + break_cycle_fail(g); ret = abort_lock(g, abort); out: @@ -254,15 +256,14 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, struct printbuf *cycle) { struct btree_trans *orig_trans = g->g->trans; - struct trans_waiting_for_lock *i; - for (i = g->g; i < g->g + g->nr; i++) + for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) if (i->trans == trans) { closure_put(&trans->ref); return break_cycle(g, cycle, i); } - if (g->nr == ARRAY_SIZE(g->g)) { + if (unlikely(g->nr == ARRAY_SIZE(g->g))) { closure_put(&trans->ref); if (orig_trans->lock_may_not_fail) @@ -307,7 +308,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) lock_graph_down(&g, trans); /* trans->paths is rcu protected vs. 
freeing */ - rcu_read_lock(); + guard(rcu)(); if (cycle) cycle->atomic++; next: @@ -405,7 +406,6 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) out: if (cycle) --cycle->atomic; - rcu_read_unlock(); return ret; } @@ -450,13 +450,13 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, /* relock */ -static inline bool btree_path_get_locks(struct btree_trans *trans, - struct btree_path *path, - bool upgrade, - struct get_locks_fail *f) +static int btree_path_get_locks(struct btree_trans *trans, + struct btree_path *path, + bool upgrade, + struct get_locks_fail *f, + int restart_err) { unsigned l = path->level; - int fail_idx = -1; do { if (!btree_path_node(path, l)) @@ -464,39 +464,49 @@ static inline bool btree_path_get_locks(struct btree_trans *trans, if (!(upgrade ? bch2_btree_node_upgrade(trans, path, l) - : bch2_btree_node_relock(trans, path, l))) { - fail_idx = l; - - if (f) { - f->l = l; - f->b = path->l[l].b; - } - } + : bch2_btree_node_relock(trans, path, l))) + goto err; l++; } while (l < path->locks_want); + if (path->uptodate == BTREE_ITER_NEED_RELOCK) + path->uptodate = BTREE_ITER_UPTODATE; + + return path->uptodate < BTREE_ITER_NEED_RELOCK ? 0 : -1; +err: + if (f) { + f->l = l; + f->b = path->l[l].b; + } + + /* + * Do transaction restart before unlocking, so we don't pop + * should_be_locked asserts + */ + if (restart_err) { + btree_trans_restart(trans, restart_err); + } else if (path->should_be_locked && !trans->restarted) { + if (upgrade) + path->locks_want = l; + return -1; + } + + __bch2_btree_path_unlock(trans, path); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); + /* * When we fail to get a lock, we have to ensure that any child nodes * can't be relocked so bch2_btree_path_traverse has to walk back up to * the node that we failed to relock: */ - if (fail_idx >= 0) { - __bch2_btree_path_unlock(trans, path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - - do { - path->l[fail_idx].b = upgrade - ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade) - : ERR_PTR(-BCH_ERR_no_btree_node_relock); - --fail_idx; - } while (fail_idx >= 0); - } - - if (path->uptodate == BTREE_ITER_NEED_RELOCK) - path->uptodate = BTREE_ITER_UPTODATE; + do { + path->l[l].b = upgrade + ? 
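/*
 * Illustrative sketch, not part of the patch: what callers see now that
 * btree_path_get_locks() issues the transaction restart itself. The restart
 * error propagates out and the usual retry loop (hand-rolled here; normally
 * hidden inside lockrestart_do()) begins the transaction again.
 * example_op() is a hypothetical helper.
 */
static int example_retry(struct btree_trans *trans)
{
	int ret;

	do {
		bch2_trans_begin(trans);
		ret = example_op(trans);
	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));

	return ret;
}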
ERR_PTR(-BCH_ERR_no_btree_node_upgrade) + : ERR_PTR(-BCH_ERR_no_btree_node_relock); + } while (l--); - return path->uptodate < BTREE_ITER_NEED_RELOCK; + return -restart_err ?: -1; } bool __bch2_btree_node_relock(struct btree_trans *trans, @@ -583,7 +593,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans, l++) { if (!bch2_btree_node_relock(trans, path, l)) { __bch2_btree_path_unlock(trans, path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); } @@ -595,9 +605,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans, __flatten bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path) { - struct get_locks_fail f; - - bool ret = btree_path_get_locks(trans, path, false, &f); + bool ret = !btree_path_get_locks(trans, path, false, NULL, 0); bch2_trans_verify_locks(trans); return ret; } @@ -613,27 +621,37 @@ int __bch2_btree_path_relock(struct btree_trans *trans, return 0; } -bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want, - struct get_locks_fail *f) +bool __bch2_btree_path_upgrade_norestart(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) { - EBUG_ON(path->locks_want >= new_locks_want); - path->locks_want = new_locks_want; - bool ret = btree_path_get_locks(trans, path, true, f); - bch2_trans_verify_locks(trans); + /* + * If we need it locked, we can't touch it. Otherwise, we can return + * success - bch2_path_get() will use this path, and it'll just be + * retraversed: + */ + bool ret = !btree_path_get_locks(trans, path, true, NULL, 0) || + !path->should_be_locked; + + bch2_btree_path_verify_locks(trans, path); return ret; } -bool __bch2_btree_path_upgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want, - struct get_locks_fail *f) +int __bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) { - bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f); - if (ret) + unsigned old_locks = path->nodes_locked; + unsigned old_locks_want = path->locks_want; + + path->locks_want = max_t(unsigned, path->locks_want, new_locks_want); + + struct get_locks_fail f = {}; + int ret = btree_path_get_locks(trans, path, true, &f, + BCH_ERR_transaction_restart_upgrade); + if (!ret) goto out; /* @@ -665,9 +683,29 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, linked->btree_id == path->btree_id && linked->locks_want < new_locks_want) { linked->locks_want = new_locks_want; - btree_path_get_locks(trans, linked, true, NULL); + btree_path_get_locks(trans, linked, true, NULL, 0); } } + + count_event(trans->c, trans_restart_upgrade); + if (trace_trans_restart_upgrade_enabled()) { + CLASS(printbuf, buf)(); + + prt_printf(&buf, "%s %pS\n", trans->fn, (void *) _RET_IP_); + prt_printf(&buf, "btree %s pos\n", bch2_btree_id_str(path->btree_id)); + bch2_bpos_to_text(&buf, path->pos); + prt_printf(&buf, "locks want %u -> %u level %u\n", + old_locks_want, new_locks_want, f.l); + prt_printf(&buf, "nodes_locked %x -> %x\n", + old_locks, path->nodes_locked); + prt_printf(&buf, "node %s ", IS_ERR(f.b) ? bch2_err_str(PTR_ERR(f.b)) : + !f.b ? 
"(null)" : "(node)"); + prt_printf(&buf, "path seq %u node seq %u\n", + IS_ERR_OR_NULL(f.b) ? 0 : f.b->c.lock.seq, + path->l[f.l].lock_seq); + + trace_trans_restart_upgrade(trans->c, buf.buf); + } out: bch2_trans_verify_locks(trans); return ret; @@ -699,7 +737,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, } } - bch2_btree_path_verify_locks(path); + bch2_btree_path_verify_locks(trans, path); trace_path_downgrade(trans, _RET_IP_, path, old_locks_want); } @@ -728,17 +766,19 @@ static inline void __bch2_trans_unlock(struct btree_trans *trans) __bch2_btree_path_unlock(trans, path); } -static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path, - struct get_locks_fail *f, bool trace) +static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path, + struct get_locks_fail *f, bool trace, ulong ip) { if (!trace) goto out; if (trace_trans_restart_relock_enabled()) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_bpos_to_text(&buf, path->pos); - prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq); + prt_printf(&buf, " %s l=%u seq=%u node seq=", + bch2_btree_id_str(path->btree_id), + f->l, path->l[f->l].lock_seq); if (IS_ERR_OR_NULL(f->b)) { prt_str(&buf, bch2_err_str(PTR_ERR(f->b))); } else { @@ -752,18 +792,16 @@ static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, str prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); } - trace_trans_restart_relock(trans, _RET_IP_, buf.buf); - printbuf_exit(&buf); + trace_trans_restart_relock(trans, ip, buf.buf); } count_event(trans->c, trans_restart_relock); out: __bch2_trans_unlock(trans); bch2_trans_verify_locks(trans); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); } -static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) +static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace, ulong ip) { bch2_trans_verify_locks(trans); @@ -777,10 +815,14 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) trans_for_each_path(trans, path, i) { struct get_locks_fail f; + int ret; if (path->should_be_locked && - !btree_path_get_locks(trans, path, false, &f)) - return bch2_trans_relock_fail(trans, path, &f, trace); + (ret = btree_path_get_locks(trans, path, false, &f, + BCH_ERR_transaction_restart_relock))) { + bch2_trans_relock_fail(trans, path, &f, trace, ip); + return ret; + } } trans_set_locked(trans, true); @@ -791,26 +833,19 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) int bch2_trans_relock(struct btree_trans *trans) { - return __bch2_trans_relock(trans, true); + return __bch2_trans_relock(trans, true, _RET_IP_); } int bch2_trans_relock_notrace(struct btree_trans *trans) { - return __bch2_trans_relock(trans, false); + return __bch2_trans_relock(trans, false, _RET_IP_); } -void bch2_trans_unlock_noassert(struct btree_trans *trans) +void bch2_trans_unlock(struct btree_trans *trans) { - __bch2_trans_unlock(trans); - trans_set_unlocked(trans); -} -void bch2_trans_unlock(struct btree_trans *trans) -{ __bch2_trans_unlock(trans); - - trans_set_unlocked(trans); } void bch2_trans_unlock_long(struct btree_trans *trans) @@ -842,32 +877,28 @@ int __bch2_trans_mutex_lock(struct btree_trans *trans, /* Debug */ -#ifdef CONFIG_BCACHEFS_DEBUG - -void bch2_btree_path_verify_locks(struct btree_path *path) +void __bch2_btree_path_verify_locks(struct btree_trans *trans, struct btree_path 
*path) { - /* - * A path may be uptodate and yet have nothing locked if and only if - * there is no node at path->level, which generally means we were - * iterating over all nodes and got to the end of the btree - */ - BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && - btree_path_node(path, path->level) && - !path->nodes_locked); + if (!path->nodes_locked && btree_path_node(path, path->level)) { + /* + * A path may be uptodate and yet have nothing locked if and only if + * there is no node at path->level, which generally means we were + * iterating over all nodes and got to the end of the btree + */ + BUG_ON(path->uptodate == BTREE_ITER_UPTODATE); + BUG_ON(path->should_be_locked && trans->locked && !trans->restarted); + } if (!path->nodes_locked) return; for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) { int want = btree_lock_want(path, l); - int have = btree_node_locked_type(path, l); + int have = btree_node_locked_type_nowrite(path, l); BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED); - BUG_ON(is_btree_node(path, l) && - (want == BTREE_NODE_UNLOCKED || - have != BTREE_NODE_WRITE_LOCKED) && - want != have); + BUG_ON(is_btree_node(path, l) && want != have); BUG_ON(btree_node_locked(path, l) && path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock)); @@ -885,7 +916,7 @@ static bool bch2_trans_locked(struct btree_trans *trans) return false; } -void bch2_trans_verify_locks(struct btree_trans *trans) +void __bch2_trans_verify_locks(struct btree_trans *trans) { if (!trans->locked) { BUG_ON(bch2_trans_locked(trans)); @@ -896,7 +927,5 @@ void bch2_trans_verify_locks(struct btree_trans *trans) unsigned i; trans_for_each_path(trans, path, i) - bch2_btree_path_verify_locks(path); + __bch2_btree_path_verify_locks(trans, path); } - -#endif diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index b33ab7af8440..f2173a3316f4 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -15,7 +15,6 @@ void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp); -void bch2_trans_unlock_noassert(struct btree_trans *); void bch2_trans_unlock_write(struct btree_trans *); static inline bool is_btree_node(struct btree_path *path, unsigned l) @@ -44,6 +43,15 @@ static inline int btree_node_locked_type(struct btree_path *path, return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3); } +static inline int btree_node_locked_type_nowrite(struct btree_path *path, + unsigned level) +{ + int have = btree_node_locked_type(path, level); + return have == BTREE_NODE_WRITE_LOCKED + ? 
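/*
 * Illustrative sketch, not part of the patch: the nodes_locked encoding that
 * btree_node_locked_type() and the new _nowrite() variant decode: two bits
 * per level, holding 0..3 for unlocked/read/intent/write relative to
 * BTREE_NODE_UNLOCKED.
 */
static inline bool example_any_write_locks(struct btree_path *path)
{
	for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
		int t = BTREE_NODE_UNLOCKED +
			((path->nodes_locked >> (l << 1)) & 3);

		if (t == BTREE_NODE_WRITE_LOCKED)
			return true;
	}
	return false;
}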
BTREE_NODE_INTENT_LOCKED + : have; +} + static inline bool btree_node_write_locked(struct btree_path *path, unsigned l) { return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED; @@ -152,7 +160,7 @@ static inline int btree_path_highest_level_locked(struct btree_path *path) static inline void __bch2_btree_path_unlock(struct btree_trans *trans, struct btree_path *path) { - btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_RELOCK); while (path->nodes_locked) btree_node_unlock(trans, path, btree_path_lowest_level_locked(path)); @@ -367,8 +375,8 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, struct btree_path *path, unsigned level) { EBUG_ON(btree_node_locked(path, level) && - !btree_node_write_locked(path, level) && - btree_node_locked_type(path, level) != __btree_lock_want(path, level)); + btree_node_locked_type_nowrite(path, level) != + __btree_lock_want(path, level)); return likely(btree_node_locked(path, level)) || (!IS_ERR_OR_NULL(path->l[level].b) && @@ -377,31 +385,29 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, /* upgrade */ -bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, - struct btree_path *, unsigned, - struct get_locks_fail *); +bool __bch2_btree_path_upgrade_norestart(struct btree_trans *, struct btree_path *, unsigned); -bool __bch2_btree_path_upgrade(struct btree_trans *, - struct btree_path *, unsigned, - struct get_locks_fail *); +static inline bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ + return new_locks_want > path->locks_want + ? __bch2_btree_path_upgrade_norestart(trans, path, new_locks_want) + : true; +} + +int __bch2_btree_path_upgrade(struct btree_trans *, + struct btree_path *, unsigned); static inline int bch2_btree_path_upgrade(struct btree_trans *trans, struct btree_path *path, unsigned new_locks_want) { - struct get_locks_fail f = {}; - unsigned old_locks_want = path->locks_want; - new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); - if (path->locks_want < new_locks_want - ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f) - : path->nodes_locked) - return 0; - - trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path, - old_locks_want, new_locks_want, &f); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); + return likely(path->locks_want >= new_locks_want && path->nodes_locked) + ? 
0 + : __bch2_btree_path_upgrade(trans, path, new_locks_want); } /* misc: */ @@ -411,8 +417,10 @@ static inline void btree_path_set_should_be_locked(struct btree_trans *trans, st EBUG_ON(!btree_node_locked(path, path->level)); EBUG_ON(path->uptodate); - path->should_be_locked = true; - trace_btree_path_should_be_locked(trans, path); + if (!path->should_be_locked) { + path->should_be_locked = true; + trace_btree_path_should_be_locked(trans, path); + } } static inline void __btree_path_set_level_up(struct btree_trans *trans, @@ -427,7 +435,7 @@ static inline void btree_path_set_level_up(struct btree_trans *trans, struct btree_path *path) { __btree_path_set_level_up(trans, path, path->level++); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); } /* debug */ @@ -439,12 +447,20 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *, int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *); -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_btree_path_verify_locks(struct btree_path *); -void bch2_trans_verify_locks(struct btree_trans *); -#else -static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} -static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} -#endif +void __bch2_btree_path_verify_locks(struct btree_trans *, struct btree_path *); +void __bch2_trans_verify_locks(struct btree_trans *); + +static inline void bch2_btree_path_verify_locks(struct btree_trans *trans, + struct btree_path *path) +{ + if (static_branch_unlikely(&bch2_debug_check_btree_locking)) + __bch2_btree_path_verify_locks(trans, path); +} + +static inline void bch2_trans_verify_locks(struct btree_trans *trans) +{ + if (static_branch_unlikely(&bch2_debug_check_btree_locking)) + __bch2_trans_verify_locks(trans); +} #endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 86acf037590c..4b7b5ca74ba1 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -65,49 +65,6 @@ static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_n memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs); } -static inline u64 bkey_journal_seq(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_inode_v3: - return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq); - default: - return 0; - } -} - -static bool found_btree_node_is_readable(struct btree_trans *trans, - struct found_btree_node *f) -{ - struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; - - found_btree_node_to_key(&tmp.k, f); - - struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false); - bool ret = !IS_ERR_OR_NULL(b); - if (!ret) - return ret; - - f->sectors_written = b->written; - f->journal_seq = le64_to_cpu(b->data->keys.journal_seq); - - struct bkey_s_c k; - struct bkey unpacked; - struct btree_node_iter iter; - for_each_btree_node_key_unpack(b, k, &iter, &unpacked) - f->journal_seq = max(f->journal_seq, bkey_journal_seq(k)); - - six_unlock_read(&b->c.lock); - - /* - * We might update this node's range; if that happens, we need the node - * to be re-read so the read path can trim keys that are no longer in - * this node - */ - if (b != btree_node_root(trans->c, b)) - bch2_btree_node_evict(trans, &tmp.k); - return ret; -} - static int found_btree_node_cmp_cookie(const void *_l, const void *_r) { const struct found_btree_node *l = _l; @@ -159,17 +116,17 @@ static const struct 
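/*
 * Illustrative sketch, not part of the patch: the static-key pattern behind
 * the verify-locks wrappers above. With the key disabled the check is a
 * patched-out branch, so the debug helpers stay compiled in unconditionally.
 * The key name here is hypothetical; the patch uses
 * bch2_debug_check_btree_locking.
 */
#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(example_debug_check);

static inline void example_verify(struct btree_trans *trans)
{
	if (static_branch_unlikely(&example_debug_check))
		__bch2_trans_verify_locks(trans);
}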
min_heap_callbacks found_btree_node_heap_cbs = { }; static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, - struct bio *bio, struct btree_node *bn, u64 offset) + struct btree *b, struct bio *bio, u64 offset) { struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); + struct btree_node *bn = b->data; bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); bio->bi_iter.bi_sector = offset; - bch2_bio_map(bio, bn, PAGE_SIZE); + bch2_bio_map(bio, b->data, c->opts.block_size); u64 submit_time = local_clock(); submit_bio_wait(bio); - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); if (bio->bi_status) { @@ -217,18 +174,37 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, }; rcu_read_unlock(); - if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) { - mutex_lock(&f->lock); + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); + bio->bi_iter.bi_sector = offset; + bch2_bio_map(bio, b->data, c->opts.btree_node_size); + + submit_time = local_clock(); + submit_bio_wait(bio); + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); + + found_btree_node_to_key(&b->key, &n); + + CLASS(printbuf, buf)(); + if (!bch2_btree_node_read_done(c, ca, b, NULL, &buf)) { + /* read_done will swap out b->data for another buffer */ + bn = b->data; + /* + * Grab journal_seq here because we want the max journal_seq of + * any bset; read_done sorts down to a single set and picks the + * max journal_seq + */ + n.journal_seq = le64_to_cpu(bn->keys.journal_seq), + n.sectors_written = b->written; + + guard(mutex)(&f->lock); if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) { bch_err(c, "try_read_btree_node() can't handle endian conversion"); f->ret = -EINVAL; - goto unlock; + return; } if (darray_push(&f->nodes, n)) f->ret = -ENOMEM; -unlock: - mutex_unlock(&f->lock); } } @@ -237,12 +213,20 @@ static int read_btree_nodes_worker(void *p) struct find_btree_nodes_worker *w = p; struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes); struct bch_dev *ca = w->ca; - void *buf = (void *) __get_free_page(GFP_KERNEL); - struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL); unsigned long last_print = jiffies; + struct btree *b = NULL; + struct bio *bio = NULL; - if (!buf || !bio) { - bch_err(c, "read_btree_nodes_worker: error allocating bio/buf"); + b = __bch2_btree_node_mem_alloc(c); + if (!b) { + bch_err(c, "read_btree_nodes_worker: error allocating buf"); + w->f->ret = -ENOMEM; + goto err; + } + + bio = bio_alloc(NULL, buf_pages(b->data, c->opts.btree_node_size), 0, GFP_KERNEL); + if (!bio) { + bch_err(c, "read_btree_nodes_worker: error allocating bio"); w->f->ret = -ENOMEM; goto err; } @@ -266,12 +250,14 @@ static int read_btree_nodes_worker(void *p) !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c))) continue; - try_read_btree_node(w->f, ca, bio, buf, sector); + try_read_btree_node(w->f, ca, b, bio, sector); } err: + if (b) + __btree_node_data_free(b); + kfree(b); bio_put(bio); - free_page((unsigned long) buf); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); closure_put(w->cl); kfree(w); return 0; @@ -284,14 +270,17 @@ static int read_btree_nodes(struct find_btree_nodes *f) int ret = 0; closure_init_stack(&cl); + CLASS(printbuf, buf)(); + + prt_printf(&buf, "scanning for btree nodes on"); - for_each_online_member(c, ca) { + for_each_online_member(c, ca, BCH_DEV_READ_REF_btree_node_scan) { if 
(!(ca->mi.data_allowed & BIT(BCH_DATA_btree))) continue; struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); if (!w) { - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); ret = -ENOMEM; goto err; } @@ -303,16 +292,20 @@ static int read_btree_nodes(struct find_btree_nodes *f) struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); ret = PTR_ERR_OR_ZERO(t); if (ret) { - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); kfree(w); bch_err_msg(c, ret, "starting kthread"); break; } + prt_printf(&buf, " %s", ca->name); + closure_get(&cl); - percpu_ref_get(&ca->io_ref[READ]); + enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); wake_up_process(t); } + + bch_notice(c, "%s", buf.buf); err: while (closure_sync_timeout(&cl, sysctl_hung_task_timeout_secs * HZ / 2)) ; @@ -363,6 +356,8 @@ static int handle_overwrites(struct bch_fs *c, min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); } } + + cond_resched(); } return 0; @@ -371,7 +366,7 @@ static int handle_overwrites(struct bch_fs *c, int bch2_scan_for_btree_nodes(struct bch_fs *c) { struct find_btree_nodes *f = &c->found_btree_nodes; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); found_btree_nodes nodes_heap = {}; size_t dst; int ret = 0; @@ -395,7 +390,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) printbuf_reset(&buf); prt_printf(&buf, "%s: nodes found:\n", __func__); found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_string_as_lines(KERN_INFO, buf.buf); + bch2_print_str(c, KERN_INFO, buf.buf); } sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); @@ -424,7 +419,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) printbuf_reset(&buf); prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__); found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_string_as_lines(KERN_INFO, buf.buf); + bch2_print_str(c, KERN_INFO, buf.buf); } swap(nodes_heap, f->nodes); @@ -470,7 +465,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) printbuf_reset(&buf); prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_string_as_lines(KERN_INFO, buf.buf); + bch2_print_str(c, KERN_INFO, buf.buf); } else { bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr); } @@ -478,7 +473,6 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); err: darray_exit(&nodes_heap); - printbuf_exit(&buf); return ret; } @@ -519,8 +513,12 @@ bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b) return false; } -bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) +int bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) { + int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + if (ret) + return ret; + struct found_btree_node search = { .btree_id = btree, .level = 0, @@ -541,12 +539,12 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, struct find_btree_nodes *f = &c->found_btree_nodes; - int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); if (ret) return ret; if 
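/*
 * Illustrative sketch, not part of the patch: the enumerated_ref pattern
 * replacing percpu_ref on ca->io_ref. Every get/put names its user, so a
 * ref that is never dropped can be reported per call site. Assumes a
 * tryget helper alongside the get/put shown above.
 */
static void example_dev_read(struct bch_dev *ca)
{
	if (!enumerated_ref_tryget(&ca->io_ref[READ],
				   BCH_DEV_READ_REF_btree_node_scan))
		return;

	/* ... issue reads ... */

	enumerated_ref_put(&ca->io_ref[READ],
			   BCH_DEV_READ_REF_btree_node_scan);
}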
(c->opts.verbose) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); prt_str(&buf, "recovery "); bch2_btree_id_level_to_text(&buf, btree, level); @@ -556,7 +554,6 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, bch2_bpos_to_text(&buf, node_max); bch_info(c, "%s(): %s", __func__, buf.buf); - printbuf_exit(&buf); } struct found_btree_node search = { @@ -580,10 +577,9 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, found_btree_node_to_key(&tmp.k, &n); if (c->opts.verbose) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k)); bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); - printbuf_exit(&buf); } BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h index 08687b209787..66e6f9ed19d0 100644 --- a/fs/bcachefs/btree_node_scan.h +++ b/fs/bcachefs/btree_node_scan.h @@ -4,7 +4,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *); bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *); -bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id); +int bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id); int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos); void bch2_find_btree_nodes_exit(struct find_btree_nodes *); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 7d7e52ddde02..8b94a8156fbf 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -11,6 +11,7 @@ #include "btree_write_buffer.h" #include "buckets.h" #include "disk_accounting.h" +#include "enumerated_ref.h" #include "errcode.h" #include "error.h" #include "journal.h" @@ -20,6 +21,7 @@ #include "snapshot.h" #include +#include static const char * const trans_commit_flags_strs[] = { #define x(n, ...) 
#n, @@ -44,6 +46,9 @@ void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) { #ifdef CONFIG_BCACHEFS_DEBUG + if (i->key_cache_flushing) + return; + struct bch_fs *c = trans->c; struct bkey u; struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u); @@ -230,10 +235,10 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, struct bch_fs *c = container_of(j, struct bch_fs, journal); struct btree_write *w = container_of(pin, struct btree_write, journal); struct btree *b = container_of(w, struct btree, writes[i]); - struct btree_trans *trans = bch2_trans_get(c); unsigned long old, new; unsigned idx = w - b->writes; + CLASS(btree_trans, trans)(c); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); old = READ_ONCE(b->flags); @@ -252,8 +257,6 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, btree_node_write_if_need(trans, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); - - bch2_trans_put(trans); return 0; } @@ -335,6 +338,9 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, BUG_ON(!bpos_eq(i->k->k.p, path->pos)); BUG_ON(i->cached != path->cached); + BUG_ON(i->cached && + !i->key_cache_already_flushed && + bkey_deleted(&i->k->k)); BUG_ON(i->level != path->level); BUG_ON(i->btree_id != path->btree_id); BUG_ON(i->bkey_type != __btree_node_type(path->level, path->btree_id)); @@ -366,14 +372,15 @@ static noinline void journal_transaction_name(struct btree_trans *trans) struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); - strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); + memcpy_and_pad(l->d, JSET_ENTRY_LOG_U64s * sizeof(u64), + trans->fn, strlen(trans->fn), 0); } static inline int btree_key_can_insert(struct btree_trans *trans, struct btree *b, unsigned u64s) { if (!bch2_btree_node_insert_fits(b, u64s)) - return -BCH_ERR_btree_insert_btree_node_full; + return bch_err_throw(trans->c, btree_insert_btree_node_full); return 0; } @@ -391,9 +398,10 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); if (!new_k) { - bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", + struct bch_fs *c = trans->c; + bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", bch2_btree_id_str(path->btree_id), new_u64s); - return -BCH_ERR_ENOMEM_btree_key_cache_insert; + return bch_err_throw(c, ENOMEM_btree_key_cache_insert); } ret = bch2_trans_relock(trans) ?: @@ -429,7 +437,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags if (watermark < BCH_WATERMARK_reclaim && !test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bch2_btree_key_cache_must_wait(c)) - return -BCH_ERR_btree_insert_need_journal_reclaim; + return bch_err_throw(c, btree_insert_need_journal_reclaim); /* * bch2_varint_decode can read past the end of the buffer by at most 7 @@ -581,7 +589,8 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) } static inline int -bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, +bch2_trans_commit_write_locked(struct btree_trans *trans, + enum bch_trans_commit_flags flags, struct btree_insert_entry **stopped_at, unsigned long trace_ip) { @@ -591,12 +600,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, int ret = 0;
bch2_trans_verify_not_unlocked_or_in_restart(trans); - +#if 0 + /* todo: bring back dynamic fault injection */ if (race_fault()) { trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); } - +#endif /* * Check if the insert will fit in the leaf node with the write lock * held, otherwise another thread could write the node changing the @@ -644,10 +654,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && !(flags & BCH_TRANS_COMMIT_no_journal_res)) { - if (bch2_journal_seq_verify) + if (static_branch_unlikely(&bch2_journal_seq_verify)) trans_for_each_update(trans, i) i->k->k.bversion.lo = trans->journal_res.seq; - else if (bch2_inject_invalid_keys) + else if (static_branch_unlikely(&bch2_inject_invalid_keys)) trans_for_each_update(trans, i) i->k->k.bversion = MAX_VERSION; } @@ -660,19 +670,22 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, h = h->next; } - struct jset_entry *entry = trans->journal_entries; - - percpu_down_read(&c->mark_lock); - for (entry = trans->journal_entries; - entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); - entry = vstruct_next(entry)) - if (entry->type == BCH_JSET_ENTRY_write_buffer_keys && - entry->start->k.type == KEY_TYPE_accounting) { - ret = bch2_accounting_trans_commit_hook(trans, bkey_i_to_accounting(entry->start), flags); - if (ret) - goto revert_fs_usage; + struct bkey_i *accounting; + + scoped_guard(percpu_read, &c->mark_lock) + for (accounting = btree_trans_subbuf_base(trans, &trans->accounting); + accounting != btree_trans_subbuf_top(trans, &trans->accounting); + accounting = bkey_next(accounting)) { + ret = bch2_accounting_trans_commit_hook(trans, + bkey_i_to_accounting(accounting), flags); + if (unlikely(ret)) { + for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); + i != accounting; + i = bkey_next(i)) + bch2_accounting_trans_commit_revert(trans, bkey_i_to_accounting(i), flags); + return ret; + } } - percpu_up_read(&c->mark_lock); /* XXX: we only want to run this if deltas are nonzero */ bch2_trans_account_disk_usage_change(trans); @@ -695,8 +708,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit; - for (struct jset_entry *i = trans->journal_entries; - i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + for (struct jset_entry *i = btree_trans_journal_entries_start(trans); + i != btree_trans_journal_entries_top(trans); i = vstruct_next(i)) { ret = bch2_journal_entry_validate(c, NULL, i, bcachefs_metadata_version_current, @@ -751,11 +764,21 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, } memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), - trans->journal_entries, - trans->journal_entries_u64s); + btree_trans_journal_entries_start(trans), + trans->journal_entries.u64s); + + EBUG_ON(trans->journal_res.u64s < trans->journal_entries.u64s); - trans->journal_res.offset += trans->journal_entries_u64s; - trans->journal_res.u64s -= trans->journal_entries_u64s; + trans->journal_res.offset += trans->journal_entries.u64s; + trans->journal_res.u64s -= trans->journal_entries.u64s; + + if (trans->accounting.u64s) + memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_write_buffer_keys, + 
BTREE_ID_accounting, 0, + trans->accounting.u64s)->_data, + btree_trans_subbuf_base(trans, &trans->accounting), + trans->accounting.u64s); if (trans->journal_seq) *trans->journal_seq = trans->journal_res.seq; @@ -775,16 +798,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, return 0; fatal_err: bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret)); - percpu_down_read(&c->mark_lock); -revert_fs_usage: - for (struct jset_entry *entry2 = trans->journal_entries; - entry2 != entry; - entry2 = vstruct_next(entry2)) - if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys && - entry2->start->k.type == KEY_TYPE_accounting) - bch2_accounting_trans_commit_revert(trans, - bkey_i_to_accounting(entry2->start), flags); - percpu_up_read(&c->mark_lock); return ret; } @@ -810,7 +823,8 @@ static int bch2_trans_commit_journal_pin_flush(struct journal *j, /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ -static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags, +static inline int do_bch2_trans_commit(struct btree_trans *trans, + enum bch_trans_commit_flags flags, struct btree_insert_entry **stopped_at, unsigned long trace_ip) { @@ -888,7 +902,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, */ if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && watermark < BCH_WATERMARK_reclaim) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; + ret = bch_err_throw(c, journal_reclaim_would_deadlock); goto out; } @@ -946,35 +960,90 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, * do. */ static noinline int -do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) +do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans, + enum bch_trans_commit_flags flags) { struct bch_fs *c = trans->c; + int ret = 0; BUG_ON(current != c->recovery_task); + struct bkey_i *accounting; +retry: + percpu_down_read(&c->mark_lock); + for (accounting = btree_trans_subbuf_base(trans, &trans->accounting); + accounting != btree_trans_subbuf_top(trans, &trans->accounting); + accounting = bkey_next(accounting)) { + ret = likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply)) + ? 
bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(accounting), + BCH_ACCOUNTING_normal, false) + : 0; + if (ret) + goto revert_fs_usage; + } + percpu_up_read(&c->mark_lock); + trans_for_each_update(trans, i) { - int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); + ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); if (ret) - return ret; + goto fatal_err; } - for (struct jset_entry *i = trans->journal_entries; - i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); - i = vstruct_next(i)) + for (struct jset_entry *i = btree_trans_journal_entries_start(trans); + i != btree_trans_journal_entries_top(trans); + i = vstruct_next(i)) { if (i->type == BCH_JSET_ENTRY_btree_keys || i->type == BCH_JSET_ENTRY_write_buffer_keys) { - int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->start); - if (ret) - return ret; + jset_entry_for_each_key(i, k) { + ret = bch2_journal_key_insert(c, i->btree_id, i->level, k); + if (ret) + goto fatal_err; + } } + if (i->type == BCH_JSET_ENTRY_btree_root) { + guard(mutex)(&c->btree_root_lock); + + struct btree_root *r = bch2_btree_id_root(c, i->btree_id); + + bkey_copy(&r->key, i->start); + r->level = i->level; + r->alive = true; + } + } + + for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); + i != btree_trans_subbuf_top(trans, &trans->accounting); + i = bkey_next(i)) { + ret = bch2_journal_key_insert(c, BTREE_ID_accounting, 0, i); + if (ret) + goto fatal_err; + } + return 0; +fatal_err: + bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret)); + percpu_down_read(&c->mark_lock); +revert_fs_usage: + for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); + i != accounting; + i = bkey_next(i)) + bch2_accounting_trans_commit_revert(trans, bkey_i_to_accounting(i), flags); + percpu_up_read(&c->mark_lock); + + if (bch2_err_matches(ret, BCH_ERR_btree_insert_need_mark_replicas)) { + ret = drop_locks_do(trans, bch2_accounting_update_sb(trans)); + if (!ret) + goto retry; + } + return ret; } -int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) +int __bch2_trans_commit(struct btree_trans *trans, enum bch_trans_commit_flags flags) { struct btree_insert_entry *errored_at = NULL; struct bch_fs *c = trans->c; + unsigned journal_u64s = 0; int ret = 0; bch2_trans_verify_not_unlocked_or_in_restart(trans); @@ -983,8 +1052,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (unlikely(ret)) goto out_reset; - if (!trans->nr_updates && - !trans->journal_entries_u64s) + if (!bch2_trans_has_updates(trans)) goto out_reset; ret = bch2_trans_commit_run_triggers(trans); @@ -992,20 +1060,24 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) goto out_reset; if (!(flags & BCH_TRANS_COMMIT_no_check_rw) && - unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { + unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_trans))) { if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) - ret = do_bch2_trans_commit_to_journal_replay(trans); + ret = do_bch2_trans_commit_to_journal_replay(trans, flags); else - ret = -BCH_ERR_erofs_trans_commit; + ret = bch_err_throw(c, erofs_trans_commit); goto out_reset; } EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - trans->journal_u64s = trans->journal_entries_u64s; + journal_u64s = 0; + trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); if (trans->journal_transaction_names) - trans->journal_u64s += 
jset_u64s(JSET_ENTRY_LOG_U64s); + journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); + + if (trans->accounting.u64s) + journal_u64s += jset_u64s(trans->accounting.u64s); trans_for_each_update(trans, i) { struct btree_path *path = trans->paths + i->path; @@ -1025,11 +1097,11 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) continue; /* we're going to journal the key being updated: */ - trans->journal_u64s += jset_u64s(i->k->k.u64s); + journal_u64s += jset_u64s(i->k->k.u64s); /* and we're also going to log the overwrite: */ if (trans->journal_transaction_names) - trans->journal_u64s += jset_u64s(i->old_k.u64s); + journal_u64s += jset_u64s(i->old_k.u64s); } if (trans->extra_disk_res) { @@ -1047,6 +1119,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) memset(&trans->journal_res, 0, sizeof(trans->journal_res)); memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); + trans->journal_u64s = journal_u64s + trans->journal_entries.u64s; + ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_); /* make sure we didn't drop or screw up locks: */ @@ -1058,7 +1132,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) trace_and_count(c, transaction_commit, trans, _RET_IP_); out: if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw))) - bch2_write_ref_put(c, BCH_WRITE_REF_trans); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_trans); out_reset: if (!ret) bch2_trans_downgrade(trans); @@ -1078,7 +1152,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) * restart: */ if (flags & BCH_TRANS_COMMIT_no_journal_res) { - ret = -BCH_ERR_transaction_restart_nested; + ret = bch_err_throw(c, transaction_restart_nested); goto out; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 023c472dc9ee..e893eb938bb3 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -139,6 +139,7 @@ struct btree { }; #define BCH_BTREE_CACHE_NOT_FREED_REASONS() \ + x(cache_reserve) \ x(lock_intent) \ x(lock_write) \ x(dirty) \ @@ -228,6 +229,7 @@ struct btree_node_iter { x(snapshot_field) \ x(all_snapshots) \ x(filter_snapshots) \ + x(nofilter_whiteouts) \ x(nopreserve) \ x(cached_nofill) \ x(key_cache_fill) \ @@ -257,9 +259,6 @@ struct btree_node_iter { * * BTREE_TRIGGER_insert - @new is entering the btree * BTREE_TRIGGER_overwrite - @old is leaving the btree - * - * BTREE_TRIGGER_bucket_invalidate - signal from bucket invalidate path to alloc - * trigger */ #define BTREE_TRIGGER_FLAGS() \ x(norun) \ @@ -269,8 +268,7 @@ struct btree_node_iter { x(gc) \ x(insert) \ x(overwrite) \ - x(is_root) \ - x(bucket_invalidate) + x(is_root) enum { #define x(n) BTREE_ITER_FLAG_BIT_##n, @@ -367,6 +365,7 @@ static inline unsigned long btree_path_ip_allocated(struct btree_path *path) * @nodes_intent_locked - bitmask indicating which locks are intent locks */ struct btree_iter { + struct btree_trans *trans; btree_path_idx_t path; btree_path_idx_t update_path; btree_path_idx_t key_cache_path; @@ -425,14 +424,16 @@ struct btree_insert_entry { u8 sort_order; u8 bkey_type; enum btree_id btree_id:8; - u8 level:4; + u8 level:3; bool cached:1; bool insert_trigger_run:1; bool overwrite_trigger_run:1; bool key_cache_already_flushed:1; + bool key_cache_flushing:1; /* - * @old_k may be a key from the journal; @old_btree_u64s always refers - * to the size of the key being overwritten in the btree: + * @old_k may be a key from the journal or the key cache; + * @old_btree_u64s always refers to the size of the key being + * overwritten 
in the btree: */ u8 old_btree_u64s; btree_path_idx_t path; @@ -477,6 +478,18 @@ struct btree_trans_paths { struct btree_path paths[]; }; +struct trans_kmalloc_trace { + unsigned long ip; + size_t bytes; +}; +typedef DARRAY(struct trans_kmalloc_trace) darray_trans_kmalloc_trace; + +struct btree_trans_subbuf { + u16 base; + u16 u64s; + u16 size; +}; + struct btree_trans { struct bch_fs *c; @@ -488,6 +501,10 @@ struct btree_trans { void *mem; unsigned mem_top; unsigned mem_bytes; + unsigned realloc_bytes_required; +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_trans_kmalloc_trace trans_kmalloc_trace; +#endif btree_path_idx_t nr_sorted; btree_path_idx_t nr_paths; @@ -528,9 +545,8 @@ struct btree_trans { int srcu_idx; /* update path: */ - u16 journal_entries_u64s; - u16 journal_entries_size; - struct jset_entry *journal_entries; + struct btree_trans_subbuf journal_entries; + struct btree_trans_subbuf accounting; struct btree_trans_commit_hook *hooks; struct journal_entry_pin *journal_pin; @@ -544,6 +560,8 @@ struct btree_trans { unsigned journal_u64s; unsigned extra_disk_res; /* XXX kill */ + __BKEY_PADDED(btree_path_down, BKEY_BTREE_PTR_VAL_U64s_MAX); + #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif @@ -604,6 +622,9 @@ enum btree_write_type { x(dying) \ x(fake) \ x(need_rewrite) \ + x(need_rewrite_error) \ + x(need_rewrite_degraded) \ + x(need_rewrite_ptr_written_zero) \ x(never_write) \ x(pinned) @@ -628,6 +649,32 @@ static inline void clear_btree_node_ ## flag(struct btree *b) \ BTREE_FLAGS() #undef x +#define BTREE_NODE_REWRITE_REASON() \ + x(none) \ + x(unknown) \ + x(error) \ + x(degraded) \ + x(ptr_written_zero) + +enum btree_node_rewrite_reason { +#define x(n) BTREE_NODE_REWRITE_##n, + BTREE_NODE_REWRITE_REASON() +#undef x +}; + +static inline enum btree_node_rewrite_reason btree_node_rewrite_reason(struct btree *b) +{ + if (btree_node_need_rewrite_ptr_written_zero(b)) + return BTREE_NODE_REWRITE_ptr_written_zero; + if (btree_node_need_rewrite_degraded(b)) + return BTREE_NODE_REWRITE_degraded; + if (btree_node_need_rewrite_error(b)) + return BTREE_NODE_REWRITE_error; + if (btree_node_need_rewrite(b)) + return BTREE_NODE_REWRITE_unknown; + return BTREE_NODE_REWRITE_none; +} + static inline struct btree_write *btree_current_write(struct btree *b) { return b->writes + btree_node_write_idx(b); @@ -647,13 +694,13 @@ static inline struct bset_tree *bset_tree_last(struct btree *b) static inline void * __btree_node_offset_to_ptr(const struct btree *b, u16 offset) { - return (void *) ((u64 *) b->data + 1 + offset); + return (void *) ((u64 *) b->data + offset); } static inline u16 __btree_node_ptr_to_offset(const struct btree *b, const void *p) { - u16 ret = (u64 *) p - 1 - (u64 *) b->data; + u16 ret = (u64 *) p - (u64 *) b->data; EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); return ret; @@ -793,15 +840,15 @@ static inline bool btree_node_type_has_triggers(enum btree_node_type type) return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS; } -static inline bool btree_id_is_extents(enum btree_id btree) -{ - const u64 mask = 0 +static const u64 btree_is_extents_mask = 0 #define x(name, nr, flags, ...) 
|((!!((flags) & BTREE_IS_extents)) << nr) - BCH_BTREE_IDS() +BCH_BTREE_IDS() #undef x - ; +; - return BIT_ULL(btree) & mask; +static inline bool btree_id_is_extents(enum btree_id btree) +{ + return BIT_ULL(btree) & btree_is_extents_mask; } static inline bool btree_node_type_is_extents(enum btree_node_type type) @@ -809,15 +856,20 @@ static inline bool btree_node_type_is_extents(enum btree_node_type type) return type != BKEY_TYPE_btree && btree_id_is_extents(type - 1); } -static inline bool btree_type_has_snapshots(enum btree_id btree) -{ - const u64 mask = 0 +static const u64 btree_has_snapshots_mask = 0 #define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_snapshots)) << nr) - BCH_BTREE_IDS() +BCH_BTREE_IDS() #undef x - ; +; - return BIT_ULL(btree) & mask; +static inline bool btree_type_has_snapshots(enum btree_id btree) +{ + return BIT_ULL(btree) & btree_has_snapshots_mask; +} + +static inline bool btree_id_is_extents_snapshots(enum btree_id btree) +{ + return BIT_ULL(btree) & btree_has_snapshots_mask & btree_is_extents_mask; } static inline bool btree_type_has_snapshot_field(enum btree_id btree) diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 1e6b7836cc01..6f3b57573cba 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -14,6 +14,8 @@ #include "snapshot.h" #include "trace.h" +#include + static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, const struct btree_insert_entry *r) { @@ -93,7 +95,6 @@ static noinline int extent_back_merge(struct btree_trans *trans, static int need_whiteout_for_snapshot(struct btree_trans *trans, enum btree_id btree_id, struct bpos pos) { - struct btree_iter iter; struct bkey_s_c k; u32 snapshot = pos.snapshot; int ret; @@ -115,71 +116,45 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, break; } } - bch2_trans_iter_exit(trans, &iter); return ret; } int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, - enum btree_id id, - struct bpos old_pos, - struct bpos new_pos) + enum btree_id btree, struct bpos pos, + snapshot_id_list *s) { - struct bch_fs *c = trans->c; - struct btree_iter old_iter, new_iter = {}; - struct bkey_s_c old_k, new_k; - snapshot_id_list s; - struct bkey_i *update; int ret = 0; - if (!bch2_snapshot_has_children(c, old_pos.snapshot)) - return 0; + darray_for_each(*s, id) { + pos.snapshot = *id; - darray_init(&s); - - bch2_trans_iter_init(trans, &old_iter, id, old_pos, - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots); - while ((old_k = bch2_btree_iter_prev(trans, &old_iter)).k && - !(ret = bkey_err(old_k)) && - bkey_eq(old_pos, old_k.k->p)) { - struct bpos whiteout_pos = - SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot); - - if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || - snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) - continue; - - new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, - BTREE_ITER_not_extents| - BTREE_ITER_intent); - ret = bkey_err(new_k); + CLASS(btree_iter, iter)(trans, btree, pos, BTREE_ITER_not_extents|BTREE_ITER_intent); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); if (ret) break; - if (new_k.k->type == KEY_TYPE_deleted) { - update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + if (k.k->type == KEY_TYPE_deleted) { + struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); ret = PTR_ERR_OR_ZERO(update); - if (ret) + if (ret) { break; + } bkey_init(&update->k); - update->k.p = whiteout_pos; + update->k.p 
= pos; update->k.type = KEY_TYPE_whiteout; - ret = bch2_trans_update(trans, &new_iter, update, + ret = bch2_trans_update(trans, &iter, update, BTREE_UPDATE_internal_snapshot_node); } - bch2_trans_iter_exit(trans, &new_iter); - ret = snapshot_list_add(c, &s, old_k.k->p.snapshot); if (ret) break; } - bch2_trans_iter_exit(trans, &new_iter); - bch2_trans_iter_exit(trans, &old_iter); - darray_exit(&s); + darray_exit(s); return ret; } @@ -240,7 +215,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, return ret; } - if (bkey_le(old.k->p, new.k->p)) { + if (!back_split) { update = bch2_trans_kmalloc(trans, sizeof(*update)); if ((ret = PTR_ERR_OR_ZERO(update))) return ret; @@ -263,9 +238,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, BTREE_UPDATE_internal_snapshot_node|flags); if (ret) return ret; - } - - if (back_split) { + } else { update = bch2_bkey_make_mut_noupdate(trans, old); if ((ret = PTR_ERR_OR_ZERO(update))) return ret; @@ -287,18 +260,16 @@ static int bch2_trans_update_extent(struct btree_trans *trans, struct bkey_i *insert, enum btree_iter_update_trigger_flags flags) { - struct btree_iter iter; - struct bkey_s_c k; enum btree_id btree_id = orig_iter->btree_id; - int ret = 0; - bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k), - BTREE_ITER_intent| - BTREE_ITER_with_updates| - BTREE_ITER_not_extents); - k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX)); - if ((ret = bkey_err(k))) - goto err; + CLASS(btree_iter, iter)(trans, btree_id, bkey_start_pos(&insert->k), + BTREE_ITER_intent| + BTREE_ITER_with_updates| + BTREE_ITER_not_extents); + struct bkey_s_c k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); + int ret = bkey_err(k); + if (ret) + return ret; if (!k.k) goto out; @@ -306,7 +277,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { ret = extent_front_merge(trans, &iter, k, &insert, flags); if (ret) - goto err; + return ret; } goto next; @@ -317,15 +288,15 @@ static int bch2_trans_update_extent(struct btree_trans *trans, ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert)); if (ret) - goto err; + return ret; if (done) goto out; next: - bch2_btree_iter_advance(trans, &iter); - k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX)); + bch2_btree_iter_advance(&iter); + k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) - goto err; + return ret; if (!k.k) goto out; } @@ -333,58 +304,19 @@ static int bch2_trans_update_extent(struct btree_trans *trans, if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { ret = extent_back_merge(trans, &iter, insert, k); if (ret) - goto err; + return ret; } out: - if (!bkey_deleted(&insert->k)) - ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags); -err: - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static noinline int flush_new_cached_update(struct btree_trans *trans, - struct btree_insert_entry *i, - enum btree_iter_update_trigger_flags flags, - unsigned long ip) -{ - struct bkey k; - int ret; - - btree_path_idx_t path_idx = - bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0, - BTREE_ITER_intent, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, path_idx, 0); - if (ret) - goto out; - - struct btree_path *btree_path = trans->paths + path_idx; - - /* - * The old key in the insert entry might actually refer to an existing - * key in the btree that has been 
deleted from cache and not yet - * flushed. Check for this and skip the flush so we don't run triggers - * against a stale key. - */ - bch2_btree_path_peek_slot_exact(btree_path, &k); - if (!bkey_deleted(&k)) - goto out; - - i->key_cache_already_flushed = true; - i->flags |= BTREE_TRIGGER_norun; - - btree_path_set_should_be_locked(trans, btree_path); - ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip); -out: - bch2_path_put(trans, path_idx, true); - return ret; + return !bkey_deleted(&insert->k) + ? bch2_btree_insert_nonextent(trans, btree_id, insert, flags) + : 0; } -static int __must_check -bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, - struct bkey_i *k, enum btree_iter_update_trigger_flags flags, - unsigned long ip) +static inline struct btree_insert_entry * +__btree_trans_update_by_path(struct btree_trans *trans, + btree_path_idx_t path_idx, + struct bkey_i *k, enum btree_iter_update_trigger_flags flags, + unsigned long ip) { struct bch_fs *c = trans->c; struct btree_insert_entry *i, n; @@ -455,6 +387,58 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, __btree_path_get(trans, trans->paths + i->path, true); trace_update_by_path(trans, path, i, overwrite); + return i; +} + +static noinline int flush_new_cached_update(struct btree_trans *trans, + struct btree_insert_entry *i, + enum btree_iter_update_trigger_flags flags, + unsigned long ip) +{ + btree_path_idx_t path_idx = + bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0, + BTREE_ITER_intent, _THIS_IP_); + int ret = bch2_btree_path_traverse(trans, path_idx, 0); + if (ret) + goto out; + + struct btree_path *btree_path = trans->paths + path_idx; + + btree_path_set_should_be_locked(trans, btree_path); +#if 0 + /* + * The old key in the insert entry might actually refer to an existing + * key in the btree that has been deleted from cache and not yet + * flushed. Check for this and skip the flush so we don't run triggers + * against a stale key. + */ + struct bkey k; + bch2_btree_path_peek_slot_exact(btree_path, &k); + if (!bkey_deleted(&k)) + goto out; +#endif + i->key_cache_already_flushed = true; + i->flags |= BTREE_TRIGGER_norun; + + struct bkey old_k = i->old_k; + const struct bch_val *old_v = i->old_v; + + i = __btree_trans_update_by_path(trans, path_idx, i->k, flags, _THIS_IP_); + + i->old_k = old_k; + i->old_v = old_v; + i->key_cache_flushing = true; +out: + bch2_path_put(trans, path_idx, true); + return ret; +} + +static int __must_check +bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, + struct bkey_i *k, enum btree_iter_update_trigger_flags flags, + unsigned long ip) +{ + struct btree_insert_entry *i = __btree_trans_update_by_path(trans, path_idx, k, flags, ip); /* * If a key is present in the key cache, it must also exist in the @@ -463,10 +447,9 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, * the key cache - but the key has to exist in the btree for that to * work: */ - if (path->cached && !i->old_btree_u64s) - return flush_new_cached_update(trans, i, flags, ip); - - return 0; + return i->cached && (!i->old_btree_u64s || bkey_deleted(&k->k)) + ? 
flush_new_cached_update(trans, i, flags, ip) + : 0; } static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, @@ -509,8 +492,9 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, return 0; } -int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_iter_update_trigger_flags flags) +int __must_check bch2_trans_update_ip(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_iter_update_trigger_flags flags, + unsigned long ip) { kmsan_check_memory(k, bkey_bytes(&k->k)); @@ -546,7 +530,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter path_idx = iter->key_cache_path; } - return bch2_trans_update_by_path(trans, path_idx, k, flags, _RET_IP_); + return bch2_trans_update_by_path(trans, path_idx, k, flags, ip); } int bch2_btree_insert_clone_trans(struct btree_trans *trans, @@ -562,43 +546,48 @@ int bch2_btree_insert_clone_trans(struct btree_trans *trans, return bch2_btree_insert_trans(trans, btree, n, 0); } -struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) +void *__bch2_trans_subbuf_alloc(struct btree_trans *trans, + struct btree_trans_subbuf *buf, + unsigned u64s, ulong ip) { - unsigned new_top = trans->journal_entries_u64s + u64s; - unsigned old_size = trans->journal_entries_size; + unsigned new_top = buf->u64s + u64s; + unsigned new_size = buf->size; - if (new_top > trans->journal_entries_size) { - trans->journal_entries_size = roundup_pow_of_two(new_top); + BUG_ON(roundup_pow_of_two(new_top) > U16_MAX); - btree_trans_stats(trans)->journal_entries_size = trans->journal_entries_size; - } + if (new_top > new_size) + new_size = roundup_pow_of_two(new_top); - struct jset_entry *n = - bch2_trans_kmalloc_nomemzero(trans, - trans->journal_entries_size * sizeof(u64)); + void *n = bch2_trans_kmalloc_nomemzero_ip(trans, new_size * sizeof(u64), ip); if (IS_ERR(n)) - return ERR_CAST(n); + return n; + + unsigned offset = (u64 *) n - (u64 *) trans->mem; + BUG_ON(offset > U16_MAX); - if (trans->journal_entries) - memcpy(n, trans->journal_entries, old_size * sizeof(u64)); - trans->journal_entries = n; + if (buf->u64s) + memcpy(n, + btree_trans_subbuf_base(trans, buf), + buf->u64s * sizeof(u64)); + buf->base = (u64 *) n - (u64 *) trans->mem; + buf->size = new_size; - struct jset_entry *e = btree_trans_journal_entries_top(trans); - trans->journal_entries_u64s = new_top; - return e; + void *p = btree_trans_subbuf_top(trans, buf); + buf->u64s = new_top; + return p; } int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, enum btree_id btree, struct bpos end) { bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent); - struct bkey_s_c k = bch2_btree_iter_peek_prev(trans, iter); + struct bkey_s_c k = bch2_btree_iter_peek_prev(iter); int ret = bkey_err(k); if (ret) goto err; - bch2_btree_iter_advance(trans, iter); - k = bch2_btree_iter_peek_slot(trans, iter); + bch2_btree_iter_advance(iter); + k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) goto err; @@ -606,13 +595,13 @@ int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, BUG_ON(k.k->type != KEY_TYPE_deleted); if (bkey_gt(k.k->p, end)) { - ret = -BCH_ERR_ENOSPC_btree_slot; + ret = bch_err_throw(trans->c, ENOSPC_btree_slot); goto err; } return 0; err: - bch2_trans_iter_exit(trans, iter); + bch2_trans_iter_exit(iter); return ret; } @@ -627,29 +616,21 @@ int 
bch2_btree_insert_nonextent(struct btree_trans *trans, enum btree_id btree, struct bkey_i *k, enum btree_iter_update_trigger_flags flags) { - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, btree, k->k.p, - BTREE_ITER_cached| - BTREE_ITER_not_extents| - BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(trans, &iter) ?: + CLASS(btree_iter, iter)(trans, btree, k->k.p, + BTREE_ITER_cached| + BTREE_ITER_not_extents| + BTREE_ITER_intent); + return bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, k, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; } -int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, +int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id btree, struct bkey_i *k, enum btree_iter_update_trigger_flags flags) { - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), - BTREE_ITER_intent|flags); - int ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, k, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; + CLASS(btree_iter, iter)(trans, btree, bkey_start_pos(&k->k), + BTREE_ITER_intent|flags); + return bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, flags); } /** @@ -659,21 +640,23 @@ int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, * @k: key to insert * @disk_res: must be non-NULL whenever inserting or potentially * splitting data extents - * @flags: transaction commit flags + * @commit_flags: transaction commit flags * @iter_flags: btree iter update trigger flags * * Returns: 0 on success, error code on failure */ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, - struct disk_reservation *disk_res, int flags, + struct disk_reservation *disk_res, + enum bch_trans_commit_flags commit_flags, enum btree_iter_update_trigger_flags iter_flags) { - return bch2_trans_commit_do(c, disk_res, NULL, flags, - bch2_btree_insert_trans(trans, id, k, iter_flags)); + CLASS(btree_trans, trans)(c); + return commit_do(trans, disk_res, NULL, commit_flags, + bch2_btree_insert_trans(trans, id, k, iter_flags)); } -int bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned update_flags) +int bch2_btree_delete_at(struct btree_trans *trans, struct btree_iter *iter, + enum btree_iter_update_trigger_flags flags) { struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); int ret = PTR_ERR_OR_ZERO(k); @@ -682,38 +665,32 @@ int bch2_btree_delete_at(struct btree_trans *trans, bkey_init(&k->k); k->k.p = iter->pos; - return bch2_trans_update(trans, iter, k, update_flags); + return bch2_trans_update(trans, iter, k, flags); } int bch2_btree_delete(struct btree_trans *trans, enum btree_id btree, struct bpos pos, - unsigned update_flags) + enum btree_iter_update_trigger_flags flags) { - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, btree, pos, - BTREE_ITER_cached| - BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_delete_at(trans, &iter, update_flags); - bch2_trans_iter_exit(trans, &iter); - - return ret; + CLASS(btree_iter, iter)(trans, btree, pos, + BTREE_ITER_cached| + BTREE_ITER_intent); + return bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(trans, &iter, flags); } -int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, +int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id btree, struct bpos start, struct bpos end, - unsigned 
update_flags, + enum btree_iter_update_trigger_flags flags, u64 *journal_seq) { u32 restart_count = trans->restart_count; - struct btree_iter iter; struct bkey_s_c k; int ret = 0; - bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent); - while ((k = bch2_btree_iter_peek_max(trans, &iter, end)).k) { + CLASS(btree_iter, iter)(trans, btree, start, BTREE_ITER_intent|flags); + + while ((k = bch2_btree_iter_peek_max(&iter, end)).k) { struct disk_reservation disk_res = bch2_disk_reservation_init(trans->c, 0); struct bkey_i delete; @@ -745,7 +722,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, bpos_min(end, k.k->p).offset - iter.pos.offset); - ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: + ret = bch2_trans_update(trans, &iter, &delete, flags) ?: bch2_trans_commit(trans, &disk_res, journal_seq, BCH_TRANS_COMMIT_no_enospc); bch2_disk_reservation_put(trans->c, &disk_res); @@ -763,7 +740,6 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, if (ret) break; } - bch2_trans_iter_exit(trans, &iter); return ret ?: trans_was_restarted(trans, restart_count); } @@ -775,12 +751,11 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, */ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, struct bpos start, struct bpos end, - unsigned update_flags, + enum btree_iter_update_trigger_flags flags, u64 *journal_seq) { - int ret = bch2_trans_run(c, - bch2_btree_delete_range_trans(trans, id, start, end, - update_flags, journal_seq)); + CLASS(btree_trans, trans)(c); + int ret = bch2_btree_delete_range_trans(trans, id, start, end, flags, journal_seq); if (ret == -BCH_ERR_transaction_restart_nested) ret = 0; return ret; @@ -805,13 +780,10 @@ int bch2_btree_bit_mod_iter(struct btree_trans *trans, struct btree_iter *iter, int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, struct bpos pos, bool set) { - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); + CLASS(btree_iter, iter)(trans, btree, pos, BTREE_ITER_intent); - int ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_btree_bit_mod_iter(trans, &iter, set); - bch2_trans_iter_exit(trans, &iter); - return ret; + return bch2_btree_iter_traverse(&iter) ?: + bch2_btree_bit_mod_iter(trans, &iter, set); } int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, @@ -826,30 +798,40 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, return bch2_trans_update_buffered(trans, btree, &k); } -int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) +static int __bch2_trans_log_str(struct btree_trans *trans, const char *str, unsigned len, ulong ip) { - unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64)); - prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos); + unsigned u64s = DIV_ROUND_UP(len, sizeof(u64)); - int ret = buf->allocation_failure ? 
-BCH_ERR_ENOMEM_trans_log_msg : 0; - if (ret) - return ret; - - struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); - ret = PTR_ERR_OR_ZERO(e); + struct jset_entry *e = bch2_trans_jset_entry_alloc_ip(trans, jset_u64s(u64s), ip); + int ret = PTR_ERR_OR_ZERO(e); if (ret) return ret; struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry); journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s); - memcpy(l->d, buf->buf, buf->pos); + memcpy_and_pad(l->d, u64s * sizeof(u64), str, len, 0); return 0; } +int bch2_trans_log_str(struct btree_trans *trans, const char *str) +{ + return __bch2_trans_log_str(trans, str, strlen(str), _RET_IP_); +} + +int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) +{ + int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; + if (ret) + return ret; + + return __bch2_trans_log_str(trans, buf->buf, buf->pos, _RET_IP_); +} + int bch2_trans_log_bkey(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_i *k) { - struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s)); + struct jset_entry *e = bch2_trans_jset_entry_alloc_ip(trans, + jset_u64s(k->k.u64s), _RET_IP_); int ret = PTR_ERR_OR_ZERO(e); if (ret) return ret; @@ -864,32 +846,31 @@ static int __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, va_list args) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); prt_vprintf(&buf, fmt, args); unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); - prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos); int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; if (ret) - goto err; + return ret; if (!test_bit(JOURNAL_running, &c->journal.flags)) { ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s)); if (ret) - goto err; + return ret; struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries); journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s); - memcpy(l->d, buf.buf, buf.pos); + memcpy_and_pad(l->d, u64s * sizeof(u64), buf.buf, buf.pos, 0); c->journal.early_journal_entries.nr += jset_u64s(u64s); } else { - ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags, - bch2_trans_log_msg(trans, &buf)); + CLASS(btree_trans, trans)(c); + ret = commit_do(trans, NULL, NULL, commit_flags, + bch2_trans_log_msg(trans, &buf)); } -err: - printbuf_exit(&buf); - return ret; + + return ret; } __printf(2, 3) diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 568e56c91190..663739db82b1 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -4,6 +4,7 @@ #include "btree_iter.h" #include "journal.h" +#include "snapshot.h" struct bch_fs; struct btree; @@ -46,22 +47,27 @@ enum bch_trans_commit_flags { void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags); -int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); -int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); +int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, + enum btree_iter_update_trigger_flags); +int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, + enum btree_iter_update_trigger_flags); int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, struct bkey_i *, enum btree_iter_update_trigger_flags); int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *, enum btree_iter_update_trigger_flags); -int bch2_btree_insert(struct
bch_fs *, enum btree_id, struct bkey_i *, struct - disk_reservation *, int flags, enum - btree_iter_update_trigger_flags iter_flags); +int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, + struct disk_reservation *, + enum bch_trans_commit_flags, + enum btree_iter_update_trigger_flags); int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, - struct bpos, struct bpos, unsigned, u64 *); + struct bpos, struct bpos, + enum btree_iter_update_trigger_flags, u64 *); int bch2_btree_delete_range(struct bch_fs *, enum btree_id, - struct bpos, struct bpos, unsigned, u64 *); + struct bpos, struct bpos, + enum btree_iter_update_trigger_flags, u64 *); int bch2_btree_bit_mod_iter(struct btree_trans *, struct btree_iter *, bool); int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); @@ -74,7 +80,7 @@ static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, } int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, - struct bpos, struct bpos); + struct bpos, snapshot_id_list *); /* * For use when splitting extents in existing snapshots: @@ -88,11 +94,20 @@ static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans, struct bpos old_pos, struct bpos new_pos) { + BUG_ON(old_pos.snapshot != new_pos.snapshot); + if (!btree_type_has_snapshots(btree) || bkey_eq(old_pos, new_pos)) return 0; - return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos); + snapshot_id_list s; + int ret = bch2_get_snapshot_overwrites(trans, btree, old_pos, &s); + if (ret) + return ret; + + return s.nr + ? __bch2_insert_snapshot_whiteouts(trans, btree, new_pos, &s) + : 0; } int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *, @@ -102,32 +117,92 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter * int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, enum btree_id, struct bpos); -int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, - struct bkey_i *, enum btree_iter_update_trigger_flags); +int __must_check bch2_trans_update_ip(struct btree_trans *, struct btree_iter *, + struct bkey_i *, enum btree_iter_update_trigger_flags, + unsigned long); + +static inline int __must_check +bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_iter_update_trigger_flags flags) +{ + return bch2_trans_update_ip(trans, iter, k, flags, _THIS_IP_); +} + +static inline void *btree_trans_subbuf_base(struct btree_trans *trans, + struct btree_trans_subbuf *buf) +{ + return (u64 *) trans->mem + buf->base; +} -struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned); +static inline void *btree_trans_subbuf_top(struct btree_trans *trans, + struct btree_trans_subbuf *buf) +{ + return (u64 *) trans->mem + buf->base + buf->u64s; +} + +void *__bch2_trans_subbuf_alloc(struct btree_trans *, + struct btree_trans_subbuf *, + unsigned, ulong); + +static inline void * +bch2_trans_subbuf_alloc_ip(struct btree_trans *trans, + struct btree_trans_subbuf *buf, + unsigned u64s, ulong ip) +{ + if (buf->u64s + u64s > buf->size) + return __bch2_trans_subbuf_alloc(trans, buf, u64s, ip); + + void *p = btree_trans_subbuf_top(trans, buf); + buf->u64s += u64s; + return p; +} + +static inline void * +bch2_trans_subbuf_alloc(struct btree_trans *trans, + struct btree_trans_subbuf *buf, + unsigned u64s) +{ + return bch2_trans_subbuf_alloc_ip(trans, buf, u64s, _THIS_IP_); +} + +static inline 
struct jset_entry *btree_trans_journal_entries_start(struct btree_trans *trans) +{ + return btree_trans_subbuf_base(trans, &trans->journal_entries); +} static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans) { - return (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + return btree_trans_subbuf_top(trans, &trans->journal_entries); } static inline struct jset_entry * -bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) +bch2_trans_jset_entry_alloc_ip(struct btree_trans *trans, unsigned u64s, ulong ip) { - if (!trans->journal_entries || - trans->journal_entries_u64s + u64s > trans->journal_entries_size) - return __bch2_trans_jset_entry_alloc(trans, u64s); + return bch2_trans_subbuf_alloc_ip(trans, &trans->journal_entries, u64s, ip); +} - struct jset_entry *e = btree_trans_journal_entries_top(trans); - trans->journal_entries_u64s += u64s; - return e; +static inline struct jset_entry * +bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) +{ + return bch2_trans_jset_entry_alloc_ip(trans, u64s, _THIS_IP_); } int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *); -int bch2_btree_write_buffer_insert_err(struct btree_trans *, - enum btree_id, struct bkey_i *); +int bch2_btree_write_buffer_insert_err(struct bch_fs *, enum btree_id, struct bkey_i *); + +static inline int bch2_btree_write_buffer_insert_checks(struct bch_fs *c, enum btree_id btree, + struct bkey_i *k) +{ + if (unlikely(!btree_type_uses_write_buffer(btree) || + k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX)) { + int ret = bch2_btree_write_buffer_insert_err(c, btree, k); + dump_stack(); + return ret; + } + + return 0; +} static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans, enum btree_id btree, @@ -135,11 +210,10 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr { kmsan_check_memory(k, bkey_bytes(&k->k)); - if (unlikely(!btree_type_uses_write_buffer(btree))) { - int ret = bch2_btree_write_buffer_insert_err(trans, btree, k); - dump_stack(); + int ret = bch2_btree_write_buffer_insert_checks(trans->c, btree, k); + if (unlikely(ret)) return ret; - } + /* * Most updates skip the btree write buffer until journal replay is * finished because synchronization with journal replay relies on having @@ -156,7 +230,7 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr return bch2_btree_insert_clone_trans(trans, btree, k); struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s)); - int ret = PTR_ERR_OR_ZERO(e); + ret = PTR_ERR_OR_ZERO(e); if (ret) return ret; @@ -167,8 +241,9 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); -int __bch2_trans_commit(struct btree_trans *, unsigned); +int __bch2_trans_commit(struct btree_trans *, enum bch_trans_commit_flags); +int bch2_trans_log_str(struct btree_trans *, const char *); int bch2_trans_log_msg(struct btree_trans *, struct printbuf *); int bch2_trans_log_bkey(struct btree_trans *, enum btree_id, unsigned, struct bkey_i *); @@ -203,6 +278,7 @@ static inline int bch2_trans_commit(struct btree_trans *trans, nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_flags))) +/* deprecated, prefer CLASS(btree_trans) */ #define bch2_trans_commit_do(_c, _disk_res, _journal_seq, _flags, _do) \ bch2_trans_run(_c, 
commit_do(trans, _disk_res, _journal_seq, _flags, _do)) @@ -211,18 +287,28 @@ static inline int bch2_trans_commit(struct btree_trans *trans, (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) +static inline bool bch2_trans_has_updates(struct btree_trans *trans) +{ + return trans->nr_updates || + trans->journal_entries.u64s || + trans->accounting.u64s; +} + static inline void bch2_trans_reset_updates(struct btree_trans *trans) { trans_for_each_update(trans, i) bch2_path_put(trans, i->path, true); trans->nr_updates = 0; - trans->journal_entries_u64s = 0; + trans->journal_entries.u64s = 0; + trans->journal_entries.size = 0; + trans->accounting.u64s = 0; + trans->accounting.size = 0; trans->hooks = NULL; trans->extra_disk_res = 0; } -static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, +static __always_inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, unsigned type, unsigned min_bytes) { unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k)); @@ -245,7 +331,7 @@ static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *t return mut; } -static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) +static __always_inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) { return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0); } @@ -284,72 +370,52 @@ static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _iter, _k, _flags,\ KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) -static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - enum btree_iter_update_trigger_flags flags, +static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_iter *iter, unsigned type, unsigned min_bytes) { - struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, - btree_id, pos, flags|BTREE_ITER_intent, type); - struct bkey_i *ret = IS_ERR(k.k) + struct bkey_s_c k = __bch2_bkey_get_typed(iter, type); + return IS_ERR(k.k) ? 
ERR_CAST(k.k) - : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes); - if (IS_ERR(ret)) - bch2_trans_iter_exit(trans, iter); - return ret; + : __bch2_bkey_make_mut_noupdate(iter->trans, k, 0, min_bytes); } -static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - enum btree_iter_update_trigger_flags flags) +static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_iter *iter) { - return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0); + return __bch2_bkey_get_mut_noupdate(iter, 0, 0); } static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, + enum btree_id btree, struct bpos pos, enum btree_iter_update_trigger_flags flags, unsigned type, unsigned min_bytes) { - struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter, - btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes); - int ret; - + CLASS(btree_iter, iter)(trans, btree, pos, flags|BTREE_ITER_intent); + struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(&iter, type, min_bytes); if (IS_ERR(mut)) return mut; - - ret = bch2_trans_update(trans, iter, mut, flags); - if (ret) { - bch2_trans_iter_exit(trans, iter); + int ret = bch2_trans_update(trans, &iter, mut, flags); + if (ret) return ERR_PTR(ret); - } - return mut; } static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans, - struct btree_iter *iter, unsigned btree_id, struct bpos pos, enum btree_iter_update_trigger_flags flags, unsigned min_bytes) { - return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes); + return __bch2_bkey_get_mut(trans, btree_id, pos, flags, 0, min_bytes); } static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, - struct btree_iter *iter, unsigned btree_id, struct bpos pos, enum btree_iter_update_trigger_flags flags) { - return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0); + return __bch2_bkey_get_mut(trans, btree_id, pos, flags, 0, 0); } -#define bch2_bkey_get_mut_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\ - bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _iter, \ - _btree_id, _pos, _flags, \ +#define bch2_bkey_get_mut_typed(_trans, _btree_id, _pos, _flags, _type) \ + bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _btree_id, _pos, _flags, \ KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 00307356d7c8..76897cf15946 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -14,6 +14,8 @@ #include "btree_locking.h" #include "buckets.h" #include "clock.h" +#include "disk_groups.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "io_write.h" @@ -52,12 +54,10 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) : b->data->min_key; struct btree_and_journal_iter iter; struct bkey_s_c k; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); struct bkey_buf prev; int ret = 0; - printbuf_indent_add_nextline(&buf, 2); - BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, b->data->min_key)); @@ -66,22 +66,29 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) bkey_init(&prev.k->k); 
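/*
 * [Editor's aside, not part of the patch: the CLASS(btree_iter, ...) and
 * CLASS(printbuf, ...) declarations used throughout these hunks come from
 * the scope-based cleanup machinery in <linux/cleanup.h>: a class bundles a
 * constructor with a destructor that runs automatically when the variable
 * leaves scope, which is why the manual bch2_trans_iter_exit() /
 * printbuf_exit() calls on every exit path can be deleted. A minimal sketch
 * of defining and using such a class; the demo_buf type is hypothetical,
 * standing in for printbuf.]
 */
#include <linux/cleanup.h>
#include <linux/slab.h>

struct demo_buf { char *p; };

/* constructor runs at the declaration, destructor at end of scope */
DEFINE_CLASS(demo_buf, struct demo_buf,
	     kfree(_T.p),					/* destructor */
	     ((struct demo_buf) { .p = kmalloc(size, GFP_KERNEL) }),
	     size_t size)

static int demo(void)
{
	CLASS(demo_buf, buf)(128);	/* kfree(buf.p) runs on every return path */

	if (!buf.p)
		return -ENOMEM;		/* no explicit cleanup needed */
	buf.p[0] = '\0';
	return 0;
}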
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + /* + * Don't use btree_node_is_root(): we're called by btree split, after + * creating a new root but before setting it + */ if (b == btree_node_root(c, b)) { if (!bpos_eq(b->data->min_key, POS_MIN)) { - ret = __bch2_topology_error(c, &buf); - + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "btree root with incorrect min_key: "); bch2_bpos_to_text(&buf, b->data->min_key); - log_fsck_err(trans, btree_root_bad_min_key, - "btree root with incorrect min_key: %s", buf.buf); - goto out; + prt_newline(&buf); + + bch2_count_fsck_err(c, btree_root_bad_min_key, &buf); + goto err; } if (!bpos_eq(b->data->max_key, SPOS_MAX)) { - ret = __bch2_topology_error(c, &buf); + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "btree root with incorrect max_key: "); bch2_bpos_to_text(&buf, b->data->max_key); - log_fsck_err(trans, btree_root_bad_max_key, - "btree root with incorrect max_key: %s", buf.buf); - goto out; + prt_newline(&buf); + + bch2_count_fsck_err(c, btree_root_bad_max_key, &buf); + goto err; } } @@ -99,19 +106,15 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) : bpos_successor(prev.k->k.p); if (!bpos_eq(expected_min, bp.v->min_key)) { - ret = __bch2_topology_error(c, &buf); - - prt_str(&buf, "end of prev node doesn't match start of next node\nin "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "end of prev node doesn't match start of next node"); prt_str(&buf, "\nprev "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); prt_str(&buf, "\nnext "); bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); - log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); - goto out; + bch2_count_fsck_err(c, btree_node_topology_bad_min_key, &buf); + goto err; } bch2_bkey_buf_reassemble(&prev, c, k); @@ -119,32 +122,33 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) } if (bkey_deleted(&prev.k->k)) { - ret = __bch2_topology_error(c, &buf); - - prt_str(&buf, "empty interior node\nin "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); - } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { - ret = __bch2_topology_error(c, &buf); + prt_printf(&buf, "empty interior node\n"); + bch2_count_fsck_err(c, btree_node_topology_empty_interior_node, &buf); + goto err; + } - prt_str(&buf, "last child node doesn't end at end of parent node\nin "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_str(&buf, "\nlast key "); + if (!bpos_eq(prev.k->k.p, b->key.k.p)) { + prt_str(&buf, "last child node doesn't end at end of parent node\nchild: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + prt_newline(&buf); - log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); + bch2_count_fsck_err(c, btree_node_topology_bad_max_key, &buf); + goto err; } out: -fsck_err: bch2_btree_and_journal_iter_exit(&iter); bch2_bkey_buf_exit(&prev, c); - printbuf_exit(&buf); return ret; +err: + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_char(&buf, ' '); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_newline(&buf); + + ret = __bch2_topology_error(c, 
&buf); + bch2_print_str(c, KERN_ERR, buf.buf); + BUG_ON(!ret); + goto out; } /* Calculate ideal packed bkey format for new btree nodes: */ @@ -217,7 +221,7 @@ static void __btree_node_free(struct btree_trans *trans, struct btree *b) { struct bch_fs *c = trans->c; - trace_and_count(c, btree_node_free, trans, b); + trace_btree_node(c, b, btree_node_free); BUG_ON(btree_node_write_blocked(b)); BUG_ON(btree_node_dirty(b)); @@ -240,9 +244,8 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, __btree_node_free(trans, b); - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_unlock(&c->btree_cache.lock); + scoped_guard(mutex, &c->btree_cache.lock) + bch2_btree_node_hash_remove(&c->btree_cache, b); six_unlock_write(&b->c.lock); mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); @@ -268,9 +271,8 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, clear_btree_node_dirty_acct(c, b); clear_btree_node_need_write(b); - mutex_lock(&c->btree_cache.lock); - __bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_unlock(&c->btree_cache.lock); + scoped_guard(mutex, &c->btree_cache.lock) + __bch2_btree_node_hash_remove(&c->btree_cache, b); BUG_ON(p->nr >= ARRAY_SIZE(p->b)); p->b[p->nr++] = b; @@ -280,17 +282,46 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, bch2_trans_node_drop(trans, b); } +static bool can_use_btree_node(struct bch_fs *c, + struct disk_reservation *res, + unsigned target, + struct bkey_s_c k) +{ + if (!bch2_bkey_devs_rw(c, k)) + return false; + + if (target && !bch2_bkey_in_target(c, k, target)) + return false; + + unsigned durability = bch2_bkey_durability(c, k); + + if (durability >= res->nr_replicas) + return true; + + struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_btree, target); + + guard(rcu)(); + + unsigned durability_available = 0, i; + for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { + struct bch_dev *ca = bch2_dev_rcu_noerror(c, i); + if (ca) + durability_available += ca->mi.durability; + } + + return durability >= durability_available; +} + static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct disk_reservation *res, struct closure *cl, bool interior_node, - unsigned flags) + unsigned target, + enum bch_trans_commit_flags flags) { struct bch_fs *c = trans->c; struct write_point *wp; struct btree *b; - BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; - struct open_buckets obs = { .nr = 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim @@ -306,17 +337,27 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, mutex_lock(&c->btree_reserve_cache_lock); if (c->btree_reserve_cache_nr > nr_reserve) { - struct btree_alloc *a = - &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; + for (struct btree_alloc *a = c->btree_reserve_cache; + a < c->btree_reserve_cache + c->btree_reserve_cache_nr;) { + /* check if it has sufficient durability */ + + if (!can_use_btree_node(c, res, target, bkey_i_to_s_c(&a->k))) { + bch2_open_buckets_put(c, &a->ob); + *a = c->btree_reserve_cache[--c->btree_reserve_cache_nr]; + continue; + } - obs = a->ob; - bkey_copy(&tmp.k, &a->k); - mutex_unlock(&c->btree_reserve_cache_lock); - goto out; + bkey_copy(&b->key, &a->k); + b->ob = a->ob; + *a = c->btree_reserve_cache[--c->btree_reserve_cache_nr]; + mutex_unlock(&c->btree_reserve_cache_lock); + 
goto out; + } } mutex_unlock(&c->btree_reserve_cache_lock); retry: ret = bch2_alloc_sectors_start_trans(trans, + target ?: c->opts.metadata_target ?: c->opts.foreground_target, 0, @@ -325,7 +366,9 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, res->nr_replicas, min(res->nr_replicas, c->opts.metadata_replicas_required), - watermark, 0, cl, &wp); + watermark, + target ? BCH_WRITE_only_specified_devs : 0, + cl, &wp); if (unlikely(ret)) goto err; @@ -341,14 +384,12 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, goto retry; } - bkey_btree_ptr_v2_init(&tmp.k); - bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false); + bkey_btree_ptr_v2_init(&b->key); + bch2_alloc_sectors_append_ptrs(c, wp, &b->key, btree_sectors(c), false); - bch2_open_bucket_get(c, wp, &obs); + bch2_open_bucket_get(c, wp, &b->ob); bch2_alloc_sectors_done(c, wp); out: - bkey_copy(&b->key, &tmp.k); - b->ob = obs; six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -406,7 +447,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); BUG_ON(ret); - trace_and_count(c, btree_node_alloc, trans, b); + trace_btree_node(c, b, btree_node_alloc); bch2_increment_clock(c, btree_sectors(c), WRITE); return b; } @@ -505,33 +546,29 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans * static int bch2_btree_reserve_get(struct btree_trans *trans, struct btree_update *as, unsigned nr_nodes[2], + unsigned target, unsigned flags, struct closure *cl) { - struct btree *b; - unsigned interior; - int ret = 0; - BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX); /* * Protects reaping from the btree node cache and using the btree node * open bucket reserve: */ - ret = bch2_btree_cache_cannibalize_lock(trans, cl); + int ret = bch2_btree_cache_cannibalize_lock(trans, cl); if (ret) return ret; - for (interior = 0; interior < 2; interior++) { + for (unsigned interior = 0; interior < 2; interior++) { struct prealloc_nodes *p = as->prealloc_nodes + interior; while (p->nr < nr_nodes[interior]) { - b = __bch2_btree_node_alloc(trans, &as->disk_res, cl, - interior, flags); - if (IS_ERR(b)) { - ret = PTR_ERR(b); + struct btree *b = __bch2_btree_node_alloc(trans, &as->disk_res, + cl, interior, target, flags); + ret = PTR_ERR_OR_ZERO(b); + if (ret) goto err; - } p->b[p->nr++] = b; } @@ -559,7 +596,8 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans * bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], as->start_time); - mutex_lock(&c->btree_interior_update_lock); + guard(mutex)(&c->btree_interior_update_lock); + list_del(&as->unwritten_list); list_del(&as->list); @@ -571,8 +609,6 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans * * since being on btree_interior_update_list is our ref on @c: */ closure_wake_up(&c->btree_interior_update_wait); - - mutex_unlock(&c->btree_interior_update_lock); } static void btree_update_add_key(struct btree_update *as, @@ -601,12 +637,11 @@ static void btree_update_new_nodes_mark_sb(struct btree_update *as) { struct bch_fs *c = as->c; - mutex_lock(&c->sb_lock); + guard(mutex)(&c->sb_lock); for_each_keylist_key(&as->new_keys, k) bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k)); bch2_write_super(c); - mutex_unlock(&c->sb_lock); } /* @@ -658,7 +693,7 @@ static void btree_update_nodes_written(struct btree_update *as) { struct bch_fs *c = as->c; struct 
btree *b; - struct btree_trans *trans = bch2_trans_get(c); + CLASS(btree_trans, trans)(c); u64 journal_seq = 0; unsigned i; int ret; @@ -679,12 +714,31 @@ static void btree_update_nodes_written(struct btree_update *as) /* * Wait for any in flight writes to finish before we free the old nodes - * on disk: + * on disk. But we haven't pinned those old nodes in the btree cache, + * they might have already been evicted. + * + * The update we're completing deleted references to those nodes from the + * btree, so we know if they've been evicted they can't be pulled back in. + * We just have to check if the nodes we have pointers to are still those + * old nodes, and haven't been reused. + * + * This can't be done locklessly because the data buffer might have been + * vmalloc allocated, and they're not RCU freed. We also need the + * __no_kmsan_checks annotation because even with the btree node read + * lock, nothing tells us that the data buffer has been initialized (if + * the btree node has been reused for a different node, and the data + * buffer swapped for a new data buffer). */ for (i = 0; i < as->nr_old_nodes; i++) { b = as->old_nodes[i]; - if (btree_node_seq_matches(b, as->old_nodes_seq[i])) + bch2_trans_begin(trans); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); + bool seq_matches = btree_node_seq_matches(b, as->old_nodes_seq[i]); + six_unlock_read(&b->c.lock); + bch2_trans_unlock_long(trans); + + if (seq_matches) wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, TASK_UNINTERRUPTIBLE); } @@ -798,15 +852,15 @@ static void btree_update_nodes_written(struct btree_update *as) bch2_journal_pin_drop(&c->journal, &as->journal); - mutex_lock(&c->btree_interior_update_lock); - for (i = 0; i < as->nr_new_nodes; i++) { - b = as->new_nodes[i]; + scoped_guard(mutex, &c->btree_interior_update_lock) { + for (i = 0; i < as->nr_new_nodes; i++) { + b = as->new_nodes[i]; - BUG_ON(b->will_make_reachable != (unsigned long) as); - b->will_make_reachable = 0; - clear_btree_node_will_make_reachable(b); + BUG_ON(b->will_make_reachable != (unsigned long) as); + b->will_make_reachable = 0; + clear_btree_node_will_make_reachable(b); + } } - mutex_unlock(&c->btree_interior_update_lock); for (i = 0; i < as->nr_new_nodes; i++) { b = as->new_nodes[i]; @@ -820,7 +874,6 @@ static void btree_update_nodes_written(struct btree_update *as) bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); bch2_btree_update_free(as, trans); - bch2_trans_put(trans); } static void btree_interior_update_work(struct work_struct *work) @@ -830,12 +883,12 @@ static void btree_interior_update_work(struct work_struct *work) struct btree_update *as; while (1) { - mutex_lock(&c->btree_interior_update_lock); - as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, - struct btree_update, unwritten_list); - if (as && !as->nodes_written) - as = NULL; - mutex_unlock(&c->btree_interior_update_lock); + scoped_guard(mutex, &c->btree_interior_update_lock) { + as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, + struct btree_update, unwritten_list); + if (as && !as->nodes_written) + as = NULL; + } if (!as) break; @@ -849,9 +902,8 @@ static CLOSURE_CALLBACK(btree_update_set_nodes_written) closure_type(as, struct btree_update, cl); struct bch_fs *c = as->c; - mutex_lock(&c->btree_interior_update_lock); - as->nodes_written = true; - mutex_unlock(&c->btree_interior_update_lock); + scoped_guard(mutex, &c->btree_interior_update_lock) + as->nodes_written = true; 
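/*
 * [Editorial aside, an illustrative sketch and not part of the patch.
 * Many hunks above replace open-coded mutex_lock()/mutex_unlock()
 * pairs with the scope-based cleanup helpers from <linux/cleanup.h>
 * (the mutex guard itself is defined in <linux/mutex.h>). A minimal
 * sketch of the two forms, using a hypothetical struct foo:
 *
 *	struct foo { struct mutex lock; unsigned count; };
 *
 *	static void foo_inc(struct foo *f)
 *	{
 *		guard(mutex)(&f->lock);		// dropped at function exit
 *		f->count++;
 *	}
 *
 *	static void foo_inc_and_flush(struct foo *f)
 *	{
 *		scoped_guard(mutex, &f->lock)	// dropped at end of block
 *			f->count++;
 *		foo_flush(f);			// hypothetical; runs unlocked
 *	}
 *
 * guard() pins the lock to the end of the enclosing scope, so it is
 * used where the unlock was the function's last action; scoped_guard()
 * bounds the critical section explicitly, which is why it is chosen
 * here, where the original code called queue_work() after the unlock.]
 */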
queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); } @@ -869,7 +921,7 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) BUG_ON(!btree_node_dirty(b)); BUG_ON(!b->c.level); - mutex_lock(&c->btree_interior_update_lock); + guard(mutex)(&c->btree_interior_update_lock); list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); as->mode = BTREE_UPDATE_node; @@ -878,8 +930,6 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) set_btree_node_write_blocked(b); list_add(&as->write_blocked_list, &b->write_blocked); - - mutex_unlock(&c->btree_interior_update_lock); } static int bch2_update_reparent_journal_pin_flush(struct journal *j, @@ -918,11 +968,11 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b) b->c.btree_id, b->c.level, insert, insert->k.u64s); - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); + scoped_guard(mutex, &c->btree_interior_update_lock) { + list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - as->mode = BTREE_UPDATE_root; - mutex_unlock(&c->btree_interior_update_lock); + as->mode = BTREE_UPDATE_root; + } } /* @@ -943,7 +993,8 @@ static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree closure_get(&as->cl); - mutex_lock(&c->btree_interior_update_lock); + guard(mutex)(&c->btree_interior_update_lock); + BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); BUG_ON(b->will_make_reachable); @@ -951,8 +1002,6 @@ static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree b->will_make_reachable = 1UL|(unsigned long) as; set_btree_node_will_make_reachable(b); - mutex_unlock(&c->btree_interior_update_lock); - btree_update_add_key(as, &as->new_keys, b); if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { @@ -971,31 +1020,29 @@ static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) { struct btree_update *as; unsigned long v; - unsigned i; - mutex_lock(&c->btree_interior_update_lock); - /* - * When b->will_make_reachable != 0, it owns a ref on as->cl that's - * dropped when it gets written by bch2_btree_complete_write - the - * xchg() is for synchronization with bch2_btree_complete_write: - */ - v = xchg(&b->will_make_reachable, 0); - clear_btree_node_will_make_reachable(b); - as = (struct btree_update *) (v & ~1UL); + scoped_guard(mutex, &c->btree_interior_update_lock) { + /* + * When b->will_make_reachable != 0, it owns a ref on as->cl that's + * dropped when it gets written by bch2_btree_complete_write - the + * xchg() is for synchronization with bch2_btree_complete_write: + */ + v = xchg(&b->will_make_reachable, 0); + clear_btree_node_will_make_reachable(b); + as = (struct btree_update *) (v & ~1UL); - if (!as) { - mutex_unlock(&c->btree_interior_update_lock); - return; - } + if (!as) + return; - for (i = 0; i < as->nr_new_nodes; i++) - if (as->new_nodes[i] == b) - goto found; + unsigned i; + for (i = 0; i < as->nr_new_nodes; i++) + if (as->new_nodes[i] == b) + goto found; - BUG(); -found: - array_remove_item(as->new_nodes, as->nr_new_nodes, i); - mutex_unlock(&c->btree_interior_update_lock); + BUG(); + found: + array_remove_item(as->new_nodes, as->nr_new_nodes, i); + } if (v & 1) closure_put(&as->cl); @@ -1114,9 +1161,18 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * start_time); } +static const char * const btree_node_reawrite_reason_strs[] = { +#define x(n) #n, + 
BTREE_NODE_REWRITE_REASON() +#undef x + NULL, +}; + static struct btree_update * bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - unsigned level_start, bool split, unsigned flags) + unsigned level_start, bool split, + unsigned target, + enum bch_trans_commit_flags flags) { struct bch_fs *c = trans->c; struct btree_update *as; @@ -1203,9 +1259,17 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, bch2_keylist_init(&as->new_keys, as->_new_keys); bch2_keylist_init(&as->parent_keys, as->inline_keys); - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->list, &c->btree_interior_update_list); - mutex_unlock(&c->btree_interior_update_lock); + scoped_guard(mutex, &c->btree_interior_update_lock) + list_add_tail(&as->list, &c->btree_interior_update_list); + + struct btree *b = btree_path_node(path, path->level); + as->node_start = b->data->min_key; + as->node_end = b->data->max_key; + as->node_needed_rewrite = btree_node_rewrite_reason(b); + as->node_written = b->written; + as->node_sectors = btree_buf_bytes(b) >> 9; + as->node_remaining = __bch2_btree_u64s_remaining(b, + btree_bkey_last(b, bset_tree_last(b))); /* * We don't want to allocate if we're in an error state, that can cause @@ -1226,7 +1290,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (ret) goto err; - ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL); + ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, NULL); if (bch2_err_matches(ret, ENOSPC) || bch2_err_matches(ret, ENOMEM)) { struct closure cl; @@ -1238,18 +1302,19 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (bch2_err_matches(ret, ENOSPC) && (flags & BCH_TRANS_COMMIT_journal_reclaim) && watermark < BCH_WATERMARK_reclaim) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; + ret = bch_err_throw(c, journal_reclaim_would_deadlock); goto err; } closure_init_stack(&cl); do { - ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); - + ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl); + if (!bch2_err_matches(ret, BCH_ERR_operation_blocked)) + break; bch2_trans_unlock(trans); bch2_wait_on_allocator(c, &cl); - } while (bch2_err_matches(ret, BCH_ERR_operation_blocked)); + } while (1); } if (ret) { @@ -1279,13 +1344,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) { /* Root nodes cannot be reaped */ - mutex_lock(&c->btree_cache.lock); - list_del_init(&b->list); - mutex_unlock(&c->btree_cache.lock); + scoped_guard(mutex, &c->btree_cache.lock) + list_del_init(&b->list); - mutex_lock(&c->btree_root_lock); - bch2_btree_id_root(c, b->c.btree_id)->b = b; - mutex_unlock(&c->btree_root_lock); + scoped_guard(mutex, &c->btree_root_lock) + bch2_btree_id_root(c, b->c.btree_id)->b = b; bch2_recalc_btree_reserve(c); } @@ -1298,7 +1361,7 @@ static int bch2_btree_set_root(struct btree_update *as, { struct bch_fs *c = as->c; - trace_and_count(c, btree_node_set_root, trans, b); + trace_btree_node(c, b, btree_node_set_root); struct btree *old = btree_node_root(c, b); @@ -1340,7 +1403,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, { struct bch_fs *c = as->c; struct bkey_packed *k; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); unsigned long old, new; BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && @@ -1385,8 +1448,6 @@ static void bch2_insert_fixup_btree_ptr(struct 
btree_update *as, new |= BTREE_WRITE_interior; new |= 1 << BTREE_NODE_need_write; } while (!try_cmpxchg(&b->flags, &old, new)); - - printbuf_exit(&buf); } static int @@ -1413,7 +1474,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, int ret = bch2_btree_node_check_topology(trans, b); if (ret) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); for (struct bkey_i *k = keys->keys; k != insert; @@ -1598,7 +1659,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, int ret = 0; bch2_verify_btree_nr_keys(b); - BUG_ON(!parent && (b != btree_node_root(c, b))); + BUG_ON(!parent && !btree_node_is_root(c, b)); BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1)); ret = bch2_btree_node_check_topology(trans, b); @@ -1608,7 +1669,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { struct btree *n[2]; - trace_and_count(c, btree_node_split, trans, b); + trace_btree_node(c, b, btree_node_split); n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level); n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level); @@ -1670,7 +1731,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, goto err; } } else { - trace_and_count(c, btree_node_compact, trans, b); + trace_btree_node(c, b, btree_node_compact); n1 = bch2_btree_node_alloc_replacement(as, trans, b); @@ -1800,16 +1861,15 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t bch2_verify_keylist_sorted(keys); if (!btree_node_intent_locked(path, b->c.level)) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_log_msg_start(c, &buf); prt_printf(&buf, "%s(): node not locked at level %u\n", __func__, b->c.level); bch2_btree_update_to_text(&buf, as); bch2_btree_path_to_text(&buf, trans, path_idx); + bch2_fs_emergency_read_only2(c, &buf); - bch2_print_string_as_lines(KERN_ERR, buf.buf); - printbuf_exit(&buf); - bch2_fs_emergency_read_only(c); + bch2_print_str(c, KERN_ERR, buf.buf); return -EIO; } @@ -1878,7 +1938,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans, as = bch2_btree_update_start(trans, trans->paths + path, trans->paths[path].level, - true, flags); + true, 0, flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -1932,9 +1992,8 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans * bch2_trans_node_add(trans, path, n); six_unlock_intent(&n->c.lock); - mutex_lock(&c->btree_cache.lock); - list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list); - mutex_unlock(&c->btree_cache.lock); + scoped_guard(mutex, &c->btree_cache.lock) + list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list); bch2_trans_verify_locks(trans); } @@ -1948,7 +2007,8 @@ int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, return bch2_btree_split_leaf(trans, path, flags); struct btree_update *as = - bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags); + bch2_btree_update_start(trans, trans->paths + path, b->c.level, + true, 0, flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -2010,7 +2070,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, sib_path = bch2_path_get(trans, btree, sib_pos, U8_MAX, level, BTREE_ITER_intent, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, sib_path, false); + ret = bch2_btree_path_traverse(trans, sib_path, 0); if (ret) goto err; @@ -2033,7 +2093,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, } 
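/*
 * [Editorial aside, illustrative and not part of the patch: the hunk
 * just below is one of many in this series that replace the manual
 * PRINTBUF / printbuf_exit() pairing with a CLASS()-based declaration,
 * so the buffer is freed automatically on every exit path. The shape
 * of the change, sketched on a made-up error path:
 *
 *	// before: each exit path must remember printbuf_exit()
 *	struct printbuf buf = PRINTBUF;
 *	prt_printf(&buf, "unexpected bpos\n");
 *	bch_err(c, "%s", buf.buf);
 *	printbuf_exit(&buf);
 *	goto err;
 *
 *	// after: cleanup runs when buf goes out of scope
 *	CLASS(printbuf, buf)();
 *	prt_printf(&buf, "unexpected bpos\n");
 *	bch_err(c, "%s", buf.buf);
 *	goto err;
 *
 * This is why the hunk below can delete the printbuf_exit() in front
 * of its goto err without leaking the buffer.]
 */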
if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); printbuf_indent_add_nextline(&buf, 2); prt_printf(&buf, "%s(): ", __func__); @@ -2048,7 +2108,6 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_bpos_to_text(&buf, next->data->min_key); bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); goto err; } @@ -2077,12 +2136,15 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, parent = btree_node_parent(trans->paths + path, b); as = bch2_btree_update_start(trans, trans->paths + path, level, false, - BCH_TRANS_COMMIT_no_enospc|flags); + 0, BCH_TRANS_COMMIT_no_enospc|flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto err; - trace_and_count(c, btree_node_merge, trans, b); + as->node_start = prev->data->min_key; + as->node_end = next->data->max_key; + + trace_btree_node(c, b, btree_node_merge); n = bch2_btree_node_alloc(as, trans, b->c.level); @@ -2162,7 +2224,7 @@ static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter, bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p, BTREE_MAX_DEPTH, b->c.level, BTREE_ITER_intent); - int ret = bch2_btree_iter_traverse(trans, iter); + int ret = bch2_btree_iter_traverse(iter); if (ret) goto err; @@ -2170,21 +2232,22 @@ static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter, if (btree_iter_path(trans, iter)->l[b->c.level].b != b) { /* node has been freed: */ BUG_ON(!btree_node_dying(b)); - ret = -BCH_ERR_btree_node_dying; + ret = bch_err_throw(trans->c, btree_node_dying); goto err; } BUG_ON(!btree_node_hashed(b)); return 0; err: - bch2_trans_iter_exit(trans, iter); + bch2_trans_iter_exit(iter); return ret; } int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_iter *iter, struct btree *b, - unsigned flags) + unsigned target, + enum bch_trans_commit_flags flags) { struct bch_fs *c = trans->c; struct btree *n, *parent; @@ -2196,7 +2259,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_path *path = btree_iter_path(trans, iter); parent = btree_node_parent(path, b); - as = bch2_btree_update_start(trans, path, b->c.level, false, flags); + as = bch2_btree_update_start(trans, path, b->c.level, + false, target, flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto out; @@ -2212,8 +2276,6 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, trans->paths + new_path, n); - trace_and_count(c, btree_node_rewrite, trans, b); - if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys); @@ -2224,6 +2286,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (ret) goto err; + trace_btree_node(c, b, btree_node_rewrite); + bch2_btree_interior_update_will_free_node(as, b); bch2_btree_update_get_open_buckets(as, n); @@ -2246,58 +2310,62 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, goto out; } -static int bch2_btree_node_rewrite_key(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_i *k, unsigned flags) +int bch2_btree_node_rewrite_key(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_i *k, + enum bch_trans_commit_flags flags) { struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, BTREE_MAX_DEPTH, level, 0); - struct btree *b = bch2_btree_iter_peek_node(trans, &iter); + 
struct btree *b = bch2_btree_iter_peek_node(&iter); int ret = PTR_ERR_OR_ZERO(b); if (ret) goto out; bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k); ret = found - ? bch2_btree_node_rewrite(trans, &iter, b, flags) + ? bch2_btree_node_rewrite(trans, &iter, b, 0, flags) : -ENOENT; out: - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } int bch2_btree_node_rewrite_pos(struct btree_trans *trans, enum btree_id btree, unsigned level, - struct bpos pos, unsigned flags) + struct bpos pos, + unsigned target, + enum bch_trans_commit_flags flags) { BUG_ON(!level); /* Traverse one depth lower to get a pointer to the node itself: */ struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0); - struct btree *b = bch2_btree_iter_peek_node(trans, &iter); + struct btree *b = bch2_btree_iter_peek_node(&iter); int ret = PTR_ERR_OR_ZERO(b); if (ret) goto err; - ret = bch2_btree_node_rewrite(trans, &iter, b, flags); + ret = bch2_btree_node_rewrite(trans, &iter, b, target, flags); err: - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans, - struct btree *b, unsigned flags) + struct btree *b, + enum bch_trans_commit_flags flags) { struct btree_iter iter; int ret = get_iter_to_node(trans, &iter, b); if (ret) return ret == -BCH_ERR_btree_node_dying ? 0 : ret; - ret = bch2_btree_node_rewrite(trans, &iter, b, flags); - bch2_trans_iter_exit(trans, &iter); + ret = bch2_btree_node_rewrite(trans, &iter, b, 0, flags); + bch2_trans_iter_exit(&iter); return ret; } @@ -2318,19 +2386,17 @@ static void async_btree_node_rewrite_work(struct work_struct *work) int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans, a->btree_id, a->level, a->key.k, 0)); - if (ret != -ENOENT && - !bch2_err_matches(ret, EROFS) && - ret != -BCH_ERR_journal_shutdown) + if (!bch2_err_matches(ret, ENOENT) && + !bch2_err_matches(ret, EROFS)) bch_err_fn_ratelimited(c, ret); - spin_lock(&c->btree_node_rewrites_lock); - list_del(&a->list); - spin_unlock(&c->btree_node_rewrites_lock); + scoped_guard(spinlock, &c->btree_node_rewrites_lock) + list_del(&a->list); closure_wake_up(&c->btree_node_rewrites_wait); bch2_bkey_buf_exit(&a->key, c); - bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_node_rewrite); kfree(a); } @@ -2350,16 +2416,16 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) bool now = false, pending = false; - spin_lock(&c->btree_node_rewrites_lock); - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay && - bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { - list_add(&a->list, &c->btree_node_rewrites); - now = true; - } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { - list_add(&a->list, &c->btree_node_rewrites_pending); - pending = true; + scoped_guard(spinlock, &c->btree_node_rewrites_lock) { + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay) && + enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_node_rewrite)) { + list_add(&a->list, &c->btree_node_rewrites); + now = true; + } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { + list_add(&a->list, &c->btree_node_rewrites_pending); + pending = true; + } } - spin_unlock(&c->btree_node_rewrites_lock); if (now) { queue_work(c->btree_node_rewrite_worker, &a->work); @@ -2380,18 +2446,19 @@ void bch2_async_btree_node_rewrites_flush(struct bch_fs *c) void bch2_do_pending_node_rewrites(struct bch_fs 
*c) { while (1) { - spin_lock(&c->btree_node_rewrites_lock); - struct async_btree_rewrite *a = - list_pop_entry(&c->btree_node_rewrites_pending, - struct async_btree_rewrite, list); - if (a) - list_add(&a->list, &c->btree_node_rewrites); - spin_unlock(&c->btree_node_rewrites_lock); + struct async_btree_rewrite *a; + + scoped_guard(spinlock, &c->btree_node_rewrites_lock) { + a = list_pop_entry(&c->btree_node_rewrites_pending, + struct async_btree_rewrite, list); + if (a) + list_add(&a->list, &c->btree_node_rewrites); + } if (!a) break; - bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); + enumerated_ref_get(&c->writes, BCH_WRITE_REF_node_rewrite); queue_work(c->btree_node_rewrite_worker, &a->work); } } @@ -2399,11 +2466,11 @@ void bch2_do_pending_node_rewrites(struct bch_fs *c) void bch2_free_pending_node_rewrites(struct bch_fs *c) { while (1) { - spin_lock(&c->btree_node_rewrites_lock); - struct async_btree_rewrite *a = - list_pop_entry(&c->btree_node_rewrites_pending, - struct async_btree_rewrite, list); - spin_unlock(&c->btree_node_rewrites_lock); + struct async_btree_rewrite *a; + + scoped_guard(spinlock, &c->btree_node_rewrites_lock) + a = list_pop_entry(&c->btree_node_rewrites_pending, + struct async_btree_rewrite, list); if (!a) break; @@ -2421,7 +2488,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, bool skip_triggers) { struct bch_fs *c = trans->c; - struct btree_iter iter2 = {}; + struct btree_iter iter2 = { NULL }; struct btree *parent; int ret; @@ -2445,7 +2512,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, parent = btree_node_parent(btree_iter_path(trans, iter), b); if (parent) { - bch2_trans_copy_iter(trans, &iter2, iter); + bch2_trans_copy_iter(&iter2, iter); iter2.path = bch2_btree_path_make_mut(trans, iter2.path, iter2.flags & BTREE_ITER_intent, @@ -2459,12 +2526,12 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, trans->paths_sorted = false; - ret = bch2_btree_iter_traverse(trans, &iter2) ?: + ret = bch2_btree_iter_traverse(&iter2) ?: bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun); if (ret) goto err; } else { - BUG_ON(btree_node_root(c, b) != b); + BUG_ON(!btree_node_is_root(c, b)); struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(new_key->k.u64s)); @@ -2485,7 +2552,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c); if (new_hash) { - mutex_lock(&c->btree_cache.lock); + guard(mutex)(&c->btree_cache.lock); bch2_btree_node_hash_remove(&c->btree_cache, new_hash); __bch2_btree_node_hash_remove(&c->btree_cache, b); @@ -2493,20 +2560,18 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, bkey_copy(&b->key, new_key); ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); BUG_ON(ret); - mutex_unlock(&c->btree_cache.lock); } else { bkey_copy(&b->key, new_key); } bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b); out: - bch2_trans_iter_exit(trans, &iter2); + bch2_trans_iter_exit(&iter2); return ret; err: if (new_hash) { - mutex_lock(&c->btree_cache.lock); + guard(mutex)(&c->btree_cache.lock); bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_unlock(&c->btree_cache.lock); } goto out; } @@ -2572,7 +2637,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, ret = bch2_btree_node_update_key(trans, &iter, b, new_key, commit_flags, skip_triggers); - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return 
ret; } @@ -2641,7 +2706,8 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level) { - bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level))); + CLASS(btree_trans, trans)(c); + lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level)); } static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) @@ -2651,9 +2717,19 @@ static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update prt_str(out, " "); bch2_btree_id_to_text(out, as->btree_id); - prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", + prt_printf(out, " l=%u-%u ", as->update_level_start, - as->update_level_end, + as->update_level_end); + bch2_bpos_to_text(out, as->node_start); + prt_char(out, ' '); + bch2_bpos_to_text(out, as->node_end); + prt_printf(out, "\nwritten %u/%u u64s_remaining %u need_rewrite %s", + as->node_written, + as->node_sectors, + as->node_remaining, + btree_node_reawrite_reason_strs[as->node_needed_rewrite]); + + prt_printf(out, "\nmode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", bch2_btree_update_modes[as->mode], as->nodes_written, closure_nr_remaining(&as->cl), @@ -2664,21 +2740,15 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_update *as; - mutex_lock(&c->btree_interior_update_lock); + guard(mutex)(&c->btree_interior_update_lock); list_for_each_entry(as, &c->btree_interior_update_list, list) bch2_btree_update_to_text(out, as); - mutex_unlock(&c->btree_interior_update_lock); } static bool bch2_btree_interior_updates_pending(struct bch_fs *c) { - bool ret; - - mutex_lock(&c->btree_interior_update_lock); - ret = !list_empty(&c->btree_interior_update_list); - mutex_unlock(&c->btree_interior_update_lock); - - return ret; + guard(mutex)(&c->btree_interior_update_lock); + return !list_empty(&c->btree_interior_update_list); } bool bch2_btree_interior_updates_flush(struct bch_fs *c) @@ -2695,13 +2765,11 @@ void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry { struct btree_root *r = bch2_btree_id_root(c, entry->btree_id); - mutex_lock(&c->btree_root_lock); + guard(mutex)(&c->btree_interior_update_lock); r->level = entry->level; r->alive = true; bkey_copy(&r->key, (struct bkey_i *) entry->start); - - mutex_unlock(&c->btree_root_lock); } struct jset_entry * @@ -2709,11 +2777,9 @@ bch2_btree_roots_to_journal_entries(struct bch_fs *c, struct jset_entry *end, unsigned long skip) { - unsigned i; - - mutex_lock(&c->btree_root_lock); + guard(mutex)(&c->btree_interior_update_lock); - for (i = 0; i < btree_id_nr_alive(c); i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); if (r->alive && !test_bit(i, &skip)) { @@ -2723,8 +2789,6 @@ bch2_btree_roots_to_journal_entries(struct bch_fs *c, } } - mutex_unlock(&c->btree_root_lock); - return end; } @@ -2780,16 +2844,16 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c) c->btree_interior_update_worker = alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8); if (!c->btree_interior_update_worker) - return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; + return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init); c->btree_node_rewrite_worker = alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND); if (!c->btree_node_rewrite_worker) - return 
-BCH_ERR_ENOMEM_btree_interior_update_worker_init; + return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init); if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, sizeof(struct btree_update))) - return -BCH_ERR_ENOMEM_btree_interior_update_pool_init; + return bch_err_throw(c, ENOMEM_btree_interior_update_pool_init); return 0; } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index be71cd73b864..6ed049f19a9a 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -57,6 +57,13 @@ struct btree_update { unsigned took_gc_lock:1; enum btree_id btree_id; + struct bpos node_start; + struct bpos node_end; + enum btree_node_rewrite_reason node_needed_rewrite; + u16 node_written; + u16 node_sectors; + u16 node_remaining; + unsigned update_level_start; unsigned update_level_end; @@ -144,7 +151,7 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, EBUG_ON(!btree_node_locked(path, level)); - if (bch2_btree_node_merging_disabled) + if (static_branch_unlikely(&bch2_btree_node_merging_disabled)) return 0; b = path->l[level].b; @@ -168,12 +175,19 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, } int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, - struct btree *, unsigned); + struct btree *, unsigned, + enum bch_trans_commit_flags); +int bch2_btree_node_rewrite_key(struct btree_trans *, + enum btree_id, unsigned, + struct bkey_i *, + enum bch_trans_commit_flags); int bch2_btree_node_rewrite_pos(struct btree_trans *, enum btree_id, unsigned, - struct bpos, unsigned); + struct bpos, unsigned, + enum bch_trans_commit_flags); int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *, - struct btree *, unsigned); + struct btree *, + enum bch_trans_commit_flags); void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 0941fb2c026d..afad11831e1d 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -7,6 +7,7 @@ #include "btree_update_interior.h" #include "btree_write_buffer.h" #include "disk_accounting.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "journal.h" @@ -144,7 +145,7 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq); EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); - ret = bch2_btree_iter_traverse(trans, iter); + ret = bch2_btree_iter_traverse(iter); if (ret) return ret; @@ -181,6 +182,8 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite return wb_flush_one_slowpath(trans, iter, wb); } + EBUG_ON(!bpos_eq(wb->k.k.p, path->pos)); + bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); (*fast)++; return 0; @@ -200,19 +203,14 @@ static int btree_write_buffered_insert(struct btree_trans *trans, struct btree_write_buffered_key *wb) { - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k), - BTREE_ITER_cached|BTREE_ITER_intent); + CLASS(btree_iter, iter)(trans, wb->btree, bkey_start_pos(&wb->k.k), + BTREE_ITER_cached|BTREE_ITER_intent); trans->journal_res.seq = wb->journal_seq; - ret = bch2_btree_iter_traverse(trans, &iter) ?: + return bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, &wb->k, BTREE_UPDATE_internal_snapshot_node); - 
bch2_trans_iter_exit(trans, &iter); - return ret; } static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb) @@ -256,19 +254,17 @@ static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb) bch2_btree_write_buffer_journal_flush); if (j->watermark) { - spin_lock(&j->lock); + guard(spinlock)(&j->lock); bch2_journal_set_watermark(j); - spin_unlock(&j->lock); } BUG_ON(wb->sorted.size < wb->flushing.keys.nr); } -int bch2_btree_write_buffer_insert_err(struct btree_trans *trans, +int bch2_btree_write_buffer_insert_err(struct bch_fs *c, enum btree_id btree, struct bkey_i *k) { - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); prt_printf(&buf, "attempting to do write buffer update on non wb btree="); bch2_btree_id_to_text(&buf, btree); @@ -276,7 +272,6 @@ int bch2_btree_write_buffer_insert_err(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); bch2_fs_inconsistent(c, "%s", buf.buf); - printbuf_exit(&buf); return -EROFS; } @@ -285,7 +280,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) struct bch_fs *c = trans->c; struct journal *j = &c->journal; struct btree_write_buffer *wb = &c->btree_write_buffer; - struct btree_iter iter = {}; + struct btree_iter iter = { NULL }; size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0; bool write_locked = false; bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags); @@ -298,9 +293,8 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) bch2_trans_unlock(trans); bch2_trans_begin(trans); - mutex_lock(&wb->inc.lock); - move_keys_from_inc_to_flushing(wb); - mutex_unlock(&wb->inc.lock); + scoped_guard(mutex, &wb->inc.lock) + move_keys_from_inc_to_flushing(wb); for (size_t i = 0; i < wb->flushing.keys.nr; i++) { wb->sorted.data[i].idx = i; @@ -328,10 +322,9 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) darray_for_each(wb->sorted, i) { struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; - if (unlikely(!btree_type_uses_write_buffer(k->btree))) { - ret = bch2_btree_write_buffer_insert_err(trans, k->btree, &k->k); + ret = bch2_btree_write_buffer_insert_checks(c, k->btree, &k->k); + if (unlikely(ret)) goto err; - } for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++) prefetch(&wb->flushing.keys.data[n->idx]); @@ -368,7 +361,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) write_locked = false; ret = lockrestart_do(trans, - bch2_btree_iter_traverse(trans, &iter) ?: + bch2_btree_iter_traverse(&iter) ?: bch2_foreground_maybe_merge(trans, iter.path, 0, BCH_WATERMARK_reclaim| BCH_TRANS_COMMIT_journal_reclaim| @@ -380,18 +373,18 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) } if (!iter.path || iter.btree_id != k->btree) { - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p, BTREE_ITER_intent|BTREE_ITER_all_snapshots); } - bch2_btree_iter_set_pos(trans, &iter, k->k.k.p); + bch2_btree_iter_set_pos(&iter, k->k.k.p); btree_iter_path(trans, &iter)->preserve = false; bool accounting_accumulated = false; do { if (race_fault()) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; + ret = bch_err_throw(c, journal_reclaim_would_deadlock); break; } @@ -414,7 +407,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) struct btree_path *path = btree_iter_path(trans, 
&iter); bch2_btree_node_unlock_write(trans, path, path->l[0].b); } - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); if (ret) goto err; @@ -532,9 +525,8 @@ static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq) ret = bch2_journal_keys_to_write_buffer(c, buf); if (!blocked && !ret) { - spin_lock(&j->lock); + guard(spinlock)(&j->lock); buf->need_flush_to_write_buffer = false; - spin_unlock(&j->lock); } mutex_unlock(&j->buf_lock); @@ -566,9 +558,8 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 max_seq, * On memory allocation failure, bch2_btree_write_buffer_flush_locked() * is not guaranteed to empty wb->inc: */ - mutex_lock(&wb->flushing.lock); - ret = bch2_btree_write_buffer_flush_locked(trans); - mutex_unlock(&wb->flushing.lock); + scoped_guard(mutex, &wb->flushing.lock) + ret = bch2_btree_write_buffer_flush_locked(trans); } while (!ret && (fetch_from_journal_err || (wb->inc.pin.seq && wb->inc.pin.seq <= max_seq) || @@ -581,9 +572,10 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j, struct journal_entry_pin *_pin, u64 seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + CLASS(btree_trans, trans)(c); bool did_work = false; - return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq, &did_work)); + return btree_write_buffer_flush_seq(trans, seq, &did_work); } int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) @@ -605,9 +597,9 @@ bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *c) if (bch2_journal_error(&c->journal)) return false; + CLASS(btree_trans, trans)(c); bool did_work = false; - bch2_trans_run(c, btree_write_buffer_flush_seq(trans, - journal_cur_seq(&c->journal), &did_work)); + btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal), &did_work); return did_work; } @@ -629,11 +621,11 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) { struct bch_fs *c = trans->c; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer)) - return -BCH_ERR_erofs_no_writes; + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer)) + return bch_err_throw(c, erofs_no_writes); int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans); - bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); return ret; } @@ -654,11 +646,10 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) { if (trace_write_buffer_maybe_flush_enabled()) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_bkey_val_to_text(&buf, c, referring_k); trace_write_buffer_maybe_flush(trans, _RET_IP_, buf.buf); - printbuf_exit(&buf); } bch2_bkey_buf_reassemble(&tmp, c, referring_k); @@ -673,7 +664,10 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, goto err; bch2_bkey_buf_copy(last_flushed, c, tmp.k); - ret = -BCH_ERR_transaction_restart_write_buffer_flush; + + /* can we avoid the unconditional restart? 
*/ + trace_and_count(c, trans_restart_write_buffer_flush, trans, _RET_IP_); + ret = bch_err_throw(c, transaction_restart_write_buffer_flush); } err: bch2_bkey_buf_exit(&tmp, c); @@ -686,13 +680,14 @@ static void bch2_btree_write_buffer_flush_work(struct work_struct *work) struct btree_write_buffer *wb = &c->btree_write_buffer; int ret; - mutex_lock(&wb->flushing.lock); - do { - ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans)); - } while (!ret && bch2_btree_write_buffer_should_flush(c)); - mutex_unlock(&wb->flushing.lock); + scoped_guard(mutex, &wb->flushing.lock) { + CLASS(btree_trans, trans)(c); + do { + ret = bch2_btree_write_buffer_flush_locked(trans); + } while (!ret && bch2_btree_write_buffer_should_flush(c)); + } - bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); } static void wb_accounting_sort(struct btree_write_buffer *wb) @@ -821,9 +816,9 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_ bch2_journal_pin_drop(&c->journal, &dst->wb->pin); if (bch2_btree_write_buffer_should_flush(c) && - __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) && + __enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer) && !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); if (dst->wb == &wb->flushing) mutex_unlock(&wb->flushing.lock); @@ -866,13 +861,18 @@ void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) darray_exit(&wb->inc.keys); } -int bch2_fs_btree_write_buffer_init(struct bch_fs *c) +void bch2_fs_btree_write_buffer_init_early(struct bch_fs *c) { struct btree_write_buffer *wb = &c->btree_write_buffer; mutex_init(&wb->inc.lock); mutex_init(&wb->flushing.lock); INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work); +} + +int bch2_fs_btree_write_buffer_init(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; /* Will be resized by journal as needed: */ unsigned initial_size = 1 << 16; diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h index d535cea28bde..e484cd6b90b0 100644 --- a/fs/bcachefs/btree_write_buffer.h +++ b/fs/bcachefs/btree_write_buffer.h @@ -89,6 +89,10 @@ static inline int bch2_journal_key_to_wb(struct bch_fs *c, struct journal_keys_to_wb *dst, enum btree_id btree, struct bkey_i *k) { + int ret = bch2_btree_write_buffer_insert_checks(c, btree, k); + if (unlikely(ret)) + return ret; + EBUG_ON(!dst->seq); return k->k.type == KEY_TYPE_accounting @@ -101,6 +105,7 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_t int bch2_btree_write_buffer_resize(struct bch_fs *, size_t); void bch2_fs_btree_write_buffer_exit(struct bch_fs *); +void bch2_fs_btree_write_buffer_init_early(struct bch_fs *); int bch2_fs_btree_write_buffer_init(struct bch_fs *); #endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 31fbc2716d8b..87a6f4dce296 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -71,13 +71,8 @@ __bch2_fs_usage_read_short(struct bch_fs *c) struct bch_fs_usage_short bch2_fs_usage_read_short(struct bch_fs *c) { - struct bch_fs_usage_short ret; - - percpu_down_read(&c->mark_lock); - ret = __bch2_fs_usage_read_short(c); - percpu_up_read(&c->mark_lock); - - return ret; + guard(percpu_read)(&c->mark_lock); + return __bch2_fs_usage_read_short(c); 
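/*
 * [Editorial aside, not part of the patch: the conversion just above
 * shows a second benefit of the cleanup helpers beyond brevity. The
 * percpu rwsem is released by scope cleanup only after the return
 * expression has been evaluated, so the temporary the old code needed,
 *
 *	struct bch_fs_usage_short ret;
 *	percpu_down_read(&c->mark_lock);
 *	ret = __bch2_fs_usage_read_short(c);
 *	percpu_up_read(&c->mark_lock);
 *	return ret;
 *
 * collapses to a direct return under guard(percpu_read), with the
 * lock still held across the call it protects.]
 */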
} void bch2_dev_usage_to_text(struct printbuf *out, @@ -113,10 +108,10 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, bool *do_update) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = 0; - struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); + CLASS(bch2_dev_tryget, ca)(c, p.ptr.dev); if (!ca) { if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID, trans, ptr_to_invalid_device, @@ -138,7 +133,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) *do_update = true; - goto out; + return 0; } enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); @@ -156,10 +151,14 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, g->gen_valid = true; g->gen = p.ptr.gen; } else { + /* this pointer will be dropped */ *do_update = true; + return 0; } } + /* g->gen_valid == true */ + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, trans, ptr_gen_newer_than_bucket_gen, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" @@ -172,15 +171,13 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, if (!p.ptr.cached && (g->data_type != BCH_DATA_btree || data_type == BCH_DATA_btree)) { - g->gen_valid = true; - g->gen = p.ptr.gen; - g->data_type = 0; + g->data_type = data_type; g->stripe_sectors = 0; g->dirty_sectors = 0; g->cached_sectors = 0; - } else { - *do_update = true; } + + *do_update = true; } if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, @@ -206,7 +203,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, *do_update = true; if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) - goto out; + return 0; if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type), trans, ptr_bucket_data_type_mismatch, @@ -217,9 +214,21 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, bch2_data_type_str(data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (data_type == BCH_DATA_btree) { - g->gen_valid = true; - g->gen = p.ptr.gen; + if (!p.ptr.cached && + data_type == BCH_DATA_btree) { + switch (g->data_type) { + case BCH_DATA_sb: + bch_err(c, "btree and superblock in the same bucket - cannot repair"); + return bch_err_throw(c, fsck_repair_unimplemented); + case BCH_DATA_journal: + ret = bch2_dev_journal_bucket_delete(ca, PTR_BUCKET_NR(ca, &p.ptr)); + bch_err_msg(c, ret, "error deleting journal bucket %zu", + PTR_BUCKET_NR(ca, &p.ptr)); + if (ret) + return ret; + break; + } + g->data_type = data_type; g->stripe_sectors = 0; g->dirty_sectors = 0; @@ -250,10 +259,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, k), buf.buf))) *do_update = true; } -out: fsck_err: - bch2_dev_put(ca); - printbuf_exit(&buf); return ret; } @@ -266,30 +272,26 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, const union bch_extent_entry *entry_c; struct extent_ptr_decoded p = { 0 }; bool do_update = false; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = 0; + /* We don't yet do btree key updates correctly for when we're RW */ + BUG_ON(test_bit(BCH_FS_rw, &c->flags)); + bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update); if (ret) - goto err; + return ret; } if (do_update) { - if (flags & BTREE_TRIGGER_is_root) { - bch_err(c, "cannot update btree roots yet"); - ret = -EINVAL; - goto err; - } - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); ret = 
PTR_ERR_OR_ZERO(new); if (ret) - goto err; + return ret; - rcu_read_lock(); - bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev)); - rcu_read_unlock(); + scoped_guard(rcu) + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev)); if (level) { /* @@ -298,14 +300,11 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, * sort it out: */ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - rcu_read_lock(); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - struct bucket *g = PTR_GC_BUCKET(ca, ptr); - - ptr->gen = g->gen; - } - rcu_read_unlock(); + scoped_guard(rcu) + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + ptr->gen = PTR_GC_BUCKET(ca, ptr)->gen; + } } else { struct bkey_ptrs ptrs; union bch_extent_entry *entry; @@ -369,52 +368,76 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, bch_info(c, "new key %s", buf.buf); } - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, - BTREE_ITER_intent|BTREE_ITER_all_snapshots); - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, new, - BTREE_UPDATE_internal_snapshot_node| - BTREE_TRIGGER_norun); - bch2_trans_iter_exit(trans, &iter); - if (ret) - goto err; + if (!(flags & BTREE_TRIGGER_is_root)) { + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, + BTREE_ITER_intent|BTREE_ITER_all_snapshots); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, new, + BTREE_UPDATE_internal_snapshot_node| + BTREE_TRIGGER_norun); + bch2_trans_iter_exit(&iter); + if (ret) + return ret; + + if (level) + bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); + } else { + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, + jset_u64s(new->k.u64s)); + ret = PTR_ERR_OR_ZERO(e); + if (ret) + return ret; - if (level) - bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); + journal_entry_set(e, + BCH_JSET_ENTRY_btree_root, + btree, level - 1, + new, new->k.u64s); + + /* + * no locking, we're single threaded and not rw yet, see + * the big assertion above that we repeat here: + */ + BUG_ON(test_bit(BCH_FS_rw, &c->flags)); + + struct btree *b = bch2_btree_id_root(c, btree)->b; + bkey_copy(&b->key, new); + } } -err: - printbuf_exit(&buf); - return ret; + + return 0; } static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf, struct bkey_s_c k, bool insert, enum bch_sb_error_id id) { struct bch_fs *c = trans->c; - bool repeat = false, print = true, suppress = false; prt_printf(buf, "\nwhile marking "); bch2_bkey_val_to_text(buf, c, k); prt_newline(buf); - __bch2_count_fsck_err(c, id, buf->buf, &repeat, &print, &suppress); + bool print = __bch2_count_fsck_err(c, id, buf); - int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + int ret = bch2_run_explicit_recovery_pass(c, buf, + BCH_RECOVERY_PASS_check_allocations, 0); if (insert) { - print = true; - suppress = false; - bch2_trans_updates_to_text(buf, trans); __bch2_inconsistent_error(c, buf); - ret = -BCH_ERR_bucket_ref_update; + /* + * If we're in recovery, run_explicit_recovery_pass might give + * us an error code for rewinding recovery + */ + if (!ret) + ret = bch_err_throw(c, bucket_ref_update); + } else { + /* Always ignore overwrite errors, so that deletion works */ + ret = 0; } - if (suppress) - prt_printf(buf, "Ratelimiting new instances of previous error\n"); - if (print) -
bch2_print_string_as_lines(KERN_ERR, buf->buf); + if (print || insert) + bch2_print_str(c, KERN_ERR, buf->buf); return ret; } @@ -427,9 +450,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, { struct bch_fs *c = trans->c; size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bool inserting = sectors > 0; - int ret = 0; BUG_ON(!sectors); @@ -441,9 +463,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen); - ret = bucket_ref_update_err(trans, &buf, k, inserting, - BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen); - goto out; + return bucket_ref_update_err(trans, &buf, k, inserting, + BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen); } if (unlikely(gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX)) { @@ -454,15 +475,12 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen); - ret = bucket_ref_update_err(trans, &buf, k, inserting, - BCH_FSCK_ERR_ptr_too_stale); - goto out; + return bucket_ref_update_err(trans, &buf, k, inserting, + BCH_FSCK_ERR_ptr_too_stale); } - if (b_gen != ptr->gen && ptr->cached) { - ret = 1; - goto out; - } + if (b_gen != ptr->gen && ptr->cached) + return 1; if (unlikely(b_gen != ptr->gen)) { bch2_log_msg_start(c, &buf); @@ -473,9 +491,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen); - ret = bucket_ref_update_err(trans, &buf, k, inserting, - BCH_FSCK_ERR_stale_dirty_ptr); - goto out; + return bucket_ref_update_err(trans, &buf, k, inserting, + BCH_FSCK_ERR_stale_dirty_ptr); } if (unlikely(bucket_data_type_mismatch(bucket_data_type, ptr_data_type))) { @@ -485,9 +502,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, bch2_data_type_str(bucket_data_type), bch2_data_type_str(ptr_data_type)); - ret = bucket_ref_update_err(trans, &buf, k, inserting, + return bucket_ref_update_err(trans, &buf, k, inserting, BCH_FSCK_ERR_ptr_bucket_data_type_mismatch); - goto out; } if (unlikely((u64) *bucket_sectors + sectors > U32_MAX)) { @@ -498,16 +514,13 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, bch2_data_type_str(bucket_data_type ?: ptr_data_type), *bucket_sectors, sectors); - ret = bucket_ref_update_err(trans, &buf, k, inserting, - BCH_FSCK_ERR_bucket_sector_count_overflow); sectors = -*bucket_sectors; - goto out; + return bucket_ref_update_err(trans, &buf, k, inserting, + BCH_FSCK_ERR_bucket_sector_count_overflow); } *bucket_sectors += sectors; -out: - printbuf_exit(&buf); - return ret; + return 0; } void bch2_trans_account_disk_usage_change(struct btree_trans *trans) @@ -517,7 +530,7 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans) static int warned_disk_usage = 0; bool warn = false; - percpu_down_read(&c->mark_lock); + guard(percpu_read)(&c->mark_lock); struct bch_fs_usage_base *src = &trans->fs_usage_delta; s64 added = src->btree + src->data + src->reserved; @@ -545,11 +558,10 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans) this_cpu_sub(*c->online_reserved, added); } - preempt_disable(); - struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); - acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); - preempt_enable(); - percpu_up_read(&c->mark_lock); + scoped_guard(preempt) { + struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); + 
acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); + } if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) bch2_trans_inconsistent(trans, @@ -588,40 +600,34 @@ static int bch2_trigger_pointer(struct btree_trans *trans, { struct bch_fs *c = trans->c; bool insert = !(flags & BTREE_TRIGGER_overwrite); - struct printbuf buf = PRINTBUF; - int ret = 0; + CLASS(printbuf, buf)(); struct bkey_i_backpointer bp; bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp); *sectors = insert ? bp.v.bucket_len : -(s64) bp.v.bucket_len; - struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); + CLASS(bch2_dev_tryget, ca)(c, p.ptr.dev); if (unlikely(!ca)) { if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID) - ret = -BCH_ERR_trigger_pointer; - goto err; + return bch_err_throw(c, trigger_pointer); + return 0; } struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); if (!bucket_valid(ca, bucket.offset)) { if (insert) { bch2_dev_bucket_missing(ca, bucket.offset); - ret = -BCH_ERR_trigger_pointer; + return bch_err_throw(c, trigger_pointer); } - goto err; + return 0; } if (flags & BTREE_TRIGGER_transactional) { struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); - ret = PTR_ERR_OR_ZERO(a) ?: - __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert); - if (ret) - goto err; - - ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); - if (ret) - goto err; + return PTR_ERR_OR_ZERO(a) ?: + __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert) ?: + bch2_bucket_backpointer_mod(trans, k, &bp, insert); } if (flags & BTREE_TRIGGER_gc) { @@ -629,23 +635,22 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", p.ptr.dev, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -BCH_ERR_trigger_pointer; - goto err; + return bch_err_throw(c, trigger_pointer); } bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; - ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert); + int ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert); alloc_to_bucket(g, new); bucket_unlock(g); - if (!ret) - ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); + if (ret) + return ret; + + return bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); } -err: - bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; + + return 0; } static int bch2_trigger_stripe_ptr(struct btree_trans *trans, @@ -655,25 +660,26 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, s64 sectors, enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; + if (flags & BTREE_TRIGGER_transactional) { - struct btree_iter iter; - struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_stripes, POS(0, p.ec.idx), - BTREE_ITER_with_updates, stripe); + struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, + BTREE_ID_stripes, POS(0, p.ec.idx), + BTREE_ITER_with_updates, + stripe); int ret = PTR_ERR_OR_ZERO(s); if (unlikely(ret)) { bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, "pointer to nonexistent stripe %llu", (u64) p.ec.idx); - goto err; + return ret; } if (!bch2_ptr_matches_stripe(&s->v, p)) { bch2_trans_inconsistent(trans, "stripe pointer doesn't match stripe %llu", (u64) p.ec.idx); - ret = -BCH_ERR_trigger_stripe_pointer; - goto err; + return bch_err_throw(c, trigger_stripe_pointer); } stripe_blockcount_set(&s->v, p.ec.block, @@ -685,35 +691,29 @@ 
static int bch2_trigger_stripe_ptr(struct btree_trans *trans, acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); acc.replicas.data_type = data_type; - ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); } if (flags & BTREE_TRIGGER_gc) { - struct bch_fs *c = trans->c; - struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL); if (!m) { bch_err(c, "error allocating memory for gc_stripes, idx %llu", (u64) p.ec.idx); - return -BCH_ERR_ENOMEM_mark_stripe_ptr; + return bch_err_throw(c, ENOMEM_mark_stripe_ptr); } gc_stripe_lock(m); if (!m || !m->alive) { gc_stripe_unlock(m); - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_log_msg_start(c, &buf); prt_printf(&buf, "pointer to nonexistent stripe %llu\n while marking ", (u64) p.ec.idx); bch2_bkey_val_to_text(&buf, c, k); __bch2_inconsistent_error(c, &buf); - bch2_print_string_as_lines(KERN_ERR, buf.buf); - printbuf_exit(&buf); - return -BCH_ERR_trigger_stripe_pointer; + bch2_print_str(c, KERN_ERR, buf.buf); + return bch_err_throw(c, trigger_stripe_pointer); } m->block_sectors[p.ec.block] += sectors; @@ -736,8 +736,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, static int __trigger_extent(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags, - s64 *replicas_sectors) + enum btree_iter_update_trigger_flags flags) { bool gc = flags & BTREE_TRIGGER_gc; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -748,6 +747,8 @@ static int __trigger_extent(struct btree_trans *trans, : BCH_DATA_user; int ret = 0; + s64 replicas_sectors = 0; + struct disk_accounting_pos acc_replicas_key; memset(&acc_replicas_key, 0, sizeof(acc_replicas_key)); acc_replicas_key.type = BCH_DISK_ACCOUNTING_replicas; @@ -774,7 +775,7 @@ static int __trigger_extent(struct btree_trans *trans, if (ret) return ret; } else if (!p.has_ec) { - *replicas_sectors += disk_sectors; + replicas_sectors += disk_sectors; replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev); } else { ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); @@ -812,13 +813,13 @@ static int __trigger_extent(struct btree_trans *trans, } if (acc_replicas_key.replicas.nr_devs) { - ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc); if (ret) return ret; } if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) { - ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot); + ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, snapshot, k.k->p.snapshot); if (ret) return ret; } @@ -834,7 +835,7 @@ static int __trigger_extent(struct btree_trans *trans, } if (level) { - ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id); + ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, btree, btree_id); if (ret) return ret; } else { bool insert = !(flags & BTREE_TRIGGER_overwrite); s64 v[3] = { insert ? 1 : -1, insert ?
k.k->size : -((s64) k.k->size), - *replicas_sectors, + replicas_sectors, }; ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode); if (ret) @@ -875,20 +876,16 @@ int bch2_trigger_extent(struct btree_trans *trans, return 0; if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - s64 old_replicas_sectors = 0, new_replicas_sectors = 0; - if (old.k->type) { int ret = __trigger_extent(trans, btree, level, old, - flags & ~BTREE_TRIGGER_insert, - &old_replicas_sectors); + flags & ~BTREE_TRIGGER_insert); if (ret) return ret; } if (new.k->type) { int ret = __trigger_extent(trans, btree, level, new.s_c, - flags & ~BTREE_TRIGGER_overwrite, - &new_replicas_sectors); + flags & ~BTREE_TRIGGER_overwrite); if (ret) return ret; } @@ -966,15 +963,24 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, return PTR_ERR(a); if (a->v.data_type && type && a->v.data_type != type) { - bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); - log_fsck_err(trans, bucket_metadata_type_mismatch, - "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" - "while marking %s", - iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_type_str(a->v.data_type), - bch2_data_type_str(type), - bch2_data_type_str(type)); - ret = -BCH_ERR_metadata_bucket_inconsistency; + CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s\n", + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_type_str(a->v.data_type), + bch2_data_type_str(type), + bch2_data_type_str(type)); + + bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf); + + ret = bch2_run_explicit_recovery_pass(c, &buf, + BCH_RECOVERY_PASS_check_allocations, 0); + + /* Always print, this is always fatal */ + bch2_print_str(c, KERN_ERR, buf.buf); + if (!ret) + ret = bch_err_throw(c, metadata_bucket_inconsistency); goto err; } @@ -985,8 +991,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, ret = bch2_trans_update(trans, &iter, &a->k_i, 0); } err: -fsck_err: - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } @@ -995,7 +1000,6 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; - int ret = 0; struct bucket *g = gc_bucket(ca, b); if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s", @@ -1023,12 +1027,11 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * g->dirty_sectors += sectors; struct bch_alloc_v4 new = bucket_m_to_alloc(*g); bucket_unlock(g); - ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); - return ret; + return bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); err_unlock: bucket_unlock(g); err: - return -BCH_ERR_metadata_bucket_inconsistency; + return bch_err_throw(c, metadata_bucket_inconsistency); } int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, @@ -1086,10 +1089,10 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *c enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; + struct bch_sb_layout layout; - mutex_lock(&c->sb_lock); - struct bch_sb_layout layout = ca->disk_sb.sb->layout; - mutex_unlock(&c->sb_lock); + scoped_guard(mutex, &c->sb_lock) + layout = ca->disk_sb.sb->layout; u64 bucket = 0; unsigned i, bucket_sectors = 0; 
@@ -1134,8 +1137,8 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *c int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca, enum btree_iter_update_trigger_flags flags) { - int ret = bch2_trans_run(c, - __bch2_trans_mark_dev_sb(trans, ca, flags)); + CLASS(btree_trans, trans)(c); + int ret = __bch2_trans_mark_dev_sb(trans, ca, flags); bch_err_fn(c, ret); return ret; } @@ -1143,10 +1146,10 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca, int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c, enum btree_iter_update_trigger_flags flags) { - for_each_online_member(c, ca) { + for_each_online_member(c, ca, BCH_DEV_READ_REF_trans_mark_dev_sbs) { int ret = bch2_trans_mark_dev_sb(c, ca, flags); if (ret) { - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_trans_mark_dev_sbs); return ret; } } @@ -1188,15 +1191,38 @@ bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b) #define SECTORS_CACHE 1024 +static int disk_reservation_recalc_sectors_available(struct bch_fs *c, + struct disk_reservation *res, + u64 sectors, enum bch_reservation_flags flags) +{ + guard(mutex)(&c->sectors_available_lock); + + percpu_u64_set(&c->pcpu->sectors_available, 0); + u64 sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); + + if (sectors_available && (flags & BCH_DISK_RESERVATION_PARTIAL)) + sectors = min(sectors, sectors_available); + + if (sectors <= sectors_available || + (flags & BCH_DISK_RESERVATION_NOFAIL)) { + atomic64_set(&c->sectors_available, + max_t(s64, 0, sectors_available - sectors)); + this_cpu_add(*c->online_reserved, sectors); + res->sectors += sectors; + return 0; + } else { + atomic64_set(&c->sectors_available, sectors_available); + return bch_err_throw(c, ENOSPC_disk_reservation); + } +} + int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, u64 sectors, enum bch_reservation_flags flags) { struct bch_fs_pcpu *pcpu; u64 old, get; - u64 sectors_available; - int ret; - percpu_down_read(&c->mark_lock); + guard(percpu_read)(&c->mark_lock); preempt_disable(); pcpu = this_cpu_ptr(c->pcpu); @@ -1207,9 +1233,10 @@ int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, do { get = min((u64) sectors + SECTORS_CACHE, old); - if (get < sectors) { + if (unlikely(get < sectors)) { preempt_enable(); - goto recalculate; + return disk_reservation_recalc_sectors_available(c, + res, sectors, flags); } } while (!atomic64_try_cmpxchg(&c->sectors_available, &old, old - get)); @@ -1220,36 +1247,8 @@ int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, pcpu->sectors_available -= sectors; this_cpu_add(*c->online_reserved, sectors); res->sectors += sectors; - preempt_enable(); - percpu_up_read(&c->mark_lock); return 0; - -recalculate: - mutex_lock(&c->sectors_available_lock); - - percpu_u64_set(&c->pcpu->sectors_available, 0); - sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); - - if (sectors_available && (flags & BCH_DISK_RESERVATION_PARTIAL)) - sectors = min(sectors, sectors_available); - - if (sectors <= sectors_available || - (flags & BCH_DISK_RESERVATION_NOFAIL)) { - atomic64_set(&c->sectors_available, - max_t(s64, 0, sectors_available - sectors)); - this_cpu_add(*c->online_reserved, sectors); - res->sectors += sectors; - ret = 0; - } else { - atomic64_set(&c->sectors_available, sectors_available); - ret = -BCH_ERR_ENOSPC_disk_reservation; - } - - mutex_unlock(&c->sectors_available_lock); - 
percpu_up_read(&c->mark_lock); - - return ret; } /* Startup/shutdown: */ @@ -1272,7 +1271,7 @@ int bch2_buckets_nouse_alloc(struct bch_fs *c) GFP_KERNEL|__GFP_ZERO); if (!ca->buckets_nouse) { bch2_dev_put(ca); - return -BCH_ERR_ENOMEM_buckets_nouse; + return bch_err_throw(c, ENOMEM_buckets_nouse); } } @@ -1297,12 +1296,12 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) lockdep_assert_held(&c->state_lock); if (resize && ca->buckets_nouse) - return -BCH_ERR_no_resize_with_buckets_nouse; + return bch_err_throw(c, no_resize_with_buckets_nouse); bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets), GFP_KERNEL|__GFP_ZERO); if (!bucket_gens) { - ret = -BCH_ERR_ENOMEM_bucket_gens; + ret = bch_err_throw(c, ENOMEM_bucket_gens); goto err; } @@ -1321,6 +1320,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) sizeof(bucket_gens->b[0]) * copy); } + ret = bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_mismatch, + ca->mi.nbuckets, nbuckets) ?: + bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_empty, + ca->mi.nbuckets, nbuckets); + rcu_assign_pointer(ca->bucket_gens, bucket_gens); bucket_gens = old_bucket_gens; @@ -1345,7 +1349,7 @@ int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { ca->usage = alloc_percpu(struct bch_dev_usage_full); if (!ca->usage) - return -BCH_ERR_ENOMEM_usage_init; + return bch_err_throw(c, ENOMEM_usage_init); return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index af1532de4a37..49a3807a5eab 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -84,10 +84,8 @@ static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b) static inline int bucket_gen_get(struct bch_dev *ca, size_t b) { - rcu_read_lock(); - int ret = bucket_gen_get_rcu(ca, b); - rcu_read_unlock(); - return ret; + guard(rcu)(); + return bucket_gen_get_rcu(ca, b); } static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, @@ -156,10 +154,8 @@ static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ */ static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { - rcu_read_lock(); - int ret = dev_ptr_stale_rcu(ca, ptr); - rcu_read_unlock(); - return ret; + guard(rcu)(); + return dev_ptr_stale_rcu(ca, ptr); } /* Device usage: */ diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c index c8a488e6b7b8..ca341586920b 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.c +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -25,25 +25,20 @@ static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_ u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *b, unsigned dev, u64 bucket) { - struct buckets_waiting_for_journal_table *t; u64 dev_bucket = (u64) dev << 56 | bucket; - u64 ret = 0; - mutex_lock(&b->lock); - t = b->t; + guard(mutex)(&b->lock); + + struct buckets_waiting_for_journal_table *t = b->t; for (unsigned i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); - if (h->dev_bucket == dev_bucket) { - ret = h->journal_seq; - break; - } + if (h->dev_bucket == dev_bucket) + return h->journal_seq; } - mutex_unlock(&b->lock); - - return ret; + return 0; } static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t, @@ -92,12 +87,11 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, .journal_seq = 
journal_seq, }; size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0, nr_rehashes_this_size = 0; - int ret = 0; - mutex_lock(&b->lock); + guard(mutex)(&b->lock); if (likely(bucket_table_insert(b->t, &new, flushed_seq))) - goto out; + return 0; t = b->t; size = 1UL << t->bits; @@ -108,8 +102,8 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, realloc: n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); if (!n) { - ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set; - goto out; + struct bch_fs *c = container_of(b, struct bch_fs, buckets_waiting_for_journal); + return bch_err_throw(c, ENOMEM_buckets_waiting_for_journal_set); } retry_rehash: @@ -142,10 +136,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, pr_debug("took %zu rehashes, table at %zu/%lu elements", nr_rehashes, nr_elements, 1UL << b->t->bits); -out: - mutex_unlock(&b->lock); - - return ret; + return 0; } void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 5891b3a1e61c..467fc45e84fe 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -52,6 +52,11 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, return ca; } +DEFINE_CLASS(bch2_device_lookup, struct bch_dev *, + bch2_dev_put(_T), + bch2_device_lookup(c, dev, flags), + struct bch_fs *c, u64 dev, unsigned flags); + #if 0 static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) { @@ -207,8 +212,6 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) { - struct bch_dev *ca; - if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -219,7 +222,7 @@ static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) arg.pad) return -EINVAL; - ca = bch2_device_lookup(c, arg.dev, arg.flags); + struct bch_dev *ca = bch2_device_lookup(c, arg.dev, arg.flags); if (IS_ERR(ca)) return PTR_ERR(ca); @@ -249,9 +252,6 @@ static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) { - struct bch_dev *ca; - int ret; - if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -262,21 +262,16 @@ static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) arg.pad) return -EINVAL; - ca = bch2_device_lookup(c, arg.dev, arg.flags); + CLASS(bch2_device_lookup, ca)(c, arg.dev, arg.flags); if (IS_ERR(ca)) return PTR_ERR(ca); - ret = bch2_dev_offline(c, ca, arg.flags); - bch2_dev_put(ca); - return ret; + return bch2_dev_offline(c, ca, arg.flags); } static long bch2_ioctl_disk_set_state(struct bch_fs *c, struct bch_ioctl_disk_set_state arg) { - struct bch_dev *ca; - int ret; - if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -288,15 +283,12 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c, arg.new_state >= BCH_MEMBER_STATE_NR) return -EINVAL; - ca = bch2_device_lookup(c, arg.dev, arg.flags); + CLASS(bch2_device_lookup, ca)(c, arg.dev, arg.flags); if (IS_ERR(ca)) return PTR_ERR(ca); - ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); - if (ret) - bch_err(c, "Error setting device state: %s", bch2_err_str(ret)); - - bch2_dev_put(ca); + int ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); + bch_err_msg(ca, ret, "setting device state"); return ret; } @@ -312,13 +304,14 @@ static int bch2_data_thread(void *arg) { struct bch_data_ctx *ctx = 
container_of(arg, struct bch_data_ctx, thr); - ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); + ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, &ctx->arg); if (ctx->thr.ret == -BCH_ERR_device_offline) ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline; else { ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done; ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done; } + enumerated_ref_put(&ctx->c->writes, BCH_WRITE_REF_ioctl_data); return 0; } @@ -348,14 +341,13 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, }; if (ctx->arg.op == BCH_DATA_OP_scrub) { - struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev); + CLASS(bch2_dev_tryget_noerror, ca)(c, ctx->arg.scrub.dev); if (ca) { struct bch_dev_usage_full u; bch2_dev_usage_full_read_fast(ca, &u); for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++) if (ctx->arg.scrub.data_types & BIT(i)) e.p.sectors_total += u.d[i].sectors; - bch2_dev_put(ca); } } else { e.p.sectors_total = bch2_fs_usage_read_short(c).used; @@ -378,15 +370,24 @@ static long bch2_ioctl_data(struct bch_fs *c, struct bch_data_ctx *ctx; int ret; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_ioctl_data)) + return -EROFS; - if (arg.op >= BCH_DATA_OP_NR || arg.flags) - return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto put_ref; + } + + if (arg.op >= BCH_DATA_OP_NR || arg.flags) { + ret = -EINVAL; + goto put_ref; + } ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; + if (!ctx) { + ret = -ENOMEM; + goto put_ref; + } ctx->c = c; ctx->arg = arg; @@ -395,17 +396,21 @@ static long bch2_ioctl_data(struct bch_fs *c, &bcachefs_data_ops, bch2_data_thread); if (ret < 0) - kfree(ctx); + goto cleanup; + return ret; +cleanup: + kfree(ctx); +put_ref: + enumerated_ref_put(&c->writes, BCH_WRITE_REF_ioctl_data); return ret; } -static long bch2_ioctl_fs_usage(struct bch_fs *c, +static noinline_for_stack long bch2_ioctl_fs_usage(struct bch_fs *c, struct bch_ioctl_fs_usage __user *user_arg) { struct bch_ioctl_fs_usage arg = {}; - darray_char replicas = {}; + CLASS(darray_char, replicas)(); u32 replica_entries_bytes; - int ret = 0; if (!test_bit(BCH_FS_started, &c->flags)) return -EINVAL; @@ -413,11 +418,11 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) return -EFAULT; - ret = bch2_fs_replicas_usage_read(c, &replicas) ?: + int ret = bch2_fs_replicas_usage_read(c, &replicas) ?: (replica_entries_bytes < replicas.nr ? 
-ERANGE : 0) ?: copy_to_user_errcode(&user_arg->replicas, replicas.data, replicas.nr); if (ret) - goto err; + return ret; struct bch_fs_usage_short u = bch2_fs_usage_read_short(c); arg.capacity = c->capacity; @@ -434,52 +439,41 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, &arg.persistent_reserved[i], 1); } - ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); -err: - darray_exit(&replicas); - return ret; + return copy_to_user_errcode(user_arg, &arg, sizeof(arg)); } static long bch2_ioctl_query_accounting(struct bch_fs *c, struct bch_ioctl_query_accounting __user *user_arg) { struct bch_ioctl_query_accounting arg; - darray_char accounting = {}; - int ret = 0; + CLASS(darray_char, accounting)(); if (!test_bit(BCH_FS_started, &c->flags)) return -EINVAL; - ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)) ?: + int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)) ?: bch2_fs_accounting_read(c, &accounting, arg.accounting_types_mask) ?: (arg.accounting_u64s * sizeof(u64) < accounting.nr ? -ERANGE : 0) ?: copy_to_user_errcode(&user_arg->accounting, accounting.data, accounting.nr); if (ret) - goto err; + return ret; arg.capacity = c->capacity; arg.used = bch2_fs_usage_read_short(c).used; arg.online_reserved = percpu_u64_get(c->online_reserved); arg.accounting_u64s = accounting.nr / sizeof(u64); - ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); -err: - darray_exit(&accounting); - return ret; + return copy_to_user_errcode(user_arg, &arg, sizeof(arg)); } /* obsolete, didn't allow for new data types: */ -static long bch2_ioctl_dev_usage(struct bch_fs *c, +static noinline_for_stack long bch2_ioctl_dev_usage(struct bch_fs *c, struct bch_ioctl_dev_usage __user *user_arg) { - struct bch_ioctl_dev_usage arg; - struct bch_dev_usage_full src; - struct bch_dev *ca; - unsigned i; - if (!test_bit(BCH_FS_started, &c->flags)) return -EINVAL; + struct bch_ioctl_dev_usage arg; if (copy_from_user(&arg, user_arg, sizeof(arg))) return -EFAULT; @@ -489,38 +483,32 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, arg.pad[2]) return -EINVAL; - ca = bch2_device_lookup(c, arg.dev, arg.flags); + CLASS(bch2_device_lookup, ca)(c, arg.dev, arg.flags); if (IS_ERR(ca)) return PTR_ERR(ca); - src = bch2_dev_usage_full_read(ca); + struct bch_dev_usage_full src = bch2_dev_usage_full_read(ca); arg.state = ca->mi.state; arg.bucket_size = ca->mi.bucket_size; arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; - for (i = 0; i < ARRAY_SIZE(arg.d); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(arg.d); i++) { arg.d[i].buckets = src.d[i].buckets; arg.d[i].sectors = src.d[i].sectors; arg.d[i].fragmented = src.d[i].fragmented; } - bch2_dev_put(ca); - return copy_to_user_errcode(user_arg, &arg, sizeof(arg)); } static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, struct bch_ioctl_dev_usage_v2 __user *user_arg) { - struct bch_ioctl_dev_usage_v2 arg; - struct bch_dev_usage_full src; - struct bch_dev *ca; - int ret = 0; - if (!test_bit(BCH_FS_started, &c->flags)) return -EINVAL; + struct bch_ioctl_dev_usage_v2 arg; if (copy_from_user(&arg, user_arg, sizeof(arg))) return -EFAULT; @@ -530,20 +518,20 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, arg.pad[2]) return -EINVAL; - ca = bch2_device_lookup(c, arg.dev, arg.flags); + CLASS(bch2_device_lookup, ca)(c, arg.dev, arg.flags); if (IS_ERR(ca)) return PTR_ERR(ca); - src = bch2_dev_usage_full_read(ca); + struct bch_dev_usage_full src = bch2_dev_usage_full_read(ca); arg.state = ca->mi.state; arg.bucket_size = ca->mi.bucket_size; arg.nr_data_types = 
min(arg.nr_data_types, BCH_DATA_NR); arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; - ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); + int ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); if (ret) - goto err; + return ret; for (unsigned i = 0; i < arg.nr_data_types; i++) { struct bch_ioctl_dev_usage_type t = { @@ -554,11 +542,10 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, ret = copy_to_user_errcode(&user_arg->d[i], &t, sizeof(t)); if (ret) - goto err; + return ret; } -err: - bch2_dev_put(ca); - return ret; + + return 0; } static long bch2_ioctl_read_super(struct bch_fs *c, @@ -575,13 +562,13 @@ static long bch2_ioctl_read_super(struct bch_fs *c, arg.pad) return -EINVAL; - mutex_lock(&c->sb_lock); + guard(mutex)(&c->sb_lock); if (arg.flags & BCH_READ_DEV) { ca = bch2_device_lookup(c, arg.dev, arg.flags); ret = PTR_ERR_OR_ZERO(ca); if (ret) - goto err_unlock; + return ret; sb = ca->disk_sb.sb; } else { @@ -597,8 +584,6 @@ static long bch2_ioctl_read_super(struct bch_fs *c, vstruct_bytes(sb)); err: bch2_dev_put(ca); -err_unlock: - mutex_unlock(&c->sb_lock); return ret; } @@ -613,21 +598,17 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, if (!dev) return -EINVAL; - for_each_online_member(c, ca) - if (ca->dev == dev) { - percpu_ref_put(&ca->io_ref[READ]); + guard(rcu)(); + for_each_online_member_rcu(c, ca) + if (ca->dev == dev) return ca->dev_idx; - } - return -BCH_ERR_ENOENT_dev_idx_not_found; + return bch_err_throw(c, ENOENT_dev_idx_not_found); } static long bch2_ioctl_disk_resize(struct bch_fs *c, struct bch_ioctl_disk_resize arg) { - struct bch_dev *ca; - int ret; - if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -635,22 +616,16 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c, arg.pad) return -EINVAL; - ca = bch2_device_lookup(c, arg.dev, arg.flags); + CLASS(bch2_device_lookup, ca)(c, arg.dev, arg.flags); if (IS_ERR(ca)) return PTR_ERR(ca); - ret = bch2_dev_resize(c, ca, arg.nbuckets); - - bch2_dev_put(ca); - return ret; + return bch2_dev_resize(c, ca, arg.nbuckets); } static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, struct bch_ioctl_disk_resize_journal arg) { - struct bch_dev *ca; - int ret; - if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -661,14 +636,11 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, if (arg.nbuckets > U32_MAX) return -EINVAL; - ca = bch2_device_lookup(c, arg.dev, arg.flags); + CLASS(bch2_device_lookup, ca)(c, arg.dev, arg.flags); if (IS_ERR(ca)) return PTR_ERR(ca); - ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); - - bch2_dev_put(ca); - return ret; + return bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); } #define BCH_IOCTL(_name, _argtype) \ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index d0a34a097b80..b1b78643d1d0 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -106,8 +106,8 @@ static void bch2_chacha20_init(u32 state[CHACHA_STATE_WORDS], memzero_explicit(key_words, sizeof(key_words)); } -static void bch2_chacha20(const struct bch_key *key, struct nonce nonce, - void *data, size_t len) +void bch2_chacha20(const struct bch_key *key, struct nonce nonce, + void *data, size_t len) { u32 state[CHACHA_STATE_WORDS]; @@ -173,7 +173,7 @@ int bch2_encrypt(struct bch_fs *c, unsigned type, if (bch2_fs_inconsistent_on(!c->chacha20_key_set, c, "attempting to encrypt without encryption key")) - return -BCH_ERR_no_encryption_key; + return bch_err_throw(c, no_encryption_key); bch2_chacha20(&c->chacha20_key, nonce, data, len); return 0; @@ -262,7 +262,7
@@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, if (bch2_fs_inconsistent_on(!c->chacha20_key_set, c, "attempting to encrypt without encryption key")) - return -BCH_ERR_no_encryption_key; + return bch_err_throw(c, no_encryption_key); bch2_chacha20_init(chacha_state, &c->chacha20_key, nonce); @@ -361,7 +361,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, extent_nonce(version, crc_old), bio); if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n" " expected %0llx:%0llx got %0llx:%0llx (old type ", __func__, @@ -374,8 +374,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, bch2_prt_csum_type(&buf, new_csum_type); prt_str(&buf, ")"); WARN_RATELIMIT(1, "%s", buf.buf); - printbuf_exit(&buf); - return -BCH_ERR_recompute_checksum; + return bch_err_throw(c, recompute_checksum); } for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { @@ -438,23 +437,21 @@ const struct bch_sb_field_ops bch_sb_field_ops_crypt = { #ifdef __KERNEL__ static int __bch2_request_key(char *key_description, struct bch_key *key) { - struct key *keyring_key; - const struct user_key_payload *ukp; int ret; - keyring_key = request_key(&key_type_user, key_description, NULL); + struct key *keyring_key = request_key(&key_type_user, key_description, NULL); if (IS_ERR(keyring_key)) return PTR_ERR(keyring_key); - down_read(&keyring_key->sem); - ukp = dereference_key_locked(keyring_key); - if (ukp->datalen == sizeof(*key)) { - memcpy(key, ukp->data, ukp->datalen); - ret = 0; - } else { - ret = -EINVAL; + scoped_guard(rwsem_read, &keyring_key->sem) { + const struct user_key_payload *ukp = dereference_key_locked(keyring_key); + if (ukp->datalen == sizeof(*key)) { + memcpy(key, ukp->data, ukp->datalen); + ret = 0; + } else { + ret = -EINVAL; + } } - up_read(&keyring_key->sem); key_put(keyring_key); return ret; @@ -495,14 +492,13 @@ static int __bch2_request_key(char *key_description, struct bch_key *key) int bch2_request_key(struct bch_sb *sb, struct bch_key *key) { - struct printbuf key_description = PRINTBUF; + CLASS(printbuf, key_description)(); int ret; prt_printf(&key_description, "bcachefs:"); pr_uuid(&key_description, sb->user_uuid.b); ret = __bch2_request_key(key_description.buf, key); - printbuf_exit(&key_description); #ifndef __KERNEL__ if (ret) { @@ -524,13 +520,12 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key) int bch2_revoke_key(struct bch_sb *sb) { key_serial_t key_id; - struct printbuf key_description = PRINTBUF; + CLASS(printbuf, key_description)(); prt_printf(&key_description, "bcachefs:"); pr_uuid(&key_description, sb->user_uuid.b); key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING); - printbuf_exit(&key_description); if (key_id < 0) return errno; @@ -584,34 +579,28 @@ int bch2_decrypt_sb_key(struct bch_fs *c, */ int bch2_disable_encryption(struct bch_fs *c) { - struct bch_sb_field_crypt *crypt; - struct bch_key key; - int ret = -EINVAL; - - mutex_lock(&c->sb_lock); + guard(mutex)(&c->sb_lock); - crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); + struct bch_sb_field_crypt *crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); if (!crypt) - goto out; + return -EINVAL; /* is key encrypted? 
*/ - ret = 0; if (bch2_key_is_encrypted(&crypt->key)) - goto out; + return 0; - ret = bch2_decrypt_sb_key(c, crypt, &key); + struct bch_key key; + int ret = bch2_decrypt_sb_key(c, crypt, &key); if (ret) - goto out; + return ret; crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC); crypt->key.key = key; SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); bch2_write_super(c); -out: - mutex_unlock(&c->sb_lock); - - return ret; + return 0; } /* @@ -625,7 +614,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) struct bch_sb_field_crypt *crypt; int ret = -EINVAL; - mutex_lock(&c->sb_lock); + guard(mutex)(&c->sb_lock); /* Do we already have an encryption key? */ if (bch2_sb_field_get(c->disk_sb.sb, crypt)) @@ -659,7 +648,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) crypt = bch2_sb_field_resize(&c->disk_sb, crypt, sizeof(*crypt) / sizeof(u64)); if (!crypt) { - ret = -BCH_ERR_ENOSPC_sb_crypt; + ret = bch_err_throw(c, ENOSPC_sb_crypt); goto err; } @@ -669,7 +658,6 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1); bch2_write_super(c); err: - mutex_unlock(&c->sb_lock); memzero_explicit(&user_key, sizeof(user_key)); memzero_explicit(&key, sizeof(key)); return ret; } diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 1310782d3ae9..7bd9cf6104ca 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -69,6 +69,8 @@ static inline void bch2_csum_err_msg(struct printbuf *out, bch2_csum_to_text(out, type, expected); } +void bch2_chacha20(const struct bch_key *, struct nonce, void *, size_t); + int bch2_request_key(struct bch_sb *, struct bch_key *); #ifndef __KERNEL__ int bch2_revoke_key(struct bch_sb *); diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index d6dd12d74d4f..1c6d0cdca3c5 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -40,20 +40,17 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) { - spin_lock(&clock->timer_lock); + guard(spinlock)(&clock->timer_lock); for (size_t i = 0; i < clock->timers.nr; i++) if (clock->timers.data[i] == timer) { min_heap_del(&clock->timers, i, &callbacks, NULL); - break; + return; } - - spin_unlock(&clock->timer_lock); } struct io_clock_wait { struct io_timer io_timer; - struct timer_list cpu_timer; struct task_struct *task; int expired; }; @@ -67,15 +64,6 @@ static void io_clock_wait_fn(struct io_timer *timer) wake_up_process(wait->task); } -static void io_clock_cpu_timeout(struct timer_list *timer) -{ - struct io_clock_wait *wait = container_of(timer, - struct io_clock_wait, cpu_timer); - - wait->expired = 1; - wake_up_process(wait->task); -} - void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until) { struct io_clock_wait wait = { @@ -90,8 +78,8 @@ void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until) bch2_io_timer_del(clock, &wait.io_timer); } -void bch2_kthread_io_clock_wait(struct io_clock *clock, - u64 io_until, unsigned long cpu_timeout) +unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *clock, + u64 io_until, unsigned long cpu_timeout) { bool kthread = (current->flags & PF_KTHREAD) != 0; struct io_clock_wait wait = { @@ -103,27 +91,26 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, bch2_io_timer_add(clock, &wait.io_timer); - timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); - - if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) - mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); - - do { -
set_current_state(TASK_INTERRUPTIBLE); - if (kthread && kthread_should_stop()) - break; - - if (wait.expired) - break; - - schedule(); + set_current_state(TASK_INTERRUPTIBLE); + if (!(kthread && kthread_should_stop())) { + cpu_timeout = schedule_timeout(cpu_timeout); try_to_freeze(); - } while (0); + } __set_current_state(TASK_RUNNING); - timer_delete_sync(&wait.cpu_timer); - destroy_timer_on_stack(&wait.cpu_timer); bch2_io_timer_del(clock, &wait.io_timer); + return cpu_timeout; +} + +void bch2_kthread_io_clock_wait(struct io_clock *clock, + u64 io_until, unsigned long cpu_timeout) +{ + bool kthread = (current->flags & PF_KTHREAD) != 0; + + while (!(kthread && kthread_should_stop()) && + cpu_timeout && + atomic64_read(&clock->now) < io_until) + cpu_timeout = bch2_kthread_io_clock_wait_once(clock, io_until, cpu_timeout); } static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now) @@ -144,28 +131,27 @@ void __bch2_increment_clock(struct io_clock *clock, u64 sectors) struct io_timer *timer; u64 now = atomic64_add_return(sectors, &clock->now); - spin_lock(&clock->timer_lock); + guard(spinlock)(&clock->timer_lock); + while ((timer = get_expired_timer(clock, now))) timer->fn(timer); - spin_unlock(&clock->timer_lock); } void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) { - out->atomic++; - spin_lock(&clock->timer_lock); u64 now = atomic64_read(&clock->now); printbuf_tabstop_push(out, 40); prt_printf(out, "current time:\t%llu\n", now); + guard(printbuf_atomic)(out); + guard(spinlock)(&clock->timer_lock); + for (unsigned i = 0; i < clock->timers.nr; i++) prt_printf(out, "%ps %ps:\t%llu\n", clock->timers.data[i]->fn, clock->timers.data[i]->fn2, clock->timers.data[i]->expire); - spin_unlock(&clock->timer_lock); - --out->atomic; } void bch2_io_clock_exit(struct io_clock *clock) diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h index 82c79c8baf92..8769be2aa21e 100644 --- a/fs/bcachefs/clock.h +++ b/fs/bcachefs/clock.h @@ -4,6 +4,7 @@ void bch2_io_timer_add(struct io_clock *, struct io_timer *); void bch2_io_timer_del(struct io_clock *, struct io_timer *); +unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *, u64, unsigned long); void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long); void __bch2_increment_clock(struct io_clock *, u64); diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 28ed32449913..aeb9b9bd7d33 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -187,7 +187,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, __bch2_compression_types[crc.compression_type])) ret = bch2_check_set_has_compressed_data(c, opt); else - ret = -BCH_ERR_compression_workspace_not_initialized; + ret = bch_err_throw(c, compression_workspace_not_initialized); if (ret) goto err; } @@ -200,7 +200,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data, src_len, dst_len, dst_len); if (ret2 != dst_len) - ret = -BCH_ERR_decompress_lz4; + ret = bch_err_throw(c, decompress_lz4); break; case BCH_COMPRESSION_TYPE_gzip: { z_stream strm = { @@ -219,7 +219,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, mempool_free(workspace, workspace_pool); if (ret2 != Z_STREAM_END) - ret = -BCH_ERR_decompress_gzip; + ret = bch_err_throw(c, decompress_gzip); break; } case BCH_COMPRESSION_TYPE_zstd: { @@ -227,7 +227,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, size_t real_src_len = le32_to_cpup(src_data.b); if 
(real_src_len > src_len - 4) { - ret = -BCH_ERR_decompress_zstd_src_len_bad; + ret = bch_err_throw(c, decompress_zstd_src_len_bad); goto err; } @@ -241,7 +241,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, mempool_free(workspace, workspace_pool); if (ret2 != dst_len) - ret = -BCH_ERR_decompress_zstd; + ret = bch_err_throw(c, decompress_zstd); break; } default: @@ -270,7 +270,7 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op, bch2_write_op_error(op, op->pos.offset, "extent too big to decompress (%u > %u)", crc->uncompressed_size << 9, c->opts.encoded_extent_max); - return -BCH_ERR_decompress_exceeded_max_encoded_extent; + return bch_err_throw(c, decompress_exceeded_max_encoded_extent); } data = __bounce_alloc(c, dst_len, WRITE); @@ -314,7 +314,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || crc.compressed_size << 9 > c->opts.encoded_extent_max) - return -BCH_ERR_decompress_exceeded_max_encoded_extent; + return bch_err_throw(c, decompress_exceeded_max_encoded_extent); dst_data = dst_len == dst_iter.bi_size ? __bio_map_or_bounce(c, dst, dst_iter, WRITE) @@ -336,7 +336,7 @@ static int attempt_compress(struct bch_fs *c, void *workspace, void *dst, size_t dst_len, void *src, size_t src_len, - struct bch_compression_opt compression) + union bch_compression_opt compression) { enum bch_compression_type compression_type = __bch2_compression_opt_to_type[compression.type]; @@ -426,7 +426,7 @@ static int attempt_compress(struct bch_fs *c, static unsigned __bio_compress(struct bch_fs *c, struct bio *dst, size_t *dst_len, struct bio *src, size_t *src_len, - struct bch_compression_opt compression) + union bch_compression_opt compression) { struct bbuf src_data = { NULL }, dst_data = { NULL }; void *workspace; @@ -553,7 +553,7 @@ unsigned bch2_bio_compress(struct bch_fs *c, compression_type = __bio_compress(c, dst, dst_len, src, src_len, - bch2_compression_decode(compression_opt)); + (union bch_compression_opt){ .value = compression_opt }); dst->bi_iter.bi_size = orig_dst; src->bi_iter.bi_size = orig_src; @@ -579,30 +579,25 @@ static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) if ((c->sb.features & f) == f) return 0; - mutex_lock(&c->sb_lock); + guard(mutex)(&c->sb_lock); - if ((c->sb.features & f) == f) { - mutex_unlock(&c->sb_lock); + if ((c->sb.features & f) == f) return 0; - } ret = __bch2_fs_compress_init(c, c->sb.features|f); - if (ret) { - mutex_unlock(&c->sb_lock); + if (ret) return ret; - } c->disk_sb.sb->features[0] |= cpu_to_le64(f); bch2_write_super(c); - mutex_unlock(&c->sb_lock); - return 0; } int bch2_check_set_has_compressed_data(struct bch_fs *c, unsigned compression_opt) { - unsigned compression_type = bch2_compression_decode(compression_opt).type; + unsigned int compression_type = ((union bch_compression_opt){ .value = compression_opt }) + .type; BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); @@ -656,12 +651,12 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) if (!mempool_initialized(&c->compression_bounce[READ]) && mempool_init_kvmalloc_pool(&c->compression_bounce[READ], 1, c->opts.encoded_extent_max)) - return -BCH_ERR_ENOMEM_compression_bounce_read_init; + return bch_err_throw(c, ENOMEM_compression_bounce_read_init); if (!mempool_initialized(&c->compression_bounce[WRITE]) && mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE], 1, c->opts.encoded_extent_max)) - return 
-BCH_ERR_ENOMEM_compression_bounce_write_init; + return bch_err_throw(c, ENOMEM_compression_bounce_write_init); for (i = compression_types; i < compression_types + ARRAY_SIZE(compression_types); @@ -675,7 +670,7 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) if (mempool_init_kvmalloc_pool( &c->compress_workspace[i->type], 1, i->compress_workspace)) - return -BCH_ERR_ENOMEM_compression_workspace_init; + return bch_err_throw(c, ENOMEM_compression_workspace_init); } return 0; @@ -683,7 +678,7 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) static u64 compression_opt_to_feature(unsigned v) { - unsigned type = bch2_compression_decode(v).type; + unsigned int type = ((union bch_compression_opt){ .value = v }).type; return BIT_ULL(bch2_compression_opt_to_feature[type]); } @@ -703,7 +698,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, { char *val = kstrdup(_val, GFP_KERNEL); char *p = val, *type_str, *level_str; - struct bch_compression_opt opt = { 0 }; + union bch_compression_opt opt = { 0 }; int ret; if (!val) @@ -714,7 +709,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, ret = match_string(bch2_compression_opts, -1, type_str); if (ret < 0 && err) - prt_str(err, "invalid compression type"); + prt_printf(err, "invalid compression type\n"); if (ret < 0) goto err; @@ -729,14 +724,14 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, if (!ret && level > 15) ret = -EINVAL; if (ret < 0 && err) - prt_str(err, "invalid compression level"); + prt_printf(err, "invalid compression level\n"); if (ret < 0) goto err; opt.level = level; } - *res = bch2_compression_encode(opt); + *res = opt.value; err: kfree(val); return ret; @@ -744,7 +739,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, void bch2_compression_opt_to_text(struct printbuf *out, u64 v) { - struct bch_compression_opt opt = bch2_compression_decode(v); + union bch_compression_opt opt = { .value = v }; if (opt.type < BCH_COMPRESSION_OPT_NR) prt_str(out, bch2_compression_opts[opt.type]); diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h index bec2f05bfd52..667ddb91d47a 100644 --- a/fs/bcachefs/compress.h +++ b/fs/bcachefs/compress.h @@ -10,41 +10,27 @@ static const unsigned __bch2_compression_opt_to_type[] = { #undef x }; -struct bch_compression_opt { - u8 type:4, - level:4; -}; - -static inline struct bch_compression_opt __bch2_compression_decode(unsigned v) -{ - return (struct bch_compression_opt) { - .type = v & 15, - .level = v >> 4, +union bch_compression_opt { + u8 value; + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + u8 type:4, level:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + u8 level:4, type:4; +#endif }; -} +}; static inline bool bch2_compression_opt_valid(unsigned v) { - struct bch_compression_opt opt = __bch2_compression_decode(v); + union bch_compression_opt opt = { .value = v }; return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level); } -static inline struct bch_compression_opt bch2_compression_decode(unsigned v) -{ - return bch2_compression_opt_valid(v) - ? 
__bch2_compression_decode(v) : (struct bch_compression_opt) { 0 }; } -static inline unsigned bch2_compression_encode(struct bch_compression_opt opt) -{ - return opt.type|(opt.level << 4); -} - static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) { - return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; + return __bch2_compression_opt_to_type[((union bch_compression_opt){ .value = v }).type]; } struct bch_write_op; diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index c6151495985f..4080ee99aadd 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -8,6 +8,7 @@ * Inspired by CCAN's darray */ +#include <linux/cleanup.h> #include <linux/slab.h> #define DARRAY_PREALLOCATED(_type, _nr) \ @@ -20,7 +21,18 @@ struct { \ #define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) typedef DARRAY(char) darray_char; -typedef DARRAY(char *) darray_str; +typedef DARRAY(char *) darray_str; +typedef DARRAY(const char *) darray_const_str; + +typedef DARRAY(u8) darray_u8; +typedef DARRAY(u16) darray_u16; +typedef DARRAY(u32) darray_u32; +typedef DARRAY(u64) darray_u64; + +typedef DARRAY(s8) darray_s8; +typedef DARRAY(s16) darray_s16; +typedef DARRAY(s32) darray_s32; +typedef DARRAY(s64) darray_s64; int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); @@ -76,7 +88,23 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); #define darray_remove_item(_d, _pos) \ array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data) -#define __darray_for_each(_d, _i) \ +#define darray_find_p(_d, _i, cond) \ +({ \ + typeof((_d).data) _ret = NULL; \ + \ + darray_for_each(_d, _i) \ + if (cond) { \ + _ret = _i; \ + break; \ + } \ + _ret; \ +}) + +#define darray_find(_d, _item) darray_find_p(_d, _i, *_i == _item) + +/* Iteration: */ + +#define __darray_for_each(_d, _i) \ for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++) #define darray_for_each(_d, _i) \ @@ -85,6 +113,8 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); #define darray_for_each_reverse(_d, _i) \ for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i) +/* Init/exit */ + #define darray_init(_d) \ do { \ (_d)->nr = 0; \ @@ -100,4 +130,29 @@ do { \ darray_init(_d); \ } while (0) +#define DEFINE_DARRAY_CLASS(_type) \ +DEFINE_CLASS(_type, _type, darray_exit(&(_T)), (_type) {}, void) + +#define DEFINE_DARRAY(_type) \ +typedef DARRAY(_type) darray_##_type; \ +DEFINE_DARRAY_CLASS(darray_##_type) + +#define DEFINE_DARRAY_NAMED(_name, _type) \ +typedef DARRAY(_type) _name; \ +DEFINE_DARRAY_CLASS(_name) + +DEFINE_DARRAY_CLASS(darray_char); +DEFINE_DARRAY_CLASS(darray_str) +DEFINE_DARRAY_CLASS(darray_const_str) + +DEFINE_DARRAY_CLASS(darray_u8) +DEFINE_DARRAY_CLASS(darray_u16) +DEFINE_DARRAY_CLASS(darray_u32) +DEFINE_DARRAY_CLASS(darray_u64) + +DEFINE_DARRAY_CLASS(darray_s8) +DEFINE_DARRAY_CLASS(darray_s16) +DEFINE_DARRAY_CLASS(darray_s32) +DEFINE_DARRAY_CLASS(darray_s64) + #endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index b211c97238ab..01838a3a189d 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -66,46 +66,56 @@ static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k) } } -static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k) +static noinline_for_stack +bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs, + const struct bch_extent_ptr *start) { - struct bkey_ptrs_c ptrs =
bch2_bkey_ptrs_c(k); + if (!ctxt) { + bkey_for_each_ptr(ptrs, ptr) { + if (ptr == start) + break; + + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); + } + return false; + } - bkey_for_each_ptr(ptrs, ptr) { + __bkey_for_each_ptr(start, ptrs.end, ptr) { struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - if (ctxt) { - bool locked; - - move_ctxt_wait_event(ctxt, - (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) || - list_empty(&ctxt->ios)); + bool locked; + move_ctxt_wait_event(ctxt, + (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) || + list_empty(&ctxt->ios)); + if (!locked) + bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); + } + return true; +} - if (!locked) - bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); - } else { - if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) { - bkey_for_each_ptr(ptrs, ptr2) { - if (ptr2 == ptr) - break; +static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs) +{ + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - ca = bch2_dev_have_ref(c, ptr2->dev); - bucket = PTR_BUCKET_POS(ca, ptr2); - bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); - } - return false; - } - } + if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) + return __bkey_nocow_lock(c, ctxt, ptrs, ptr); } + return true; } -static noinline void trace_io_move_finish2(struct data_update *u, - struct bkey_i *new, - struct bkey_i *insert) +noinline_for_stack +static void trace_io_move_finish2(struct data_update *u, + struct bkey_i *new, + struct bkey_i *insert) { struct bch_fs *c = u->op.c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); prt_newline(&buf); @@ -121,9 +131,9 @@ static noinline void trace_io_move_finish2(struct data_update *u, prt_newline(&buf); trace_io_move_finish(c, buf.buf); - printbuf_exit(&buf); } +noinline_for_stack static void trace_io_move_fail2(struct data_update *m, struct bkey_s_c new, struct bkey_s_c wrote, @@ -132,7 +142,7 @@ static void trace_io_move_fail2(struct data_update *m, { struct bch_fs *c = m->op.c; struct bkey_s_c old = bkey_i_to_s_c(m->k.k); - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); unsigned rewrites_found = 0; if (!trace_io_move_fail_enabled()) @@ -176,27 +186,83 @@ static void trace_io_move_fail2(struct data_update *m, } trace_io_move_fail(c, buf.buf); - printbuf_exit(&buf); +} + +noinline_for_stack +static void trace_data_update2(struct data_update *m, + struct bkey_s_c old, struct bkey_s_c k, + struct bkey_i *insert) +{ + struct bch_fs *c = m->op.c; + CLASS(printbuf, buf)(); + + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + trace_data_update(c, buf.buf); +} + +noinline_for_stack +static void trace_io_move_created_rebalance2(struct data_update *m, + struct bkey_s_c old, struct bkey_s_c k, + struct bkey_i *insert) +{ + struct bch_fs *c = m->op.c; + CLASS(printbuf, buf)(); + + bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); + + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + 
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + trace_io_move_created_rebalance(c, buf.buf); + + this_cpu_inc(c->counters[BCH_COUNTER_io_move_created_rebalance]); +} + +noinline_for_stack +static int data_update_invalid_bkey(struct data_update *m, + struct bkey_s_c old, struct bkey_s_c k, + struct bkey_i *insert) +{ + struct bch_fs *c = m->op.c; + CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); + + prt_str(&buf, "about to insert invalid key in data update path"); + prt_printf(&buf, "\nop.nonce: %u", m->op.nonce); + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + prt_newline(&buf); + + bch2_fs_emergency_read_only2(c, &buf); + + bch2_print_str(c, KERN_ERR, buf.buf); + + return bch_err_throw(c, invalid_bkey); } static int __bch2_data_update_index_update(struct btree_trans *trans, struct bch_write_op *op) { struct bch_fs *c = op->c; - struct btree_iter iter; - struct data_update *m = - container_of(op, struct data_update, op); - struct keylist *keys = &op->insert_keys; - struct bkey_buf _new, _insert; - struct printbuf journal_msg = PRINTBUF; + struct data_update *m = container_of(op, struct data_update, op); int ret = 0; - bch2_bkey_buf_init(&_new); - bch2_bkey_buf_init(&_insert); - bch2_bkey_buf_realloc(&_insert, c, U8_MAX); - - bch2_trans_iter_init(trans, &iter, m->btree_id, - bkey_start_pos(&bch2_keylist_front(keys)->k), + CLASS(btree_iter, iter)(trans, m->btree_id, + bkey_start_pos(&bch2_keylist_front(&op->insert_keys)->k), BTREE_ITER_slots|BTREE_ITER_intent); while (1) { @@ -216,24 +282,35 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, bch2_trans_begin(trans); - k = bch2_btree_iter_peek_slot(trans, &iter); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; - new = bkey_i_to_extent(bch2_keylist_front(keys)); + new = bkey_i_to_extent(bch2_keylist_front(&op->insert_keys)); if (!bch2_extents_match(k, old)) { trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), - NULL, "no match:"); + NULL, "no match:"); goto nowork; } - bkey_reassemble(_insert.k, k); - insert = _insert.k; + insert = bch2_trans_kmalloc(trans, + bkey_bytes(k.k) + + bkey_val_bytes(&new->k) + + sizeof(struct bch_extent_rebalance)); + ret = PTR_ERR_OR_ZERO(insert); + if (ret) + goto err; + + bkey_reassemble(insert, k); + + new = bch2_trans_kmalloc(trans, bkey_bytes(&new->k)); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto err; - bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); - new = bkey_i_to_extent(_new.k); + bkey_copy(&new->k_i, bch2_keylist_front(&op->insert_keys)); bch2_cut_front(iter.pos, &new->k_i); bch2_cut_front(iter.pos, insert); @@ -294,21 +371,21 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i)); /* Now, drop excess replicas: */ - rcu_read_lock(); + scoped_guard(rcu) { restart_drop_extra_replicas: - bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { - unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { + unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); - if (!p.ptr.cached && - durability - ptr_durability >= m->op.opts.data_replicas) { - durability -= ptr_durability; + if (!p.ptr.cached && + durability - ptr_durability >= m->op.opts.data_replicas) { + durability -= 
ptr_durability; - bch2_extent_ptr_set_cached(c, &m->op.opts, - bkey_i_to_s(insert), &entry->ptr); - goto restart_drop_extra_replicas; + bch2_extent_ptr_set_cached(c, &m->op.opts, + bkey_i_to_s(insert), &entry->ptr); + goto restart_drop_extra_replicas; + } } } - rcu_read_unlock(); /* Finally, add the pointers we just wrote: */ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) @@ -346,44 +423,12 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, .btree = m->btree_id, .flags = BCH_VALIDATE_commit, }); - if (invalid) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "about to insert invalid key in data update path"); - prt_printf(&buf, "\nop.nonce: %u", m->op.nonce); - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - - bch2_print_string_as_lines(KERN_ERR, buf.buf); - printbuf_exit(&buf); - - bch2_fatal_error(c); - ret = -BCH_ERR_invalid_bkey; + if (unlikely(invalid)) { + ret = data_update_invalid_bkey(m, old, k, insert); goto out; } - if (trace_data_update_enabled()) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - - trace_data_update(c, buf.buf); - printbuf_exit(&buf); - } - - printbuf_reset(&journal_msg); - prt_str(&journal_msg, bch2_data_update_type_strs[m->type]); - - ret = bch2_trans_log_msg(trans, &journal_msg) ?: + ret = bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?: bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, bkey_start_pos(&insert->k)) ?: @@ -391,28 +436,39 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, k.k->p, insert->k.p) ?: bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?: bch2_trans_update(trans, &iter, insert, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_trans_commit(trans, &op->res, + BTREE_UPDATE_internal_snapshot_node); + if (ret) + goto err; + + if (trace_data_update_enabled()) + trace_data_update2(m, old, k, insert); + + if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size > + bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size) + trace_io_move_created_rebalance2(m, old, k, insert); + + ret = bch2_trans_commit(trans, &op->res, NULL, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc| m->data_opts.btree_insert_flags); - if (!ret) { - bch2_btree_iter_set_pos(trans, &iter, next_pos); + if (ret) + goto err; - this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); - if (trace_io_move_finish_enabled()) - trace_io_move_finish2(m, &new->k_i, insert); - } + bch2_btree_iter_set_pos(&iter, next_pos); + + this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); + if (trace_io_move_finish_enabled()) + trace_io_move_finish2(m, &new->k_i, insert); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ret = 0; if (ret) break; next: - while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) { - bch2_keylist_pop_front(keys); - if (bch2_keylist_empty(keys)) + while (bkey_ge(iter.pos, bch2_keylist_front(&op->insert_keys)->k.p)) { + bch2_keylist_pop_front(&op->insert_keys); + if (bch2_keylist_empty(&op->insert_keys)) goto out; } continue; @@ -426,21 +482,18 @@ static int __bch2_data_update_index_update(struct 
btree_trans *trans, count_event(c, io_move_fail); - bch2_btree_iter_advance(trans, &iter); + bch2_btree_iter_advance(&iter); goto next; } out: - printbuf_exit(&journal_msg); - bch2_trans_iter_exit(trans, &iter); - bch2_bkey_buf_exit(&_insert, c); - bch2_bkey_buf_exit(&_new, c); BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); return ret; } int bch2_data_update_index_update(struct bch_write_op *op) { - return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op)); + CLASS(btree_trans, trans)(op->c); + return __bch2_data_update_index_update(trans, op); } void bch2_data_update_read_done(struct data_update *m) @@ -474,8 +527,9 @@ void bch2_data_update_exit(struct data_update *update) bch2_bkey_buf_exit(&update->k, c); } -static int bch2_update_unwritten_extent(struct btree_trans *trans, - struct data_update *update) +static noinline_for_stack +int bch2_update_unwritten_extent(struct btree_trans *trans, + struct data_update *update) { struct bch_fs *c = update->op.c; struct bkey_i_extent *e; @@ -497,10 +551,10 @@ static int bch2_update_unwritten_extent(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos, BTREE_ITER_slots); ret = lockrestart_do(trans, ({ - k = bch2_btree_iter_peek_slot(trans, &iter); + k = bch2_btree_iter_peek_slot(&iter); bkey_err(k); })); - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k))) break; @@ -587,6 +641,10 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, prt_str_indented(out, "extra replicas:\t"); prt_u64(out, data_opts->extra_replicas); + prt_newline(out); + + prt_str_indented(out, "scrub:\t"); + prt_u64(out, data_opts->scrub); } void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) @@ -607,9 +665,17 @@ void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update prt_newline(out); printbuf_indent_add(out, 2); bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); - prt_printf(out, "read_done:\t%u\n", m->read_done); - bch2_write_op_to_text(out, &m->op); - printbuf_indent_sub(out, 2); + + if (!m->read_done) { + prt_printf(out, "read:\n"); + printbuf_indent_add(out, 2); + bch2_read_bio_to_text(out, m->op.c, &m->rbio); + } else { + prt_printf(out, "write:\n"); + printbuf_indent_add(out, 2); + bch2_write_op_to_text(out, &m->op); + } + printbuf_indent_sub(out, 4); } int bch2_extent_drop_ptrs(struct btree_trans *trans, @@ -655,18 +721,10 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } -int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, - struct bch_io_opts *io_opts) +static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, + struct bch_io_opts *io_opts, + unsigned buf_bytes) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - /* write path might have to decompress data: */ - unsigned buf_bytes = 0; - bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry) - buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); - unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); @@ -690,11 +748,26 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, return 0; } +int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, + struct 
bch_io_opts *io_opts) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + /* write path might have to decompress data: */ + unsigned buf_bytes = 0; + bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry) + buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); + + return __bch2_data_update_bios_init(m, c, io_opts, buf_bytes); +} + static int can_write_extent(struct bch_fs *c, struct data_update *m) { if ((m->op.flags & BCH_WRITE_alloc_nowait) && unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) - return -BCH_ERR_data_update_done_would_block; + return bch_err_throw(c, data_update_done_would_block); unsigned target = m->op.flags & BCH_WRITE_only_specified_devs ? m->op.target @@ -704,27 +777,38 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m) darray_for_each(m->op.devs_have, i) __clear_bit(*i, devs.d); - rcu_read_lock(); + CLASS(printbuf, buf)(); + + guard(printbuf_atomic)(&buf); + guard(rcu)(); + unsigned nr_replicas = 0, i; for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { - struct bch_dev *ca = bch2_dev_rcu(c, i); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, i); + if (!ca) + continue; struct bch_dev_usage usage; bch2_dev_usage_read_fast(ca, &usage); - if (!dev_buckets_free(ca, usage, m->op.watermark)) + u64 nr_free = dev_buckets_free(ca, usage, m->op.watermark); + + prt_printf(&buf, "%s=%llu ", ca->name, nr_free); + + if (!nr_free) continue; nr_replicas += ca->mi.durability; if (nr_replicas >= m->op.nr_replicas) break; } - rcu_read_unlock(); - if (!nr_replicas) - return -BCH_ERR_data_update_done_no_rw_devs; + if (!nr_replicas) { + trace_data_update_done_no_rw_devs(c, buf.buf); + return bch_err_throw(c, data_update_done_no_rw_devs); + } if (nr_replicas < m->op.nr_replicas) - return -BCH_ERR_insufficient_devices; + return bch_err_throw(c, insufficient_devices); return 0; } @@ -739,19 +823,21 @@ int bch2_data_update_init(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned reserve_sectors = k.k->size * data_opts.extra_replicas; int ret = 0; - /* - * fs is corrupt we have a key for a snapshot node that doesn't exist, - * and we have to check for this because we go rw before repairing the - * snapshots table - just skip it, we can move it later. 
- */ - if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) - return -BCH_ERR_data_update_done_no_snapshot; + if (k.k->p.snapshot) { + ret = bch2_check_key_has_snapshot(trans, iter, k); + if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) { + /* Can't repair yet, waiting on other recovery passes */ + return bch_err_throw(c, data_update_done_no_snapshot); + } + if (ret < 0) + return ret; + if (ret) /* key was deleted */ + return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + bch_err_throw(c, data_update_done_no_snapshot); + ret = 0; + } bch2_bkey_buf_init(&m->k); bch2_bkey_buf_reassemble(&m->k, c, k); @@ -779,10 +865,17 @@ int bch2_data_update_init(struct btree_trans *trans, unsigned durability_have = 0, durability_removing = 0; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned reserve_sectors = k.k->size * data_opts.extra_replicas; + unsigned buf_bytes = 0; + bool unwritten = false; + unsigned ptr_bit = 1; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if (!p.ptr.cached) { - rcu_read_lock(); + guard(rcu)(); if (ptr_bit & m->data_opts.rewrite_ptrs) { if (crc_is_compressed(p.crc)) reserve_sectors += k.k->size; @@ -793,7 +886,6 @@ int bch2_data_update_init(struct btree_trans *trans, bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); durability_have += bch2_extent_ptr_durability(c, &p); } - rcu_read_unlock(); } /* @@ -809,6 +901,9 @@ int bch2_data_update_init(struct btree_trans *trans, if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) m->op.incompressible = true; + buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); + unwritten |= p.ptr.unwritten; + ptr_bit <<= 1; } @@ -847,7 +942,7 @@ int bch2_data_update_init(struct btree_trans *trans, if (iter) ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts); if (!ret) - ret = -BCH_ERR_data_update_done_no_writes_needed; + ret = bch_err_throw(c, data_update_done_no_writes_needed); goto out_bkey_buf_exit; } @@ -878,23 +973,25 @@ int bch2_data_update_init(struct btree_trans *trans, } if (!bkey_get_dev_refs(c, k)) { - ret = -BCH_ERR_data_update_done_no_dev_refs; + ret = bch_err_throw(c, data_update_done_no_dev_refs); goto out_put_disk_res; } if (c->opts.nocow_enabled && - !bkey_nocow_lock(c, ctxt, k)) { - ret = -BCH_ERR_nocow_lock_blocked; + !bkey_nocow_lock(c, ctxt, ptrs)) { + ret = bch_err_throw(c, nocow_lock_blocked); goto out_put_dev_refs; } - if (bkey_extent_is_unwritten(k)) { + if (unwritten) { ret = bch2_update_unwritten_extent(trans, m) ?: - -BCH_ERR_data_update_done_unwritten; + bch_err_throw(c, data_update_done_unwritten); goto out_nocow_unlock; } - ret = bch2_data_update_bios_init(m, c, io_opts); + bch2_trans_unlock(trans); + + ret = __bch2_data_update_bios_init(m, c, io_opts, buf_bytes); if (ret) goto out_nocow_unlock; diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index ed05125867da..5e14d13568de 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -50,6 +50,21 @@ struct data_update { struct bio_vec *bvecs; }; +struct promote_op { + struct rcu_head rcu; + u64 start_time; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + unsigned list_idx; +#endif + + struct rhash_head hash; + struct bpos pos; + + struct work_struct work; + struct data_update write; + struct bio_vec bi_inline_vecs[]; /* must be last */ +}; + void bch2_data_update_to_text(struct printbuf *, struct data_update *); void 
bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 5a8bc7013512..33cb94f70b19 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -8,6 +8,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "async_objs.h" #include "bkey_methods.h" #include "btree_cache.h" #include "btree_io.h" @@ -16,6 +17,7 @@ #include "btree_update.h" #include "btree_update_interior.h" #include "buckets.h" +#include "data_update.h" #include "debug.h" #include "error.h" #include "extents.h" @@ -40,9 +42,10 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, struct btree_node *n_sorted = c->verify_data->data; struct bset *sorted, *inmemory = &b->data->keys; struct bio *bio; - bool failed = false, saw_error = false; + bool failed = false; - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, + BCH_DEV_READ_REF_btree_verify_replicas); if (!ca) return false; @@ -57,12 +60,13 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, submit_bio_wait(bio); bio_put(bio); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_btree_verify_replicas); memcpy(n_ondisk, n_sorted, btree_buf_bytes(b)); v->written = 0; - if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error) + if (bch2_btree_node_read_done(c, ca, v, NULL, NULL)) return false; n_sorted = c->verify_data->data; @@ -137,7 +141,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) return; bch2_btree_node_io_lock(b); - mutex_lock(&c->verify_lock); + guard(mutex)(&c->verify_lock); if (!c->verify_ondisk) { c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); @@ -149,8 +153,6 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) c->verify_data = __bch2_btree_node_mem_alloc(c); if (!c->verify_data) goto out; - - list_del_init(&c->verify_data->list); } BUG_ON(b->nsets != 1); @@ -170,14 +172,11 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) failed |= bch2_btree_verify_replica(c, b, p); if (failed) { - struct printbuf buf = PRINTBUF; - + CLASS(printbuf, buf)(); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf); - printbuf_exit(&buf); } out: - mutex_unlock(&c->verify_lock); bch2_btree_node_io_unlock(b); } @@ -196,7 +195,8 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, return; } - ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, + BCH_DEV_READ_REF_btree_node_ondisk_to_text); if (!ca) { prt_printf(out, "error getting device to read from: not online\n"); return; @@ -297,28 +297,13 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, if (bio) bio_put(bio); kvfree(n_ondisk); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_btree_node_ondisk_to_text); } #ifdef CONFIG_DEBUG_FS -/* XXX: bch_fs refcounting */ - -struct dump_iter { - struct bch_fs *c; - enum btree_id id; - struct bpos from; - struct bpos prev_node; - u64 iter; - - struct printbuf buf; - - char __user *ubuf; /* destination user buffer */ - size_t size; /* size of requested read */ - ssize_t ret; /* bytes read so far */ -}; - -static ssize_t flush_buf(struct dump_iter *i) +ssize_t bch2_debugfs_flush_buf(struct dump_iter *i) { if (i->buf.pos) { size_t bytes = min_t(size_t, 
i->buf.pos, i->size); @@ -330,6 +315,11 @@ static ssize_t flush_buf(struct dump_iter *i) i->buf.pos -= copied; memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos); + if (i->buf.last_newline >= copied) + i->buf.last_newline -= copied; + if (i->buf.last_field >= copied) + i->buf.last_field -= copied; + if (copied != bytes) return -EFAULT; } @@ -356,7 +346,7 @@ static int bch2_dump_open(struct inode *inode, struct file *file) return 0; } -static int bch2_dump_release(struct inode *inode, struct file *file) +int bch2_dump_release(struct inode *inode, struct file *file) { struct dump_iter *i = file->private_data; @@ -374,17 +364,17 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, i->size = size; i->ret = 0; - return flush_buf(i) ?: - bch2_trans_run(i->c, - for_each_btree_key(trans, iter, i->id, i->from, - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, ({ - bch2_bkey_val_to_text(&i->buf, i->c, k); - prt_newline(&i->buf); - bch2_trans_unlock(trans); - i->from = bpos_successor(iter.pos); - flush_buf(i); - }))) ?: + CLASS(btree_trans, trans)(i->c); + return bch2_debugfs_flush_buf(i) ?: + for_each_btree_key(trans, iter, i->id, i->from, + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, ({ + bch2_bkey_val_to_text(&i->buf, i->c, k); + prt_newline(&i->buf); + bch2_trans_unlock(trans); + i->from = bpos_successor(iter.pos); + bch2_debugfs_flush_buf(i); + })) ?: i->ret; } @@ -404,22 +394,22 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, i->size = size; i->ret = 0; - ssize_t ret = flush_buf(i); + ssize_t ret = bch2_debugfs_flush_buf(i); if (ret) return ret; if (bpos_eq(SPOS_MAX, i->from)) return i->ret; - return bch2_trans_run(i->c, - for_each_btree_node(trans, iter, i->id, i->from, 0, b, ({ - bch2_btree_node_to_text(&i->buf, i->c, b); - i->from = !bpos_eq(SPOS_MAX, b->key.k.p) - ? bpos_successor(b->key.k.p) - : b->key.k.p; + CLASS(btree_trans, trans)(i->c); + return for_each_btree_node(trans, iter, i->id, i->from, 0, b, ({ + bch2_btree_node_to_text(&i->buf, i->c, b); + i->from = !bpos_eq(SPOS_MAX, b->key.k.p) + ? 
bpos_successor(b->key.k.p) + : b->key.k.p; - drop_locks_do(trans, flush_buf(i)); - }))) ?: i->ret; + drop_locks_do(trans, bch2_debugfs_flush_buf(i)); + })) ?: i->ret; } static const struct file_operations btree_format_debug_ops = { @@ -438,27 +428,27 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, i->size = size; i->ret = 0; - return flush_buf(i) ?: - bch2_trans_run(i->c, - for_each_btree_key(trans, iter, i->id, i->from, - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots, k, ({ - struct btree_path_level *l = - &btree_iter_path(trans, &iter)->l[0]; - struct bkey_packed *_k = - bch2_btree_node_iter_peek(&l->iter, l->b); - - if (bpos_gt(l->b->key.k.p, i->prev_node)) { - bch2_btree_node_to_text(&i->buf, i->c, l->b); - i->prev_node = l->b->key.k.p; - } - - bch2_bfloat_to_text(&i->buf, l->b, _k); - bch2_trans_unlock(trans); - i->from = bpos_successor(iter.pos); - flush_buf(i); - }))) ?: - i->ret; + CLASS(btree_trans, trans)(i->c); + return bch2_debugfs_flush_buf(i) ?: + for_each_btree_key(trans, iter, i->id, i->from, + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, ({ + struct btree_path_level *l = + &btree_iter_path(trans, &iter)->l[0]; + struct bkey_packed *_k = + bch2_btree_node_iter_peek(&l->iter, l->b); + + if (bpos_gt(l->b->key.k.p, i->prev_node)) { + bch2_btree_node_to_text(&i->buf, i->c, l->b); + i->prev_node = l->b->key.k.p; + } + + bch2_bfloat_to_text(&i->buf, l->b, _k); + bch2_trans_unlock(trans); + i->from = bpos_successor(iter.pos); + bch2_debugfs_flush_buf(i); + })) ?: + i->ret; } static const struct file_operations bfloat_failed_debug_ops = { @@ -472,7 +462,7 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * struct btree *b) { if (!out->nr_tabstops) - printbuf_tabstop_push(out, 32); + printbuf_tabstop_push(out, 36); prt_printf(out, "%px ", b); bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level); @@ -497,6 +487,8 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * prt_printf(out, "journal pin %px:\t%llu\n", &b->writes[1].journal, b->writes[1].journal.seq); + prt_printf(out, "ob:\t%u\n", b->ob.nr); + printbuf_indent_sub(out, 2); } @@ -513,34 +505,33 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, i->ret = 0; do { - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - - ret = flush_buf(i); + ret = bch2_debugfs_flush_buf(i); if (ret) return ret; - rcu_read_lock(); - i->buf.atomic++; - tbl = rht_dereference_rcu(c->btree_cache.table.tbl, - &c->btree_cache.table); - if (i->iter < tbl->size) { - rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) - bch2_cached_btree_node_to_text(&i->buf, c, b); - i->iter++; - } else { - done = true; + scoped_guard(rcu) { + guard(printbuf_atomic)(&i->buf); + struct bucket_table *tbl = + rht_dereference_rcu(c->btree_cache.table.tbl, + &c->btree_cache.table); + if (i->iter < tbl->size) { + struct rhash_head *pos; + struct btree *b; + + rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) + bch2_cached_btree_node_to_text(&i->buf, c, b); + i->iter++; + } else { + done = true; + } } - --i->buf.atomic; - rcu_read_unlock(); } while (!done); if (i->buf.allocation_failure) ret = -ENOMEM; if (!ret) - ret = flush_buf(i); + ret = bch2_debugfs_flush_buf(i); return ret ?: i->ret; } @@ -589,6 +580,8 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, i->ubuf = buf; i->size = size; i->ret = 0; + + int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); restart: 
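/*
 * [Editor's annotation, hedged sketch; not part of the patch] The hunk
 * here now takes the btree_trans_barrier SRCU read lock for the whole
 * walk of c->btree_trans_list and skips transactions that don't hold
 * their own SRCU lock, so a transaction freed concurrently via SRCU
 * remains valid memory for the duration of the walk. Minimal sketch of
 * the general pattern; struct obj, my_barrier and my_list are
 * hypothetical, while srcu_read_lock()/srcu_read_unlock(),
 * srcu_read_lock_held() and list_for_each_entry_rcu() are real kernel
 * APIs:
 */
#include <linux/list.h>
#include <linux/srcu.h>

struct obj {
	struct list_head	node;	/* removed + freed via call_srcu() elsewhere */
};

static void walk_objs(struct srcu_struct *my_barrier, struct list_head *my_list)
{
	/* entries cannot be srcu-freed until the matching unlock: */
	int idx = srcu_read_lock(my_barrier);

	struct obj *o;
	list_for_each_entry_rcu(o, my_list, node,
				srcu_read_lock_held(my_barrier))
		;	/* o may be dereferenced safely here */

	srcu_read_unlock(my_barrier, idx);
}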
seqmutex_lock(&c->btree_trans_lock); list_sort(&c->btree_trans_list, list_ptr_order_cmp); @@ -602,6 +595,11 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, if (!closure_get_not_zero(&trans->ref)) continue; + if (!trans->srcu_held) { + closure_put(&trans->ref); + continue; + } + u32 seq = seqmutex_unlock(&c->btree_trans_lock); bch2_btree_trans_to_text(&i->buf, trans); @@ -614,7 +612,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, closure_put(&trans->ref); - ret = flush_buf(i); + ret = bch2_debugfs_flush_buf(i); if (ret) goto unlocked; @@ -623,11 +621,13 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, } seqmutex_unlock(&c->btree_trans_lock); unlocked: + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + if (i->buf.allocation_failure) ret = -ENOMEM; if (!ret) - ret = flush_buf(i); + ret = bch2_debugfs_flush_buf(i); return ret ?: i->ret; } @@ -652,7 +652,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, i->ret = 0; while (1) { - err = flush_buf(i); + err = bch2_debugfs_flush_buf(i); if (err) return err; @@ -695,7 +695,7 @@ static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf, i->iter++; } - err = flush_buf(i); + err = bch2_debugfs_flush_buf(i); if (err) return err; @@ -753,7 +753,7 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, while (1) { struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter]; - err = flush_buf(i); + err = bch2_debugfs_flush_buf(i); if (err) return err; @@ -767,9 +767,15 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, prt_printf(&i->buf, "%s:\n", bch2_btree_transaction_fns[i->iter]); printbuf_indent_add(&i->buf, 2); - mutex_lock(&s->lock); + guard(mutex)(&s->lock); prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem); +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + printbuf_indent_add(&i->buf, 2); + bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace); + printbuf_indent_sub(&i->buf, 2); +#endif + prt_printf(&i->buf, "Transaction duration:\n"); printbuf_indent_add(&i->buf, 2); @@ -792,8 +798,6 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, printbuf_indent_sub(&i->buf, 2); } - mutex_unlock(&s->lock); - printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); i->iter++; @@ -868,7 +872,7 @@ static ssize_t bch2_simple_print(struct file *file, char __user *buf, ret = -ENOMEM; if (!ret) - ret = flush_buf(i); + ret = bch2_debugfs_flush_buf(i); return ret ?: i->ret; } @@ -927,7 +931,11 @@ void bch2_fs_debug_init(struct bch_fs *c) if (IS_ERR_OR_NULL(bch_debug)) return; - snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); + if (c->sb.multi_device) + snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); + else + strscpy(name, c->name, sizeof(name)); + c->fs_debug_dir = debugfs_create_dir(name, bch_debug); if (IS_ERR_OR_NULL(c->fs_debug_dir)) return; @@ -953,6 +961,8 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("write_points", 0400, c->fs_debug_dir, c->btree_debug, &write_points_ops); + bch2_fs_async_obj_debugfs_init(c); + c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); if (IS_ERR_OR_NULL(c->btree_debug_dir)) return; diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h index 2c37143b5fd1..d88b1194b8ac 100644 --- a/fs/bcachefs/debug.h +++ b/fs/bcachefs/debug.h @@ -14,11 +14,29 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *, struct 
bch_fs *, static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) { - if (bch2_verify_btree_ondisk) + if (static_branch_unlikely(&bch2_verify_btree_ondisk)) __bch2_btree_verify(c, b); } #ifdef CONFIG_DEBUG_FS +struct dump_iter { + struct bch_fs *c; + struct async_obj_list *list; + enum btree_id id; + struct bpos from; + struct bpos prev_node; + u64 iter; + + struct printbuf buf; + + char __user *ubuf; /* destination user buffer */ + size_t size; /* size of requested read */ + ssize_t ret; /* bytes read so far */ +}; + +ssize_t bch2_debugfs_flush_buf(struct dump_iter *); +int bch2_dump_release(struct inode *, struct file *); + void bch2_fs_debug_exit(struct bch_fs *); void bch2_fs_debug_init(struct bch_fs *); #else diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index a51195088227..cb44b35e0f1d 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -13,14 +13,18 @@ #include +#if IS_ENABLED(CONFIG_UNICODE) int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, const struct qstr *str, struct qstr *out_cf) { *out_cf = (struct qstr) QSTR_INIT(NULL, 0); -#ifdef CONFIG_UNICODE + int ret = bch2_fs_casefold_enabled(trans->c); + if (ret) + return ret; + unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1); - int ret = PTR_ERR_OR_ZERO(buf); + ret = PTR_ERR_OR_ZERO(buf); if (ret) return ret; @@ -30,10 +34,8 @@ int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, *out_cf = (struct qstr) QSTR_INIT(buf, ret); return 0; -#else - return -EOPNOTSUPP; -#endif } +#endif static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) { @@ -212,82 +214,87 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); struct qstr d_name = bch2_dirent_get_name(d); - prt_printf(out, "%.*s -> ", d_name.len, d_name.name); + prt_bytes(out, d_name.name, d_name.len); + + if (d.v->d_casefold) { + prt_str(out, " (casefold "); + struct qstr d_name = bch2_dirent_get_lookup_name(d); + prt_bytes(out, d_name.name, d_name.len); + prt_char(out, ')'); + } + + prt_str(out, " ->"); if (d.v->d_type != DT_SUBVOL) - prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum)); + prt_printf(out, " %llu", le64_to_cpu(d.v->d_inum)); else - prt_printf(out, "%u -> %u", + prt_printf(out, " %u -> %u", le32_to_cpu(d.v->d_parent_subvol), le32_to_cpu(d.v->d_child_subvol)); prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type)); } -static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans, - subvol_inum dir, - u8 type, - int name_len, int cf_name_len, - u64 dst) +int bch2_dirent_init_name(struct bch_fs *c, + struct bkey_i_dirent *dirent, + const struct bch_hash_info *hash_info, + const struct qstr *name, + const struct qstr *cf_name) { - struct bkey_i_dirent *dirent; - unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len, cf_name_len); - - BUG_ON(u64s > U8_MAX); + EBUG_ON(hash_info->cf_encoding == NULL && cf_name); + int cf_len = 0; - dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); - if (IS_ERR(dirent)) - return dirent; + if (name->len > BCH_NAME_MAX) + return -ENAMETOOLONG; - bkey_dirent_init(&dirent->k_i); - dirent->k.u64s = u64s; + dirent->v.d_casefold = hash_info->cf_encoding != NULL; - if (type != DT_SUBVOL) { - dirent->v.d_inum = cpu_to_le64(dst); + if (!dirent->v.d_casefold) { + memcpy(&dirent->v.d_name[0], name->name, name->len); + memset(&dirent->v.d_name[name->len], 0, + bkey_val_bytes(&dirent->k) - + offsetof(struct bch_dirent, d_name) - + name->len); } 
else { - dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); - dirent->v.d_child_subvol = cpu_to_le32(dst); - } + int ret = bch2_fs_casefold_enabled(c); + if (ret) + return ret; - dirent->v.d_type = type; - dirent->v.d_unused = 0; - dirent->v.d_casefold = cf_name_len ? 1 : 0; +#if IS_ENABLED(CONFIG_UNICODE) + memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); - return dirent; -} + char *cf_out = &dirent->v.d_cf_name_block.d_names[name->len]; + void *val_end = bkey_val_end(bkey_i_to_s(&dirent->k_i)); -static void dirent_init_regular_name(struct bkey_i_dirent *dirent, - const struct qstr *name) -{ - EBUG_ON(dirent->v.d_casefold); + if (cf_name) { + cf_len = cf_name->len; - memcpy(&dirent->v.d_name[0], name->name, name->len); - memset(&dirent->v.d_name[name->len], 0, - bkey_val_bytes(&dirent->k) - - offsetof(struct bch_dirent, d_name) - - name->len); -} + memcpy(cf_out, cf_name->name, cf_name->len); + } else { + cf_len = utf8_casefold(hash_info->cf_encoding, name, + cf_out, val_end - (void *) cf_out); + if (cf_len <= 0) + return cf_len; + } -static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent, - const struct qstr *name, - const struct qstr *cf_name) -{ - EBUG_ON(!dirent->v.d_casefold); - EBUG_ON(!cf_name->len); - - dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len); - dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_name->len); - memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); - memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len); - memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0, - bkey_val_bytes(&dirent->k) - - offsetof(struct bch_dirent, d_cf_name_block.d_names) - - name->len + cf_name->len); - - EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_name->len); + void *name_end = &dirent->v.d_cf_name_block.d_names[name->len + cf_len]; + BUG_ON(name_end > val_end); + memset(name_end, 0, val_end - name_end); + + dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len); + dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_len); + + EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_len); +#endif + } + + unsigned u64s = dirent_val_u64s(name->len, cf_len); + BUG_ON(u64s > bkey_val_u64s(&dirent->k)); + set_bkey_val_u64s(&dirent->k, u64s); + return 0; } -static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, +struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *trans, const struct bch_hash_info *hash_info, subvol_inum dir, u8 type, @@ -295,31 +302,28 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, const struct qstr *cf_name, u64 dst) { - struct bkey_i_dirent *dirent; - struct qstr _cf_name; - - if (name->len > BCH_NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); + struct bkey_i_dirent *dirent = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64)); + if (IS_ERR(dirent)) + return dirent; - if (hash_info->cf_encoding && !cf_name) { - int ret = bch2_casefold(trans, hash_info, name, &_cf_name); - if (ret) - return ERR_PTR(ret); + bkey_dirent_init(&dirent->k_i); + dirent->k.u64s = BKEY_U64s_MAX; - cf_name = &_cf_name; + if (type != DT_SUBVOL) { + dirent->v.d_inum = cpu_to_le64(dst); + } else { + dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); + dirent->v.d_child_subvol = cpu_to_le32(dst); } - dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? 
cf_name->len : 0, dst); - if (IS_ERR(dirent)) - return dirent; + dirent->v.d_type = type; + dirent->v.d_unused = 0; - if (cf_name) - dirent_init_casefolded_name(dirent, name, cf_name); - else - dirent_init_regular_name(dirent, name); + int ret = bch2_dirent_init_name(trans->c, dirent, hash_info, name, cf_name); + if (ret) + return ERR_PTR(ret); EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len); - return dirent; } @@ -334,7 +338,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum); + dirent = bch2_dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -358,7 +362,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum); + dirent = bch2_dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -395,15 +399,15 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, } int bch2_dirent_rename(struct btree_trans *trans, - subvol_inum src_dir, struct bch_hash_info *src_hash, u64 *src_dir_i_size, - subvol_inum dst_dir, struct bch_hash_info *dst_hash, u64 *dst_dir_i_size, + subvol_inum src_dir, struct bch_hash_info *src_hash, + subvol_inum dst_dir, struct bch_hash_info *dst_hash, const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, enum bch_rename_mode mode) { struct qstr src_name_lookup, dst_name_lookup; - struct btree_iter src_iter = {}; - struct btree_iter dst_iter = {}; + struct btree_iter src_iter = { NULL }; + struct btree_iter dst_iter = { NULL }; struct bkey_s_c old_src, old_dst = bkey_s_c_null; struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; struct bpos dst_pos = @@ -463,8 +467,8 @@ int bch2_dirent_rename(struct btree_trans *trans, *src_offset = dst_iter.pos.offset; /* Create new dst key: */ - new_dst = dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name, - dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0); + new_dst = bch2_dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name, + dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0); ret = PTR_ERR_OR_ZERO(new_dst); if (ret) goto out; @@ -474,8 +478,8 @@ int bch2_dirent_rename(struct btree_trans *trans, /* Create new src key: */ if (mode == BCH_RENAME_EXCHANGE) { - new_src = dirent_create_key(trans, src_hash, src_dir, 0, src_name, - src_hash->cf_encoding ? &src_name_lookup : NULL, 0); + new_src = bch2_dirent_create_key(trans, src_hash, src_dir, 0, src_name, + src_hash->cf_encoding ? 
&src_name_lookup : NULL, 0); ret = PTR_ERR_OR_ZERO(new_src); if (ret) goto out; @@ -535,14 +539,6 @@ int bch2_dirent_rename(struct btree_trans *trans, new_src->v.d_type == DT_SUBVOL) new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol); - if (old_dst.k) - *dst_dir_i_size -= bkey_bytes(old_dst.k); - *src_dir_i_size -= bkey_bytes(old_src.k); - - if (mode == BCH_RENAME_EXCHANGE) - *src_dir_i_size += bkey_bytes(&new_src->k); - *dst_dir_i_size += bkey_bytes(&new_dst->k); - ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); if (ret) goto out; @@ -571,16 +567,16 @@ int bch2_dirent_rename(struct btree_trans *trans, } if (delete_src) { - bch2_btree_iter_set_snapshot(trans, &src_iter, old_src.k->p.snapshot); - ret = bch2_btree_iter_traverse(trans, &src_iter) ?: + bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); + ret = bch2_btree_iter_traverse(&src_iter) ?: bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node); if (ret) goto out; } if (delete_dst) { - bch2_btree_iter_set_snapshot(trans, &dst_iter, old_dst.k->p.snapshot); - ret = bch2_btree_iter_traverse(trans, &dst_iter) ?: + bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot); + ret = bch2_btree_iter_traverse(&dst_iter) ?: bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node); if (ret) goto out; @@ -590,8 +586,8 @@ int bch2_dirent_rename(struct btree_trans *trans, *src_offset = new_src->k.p.offset; *dst_offset = new_dst->k.p.offset; out: - bch2_trans_iter_exit(trans, &src_iter); - bch2_trans_iter_exit(trans, &dst_iter); + bch2_trans_iter_exit(&src_iter); + bch2_trans_iter_exit(&dst_iter); return ret; } @@ -618,7 +614,7 @@ int bch2_dirent_lookup_trans(struct btree_trans *trans, ret = -ENOENT; err: if (ret) - bch2_trans_iter_exit(trans, iter); + bch2_trans_iter_exit(iter); return ret; } @@ -626,19 +622,17 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, const struct bch_hash_info *hash_info, const struct qstr *name, subvol_inum *inum) { - struct btree_trans *trans = bch2_trans_get(c); + CLASS(btree_trans, trans)(c); struct btree_iter iter = {}; int ret = lockrestart_do(trans, bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); + bch2_trans_iter_exit(&iter); return ret; } int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot) { - struct btree_iter iter; struct bkey_s_c k; int ret; @@ -649,10 +643,9 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol) continue; - ret = -BCH_ERR_ENOTEMPTY_dir_not_empty; + ret = bch_err_throw(trans->c, ENOTEMPTY_dir_not_empty); break; } - bch2_trans_iter_exit(trans, &iter); return ret; } @@ -685,13 +678,15 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv return !ret; } -int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) +int bch2_readdir(struct bch_fs *c, subvol_inum inum, + struct bch_hash_info *hash_info, + struct dir_context *ctx) { struct bkey_buf sk; bch2_bkey_buf_init(&sk); - int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents, + CLASS(btree_trans, trans)(c); + int ret = for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents, POS(inum.inum, ctx->pos), POS(inum.inum, U64_MAX), inum.subvol, 0, k, ({ @@ -703,12 +698,16 @@ int 
bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k); subvol_inum target; - int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target); + + bool need_second_pass = false; + int ret2 = bch2_str_hash_check_key(trans, NULL, &bch2_dirent_hash_desc, + hash_info, &iter, k, &need_second_pass) ?: + bch2_dirent_read_target(trans, inum, dirent, &target); if (ret2 > 0) continue; ret2 ?: (bch2_trans_unlock(trans), bch2_dir_emit(ctx, dirent, target)); - }))); + })); bch2_bkey_buf_exit(&sk, c); @@ -720,7 +719,6 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, struct bch_inode_unpacked *inode) { - struct btree_iter iter; struct bkey_s_c k; int ret; @@ -733,34 +731,31 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, ret = bch2_inode_unpack(k, inode); goto found; } - ret = -BCH_ERR_ENOENT_inode; + ret = bch_err_throw(trans->c, ENOENT_inode); found: bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); - bch2_trans_iter_exit(trans, &iter); return ret; } int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bch_inode_unpacked dir_inode; - struct bch_hash_info dir_hash_info; - int ret; - ret = lookup_first_inode(trans, pos.inode, &dir_inode); + struct bch_inode_unpacked dir_inode; + int ret = lookup_first_inode(trans, pos.inode, &dir_inode); if (ret) goto err; - dir_hash_info = bch2_hash_info_init(c, &dir_inode); + { + struct bch_hash_info dir_hash_info = bch2_hash_info_init(c, &dir_inode); - bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); + CLASS(btree_iter, iter)(trans, BTREE_ID_dirents, pos, BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, &iter, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &iter); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash_info, &iter, + BTREE_UPDATE_internal_snapshot_node); + } err: bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index d3e7ae669575..efb58d2dcf68 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -23,8 +23,16 @@ struct bch_fs; struct bch_hash_info; struct bch_inode_info; +#if IS_ENABLED(CONFIG_UNICODE) int bch2_casefold(struct btree_trans *, const struct bch_hash_info *, const struct qstr *, struct qstr *); +#else +static inline int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, + const struct qstr *str, struct qstr *out_cf) +{ + return bch_err_throw(trans->c, no_casefolding_without_utf8); +} +#endif static inline int bch2_maybe_casefold(struct btree_trans *trans, const struct bch_hash_info *info, @@ -38,7 +46,7 @@ static inline int bch2_maybe_casefold(struct btree_trans *trans, } } -struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d); +struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent); static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len) { @@ -59,6 +67,15 @@ static inline void dirent_copy_target(struct bkey_i_dirent *dst, dst->v.d_type = src.v->d_type; } +int bch2_dirent_init_name(struct bch_fs *, + struct bkey_i_dirent *, + const struct bch_hash_info *, + const struct qstr *, + const struct qstr *); +struct bkey_i_dirent *bch2_dirent_create_key(struct 
btree_trans *, + const struct bch_hash_info *, subvol_inum, u8, + const struct qstr *, const struct qstr *, u64); + int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, @@ -80,8 +97,8 @@ enum bch_rename_mode { }; int bch2_dirent_rename(struct btree_trans *, - subvol_inum, struct bch_hash_info *, u64 *, - subvol_inum, struct bch_hash_info *, u64 *, + subvol_inum, struct bch_hash_info *, + subvol_inum, struct bch_hash_info *, const struct qstr *, subvol_inum *, u64 *, const struct qstr *, subvol_inum *, u64 *, enum bch_rename_mode); @@ -95,7 +112,7 @@ u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); -int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); +int bch2_readdir(struct bch_fs *, subvol_inum, struct bch_hash_info *, struct dir_context *); int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos); diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 1f0422bfae35..f96530c70262 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -68,23 +68,31 @@ static const char * const disk_accounting_type_strs[] = { NULL }; -static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos, - s64 *d, unsigned nr) +static inline void __accounting_key_init(struct bkey_i *k, struct bpos pos, + s64 *d, unsigned nr) { struct bkey_i_accounting *acc = bkey_accounting_init(k); - acc->k.p = disk_accounting_pos_to_bpos(pos); + acc->k.p = pos; set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr); memcpy_u64s_small(acc->v.d, d, nr); } +static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos, + s64 *d, unsigned nr) +{ + return __accounting_key_init(k, disk_accounting_pos_to_bpos(pos), d, nr); +} + static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos); int bch2_disk_accounting_mod(struct btree_trans *trans, struct disk_accounting_pos *k, s64 *d, unsigned nr, bool gc) { + BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); + /* Normalize: */ switch (k->type) { case BCH_DISK_ACCOUNTING_replicas: @@ -92,21 +100,49 @@ int bch2_disk_accounting_mod(struct btree_trans *trans, break; } - BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); + struct bpos pos = disk_accounting_pos_to_bpos(k); + + if (likely(!gc)) { + struct bkey_i_accounting *a; +#if 0 + for (a = btree_trans_subbuf_base(trans, &trans->accounting); + a != btree_trans_subbuf_top(trans, &trans->accounting); + a = (void *) bkey_next(&a->k_i)) + if (bpos_eq(a->k.p, pos)) { + BUG_ON(nr != bch2_accounting_counters(&a->k)); + acc_u64s(a->v.d, d, nr); + + if (bch2_accounting_key_is_zero(accounting_i_to_s_c(a))) { + unsigned offset = (u64 *) a - + (u64 *) btree_trans_subbuf_base(trans, &trans->accounting); + + trans->accounting.u64s -= a->k.u64s; + memmove_u64s_down(a, + bkey_next(&a->k_i), + trans->accounting.u64s - offset); + } + return 0; + } +#endif + unsigned u64s = sizeof(*a) / sizeof(u64) + nr; + a = bch2_trans_subbuf_alloc(trans, &trans->accounting, u64s); + int ret = PTR_ERR_OR_ZERO(a); + if (ret) + return ret; - struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; + __accounting_key_init(&a->k_i, pos, d, nr); + return 0; + } else { + struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; - accounting_key_init(&k_i.k, k, d, nr); + __accounting_key_init(&k_i.k, pos, d, nr); - if 
(unlikely(gc)) { int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); if (ret == -BCH_ERR_btree_insert_need_mark_replicas) ret = drop_locks_do(trans, bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?: bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); return ret; - } else { - return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k); } } @@ -287,7 +323,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) { - struct bch_replicas_padded r; + union bch_replicas_padded r; return accounting_to_replicas(&r.e, p) ? bch2_mark_replicas(c, &r.e) : 0; @@ -299,14 +335,13 @@ static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) */ int bch2_accounting_update_sb(struct btree_trans *trans) { - for (struct jset_entry *i = trans->journal_entries; - i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); - i = vstruct_next(i)) - if (jset_entry_is_key(i) && i->start->k.type == KEY_TYPE_accounting) { - int ret = bch2_accounting_update_sb_one(trans->c, i->start->k.p); - if (ret) - return ret; - } + for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); + i != btree_trans_subbuf_top(trans, &trans->accounting); + i = bkey_next(i)) { + int ret = bch2_accounting_update_sb_one(trans->c, i->k.p); + if (ret) + return ret; + } return 0; } @@ -345,33 +380,32 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun accounting_pos_cmp, NULL); if (trace_accounting_mem_insert_enabled()) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_accounting_to_text(&buf, c, a.s_c); trace_accounting_mem_insert(c, buf.buf); - printbuf_exit(&buf); } return 0; err: free_percpu(n.v[1]); free_percpu(n.v[0]); - return -BCH_ERR_ENOMEM_disk_accounting; + return bch_err_throw(c, ENOMEM_disk_accounting); } int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, enum bch_accounting_mode mode) { - struct bch_replicas_padded r; + union bch_replicas_padded r; if (mode != BCH_ACCOUNTING_read && accounting_to_replicas(&r.e, a.k->p) && !bch2_replicas_marked_locked(c, &r.e)) - return -BCH_ERR_btree_insert_need_mark_replicas; + return bch_err_throw(c, btree_insert_need_mark_replicas); percpu_up_read(&c->mark_lock); - percpu_down_write(&c->mark_lock); - int ret = __bch2_accounting_mem_insert(c, a); - percpu_up_write(&c->mark_lock); + int ret; + scoped_guard(percpu_write, &c->mark_lock) + ret = __bch2_accounting_mem_insert(c, a); percpu_down_read(&c->mark_lock); return ret; } @@ -379,12 +413,12 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a, enum bch_accounting_mode mode) { - struct bch_replicas_padded r; + union bch_replicas_padded r; if (mode != BCH_ACCOUNTING_read && accounting_to_replicas(&r.e, a.k->p) && !bch2_replicas_marked_locked(c, &r.e)) - return -BCH_ERR_btree_insert_need_mark_replicas; + return bch_err_throw(c, btree_insert_need_mark_replicas); return __bch2_accounting_mem_insert(c, a); } @@ -403,7 +437,7 @@ void bch2_accounting_mem_gc(struct bch_fs *c) { struct bch_accounting_mem *acc = &c->accounting; - percpu_down_write(&c->mark_lock); + guard(percpu_write)(&c->mark_lock); struct accounting_mem_entry *dst = acc->k.data; darray_for_each(acc->k, src) { @@ -418,7 +452,6 @@ void bch2_accounting_mem_gc(struct 
bch_fs *c) acc->k.nr = dst - acc->k.data; eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, NULL); - percpu_up_write(&c->mark_lock); } /* @@ -436,12 +469,14 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) darray_init(usage); - percpu_down_read(&c->mark_lock); + guard(percpu_read)(&c->mark_lock); darray_for_each(acc->k, i) { - struct { + union { + u8 bytes[struct_size_t(struct bch_replicas_usage, r.devs, + BCH_BKEY_PTRS_MAX)]; struct bch_replicas_usage r; - u8 pad[BCH_BKEY_PTRS_MAX]; } u; + u.r.r.nr_devs = BCH_BKEY_PTRS_MAX; if (!accounting_to_replicas(&u.r.r, i->pos)) continue; @@ -457,7 +492,6 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r)); usage->nr += replicas_usage_bytes(&u.r); } - percpu_up_read(&c->mark_lock); if (ret) darray_exit(usage); @@ -472,7 +506,7 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc darray_init(out_buf); - percpu_down_read(&c->mark_lock); + guard(percpu_read)(&c->mark_lock); darray_for_each(acc->k, i) { struct disk_accounting_pos a_p; bpos_to_disk_accounting_pos(&a_p, i->pos); @@ -496,8 +530,6 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc out_buf->nr += bkey_bytes(&a_out->k); } - percpu_up_read(&c->mark_lock); - if (ret) darray_exit(out_buf); return ret; @@ -516,32 +548,30 @@ int bch2_gc_accounting_start(struct bch_fs *c) struct bch_accounting_mem *acc = &c->accounting; int ret = 0; - percpu_down_write(&c->mark_lock); + guard(percpu_write)(&c->mark_lock); darray_for_each(acc->k, e) { e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64), sizeof(u64), GFP_KERNEL); if (!e->v[1]) { bch2_accounting_free_counters(acc, true); - ret = -BCH_ERR_ENOMEM_disk_accounting; + ret = bch_err_throw(c, ENOMEM_disk_accounting); break; } } acc->gc_running = !ret; - percpu_up_write(&c->mark_lock); - return ret; } int bch2_gc_accounting_done(struct bch_fs *c) { struct bch_accounting_mem *acc = &c->accounting; - struct btree_trans *trans = bch2_trans_get(c); - struct printbuf buf = PRINTBUF; + CLASS(btree_trans, trans)(c); + CLASS(printbuf, buf)(); struct bpos pos = POS_MIN; int ret = 0; - percpu_down_write(&c->mark_lock); + guard(percpu_write)(&c->mark_lock); while (1) { unsigned idx = eytzinger0_find_ge(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &pos); @@ -570,20 +600,23 @@ int bch2_gc_accounting_done(struct bch_fs *c) prt_str(&buf, "accounting mismatch for "); bch2_accounting_key_to_text(&buf, &acc_k); - prt_str(&buf, ": got"); + prt_str(&buf, ":\n got"); for (unsigned j = 0; j < nr; j++) prt_printf(&buf, " %llu", dst_v[j]); - prt_str(&buf, " should be"); + prt_str(&buf, "\nshould be"); for (unsigned j = 0; j < nr; j++) prt_printf(&buf, " %llu", src_v[j]); for (unsigned j = 0; j < nr; j++) src_v[j] -= dst_v[j]; - if (fsck_err(trans, accounting_mismatch, "%s", buf.buf)) { + bch2_trans_unlock_long(trans); + + if (fsck_err(c, accounting_mismatch, "%s", buf.buf)) { percpu_up_write(&c->mark_lock); - ret = commit_do(trans, NULL, NULL, 0, + ret = commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_skip_accounting_apply, bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false)); percpu_down_write(&c->mark_lock); if (ret) @@ -598,20 +631,16 @@ int bch2_gc_accounting_done(struct bch_fs *c) bkey_i_to_s_c_accounting(&k_i.k), BCH_ACCOUNTING_normal, true); - preempt_disable(); + guard(preempt)(); struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); 
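/*
 * [Editor's annotation, not part of the patch] guard(preempt)() above is
 * the scope-based guard from <linux/preempt.h>/<linux/cleanup.h>: it
 * calls preempt_disable() immediately and arranges for preempt_enable()
 * to run at scope exit, which is why the explicit preempt_enable() is
 * deleted in this hunk. Self-contained sketch of the same pattern;
 * my_counter and my_read_counter() are hypothetical:
 */
#include <linux/cleanup.h>
#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(u64, my_counter);

static u64 my_read_counter(void)
{
	guard(preempt)();			/* preempt_disable() now... */
	return *this_cpu_ptr(&my_counter);	/* stable CPU reference */
}						/* ...preempt_enable() here */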
struct bch_fs_usage_base *src = &trans->fs_usage_delta; acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); - preempt_enable(); } } } } err: fsck_err: - percpu_up_write(&c->mark_lock); - printbuf_exit(&buf); - bch2_trans_put(trans); bch_err_fn(c, ret); return ret; } @@ -623,25 +652,23 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) if (k.k->type != KEY_TYPE_accounting) return 0; - percpu_down_read(&c->mark_lock); - int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), - BCH_ACCOUNTING_read, false); - percpu_up_read(&c->mark_lock); - return ret; + guard(percpu_read)(&c->mark_lock); + return bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), + BCH_ACCOUNTING_read, false); } static int bch2_disk_accounting_validate_late(struct btree_trans *trans, - struct disk_accounting_pos acc, + struct disk_accounting_pos *acc, u64 *v, unsigned nr) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = 0, invalid_dev = -1; - switch (acc.type) { + switch (acc->type) { case BCH_DISK_ACCOUNTING_replicas: { - struct bch_replicas_padded r; - __accounting_to_replicas(&r.e, &acc); + union bch_replicas_padded r; + __accounting_to_replicas(&r.e, acc); for (unsigned i = 0; i < r.e.nr_devs; i++) if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && @@ -660,7 +687,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, trans, accounting_replicas_not_marked, "accounting not marked in superblock replicas\n%s", (printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, &acc), + bch2_accounting_key_to_text(&buf, acc), buf.buf))) { /* * We're not RW yet and still single threaded, dropping @@ -676,31 +703,30 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, } case BCH_DISK_ACCOUNTING_dev_data_type: - if (!bch2_dev_exists(c, acc.dev_data_type.dev)) { - invalid_dev = acc.dev_data_type.dev; + if (!bch2_dev_exists(c, acc->dev_data_type.dev)) { + invalid_dev = acc->dev_data_type.dev; goto invalid_device; } break; } fsck_err: - printbuf_exit(&buf); return ret; invalid_device: if (fsck_err(trans, accounting_to_invalid_device, "accounting entry points to invalid device %i\n%s", invalid_dev, (printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, &acc), + bch2_accounting_key_to_text(&buf, acc), buf.buf))) { for (unsigned i = 0; i < nr; i++) v[i] = -v[i]; ret = commit_do(trans, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?: + bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?: -BCH_ERR_remove_disk_accounting_entry; } else { - ret = -BCH_ERR_remove_disk_accounting_entry; + ret = bch_err_throw(c, remove_disk_accounting_entry); } goto fsck_err; } @@ -712,8 +738,8 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, int bch2_accounting_read(struct bch_fs *c) { struct bch_accounting_mem *acc = &c->accounting; - struct btree_trans *trans = bch2_trans_get(c); - struct printbuf buf = PRINTBUF; + CLASS(btree_trans, trans)(c); + CLASS(printbuf, buf)(); /* * We might run more than once if we rewind to start topology repair or @@ -722,13 +748,13 @@ int bch2_accounting_read(struct bch_fs *c) * * Instead, zero out any accounting we have: */ - percpu_down_write(&c->mark_lock); - darray_for_each(acc->k, e) - percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters); - for_each_member_device(c, ca) - percpu_memset(ca->usage, 0, sizeof(*ca->usage)); - percpu_memset(c->usage, 0, sizeof(*c->usage)); - percpu_up_write(&c->mark_lock); + 
scoped_guard(percpu_write, &c->mark_lock) { + darray_for_each(acc->k, e) + percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters); + for_each_member_device(c, ca) + percpu_memset(ca->usage, 0, sizeof(*ca->usage)); + percpu_memset(c->usage, 0, sizeof(*c->usage)); + } struct btree_iter iter; bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN, @@ -748,18 +774,19 @@ int bch2_accounting_read(struct bch_fs *c) if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) break; - if (!bch2_accounting_is_mem(acc_k)) { + if (!bch2_accounting_is_mem(&acc_k)) { struct disk_accounting_pos next; memset(&next, 0, sizeof(next)); next.type = acc_k.type + 1; - bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next)); + bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); continue; } accounting_read_key(trans, k); })); + bch2_trans_iter_exit(&iter); if (ret) - goto err; + return ret; struct journal_keys *keys = &c->journal_keys; struct journal_key *dst = keys->data; @@ -770,7 +797,7 @@ int bch2_accounting_read(struct bch_fs *c) struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, i->k->k.p); - if (!bch2_accounting_is_mem(acc_k)) + if (!bch2_accounting_is_mem(&acc_k)) continue; struct bkey_s_c k = bkey_i_to_s_c(i->k); @@ -798,14 +825,14 @@ int bch2_accounting_read(struct bch_fs *c) ret = accounting_read_key(trans, k); if (ret) - goto err; + return ret; } *dst++ = *i; } keys->gap = keys->nr = dst - keys->data; - percpu_down_write(&c->mark_lock); + guard(percpu_write)(&c->mark_lock); darray_for_each_reverse(acc->k, i) { struct disk_accounting_pos acc_k; @@ -826,7 +853,7 @@ int bch2_accounting_read(struct bch_fs *c) */ ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) ? -BCH_ERR_remove_disk_accounting_entry - : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters); + : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters); if (ret == -BCH_ERR_remove_disk_accounting_entry) { free_percpu(i->v[0]); @@ -837,60 +864,55 @@ int bch2_accounting_read(struct bch_fs *c) } if (ret) - goto fsck_err; + return ret; } eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, NULL); - preempt_disable(); - struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); + scoped_guard(preempt) { + struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); - for (unsigned i = 0; i < acc->k.nr; i++) { - struct disk_accounting_pos k; - bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); + for (unsigned i = 0; i < acc->k.nr; i++) { + struct disk_accounting_pos k; + bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; + bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); - switch (k.type) { - case BCH_DISK_ACCOUNTING_persistent_reserved: - usage->reserved += v[0] * k.persistent_reserved.nr_replicas; - break; - case BCH_DISK_ACCOUNTING_replicas: - fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]); - break; - case BCH_DISK_ACCOUNTING_dev_data_type: - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev); - if (ca) { - struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; - percpu_u64_set(&d->buckets, v[0]); - percpu_u64_set(&d->sectors, v[1]); - percpu_u64_set(&d->fragmented, v[2]); - - if (k.dev_data_type.data_type == BCH_DATA_sb || - k.dev_data_type.data_type == BCH_DATA_journal) - usage->hidden += v[0] * 
ca->mi.bucket_size; + switch (k.type) { + case BCH_DISK_ACCOUNTING_persistent_reserved: + usage->reserved += v[0] * k.persistent_reserved.nr_replicas; + break; + case BCH_DISK_ACCOUNTING_replicas: + fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]); + break; + case BCH_DISK_ACCOUNTING_dev_data_type: { + guard(rcu)(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev); + if (ca) { + struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; + percpu_u64_set(&d->buckets, v[0]); + percpu_u64_set(&d->sectors, v[1]); + percpu_u64_set(&d->fragmented, v[2]); + + if (k.dev_data_type.data_type == BCH_DATA_sb || + k.dev_data_type.data_type == BCH_DATA_journal) + usage->hidden += v[0] * ca->mi.bucket_size; + } + break; + } } - rcu_read_unlock(); - break; } } - preempt_enable(); -fsck_err: - percpu_up_write(&c->mark_lock); -err: - printbuf_exit(&buf); - bch2_trans_put(trans); - bch_err_fn(c, ret); + return ret; } int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev) { - return bch2_trans_run(c, - bch2_btree_write_buffer_flush_sync(trans) ?: + CLASS(btree_trans, trans)(c); + return bch2_btree_write_buffer_flush_sync(trans) ?: for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN, BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({ struct disk_accounting_pos acc; @@ -901,15 +923,16 @@ int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev) ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0) : 0; })) ?: - bch2_btree_write_buffer_flush_sync(trans)); + bch2_btree_write_buffer_flush_sync(trans); } int bch2_dev_usage_init(struct bch_dev *ca, bool gc) { struct bch_fs *c = ca->fs; + CLASS(btree_trans, trans)(c); u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 }; - int ret = bch2_trans_do(c, ({ + int ret = lockrestart_do(trans, ({ bch2_disk_accounting_mod2(trans, gc, v, dev_data_type, .dev = ca->dev_idx, @@ -925,80 +948,77 @@ void bch2_verify_accounting_clean(struct bch_fs *c) bool mismatch = false; struct bch_fs_usage_base base = {}, base_inmem = {}; - bch2_trans_run(c, - for_each_btree_key(trans, iter, - BTREE_ID_accounting, POS_MIN, - BTREE_ITER_all_snapshots, k, ({ - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k); - unsigned nr = bch2_accounting_counters(k.k); + CLASS(btree_trans, trans)(c); + for_each_btree_key(trans, iter, + BTREE_ID_accounting, POS_MIN, + BTREE_ITER_all_snapshots, k, ({ + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; + struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k); + unsigned nr = bch2_accounting_counters(k.k); - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, k.k->p); + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); - if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) - break; + if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) + break; - if (!bch2_accounting_is_mem(acc_k)) { - struct disk_accounting_pos next; - memset(&next, 0, sizeof(next)); - next.type = acc_k.type + 1; - bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next)); - continue; - } + if (!bch2_accounting_is_mem(&acc_k)) { + struct disk_accounting_pos next; + memset(&next, 0, sizeof(next)); + next.type = acc_k.type + 1; + bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); + continue; + } - bch2_accounting_mem_read(c, k.k->p, v, nr); + bch2_accounting_mem_read(c, k.k->p, v, nr); - if (memcmp(a.v->d, v, nr * sizeof(u64))) { - struct printbuf buf = PRINTBUF; + if (memcmp(a.v->d, v, nr * sizeof(u64))) { 
+ CLASS(printbuf, buf)(); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, " !="); - for (unsigned j = 0; j < nr; j++) - prt_printf(&buf, " %llu", v[j]); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, " !="); + for (unsigned j = 0; j < nr; j++) + prt_printf(&buf, " %llu", v[j]); - pr_err("%s", buf.buf); - printbuf_exit(&buf); - mismatch = true; - } + pr_err("%s", buf.buf); + mismatch = true; + } - switch (acc_k.type) { - case BCH_DISK_ACCOUNTING_persistent_reserved: - base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; - break; - case BCH_DISK_ACCOUNTING_replicas: - fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]); - break; - case BCH_DISK_ACCOUNTING_dev_data_type: { - rcu_read_lock(); + switch (acc_k.type) { + case BCH_DISK_ACCOUNTING_persistent_reserved: + base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; + break; + case BCH_DISK_ACCOUNTING_replicas: + fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]); + break; + case BCH_DISK_ACCOUNTING_dev_data_type: { + { + guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */ struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); - if (!ca) { - rcu_read_unlock(); + if (!ca) continue; - } v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets); v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors); v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented); - rcu_read_unlock(); + } - if (memcmp(a.v->d, v, 3 * sizeof(u64))) { - struct printbuf buf = PRINTBUF; + if (memcmp(a.v->d, v, 3 * sizeof(u64))) { + CLASS(printbuf, buf)(); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, " in mem"); - for (unsigned j = 0; j < nr; j++) - prt_printf(&buf, " %llu", v[j]); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, " in mem"); + for (unsigned j = 0; j < nr; j++) + prt_printf(&buf, " %llu", v[j]); - pr_err("dev accounting mismatch: %s", buf.buf); - printbuf_exit(&buf); - mismatch = true; - } - } + pr_err("dev accounting mismatch: %s", buf.buf); + mismatch = true; } + } + } - 0; - }))); + 0; + })); acc_u64s_percpu(&base_inmem.hidden, &c->usage->hidden, sizeof(base_inmem) / sizeof(u64)); diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index d557b99b3c0a..43f4b21d0aab 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -139,10 +139,10 @@ int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); void bch2_accounting_mem_gc(struct bch_fs *); -static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc) +static inline bool bch2_accounting_is_mem(struct disk_accounting_pos *acc) { - return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR && - acc.type != BCH_DISK_ACCOUNTING_inum; + return acc->type < BCH_DISK_ACCOUNTING_TYPE_NR && + acc->type != BCH_DISK_ACCOUNTING_inum; } /* @@ -163,7 +163,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, if (gc && !acc->gc_running) return 0; - if (!bch2_accounting_is_mem(acc_k)) + if (!bch2_accounting_is_mem(&acc_k)) return 0; if (mode == BCH_ACCOUNTING_normal) { @@ -174,17 +174,17 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, case BCH_DISK_ACCOUNTING_replicas: fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]); break; - case 
BCH_DISK_ACCOUNTING_dev_data_type: - rcu_read_lock(); + case BCH_DISK_ACCOUNTING_dev_data_type: { + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (ca) { this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]); } - rcu_read_unlock(); break; } + } } unsigned idx; @@ -211,10 +211,8 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) { - percpu_down_read(&trans->c->mark_lock); - int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false); - percpu_up_read(&trans->c->mark_lock); - return ret; + guard(percpu_read)(&trans->c->mark_lock); + return bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false); } static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *acc, @@ -236,13 +234,12 @@ static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem * static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, u64 *v, unsigned nr) { - percpu_down_read(&c->mark_lock); + guard(percpu_read)(&c->mark_lock); struct bch_accounting_mem *acc = &c->accounting; unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &p); bch2_accounting_mem_read_counters(acc, idx, v, nr, false); - percpu_up_read(&c->mark_lock); } static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) @@ -259,8 +256,8 @@ static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans, struct bkey_i_accounting *a, unsigned commit_flags) { - a->k.bversion = journal_pos_to_bversion(&trans->journal_res, - (u64 *) a - (u64 *) trans->journal_entries); + u64 *base = (u64 *) btree_trans_subbuf_base(trans, &trans->accounting); + a->k.bversion = journal_pos_to_bversion(&trans->journal_res, (u64 *) a - base); EBUG_ON(bversion_zero(a->k.bversion)); diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 2ca3cbf12b71..293e47268508 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -86,35 +86,6 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field * return ret; } -void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) -{ - out->atomic++; - rcu_read_lock(); - - struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - if (!g) - goto out; - - for (unsigned i = 0; i < g->nr; i++) { - if (i) - prt_printf(out, " "); - - if (g->entries[i].deleted) { - prt_printf(out, "[deleted]"); - continue; - } - - prt_printf(out, "[parent %d devs", g->entries[i].parent); - for_each_member_device_rcu(c, ca, &g->entries[i].devs) - prt_printf(out, " %s", ca->name); - prt_printf(out, "]"); - } - -out: - rcu_read_unlock(); - out->atomic--; -} - static void bch2_sb_disk_groups_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) @@ -159,7 +130,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL); if (!cpu_g) - return -BCH_ERR_ENOMEM_disk_groups_to_cpu; + return bch_err_throw(c, ENOMEM_disk_groups_to_cpu); cpu_g->nr = nr_groups; @@ -199,36 +170,28 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) const struct 
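Two recurring changes in the disk_accounting.h hunks above: bch2_accounting_is_mem() now takes struct disk_accounting_pos by pointer rather than by value (the struct is several words, so this avoids a copy at every call site), and the down_read/compute/up_read bracketing collapses to guard(percpu_read) plus a direct return. Returning from inside the critical section is safe because the guard's cleanup runs on every exit path; continuing the userspace sketch above:

static int counter_read_checked(void)
{
	guard_mutex(&lock);
	if (counter < 0)
		return -1;	/* unlock still runs here */
	return counter;		/* and here */
}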
bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) { struct target t = target_decode(target); - struct bch_devs_mask *devs; - rcu_read_lock(); + guard(rcu)(); switch (t.type) { case TARGET_NULL: - devs = NULL; - break; + return NULL; case TARGET_DEV: { struct bch_dev *ca = t.dev < c->sb.nr_devices ? rcu_dereference(c->devs[t.dev]) : NULL; - devs = ca ? &ca->self : NULL; - break; + return ca ? &ca->self : NULL; } case TARGET_GROUP: { struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - devs = g && t.group < g->nr && !g->entries[t.group].deleted + return g && t.group < g->nr && !g->entries[t.group].deleted ? &g->entries[t.group].devs : NULL; - break; } default: BUG(); } - - rcu_read_unlock(); - - return devs; } bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) @@ -241,20 +204,13 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) case TARGET_DEV: return dev == t.dev; case TARGET_GROUP: { - struct bch_disk_groups_cpu *g; - const struct bch_devs_mask *m; - bool ret; - - rcu_read_lock(); - g = rcu_dereference(c->disk_groups); - m = g && t.group < g->nr && !g->entries[t.group].deleted + struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); + const struct bch_devs_mask *m = + g && t.group < g->nr && !g->entries[t.group].deleted ? &g->entries[t.group].devs : NULL; - ret = m ? test_bit(dev, m->d) : false; - rcu_read_unlock(); - - return ret; + return m ? test_bit(dev, m->d) : false; } default: BUG(); @@ -377,54 +333,76 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) return v; } -void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) +static void __bch2_disk_path_to_text(struct printbuf *out, struct bch_disk_groups_cpu *g, + unsigned v) { - struct bch_disk_groups_cpu *groups; - struct bch_disk_group_cpu *g; - unsigned nr = 0; u16 path[32]; - - out->atomic++; - rcu_read_lock(); - groups = rcu_dereference(c->disk_groups); - if (!groups) - goto invalid; + unsigned nr = 0; while (1) { if (nr == ARRAY_SIZE(path)) goto invalid; - if (v >= groups->nr) + if (v >= (g ? g->nr : 0)) goto invalid; - g = groups->entries + v; + struct bch_disk_group_cpu *e = g->entries + v; - if (g->deleted) + if (e->deleted) goto invalid; path[nr++] = v; - if (!g->parent) + if (!e->parent) break; - v = g->parent - 1; + v = e->parent - 1; } while (nr) { - v = path[--nr]; - g = groups->entries + v; + struct bch_disk_group_cpu *e = g->entries + path[--nr]; - prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); + prt_printf(out, "%.*s", (int) sizeof(e->label), e->label); if (nr) prt_printf(out, "."); } -out: - rcu_read_unlock(); - out->atomic--; return; invalid: prt_printf(out, "invalid label %u", v); - goto out; +} + +void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) +{ + bch2_printbuf_make_room(out, 4096); + + guard(printbuf_atomic)(out); + guard(rcu)(); + struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); + + for (unsigned i = 0; i < (g ? 
g->nr : 0); i++) { + prt_printf(out, "%2u: ", i); + + if (g->entries[i].deleted) { + prt_printf(out, "[deleted]"); + goto next; + } + + __bch2_disk_path_to_text(out, g, i); + + prt_printf(out, " devs"); + + for_each_member_device_rcu(c, ca, &g->entries[i].devs) + prt_printf(out, " %s", ca->name); +next: + prt_newline(out); + } +} + +void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) +{ + guard(printbuf_atomic)(out); + guard(rcu)(); + __bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v); } void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) @@ -490,14 +468,9 @@ int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) { - int ret; - - mutex_lock(&c->sb_lock); - ret = __bch2_dev_group_set(c, ca, name) ?: + guard(mutex)(&c->sb_lock); + return __bch2_dev_group_set(c, ca, name) ?: bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return ret; } int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, @@ -525,9 +498,8 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, return 0; } - mutex_lock(&c->sb_lock); - g = bch2_disk_path_find(&c->disk_sb, val); - mutex_unlock(&c->sb_lock); + scoped_guard(mutex, &c->sb_lock) + g = bch2_disk_path_find(&c->disk_sb, val); if (g >= 0) { *res = group_to_target(g); @@ -544,32 +516,25 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) switch (t.type) { case TARGET_NULL: prt_printf(out, "none"); - break; + return; case TARGET_DEV: { - struct bch_dev *ca; - - out->atomic++; - rcu_read_lock(); - ca = t.dev < c->sb.nr_devices + guard(printbuf_atomic)(out); + guard(rcu)(); + struct bch_dev *ca = t.dev < c->sb.nr_devices ? rcu_dereference(c->devs[t.dev]) : NULL; - if (ca && percpu_ref_tryget(&ca->io_ref[READ])) { + if (ca && ca->disk_sb.bdev) prt_printf(out, "/dev/%s", ca->name); - percpu_ref_put(&ca->io_ref[READ]); - } else if (ca) { + else if (ca) prt_printf(out, "offline device %u", t.dev); - } else { + else prt_printf(out, "invalid device %u", t.dev); - } - - rcu_read_unlock(); - out->atomic--; - break; + return; } case TARGET_GROUP: bch2_disk_path_to_text(out, c, t.group); - break; + return; default: BUG(); } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index c6cb26981923..c2840cb674b2 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -16,6 +16,7 @@ #include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" +#include "enumerated_ref.h" #include "error.h" #include "io_read.h" #include "io_write.h" @@ -196,8 +197,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, bool parity = ptr_idx >= nr_data; enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; s64 sectors = parity ? 
le16_to_cpu(s.v->sectors) : 0; - struct printbuf buf = PRINTBUF; - int ret = 0; + CLASS(printbuf, buf)(); struct bch_fs *c = trans->c; if (deleting) @@ -211,10 +211,8 @@ static int __mark_stripe_bucket(struct btree_trans *trans, bch2_data_type_str(a->data_type), a->dirty_sectors, a->stripe, s.k->p.offset, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; - goto err; - } + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) + return bch_err_throw(c, mark_stripe); if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans, "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s", @@ -222,30 +220,24 @@ static int __mark_stripe_bucket(struct btree_trans *trans, bch2_data_type_str(a->data_type), a->dirty_sectors, a->cached_sectors, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; - goto err; - } + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) + return bch_err_throw(c, mark_stripe); } else { if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset || a->stripe_redundancy != s.v->nr_redundant, trans, "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s", bucket.inode, bucket.offset, a->gen, a->stripe, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; - goto err; - } + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) + return bch_err_throw(c, mark_stripe); if (bch2_trans_inconsistent_on(a->data_type != data_type, trans, "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s", bucket.inode, bucket.offset, a->gen, bch2_data_type_str(a->data_type), bch2_data_type_str(data_type), - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; - goto err; - } + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) + return bch_err_throw(c, mark_stripe); if (bch2_trans_inconsistent_on(parity && (a->dirty_sectors != -sectors || @@ -254,17 +246,15 @@ static int __mark_stripe_bucket(struct btree_trans *trans, bucket.inode, bucket.offset, a->gen, a->dirty_sectors, a->cached_sectors, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; - goto err; - } + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) + return bch_err_throw(c, mark_stripe); } if (sectors) { - ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type, - a->gen, a->data_type, &a->dirty_sectors); + int ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type, + a->gen, a->data_type, &a->dirty_sectors); if (ret) - goto err; + return ret; } if (!deleting) { @@ -276,9 +266,8 @@ static int __mark_stripe_bucket(struct btree_trans *trans, a->stripe_redundancy = 0; alloc_data_type_set(a, BCH_DATA_user); } -err: - printbuf_exit(&buf); - return ret; + + return 0; } static int mark_stripe_bucket(struct btree_trans *trans, @@ -288,14 +277,13 @@ static int mark_stripe_bucket(struct btree_trans *trans, { struct bch_fs *c = trans->c; const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; - struct printbuf buf = PRINTBUF; - int ret = 0; + CLASS(printbuf, buf)(); - struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev); + CLASS(bch2_dev_tryget, ca)(c, ptr->dev); if (unlikely(!ca)) { if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite)) - ret = -BCH_ERR_mark_stripe; - goto err; + return bch_err_throw(c, mark_stripe); + return 0; } struct bpos bucket = PTR_BUCKET_POS(ca, ptr); @@ -311,36 +299,32 @@ static int 
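The ec.c hunks above convert error returns from raw -BCH_ERR_* constants to bch_err_throw(c, err). The macro's definition is not part of this diff; a plausible shape is sketched below, where the per-filesystem hook (count_error_site() here) is purely illustrative and the real definition in the bcachefs tree may differ:

/* illustrative only; see the real bch_err_throw() in the bcachefs tree */
#define bch_err_throw(_c, _err) ({				\
	struct bch_fs *__c = (_c);				\
	count_error_site(__c, BCH_ERR_##_err);	/* hypothetical */	\
	-BCH_ERR_##_err;					\
})

The point of routing returns through the filesystem pointer is that error sites can be counted or traced per filesystem instead of silently propagating an anonymous errno.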
mark_stripe_bucket(struct btree_trans *trans, struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); - ret = PTR_ERR_OR_ZERO(a) ?: + int ret = PTR_ERR_OR_ZERO(a) ?: __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?: bch2_bucket_backpointer_mod(trans, s.s_c, &bp, !(flags & BTREE_TRIGGER_overwrite)); if (ret) - goto err; + return ret; } if (flags & BTREE_TRIGGER_gc) { struct bucket *g = gc_bucket(ca, bucket.offset); if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s", ptr->dev, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; - goto err; - } + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) + return bch_err_throw(c, mark_stripe); bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; - ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); + int ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); alloc_to_bucket(g, new); bucket_unlock(g); if (!ret) ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); } -err: - bch2_dev_put(ca); - printbuf_exit(&buf); - return ret; + + return 0; } static int mark_stripe_buckets(struct btree_trans *trans, @@ -427,7 +411,7 @@ int bch2_trigger_stripe(struct btree_trans *trans, gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); if (!gc) { bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx); - return -BCH_ERR_ENOMEM_mark_stripe; + return bch_err_throw(c, ENOMEM_mark_stripe); } /* @@ -535,7 +519,8 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) } /* XXX: this is a non-mempoolified memory allocation: */ -static int ec_stripe_buf_init(struct ec_stripe_buf *buf, +static int ec_stripe_buf_init(struct bch_fs *c, + struct ec_stripe_buf *buf, unsigned offset, unsigned size) { struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; @@ -563,7 +548,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf, return 0; err: ec_stripe_buf_exit(buf); - return -BCH_ERR_ENOMEM_stripe_buf; + return bch_err_throw(c, ENOMEM_stripe_buf); } /* Checksumming: */ @@ -628,16 +613,15 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) struct bch_csum got = ec_block_checksum(buf, i, offset); if (bch2_crc_cmp(want, got)) { - struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev); + CLASS(bch2_dev_tryget, ca)(c, v->ptrs[i].dev); if (ca) { - struct printbuf err = PRINTBUF; + CLASS(printbuf, err)(); prt_str(&err, "stripe "); bch2_csum_err_msg(&err, v->csum_type, want, got); prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); bch_err_ratelimited(ca, "%s", err.buf); - printbuf_exit(&err); bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); } @@ -700,6 +684,9 @@ static void ec_block_endio(struct bio *bio) struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; int rw = ec_bio->rw; + unsigned ref = rw == READ + ? (unsigned) BCH_DEV_READ_REF_ec_block + : (unsigned) BCH_DEV_WRITE_REF_ec_block; bch2_account_io_completion(ca, bio_data_dir(bio), ec_bio->submit_time, !bio->bi_status); @@ -721,7 +708,7 @@ static void ec_block_endio(struct bio *bio) } bio_put(&ec_bio->bio); - percpu_ref_put(&ca->io_ref[rw]); + enumerated_ref_put(&ca->io_ref[rw], ref); closure_put(cl); } @@ -735,8 +722,11 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ? BCH_DATA_user : BCH_DATA_parity; int rw = op_is_write(opf); + unsigned ref = rw == READ + ? 
(unsigned) BCH_DEV_READ_REF_ec_block + : (unsigned) BCH_DEV_WRITE_REF_ec_block; - struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw); + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw, ref); if (!ca) { clear_bit(idx, buf->valid); return; @@ -782,36 +772,28 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); closure_get(cl); - percpu_ref_get(&ca->io_ref[rw]); + enumerated_ref_get(&ca->io_ref[rw], ref); submit_bio(&ec_bio->bio); offset += b; } - percpu_ref_put(&ca->io_ref[rw]); + enumerated_ref_put(&ca->io_ref[rw], ref); } static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, struct ec_stripe_buf *stripe) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, - POS(0, idx), BTREE_ITER_slots); - ret = bkey_err(k); + CLASS(btree_iter, iter)(trans, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_slots); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); if (ret) - goto err; - if (k.k->type != KEY_TYPE_stripe) { - ret = -ENOENT; - goto err; - } + return ret; + if (k.k->type != KEY_TYPE_stripe) + return -ENOENT; bkey_reassemble(&stripe->key, k); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + return 0; } /* recovery read path: */ @@ -824,7 +806,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, struct bch_stripe *v; unsigned i, offset; const char *msg = NULL; - struct printbuf msgbuf = PRINTBUF; + CLASS(printbuf, msgbuf)(); int ret = 0; closure_init_stack(&cl); @@ -833,7 +815,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, buf = kzalloc(sizeof(*buf), GFP_NOFS); if (!buf) - return -BCH_ERR_ENOMEM_ec_read_extent; + return bch_err_throw(c, ENOMEM_ec_read_extent); ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf)); if (ret) { @@ -854,7 +836,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, goto err; } - ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); + ret = ec_stripe_buf_init(c, buf, offset, bio_sectors(&rbio->bio)); if (ret) { msg = "-ENOMEM"; goto err; @@ -886,8 +868,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, bch2_bkey_val_to_text(&msgbuf, c, orig_k); bch_err_ratelimited(c, "error doing reconstruct read: %s\n %s", msg, msgbuf.buf); - printbuf_exit(&msgbuf); - ret = -BCH_ERR_stripe_reconstruct; + ret = bch_err_throw(c, stripe_reconstruct); goto out; } @@ -897,7 +878,7 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) { if (c->gc_pos.phase != GC_PHASE_not_running && !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) - return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; + return bch_err_throw(c, ENOMEM_ec_stripe_mem_alloc); return 0; } @@ -928,31 +909,22 @@ static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx) static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx) { - bool ret = false; - - spin_lock(&c->ec_stripes_new_lock); - ret = __bch2_stripe_is_open(c, idx); - spin_unlock(&c->ec_stripes_new_lock); - - return ret; + guard(spinlock)(&c->ec_stripes_new_lock); + return __bch2_stripe_is_open(c, idx); } static bool bch2_try_open_stripe(struct bch_fs *c, struct ec_stripe_new *s, u64 idx) { - bool ret; - - spin_lock(&c->ec_stripes_new_lock); - ret = !__bch2_stripe_is_open(c, idx); + guard(spinlock)(&c->ec_stripes_new_lock); + bool ret = !__bch2_stripe_is_open(c, idx); if (ret) { unsigned hash = 
hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); s->idx = idx; hlist_add_head(&s->hash, &c->ec_stripes_new[hash]); } - spin_unlock(&c->ec_stripes_new_lock); - return ret; } @@ -960,9 +932,8 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) { BUG_ON(!s->idx); - spin_lock(&c->ec_stripes_new_lock); + guard(spinlock)(&c->ec_stripes_new_lock); hlist_del_init(&s->hash); - spin_unlock(&c->ec_stripes_new_lock); s->idx = 0; } @@ -971,13 +942,11 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) static int ec_stripe_delete(struct btree_trans *trans, u64 idx) { - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, - BTREE_ID_stripes, POS(0, idx), - BTREE_ITER_intent); + CLASS(btree_iter, iter)(trans, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_intent); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); int ret = bkey_err(k); if (ret) - goto err; + return ret; /* * We expect write buffer races here @@ -986,10 +955,9 @@ static int ec_stripe_delete(struct btree_trans *trans, u64 idx) if (k.k->type == KEY_TYPE_stripe && !bch2_stripe_is_open(trans->c, idx) && stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1) - ret = bch2_btree_delete_at(trans, &iter, 0); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + return bch2_btree_delete_at(trans, &iter, 0); + + return 0; } /* @@ -1011,14 +979,14 @@ static void ec_stripe_delete_work(struct work_struct *work) BCH_TRANS_COMMIT_no_enospc, ({ ec_stripe_delete(trans, lru_k.k->p.offset); }))); - bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete); } void bch2_do_stripe_deletes(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) && + if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_stripe_delete) && !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete); } /* stripe creation: */ @@ -1030,20 +998,17 @@ static int ec_stripe_key_update(struct btree_trans *trans, struct bch_fs *c = trans->c; bool create = !old; - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, - new->k.p, BTREE_ITER_intent); + CLASS(btree_iter, iter)(trans, BTREE_ID_stripes, new->k.p, BTREE_ITER_intent); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); int ret = bkey_err(k); if (ret) - goto err; + return ret; if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe), c, "error %s stripe: got existing key type %s", create ? 
"creating" : "updating", - bch2_bkey_types[k.k->type])) { - ret = -EINVAL; - goto err; - } + bch2_bkey_types[k.k->type])) + return -EINVAL; if (k.k->type == KEY_TYPE_stripe) { const struct bch_stripe *v = bkey_s_c_to_stripe(k).v; @@ -1055,7 +1020,7 @@ static int ec_stripe_key_update(struct btree_trans *trans, unsigned sectors = stripe_blockcount_get(v, i); if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); prt_printf(&buf, "stripe changed nonempty block %u", i); prt_str(&buf, "\nold: "); @@ -1063,9 +1028,7 @@ static int ec_stripe_key_update(struct btree_trans *trans, prt_str(&buf, "\nnew: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i)); bch2_fs_inconsistent(c, "%s", buf.buf); - printbuf_exit(&buf); - ret = -EINVAL; - goto err; + return -EINVAL; } /* @@ -1083,10 +1046,7 @@ static int ec_stripe_key_update(struct btree_trans *trans, } } - ret = bch2_trans_update(trans, &iter, &new->k_i, 0); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + return bch2_trans_update(trans, &iter, &new->k_i, 0); } static int ec_stripe_update_extent(struct btree_trans *trans, @@ -1107,22 +1067,19 @@ static int ec_stripe_update_extent(struct btree_trans *trans, int ret, dev, block; if (bp.v->level) { - struct printbuf buf = PRINTBUF; struct btree_iter node_iter; - struct btree *b; - - b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed); - bch2_trans_iter_exit(trans, &node_iter); + struct btree *b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed); + bch2_trans_iter_exit(&node_iter); if (!b) return 0; + CLASS(printbuf, buf)(); prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); bch2_bkey_val_to_text(&buf, c, bp.s_c); bch2_fs_inconsistent(c, "%s", buf.buf); - printbuf_exit(&buf); - return -BCH_ERR_erasure_coding_found_btree_node; + return bch_err_throw(c, erasure_coding_found_btree_node); } k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed); @@ -1174,7 +1131,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, ret = bch2_trans_update(trans, &iter, n, 0); out: - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } @@ -1186,9 +1143,9 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bch_extent_ptr ptr = v->ptrs[block]; int ret = 0; - struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); + CLASS(bch2_dev_tryget, ca)(c, ptr.dev); if (!ca) - return -BCH_ERR_ENOENT_dev_not_found; + return bch_err_throw(c, ENOENT_dev_not_found); struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); @@ -1217,28 +1174,26 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b })); bch2_bkey_buf_exit(&last_flushed, c); - bch2_dev_put(ca); return ret; } static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) { - struct btree_trans *trans = bch2_trans_get(c); + CLASS(btree_trans, trans)(c); struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; unsigned nr_data = v->nr_blocks - v->nr_redundant; int ret = bch2_btree_write_buffer_flush_sync(trans); if (ret) - goto err; + return ret; for (unsigned i = 0; i < nr_data; i++) { ret = ec_stripe_update_bucket(trans, s, i); if (ret) - break; + return ret; } -err: - bch2_trans_put(trans); - return ret; + + return 0; } static void zero_out_rest_of_ec_bucket(struct bch_fs *c, @@ -1246,9 +1201,10 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, unsigned block, struct open_bucket *ob) { - struct 
bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE); + struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE, + BCH_DEV_WRITE_REF_ec_bucket_zero); if (!ca) { - s->err = -BCH_ERR_erofs_no_writes; + s->err = bch_err_throw(c, erofs_no_writes); return; } @@ -1262,7 +1218,7 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, ob->sectors_free, GFP_KERNEL, 0); - percpu_ref_put(&ca->io_ref[WRITE]); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_ec_bucket_zero); if (ret) s->err = ret; @@ -1312,7 +1268,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_do_recov(c, &s->existing_stripe)) { bch_err(c, "error creating stripe: error reading existing stripe"); - ret = -BCH_ERR_ec_block_read; + ret = bch_err_throw(c, ec_block_read); goto err; } @@ -1338,7 +1294,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_nr_failed(&s->new_stripe)) { bch_err(c, "error creating stripe: error writing redundancy buckets"); - ret = -BCH_ERR_ec_block_write; + ret = bch_err_throw(c, ec_block_write); goto err; } @@ -1376,9 +1332,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) } } - mutex_lock(&c->ec_stripe_new_lock); - list_del(&s->list); - mutex_unlock(&c->ec_stripe_new_lock); + scoped_guard(mutex, &c->ec_stripe_new_lock) + list_del(&s->list); wake_up(&c->ec_stripe_new_wait); ec_stripe_buf_exit(&s->existing_stripe); @@ -1392,15 +1347,11 @@ static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c) { struct ec_stripe_new *s; - mutex_lock(&c->ec_stripe_new_lock); + guard(mutex)(&c->ec_stripe_new_lock); list_for_each_entry(s, &c->ec_stripe_new_list, list) if (!atomic_read(&s->ref[STRIPE_REF_io])) - goto out; - s = NULL; -out: - mutex_unlock(&c->ec_stripe_new_lock); - - return s; + return s; + return NULL; } static void ec_stripe_create_work(struct work_struct *work) @@ -1412,15 +1363,15 @@ static void ec_stripe_create_work(struct work_struct *work) while ((s = get_pending_stripe(c))) ec_stripe_create(s); - bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create); } void bch2_ec_do_stripe_creates(struct bch_fs *c) { - bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create); + enumerated_ref_get(&c->writes, BCH_WRITE_REF_stripe_create); if (!queue_work(system_long_wq, &c->ec_stripe_create_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create); } static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h) @@ -1434,9 +1385,8 @@ static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h h->s = NULL; s->pending = true; - mutex_lock(&c->ec_stripe_new_lock); - list_add(&s->list, &c->ec_stripe_new_list); - mutex_unlock(&c->ec_stripe_new_lock); + scoped_guard(mutex, &c->ec_stripe_new_lock) + list_add(&s->list, &c->ec_stripe_new_list); ec_stripe_new_put(c, s, STRIPE_REF_io); } @@ -1570,26 +1520,26 @@ static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_str static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h) { struct bch_devs_mask devs = h->devs; + unsigned nr_devs, nr_devs_with_durability; - rcu_read_lock(); - h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label - ? 
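The stripe delete/create paths above also move from bch2_write_ref_get()/put() to the new enumerated_ref API, keeping the same queue-or-drop idiom: take a named ref before queueing work, drop it immediately if the work item was already queued, and have the work function drop it when it finishes. A sketch using the API added by this patch (the frob names and the frob_work field are hypothetical):

void bch2_do_frob(struct bch_fs *c)
{
	if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_frob) &&
	    !queue_work(c->write_ref_wq, &c->frob_work))
		enumerated_ref_put(&c->writes, BCH_WRITE_REF_frob);
}

static void frob_work_fn(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, frob_work);

	/* ... do the work ... */
	enumerated_ref_put(&c->writes, BCH_WRITE_REF_frob);
}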
group_to_target(h->disk_label - 1) - : 0); - unsigned nr_devs = dev_mask_nr(&h->devs); - - for_each_member_device_rcu(c, ca, &h->devs) - if (!ca->mi.durability) - __clear_bit(ca->dev_idx, h->devs.d); - unsigned nr_devs_with_durability = dev_mask_nr(&h->devs); + scoped_guard(rcu) { + h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label + ? group_to_target(h->disk_label - 1) + : 0); + nr_devs = dev_mask_nr(&h->devs); - h->blocksize = pick_blocksize(c, &h->devs); + for_each_member_device_rcu(c, ca, &h->devs) + if (!ca->mi.durability) + __clear_bit(ca->dev_idx, h->devs.d); + nr_devs_with_durability = dev_mask_nr(&h->devs); - h->nr_active_devs = 0; - for_each_member_device_rcu(c, ca, &h->devs) - if (ca->mi.bucket_size == h->blocksize) - h->nr_active_devs++; + h->blocksize = pick_blocksize(c, &h->devs); - rcu_read_unlock(); + h->nr_active_devs = 0; + for_each_member_device_rcu(c, ca, &h->devs) + if (ca->mi.bucket_size == h->blocksize) + h->nr_active_devs++; + } /* * If we only have redundancy + 1 devices, we're better off with just @@ -1674,7 +1624,7 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, return ERR_PTR(ret); if (test_bit(BCH_FS_going_ro, &c->flags)) { - h = ERR_PTR(-BCH_ERR_erofs_no_writes); + h = ERR_PTR(bch_err_throw(c, erofs_no_writes)); goto err; } @@ -1693,7 +1643,7 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark); if (!h) { - h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc); + h = ERR_PTR(bch_err_throw(c, ENOMEM_stripe_head_alloc)); goto err; } found: @@ -1710,23 +1660,32 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, } static int new_stripe_alloc_buckets(struct btree_trans *trans, + struct alloc_request *req, struct ec_stripe_head *h, struct ec_stripe_new *s, - enum bch_watermark watermark, struct closure *cl) + struct closure *cl) { struct bch_fs *c = trans->c; - struct bch_devs_mask devs = h->devs; struct open_bucket *ob; - struct open_buckets buckets; struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; unsigned i, j, nr_have_parity = 0, nr_have_data = 0; - bool have_cache = true; int ret = 0; + req->scratch_data_type = req->data_type; + req->scratch_ptrs = req->ptrs; + req->scratch_nr_replicas = req->nr_replicas; + req->scratch_nr_effective = req->nr_effective; + req->scratch_have_cache = req->have_cache; + req->scratch_devs_may_alloc = req->devs_may_alloc; + + req->devs_may_alloc = h->devs; + req->have_cache = true; + BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity); BUG_ON(v->nr_redundant != s->nr_parity); /* * We bypass the sector allocator which normally does this: */ - bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); + bitmap_and(req->devs_may_alloc.d, req->devs_may_alloc.d, + c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) { /* @@ -1736,7 +1695,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, * block when updating the stripe */ if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) - __clear_bit(v->ptrs[i].dev, devs.d); + __clear_bit(v->ptrs[i].dev, req->devs_may_alloc.d); if (i < s->nr_data) nr_have_data++; @@ -1747,60 +1706,58 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, BUG_ON(nr_have_data > s->nr_data); BUG_ON(nr_have_parity > s->nr_parity); - buckets.nr = 0; + req->ptrs.nr = 0; if (nr_have_parity < s->nr_parity) { - ret = bch2_bucket_alloc_set_trans(trans, &buckets, - &h->parity_stripe, - &devs, - s->nr_parity, - &nr_have_parity, - &have_cache, 
0, - BCH_DATA_parity, - watermark, - cl); - - open_bucket_for_each(c, &buckets, ob, i) { + req->nr_replicas = s->nr_parity; + req->nr_effective = nr_have_parity; + req->data_type = BCH_DATA_parity; + + ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl); + + open_bucket_for_each(c, &req->ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, s->nr_data + s->nr_parity, s->nr_data); BUG_ON(j >= s->nr_data + s->nr_parity); - s->blocks[j] = buckets.v[i]; + s->blocks[j] = req->ptrs.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, s->blocks_gotten); } if (ret) - return ret; + goto err; } - buckets.nr = 0; + req->ptrs.nr = 0; if (nr_have_data < s->nr_data) { - ret = bch2_bucket_alloc_set_trans(trans, &buckets, - &h->block_stripe, - &devs, - s->nr_data, - &nr_have_data, - &have_cache, 0, - BCH_DATA_user, - watermark, - cl); - - open_bucket_for_each(c, &buckets, ob, i) { + req->nr_replicas = s->nr_data; + req->nr_effective = nr_have_data; + req->data_type = BCH_DATA_user; + + ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl); + + open_bucket_for_each(c, &req->ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, s->nr_data, 0); BUG_ON(j >= s->nr_data); - s->blocks[j] = buckets.v[i]; + s->blocks[j] = req->ptrs.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, s->blocks_gotten); } if (ret) - return ret; + goto err; } - - return 0; +err: + req->data_type = req->scratch_data_type; + req->ptrs = req->scratch_ptrs; + req->nr_replicas = req->scratch_nr_replicas; + req->nr_effective = req->scratch_nr_effective; + req->have_cache = req->scratch_have_cache; + req->devs_may_alloc = req->scratch_devs_may_alloc; + return ret; } static int __get_existing_stripe(struct btree_trans *trans, @@ -1810,20 +1767,19 @@ static int __get_existing_stripe(struct btree_trans *trans, { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, - BTREE_ID_stripes, POS(0, idx), 0); + CLASS(btree_iter, iter)(trans, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_nopreserve); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); int ret = bkey_err(k); if (ret) - goto err; + return ret; /* We expect write buffer races here */ if (k.k->type != KEY_TYPE_stripe) - goto out; + return 0; struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); if (stripe_lru_pos(s.v) <= 1) - goto out; + return 0; if (s.v->disk_label == head->disk_label && s.v->algorithm == head->algo && @@ -1831,13 +1787,10 @@ static int __get_existing_stripe(struct btree_trans *trans, le16_to_cpu(s.v->sectors) == head->blocksize && bch2_try_open_stripe(c, head->s, idx)) { bkey_reassemble(&stripe->key, k); - ret = 1; + return 1; } -out: - bch2_set_btree_iter_dontneed(trans, &iter); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + + return 0; } static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s) @@ -1850,7 +1803,7 @@ static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new s->nr_data = existing_v->nr_blocks - existing_v->nr_redundant; - int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors)); + int ret = ec_stripe_buf_init(c, &s->existing_stripe, 0, le16_to_cpu(existing_v->sectors)); if (ret) { bch2_stripe_close(c, s); return ret; @@ -1896,7 +1849,6 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri if (may_create_new_stripe(c)) return -1; - struct btree_iter lru_iter; struct bkey_s_c lru_k; int ret = 0; @@ -1908,9 +1860,8 @@ static int 
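new_stripe_alloc_buckets() above now drives the allocator through the caller's struct alloc_request instead of a pile of locals, stashing the caller's fields in req->scratch_* on entry and restoring them on every exit path. Reduced to a minimal sketch with abbreviated fields:

struct alloc_req { unsigned nr_replicas, nr_effective; bool have_cache; };

static int nested_alloc(struct alloc_req *req)
{
	struct alloc_req saved = *req;	/* stash caller's parameters */
	int ret;

	req->nr_replicas = 2;		/* parameters for the nested allocation */
	req->nr_effective = 0;
	req->have_cache = true;

	ret = 0;			/* ... do the nested allocation ... */

	*req = saved;			/* restore unconditionally, even on error */
	return ret;
}

The real code saves individual fields into scratch members of the request itself, presumably because the full request is too large to copy onto the stack.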
__bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri if (ret) break; } - bch2_trans_iter_exit(trans, &lru_iter); if (!ret) - ret = -BCH_ERR_stripe_alloc_blocked; + ret = bch_err_throw(c, stripe_alloc_blocked); if (ret == 1) ret = 0; if (ret) @@ -1923,7 +1874,6 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st struct ec_stripe_new *s) { struct bch_fs *c = trans->c; - struct btree_iter iter; struct bkey_s_c k; struct bpos min_pos = POS(0, 1); struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); @@ -1944,54 +1894,44 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st */ for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, BTREE_ITER_slots|BTREE_ITER_intent, k, ret) { + c->ec_stripe_hint = iter.pos.offset; + if (bkey_gt(k.k->p, POS(0, U32_MAX))) { if (start_pos.offset) { start_pos = min_pos; - bch2_btree_iter_set_pos(trans, &iter, start_pos); + bch2_btree_iter_set_pos(&iter, start_pos); continue; } - ret = -BCH_ERR_ENOSPC_stripe_create; + ret = bch_err_throw(c, ENOSPC_stripe_create); break; } if (bkey_deleted(k.k) && - bch2_try_open_stripe(c, s, k.k->p.offset)) + bch2_try_open_stripe(c, s, k.k->p.offset)) { + ret = ec_stripe_mem_alloc(trans, &iter); + if (ret) + bch2_stripe_close(c, s); + s->new_stripe.key.k.p = iter.pos; break; + } } - c->ec_stripe_hint = iter.pos.offset; - if (ret) - goto err; - - ret = ec_stripe_mem_alloc(trans, &iter); - if (ret) { - bch2_stripe_close(c, s); - goto err; - } - - s->new_stripe.key.k.p = iter.pos; -out: - bch2_trans_iter_exit(trans, &iter); + bch2_disk_reservation_put(c, &s->res); return ret; -err: - bch2_disk_reservation_put(c, &s->res); - goto out; } struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, - unsigned target, + struct alloc_request *req, unsigned algo, - unsigned redundancy, - enum bch_watermark watermark, struct closure *cl) { struct bch_fs *c = trans->c; - struct ec_stripe_head *h; - bool waiting = false; + unsigned redundancy = req->nr_replicas - 1; unsigned disk_label = 0; - struct target t = target_decode(target); + struct target t = target_decode(req->target); + bool waiting = false; int ret; if (t.type == TARGET_GROUP) { @@ -2002,14 +1942,16 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, disk_label = t.group + 1; /* 0 == no label */ } - h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark); + struct ec_stripe_head *h = + __bch2_ec_stripe_head_get(trans, disk_label, algo, + redundancy, req->watermark); if (IS_ERR_OR_NULL(h)) return h; if (!h->s) { h->s = ec_new_stripe_alloc(c, h); if (!h->s) { - ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc; + ret = bch_err_throw(c, ENOMEM_ec_new_stripe_alloc); bch_err(c, "failed to allocate new stripe"); goto err; } @@ -2026,8 +1968,12 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, goto alloc_existing; /* First, try to allocate a full stripe: */ - ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?: + enum bch_watermark saved_watermark = BCH_WATERMARK_stripe; + swap(req->watermark, saved_watermark); + ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: __bch2_ec_stripe_head_reserve(trans, h, s); + swap(req->watermark, saved_watermark); + if (!ret) goto allocate_buf; if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || @@ -2045,8 +1991,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, if (waiting || !cl || ret != 
-BCH_ERR_stripe_alloc_blocked) goto err; - if (watermark == BCH_WATERMARK_copygc) { - ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?: + if (req->watermark == BCH_WATERMARK_copygc) { + ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: __bch2_ec_stripe_head_reserve(trans, h, s); if (ret) goto err; @@ -2065,12 +2011,12 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, * Retry allocating buckets, with the watermark for this * particular write: */ - ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl); + ret = new_stripe_alloc_buckets(trans, req, h, s, cl); if (ret) goto err; allocate_buf: - ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize); + ret = ec_stripe_buf_init(c, &s->new_stripe, 0, h->blocksize); if (ret) goto err; @@ -2081,29 +2027,27 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, BUG_ON(trans->restarted); return h; err: + if (waiting && + !bch2_err_matches(ret, BCH_ERR_operation_blocked)) + closure_wake_up(&c->freelist_wait); bch2_ec_stripe_head_put(c, h); return ERR_PTR(ret); } /* device removal */ -static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a) +int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + unsigned dev_idx, + unsigned flags) { - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); - - if (!a->stripe) + if (k.k->type != KEY_TYPE_stripe) return 0; - if (a->stripe_sectors) { - bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data"); - return -BCH_ERR_invalidate_stripe_to_dev; - } - - struct btree_iter iter; + struct bch_fs *c = trans->c; struct bkey_i_stripe *s = - bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), - BTREE_ITER_slots, stripe); + bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe); int ret = PTR_ERR_OR_ZERO(s); if (ret) return ret; @@ -2120,12 +2064,30 @@ static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_ acc.replicas.data_type = BCH_DATA_user; ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); if (ret) - goto err; + return ret; struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i)); - bkey_for_each_ptr(ptrs, ptr) - if (ptr->dev == k_a.k->p.inode) - ptr->dev = BCH_SB_MEMBER_INVALID; + + /* XXX: how much redundancy do we still have? 
check degraded flags */ + + unsigned nr_good = 0; + + scoped_guard(rcu) + bkey_for_each_ptr(ptrs, ptr) { + if (ptr->dev == dev_idx) + ptr->dev = BCH_SB_MEMBER_INVALID; + + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed; + } + + if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED)) + return bch_err_throw(c, remove_would_lose_data); + + unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant; + + if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST)) + return bch_err_throw(c, remove_would_lose_data); sectors = -sectors; @@ -2133,23 +2095,44 @@ static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_ acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); acc.replicas.data_type = BCH_DATA_user; - ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); + return bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); +} + +static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, struct bkey_s_c k_a, + unsigned flags) +{ + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); + + if (!a->stripe) + return 0; + + if (a->stripe_sectors) { + struct bch_fs *c = trans->c; + bch_err(c, "trying to invalidate device in stripe when bucket has stripe data"); + return bch_err_throw(c, invalidate_stripe_to_dev); + } + + CLASS(btree_iter, iter)(trans, BTREE_ID_stripes, POS(0, a->stripe), 0); + struct bkey_s_c_stripe s = bch2_bkey_get_typed(&iter, stripe); + int ret = bkey_err(s); if (ret) - goto err; -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + return ret; + + return bch2_invalidate_stripe_to_dev(trans, &iter, s.s_c, k_a.k->p.inode, flags); } -int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx) +int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags) { - return bch2_trans_run(c, - for_each_btree_key_max_commit(trans, iter, + CLASS(btree_trans, trans)(c); + int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), BTREE_ITER_intent, k, NULL, NULL, 0, ({ - bch2_invalidate_stripe_to_dev(trans, k); - }))); + bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags); + })); + bch_err_fn(c, ret); + return ret; } /* startup/shutdown */ @@ -2157,33 +2140,28 @@ int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx) static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) { struct ec_stripe_head *h; - struct open_bucket *ob; - unsigned i; - mutex_lock(&c->ec_stripe_head_lock); + guard(mutex)(&c->ec_stripe_head_lock); list_for_each_entry(h, &c->ec_stripe_head_list, list) { - mutex_lock(&h->lock); + guard(mutex)(&h->lock); if (!h->s) - goto unlock; + continue; if (!ca) goto found; - for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) { + for (unsigned i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) { if (!h->s->blocks[i]) continue; - ob = c->open_buckets + h->s->blocks[i]; + struct open_bucket *ob = c->open_buckets + h->s->blocks[i]; if (ob->dev == ca->dev_idx) goto found; } - goto unlock; + continue; found: ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes); -unlock: - mutex_unlock(&h->lock); } - mutex_unlock(&c->ec_stripe_head_lock); } void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) @@ -2200,11 +2178,8 @@ static bool bch2_fs_ec_flush_done(struct bch_fs *c) { sched_annotate_sleep(); - mutex_lock(&c->ec_stripe_new_lock); - bool 
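bch2_invalidate_stripe_to_dev() above gains an explicit redundancy check before dropping a device from a stripe: count the stripe's pointers that still land on healthy devices, and refuse unless the force flags permit the resulting state. The logic, pulled out into a standalone sketch (bounds and locking elided):

/*
 * A stripe has nr_blocks pointers, nr_redundant of which are parity;
 * data survives while at least nr_data = nr_blocks - nr_redundant
 * blocks remain on non-failed devices.
 */
static int check_stripe_removal(unsigned nr_blocks, unsigned nr_redundant,
				unsigned nr_good, unsigned flags)
{
	unsigned nr_data = nr_blocks - nr_redundant;

	if (nr_good < nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
		return -1;	/* stripe would become degraded */

	if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST))
		return -1;	/* data would actually be lost */

	return 0;
}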
ret = list_empty(&c->ec_stripe_new_list); - mutex_unlock(&c->ec_stripe_new_lock); - - return ret; + guard(mutex)(&c->ec_stripe_new_lock); + return list_empty(&c->ec_stripe_new_list); } void bch2_fs_ec_flush(struct bch_fs *c) @@ -2241,41 +2216,40 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) struct ec_stripe_head *h; struct ec_stripe_new *s; - mutex_lock(&c->ec_stripe_head_lock); - list_for_each_entry(h, &c->ec_stripe_head_list, list) { - prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n", - h->disk_label, h->algo, h->redundancy, - bch2_watermarks[h->watermark], - h->nr_created); + scoped_guard(mutex, &c->ec_stripe_head_lock) + list_for_each_entry(h, &c->ec_stripe_head_list, list) { + prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n", + h->disk_label, h->algo, h->redundancy, + bch2_watermarks[h->watermark], + h->nr_created); - if (h->s) - bch2_new_stripe_to_text(out, c, h->s); - } - mutex_unlock(&c->ec_stripe_head_lock); + if (h->s) + bch2_new_stripe_to_text(out, c, h->s); + } prt_printf(out, "in flight:\n"); - mutex_lock(&c->ec_stripe_new_lock); - list_for_each_entry(s, &c->ec_stripe_new_list, list) - bch2_new_stripe_to_text(out, c, s); - mutex_unlock(&c->ec_stripe_new_lock); + scoped_guard(mutex, &c->ec_stripe_new_lock) + list_for_each_entry(s, &c->ec_stripe_new_list, list) + bch2_new_stripe_to_text(out, c, s); } void bch2_fs_ec_exit(struct bch_fs *c) { - struct ec_stripe_head *h; - unsigned i; while (1) { - mutex_lock(&c->ec_stripe_head_lock); - h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list); - mutex_unlock(&c->ec_stripe_head_lock); + struct ec_stripe_head *h; + + scoped_guard(mutex, &c->ec_stripe_head_lock) + h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list); if (!h) break; if (h->s) { - for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) + for (unsigned i = 0; + i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; + i++) BUG_ON(h->s->blocks[i]); kfree(h->s); @@ -2328,20 +2302,18 @@ static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans, return 0; } -int bch2_check_stripe_to_lru_refs(struct bch_fs *c) +int bch2_check_stripe_to_lru_refs(struct btree_trans *trans) { struct bkey_buf last_flushed; - bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, + int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_stripe_to_lru_ref(trans, k, &last_flushed))); + bch2_check_stripe_to_lru_ref(trans, k, &last_flushed)); - bch2_bkey_buf_exit(&last_flushed, c); - bch_err_fn(c, ret); + bch2_bkey_buf_exit(&last_flushed, trans->c); + bch_err_fn(trans->c, ret); return ret; } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 51893e1ee874..e807e7027d7a 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -255,9 +255,10 @@ void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int); int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); + +struct alloc_request; struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, - unsigned, unsigned, unsigned, - enum bch_watermark, struct closure *); + struct alloc_request *, unsigned, struct closure *); void bch2_do_stripe_deletes(struct bch_fs *); void bch2_ec_do_stripe_creates(struct bch_fs 
*); @@ -287,7 +288,9 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, } } -int bch2_dev_remove_stripes(struct bch_fs *, unsigned); +int bch2_invalidate_stripe_to_dev(struct btree_trans *, struct btree_iter *, + struct bkey_s_c, unsigned, unsigned); +int bch2_dev_remove_stripes(struct bch_fs *, unsigned, unsigned); void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_fs_ec_stop(struct bch_fs *); @@ -301,6 +304,6 @@ void bch2_fs_ec_exit(struct bch_fs *); void bch2_fs_ec_init_early(struct bch_fs *); int bch2_fs_ec_init(struct bch_fs *); -int bch2_check_stripe_to_lru_refs(struct bch_fs *); +int bch2_check_stripe_to_lru_refs(struct btree_trans *); #endif /* _BCACHEFS_EC_H */ diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index 06144bfd9c19..809446c78951 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -4,9 +4,10 @@ #include "bcachefs_format.h" -struct bch_replicas_padded { +union bch_replicas_padded { + u8 bytes[struct_size_t(struct bch_replicas_entry_v1, + devs, BCH_BKEY_PTRS_MAX)]; struct bch_replicas_entry_v1 e; - u8 pad[BCH_BKEY_PTRS_MAX]; }; struct stripe { @@ -28,7 +29,7 @@ struct gc_stripe { u16 block_sectors[BCH_BKEY_PTRS_MAX]; struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; - struct bch_replicas_padded r; + union bch_replicas_padded r; }; #endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/fs/bcachefs/enumerated_ref.c b/fs/bcachefs/enumerated_ref.c new file mode 100644 index 000000000000..2ded74135977 --- /dev/null +++ b/fs/bcachefs/enumerated_ref.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "enumerated_ref.h" +#include "util.h" + +#include + +#ifdef ENUMERATED_REF_DEBUG +void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx) +{ + BUG_ON(idx >= ref->nr); + atomic_long_inc(&ref->refs[idx]); +} + +bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) +{ + BUG_ON(idx >= ref->nr); + return atomic_long_inc_not_zero(&ref->refs[idx]); +} + +bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) +{ + BUG_ON(idx >= ref->nr); + return !ref->dying && + atomic_long_inc_not_zero(&ref->refs[idx]); +} + +void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx) +{ + BUG_ON(idx >= ref->nr); + long v = atomic_long_dec_return(&ref->refs[idx]); + + BUG_ON(v < 0); + if (v) + return; + + for (unsigned i = 0; i < ref->nr; i++) + if (atomic_long_read(&ref->refs[i])) + return; + + if (ref->stop_fn) + ref->stop_fn(ref); + complete(&ref->stop_complete); +} +#endif + +#ifndef ENUMERATED_REF_DEBUG +static void enumerated_ref_kill_cb(struct percpu_ref *percpu_ref) +{ + struct enumerated_ref *ref = + container_of(percpu_ref, struct enumerated_ref, ref); + + if (ref->stop_fn) + ref->stop_fn(ref); + complete(&ref->stop_complete); +} +#endif + +void enumerated_ref_stop_async(struct enumerated_ref *ref) +{ + reinit_completion(&ref->stop_complete); + +#ifndef ENUMERATED_REF_DEBUG + percpu_ref_kill(&ref->ref); +#else + ref->dying = true; + for (unsigned i = 0; i < ref->nr; i++) + enumerated_ref_put(ref, i); +#endif +} + +void enumerated_ref_stop(struct enumerated_ref *ref, + const char * const names[]) +{ + enumerated_ref_stop_async(ref); + while (!wait_for_completion_timeout(&ref->stop_complete, HZ * 10)) { + CLASS(printbuf, buf)(); + prt_str(&buf, "Waited for 10 seconds to shutdown enumerated ref\n"); + prt_str(&buf, "Outstanding refs:\n"); + enumerated_ref_to_text(&buf, ref, names); + printk(KERN_ERR "%s", buf.buf); + } +} + +void 
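The new enumerated_ref above (implementation here, header below) is a drop-in for percpu_ref with an extra debug mode: with ENUMERATED_REF_DEBUG, each enumerated user gets its own atomic counter, so enumerated_ref_stop() can print exactly which user leaked when shutdown stalls. Typical lifecycle, using the API added by this patch (the frob enum and names are hypothetical):

enum frob_ref { FROB_REF_reader, FROB_REF_writer, FROB_REF_NR };
static const char * const frob_ref_names[] = { "reader", "writer", NULL };

static struct enumerated_ref frob_refs;

int frob_startup(void)
{
	int ret = enumerated_ref_init(&frob_refs, FROB_REF_NR, NULL);
	if (ret)
		return ret;

	/* refs are initialized dead (PERCPU_REF_INIT_DEAD): */
	enumerated_ref_start(&frob_refs);
	return 0;
}

void frob_shutdown(void)
{
	/* blocks; in debug mode, names outstanding users every 10 seconds */
	enumerated_ref_stop(&frob_refs, frob_ref_names);
	enumerated_ref_exit(&frob_refs);
}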
+{
+#ifndef ENUMERATED_REF_DEBUG
+	percpu_ref_reinit(&ref->ref);
+#else
+	ref->dying = false;
+	for (unsigned i = 0; i < ref->nr; i++) {
+		BUG_ON(atomic_long_read(&ref->refs[i]));
+		atomic_long_inc(&ref->refs[i]);
+	}
+#endif
+}
+
+void enumerated_ref_exit(struct enumerated_ref *ref)
+{
+#ifndef ENUMERATED_REF_DEBUG
+	percpu_ref_exit(&ref->ref);
+#else
+	kfree(ref->refs);
+	ref->refs = NULL;
+	ref->nr = 0;
+#endif
+}
+
+int enumerated_ref_init(struct enumerated_ref *ref, unsigned nr,
+			void (*stop_fn)(struct enumerated_ref *))
+{
+	init_completion(&ref->stop_complete);
+	ref->stop_fn = stop_fn;
+
+#ifndef ENUMERATED_REF_DEBUG
+	return percpu_ref_init(&ref->ref, enumerated_ref_kill_cb,
+			       PERCPU_REF_INIT_DEAD, GFP_KERNEL);
+#else
+	ref->refs = kzalloc(sizeof(ref->refs[0]) * nr, GFP_KERNEL);
+	if (!ref->refs)
+		return -ENOMEM;
+
+	ref->nr = nr;
+	return 0;
+#endif
+}
+
+void enumerated_ref_to_text(struct printbuf *out,
+			    struct enumerated_ref *ref,
+			    const char * const names[])
+{
+#ifdef ENUMERATED_REF_DEBUG
+	bch2_printbuf_tabstop_push(out, 32);
+
+	for (unsigned i = 0; i < ref->nr; i++)
+		prt_printf(out, "%s\t%li\n", names[i],
+			   atomic_long_read(&ref->refs[i]));
+#else
+	prt_str(out, "(not in debug mode)\n");
+#endif
+}
diff --git a/fs/bcachefs/enumerated_ref.h b/fs/bcachefs/enumerated_ref.h
new file mode 100644
index 000000000000..ec01cf59ef80
--- /dev/null
+++ b/fs/bcachefs/enumerated_ref.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ENUMERATED_REF_H
+#define _BCACHEFS_ENUMERATED_REF_H
+
+#include "enumerated_ref_types.h"
+
+/*
+ * A refcount where the users are enumerated: in debug mode, we create separate
+ * refcounts for each user, to make leaks and refcount errors easy to track
+ * down:
+ */
+
+#ifdef ENUMERATED_REF_DEBUG
+void enumerated_ref_get(struct enumerated_ref *, unsigned);
+bool __enumerated_ref_tryget(struct enumerated_ref *, unsigned);
+bool enumerated_ref_tryget(struct enumerated_ref *, unsigned);
+void enumerated_ref_put(struct enumerated_ref *, unsigned);
+#else
+
+static inline void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx)
+{
+	percpu_ref_get(&ref->ref);
+}
+
+static inline bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
+{
+	return percpu_ref_tryget(&ref->ref);
+}
+
+static inline bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
+{
+	return percpu_ref_tryget_live(&ref->ref);
+}
+
+static inline void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx)
+{
+	percpu_ref_put(&ref->ref);
+}
+#endif
+
+static inline bool enumerated_ref_is_zero(struct enumerated_ref *ref)
+{
+#ifndef ENUMERATED_REF_DEBUG
+	return percpu_ref_is_zero(&ref->ref);
+#else
+	for (unsigned i = 0; i < ref->nr; i++)
+		if (atomic_long_read(&ref->refs[i]))
+			return false;
+	return true;
+#endif
+}
+
+void enumerated_ref_stop_async(struct enumerated_ref *);
+void enumerated_ref_stop(struct enumerated_ref *, const char * const[]);
+void enumerated_ref_start(struct enumerated_ref *);
+
+void enumerated_ref_exit(struct enumerated_ref *);
+int enumerated_ref_init(struct enumerated_ref *, unsigned,
+			void (*stop_fn)(struct enumerated_ref *));
+
+struct printbuf;
+void enumerated_ref_to_text(struct printbuf *,
+			    struct enumerated_ref *,
+			    const char * const[]);
+
+#endif /* _BCACHEFS_ENUMERATED_REF_H */
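/*
 * An illustrative lifecycle sketch for the interface above; the enum, the
 * names[] array and the hypothetical users are not part of this patch:
 *
 *	enum { MY_REF_scan, MY_REF_io, MY_REF_NR };
 *	static const char * const my_ref_names[] = { "scan", "io", NULL };
 *	struct enumerated_ref r;
 *
 *	enumerated_ref_init(&r, MY_REF_NR, NULL);	// initialized dead
 *	enumerated_ref_start(&r);
 *
 *	if (enumerated_ref_tryget(&r, MY_REF_io)) {
 *		// ... do work ...
 *		enumerated_ref_put(&r, MY_REF_io);
 *	}
 *
 *	enumerated_ref_stop(&r, my_ref_names);	// dumps holders if stuck, in debug mode
 *	enumerated_ref_exit(&r);
 */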
diff --git a/fs/bcachefs/enumerated_ref_types.h b/fs/bcachefs/enumerated_ref_types.h
new file mode 100644
index 000000000000..0e6076f466d3
--- /dev/null
+++ b/fs/bcachefs/enumerated_ref_types.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ENUMERATED_REF_TYPES_H
+#define _BCACHEFS_ENUMERATED_REF_TYPES_H
+
+#include <linux/percpu-refcount.h>
+
+struct enumerated_ref {
+#ifdef ENUMERATED_REF_DEBUG
+	unsigned		nr;
+	bool			dying;
+	atomic_long_t		*refs;
+#else
+	struct percpu_ref	ref;
+#endif
+	void			(*stop_fn)(struct enumerated_ref *);
+	struct completion	stop_complete;
+};
+
+#endif /* _BCACHEFS_ENUMERATED_REF_TYPES_H */
diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
index 43557bebd0f8..86264b8c343c 100644
--- a/fs/bcachefs/errcode.c
+++ b/fs/bcachefs/errcode.c
@@ -13,19 +13,21 @@ static const char * const bch2_errcode_strs[] = {
 	NULL
 };
 
-static unsigned bch2_errcode_parents[] = {
+static const unsigned bch2_errcode_parents[] = {
 #define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
 	BCH_ERRCODES()
 #undef x
 };
 
+__attribute__((const))
 const char *bch2_err_str(int err)
 {
 	const char *errstr;
 
 	err = abs(err);
 
-	BUG_ON(err >= BCH_ERR_MAX);
+	if (err >= BCH_ERR_MAX)
+		return "(Invalid error)";
 
 	if (err >= BCH_ERR_START)
 		errstr = bch2_errcode_strs[err - BCH_ERR_START];
@@ -36,6 +38,7 @@ const char *bch2_err_str(int err)
 	return errstr ?: "(Invalid error)";
 }
 
+__attribute__((const))
 bool __bch2_err_matches(int err, int class)
 {
 	err = abs(err);
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index d9ebffa5b3a2..adc1f9315eab 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -5,6 +5,7 @@
 #define BCH_ERRCODES()							\
 	x(ERANGE,			ERANGE_option_too_small)	\
 	x(ERANGE,			ERANGE_option_too_big)		\
+	x(ERANGE,			projid_too_big)			\
 	x(EINVAL,			injected)			\
 	x(BCH_ERR_injected,		injected_fs_start)		\
 	x(EINVAL,			mount_option)			\
@@ -53,6 +54,7 @@
 	x(ENOMEM,			ENOMEM_dio_write_bioset_init)	\
 	x(ENOMEM,			ENOMEM_nocow_flush_bioset_init)	\
 	x(ENOMEM,			ENOMEM_promote_table_init)	\
+	x(ENOMEM,			ENOMEM_async_obj_init)		\
 	x(ENOMEM,			ENOMEM_compression_bounce_read_init)	\
 	x(ENOMEM,			ENOMEM_compression_bounce_write_init)	\
 	x(ENOMEM,			ENOMEM_compression_workspace_init)	\
@@ -88,6 +90,8 @@
 	x(ENOMEM,			ENOMEM_disk_accounting)		\
 	x(ENOMEM,			ENOMEM_stripe_head_alloc)	\
 	x(ENOMEM,			ENOMEM_journal_read_bucket)	\
+	x(ENOMEM,			ENOMEM_acl)			\
+	x(ENOMEM,			ENOMEM_move_extent)		\
 	x(ENOSPC,			ENOSPC_disk_reservation)	\
 	x(ENOSPC,			ENOSPC_bucket_alloc)		\
 	x(ENOSPC,			ENOSPC_disk_label_add)		\
@@ -115,6 +119,7 @@
 	x(ENOENT,			ENOENT_not_directory)		\
 	x(ENOENT,			ENOENT_directory_dead)		\
 	x(ENOENT,			ENOENT_subvolume)		\
+	x(ENOENT,			ENOENT_snapshot)		\
 	x(ENOENT,			ENOENT_snapshot_tree)		\
 	x(ENOENT,			ENOENT_dirent_doesnt_match_inode)	\
 	x(ENOENT,			ENOENT_dev_not_found)		\
@@ -136,7 +141,6 @@
 	x(BCH_ERR_transaction_restart,	transaction_restart_relock)	\
 	x(BCH_ERR_transaction_restart,	transaction_restart_relock_path)	\
 	x(BCH_ERR_transaction_restart,	transaction_restart_relock_path_intent)	\
-	x(BCH_ERR_transaction_restart,	transaction_restart_relock_after_fill)	\
 	x(BCH_ERR_transaction_restart,	transaction_restart_too_many_iters)	\
 	x(BCH_ERR_transaction_restart,	transaction_restart_lock_node_reused)	\
 	x(BCH_ERR_transaction_restart,	transaction_restart_fill_relock)	\
@@ -147,11 +151,8 @@
 	x(BCH_ERR_transaction_restart,	transaction_restart_would_deadlock_write)\
 	x(BCH_ERR_transaction_restart,	transaction_restart_deadlock_recursion_limit)\
 	x(BCH_ERR_transaction_restart,	transaction_restart_upgrade)	\
-	x(BCH_ERR_transaction_restart,	transaction_restart_key_cache_upgrade)	\
 	x(BCH_ERR_transaction_restart,	transaction_restart_key_cache_fill)	\
 	x(BCH_ERR_transaction_restart,	transaction_restart_key_cache_raced)	\
-
x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\ - x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \ x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \ x(BCH_ERR_transaction_restart, transaction_restart_nested) \ @@ -174,16 +175,19 @@ x(0, backpointer_to_overwritten_btree_node) \ x(0, journal_reclaim_would_deadlock) \ x(EINVAL, fsck) \ + x(BCH_ERR_fsck, fsck_ask) \ x(BCH_ERR_fsck, fsck_fix) \ x(BCH_ERR_fsck, fsck_delete_bkey) \ x(BCH_ERR_fsck, fsck_ignore) \ x(BCH_ERR_fsck, fsck_errors_not_fixed) \ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ x(BCH_ERR_fsck, fsck_repair_impossible) \ - x(EINVAL, restart_recovery) \ - x(EINVAL, not_in_recovery) \ - x(EINVAL, cannot_rewind_recovery) \ + x(EINVAL, recovery_will_run) \ + x(BCH_ERR_recovery_will_run, restart_recovery) \ + x(BCH_ERR_recovery_will_run, cannot_rewind_recovery) \ + x(BCH_ERR_recovery_will_run, recovery_pass_will_run) \ x(0, data_update_done) \ + x(0, bkey_was_deleted) \ x(BCH_ERR_data_update_done, data_update_done_would_block) \ x(BCH_ERR_data_update_done, data_update_done_unwritten) \ x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ @@ -201,6 +205,7 @@ x(EINVAL, device_has_been_removed) \ x(EINVAL, device_splitbrain) \ x(EINVAL, device_already_online) \ + x(EINVAL, filesystem_uuid_already_open) \ x(EINVAL, insufficient_devices_to_start) \ x(EINVAL, invalid) \ x(EINVAL, internal_fsck_err) \ @@ -209,9 +214,22 @@ x(EINVAL, remove_would_lose_data) \ x(EINVAL, no_resize_with_buckets_nouse) \ x(EINVAL, inode_unpack_error) \ + x(EINVAL, inode_not_unlinked) \ + x(EINVAL, inode_has_child_snapshot) \ x(EINVAL, varint_decode_error) \ x(EINVAL, erasure_coding_found_btree_node) \ + x(EINVAL, option_negative) \ + x(EINVAL, topology_repair) \ + x(BCH_ERR_topology_repair, topology_repair_drop_this_node) \ + x(BCH_ERR_topology_repair, topology_repair_drop_prev_node) \ + x(BCH_ERR_topology_repair, topology_repair_did_fill_from_scan) \ x(EOPNOTSUPP, may_not_use_incompat_feature) \ + x(EOPNOTSUPP, no_casefolding_without_utf8) \ + x(EOPNOTSUPP, casefolding_disabled) \ + x(EOPNOTSUPP, casefold_opt_is_dir_only) \ + x(EOPNOTSUPP, unsupported_fsx_flag) \ + x(EOPNOTSUPP, unsupported_fa_flag) \ + x(EOPNOTSUPP, unsupported_fallocate_mode) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ @@ -219,6 +237,8 @@ x(EROFS, erofs_unfixed_errors) \ x(EROFS, erofs_norecovery) \ x(EROFS, erofs_nochanges) \ + x(EROFS, erofs_no_alloc_info) \ + x(EROFS, erofs_filesystem_full) \ x(EROFS, insufficient_devices) \ x(0, operation_blocked) \ x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ @@ -231,7 +251,6 @@ x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \ x(BCH_ERR_journal_res_blocked, journal_stuck) \ x(BCH_ERR_journal_res_blocked, journal_retry_open) \ - x(BCH_ERR_journal_res_blocked, journal_preres_get_blocked) \ x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \ x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \ x(BCH_ERR_invalid, invalid_sb) \ @@ -277,7 +296,6 @@ x(EIO, sb_not_downgraded) \ x(EIO, btree_node_write_all_failed) \ x(EIO, btree_node_read_error) \ - x(EIO, btree_node_read_validate_error) \ x(EIO, btree_need_topology_repair) \ x(EIO, bucket_ref_update) \ x(EIO, trigger_alloc) \ @@ -352,9 +370,11 @@ enum bch_errcode { BCH_ERR_MAX }; -const char *bch2_err_str(int); -bool __bch2_err_matches(int, int); +__attribute__((const)) const char 
*bch2_err_str(int); +__attribute__((const)) bool __bch2_err_matches(int, int); + +__attribute__((const)) static inline bool _bch2_err_matches(int err, int class) { return err < 0 && __bch2_err_matches(err, class); diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 6b8695b1349c..32a286b3a74e 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -11,12 +11,12 @@ #define FSCK_ERR_RATELIMIT_NR 10 -void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out) +void __bch2_log_msg_start(const char *fs_or_dev_name, struct printbuf *out) { printbuf_indent_add_nextline(out, 2); #ifdef BCACHEFS_LOG_PREFIX - prt_printf(out, bch2_log_msg(c, "")); + prt_printf(out, "bcachefs (%s): ", fs_or_dev_name); #endif } @@ -29,12 +29,10 @@ bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out) return false; case BCH_ON_ERROR_fix_safe: case BCH_ON_ERROR_ro: - if (bch2_fs_emergency_read_only(c)) - prt_printf(out, "inconsistency detected - emergency read only at journal seq %llu\n", - journal_cur_seq(&c->journal)); + bch2_fs_emergency_read_only2(c, out); return true; case BCH_ON_ERROR_panic: - bch2_print_string_as_lines_nonblocking(KERN_ERR, out->buf); + bch2_print_str(c, KERN_ERR, out->buf); panic(bch2_fmt(c, "panic after error")); return true; default: @@ -44,15 +42,14 @@ bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out) bool bch2_inconsistent_error(struct bch_fs *c) { - struct printbuf buf = PRINTBUF; - buf.atomic++; + CLASS(printbuf, buf)(); + guard(printbuf_atomic)(&buf); printbuf_indent_add_nextline(&buf, 2); bool ret = __bch2_inconsistent_error(c, &buf); if (ret) bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); return ret; } @@ -60,8 +57,8 @@ __printf(3, 0) static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *trans, const char *fmt, va_list args) { - struct printbuf buf = PRINTBUF; - buf.atomic++; + CLASS(printbuf, buf)(); + guard(printbuf_atomic)(&buf); bch2_log_msg_start(c, &buf); @@ -71,9 +68,7 @@ static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *tra if (trans) bch2_trans_updates_to_text(&buf, trans); bool ret = __bch2_inconsistent_error(c, &buf); - bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf); - - printbuf_exit(&buf); + bch2_print_str(c, KERN_ERR, buf.buf); return ret; } @@ -100,19 +95,18 @@ int __bch2_topology_error(struct bch_fs *c, struct printbuf *out) prt_printf(out, "btree topology error: "); set_bit(BCH_FS_topology_error, &c->flags); - if (!test_bit(BCH_FS_recovery_running, &c->flags)) { + if (!test_bit(BCH_FS_in_recovery, &c->flags)) { __bch2_inconsistent_error(c, out); - return -BCH_ERR_btree_need_topology_repair; + return bch_err_throw(c, btree_need_topology_repair); } else { - return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: - -BCH_ERR_btree_node_read_validate_error; + return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?: + bch_err_throw(c, btree_need_topology_repair); } } int bch2_fs_topology_error(struct bch_fs *c, const char *fmt, ...) { - struct printbuf buf = PRINTBUF; - + CLASS(printbuf, buf)(); bch2_log_msg_start(c, &buf); va_list args; @@ -121,9 +115,7 @@ int bch2_fs_topology_error(struct bch_fs *c, const char *fmt, ...) 
va_end(args); int ret = __bch2_topology_error(c, &buf); - bch2_print_string_as_lines(KERN_ERR, buf.buf); - - printbuf_exit(&buf); + bch2_print_str(c, KERN_ERR, buf.buf); return ret; } @@ -140,28 +132,28 @@ void bch2_io_error_work(struct work_struct *work) /* XXX: if it's reads or checksums that are failing, set it to failed */ - down_write(&c->state_lock); + guard(rwsem_write)(&c->state_lock); unsigned long write_errors_start = READ_ONCE(ca->write_errors_start); if (write_errors_start && time_after(jiffies, write_errors_start + c->opts.write_error_timeout * HZ)) { if (ca->mi.state >= BCH_MEMBER_STATE_ro) - goto out; + return; bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, BCH_FORCE_IF_DEGRADED); + CLASS(printbuf, buf)(); + __bch2_log_msg_start(ca->name, &buf); - bch_err(ca, - "writes erroring for %u seconds, setting %s ro", + prt_printf(&buf, "writes erroring for %u seconds, setting %s ro", c->opts.write_error_timeout, dev ? "device" : "filesystem"); if (!dev) - bch2_fs_emergency_read_only(c); + bch2_fs_emergency_read_only2(c, &buf); + bch2_print_str(c, KERN_ERR, buf.buf); } -out: - up_write(&c->state_lock); } void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type) @@ -328,7 +320,7 @@ static int do_fsck_ask_yn(struct bch_fs *c, if (bch2_fs_stdio_redirect(c)) bch2_print(c, "%s", question->buf); else - bch2_print_string_as_lines(KERN_ERR, question->buf); + bch2_print_str(c, KERN_ERR, question->buf); int ask = bch2_fsck_ask_yn(c, trans); @@ -376,15 +368,63 @@ static struct fsck_err_state *count_fsck_err_locked(struct bch_fs *c, return s; } -void __bch2_count_fsck_err(struct bch_fs *c, - enum bch_sb_error_id id, const char *msg, - bool *repeat, bool *print, bool *suppress) +bool __bch2_count_fsck_err(struct bch_fs *c, + enum bch_sb_error_id id, struct printbuf *msg) { bch2_sb_error_count(c, id); - mutex_lock(&c->fsck_error_msgs_lock); - count_fsck_err_locked(c, id, msg, repeat, print, suppress); - mutex_unlock(&c->fsck_error_msgs_lock); + bool print = true, repeat = false, suppress = false; + + scoped_guard(mutex, &c->fsck_error_msgs_lock) + count_fsck_err_locked(c, id, msg->buf, &repeat, &print, &suppress); + + if (suppress) + prt_printf(msg, "Ratelimiting new instances of previous error\n"); + + return print && !repeat; +} + +int bch2_fsck_err_opt(struct bch_fs *c, + enum bch_fsck_flags flags, + enum bch_sb_error_id err) +{ + if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) + flags |= fsck_flags_extra[err]; + + if (test_bit(BCH_FS_in_fsck, &c->flags) || + test_bit(BCH_FS_in_recovery, &c->flags)) { + if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) + return bch_err_throw(c, fsck_repair_unimplemented); + + switch (c->opts.fix_errors) { + case FSCK_FIX_exit: + return bch_err_throw(c, fsck_errors_not_fixed); + case FSCK_FIX_yes: + if (flags & FSCK_CAN_FIX) + return bch_err_throw(c, fsck_fix); + fallthrough; + case FSCK_FIX_no: + if (flags & FSCK_CAN_IGNORE) + return bch_err_throw(c, fsck_ignore); + return bch_err_throw(c, fsck_errors_not_fixed); + case FSCK_FIX_ask: + if (flags & FSCK_AUTOFIX) + return bch_err_throw(c, fsck_fix); + return bch_err_throw(c, fsck_ask); + default: + BUG(); + } + } else { + if ((flags & FSCK_AUTOFIX) && + (c->opts.errors == BCH_ON_ERROR_continue || + c->opts.errors == BCH_ON_ERROR_fix_safe)) + return bch_err_throw(c, fsck_fix); + + if (c->opts.errors == BCH_ON_ERROR_continue && + (flags & FSCK_CAN_IGNORE)) + return bch_err_throw(c, fsck_ignore); + return bch_err_throw(c, fsck_errors_not_fixed); + } } int __bch2_fsck_err(struct bch_fs *c, 
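/*
 * For context, a sketch of how these return codes are consumed at a typical
 * fsck call site via the existing fsck_err_on() idiom; the condition, the
 * error name and the repair below are illustrative, not from this patch:
 *
 *	if (fsck_err_on(gen != a->gen, trans,
 *			bucket_gen_wrong,
 *			"bucket gen %u != alloc key gen %u", gen, a->gen)) {
 *		// returned true: the error resolved to BCH_ERR_fsck_fix,
 *		// so apply the repair
 *		a->gen = gen;
 *	}
 *	...
 * fsck_err:
 *	// fsck_errors_not_fixed and friends unwind to here via ret
 *	return ret;
 */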
@@ -394,8 +434,9 @@ int __bch2_fsck_err(struct bch_fs *c, const char *fmt, ...) { va_list args; - struct printbuf buf = PRINTBUF, *out = &buf; - int ret = -BCH_ERR_fsck_ignore; + CLASS(printbuf, buf)(); + struct printbuf *out = &buf; + int ret = 0; const char *action_orig = "fix?", *action = action_orig; might_sleep(); @@ -423,10 +464,13 @@ int __bch2_fsck_err(struct bch_fs *c, !trans && bch2_current_has_btree_trans(c)); - if (test_bit(err, c->sb.errors_silent)) - return flags & FSCK_CAN_FIX - ? -BCH_ERR_fsck_fix - : -BCH_ERR_fsck_ignore; + if ((flags & FSCK_ERR_SILENT) || + test_bit(err, c->sb.errors_silent)) { + ret = flags & FSCK_CAN_FIX + ? bch_err_throw(c, fsck_fix) + : bch_err_throw(c, fsck_ignore); + goto err; + } printbuf_indent_add_nextline(out, 2); @@ -468,14 +512,14 @@ int __bch2_fsck_err(struct bch_fs *c, prt_str(out, ", "); if (flags & FSCK_CAN_FIX) { prt_actioning(out, action); - ret = -BCH_ERR_fsck_fix; + ret = bch_err_throw(c, fsck_fix); } else { prt_str(out, ", continuing"); - ret = -BCH_ERR_fsck_ignore; + ret = bch_err_throw(c, fsck_ignore); } goto print; - } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) { + } else if (!test_bit(BCH_FS_in_fsck, &c->flags)) { if (c->opts.errors != BCH_ON_ERROR_continue || !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { prt_str_indented(out, ", shutting down\n" @@ -483,18 +527,18 @@ int __bch2_fsck_err(struct bch_fs *c, "run fsck, and forward to devs so error can be marked for self-healing"); inconsistent = true; print = true; - ret = -BCH_ERR_fsck_errors_not_fixed; + ret = bch_err_throw(c, fsck_errors_not_fixed); } else if (flags & FSCK_CAN_FIX) { prt_str(out, ", "); prt_actioning(out, action); - ret = -BCH_ERR_fsck_fix; + ret = bch_err_throw(c, fsck_fix); } else { prt_str(out, ", continuing"); - ret = -BCH_ERR_fsck_ignore; + ret = bch_err_throw(c, fsck_ignore); } } else if (c->opts.fix_errors == FSCK_FIX_exit) { prt_str(out, ", exiting"); - ret = -BCH_ERR_fsck_errors_not_fixed; + ret = bch_err_throw(c, fsck_errors_not_fixed); } else if (flags & FSCK_CAN_FIX) { int fix = s && s->fix ? s->fix @@ -513,30 +557,37 @@ int __bch2_fsck_err(struct bch_fs *c, : FSCK_FIX_yes; ret = ret & 1 - ? -BCH_ERR_fsck_fix - : -BCH_ERR_fsck_ignore; + ? 
bch_err_throw(c, fsck_fix) + : bch_err_throw(c, fsck_ignore); } else if (fix == FSCK_FIX_yes || (c->opts.nochanges && !(flags & FSCK_CAN_IGNORE))) { prt_str(out, ", "); prt_actioning(out, action); - ret = -BCH_ERR_fsck_fix; + ret = bch_err_throw(c, fsck_fix); } else { prt_str(out, ", not "); prt_actioning(out, action); + ret = bch_err_throw(c, fsck_ignore); + } + } else { + if (flags & FSCK_CAN_IGNORE) { + prt_str(out, ", continuing"); + ret = bch_err_throw(c, fsck_ignore); + } else { + prt_str(out, " (repair unimplemented)"); + ret = bch_err_throw(c, fsck_repair_unimplemented); } - } else if (!(flags & FSCK_CAN_IGNORE)) { - prt_str(out, " (repair unimplemented)"); } - if (ret == -BCH_ERR_fsck_ignore && + if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) && (c->opts.fix_errors == FSCK_FIX_exit || !(flags & FSCK_CAN_IGNORE))) - ret = -BCH_ERR_fsck_errors_not_fixed; + ret = bch_err_throw(c, fsck_errors_not_fixed); - if (test_bit(BCH_FS_fsck_running, &c->flags) && - (ret != -BCH_ERR_fsck_fix && - ret != -BCH_ERR_fsck_ignore)) { + if (test_bit(BCH_FS_in_fsck, &c->flags) && + (!bch2_err_matches(ret, BCH_ERR_fsck_fix) && + !bch2_err_matches(ret, BCH_ERR_fsck_ignore))) { exiting = true; print = true; } @@ -559,31 +610,37 @@ int __bch2_fsck_err(struct bch_fs *c, if (bch2_fs_stdio_redirect(c)) bch2_print(c, "%s", out->buf); else - bch2_print_string_as_lines(KERN_ERR, out->buf); + bch2_print_str(c, KERN_ERR, out->buf); } if (s) s->ret = ret; +err_unlock: + mutex_unlock(&c->fsck_error_msgs_lock); +err: + if (trans && + !(flags & FSCK_ERR_NO_LOG) && + ret == -BCH_ERR_fsck_fix) + ret = bch2_trans_log_str(trans, bch2_sb_error_strs[err]) ?: ret; /* * We don't yet track whether the filesystem currently has errors, for * log_fsck_err()s: that would require us to track for every error type * which recovery pass corrects it, to get the fsck exit status correct: */ - if (flags & FSCK_CAN_FIX) { - if (ret == -BCH_ERR_fsck_fix) { - set_bit(BCH_FS_errors_fixed, &c->flags); - } else { - set_bit(BCH_FS_errors_not_fixed, &c->flags); - set_bit(BCH_FS_error, &c->flags); - } + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + /* nothing */ + } else if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) { + set_bit(BCH_FS_errors_fixed, &c->flags); + } else { + set_bit(BCH_FS_errors_not_fixed, &c->flags); + set_bit(BCH_FS_error, &c->flags); } -err_unlock: - mutex_unlock(&c->fsck_error_msgs_lock); -err: + if (action != action_orig) kfree(action); - printbuf_exit(&buf); + + BUG_ON(!ret); return ret; } @@ -601,19 +658,19 @@ int __bch2_bkey_fsck_err(struct bch_fs *c, const char *fmt, ...) 
{ if (from.flags & BCH_VALIDATE_silent) - return -BCH_ERR_fsck_delete_bkey; + return bch_err_throw(c, fsck_delete_bkey); unsigned fsck_flags = 0; if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) { if (test_bit(err, c->sb.errors_silent)) - return -BCH_ERR_fsck_delete_bkey; + return bch_err_throw(c, fsck_delete_bkey); fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX; } if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) fsck_flags |= fsck_flags_extra[err]; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); prt_printf(&buf, "invalid bkey in %s", bch2_bkey_validate_contexts[from.from]); @@ -634,7 +691,6 @@ int __bch2_bkey_fsck_err(struct bch_fs *c, va_end(args); int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s, delete?", buf.buf); - printbuf_exit(&buf); return ret; } @@ -642,7 +698,7 @@ static void __bch2_flush_fsck_errs(struct bch_fs *c, bool print) { struct fsck_err_state *s, *n; - mutex_lock(&c->fsck_error_msgs_lock); + guard(mutex)(&c->fsck_error_msgs_lock); list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) { if (print && s->ratelimited && s->last_msg) @@ -652,8 +708,6 @@ static void __bch2_flush_fsck_errs(struct bch_fs *c, bool print) kfree(s->last_msg); kfree(s); } - - mutex_unlock(&c->fsck_error_msgs_lock); } void bch2_flush_fsck_errs(struct bch_fs *c) @@ -687,31 +741,16 @@ int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *o void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum, u64 offset) { - bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); + CLASS(btree_trans, trans)(c); + lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); } int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, struct bpos pos) { - struct bch_fs *c = trans->c; - int ret = 0; - - if (!bch2_snapshot_is_leaf(c, pos.snapshot)) - prt_str(out, "(multiple snapshots) "); - - subvol_inum inum = { - .subvol = bch2_snapshot_tree_oldest_subvol(c, pos.snapshot), - .inum = pos.inode, - }; - - if (inum.subvol) { - ret = bch2_inum_to_path(trans, inum, out); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - } - - if (!inum.subvol || ret) - prt_printf(out, "inum %llu:%u", pos.inode, pos.snapshot); + int ret = bch2_inum_snapshot_to_path(trans, pos.inode, pos.snapshot, NULL, out); + if (ret) + return ret; prt_printf(out, " offset %llu: ", pos.offset << 8); return 0; @@ -720,5 +759,6 @@ int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printb void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out, struct bpos pos) { - bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos)); + CLASS(btree_trans, trans)(c); + lockrestart_do(trans, bch2_inum_snap_offset_err_msg_trans(trans, out, pos)); } diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 4a364fd44abe..0c3c3a24fc6f 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -18,7 +18,12 @@ struct work_struct; /* Error messages: */ -void bch2_log_msg_start(struct bch_fs *, struct printbuf *); +void __bch2_log_msg_start(const char *, struct printbuf *); + +static inline void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out) +{ + __bch2_log_msg_start(c->name, out); +} /* * Inconsistency errors: The on disk data is inconsistent. 
If these occur during @@ -76,12 +81,14 @@ struct fsck_err_state { #define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) -void __bch2_count_fsck_err(struct bch_fs *, - enum bch_sb_error_id, const char *, - bool *, bool *, bool *); +bool __bch2_count_fsck_err(struct bch_fs *, enum bch_sb_error_id, struct printbuf *); #define bch2_count_fsck_err(_c, _err, ...) \ __bch2_count_fsck_err(_c, BCH_FSCK_ERR_##_err, __VA_ARGS__) +int bch2_fsck_err_opt(struct bch_fs *, + enum bch_fsck_flags, + enum bch_sb_error_id); + __printf(5, 6) __cold int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, enum bch_fsck_flags, @@ -98,13 +105,13 @@ void bch2_free_fsck_errs(struct bch_fs *); #define fsck_err_wrap(_do) \ ({ \ int _ret = _do; \ - if (_ret != -BCH_ERR_fsck_fix && \ - _ret != -BCH_ERR_fsck_ignore) { \ + if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \ + !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) { \ ret = _ret; \ goto fsck_err; \ } \ \ - _ret == -BCH_ERR_fsck_fix; \ + bch2_err_matches(_ret, BCH_ERR_fsck_fix); \ }) #define __fsck_err(...) fsck_err_wrap(bch2_fsck_err(__VA_ARGS__)) @@ -163,10 +170,10 @@ do { \ int _ret = __bch2_bkey_fsck_err(c, k, from, \ BCH_FSCK_ERR_##_err_type, \ _err_msg, ##__VA_ARGS__); \ - if (_ret != -BCH_ERR_fsck_fix && \ - _ret != -BCH_ERR_fsck_ignore) \ + if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \ + !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) \ ret = _ret; \ - ret = -BCH_ERR_fsck_delete_bkey; \ + ret = bch_err_throw(c, fsck_delete_bkey); \ goto fsck_err; \ } while (0) diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 6bb42985306e..c4b0ea1adaa8 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -37,16 +37,17 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) return lru + ret * 2; } +#define EXTENT_ITERS_MAX 64 + static int count_iters_for_insert(struct btree_trans *trans, struct bkey_s_c k, unsigned offset, struct bpos *end, - unsigned *nr_iters, - unsigned max_iters) + unsigned *nr_iters) { int ret = 0, ret2 = 0; - if (*nr_iters >= max_iters) { + if (*nr_iters >= EXTENT_ITERS_MAX) { *end = bpos_min(*end, k.k->p); ret = 1; } @@ -56,7 +57,7 @@ static int count_iters_for_insert(struct btree_trans *trans, case KEY_TYPE_reflink_v: *nr_iters += bch2_bkey_nr_alloc_ptrs(k); - if (*nr_iters >= max_iters) { + if (*nr_iters >= EXTENT_ITERS_MAX) { *end = bpos_min(*end, k.k->p); ret = 1; } @@ -67,7 +68,6 @@ static int count_iters_for_insert(struct btree_trans *trans, u64 idx = REFLINK_P_IDX(p.v); unsigned sectors = bpos_min(*end, p.k->p).offset - bkey_start_offset(p.k); - struct btree_iter iter; struct bkey_s_c r_k; for_each_btree_key_norestart(trans, iter, @@ -81,17 +81,15 @@ static int count_iters_for_insert(struct btree_trans *trans, *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); - if (*nr_iters >= max_iters) { + if (*nr_iters >= EXTENT_ITERS_MAX) { struct bpos pos = bkey_start_pos(k.k); pos.offset += min_t(u64, k.k->size, r_k.k->p.offset - idx); *end = bpos_min(*end, pos); - ret = 1; - break; + return 1; } } - bch2_trans_iter_exit(trans, &iter); break; } @@ -100,60 +98,32 @@ static int count_iters_for_insert(struct btree_trans *trans, return ret2 ?: ret; } -#define EXTENT_ITERS_MAX (BTREE_ITER_INITIAL / 3) - int bch2_extent_atomic_end(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert, struct bpos *end) { - struct btree_iter copy; - struct bkey_s_c k; unsigned nr_iters = 0; - int ret; - - ret = bch2_btree_iter_traverse(trans, iter); - if (ret) - return ret; - 
-	*end = insert->k.p;
-	/* extent_update_to_keys(): */
-	nr_iters += 1;
-
-	ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end,
-				     &nr_iters, EXTENT_ITERS_MAX / 2);
-	if (ret < 0)
-		return ret;
+	struct btree_iter copy;
+	bch2_trans_copy_iter(&copy, iter);
-	bch2_trans_copy_iter(trans, &copy, iter);
+	int ret = bch2_btree_iter_traverse(&copy);
+	if (ret)
+		goto err;
-	for_each_btree_key_max_continue_norestart(trans, copy, insert->k.p, 0, k, ret) {
+	struct bkey_s_c k;
+	for_each_btree_key_max_continue_norestart(copy, *end, 0, k, ret) {
 		unsigned offset = 0;
-		if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k)))
-			offset = bkey_start_offset(&insert->k) -
-				bkey_start_offset(k.k);
-
-		/* extent_handle_overwrites(): */
-		switch (bch2_extent_overlap(&insert->k, k.k)) {
-		case BCH_EXTENT_OVERLAP_ALL:
-		case BCH_EXTENT_OVERLAP_FRONT:
-			nr_iters += 1;
-			break;
-		case BCH_EXTENT_OVERLAP_BACK:
-		case BCH_EXTENT_OVERLAP_MIDDLE:
-			nr_iters += 2;
-			break;
-		}
+		if (bkey_gt(iter->pos, bkey_start_pos(k.k)))
+			offset = iter->pos.offset - bkey_start_offset(k.k);
-		ret = count_iters_for_insert(trans, k, offset, end,
-					     &nr_iters, EXTENT_ITERS_MAX);
+		ret = count_iters_for_insert(trans, k, offset, end, &nr_iters);
 		if (ret)
 			break;
 	}
-
-	bch2_trans_iter_exit(trans, &copy);
+err:
+	bch2_trans_iter_exit(&copy);
 
 	return ret < 0 ? ret : 0;
 }
@@ -161,13 +131,22 @@ int bch2_extent_trim_atomic(struct btree_trans *trans,
 			    struct btree_iter *iter,
 			    struct bkey_i *k)
 {
-	struct bpos end;
-	int ret;
-
-	ret = bch2_extent_atomic_end(trans, iter, k, &end);
+	struct bpos end = k->k.p;
+	int ret = bch2_extent_atomic_end(trans, iter, &end);
 	if (ret)
 		return ret;
 
-	bch2_cut_back(end, k);
+	/* tracepoint */
+
+	if (bpos_lt(end, k->k.p)) {
+		if (trace_extent_trim_atomic_enabled()) {
+			CLASS(printbuf, buf)();
+			bch2_bpos_to_text(&buf, end);
+			prt_newline(&buf);
+			bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k));
+			trace_extent_trim_atomic(trans->c, buf.buf);
+		}
+		bch2_cut_back(end, k);
+	}
 	return 0;
 }
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
index 6f5cf449361a..34467db53f45 100644
--- a/fs/bcachefs/extent_update.h
+++ b/fs/bcachefs/extent_update.h
@@ -5,7 +5,7 @@
 #include "bcachefs.h"
 
 int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
-			   struct bkey_i *, struct bpos *);
+			   struct bpos *);
 int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *,
 			    struct bkey_i *);
 
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index e597fb9c9823..b879a586b7f6 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -45,6 +45,48 @@ static void bch2_extent_crc_pack(union bch_extent_crc *,
 				 struct bch_extent_crc_unpacked,
 				 enum bch_extent_entry_type);
 
+void bch2_io_failures_to_text(struct printbuf *out,
+			      struct bch_fs *c,
+			      struct bch_io_failures *failed)
+{
+	static const char * const error_types[] = {
+		"btree validate", "io", "checksum", "ec reconstruct", NULL
+	};
+
+	for (struct bch_dev_io_failures *f = failed->devs;
+	     f < failed->devs + failed->nr;
+	     f++) {
+		unsigned errflags =
+			((!!f->failed_btree_validate)	<< 0) |
+			((!!f->failed_io)		<< 1) |
+			((!!f->failed_csum_nr)		<< 2) |
+			((!!f->failed_ec)		<< 3);
+
+		bch2_printbuf_make_room(out, 1024);
+		scoped_guard(rcu) {
+			guard(printbuf_atomic)(out);
+			struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev);
+			if (ca)
+				prt_str(out, ca->name);
+			else
+				prt_printf(out, "(invalid device %u)", f->dev);
+		}
+
+		prt_char(out, ' ');
+
+		if (!errflags) {
+			prt_str(out, "no error - confused");
+		} else if
(is_power_of_2(errflags)) { + prt_bitflags(out, error_types, errflags); + prt_str(out, " error"); + } else { + prt_str(out, "errors: "); + prt_bitflags(out, error_types, errflags); + } + prt_newline(out); + } +} + struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f, unsigned dev) { @@ -79,6 +121,22 @@ void bch2_mark_io_failure(struct bch_io_failures *failed, f->failed_csum_nr++; } +void bch2_mark_btree_validate_failure(struct bch_io_failures *failed, + unsigned dev) +{ + struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, dev); + + if (!f) { + BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); + + f = &failed->devs[failed->nr++]; + memset(f, 0, sizeof(*f)); + f->dev = dev; + } + + f->failed_btree_validate = true; +} + static inline u64 dev_latency(struct bch_dev *ca) { return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX; @@ -105,7 +163,7 @@ static inline bool ptr_better(struct bch_fs *c, if (unlikely(failed_delta)) return failed_delta < 0; - if (unlikely(bch2_force_reconstruct_read)) + if (static_branch_unlikely(&bch2_force_reconstruct_read)) return p1.do_ec_reconstruct > p2.do_ec_reconstruct; if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct)) @@ -134,14 +192,10 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, bool have_dirty_ptrs = false, have_pick = false; if (k.k->type == KEY_TYPE_error) - return -BCH_ERR_key_type_error; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return -BCH_ERR_extent_poisoned; + return bch_err_throw(c, key_type_error); rcu_read_lock(); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; u64 pick_latency; @@ -162,7 +216,15 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (dev >= 0 && p.ptr.dev != dev) continue; - struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); + + if (unlikely(!ca && p.ptr.dev != BCH_SB_MEMBER_INVALID)) { + rcu_read_unlock(); + int ret = bch2_dev_missing_bkey(c, k, p.ptr.dev); + if (ret) + return ret; + rcu_read_lock(); + } if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) continue; @@ -175,6 +237,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) { have_io_errors |= f->failed_io; + have_io_errors |= f->failed_btree_validate; have_io_errors |= f->failed_ec; } have_csum_errors |= !!f->failed_csum_nr; @@ -182,6 +245,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (p.has_ec && (f->failed_io || f->failed_csum_nr)) p.do_ec_reconstruct = true; else if (f->failed_io || + f->failed_btree_validate || f->failed_csum_nr > c->opts.checksum_err_retry_nr) continue; } @@ -194,7 +258,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, p.do_ec_reconstruct = true; } - if (bch2_force_reconstruct_read && p.has_ec) + if (static_branch_unlikely(&bch2_force_reconstruct_read) && p.has_ec) p.do_ec_reconstruct = true; u64 p_latency = dev_latency(ca); @@ -218,20 +282,20 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (have_pick) return 1; - if (!have_dirty_ptrs) + if (!have_dirty_ptrs && !bkey_is_btree_ptr(k.k)) return 0; - if (have_missing_devs) - return -BCH_ERR_no_device_to_read_from; + if (have_missing_devs || !have_dirty_ptrs) + return bch_err_throw(c, no_device_to_read_from); if (have_csum_errors) - return 
-BCH_ERR_data_read_csum_err; + return bch_err_throw(c, data_read_csum_err); if (have_io_errors) - return -BCH_ERR_data_read_io_err; + return bch_err_throw(c, data_read_io_err); /* * If we get here, we have pointers (bkey_ptrs_validate() ensures that), * but they don't point to valid devices: */ - return -BCH_ERR_no_devices_valid; + return bch_err_throw(c, no_devices_valid); } /* KEY_TYPE_btree_ptr: */ @@ -342,6 +406,8 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) lp.crc = bch2_extent_crc_unpack(l.k, NULL); rp.crc = bch2_extent_crc_unpack(r.k, NULL); + guard(rcu)(); + while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != @@ -353,10 +419,8 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) return false; /* Extents may not straddle buckets: */ - rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev); bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr); - rcu_read_unlock(); if (!same_bucket) return false; @@ -773,11 +837,9 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) struct extent_ptr_decoded p; unsigned durability = 0; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) durability += bch2_extent_ptr_durability(c, &p); - rcu_read_unlock(); - return durability; } @@ -788,12 +850,10 @@ static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) struct extent_ptr_decoded p; unsigned durability = 0; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev]) durability += bch2_extent_ptr_durability(c, &p); - rcu_read_unlock(); - return durability; } @@ -946,24 +1006,46 @@ const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned return NULL; } +bool bch2_bkey_devs_rw(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + guard(rcu)(); + bkey_for_each_ptr(ptrs, ptr) { + CLASS(bch2_dev_tryget, ca)(c, ptr->dev); + if (!ca || ca->mi.state != BCH_MEMBER_STATE_rw) + return false; + } + + return true; +} + bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bch_dev *ca; - bool ret = false; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr(ptrs, ptr) if (bch2_dev_in_target(c, ptr->dev, target) && (ca = bch2_dev_rcu(c, ptr->dev)) && (!ptr->cached || - !dev_ptr_stale_rcu(ca, ptr))) { - ret = true; - break; - } - rcu_read_unlock(); + !dev_ptr_stale_rcu(ca, ptr))) + return true; - return ret; + return false; +} + +bool bch2_bkey_in_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + guard(rcu)(); + bkey_for_each_ptr(ptrs, ptr) + if (!bch2_dev_in_target(c, ptr->dev, target)) + return false; + + return true; } bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, @@ -1071,33 +1153,48 @@ void bch2_extent_ptr_set_cached(struct bch_fs *c, struct bkey_s k, struct bch_extent_ptr *ptr) { - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + struct bkey_ptrs ptrs; union bch_extent_entry *entry; struct extent_ptr_decoded p; + bool have_cached_ptr; + unsigned drop_dev = ptr->dev; - rcu_read_lock(); - if (!want_cached_ptr(c, opts, ptr)) { - bch2_bkey_drop_ptr_noerror(k, ptr); - goto out; - } + guard(rcu)(); +restart_drop_ptrs: + ptrs = 
bch2_bkey_ptrs(k); + have_cached_ptr = false; - /* - * Stripes can't contain cached data, for - reasons. - * - * Possibly something we can fix in the future? - */ - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (&entry->ptr == ptr) { - if (p.has_ec) - bch2_bkey_drop_ptr_noerror(k, ptr); - else - ptr->cached = true; - goto out; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + /* + * Check if it's erasure coded - stripes can't contain cached + * data. Possibly something we can fix in the future? + */ + if (&entry->ptr == ptr && p.has_ec) + goto drop; + + if (p.ptr.cached) { + if (have_cached_ptr || !want_cached_ptr(c, opts, &p.ptr)) { + bch2_bkey_drop_ptr_noerror(k, &entry->ptr); + ptr = NULL; + goto restart_drop_ptrs; + } + + have_cached_ptr = true; } + } - BUG(); -out: - rcu_read_unlock(); + if (!ptr) + bkey_for_each_ptr(ptrs, ptr2) + if (ptr2->dev == drop_dev) + ptr = ptr2; + + if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) + goto drop; + + ptr->cached = true; + return; +drop: + bch2_bkey_drop_ptr_noerror(k, ptr); } /* @@ -1112,12 +1209,11 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) { struct bch_dev *ca; - rcu_read_lock(); + guard(rcu)(); bch2_bkey_drop_ptrs(k, ptr, ptr->cached && (!(ca = bch2_dev_rcu(c, ptr->dev)) || dev_ptr_stale_rcu(ca, ptr) > 0)); - rcu_read_unlock(); return bkey_deleted(k.k); } @@ -1135,7 +1231,7 @@ bool bch2_extent_normalize_by_opts(struct bch_fs *c, struct bkey_ptrs ptrs; bool have_cached_ptr; - rcu_read_lock(); + guard(rcu)(); restart_drop_ptrs: ptrs = bch2_bkey_ptrs(k); have_cached_ptr = false; @@ -1148,15 +1244,14 @@ bool bch2_extent_normalize_by_opts(struct bch_fs *c, } have_cached_ptr = true; } - rcu_read_unlock(); return bkey_deleted(k.k); } void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr) { - out->atomic++; - rcu_read_lock(); + guard(printbuf_atomic)(out); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); if (!ca) { prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, @@ -1180,8 +1275,6 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc else if (stale) prt_printf(out, " invalid"); } - rcu_read_unlock(); - --out->atomic; } void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc) @@ -1443,10 +1536,10 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, const struct bch_extent_rebalance *r = &entry->rebalance; if (!bch2_compression_opt_valid(r->compression)) { - struct bch_compression_opt opt = __bch2_compression_decode(r->compression); + union bch_compression_opt opt = { .value = r->compression }; prt_printf(err, "invalid compression opt %u:%u", opt.type, opt.level); - return -BCH_ERR_invalid_bkey; + return bch_err_throw(c, invalid_bkey); } #endif break; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 9fe153183b36..35ee03cd5065 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -392,10 +392,13 @@ out: \ /* utility code common to all keys with pointers: */ +void bch2_io_failures_to_text(struct printbuf *, struct bch_fs *, + struct bch_io_failures *); struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *, unsigned); void bch2_mark_io_failure(struct bch_io_failures *, struct extent_ptr_decoded *, bool); +void bch2_mark_btree_validate_failure(struct bch_io_failures *, unsigned); int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, struct bch_io_failures *, struct extent_ptr_decoded *, int); @@ -611,7 
+614,10 @@ static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsig return (void *) bch2_bkey_has_device_c(k.s_c, dev); } +bool bch2_bkey_devs_rw(struct bch_fs *, struct bkey_s_c); + bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); +bool bch2_bkey_in_target(struct bch_fs *, struct bkey_s_c, unsigned); void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h index e51529dca4c2..b23ce4a373c0 100644 --- a/fs/bcachefs/extents_types.h +++ b/fs/bcachefs/extents_types.h @@ -34,6 +34,7 @@ struct bch_io_failures { u8 dev; unsigned failed_csum_nr:6, failed_io:1, + failed_btree_validate:1, failed_ec:1; } devs[BCH_REPLICAS_MAX + 1]; }; diff --git a/fs/bcachefs/fast_list.c b/fs/bcachefs/fast_list.c new file mode 100644 index 000000000000..6be2a45be1dd --- /dev/null +++ b/fs/bcachefs/fast_list.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Fast, unordered lists + * + * Supports add, remove, and iterate + * + * Underneath, they're a radix tree and an IDA, with a percpu buffer for slot + * allocation and freeing. + * + * This means that adding, removing, and iterating over items is lockless, + * except when refilling/emptying the percpu slot buffers. + */ + +#include "fast_list.h" + +struct fast_list_pcpu { + u32 nr; + u32 entries[31]; +}; + +static int fast_list_alloc_idx(struct fast_list *l, gfp_t gfp) +{ + int idx = ida_alloc_range(&l->slots_allocated, 1, INT_MAX, gfp); + if (unlikely(idx < 0)) + return 0; + + if (unlikely(!genradix_ptr_alloc_inlined(&l->items, idx, gfp))) { + ida_free(&l->slots_allocated, idx); + return 0; + } + + return idx; +} + +/** + * fast_list_get_idx - get a slot in a fast_list + * @l: list to get slot in + * + * This allocates a slot in the radix tree without storing to it, so that we can + * take the potential memory allocation failure early and do the list add later + * when we can't take an allocation failure. + * + * Returns: positive integer on success, -ENOMEM on failure + */ +int fast_list_get_idx(struct fast_list *l) +{ + unsigned long flags; + int idx; +retry: + local_irq_save(flags); + struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer); + + if (unlikely(!lp->nr)) { + u32 entries[16], nr = 0; + + local_irq_restore(flags); + while (nr < ARRAY_SIZE(entries) && + (idx = fast_list_alloc_idx(l, GFP_KERNEL))) + entries[nr++] = idx; + local_irq_save(flags); + + lp = this_cpu_ptr(l->buffer); + + while (nr && lp->nr < ARRAY_SIZE(lp->entries)) + lp->entries[lp->nr++] = entries[--nr]; + + if (unlikely(nr)) { + local_irq_restore(flags); + while (nr) + ida_free(&l->slots_allocated, entries[--nr]); + goto retry; + } + + if (unlikely(!lp->nr)) { + local_irq_restore(flags); + return -ENOMEM; + } + } + + idx = lp->entries[--lp->nr]; + local_irq_restore(flags); + + return idx; +} + +/** + * fast_list_add - add an item to a fast_list + * @l: list + * @item: item to add + * + * Allocates a slot in the radix tree and stores to it and then returns the + * slot index, which must be passed to fast_list_remove(). 
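+ *
+ * A minimal usage sketch (struct foo, obj and l are illustrative names,
+ * not part of this interface; error handling elided):
+ *
+ *	struct fast_list l;
+ *	struct foo *obj;
+ *
+ *	fast_list_init(&l);
+ *	int idx = fast_list_add(&l, obj);
+ *	if (idx < 0)
+ *		return idx;
+ *	...
+ *	fast_list_remove(&l, idx);
+ *	fast_list_exit(&l);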
+ *
+ * Returns: positive integer on success, -ENOMEM on failure
+ */
+int fast_list_add(struct fast_list *l, void *item)
+{
+	int idx = fast_list_get_idx(l);
+	if (idx < 0)
+		return idx;
+
+	*genradix_ptr_inlined(&l->items, idx) = item;
+	return idx;
+}
+
+/**
+ * fast_list_remove - remove an item from a fast_list
+ * @l: list
+ * @idx: item's slot index
+ *
+ * Zeroes out the slot in the radix tree and frees the slot for future
+ * fast_list_add() operations.
+ */
+void fast_list_remove(struct fast_list *l, unsigned idx)
+{
+	u32 entries[16], nr = 0;
+
+	if (!idx)
+		return;
+
+	*genradix_ptr_inlined(&l->items, idx) = NULL;
+
+	scoped_guard(irqsave) {
+		struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer);
+
+		if (unlikely(lp->nr == ARRAY_SIZE(lp->entries)))
+			while (nr < ARRAY_SIZE(entries))
+				entries[nr++] = lp->entries[--lp->nr];
+
+		lp->entries[lp->nr++] = idx;
+	}
+
+	if (unlikely(nr))
+		while (nr)
+			ida_free(&l->slots_allocated, entries[--nr]);
+}
+
+void fast_list_exit(struct fast_list *l)
+{
+	if (l->buffer) {
+		int cpu;
+		for_each_possible_cpu(cpu) {
+			struct fast_list_pcpu *lp = per_cpu_ptr(l->buffer, cpu);
+
+			while (lp->nr)
+				ida_free(&l->slots_allocated, lp->entries[--lp->nr]);
+		}
+
+		free_percpu(l->buffer);
+	}
+
+	WARN(ida_find_first(&l->slots_allocated) >= 0,
+	     "fast_list still has objects on exit\n");
+
+	ida_destroy(&l->slots_allocated);
+	genradix_free(&l->items);
+}
+
+int fast_list_init(struct fast_list *l)
+{
+	genradix_init(&l->items);
+	ida_init(&l->slots_allocated);
+	l->buffer = alloc_percpu(*l->buffer);
+	if (!l->buffer)
+		return -ENOMEM;
+	return 0;
+}
diff --git a/fs/bcachefs/fast_list.h b/fs/bcachefs/fast_list.h
new file mode 100644
index 000000000000..f67df3f72ee2
--- /dev/null
+++ b/fs/bcachefs/fast_list.h
@@ -0,0 +1,41 @@
+#ifndef _LINUX_FAST_LIST_H
+#define _LINUX_FAST_LIST_H
+
+#include <linux/generic-radix-tree.h>
+#include <linux/idr.h>
+#include <linux/percpu.h>
+
+struct fast_list_pcpu;
+
+struct fast_list {
+	GENRADIX(void *)	items;
+	struct ida		slots_allocated;
+	struct fast_list_pcpu __percpu
+				*buffer;
+};
+
+static inline void *fast_list_iter_peek(struct genradix_iter *iter,
+					struct fast_list *list)
+{
+	void **p;
+	while ((p = genradix_iter_peek(iter, &list->items)) && !*p)
+		genradix_iter_advance(iter, &list->items);
+
+	return p ?
*p : NULL; +} + +#define fast_list_for_each_from(_list, _iter, _i, _start) \ + for (_iter = genradix_iter_init(&(_list)->items, _start); \ + (_i = fast_list_iter_peek(&(_iter), _list)) != NULL; \ + genradix_iter_advance(&(_iter), &(_list)->items)) + +#define fast_list_for_each(_list, _iter, _i) \ + fast_list_for_each_from(_list, _iter, _i, 0) + +int fast_list_get_idx(struct fast_list *l); +int fast_list_add(struct fast_list *l, void *item); +void fast_list_remove(struct fast_list *l, unsigned idx); +void fast_list_exit(struct fast_list *l); +int fast_list_init(struct fast_list *l); + +#endif /* _LINUX_FAST_LIST_H */ diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index e3a75dcca60c..0005569ecace 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -145,7 +145,7 @@ static int readpage_bio_extend(struct btree_trans *trans, BUG_ON(folio_sector(folio) != bio_end_sector(bio)); - BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); + bio_add_folio_nofail(bio, folio, folio_size(folio), 0); } return bch2_trans_relock(trans); @@ -157,7 +157,6 @@ static void bchfs_read(struct btree_trans *trans, struct readpages_iter *readpages_iter) { struct bch_fs *c = trans->c; - struct btree_iter iter; struct bkey_buf sk; int flags = BCH_READ_retry_if_stale| BCH_READ_may_promote; @@ -167,7 +166,7 @@ static void bchfs_read(struct btree_trans *trans, bch2_bkey_buf_init(&sk); bch2_trans_begin(trans); - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + CLASS(btree_iter, iter)(trans, BTREE_ID_extents, POS(inum.inum, rbio->bio.bi_iter.bi_sector), BTREE_ITER_slots); while (1) { @@ -183,12 +182,12 @@ static void bchfs_read(struct btree_trans *trans, if (ret) goto err; - bch2_btree_iter_set_snapshot(trans, &iter, snapshot); + bch2_btree_iter_set_snapshot(&iter, snapshot); - bch2_btree_iter_set_pos(trans, &iter, + bch2_btree_iter_set_pos(&iter, POS(inum.inum, rbio->bio.bi_iter.bi_sector)); - k = bch2_btree_iter_peek_slot(trans, &iter); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; @@ -251,15 +250,13 @@ static void bchfs_read(struct btree_trans *trans, !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; } - bch2_trans_iter_exit(trans, &iter); if (ret) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9)); - prt_printf(&buf, "read error %i from btree lookup", ret); + prt_printf(&buf, "read error %s from btree lookup", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); rbio->bio.bi_status = BLK_STS_IOERR; bio_endio(&rbio->bio); @@ -311,7 +308,7 @@ void bch2_readahead(struct readahead_control *ractl) readpage_iter_advance(&readpages_iter); rbio->bio.bi_iter.bi_sector = folio_sector(folio); - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + bio_add_folio_nofail(&rbio->bio, folio, folio_size(folio), 0); bchfs_read(trans, rbio, inode_inum(inode), &readpages_iter); @@ -354,7 +351,7 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) rbio->bio.bi_private = &done; rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; rbio->bio.bi_iter.bi_sector = folio_sector(folio); - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + bio_add_folio_nofail(&rbio->bio, folio, folio_size(folio), 0); blk_start_plug(&plug); bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0)); @@ -394,17 +391,9 @@ struct bch_writepage_state { struct bch_io_opts opts; 
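	/* scratch copy of the folio's per-sector reservation state: */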
struct bch_folio_sector *tmp; unsigned tmp_sectors; + struct blk_plug plug; }; -static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, - struct bch_inode_info *inode) -{ - struct bch_writepage_state ret = { 0 }; - - bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); - return ret; -} - /* * Determine when a writepage io is full. We have to limit writepage bios to a * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to @@ -433,27 +422,23 @@ static void bch2_writepage_io_done(struct bch_write_op *op) set_bit(EI_INODE_ERROR, &io->inode->ei_flags); bio_for_each_folio_all(fi, bio) { - struct bch_folio *s; - mapping_set_error(fi.folio->mapping, -EIO); - s = __bch2_folio(fi.folio); - spin_lock(&s->lock); + struct bch_folio *s = __bch2_folio(fi.folio); + guard(spinlock)(&s->lock); + for (i = 0; i < folio_sectors(fi.folio); i++) s->s[i].nr_replicas = 0; - spin_unlock(&s->lock); } } if (io->op.flags & BCH_WRITE_wrote_data_inline) { bio_for_each_folio_all(fi, bio) { - struct bch_folio *s; + struct bch_folio *s = __bch2_folio(fi.folio); + guard(spinlock)(&s->lock); - s = __bch2_folio(fi.folio); - spin_lock(&s->lock); for (i = 0; i < folio_sectors(fi.folio); i++) s->s[i].nr_replicas = 0; - spin_unlock(&s->lock); } } @@ -579,30 +564,30 @@ static int __bch2_writepage(struct folio *folio, BUG_ON(ret); /* Before unlocking the page, get copy of reservations: */ - spin_lock(&s->lock); - memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); + scoped_guard(spinlock, &s->lock) { + memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); - for (i = 0; i < f_sectors; i++) { - if (s->s[i].state < SECTOR_dirty) - continue; + for (i = 0; i < f_sectors; i++) { + if (s->s[i].state < SECTOR_dirty) + continue; - nr_replicas_this_write = - min_t(unsigned, nr_replicas_this_write, - s->s[i].nr_replicas + - s->s[i].replicas_reserved); - } + nr_replicas_this_write = + min_t(unsigned, nr_replicas_this_write, + s->s[i].nr_replicas + + s->s[i].replicas_reserved); + } - for (i = 0; i < f_sectors; i++) { - if (s->s[i].state < SECTOR_dirty) - continue; + for (i = 0; i < f_sectors; i++) { + if (s->s[i].state < SECTOR_dirty) + continue; - s->s[i].nr_replicas = w->opts.compression - ? 0 : nr_replicas_this_write; + s->s[i].nr_replicas = w->opts.compression + ? 
0 : nr_replicas_this_write; - s->s[i].replicas_reserved = 0; - bch2_folio_sector_set(folio, s, i, SECTOR_allocated); + s->s[i].replicas_reserved = 0; + bch2_folio_sector_set(folio, s, i, SECTOR_allocated); + } } - spin_unlock(&s->lock); BUG_ON(atomic_read(&s->write_count)); atomic_set(&s->write_count, 1); @@ -647,8 +632,8 @@ static int __bch2_writepage(struct folio *folio, atomic_inc(&s->write_count); BUG_ON(inode != w->io->inode); - BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, - sectors << 9, offset << 9)); + bio_add_folio_nofail(&w->io->op.wbio.bio, folio, + sectors << 9, offset << 9); w->io->op.res.sectors += reserved_sectors; w->io->op.i_sectors_delta -= dirty_sectors; @@ -666,17 +651,17 @@ static int __bch2_writepage(struct folio *folio, int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct bch_fs *c = mapping->host->i_sb->s_fs_info; - struct bch_writepage_state w = - bch_writepage_state_init(c, to_bch_ei(mapping->host)); - struct blk_plug plug; - int ret; + struct bch_writepage_state *w = kzalloc(sizeof(*w), GFP_NOFS|__GFP_NOFAIL); - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); - if (w.io) - bch2_writepage_do_io(&w); - blk_finish_plug(&plug); - kfree(w.tmp); + bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode); + + blk_start_plug(&w->plug); + int ret = write_cache_pages(mapping, wbc, __bch2_writepage, w); + if (w->io) + bch2_writepage_do_io(w); + blk_finish_plug(&w->plug); + kfree(w->tmp); + kfree(w); return bch2_err_class(ret); } @@ -788,10 +773,9 @@ int bch2_write_end(struct file *file, struct address_space *mapping, copied = 0; } - spin_lock(&inode->v.i_lock); - if (pos + copied > inode->v.i_size) - i_size_write(&inode->v, pos + copied); - spin_unlock(&inode->v.i_lock); + scoped_guard(spinlock, &inode->v.i_lock) + if (pos + copied > inode->v.i_size) + i_size_write(&inode->v, pos + copied); if (copied) { if (!folio_test_uptodate(folio)) @@ -950,10 +934,9 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, end = pos + copied; - spin_lock(&inode->v.i_lock); - if (end > inode->v.i_size) - i_size_write(&inode->v, end); - spin_unlock(&inode->v.i_lock); + scoped_guard(spinlock, &inode->v.i_lock) + if (end > inode->v.i_size) + i_size_write(&inode->v, end); f_pos = pos; f_offset = pos - folio_pos(darray_first(fs)); diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 535bc5fcbcc0..79823234160f 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "enumerated_ref.h" #include "fs.h" #include "fs-io.h" #include "fs-io-direct.h" @@ -126,7 +127,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) * the dirtying of requests that are internal from the kernel (i.e. from * loopback), because we'll deadlock on page_lock. 
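 * (user_backed_iter() is the right test here: it covers both ITER_IOVEC and
 * ITER_UBUF iters backed by userspace memory, while kernel-internal iters
 * are excluded.)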
 
 	blk_start_plug(&plug);
 
@@ -251,12 +252,10 @@ static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
 				       u64 offset, u64 size,
 				       unsigned nr_replicas, bool compressed)
 {
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct btree_iter iter;
+	CLASS(btree_trans, trans)(c);
 	struct bkey_s_c k;
 	u64 end = offset + size;
 	u32 snapshot;
-	bool ret = true;
 	int err;
 retry:
 	bch2_trans_begin(trans);
@@ -268,25 +267,21 @@ static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
 	for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
 				     SPOS(inum.inum, offset, snapshot),
 				     BTREE_ITER_slots, k, err) {
+		offset = iter.pos.offset;
+
 		if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
 			break;
 
 		if (k.k->p.snapshot != snapshot ||
 		    nr_replicas > bch2_bkey_replicas(c, k) ||
-		    (!compressed && bch2_bkey_sectors_compressed(k))) {
-			ret = false;
-			break;
-		}
+		    (!compressed && bch2_bkey_sectors_compressed(k)))
+			return false;
 	}
-
-	offset = iter.pos.offset;
-	bch2_trans_iter_exit(trans, &iter);
err:
 	if (bch2_err_matches(err, BCH_ERR_transaction_restart))
 		goto retry;
 
-	bch2_trans_put(trans);
-	return err ? false : ret;
+	return !err;
 }
 
 static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
@@ -401,7 +396,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio)
 	ret = dio->op.error ?: ((long) dio->written << 9);
 	bio_put(&dio->op.wbio.bio);
 
-	bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
+	enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write);
 
 	/* inode->i_dio_count is our ref on inode and thus bch_fs */
 	inode_dio_end(&inode->v);
@@ -427,17 +422,15 @@ static __always_inline void bch2_dio_write_end(struct dio_write *dio)
 	dio->written += dio->op.written;
 
 	if (dio->extending) {
-		spin_lock(&inode->v.i_lock);
+		guard(spinlock)(&inode->v.i_lock);
 		if (req->ki_pos > inode->v.i_size)
 			i_size_write(&inode->v, req->ki_pos);
-		spin_unlock(&inode->v.i_lock);
 	}
 
 	if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
-		mutex_lock(&inode->ei_quota_lock);
+		guard(mutex)(&inode->ei_quota_lock);
 		__bch2_i_sectors_acct(c, inode, &dio->quota_res,
 				      dio->op.i_sectors_delta);
 		__bch2_quota_reservation_put(c, inode, &dio->quota_res);
-		mutex_unlock(&inode->ei_quota_lock);
 	}
 
 	bio_release_pages(bio, false);
@@ -606,7 +599,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
 	prefetch(&inode->ei_inode);
 	prefetch((void *) &inode->ei_inode + 64);
 
-	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write))
+	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_dio_write))
 		return -EROFS;
 
 	inode_lock(&inode->v);
@@ -675,7 +668,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
 	bio_put(bio);
 	inode_dio_end(&inode->v);
err_put_write_ref:
-	bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
+	enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write);
 	goto out;
 }
 
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index fbae9c1de746..469492f6264a 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -125,11 +125,9 @@ folio_sector_reserve(enum bch_folio_sector_state state)
 /* for newly allocated folios: */
 struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
 {
-	struct bch_folio *s;
-
-	s = kzalloc(sizeof(*s) +
-		    sizeof(struct bch_folio_sector) *
-		    folio_sectors(folio), gfp);
+	struct bch_folio *s = kzalloc(sizeof(*s) +
+				      sizeof(struct bch_folio_sector) *
+				      folio_sectors(folio), gfp);
 	if (!s)
 		return NULL;
 
@@ -162,7 +160,7 @@ static void __bch2_folio_set(struct folio *folio,
 	BUG_ON(pg_offset >= sectors);
 	BUG_ON(pg_offset + pg_len > sectors);
 
-	spin_lock(&s->lock);
+	guard(spinlock)(&s->lock);
 
 	for (i = pg_offset; i < pg_offset + pg_len; i++) {
 		s->s[i].nr_replicas	= nr_ptrs;
@@ -171,8 +169,6 @@ static void __bch2_folio_set(struct folio *folio,
 
 	if (i == sectors)
 		s->uptodate = true;
-
-	spin_unlock(&s->lock);
 }
 
 /*
@@ -276,10 +272,9 @@ void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
 			s = bch2_folio(folio);
 			if (s) {
-				spin_lock(&s->lock);
+				guard(spinlock)(&s->lock);
 				for (j = folio_offset; j < folio_offset + folio_len; j++)
 					s->s[j].nr_replicas = 0;
-				spin_unlock(&s->lock);
 			}
 
 			folio_unlock(folio);
@@ -330,13 +325,12 @@ int bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
 			unsigned folio_offset = max(*start, folio_start) - folio_start;
 			unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
 
-			spin_lock(&s->lock);
+			guard(spinlock)(&s->lock);
 			for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) {
 				i_sectors_delta -= s->s[j].state == SECTOR_dirty;
 				bch2_folio_sector_set(folio, s, j,
 					folio_sector_reserve(s->s[j].state));
 			}
-			spin_unlock(&s->lock);
 		}
 
 		folio_unlock(folio);
@@ -447,7 +441,7 @@ static int __bch2_folio_reservation_get(struct bch_fs *c,
 			if (!reserved) {
 				bch2_disk_reservation_put(c, &disk_res);
-				return -BCH_ERR_ENOSPC_disk_reservation;
+				return bch_err_throw(c, ENOSPC_disk_reservation);
 			}
 			break;
 		}
@@ -529,29 +523,26 @@ void bch2_set_folio_dirty(struct bch_fs *c,
 
 	BUG_ON(!s->uptodate);
 
-	spin_lock(&s->lock);
+	scoped_guard(spinlock, &s->lock)
+		for (i = round_down(offset, block_bytes(c)) >> 9;
+		     i < round_up(offset + len, block_bytes(c)) >> 9;
+		     i++) {
+			unsigned sectors = sectors_to_reserve(&s->s[i],
+						res->disk.nr_replicas);
-	for (i = round_down(offset, block_bytes(c)) >> 9;
-	     i < round_up(offset + len, block_bytes(c)) >> 9;
-	     i++) {
-		unsigned sectors = sectors_to_reserve(&s->s[i],
-						res->disk.nr_replicas);
+			/*
+			 * This can happen if we race with the error path in
+			 * bch2_writepage_io_done():
+			 */
+			sectors = min_t(unsigned, sectors, res->disk.sectors);
-		/*
-		 * This can happen if we race with the error path in
-		 * bch2_writepage_io_done():
-		 */
-		sectors = min_t(unsigned, sectors, res->disk.sectors);
+			s->s[i].replicas_reserved += sectors;
+			res->disk.sectors -= sectors;
-		s->s[i].replicas_reserved += sectors;
-		res->disk.sectors -= sectors;
+			dirty_sectors += s->s[i].state == SECTOR_unallocated;
-		dirty_sectors += s->s[i].state == SECTOR_unallocated;
-
-		bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
-	}
-
-	spin_unlock(&s->lock);
+			bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
+		}
 
 	bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors);
 
@@ -644,6 +635,8 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
 		goto out;
 	}
 
+	inode->ei_last_dirtied = (unsigned long) current;
+
 	bch2_set_folio_dirty(c, inode, folio, &res, offset, len);
 	bch2_folio_reservation_put(c, inode, &res);
 
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 9657144666b8..de0d965f3fde 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -7,6 +7,7 @@
 #include "btree_update.h"
 #include "buckets.h"
 #include "clock.h"
+#include "enumerated_ref.h"
 #include "error.h"
 #include "extents.h"
 #include "extent_update.h"
@@ -48,7 +49,8 @@ static void nocow_flush_endio(struct bio *_bio)
 	struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
 
 	closure_put(bio->cl);
-	percpu_ref_put(&bio->ca->io_ref[WRITE]);
+	enumerated_ref_put(&bio->ca->io_ref[WRITE],
+			   BCH_DEV_WRITE_REF_nocow_flush);
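+	/* io_ref is now an enumerated_ref: every get/put is tagged with an
+	 * enum constant, so an outstanding reference can be attributed to the
+	 * code path that took it when debugging leaks */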
 	bio_put(&bio->bio);
 }
 
@@ -69,11 +71,12 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
 	memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
 
 	for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
-		rcu_read_lock();
-		ca = rcu_dereference(c->devs[dev]);
-		if (ca && !percpu_ref_tryget(&ca->io_ref[WRITE]))
-			ca = NULL;
-		rcu_read_unlock();
+		scoped_guard(rcu) {
+			ca = rcu_dereference(c->devs[dev]);
+			if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE],
+							 BCH_DEV_WRITE_REF_nocow_flush))
+				ca = NULL;
+		}
 
 		if (!ca)
 			continue;
@@ -145,17 +148,15 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
 			   struct quota_res *quota_res, s64 sectors)
 {
 	if (unlikely((s64) inode->v.i_blocks + sectors < 0)) {
-		struct printbuf buf = PRINTBUF;
+		CLASS(printbuf, buf)();
 		bch2_log_msg_start(c, &buf);
 		prt_printf(&buf, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
 			   inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
 			   inode->ei_inode.bi_sectors);
 
-		bool repeat = false, print = false, suppress = false;
-		bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, buf.buf, &repeat, &print, &suppress);
+		bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf);
 		if (print)
-			bch2_print_str(c, buf.buf);
-		printbuf_exit(&buf);
+			bch2_print_str(c, KERN_ERR, buf.buf);
 
 		if (sectors < 0)
 			sectors = -inode->v.i_blocks;
@@ -185,7 +186,6 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
 static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum,
 					    u64 *seq)
 {
-	struct printbuf buf = PRINTBUF;
 	struct bch_inode_unpacked u;
 	struct btree_iter iter;
 	int ret = bch2_inode_peek(trans, &iter, &u, inum, 0);
@@ -195,6 +195,7 @@ static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum,
 	u64 cur_seq = journal_cur_seq(&trans->c->journal);
 	*seq = min(cur_seq, u.bi_journal_seq);
 
+	CLASS(printbuf, buf)();
 	if (fsck_err_on(u.bi_journal_seq > cur_seq,
 			trans, inode_journal_seq_in_future,
 			"inode journal seq in future (currently at %llu)\n%s",
@@ -205,8 +206,7 @@ static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum,
 		ret = bch2_inode_write(trans, &iter, &u);
 	}
fsck_err:
-	bch2_trans_iter_exit(trans, &iter);
-	printbuf_exit(&buf);
+	bch2_trans_iter_exit(&iter);
 	return ret;
 }
 
@@ -220,15 +220,15 @@ static int bch2_flush_inode(struct bch_fs *c,
 	if (c->opts.journal_flush_disabled)
 		return 0;
 
-	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync))
+	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fsync))
 		return -EROFS;
 
 	u64 seq;
 	int ret = bch2_trans_commit_do(c, NULL, NULL, 0,
-			bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?:
+				bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?:
 		  bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?:
 		  bch2_inode_flush_nocow_writes(c, inode);
-	bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
+	enumerated_ref_put(&c->writes, BCH_WRITE_REF_fsync);
 	return ret;
 }
 
@@ -265,11 +265,11 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol,
 				 struct bpos start,
 				 struct bpos end)
 {
-	return bch2_trans_run(c,
-		for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end,
+	CLASS(btree_trans, trans)(c);
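+	/* CLASS(btree_trans, trans)(c) declares a transaction with automatic
+	 * cleanup: bch2_trans_put() runs at scope exit (include/linux/cleanup.h),
+	 * replacing explicit bch2_trans_get()/bch2_trans_put() pairs */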
+	return for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end,
 			subvol, 0, k, ({
 		bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k);
-	})));
+	}));
 }
 
 static int __bch2_truncate_folio(struct bch_inode_info *inode,
@@ -519,19 +519,16 @@ int bchfs_truncate(struct mnt_idmap *idmap,
 
 	if (unlikely(!inode->v.i_size && inode->v.i_blocks &&
 		     !bch2_journal_error(&c->journal))) {
-		struct printbuf buf = PRINTBUF;
+		CLASS(printbuf, buf)();
 		bch2_log_msg_start(c, &buf);
 		prt_printf(&buf,
 			   "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
 			   inode->v.i_ino, (u64) inode->v.i_blocks,
 			   inode->ei_inode.bi_sectors);
 
-		bool repeat = false, print = false, suppress = false;
-		bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, buf.buf,
-				    &repeat, &print, &suppress);
+		bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf);
 		if (print)
-			bch2_print_str(c, buf.buf);
-		printbuf_exit(&buf);
+			bch2_print_str(c, KERN_ERR, buf.buf);
 	}
 
 	ret = bch2_setattr_nonsize(idmap, inode, iattr);
@@ -559,11 +556,10 @@ static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
 	u64 block_start	= round_up(offset, block_bytes(c));
 	u64 block_end	= round_down(end, block_bytes(c));
 	bool truncated_last_page;
-	int ret = 0;
 
-	ret = bch2_truncate_folios(inode, offset, end);
+	int ret = bch2_truncate_folios(inode, offset, end);
 	if (unlikely(ret < 0))
-		goto err;
+		return ret;
 
 	truncated_last_page = ret;
 
@@ -576,19 +572,18 @@ static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
 				     block_start >> 9, block_end >> 9,
 				     &i_sectors_delta);
 		bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+
+		if (ret)
+			return ret;
 	}
 
-	mutex_lock(&inode->ei_update_lock);
-	if (end >= inode->v.i_size && !truncated_last_page) {
-		ret = bch2_write_inode_size(c, inode, inode->v.i_size,
-					    ATTR_MTIME|ATTR_CTIME);
-	} else {
-		ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+	guard(mutex)(&inode->ei_update_lock);
+	if (end >= inode->v.i_size && !truncated_last_page)
+		return bch2_write_inode_size(c, inode, inode->v.i_size,
+					     ATTR_MTIME|ATTR_CTIME);
+	else
+		return bch2_write_inode(c, inode, inode_update_times_fn, NULL,
 				       ATTR_MTIME|ATTR_CTIME);
-	}
-	mutex_unlock(&inode->ei_update_lock);
-err:
-	return ret;
 }
 
 static noinline long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
@@ -631,15 +626,14 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
 			    u64 start_sector, u64 end_sector)
 {
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct btree_iter iter;
 	struct bpos end_pos = POS(inode->v.i_ino, end_sector);
 	struct bch_io_opts opts;
 	int ret = 0;
 
 	bch2_inode_opts_get(&opts, c, &inode->ei_inode);
 
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+	CLASS(btree_trans, trans)(c);
+	CLASS(btree_iter, iter)(trans, BTREE_ID_extents,
 			POS(inode->v.i_ino, start_sector),
 			BTREE_ITER_slots|BTREE_ITER_intent);
 
@@ -662,9 +656,9 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
 		if (ret)
 			goto bkey_err;
 
-		bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
+		bch2_btree_iter_set_snapshot(&iter, snapshot);
 
-		k = bch2_btree_iter_peek_slot(trans, &iter);
+		k = bch2_btree_iter_peek_slot(&iter);
 		if ((ret = bkey_err(k)))
 			goto bkey_err;
 
@@ -675,13 +669,13 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
 		/* already reserved */
 		if (bkey_extent_is_reservation(k) &&
 		    bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
-			bch2_btree_iter_advance(trans, &iter);
+			bch2_btree_iter_advance(&iter);
 			continue;
 		}
 
 		if (bkey_extent_is_data(k.k) &&
 		    !(mode & FALLOC_FL_ZERO_RANGE)) {
-			bch2_btree_iter_advance(trans, &iter);
+			bch2_btree_iter_advance(&iter);
 			continue;
 		}
 
@@ -702,7 +696,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
 			if (ret)
 				goto bkey_err;
 		}
-		bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, hole_start));
+		bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));
 
 		if (ret)
 			goto bkey_err;
@@ -752,8 +746,6 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
 		bch2_quota_reservation_put(c, inode, &quota_res);
 	}
 
-	bch2_trans_iter_exit(trans, &iter);
-	bch2_trans_put(trans);
 	return ret;
 }
 
@@ -802,13 +794,11 @@ static noinline long bchfs_fallocate(struct bch_inode_info *inode, int mode,
 	if (end >= inode->v.i_size &&
 	    (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
 	     !(mode & FALLOC_FL_KEEP_SIZE))) {
-		spin_lock(&inode->v.i_lock);
-		i_size_write(&inode->v, end);
-		spin_unlock(&inode->v.i_lock);
+		scoped_guard(spinlock, &inode->v.i_lock)
+			i_size_write(&inode->v, end);
 
-		mutex_lock(&inode->ei_update_lock);
-		ret2 = bch2_write_inode_size(c, inode, end, 0);
-		mutex_unlock(&inode->ei_update_lock);
+		scoped_guard(mutex, &inode->ei_update_lock)
+			ret2 = bch2_write_inode_size(c, inode, end, 0);
 	}
 
 	return ret ?: ret2;
@@ -821,7 +811,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	long ret;
 
-	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate))
+	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fallocate))
 		return -EROFS;
 
 	inode_lock(&inode->v);
@@ -841,11 +831,11 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
 	else if (mode == FALLOC_FL_COLLAPSE_RANGE)
 		ret = bchfs_fcollapse_finsert(inode, offset, len, false);
 	else
-		ret = -EOPNOTSUPP;
+		ret = bch_err_throw(c, unsupported_fallocate_mode);
err:
 	bch2_pagecache_block_put(inode);
 	inode_unlock(&inode->v);
-	bch2_write_ref_put(c, BCH_WRITE_REF_fallocate);
+	enumerated_ref_put(&c->writes, BCH_WRITE_REF_fallocate);
 
 	return bch2_err_class(ret);
 }
@@ -861,8 +851,8 @@ static int quota_reserve_range(struct bch_inode_info *inode,
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	u64 sectors = end - start;
 
-	int ret = bch2_trans_run(c,
-		for_each_btree_key_in_subvolume_max(trans, iter,
+	CLASS(btree_trans, trans)(c);
+	int ret = for_each_btree_key_in_subvolume_max(trans, iter,
 				BTREE_ID_extents,
 				POS(inode->v.i_ino, start),
 				POS(inode->v.i_ino, end - 1),
@@ -875,7 +865,7 @@ static int quota_reserve_range(struct bch_inode_info *inode,
 			}
 			0;
-		})));
+		}));
 
 	return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
 }
@@ -955,10 +945,9 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
 
 	bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
 
-	spin_lock(&dst->v.i_lock);
-	if (pos_dst + ret > dst->v.i_size)
-		i_size_write(&dst->v, pos_dst + ret);
-	spin_unlock(&dst->v.i_lock);
+	scoped_guard(spinlock, &dst->v.i_lock)
+		if (pos_dst + ret > dst->v.i_size)
+			i_size_write(&dst->v, pos_dst + ret);
 
 	if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
 	    IS_SYNC(file_inode(file_dst)))
@@ -1020,38 +1009,38 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
 	if (offset >= isize)
 		return -ENXIO;
 
-	int ret = bch2_trans_run(c,
-		for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
+	CLASS(btree_trans, trans)(c);
+	int ret = for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
 				   POS(inode->v.i_ino, offset >> 9),
 				   POS(inode->v.i_ino, U64_MAX),
 				   inum.subvol, BTREE_ITER_slots, k, ({
-			if (k.k->p.inode != inode->v.i_ino ||
-			    !bkey_extent_is_data(k.k)) {
-				loff_t start_offset = k.k->p.inode == inode->v.i_ino
-					? max(offset, bkey_start_offset(k.k) << 9)
-					: offset;
-				loff_t end_offset = k.k->p.inode == inode->v.i_ino
-					? MAX_LFS_FILESIZE
-					: k.k->p.offset << 9;
-
-				/*
-				 * Found a hole in the btree, now make sure it's
-				 * a hole in the pagecache. We might have to
-				 * keep searching if this hole is entirely dirty
-				 * in the page cache:
-				 */
-				bch2_trans_unlock(trans);
-				loff_t pagecache_hole = bch2_seek_pagecache_hole(&inode->v,
-								start_offset, end_offset, 0, false);
-				if (pagecache_hole < end_offset) {
-					next_hole = pagecache_hole;
-					break;
-				}
-			} else {
-				offset = max(offset, bkey_start_offset(k.k) << 9);
+		if (k.k->p.inode != inode->v.i_ino ||
+		    !bkey_extent_is_data(k.k)) {
+			loff_t start_offset = k.k->p.inode == inode->v.i_ino
+				? max(offset, bkey_start_offset(k.k) << 9)
+				: offset;
+			loff_t end_offset = k.k->p.inode == inode->v.i_ino
+				? MAX_LFS_FILESIZE
+				: k.k->p.offset << 9;
+
+			/*
+			 * Found a hole in the btree, now make sure it's
+			 * a hole in the pagecache. We might have to
+			 * keep searching if this hole is entirely dirty
+			 * in the page cache:
+			 */
+			bch2_trans_unlock(trans);
+			loff_t pagecache_hole = bch2_seek_pagecache_hole(&inode->v,
+							start_offset, end_offset, 0, false);
+			if (pagecache_hole < end_offset) {
+				next_hole = pagecache_hole;
+				break;
 			}
-			0;
-		})));
+		} else {
+			offset = max(offset, bkey_start_offset(k.k) << 9);
+		}
+		0;
+	}));
 
 	if (ret)
 		return ret;
 
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
index ca70346e68dc..d229f7225da1 100644
--- a/fs/bcachefs/fs-io.h
+++ b/fs/bcachefs/fs-io.h
@@ -77,9 +77,8 @@ static inline void bch2_quota_reservation_put(struct bch_fs *c,
 					      struct quota_res *res)
 {
 	if (res->sectors) {
-		mutex_lock(&inode->ei_quota_lock);
+		guard(mutex)(&inode->ei_quota_lock);
 		__bch2_quota_reservation_put(c, inode, res);
-		mutex_unlock(&inode->ei_quota_lock);
 	}
 }
 
@@ -94,16 +93,15 @@ static inline int bch2_quota_reservation_add(struct bch_fs *c,
 	if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
 		return 0;
 
-	mutex_lock(&inode->ei_quota_lock);
+	guard(mutex)(&inode->ei_quota_lock);
 	ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
 			      check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
-	if (likely(!ret)) {
-		inode->ei_quota_reserved += sectors;
-		res->sectors += sectors;
-	}
-	mutex_unlock(&inode->ei_quota_lock);
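+	/* the early return below needs no unlock: guard(mutex) drops
+	 * ei_quota_lock automatically on every path out of this function */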
+	if (ret)
+		return ret;
 
-	return ret;
+	inode->ei_quota_reserved += sectors;
+	res->sectors += sectors;
+	return 0;
 }
 
 #else
@@ -134,9 +132,8 @@ static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
 				       struct quota_res *quota_res, s64 sectors)
 {
 	if (sectors) {
-		mutex_lock(&inode->ei_quota_lock);
+		guard(mutex)(&inode->ei_quota_lock);
 		__bch2_i_sectors_acct(c, inode, quota_res, sectors);
-		mutex_unlock(&inode->ei_quota_lock);
 	}
 }
 
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index a82dfce9e4ad..8b9d3c7d1f57 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -111,9 +111,8 @@ static int bch2_ioc_getlabel(struct bch_fs *c, char __user *user_label)
 
 	BUILD_BUG_ON(BCH_SB_LABEL_SIZE >= FSLABEL_MAX);
 
-	mutex_lock(&c->sb_lock);
-	memcpy(label, c->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
-	mutex_unlock(&c->sb_lock);
+	scoped_guard(mutex, &c->sb_lock)
+		memcpy(label, c->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
 
 	len = strnlen(label, BCH_SB_LABEL_SIZE);
 	if (len == BCH_SB_LABEL_SIZE) {
@@ -152,10 +151,10 @@ static int bch2_ioc_setlabel(struct bch_fs *c,
 	if (ret)
 		return ret;
 
-	mutex_lock(&c->sb_lock);
-	strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE);
-	ret = bch2_write_super(c);
-	mutex_unlock(&c->sb_lock);
+	scoped_guard(mutex, &c->sb_lock) {
+		strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE);
+		ret = bch2_write_super(c);
+	}
 
 	mnt_drop_write_file(file);
 	return ret;
@@ -172,7 +171,10 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
 	if (get_user(flags, arg))
 		return -EFAULT;
 
-	bch_notice(c, "shutdown by ioctl type %u", flags);
+	CLASS(printbuf, buf)();
+	bch2_log_msg_start(c, &buf);
+
+	prt_printf(&buf, "shutdown by ioctl type %u", flags);
 
 	switch (flags) {
 	case FSOP_GOING_FLAGS_DEFAULT:
@@ -180,20 +182,20 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
 		if (ret)
 			break;
 		bch2_journal_flush(&c->journal);
-		bch2_fs_emergency_read_only(c);
+		bch2_fs_emergency_read_only2(c, &buf);
 		bdev_thaw(c->vfs_sb->s_bdev);
 		break;
 	case FSOP_GOING_FLAGS_LOGFLUSH:
 		bch2_journal_flush(&c->journal);
 		fallthrough;
 	case FSOP_GOING_FLAGS_NOLOGFLUSH:
-		bch2_fs_emergency_read_only(c);
+		bch2_fs_emergency_read_only2(c, &buf);
 		break;
 	default:
-		ret = -EINVAL;
-		break;
+		return -EINVAL;
 	}
 
+	bch2_print_str(c, KERN_ERR, buf.buf);
 	return ret;
 }
 
@@ -228,9 +230,8 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
 
 	if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) {
 		/* sync_inodes_sb enforce s_umount is locked */
-		down_read(&c->vfs_sb->s_umount);
+		guard(rwsem_read)(&c->vfs_sb->s_umount);
 		sync_inodes_sb(c->vfs_sb);
-		up_read(&c->vfs_sb->s_umount);
 	}
 
 	if (arg.src_ptr) {
@@ -262,13 +263,13 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
 	}
 
 	if (dst_dentry->d_inode) {
-		error = -BCH_ERR_EEXIST_subvolume_create;
+		error = bch_err_throw(c, EEXIST_subvolume_create);
 		goto err3;
 	}
 
 	dir = dst_path.dentry->d_inode;
 	if (IS_DEADDIR(dir)) {
-		error = -BCH_ERR_ENOENT_directory_dead;
+		error = bch_err_throw(c, ENOENT_directory_dead);
 		goto err3;
 	}
 
@@ -295,12 +296,10 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
 	    !arg.src_ptr)
 		snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol;
 
-	down_write(&c->snapshot_create_lock);
-	inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir),
-			      dst_dentry, arg.mode|S_IFDIR,
-			      0, snapshot_src, create_flags);
-	up_write(&c->snapshot_create_lock);
-
+	scoped_guard(rwsem_write, &c->snapshot_create_lock)
+		inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir),
+				      dst_dentry, arg.mode|S_IFDIR,
+				      0, snapshot_src, create_flags);
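+	/* snapshot_create_lock is already released here: scoped_guard() covers
+	 * only the single __bch2_create() statement above */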
 	error = PTR_ERR_OR_ZERO(inode);
 	if (error)
 		goto err3;
 
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 47f1a64c5c8d..b5e3090f1cb8 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -106,14 +106,13 @@ int __must_check bch2_write_inode(struct bch_fs *c,
 			   inode_set_fn set,
 			   void *p, unsigned fields)
 {
-	struct btree_trans *trans = bch2_trans_get(c);
-	struct btree_iter iter = {};
-	struct bch_inode_unpacked inode_u;
-	int ret;
+	CLASS(btree_trans, trans)(c);
retry:
 	bch2_trans_begin(trans);
 
-	ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent);
+	struct btree_iter iter = {};
+	struct bch_inode_unpacked inode_u;
+	int ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent);
 	if (ret)
 		goto err;
 
@@ -124,8 +123,9 @@ int __must_check bch2_write_inode(struct bch_fs *c,
 		goto err;
 
 	struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u);
+	bool rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r));
 
-	if (memcmp(&old_r, &new_r, sizeof(new_r))) {
+	if (rebalance_changed) {
 		ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum);
 		if (ret)
 			goto err;
@@ -141,18 +141,20 @@ int __must_check bch2_write_inode(struct bch_fs *c,
 	if (!ret)
 		bch2_inode_update_after_write(trans, inode, &inode_u, fields);
err:
-	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_iter_exit(&iter);
 
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 
+	if (rebalance_changed)
+		bch2_rebalance_wakeup(c);
+
 	bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
 			     "%s: inode %llu:%llu not found when updating",
 			     bch2_err_str(ret),
 			     inode_inum(inode).subvol,
 			     inode_inum(inode).inum);
 
-	bch2_trans_put(trans);
 	return ret < 0 ? ret : 0;
 }
 
@@ -162,40 +164,30 @@ int bch2_fs_quota_transfer(struct bch_fs *c,
 			   unsigned qtypes,
 			   enum quota_acct_mode mode)
 {
-	unsigned i;
-	int ret;
-
 	qtypes &= enabled_qtypes(c);
 
-	for (i = 0; i < QTYP_NR; i++)
+	for (unsigned i = 0; i < QTYP_NR; i++)
 		if (new_qid.q[i] == inode->ei_qid.q[i])
 			qtypes &= ~(1U << i);
 
 	if (!qtypes)
 		return 0;
 
-	mutex_lock(&inode->ei_quota_lock);
+	guard(mutex)(&inode->ei_quota_lock);
 
-	ret = bch2_quota_transfer(c, qtypes, new_qid,
+	int ret = bch2_quota_transfer(c, qtypes, new_qid,
 				  inode->ei_qid,
 				  inode->v.i_blocks +
 				  inode->ei_quota_reserved,
 				  mode);
 	if (!ret)
-		for (i = 0; i < QTYP_NR; i++)
+		for (unsigned i = 0; i < QTYP_NR; i++)
 			if (qtypes & (1 << i))
 				inode->ei_qid.q[i] = new_qid.q[i];
 
-	mutex_unlock(&inode->ei_quota_lock);
-
 	return ret;
 }
 
-static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
-{
-	return a.subvol == b.subvol && a.inum == b.inum;
-}
-
 static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
 {
 	const subvol_inum *inum = data;
@@ -242,7 +234,7 @@ int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
 	struct bch_fs *c = trans->c;
 	struct rhltable *ht = &c->vfs_inodes_by_inum_table;
 	u64 inum = p.offset;
-	DARRAY(u32) subvols;
+	CLASS(darray_u32, subvols)();
 	int ret = 0;
 
 	if (!test_bit(BCH_FS_started, &c->flags))
@@ -281,7 +273,7 @@ int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
 			rcu_read_unlock();
 			ret = darray_make_room(&subvols, 1);
 			if (ret)
-				goto err;
+				return ret;
 			subvols.nr = 0;
 			goto restart_from_top;
 		}
@@ -304,14 +296,13 @@ int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
 		u32 snap;
 		ret = bch2_subvolume_get_snapshot(trans, *i, &snap);
 		if (ret)
-			goto err;
+			return ret;
 
 		ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot);
 		if (ret)
 			break;
 	}
-err:
-	darray_exit(&subvols);
+
 	return ret;
 }
 
@@ -352,9 +343,8 @@ static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
 			if (!trans) {
 				__wait_on_freeing_inode(c, inode, inum);
 			} else {
-				bch2_trans_unlock(trans);
-				__wait_on_freeing_inode(c, inode, inum);
-				int ret = bch2_trans_relock(trans);
+				int ret = drop_locks_do(trans,
+						(__wait_on_freeing_inode(c, inode, inum), 0));
 				if (ret)
 					return ERR_PTR(ret);
 			}
@@ -369,9 +359,9 @@ static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
 
 static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
 {
-	spin_lock(&inode->v.i_lock);
-	bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
-	spin_unlock(&inode->v.i_lock);
+	bool remove;
+	scoped_guard(spinlock, &inode->v.i_lock)
+		remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
 
 	if (remove) {
 		int ret = rhltable_remove(&c->vfs_inodes_by_inum_table,
@@ -432,9 +422,8 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
 
 			inode_sb_list_add(&inode->v);
 
-			mutex_lock(&c->vfs_inodes_lock);
-			list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
-			mutex_unlock(&c->vfs_inodes_lock);
+			scoped_guard(mutex, &c->vfs_inodes_lock)
+				list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
 			return inode;
 		}
 	}
@@ -516,15 +505,14 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
 	if (inode)
 		return &inode->v;
 
-	struct btree_trans *trans = bch2_trans_get(c);
+	CLASS(btree_trans, trans)(c);
 
 	struct bch_inode_unpacked inode_u;
 	struct bch_subvolume subvol;
 	int ret = lockrestart_do(trans,
 		bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
-		bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
-		PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
-	bch2_trans_put(trans);
+		bch2_inode_find_by_inum_trans(trans, inum, &inode_u) ?:
+		PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)));
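+	/* lockrestart_do() re-runs the whole expression whenever it fails with
+	 * a transaction_restart error; the CLASS-managed trans is put
+	 * automatically when this function returns */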
 
 	return ret ? ERR_PTR(ret) : &inode->v;
 }
 
@@ -536,7 +524,6 @@ __bch2_create(struct mnt_idmap *idmap,
 	      unsigned flags)
 {
 	struct bch_fs *c = dir->v.i_sb->s_fs_info;
-	struct btree_trans *trans;
 	struct bch_inode_unpacked dir_u;
 	struct bch_inode_info *inode;
 	struct bch_inode_unpacked inode_u;
@@ -557,18 +544,23 @@ __bch2_create(struct mnt_idmap *idmap,
 	if (ret)
 		return ERR_PTR(ret);
 #endif
+
 	inode = __bch2_new_inode(c, GFP_NOFS);
 	if (unlikely(!inode)) {
-		inode = ERR_PTR(-ENOMEM);
-		goto err;
+		posix_acl_release(default_acl);
+		posix_acl_release(acl);
+		return ERR_PTR(-ENOMEM);
 	}
 
 	bch2_inode_init_early(c, &inode_u);
 
 	if (!(flags & BCH_CREATE_TMPFILE))
 		mutex_lock(&dir->ei_update_lock);
-
-	trans = bch2_trans_get(c);
+	/*
+	 * posix_acl_create() calls get_acl -> btree transaction, don't start
+	 * ours until after, ei->update_lock must also be taken first:
+	 */
+	CLASS(btree_trans, trans)(c);
retry:
 	bch2_trans_begin(trans);
 
@@ -627,7 +619,6 @@ __bch2_create(struct mnt_idmap *idmap,
 	 * restart here.
 	 */
 	inode = bch2_inode_hash_insert(c, NULL, inode);
-	bch2_trans_put(trans);
err:
 	posix_acl_release(default_acl);
 	posix_acl_release(acl);
@@ -636,7 +627,6 @@ __bch2_create(struct mnt_idmap *idmap,
 	if (!(flags & BCH_CREATE_TMPFILE))
 		mutex_unlock(&dir->ei_update_lock);
 
-	bch2_trans_put(trans);
 	make_bad_inode(&inode->v);
 	iput(&inode->v);
 	inode = ERR_PTR(ret);
@@ -651,7 +641,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
 {
 	struct bch_fs *c = trans->c;
 	subvol_inum inum = {};
-	struct printbuf buf = PRINTBUF;
+	CLASS(printbuf, buf)();
 
 	struct qstr lookup_name;
 	int ret = bch2_maybe_casefold(trans, dir_hash_info, name, &lookup_name);
@@ -702,8 +692,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
 	if (ret)
 		goto err;
out:
-	bch2_trans_iter_exit(trans, &dirent_iter);
-	printbuf_exit(&buf);
+	bch2_trans_iter_exit(&dirent_iter);
 	return inode;
err:
 	inode = ERR_PTR(ret);
@@ -724,7 +713,6 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		inode = NULL;
 
-#ifdef CONFIG_UNICODE
 	if (!inode && IS_CASEFOLDED(vdir)) {
 		/*
 		 * Do not cache a negative dentry in casefolded directories
@@ -739,7 +727,6 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
 		 */
 		return NULL;
 	}
-#endif
 
 	return d_splice_alias(&inode->v, dentry);
 }
@@ -774,8 +761,8 @@ static int __bch2_link(struct bch_fs *c,
 	struct bch_inode_unpacked dir_u, inode_u;
 	int ret;
 
-	mutex_lock(&inode->ei_update_lock);
-	struct btree_trans *trans = bch2_trans_get(c);
+	guard(mutex)(&inode->ei_update_lock);
+	CLASS(btree_trans, trans)(c);
 
 	ret = commit_do(trans, NULL, NULL, 0,
 			bch2_link_trans(trans,
@@ -789,8 +776,6 @@ static int __bch2_link(struct bch_fs *c,
 		bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
 	}
 
-	bch2_trans_put(trans);
-	mutex_unlock(&inode->ei_update_lock);
 	return ret;
 }
 
@@ -825,8 +810,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
 	int ret;
 
 	bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
-
-	struct btree_trans *trans = bch2_trans_get(c);
+	CLASS(btree_trans, trans)(c);
 
 	ret = commit_do(trans, NULL, NULL,
 			BCH_TRANS_COMMIT_no_enospc,
@@ -853,7 +837,6 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
 	if (IS_CASEFOLDED(vdir))
 		d_invalidate(dentry);
err:
-	bch2_trans_put(trans);
 	bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
 
 	return ret;
@@ -922,7 +905,6 @@ static int bch2_rename2(struct mnt_idmap *idmap,
 	struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
 	struct bch_inode_unpacked dst_dir_u, src_dir_u;
 	struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u;
-	struct btree_trans *trans;
 	enum bch_rename_mode mode = flags & RENAME_EXCHANGE
 		? BCH_RENAME_EXCHANGE
 		: dst_dentry->d_inode
@@ -946,7 +928,7 @@ static int bch2_rename2(struct mnt_idmap *idmap,
 			 src_inode,
 			 dst_inode);
 
-	trans = bch2_trans_get(c);
+	CLASS(btree_trans, trans)(c);
 
 	ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
 	      bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
@@ -1032,8 +1014,6 @@ static int bch2_rename2(struct mnt_idmap *idmap,
 		bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u, ATTR_CTIME);
err:
-	bch2_trans_put(trans);
-
 	bch2_fs_quota_transfer(c, src_inode,
 			       bch_qid(&src_inode->ei_inode),
 			       1 << QTYP_PRJ,
@@ -1101,7 +1081,6 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
 {
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	struct bch_qid qid;
-	struct btree_trans *trans;
 	struct btree_iter inode_iter = {};
 	struct bch_inode_unpacked inode_u;
 	struct posix_acl *acl = NULL;
@@ -1109,7 +1088,7 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
 	kgid_t kgid;
 	int ret;
 
-	mutex_lock(&inode->ei_update_lock);
+	guard(mutex)(&inode->ei_update_lock);
 
 	qid = inode->ei_qid;
 
@@ -1126,9 +1105,9 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
 	ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
 				     KEY_TYPE_QUOTA_PREALLOC);
 	if (ret)
-		goto err;
+		return ret;
 
-	trans = bch2_trans_get(c);
+	CLASS(btree_trans, trans)(c);
retry:
 	bch2_trans_begin(trans);
 	kfree(acl);
@@ -1152,23 +1131,18 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
 		bch2_trans_commit(trans, NULL, NULL,
 				  BCH_TRANS_COMMIT_no_enospc);
btree_err:
-	bch2_trans_iter_exit(trans, &inode_iter);
+	bch2_trans_iter_exit(&inode_iter);
 
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 	if (unlikely(ret))
-		goto err_trans;
+		return ret;
 
 	bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
 
 	if (acl)
 		set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
-err_trans:
-	bch2_trans_put(trans);
-err:
-	mutex_unlock(&inode->ei_update_lock);
-
-	return bch2_err_class(ret);
+
+	return 0;
 }
 
 static int bch2_getattr(struct mnt_idmap *idmap,
@@ -1232,18 +1206,16 @@ static int bch2_setattr(struct mnt_idmap *idmap,
 {
 	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	int ret;
 
 	lockdep_assert_held(&inode->v.i_rwsem);
 
-	ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
-		setattr_prepare(idmap, dentry, iattr);
-	if (ret)
-		return ret;
+	int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
+		setattr_prepare(idmap, dentry, iattr) ?:
+		(iattr->ia_valid & ATTR_SIZE
+		 ? bchfs_truncate(idmap, inode, iattr)
+		 : bch2_setattr_nonsize(idmap, inode, iattr));
 
-	return iattr->ia_valid & ATTR_SIZE
-		? bchfs_truncate(idmap, inode, iattr)
-		: bch2_setattr_nonsize(idmap, inode, iattr);
+	return bch2_err_class(ret);
 }
 
 static int bch2_tmpfile(struct mnt_idmap *idmap,
@@ -1323,8 +1295,14 @@ static int bch2_fill_extent(struct bch_fs *c,
 				     flags|
 				     FIEMAP_EXTENT_DELALLOC|
 				     FIEMAP_EXTENT_UNWRITTEN);
+	} else if (k.k->type == KEY_TYPE_error) {
+		return 0;
 	} else {
-		BUG();
+		WARN_ONCE(1, "unhandled key type %s",
+			  k.k->type < KEY_TYPE_MAX
+			  ? bch2_bkey_types[k.k->type]
+			  : "(unknown)");
+		return 0;
 	}
 }
 
@@ -1419,21 +1397,20 @@ static int bch2_next_fiemap_extent(struct btree_trans *trans,
 	if (ret)
 		return ret;
 
-	struct btree_iter iter;
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-			     SPOS(inode->ei_inum.inum, start, snapshot), 0);
+	CLASS(btree_iter, iter)(trans, BTREE_ID_extents,
+				SPOS(inode->ei_inum.inum, start, snapshot), 0);
 
 	struct bkey_s_c k =
-		bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end));
+		bch2_btree_iter_peek_max(&iter, POS(inode->ei_inum.inum, end));
 	ret = bkey_err(k);
 	if (ret)
-		goto err;
+		return ret;
 
 	u64 pagecache_end = k.k ? max(start, bkey_start_offset(k.k)) : end;
 
 	ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, pagecache_end, cur);
 	if (ret)
-		goto err;
+		return ret;
 
 	struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k);
 
@@ -1469,7 +1446,7 @@ static int bch2_next_fiemap_extent(struct btree_trans *trans,
 		ret = bch2_read_indirect_extent(trans, &data_btree,
 						&offset_into_extent, &cur->kbuf);
 		if (ret)
-			goto err;
+			return ret;
 
 		struct bkey_i *k = cur->kbuf.k;
 		sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent);
@@ -1481,9 +1458,8 @@ static int bch2_next_fiemap_extent(struct btree_trans *trans,
 		k->k.p = iter.pos;
 		k->k.p.offset += k->k.size;
 	}
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
+
+	return 0;
 }
 
 static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
@@ -1491,7 +1467,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 {
 	struct bch_fs *c = vinode->i_sb->s_fs_info;
 	struct bch_inode_info *ei = to_bch_ei(vinode);
-	struct btree_trans *trans;
 	struct bch_fiemap_extent cur, prev;
 	int ret = 0;
 
@@ -1509,7 +1484,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 	bch2_bkey_buf_init(&prev.kbuf);
 	bkey_init(&prev.kbuf.k->k);
 
-	trans = bch2_trans_get(c);
+	CLASS(btree_trans, trans)(c);
 
 	while (start < end) {
 		ret = lockrestart_do(trans,
@@ -1542,7 +1517,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 		ret = bch2_fill_extent(c, info, &prev);
 	}
err:
-	bch2_trans_put(trans);
 	bch2_bkey_buf_exit(&cur.kbuf, c);
 	bch2_bkey_buf_exit(&prev.kbuf, c);
 
@@ -1575,11 +1549,12 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct bch_inode_info *inode = file_bch_inode(file);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
 
 	if (!dir_emit_dots(file, ctx))
 		return 0;
 
-	int ret = bch2_readdir(c, inode_inum(inode), ctx);
+	int ret = bch2_readdir(c, inode_inum(inode), &hash, ctx);
 
 	bch_err_fn(c, ret);
 	return bch2_err_class(ret);
@@ -1695,11 +1670,15 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap,
 		s.mask = map_defined(bch_flags_to_xflags);
 		s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags);
-		if (fa->fsx_xflags)
-			return -EOPNOTSUPP;
+		if (fa->fsx_xflags) {
+			ret = bch_err_throw(c, unsupported_fsx_flag);
+			goto err;
+		}
 
-		if (fa->fsx_projid >= U32_MAX)
-			return -EINVAL;
+		if (fa->fsx_projid >= U32_MAX) {
+			ret = bch_err_throw(c, projid_too_big);
+			goto err;
+		}
 
 		/*
 		 * inode fields accessible via the xattr interface are stored with a +1
@@ -1721,8 +1700,10 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap,
 			fa->flags &= ~FS_CASEFOLD_FL;
 
 		s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags);
-		if (fa->flags)
-			return -EOPNOTSUPP;
+		if (fa->flags) {
+			ret = bch_err_throw(c, unsupported_fa_flag);
+			goto err;
+		}
 	}
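+	/* bch_err_throw() records the error site and returns a bcachefs-private
+	 * error code; bch2_err_class() maps it back to a standard errno before
+	 * it is returned to the VFS below */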
 
 	mutex_lock(&inode->ei_update_lock);
@@ -1733,7 +1714,8 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap,
 		bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
 				 ATTR_CTIME);
 	mutex_unlock(&inode->ei_update_lock);
-	return ret;
+err:
+	return bch2_err_class(ret);
 }
 
 static const struct file_operations bch_file_operations = {
@@ -1964,9 +1946,6 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
 	struct bch_inode_info *inode = to_bch_ei(child->d_inode);
 	struct bch_inode_info *dir = to_bch_ei(parent->d_inode);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct btree_trans *trans;
-	struct btree_iter iter1;
-	struct btree_iter iter2;
 	struct bkey_s_c k;
 	struct bkey_s_c_dirent d;
 	struct bch_inode_unpacked inode_u;
@@ -1979,12 +1958,11 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
 	if (!S_ISDIR(dir->v.i_mode))
 		return -EINVAL;
 
-	trans = bch2_trans_get(c);
-
-	bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
-			     POS(dir->ei_inode.bi_inum, 0), 0);
-	bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
-			     POS(dir->ei_inode.bi_inum, 0), 0);
+	CLASS(btree_trans, trans)(c);
+	CLASS(btree_iter, iter1)(trans, BTREE_ID_dirents,
+				 POS(dir->ei_inode.bi_inum, 0), 0);
+	CLASS(btree_iter, iter2)(trans, BTREE_ID_dirents,
+				 POS(dir->ei_inode.bi_inum, 0), 0);
retry:
 	bch2_trans_begin(trans);
 
@@ -1992,30 +1970,30 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
 	if (ret)
 		goto err;
 
-	bch2_btree_iter_set_snapshot(trans, &iter1, snapshot);
-	bch2_btree_iter_set_snapshot(trans, &iter2, snapshot);
+	bch2_btree_iter_set_snapshot(&iter1, snapshot);
+	bch2_btree_iter_set_snapshot(&iter2, snapshot);
 
 	ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
 	if (ret)
 		goto err;
 
 	if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
-		bch2_btree_iter_set_pos(trans, &iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
+		bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
 
-		k = bch2_btree_iter_peek_slot(trans, &iter1);
+		k = bch2_btree_iter_peek_slot(&iter1);
 		ret = bkey_err(k);
 		if (ret)
 			goto err;
 
 		if (k.k->type != KEY_TYPE_dirent) {
-			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+			ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
 			goto err;
 		}
 
 		d = bkey_s_c_to_dirent(k);
 		ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
 		if (ret > 0)
-			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+			ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
 		if (ret)
 			goto err;
 
@@ -2026,7 +2004,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
 		 * File with multiple hardlinks and our backref is to the wrong
 		 * directory - linear search:
 		 */
-		for_each_btree_key_continue_norestart(trans, iter2, 0, k, ret) {
+		for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
 			if (k.k->p.inode > dir->ei_inode.bi_inum)
 				break;
 
@@ -2057,10 +2035,6 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 
-	bch2_trans_iter_exit(trans, &iter1);
-	bch2_trans_iter_exit(trans, &iter2);
-	bch2_trans_put(trans);
-
 	return ret;
 }
 
@@ -2144,12 +2118,11 @@ static int bch2_vfs_write_inode(struct inode *vinode,
 {
 	struct bch_fs *c = vinode->i_sb->s_fs_info;
 	struct bch_inode_info *inode = to_bch_ei(vinode);
-	int ret;
 
-	mutex_lock(&inode->ei_update_lock);
-	ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
-			       ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
-	mutex_unlock(&inode->ei_update_lock);
+	guard(mutex)(&inode->ei_update_lock);
+
+	int ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+				   ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
 
 	return bch2_err_class(ret);
 }
@@ -2181,7 +2154,13 @@ static void bch2_evict_inode(struct inode *vinode)
 				KEY_TYPE_QUOTA_WARN);
 		bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
 				KEY_TYPE_QUOTA_WARN);
-		bch2_inode_rm(c, inode_inum(inode));
+		int ret = bch2_inode_rm(c, inode_inum(inode));
+		if (ret && !bch2_err_matches(ret, EROFS)) {
+			bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu",
+				    inode->ei_inum.subvol,
+				    inode->ei_inum.inum);
+			bch2_sb_error_count(c, BCH_FSCK_ERR_vfs_bad_inode_rm);
+		}
 
 		/*
 		 * If we are deleting, we need it present in the vfs hash table
@@ -2190,9 +2169,8 @@ static void bch2_evict_inode(struct inode *vinode)
 		bch2_inode_hash_remove(c, inode);
 	}
 
-	mutex_lock(&c->vfs_inodes_lock);
-	list_del_init(&inode->ei_vfs_inode_list);
-	mutex_unlock(&c->vfs_inodes_lock);
+	scoped_guard(mutex, &c->vfs_inodes_lock)
+		list_del_init(&inode->ei_vfs_inode_list);
 }
 
 void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
@@ -2328,7 +2306,8 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
 	struct bch_fs *c = root->d_sb->s_fs_info;
 	bool first = true;
 
-	for_each_online_member(c, ca) {
+	guard(rcu)();
+	for_each_online_member_rcu(c, ca) {
 		if (!first)
 			seq_putc(seq, ':');
 		first = false;
@@ -2341,16 +2320,14 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
 static int bch2_show_options(struct seq_file *seq, struct dentry *root)
 {
 	struct bch_fs *c = root->d_sb->s_fs_info;
-	struct printbuf buf = PRINTBUF;
+	CLASS(printbuf, buf)();
 
 	bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
 			  OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
 	printbuf_nul_terminate(&buf);
 	seq_printf(seq, ",%s", buf.buf);
 
-	int ret = buf.allocation_failure ? -ENOMEM : 0;
-	printbuf_exit(&buf);
-	return ret;
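+	/* no printbuf_exit() needed: the printbuf CLASS destructor frees buf
+	 * when it goes out of scope */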
-ENOMEM : 0; } static void bch2_put_super(struct super_block *sb) @@ -2372,24 +2349,20 @@ static int bch2_freeze(struct super_block *sb) { struct bch_fs *c = sb->s_fs_info; - down_write(&c->state_lock); + guard(rwsem_write)(&c->state_lock); bch2_fs_read_only(c); - up_write(&c->state_lock); return 0; } static int bch2_unfreeze(struct super_block *sb) { struct bch_fs *c = sb->s_fs_info; - int ret; if (test_bit(BCH_FS_emergency_ro, &c->flags)) return 0; - down_write(&c->state_lock); - ret = bch2_fs_read_write(c); - up_write(&c->state_lock); - return ret; + guard(rwsem_write)(&c->state_lock); + return bch2_fs_read_write(c); } static const struct super_operations bch_super_operations = { @@ -2440,7 +2413,7 @@ static int bch2_fs_get_tree(struct fs_context *fc) struct inode *vinode; struct bch2_opts_parse *opts_parse = fc->fs_private; struct bch_opts opts = opts_parse->opts; - darray_str devs; + darray_const_str devs; darray_fs devs_to_fs = {}; int ret; @@ -2464,7 +2437,7 @@ static int bch2_fs_get_tree(struct fs_context *fc) if (!IS_ERR(sb)) goto got_sb; - c = bch2_fs_open(devs.data, devs.nr, opts); + c = bch2_fs_open(&devs, &opts); ret = PTR_ERR_OR_ZERO(c); if (ret) goto err; @@ -2484,6 +2457,14 @@ static int bch2_fs_get_tree(struct fs_context *fc) if (ret) goto err_stop_fs; + /* + * We might be doing a RO mount because other options required it, or we + * have no alloc info and it's a small image with no room to regenerate + * it + */ + if (c->opts.read_only) + fc->sb_flags |= SB_RDONLY; + sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c); ret = PTR_ERR_OR_ZERO(sb); if (ret) @@ -2514,7 +2495,12 @@ static int bch2_fs_get_tree(struct fs_context *fc) sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid)); - super_set_sysfs_name_uuid(sb); + + if (c->sb.multi_device) + super_set_sysfs_name_uuid(sb); + else + strscpy(sb->s_sysfs_name, c->name, sizeof(sb->s_sysfs_name)); + sb->s_shrink->seeks = 0; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); @@ -2525,14 +2511,15 @@ static int bch2_fs_get_tree(struct fs_context *fc) sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; - for_each_online_member(c, ca) { - struct block_device *bdev = ca->disk_sb.bdev; + scoped_guard(rcu) { + for_each_online_member_rcu(c, ca) { + struct block_device *bdev = ca->disk_sb.bdev; - /* XXX: create an anonymous device for multi device filesystems */ - sb->s_bdev = bdev; - sb->s_dev = bdev->bd_dev; - percpu_ref_put(&ca->io_ref[READ]); - break; + /* XXX: create an anonymous device for multi device filesystems */ + sb->s_bdev = bdev; + sb->s_dev = bdev->bd_dev; + break; + } } c->dev = sb->s_dev; @@ -2544,10 +2531,11 @@ static int bch2_fs_get_tree(struct fs_context *fc) sb->s_shrink->seeks = 0; -#ifdef CONFIG_UNICODE - sb->s_encoding = c->cf_encoding; -#endif +#if IS_ENABLED(CONFIG_UNICODE) + if (!bch2_fs_casefold_enabled(c)) + sb->s_encoding = c->cf_encoding; generic_set_sb_d_ops(sb); +#endif vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); ret = PTR_ERR_OR_ZERO(vinode); @@ -2645,7 +2633,7 @@ static int bch2_fs_reconfigure(struct fs_context *fc) opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); if (opts->opts.read_only != c->opts.read_only) { - down_write(&c->state_lock); + guard(rwsem_write)(&c->state_lock); if (opts->opts.read_only) { bch2_fs_read_only(c); @@ -2655,22 +2643,18 @@ static int bch2_fs_reconfigure(struct fs_context *fc) ret = 
bch2_fs_read_write(c); if (ret) { bch_err(c, "error going rw: %i", ret); - up_write(&c->state_lock); - ret = -EINVAL; - goto err; + return -EINVAL; } sb->s_flags &= ~SB_RDONLY; } c->opts.read_only = opts->opts.read_only; - - up_write(&c->state_lock); } if (opt_defined(opts->opts, errors)) c->opts.errors = opts->opts.errors; -err: + return bch2_err_class(ret); } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index aaf187085276..6ccea09243ab 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -12,8 +12,10 @@ #include "fs.h" #include "fsck.h" #include "inode.h" +#include "io_misc.h" #include "keylist.h" #include "namei.h" +#include "progress.h" #include "recovery_passes.h" #include "snapshot.h" #include "super.h" @@ -23,14 +25,15 @@ #include #include /* struct qstr */ -static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d, +static int dirent_points_to_inode_nowarn(struct bch_fs *c, + struct bkey_s_c_dirent d, struct bch_inode_unpacked *inode) { if (d.v->d_type == DT_SUBVOL ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol : le64_to_cpu(d.v->d_inum) == inode->bi_inum) return 0; - return -BCH_ERR_ENOENT_dirent_doesnt_match_inode; + return bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); } static void dirent_inode_mismatch_msg(struct printbuf *out, @@ -49,12 +52,11 @@ static int dirent_points_to_inode(struct bch_fs *c, struct bkey_s_c_dirent dirent, struct bch_inode_unpacked *inode) { - int ret = dirent_points_to_inode_nowarn(dirent, inode); + int ret = dirent_points_to_inode_nowarn(c, dirent, inode); if (ret) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); dirent_inode_mismatch_msg(&buf, c, dirent, inode); bch_warn(c, "%s", buf.buf); - printbuf_exit(&buf); } return ret; } @@ -109,27 +111,6 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, return ret; } -static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inode_nr, snapshot), 0); - ret = bkey_err(k); - if (ret) - goto err; - - ret = bkey_is_inode(k.k) - ? 
bch2_inode_unpack(k, inode) - : -BCH_ERR_ENOENT_inode; -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static int lookup_dirent_in_snapshot(struct btree_trans *trans, struct bch_hash_info hash_info, subvol_inum dir, struct qstr *name, @@ -145,7 +126,7 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); *target = le64_to_cpu(d.v->d_inum); *type = d.v->d_type; - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return 0; } @@ -156,7 +137,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, static int find_snapshot_tree_subvol(struct btree_trans *trans, u32 tree_id, u32 *subvol) { - struct btree_iter iter; struct bkey_s_c k; int ret; @@ -170,13 +150,11 @@ static int find_snapshot_tree_subvol(struct btree_trans *trans, if (s.v->subvol) { *subvol = le32_to_cpu(s.v->subvol); - goto found; + return 0; } } - ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol; -found: - bch2_trans_iter_exit(trans, &iter); - return ret; + + return ret ?: bch_err_throw(trans->c, ENOENT_no_snapshot_tree_subvol); } /* Get lost+found, create if it doesn't exist: */ @@ -186,7 +164,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, { struct bch_fs *c = trans->c; struct qstr lostfound_str = QSTR("lost+found"); - struct btree_iter lostfound_iter = {}; + struct btree_iter lostfound_iter = { NULL }; u64 inum = 0; unsigned d_type = 0; int ret; @@ -212,8 +190,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, return ret; if (!subvol.inode) { - struct btree_iter iter; - struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, + struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, BTREE_ID_subvolumes, POS(0, subvolid), 0, subvolume); ret = PTR_ERR_OR_ZERO(subvol); @@ -221,7 +198,6 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, return ret; subvol->v.inode = cpu_to_le64(reattaching_inum); - bch2_trans_iter_exit(trans, &iter); } subvol_inum root_inum = { @@ -231,7 +207,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, struct bch_inode_unpacked root_inode; struct bch_hash_info root_hash_info; - ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode); + ret = bch2_inode_find_by_inum_snapshot(trans, root_inum.inum, snapshot, &root_inode, 0); bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", root_inum.inum, subvolid); if (ret) @@ -250,14 +226,14 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, if (d_type != DT_DIR) { bch_err(c, "error looking up lost+found: not a directory"); - return -BCH_ERR_ENOENT_not_directory; + return bch_err_throw(c, ENOENT_not_directory); } /* * The bch2_check_dirents pass has already run, dangling dirents * shouldn't exist here: */ - ret = lookup_inode(trans, inum, snapshot, lostfound); + ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, lostfound, 0); bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); return ret; @@ -272,20 +248,19 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, * XXX: we could have a nicer log message here if we had a nice way to * walk backpointers to print a path */ - struct printbuf path = PRINTBUF; + CLASS(printbuf, path)(); ret = bch2_inum_to_path(trans, root_inum, &path); if (ret) goto err; bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u", path.buf, 
root_inum.subvol, snapshot); - printbuf_exit(&path); u64 now = bch2_current_time(c); u64 cpu = raw_smp_processor_id(); bch2_inode_init_early(c, lostfound); - bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); + bch2_inode_init_late(c, lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); lostfound->bi_dir = root_inode.bi_inum; lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot); @@ -295,8 +270,8 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, if (ret) goto err; - bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot); - ret = bch2_btree_iter_traverse(trans, &lostfound_iter); + bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot); + ret = bch2_btree_iter_traverse(&lostfound_iter); if (ret) goto err; @@ -312,7 +287,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, BTREE_UPDATE_internal_snapshot_node); err: bch_err_msg(c, ret, "creating lost+found"); - bch2_trans_iter_exit(trans, &lostfound_iter); + bch2_trans_iter_exit(&lostfound_iter); return ret; } @@ -347,16 +322,17 @@ static inline bool inode_should_reattach(struct bch_inode_unpacked *inode) (inode->bi_flags & BCH_INODE_has_child_snapshot)) return false; - return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked); + return !bch2_inode_has_backpointer(inode) && + !(inode->bi_flags & BCH_INODE_unlinked); } static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot) { - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents, - SPOS(d_pos.inode, d_pos.offset, snapshot), - BTREE_ITER_intent| - BTREE_ITER_with_updates); + CLASS(btree_iter, iter)(trans, BTREE_ID_dirents, + SPOS(d_pos.inode, d_pos.offset, snapshot), + BTREE_ITER_intent| + BTREE_ITER_with_updates); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); int ret = bkey_err(k); if (ret) return ret; @@ -369,16 +345,15 @@ static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); ret = PTR_ERR_OR_ZERO(k); if (ret) - goto err; + return ret; bkey_init(&k->k); k->k.type = KEY_TYPE_whiteout; k->k.p = iter.pos; - ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node); + return bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node); } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + + return 0; } static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) @@ -392,6 +367,16 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * if (inode->bi_subvol) { inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; + struct bkey_i_subvolume *subvol = + bch2_bkey_get_mut_typed(trans, + BTREE_ID_subvolumes, POS(0, inode->bi_subvol), + 0, subvolume); + ret = PTR_ERR_OR_ZERO(subvol); + if (ret) + return ret; + + subvol->v.fs_path_parent = BCACHEFS_ROOT_SUBVOL; + u64 root_inum; ret = subvol_lookup(trans, inode->bi_parent_subvol, &dirent_snapshot, &root_inum); @@ -407,6 +392,8 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * if (ret) return ret; + bch_verbose(c, "got lostfound inum %llu", lostfound.bi_inum); + lostfound.bi_nlink += S_ISDIR(inode->bi_mode); /* ensure lost+found inode is also present in inode snapshot */ @@ -443,14 +430,23 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * if (ret) return ret; + { + CLASS(printbuf, buf)(); + ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, + 
inode->bi_snapshot, NULL, &buf); + if (ret) + return ret; + + bch_info(c, "reattached at %s", buf.buf); + } + /* * Fix up inodes in child snapshots: if they should also be reattached * update the backpointer field, if they should not be we need to emit * whiteouts for the dirent we just created. */ if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) { - snapshot_id_list whiteouts_done; - struct btree_iter iter; + CLASS(snapshot_id_list, whiteouts_done)(); struct bkey_s_c k; darray_init(&whiteouts_done); @@ -469,19 +465,16 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * struct bch_inode_unpacked child_inode; ret = bch2_inode_unpack(k, &child_inode); if (ret) - break; + return ret; if (!inode_should_reattach(&child_inode)) { - ret = maybe_delete_dirent(trans, - SPOS(lostfound.bi_inum, inode->bi_dir_offset, - dirent_snapshot), - k.k->p.snapshot); - if (ret) - break; - - ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot); + ret = maybe_delete_dirent(trans, + SPOS(lostfound.bi_inum, inode->bi_dir_offset, + dirent_snapshot), + k.k->p.snapshot) ?: + snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot); if (ret) - break; + return ret; } else { iter.snapshot = k.k->p.snapshot; child_inode.bi_dir = inode->bi_dir; @@ -490,11 +483,9 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * ret = bch2_inode_write_flags(trans, &iter, &child_inode, BTREE_UPDATE_internal_snapshot_node); if (ret) - break; + return ret; } } - darray_exit(&whiteouts_done); - bch2_trans_iter_exit(trans, &iter); } return ret; @@ -504,23 +495,35 @@ static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, struct btree_iter *iter, struct bpos pos) { - return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); + bch2_trans_iter_init(trans, iter, BTREE_ID_dirents, pos, 0); + struct bkey_s_c_dirent d = bch2_bkey_get_typed(iter, dirent); + if (bkey_err(d.s_c)) + bch2_trans_iter_exit(iter); + return d; } static int remove_backpointer(struct btree_trans *trans, struct bch_inode_unpacked *inode) { - if (!inode->bi_dir) + if (!bch2_inode_has_backpointer(inode)) return 0; + u32 snapshot = inode->bi_snapshot; + + if (inode->bi_parent_subvol) { + int ret = bch2_subvolume_get_snapshot(trans, inode->bi_parent_subvol, &snapshot); + if (ret) + return ret; + } + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter, - SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot)); + SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); int ret = bkey_err(d) ?: dirent_points_to_inode(c, d, inode) ?: bch2_fsck_remove_dirent(trans, d.k->p); - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } @@ -552,7 +555,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub if (!bch2_snapshot_is_leaf(c, snapshotid)) { bch_err(c, "need to reconstruct subvol, but have interior node snapshot"); - return -BCH_ERR_fsck_repair_unimplemented; + return bch_err_throw(c, fsck_repair_unimplemented); } /* @@ -566,14 +569,14 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub u64 cpu = raw_smp_processor_id(); bch2_inode_init_early(c, &new_inode); - bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); + bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); new_inode.bi_subvol = subvolid; int ret = bch2_inode_create(trans, 
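remove_backpointer() above leans on the GNU C conditional with omitted middle operand, a ?: b, which yields a if it is nonzero and b otherwise, evaluating b only when needed. Chaining int-returning steps this way runs them in order and stops at the first failure; a self-contained demonstration:

    #include <stdio.h>

    static int step1(void) { return 0; }    /* succeeds */
    static int step2(void) { return -2; }   /* fails */
    static int step3(void) { printf("never reached\n"); return 0; }

    int main(void)
    {
            /* mirrors: ret = bkey_err(d) ?: dirent_points_to_inode(...) ?: ... */
            int ret = step1() ?: step2() ?: step3();

            printf("ret = %d\n", ret);      /* -2; step3() was skipped */
            return 0;
    }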
&inode_iter, &new_inode, snapshotid, cpu) ?: - bch2_btree_iter_traverse(trans, &inode_iter) ?: + bch2_btree_iter_traverse(&inode_iter) ?: bch2_inode_write(trans, &inode_iter, &new_inode); - bch2_trans_iter_exit(trans, &inode_iter); + bch2_trans_iter_exit(&inode_iter); if (ret) return ret; @@ -595,8 +598,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub if (ret) return ret; - struct btree_iter iter; - struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter, + struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, BTREE_ID_snapshots, POS(0, snapshotid), 0, snapshot); ret = PTR_ERR_OR_ZERO(s); @@ -608,9 +610,8 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub s->v.subvol = cpu_to_le32(subvolid); SET_BCH_SNAPSHOT_SUBVOL(&s->v, true); - bch2_trans_iter_exit(trans, &iter); - struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter, + struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, BTREE_ID_snapshot_trees, POS(0, snapshot_tree), 0, snapshot_tree); ret = PTR_ERR_OR_ZERO(st); @@ -620,8 +621,6 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub if (!st->v.master_subvol) st->v.master_subvol = cpu_to_le32(subvolid); - - bch2_trans_iter_exit(trans, &iter); return 0; } @@ -633,11 +632,8 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 switch (btree) { case BTREE_ID_extents: { - struct btree_iter iter = {}; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); - struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0)); - bch2_trans_iter_exit(trans, &iter); + CLASS(btree_iter, iter)(trans, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); + struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0)); int ret = bkey_err(k); if (ret) return ret; @@ -656,7 +652,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 struct bch_inode_unpacked new_inode; bch2_inode_init_early(c, &new_inode); - bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); + bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); new_inode.bi_size = i_size; new_inode.bi_inum = inum; new_inode.bi_snapshot = snapshot; @@ -664,21 +660,20 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 return __bch2_fsck_write_inode(trans, &new_inode); } -struct snapshots_seen { - struct bpos pos; - snapshot_id_list ids; -}; - static inline void snapshots_seen_exit(struct snapshots_seen *s) { darray_exit(&s->ids); } -static inline void snapshots_seen_init(struct snapshots_seen *s) +static inline struct snapshots_seen snapshots_seen_init(void) { - memset(s, 0, sizeof(*s)); + return (struct snapshots_seen) {}; } +DEFINE_CLASS(snapshots_seen, struct snapshots_seen, + snapshots_seen_exit(&_T), + snapshots_seen_init(), void) + static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id) { u32 *i; @@ -720,14 +715,8 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, u32 id, u32 ancestor) { - ssize_t i; - EBUG_ON(id > ancestor); - /* @ancestor should be the snapshot most recently added to @seen */ - EBUG_ON(ancestor != seen->pos.snapshot); - EBUG_ON(ancestor != darray_last(seen->ids)); - if (id == ancestor) return true; @@ -743,11 +732,8 @@ static bool 
key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see * numerically, since snapshot ID lists are kept sorted, so if we find * an id that's an ancestor of @id we're done: */ - - for (i = seen->ids.nr - 2; - i >= 0 && seen->ids.data[i] >= id; - --i) - if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i])) + darray_for_each_reverse(seen->ids, i) + if (*i != ancestor && bch2_snapshot_is_ancestor(c, id, *i)) return false; return true; @@ -787,12 +773,12 @@ static int ref_visible2(struct bch_fs *c, #define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ - (_i)->snapshot <= (_snapshot); _i++) \ - if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) + (_i)->inode.bi_snapshot <= (_snapshot); _i++) \ + if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot)) struct inode_walker_entry { struct bch_inode_unpacked inode; - u32 snapshot; + bool whiteout; u64 count; u64 i_size; }; @@ -815,26 +801,36 @@ static void inode_walker_exit(struct inode_walker *w) static struct inode_walker inode_walker_init(void) { - return (struct inode_walker) { 0, }; + return (struct inode_walker) {}; } +DEFINE_CLASS(inode_walker, struct inode_walker, + inode_walker_exit(&_T), + inode_walker_init(), void) + static int add_inode(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c inode) { - struct bch_inode_unpacked u; - - return bch2_inode_unpack(inode, &u) ?: - darray_push(&w->inodes, ((struct inode_walker_entry) { - .inode = u, - .snapshot = inode.k->p.snapshot, + int ret = darray_push(&w->inodes, ((struct inode_walker_entry) { + .whiteout = !bkey_is_inode(inode.k), })); + if (ret) + return ret; + + struct inode_walker_entry *n = &darray_last(w->inodes); + if (!n->whiteout) { + return bch2_inode_unpack(inode, &n->inode); + } else { + n->inode.bi_inum = inode.k->p.offset; + n->inode.bi_snapshot = inode.k->p.snapshot; + return 0; + } } static int get_inodes_all_snapshots(struct btree_trans *trans, struct inode_walker *w, u64 inum) { struct bch_fs *c = trans->c; - struct btree_iter iter; struct bkey_s_c k; int ret; @@ -847,15 +843,13 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, w->recalculate_sums = false; w->inodes.nr = 0; - for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) + for_each_btree_key_max_norestart(trans, iter, + BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX), + BTREE_ITER_all_snapshots, k, ret) { + ret = add_inode(c, w, k); + if (ret) break; - - if (bkey_is_inode(k.k)) - add_inode(c, w, k); } - bch2_trans_iter_exit(trans, &iter); if (ret) return ret; @@ -865,48 +859,104 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, return 0; } -static struct inode_walker_entry * -lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) +static int get_visible_inodes(struct btree_trans *trans, + struct inode_walker *w, + struct snapshots_seen *s, + u64 inum) { - bool is_whiteout = k.k->type == KEY_TYPE_whiteout; + struct bch_fs *c = trans->c; + struct bkey_s_c k; + int ret; - struct inode_walker_entry *i; - __darray_for_each(w->inodes, i) - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot)) - goto found; + w->inodes.nr = 0; + w->deletes.nr = 0; - return NULL; -found: - BUG_ON(k.k->p.snapshot > i->snapshot); + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot), + 
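The key_visible_in_snapshot() rewrite above drops the hand-rolled index loop for darray_for_each_reverse(), scanning the sorted seen-snapshots list newest-first so the first match decides visibility. A sketch of the same scan over a plain array; the divisibility test is a toy stand-in for bch2_snapshot_is_ancestor(), which walks the real snapshot tree:

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy relation for the sketch: anc is an "ancestor" of id iff it
     * divides it. */
    static bool is_ancestor(unsigned id, unsigned anc)
    {
            return anc && id % anc == 0;
    }

    static bool key_visible(const unsigned *seen, int nr,
                            unsigned id, unsigned ancestor)
    {
            if (id == ancestor)
                    return true;
            /* seen[] is kept sorted; walk it newest-first and stop at the
             * first snapshot that shadows @id, as in the patch:
             * if (*i != ancestor && bch2_snapshot_is_ancestor(c, id, *i)) */
            for (int i = nr - 1; i >= 0; --i)
                    if (seen[i] != ancestor && is_ancestor(id, seen[i]))
                            return false;
            return true;
    }

    int main(void)
    {
            unsigned seen[] = { 2, 6 };

            printf("%d\n", key_visible(seen, 2, 12, 12)); /* 1 */
            printf("%d\n", key_visible(seen, 2, 12, 3));  /* 0: shadowed by 6 */
            return 0;
    }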
BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inum) + break; + + if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) + continue; + + if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot)) + continue; + + ret = bkey_is_inode(k.k) + ? add_inode(c, w, k) + : snapshot_list_add(c, &w->deletes, k.k->p.snapshot); + if (ret) + break; + } - if (k.k->p.snapshot != i->snapshot && !is_whiteout) { - struct inode_walker_entry new = *i; + return ret; +} - new.snapshot = k.k->p.snapshot; - new.count = 0; - new.i_size = 0; +static struct inode_walker_entry * +lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + + struct inode_walker_entry *i = darray_find_p(w->inodes, i, + bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot)); + + if (!i) + return NULL; - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); + CLASS(printbuf, buf)(); + int ret = 0; - bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" + if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot, + trans, snapshot_key_missing_inode_snapshot, + "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" "unexpected because we should always update the inode when we update a key in that inode\n" "%s", - w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf); - printbuf_exit(&buf); + w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot, + (bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) { + if (!i->whiteout) { + struct bch_inode_unpacked new = i->inode; + new.bi_snapshot = k.k->p.snapshot; + ret = __bch2_fsck_write_inode(trans, &new); + } else { + struct bkey_i whiteout; + bkey_init(&whiteout.k); + whiteout.k.type = KEY_TYPE_whiteout; + whiteout.k.p = SPOS(0, i->inode.bi_inum, k.k->p.snapshot); + ret = bch2_btree_insert_trans(trans, BTREE_ID_inodes, + &whiteout, + BTREE_ITER_cached| + BTREE_UPDATE_internal_snapshot_node); + } + + if (ret) + goto fsck_err; + + ret = bch2_trans_commit(trans, NULL, NULL, 0); + if (ret) + goto fsck_err; + + struct inode_walker_entry new_entry = *i; + + new_entry.inode.bi_snapshot = k.k->p.snapshot; + new_entry.count = 0; + new_entry.i_size = 0; - while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot) + while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot) --i; size_t pos = i - w->inodes.data; - int ret = darray_insert_item(&w->inodes, pos, new); + ret = darray_insert_item(&w->inodes, pos, new_entry); if (ret) - return ERR_PTR(ret); + goto fsck_err; - i = w->inodes.data + pos; + ret = bch_err_throw(c, transaction_restart_nested); + goto fsck_err; } return i; +fsck_err: + return ERR_PTR(ret); } static struct inode_walker_entry *walk_inode(struct btree_trans *trans, @@ -921,42 +971,7 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans, w->last_pos = k.k->p; - return lookup_inode_for_snapshot(trans->c, w, k); -} - -static int get_visible_inodes(struct btree_trans *trans, - struct inode_walker *w, - struct snapshots_seen *s, - u64 inum) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - w->inodes.nr = 0; - w->deletes.nr = 0; - - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; - - if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) - continue; - - if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot)) - 
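When lookup_inode_for_snapshot() clones an ancestor entry for a newly seen snapshot, it walks back from the end of the array so the inodes list stays sorted by snapshot id (the while (i > w->inodes.data && ...) --i; above). The same walk-back insert on a plain fixed-capacity array:

    #include <stdio.h>
    #include <string.h>

    static int insert_sorted(unsigned *v, int *nr, int cap, unsigned snap)
    {
            if (*nr == cap)
                    return -1;      /* a darray would grow here instead */

            int pos = *nr;
            while (pos > 0 && v[pos - 1] > snap)
                    --pos;

            memmove(&v[pos + 1], &v[pos], (*nr - pos) * sizeof(*v));
            v[pos] = snap;
            (*nr)++;
            return 0;
    }

    int main(void)
    {
            unsigned v[8] = { 1, 4, 9 };
            int nr = 3;

            insert_sorted(v, &nr, 8, 6);
            for (int i = 0; i < nr; i++)
                    printf("%u ", v[i]);    /* 1 4 6 9 */
            printf("\n");
            return 0;
    }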
continue; - - ret = bkey_is_inode(k.k) - ? add_inode(c, w, k) - : snapshot_list_add(c, &w->deletes, k.k->p.snapshot); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - return ret; + return lookup_inode_for_snapshot(trans, w, k); } /* @@ -974,26 +989,25 @@ int bch2_fsck_update_backpointers(struct btree_trans *trans, return 0; struct bkey_i_dirent *d = bkey_i_to_dirent(new); - struct inode_walker target = inode_walker_init(); - int ret = 0; + CLASS(inode_walker, target)(); if (d->v.d_type == DT_SUBVOL) { - BUG(); + bch_err(trans->c, "%s does not support DT_SUBVOL", __func__); + return bch_err_throw(trans->c, fsck_repair_unimplemented); } else { - ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum)); + int ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum)); if (ret) - goto err; + return ret; darray_for_each(target.inodes, i) { i->inode.bi_dir_offset = d->k.p.offset; ret = __bch2_fsck_write_inode(trans, &i->inode); if (ret) - goto err; + return ret; } + + return 0; } -err: - inode_walker_exit(&target); - return ret; } static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, @@ -1013,11 +1027,9 @@ static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) { - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); - int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set; - bch2_trans_iter_exit(trans, &iter); - return ret; + CLASS(btree_iter, iter)(trans, BTREE_ID_deleted_inodes, p, 0); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + return bkey_err(k) ?: k.k->type == KEY_TYPE_set; } static int check_inode_dirent_inode(struct btree_trans *trans, @@ -1025,7 +1037,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans, bool *write_inode) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); u32 inode_snapshot = inode->bi_snapshot; struct btree_iter dirent_iter = {}; @@ -1034,7 +1046,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans, if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if ((ret || dirent_points_to_inode_nowarn(d, inode)) && + if ((ret || dirent_points_to_inode_nowarn(c, d, inode)) && inode->bi_subvol && (inode->bi_flags & BCH_INODE_has_child_snapshot)) { /* Older version of a renamed subvolume root: we won't have a @@ -1055,7 +1067,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans, trans, inode_points_to_missing_dirent, "inode points to missing dirent\n%s", (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) || - fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode), + fsck_err_on(!ret && dirent_points_to_inode_nowarn(c, d, inode), trans, inode_points_to_wrong_dirent, "%s", (printbuf_reset(&buf), @@ -1074,38 +1086,11 @@ static int check_inode_dirent_inode(struct btree_trans *trans, out: ret = 0; fsck_err: - bch2_trans_iter_exit(trans, &dirent_iter); - printbuf_exit(&buf); + bch2_trans_iter_exit(&dirent_iter); bch_err_fn(c, ret); return ret; } -static int get_snapshot_root_inode(struct btree_trans *trans, - struct bch_inode_unpacked *root, - u64 inum) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, - SPOS(0, inum, U32_MAX), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; - if (bkey_is_inode(k.k)) - goto found_root; - } - if (ret) - goto err; - BUG(); 
-found_root: - ret = bch2_inode_unpack(k, root); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -1113,7 +1098,7 @@ static int check_inode(struct btree_trans *trans, struct snapshots_seen *s) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); struct bch_inode_unpacked u; bool do_update = false; int ret; @@ -1136,27 +1121,31 @@ static int check_inode(struct btree_trans *trans, goto err; if (snapshot_root->bi_inum != u.bi_inum) { - ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum); + ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root); if (ret) goto err; } - if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed || - INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root), - trans, inode_snapshot_mismatch, - "inode hash info in different snapshots don't match")) { - u.bi_hash_seed = snapshot_root->bi_hash_seed; - SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root)); - do_update = true; + if (u.bi_hash_seed != snapshot_root->bi_hash_seed || + INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root)) { + ret = bch2_repair_inode_hash_info(trans, snapshot_root); + BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented); + if (ret) + goto err; } - if (u.bi_dir || u.bi_dir_offset) { + ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update); + if (ret) + goto err; + + if (bch2_inode_has_backpointer(&u)) { ret = check_inode_dirent_inode(trans, &u, &do_update); if (ret) goto err; } - if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked), + if (fsck_err_on(bch2_inode_has_backpointer(&u) && + (u.bi_flags & BCH_INODE_unlinked), trans, inode_unlinked_but_has_dirent, "inode unlinked but has dirent\n%s", (printbuf_reset(&buf), @@ -1183,6 +1172,14 @@ static int check_inode(struct btree_trans *trans, ret = 0; } + if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size, + trans, inode_dir_has_nonzero_i_size, + "directory %llu:%u with nonzero i_size %lli", + u.bi_inum, u.bi_snapshot, u.bi_size)) { + u.bi_size = 0; + do_update = true; + } + ret = bch2_inode_has_child_snapshots(trans, k.k->p); if (ret < 0) goto err; @@ -1217,7 +1214,7 @@ static int check_inode(struct btree_trans *trans, */ ret = check_inode_deleted_list(trans, k.k->p); if (ret < 0) - goto err_noprint; + return ret; fsck_err_on(!ret, trans, unlinked_inode_not_on_deleted_list, @@ -1238,7 +1235,7 @@ static int check_inode(struct btree_trans *trans, u.bi_inum, u.bi_snapshot)) { ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck deleting inode"); - goto err_noprint; + return ret; } ret = 0; } @@ -1299,40 +1296,37 @@ static int check_inode(struct btree_trans *trans, ret = __bch2_fsck_write_inode(trans, &u); bch_err_msg(c, ret, "in fsck updating inode"); if (ret) - goto err_noprint; + return ret; } err: fsck_err: bch_err_fn(c, ret); -err_noprint: - printbuf_exit(&buf); return ret; } int bch2_check_inodes(struct bch_fs *c) { struct bch_inode_unpacked snapshot_root = {}; - struct snapshots_seen s; - snapshots_seen_init(&s); + CLASS(btree_trans, trans)(c); + CLASS(snapshots_seen, s)(); + + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_inodes)); - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, + return for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, 
BCH_TRANS_COMMIT_no_enospc, - check_inode(trans, &iter, k, &snapshot_root, &s))); - - snapshots_seen_exit(&s); - bch_err_fn(c, ret); - return ret; + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + progress_update_iter(trans, &progress, &iter); + check_inode(trans, &iter, k, &snapshot_root, &s); + })); } static int find_oldest_inode_needs_reattach(struct btree_trans *trans, struct bch_inode_unpacked *inode) { struct bch_fs *c = trans->c; - struct btree_iter iter; struct bkey_s_c k; int ret = 0; @@ -1364,7 +1358,6 @@ static int find_oldest_inode_needs_reattach(struct btree_trans *trans, *inode = parent_inode; } - bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1373,7 +1366,7 @@ static int check_unreachable_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = 0; if (!bkey_is_inode(k.k)) @@ -1397,7 +1390,6 @@ static int check_unreachable_inode(struct btree_trans *trans, buf.buf))) ret = reattach_inode(trans, &inode); fsck_err: - printbuf_exit(&buf); return ret; } @@ -1413,14 +1405,17 @@ static int check_unreachable_inode(struct btree_trans *trans, */ int bch2_check_unreachable_inodes(struct bch_fs *c) { - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_inodes)); + + CLASS(btree_trans, trans)(c); + return for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_unreachable_inode(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + progress_update_iter(trans, &progress, &iter); + check_unreachable_inode(trans, &iter, k); + })); } static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode) @@ -1444,48 +1439,155 @@ static int check_key_has_inode(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = PTR_ERR_OR_ZERO(i); if (ret) return ret; if (k.k->type == KEY_TYPE_whiteout) - goto out; + return 0; - if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { - ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto err; + bool have_inode = i && !i->whiteout; - inode->last_pos.inode--; - ret = -BCH_ERR_transaction_restart_nested; - goto err; + if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) + goto reconstruct; + + if (have_inode && btree_matches_i_mode(iter->btree_id, i->inode.bi_mode)) + return 0; + + prt_printf(&buf, ", "); + + bool have_old_inode = false; + darray_for_each(inode->inodes, i2) + if (!i2->whiteout && + bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i2->inode.bi_snapshot) && + btree_matches_i_mode(iter->btree_id, i2->inode.bi_mode)) { + prt_printf(&buf, "but found good inode in older snapshot\n"); + bch2_inode_unpacked_to_text(&buf, &i2->inode); + prt_newline(&buf); + have_old_inode = true; + break; + } + + struct bkey_s_c k2; + unsigned nr_keys = 0; + + prt_printf(&buf, "found keys:\n"); + + for_each_btree_key_max_norestart(trans, iter2, iter->btree_id, + SPOS(k.k->p.inode, 0, k.k->p.snapshot), + POS(k.k->p.inode, U64_MAX), + 0, k2, ret) { + if (k.k->type == KEY_TYPE_error || + k.k->type == KEY_TYPE_hash_whiteout) + continue; + + nr_keys++; + if (nr_keys 
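bch2_check_inodes() and bch2_check_unreachable_inodes() now wrap their btree walks with bch2_progress_init()/progress_update_iter(), so long fsck passes report how far along they are. A sketch of the underlying pattern, with illustrative names (the real helpers live in fs/bcachefs/progress.c and estimate totals from btree sizes):

    #include <stdio.h>

    struct progress {
            unsigned long done, total, next_print;
    };

    static void progress_update(struct progress *p)
    {
            if (++p->done >= p->next_print) {
                    fprintf(stderr, "  %lu/%lu keys\n", p->done, p->total);
                    p->next_print += p->total / 10 ?: 1;
            }
    }

    int main(void)
    {
            struct progress p = { 0, 1000, 100 };

            for (int i = 0; i < 1000; i++)
                    progress_update(&p);    /* logs every ~10% */
            return 0;
    }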
<= 10) { + bch2_bkey_val_to_text(&buf, c, k2); + prt_newline(&buf); + } + if (nr_keys >= 100) + break; } - if (fsck_err_on(!i, - trans, key_in_missing_inode, - "key in missing inode:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - goto delete; + if (ret) + goto err; - if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), - trans, key_in_wrong_inode_type, - "key for wrong inode mode %o:\n%s", - i->inode.bi_mode, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - goto delete; + unsigned reconstruct_limit = iter->btree_id == BTREE_ID_extents ? 3 : 0; + + if (nr_keys > 100) + prt_printf(&buf, "found > %u keys for this missing inode\n", nr_keys); + else if (nr_keys > reconstruct_limit) + prt_printf(&buf, "found %u keys for this missing inode\n", nr_keys); + + if (!have_inode) { + if (fsck_err_on(!have_inode, + trans, key_in_missing_inode, + "key in missing inode%s", buf.buf)) { + /* + * Maybe a deletion that raced with data move, or something + * weird like that? But if we know the inode was deleted, or + * it's just a few keys, we can safely delete them. + * + * If it's many keys, we should probably recreate the inode + */ + if (have_old_inode || nr_keys <= 2) + goto delete; + else + goto reconstruct; + } + } else { + /* + * not autofix, this one would be a giant wtf - bit error in the + * inode corrupting i_mode? + * + * may want to try repairing inode instead of deleting + */ + if (fsck_err_on(!btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), + trans, key_in_wrong_inode_type, + "key for wrong inode mode %o%s", + i->inode.bi_mode, buf.buf)) + goto delete; + } out: err: fsck_err: - printbuf_exit(&buf); bch_err_fn(c, ret); return ret; delete: + /* + * XXX: print out more info + * count up extents for this inode, check if we have different inode in + * an older snapshot version, perhaps decide if we want to reconstitute + */ ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); goto out; +reconstruct: + ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + + inode->last_pos.inode--; + ret = bch_err_throw(c, transaction_restart_nested); + goto out; +} + +static int maybe_reconstruct_inum_btree(struct btree_trans *trans, + u64 inum, u32 snapshot, + enum btree_id btree) +{ + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_max_norestart(trans, iter, btree, + SPOS(inum, 0, snapshot), + POS(inum, U64_MAX), + 0, k, ret) { + ret = 1; + break; + } + + if (ret <= 0) + return ret; + + if (fsck_err(trans, missing_inode_with_contents, + "inode %llu:%u type %s missing, but contents found: reconstruct?", + inum, snapshot, + btree == BTREE_ID_extents ? 
"reg" : "dir")) + return reconstruct_inode(trans, btree, snapshot, inum) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + bch_err_throw(trans->c, transaction_restart_commit); +fsck_err: + return ret; +} + +static int maybe_reconstruct_inum(struct btree_trans *trans, + u64 inum, u32 snapshot) +{ + return maybe_reconstruct_inum_btree(trans, inum, snapshot, BTREE_ID_extents) ?: + maybe_reconstruct_inum_btree(trans, inum, snapshot, BTREE_ID_dirents); } static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w) @@ -1498,22 +1600,28 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal if (i->inode.bi_sectors == i->count) continue; - count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot); + CLASS(printbuf, buf)(); + lockrestart_do(trans, + bch2_inum_snapshot_to_path(trans, + i->inode.bi_inum, + i->inode.bi_snapshot, NULL, &buf)); + + count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot); if (w->recalculate_sums) i->count = count2; if (i->count != count2) { - bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->snapshot, i->count, count2); + bch_err_ratelimited(c, "fsck counted i_sectors wrong: got %llu should be %llu\n%s", + i->count, count2, buf.buf); i->count = count2; } - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty) && + i->inode.bi_sectors != i->count, trans, inode_i_sectors_wrong, - "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", - w->last_pos.inode, i->snapshot, - i->inode.bi_sectors, i->count)) { + "incorrect i_sectors: got %llu, should be %llu\n%s", + i->inode.bi_sectors, i->count, buf.buf)) { i->inode.bi_sectors = i->count; ret = bch2_fsck_write_inode(trans, &i->inode); if (ret) @@ -1556,11 +1664,15 @@ static void extent_ends_exit(struct extent_ends *extent_ends) darray_exit(&extent_ends->e); } -static void extent_ends_init(struct extent_ends *extent_ends) +static struct extent_ends extent_ends_init(void) { - memset(extent_ends, 0, sizeof(*extent_ends)); + return (struct extent_ends) {}; } +DEFINE_CLASS(extent_ends, struct extent_ends, + extent_ends_exit(&_T), + extent_ends_init(), void) + static int extent_ends_at(struct bch_fs *c, struct extent_ends *extent_ends, struct snapshots_seen *seen, @@ -1576,7 +1688,7 @@ static int extent_ends_at(struct bch_fs *c, sizeof(seen->ids.data[0]) * seen->ids.size, GFP_KERNEL); if (!n.seen.ids.data) - return -BCH_ERR_ENOMEM_fsck_extent_ends_at; + return bch_err_throw(c, ENOMEM_fsck_extent_ends_at); __darray_for_each(extent_ends->e, i) { if (i->snapshot == k.k->p.snapshot) { @@ -1600,17 +1712,17 @@ static int overlapping_extents_found(struct btree_trans *trans, struct extent_end *extent_end) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct btree_iter iter1, iter2 = {}; + CLASS(printbuf, buf)(); + struct btree_iter iter2 = {}; struct bkey_s_c k1, k2; int ret; BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); - bch2_trans_iter_init(trans, &iter1, btree, pos1, - BTREE_ITER_all_snapshots| - BTREE_ITER_not_extents); - k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX)); + CLASS(btree_iter, iter1)(trans, btree, pos1, + BTREE_ITER_all_snapshots| + BTREE_ITER_not_extents); + k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX)); ret = bkey_err(k1); if (ret) goto err; @@ -1626,16 +1738,16 @@ static int 
overlapping_extents_found(struct btree_trans *trans, bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", __func__, buf.buf); - ret = -BCH_ERR_internal_fsck_err; + ret = bch_err_throw(c, internal_fsck_err); goto err; } - bch2_trans_copy_iter(trans, &iter2, &iter1); + bch2_trans_copy_iter(&iter2, &iter1); while (1) { - bch2_btree_iter_advance(trans, &iter2); + bch2_btree_iter_advance(&iter2); - k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX)); + k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX)); ret = bkey_err(k2); if (ret) goto err; @@ -1651,7 +1763,7 @@ static int overlapping_extents_found(struct btree_trans *trans, pos2.size != k2.k->size) { bch_err(c, "%s: error finding seconding overlapping extent when repairing%s", __func__, buf.buf); - ret = -BCH_ERR_internal_fsck_err; + ret = bch_err_throw(c, internal_fsck_err); goto err; } @@ -1699,14 +1811,12 @@ static int overlapping_extents_found(struct btree_trans *trans, * We overwrote the second extent - restart * check_extent() from the top: */ - ret = -BCH_ERR_transaction_restart_nested; + ret = bch_err_throw(c, transaction_restart_nested); } } fsck_err: err: - bch2_trans_iter_exit(trans, &iter2); - bch2_trans_iter_exit(trans, &iter1); - printbuf_exit(&buf); + bch2_trans_iter_exit(&iter2); return ret; } @@ -1763,16 +1873,16 @@ static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *it bkey_for_each_crc(k.k, ptrs, crc, i) if (crc_is_encoded(crc) && crc.uncompressed_size > encoded_extent_max_sectors) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_bkey_val_to_text(&buf, c, k); bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf); - printbuf_exit(&buf); } return 0; } +noinline_for_stack static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, struct inode_walker *inode, @@ -1781,7 +1891,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct disk_reservation *res) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = 0; ret = bch2_check_key_has_snapshot(trans, iter, k); @@ -1823,24 +1933,24 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); inode->inodes.data && i >= inode->inodes.data; --i) { - if (i->snapshot > k.k->p.snapshot || - !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) + if (i->inode.bi_snapshot > k.k->p.snapshot || + !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) continue; - if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && + u64 last_block = round_up(i->inode.bi_size, block_bytes(c)) >> 9; + + if (fsck_err_on(k.k->p.offset > last_block && !bkey_extent_is_reservation(k), trans, extent_past_end_of_inode, "extent type past end of inode %llu:%u, i_size %llu\n%s", - i->inode.bi_inum, i->snapshot, i->inode.bi_size, + i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - struct btree_iter iter2; - - bch2_trans_copy_iter(trans, &iter2, iter); - bch2_btree_iter_set_snapshot(trans, &iter2, i->snapshot); - ret = bch2_btree_iter_traverse(trans, &iter2) ?: - bch2_btree_delete_at(trans, &iter2, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &iter2); + ret = snapshots_seen_add_inorder(c, s, i->inode.bi_snapshot) ?: + bch2_fpunch_snapshot(trans, + 
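For reference while reading overlapping_extents_found(): two half-open extents overlap exactly when each one starts before the other ends, and the repair path above trims or deletes one of the pair. The predicate itself:

    #include <stdbool.h>
    #include <stdio.h>

    struct extent { unsigned long long start, end; }; /* [start, end) in sectors */

    static bool extents_overlap(struct extent a, struct extent b)
    {
            return a.start < b.end && b.start < a.end;
    }

    int main(void)
    {
            struct extent a = { 0, 8 }, b = { 4, 12 }, c = { 8, 16 };

            printf("%d\n", extents_overlap(a, b));  /* 1 */
            printf("%d\n", extents_overlap(a, c));  /* 0: merely adjacent */
            return 0;
    }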
SPOS(i->inode.bi_inum, + last_block, + i->inode.bi_snapshot), + POS(i->inode.bi_inum, U64_MAX)); if (ret) goto err; @@ -1850,6 +1960,10 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, } } + ret = check_extent_overbig(trans, iter, k); + if (ret) + goto err; + ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret) goto err; @@ -1858,8 +1972,9 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); inode->inodes.data && i >= inode->inodes.data; --i) { - if (i->snapshot > k.k->p.snapshot || - !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) + if (i->whiteout || + i->inode.bi_snapshot > k.k->p.snapshot || + !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) continue; i->count += k.k->size; @@ -1874,7 +1989,6 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, out: err: fsck_err: - printbuf_exit(&buf); bch_err_fn(c, ret); return ret; } @@ -1885,49 +1999,48 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, */ int bch2_check_extents(struct bch_fs *c) { - struct inode_walker w = inode_walker_init(); - struct snapshots_seen s; - struct extent_ends extent_ends; struct disk_reservation res = { 0 }; - snapshots_seen_init(&s); - extent_ends_init(&extent_ends); + CLASS(btree_trans, trans)(c); + CLASS(snapshots_seen, s)(); + CLASS(inode_walker, w)(); + CLASS(extent_ends, extent_ends)(); + + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_extents)); - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_extents, + int ret = for_each_btree_key(trans, iter, BTREE_ID_extents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ + progress_update_iter(trans, &progress, &iter); bch2_disk_reservation_put(c, &res); - check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?: - check_extent_overbig(trans, &iter, k); + check_extent(trans, &iter, k, &w, &s, &extent_ends, &res); })) ?: - check_i_sectors_notnested(trans, &w)); + check_i_sectors_notnested(trans, &w); bch2_disk_reservation_put(c, &res); - extent_ends_exit(&extent_ends); - inode_walker_exit(&w); - snapshots_seen_exit(&s); - - bch_err_fn(c, ret); return ret; } int bch2_check_indirect_extents(struct bch_fs *c) { + CLASS(btree_trans, trans)(c); struct disk_reservation res = { 0 }; - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_reflink)); + + int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, POS_MIN, BTREE_ITER_prefetch, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + progress_update_iter(trans, &progress, &iter); bch2_disk_reservation_put(c, &res); check_extent_overbig(trans, &iter, k); - }))); + })); bch2_disk_reservation_put(c, &res); - bch_err_fn(c, ret); return ret; } @@ -1941,26 +2054,34 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ if (i->inode.bi_nlink == i->count) continue; - count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot); + count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot); if (count2 < 0) return count2; if (i->count != count2) { bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->snapshot, i->count, count2); + 
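The extent-past-EOF repair above now punches from last_block to the end of the inode in one call (bch2_fpunch_snapshot()) instead of copying the iterator and deleting key by key. last_block is i_size rounded up to a filesystem block, expressed in 512-byte sectors; worked out numerically:

    #include <stdio.h>

    static unsigned long long round_up(unsigned long long x, unsigned long long m)
    {
            return (x + m - 1) / m * m;
    }

    int main(void)
    {
            unsigned long long i_size = 5000, block_bytes = 4096;

            /* mirrors: round_up(i->inode.bi_size, block_bytes(c)) >> 9 */
            unsigned long long last_block = round_up(i_size, block_bytes) >> 9;

            printf("%llu\n", last_block);   /* 16 sectors == 8192 bytes */
            return 0;
    }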
w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); i->count = count2; if (i->inode.bi_nlink == i->count) continue; } - if (fsck_err_on(i->inode.bi_nlink != i->count, - trans, inode_dir_wrong_nlink, - "directory %llu:%u with wrong i_nlink: got %u, should be %llu", - w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { - i->inode.bi_nlink = i->count; - ret = bch2_fsck_write_inode(trans, &i->inode); - if (ret) - break; + if (i->inode.bi_nlink != i->count) { + CLASS(printbuf, buf)(); + + lockrestart_do(trans, + bch2_inum_snapshot_to_path(trans, w->last_pos.inode, + i->inode.bi_snapshot, NULL, &buf)); + + if (fsck_err_on(i->inode.bi_nlink != i->count, + trans, inode_dir_wrong_nlink, + "directory with wrong i_nlink: got %u, should be %llu\n%s", + i->inode.bi_nlink, i->count, buf.buf)) { + i->inode.bi_nlink = i->count; + ret = bch2_fsck_write_inode(trans, &i->inode); + if (ret) + break; + } } } fsck_err: @@ -1978,7 +2099,6 @@ static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_wa /* find a subvolume that's a descendent of @snapshot: */ static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) { - struct btree_iter iter; struct bkey_s_c k; int ret; @@ -1988,16 +2108,13 @@ static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *su struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) { - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); *subvolid = k.k->p.offset; - goto found; + return 0; } } - if (!ret) - ret = -ENOENT; -found: - bch2_trans_iter_exit(trans, &iter); - return ret; + + return ret ?: -ENOENT; } noinline_for_stack @@ -2012,7 +2129,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * u32 parent_snapshot; u32 new_parent_subvol = 0; u64 parent_inum; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = 0; ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum); @@ -2051,24 +2168,22 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { if (!new_parent_subvol) { bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot); - return -BCH_ERR_fsck_repair_unimplemented; + return bch_err_throw(c, fsck_repair_unimplemented); } struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); ret = PTR_ERR_OR_ZERO(new_dirent); if (ret) - goto err; + return ret; new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol); } - struct bkey_s_c_subvolume s = - bch2_bkey_get_iter_typed(trans, &subvol_iter, - BTREE_ID_subvolumes, POS(0, target_subvol), - 0, subvolume); + bch2_trans_iter_init(trans, &subvol_iter, BTREE_ID_subvolumes, POS(0, target_subvol), 0); + struct bkey_s_c_subvolume s = bch2_bkey_get_typed(&subvol_iter, subvolume); ret = bkey_err(s.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; + goto err; if (ret) { if (fsck_err(trans, dirent_to_missing_subvol, @@ -2079,30 +2194,41 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * goto out; } - if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, - trans, subvol_fs_path_parent_wrong, - "subvol with wrong fs_path_parent, should be be %u\n%s", - parent_subvol, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - struct bkey_i_subvolume *n = - bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, 
subvolume); - ret = PTR_ERR_OR_ZERO(n); + if (le32_to_cpu(s.v->fs_path_parent) != parent_subvol) { + printbuf_reset(&buf); + + prt_printf(&buf, "subvol with wrong fs_path_parent, should be %u\n", + parent_subvol); + + ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset, + le64_to_cpu(s.v->inode) }, &buf); if (ret) goto err; + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, s.s_c); - n->v.fs_path_parent = cpu_to_le32(parent_subvol); + if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) { + struct bkey_i_subvolume *n = + bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + n->v.fs_path_parent = cpu_to_le32(parent_subvol); + } } u64 target_inum = le64_to_cpu(s.v->inode); u32 target_snapshot = le32_to_cpu(s.v->snapshot); - ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root); + ret = bch2_inode_find_by_inum_snapshot(trans, target_inum, target_snapshot, + &subvol_root, 0); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; if (ret) { bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); - ret = -BCH_ERR_fsck_repair_unimplemented; + ret = bch_err_throw(c, fsck_repair_unimplemented); goto err; } @@ -2124,8 +2250,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * out: err: fsck_err: - bch2_trans_iter_exit(trans, &subvol_iter); - printbuf_exit(&buf); + bch2_trans_iter_exit(&subvol_iter); return ret; } @@ -2134,59 +2259,57 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bch_hash_info *hash_info, struct inode_walker *dir, struct inode_walker *target, - struct snapshots_seen *s) + struct snapshots_seen *s, + bool *need_second_pass) { struct bch_fs *c = trans->c; struct inode_walker_entry *i; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = 0; ret = bch2_check_key_has_snapshot(trans, iter, k); - if (ret) { - ret = ret < 0 ? ret : 0; - goto out; - } + if (ret) + return ret < 0 ? ret : 0; ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); if (ret) - goto err; + return ret; if (k.k->type == KEY_TYPE_whiteout) - goto out; + return 0; if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) { ret = check_subdir_dirents_count(trans, dir); if (ret) - goto err; + return ret; } i = walk_inode(trans, dir, k); ret = PTR_ERR_OR_ZERO(i); - if (ret < 0) - goto err; + if (ret) + return ret; ret = check_key_has_inode(trans, iter, dir, i, k); if (ret) - goto err; + return ret; - if (!i) - goto out; + if (!i || i->whiteout) + return 0; if (dir->first_this_inode) *hash_info = bch2_hash_info_init(c, &i->inode); dir->first_this_inode = false; - ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k); - if (ret < 0) - goto err; - if (ret) { - /* dirent has been deleted */ - ret = 0; - goto out; - } + hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? c->cf_encoding : NULL; + ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, + iter, k, need_second_pass); + if (ret < 0) + return ret; + if (ret) + return 0; /* dirent has been deleted */ if (k.k->type != KEY_TYPE_dirent) - goto out; + return 0; struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); @@ -2197,42 +2320,51 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - struct qstr name = bch2_dirent_get_name(d); - u32 subvol = d.v->d_type == DT_SUBVOL - ?
d.v->d_parent_subvol - : 0; + subvol_inum dir_inum = { .subvol = d.v->d_type == DT_SUBVOL + ? le32_to_cpu(d.v->d_parent_subvol) + : 0, + }; u64 target = d.v->d_type == DT_SUBVOL - ? d.v->d_child_subvol - : d.v->d_inum; - u64 dir_offset; + ? le32_to_cpu(d.v->d_child_subvol) + : le64_to_cpu(d.v->d_inum); + struct qstr name = bch2_dirent_get_name(d); - ret = bch2_hash_delete_at(trans, + struct bkey_i_dirent *new_d = + bch2_dirent_create_key(trans, hash_info, dir_inum, + d.v->d_type, &name, NULL, target); + ret = PTR_ERR_OR_ZERO(new_d); + if (ret) + return ret; + + new_d->k.p.inode = d.k->p.inode; + new_d->k.p.snapshot = d.k->p.snapshot; + + struct btree_iter dup_iter = {}; + return bch2_hash_delete_at(trans, bch2_dirent_hash_desc, hash_info, iter, BTREE_UPDATE_internal_snapshot_node) ?: - bch2_dirent_create_snapshot(trans, subvol, - d.k->p.inode, d.k->p.snapshot, - hash_info, - d.v->d_type, - &name, - target, - &dir_offset, - BTREE_ITER_with_updates| - BTREE_UPDATE_internal_snapshot_node| - STR_HASH_must_create) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - - /* might need another check_dirents pass */ - goto out; + bch2_str_hash_repair_key(trans, s, + &bch2_dirent_hash_desc, hash_info, + iter, bkey_i_to_s_c(&new_d->k_i), + &dup_iter, bkey_s_c_null, + need_second_pass); } if (d.v->d_type == DT_SUBVOL) { ret = check_dirent_to_subvol(trans, iter, d); if (ret) - goto err; + return ret; } else { ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); if (ret) - goto err; + return ret; + + if (!target->inodes.nr) { + ret = maybe_reconstruct_inum(trans, le64_to_cpu(d.v->d_inum), + d.k->p.snapshot); + if (ret) + return ret; + } if (fsck_err_on(!target->inodes.nr, trans, dirent_to_missing_inode, @@ -2242,13 +2374,13 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, buf.buf))) { ret = bch2_fsck_remove_dirent(trans, d.k->p); if (ret) - goto err; + return ret; } darray_for_each(target->inodes, i) { ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true); if (ret) - goto err; + return ret; } darray_for_each(target->deletes, i) @@ -2259,37 +2391,37 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - struct btree_iter delete_iter; - bch2_trans_iter_init(trans, &delete_iter, + CLASS(btree_iter, delete_iter)(trans, BTREE_ID_dirents, SPOS(k.k->p.inode, k.k->p.offset, *i), BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(trans, &delete_iter) ?: + ret = bch2_btree_iter_traverse(&delete_iter) ?: bch2_hash_delete_at(trans, bch2_dirent_hash_desc, hash_info, &delete_iter, BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &delete_iter); if (ret) - goto err; + return ret; } } + /* + * Cannot access key values after doing a transaction commit without + * revalidating: + */ + bool have_dir = d.v->d_type == DT_DIR; + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret) - goto err; + return ret; for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) { - if (d.v->d_type == DT_DIR) + if (have_dir) i->count++; i->i_size += bkey_bytes(d.k); } -out: -err: fsck_err: - printbuf_exit(&buf); - bch_err_fn(c, ret); return ret; } @@ -2299,24 +2431,38 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, */ int bch2_check_dirents(struct bch_fs *c) { - struct inode_walker dir = inode_walker_init(); - struct inode_walker target = inode_walker_init(); - struct snapshots_seen s; struct 
bch_hash_info hash_info; + CLASS(btree_trans, trans)(c); + CLASS(snapshots_seen, s)(); + CLASS(inode_walker, dir)(); + CLASS(inode_walker, target)(); + struct progress_indicator_state progress; + bool need_second_pass = false, did_second_pass = false; + int ret; +again: + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_dirents)); - snapshots_seen_init(&s); - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_dirents, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?: - check_subdir_count_notnested(trans, &dir)); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + progress_update_iter(trans, &progress, &iter); + check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s, + &need_second_pass); + })) ?: + check_subdir_count_notnested(trans, &dir); + + if (!ret && need_second_pass && !did_second_pass) { + bch_info(c, "check_dirents requires second pass"); + swap(did_second_pass, need_second_pass); + goto again; + } + + if (!ret && need_second_pass) { + bch_err(c, "dirents not repairing"); + ret = -EINVAL; + } - snapshots_seen_exit(&s); - inode_walker_exit(&dir); - inode_walker_exit(&target); - bch_err_fn(c, ret); return ret; } @@ -2326,16 +2472,14 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, struct inode_walker *inode) { struct bch_fs *c = trans->c; - struct inode_walker_entry *i; - int ret; - ret = bch2_check_key_has_snapshot(trans, iter, k); + int ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret < 0) return ret; if (ret) return 0; - i = walk_inode(trans, inode, k); + struct inode_walker_entry *i = walk_inode(trans, inode, k); ret = PTR_ERR_OR_ZERO(i); if (ret) return ret; @@ -2344,16 +2488,16 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, if (ret) return ret; - if (!i) + if (!i || i->whiteout) return 0; if (inode->first_this_inode) *hash_info = bch2_hash_info_init(c, &i->inode); inode->first_this_inode = false; - ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k); - bch_err_fn(c, ret); - return ret; + bool need_second_pass = false; + return bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, + iter, k, &need_second_pass); } /* @@ -2361,21 +2505,22 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, */ int bch2_check_xattrs(struct bch_fs *c) { - struct inode_walker inode = inode_walker_init(); struct bch_hash_info hash_info; - int ret = 0; + CLASS(btree_trans, trans)(c); + CLASS(inode_walker, inode)(); - ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_xattrs)); + + int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - check_xattr(trans, &iter, k, &hash_info, &inode))); - - inode_walker_exit(&inode); - bch_err_fn(c, ret); + BCH_TRANS_COMMIT_no_enospc, ({ + progress_update_iter(trans, &progress, &iter); + check_xattr(trans, &iter, k, &hash_info, &inode); + })); return ret; } @@ -2413,7 +2558,8 @@ static int check_root_trans(struct btree_trans *trans) goto err; } - ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode); + ret = bch2_inode_find_by_inum_snapshot(trans, BCACHEFS_ROOT_INO, snapshot, + 
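bch2_check_dirents() now runs as up to two passes: hash-table repairs made through bch2_str_hash_repair_key() can set need_second_pass, the whole btree walk reruns once, and a request that persists after the second pass is reported as a hard failure. The control flow, reduced to a skeleton:

    #include <stdio.h>

    static int run_pass(int *need_second_pass)
    {
            static int pending = 1;         /* pretend one repair queues rework */

            *need_second_pass = pending;
            pending = 0;
            return 0;
    }

    int main(void)
    {
            int need_second_pass = 0, did_second_pass = 0, ret;
    again:
            ret = run_pass(&need_second_pass);
            if (!ret && need_second_pass && !did_second_pass) {
                    did_second_pass = 1;    /* the patch does swap(...) */
                    need_second_pass = 0;
                    goto again;
            }
            if (!ret && need_second_pass)
                    ret = -22;              /* -EINVAL: "dirents not repairing" */

            printf("ret = %d\n", ret);      /* 0 */
            return 0;
    }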
&root_inode, 0); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; @@ -2439,37 +2585,32 @@ static int check_root_trans(struct btree_trans *trans) /* Get root directory, create if it doesn't exist: */ int bch2_check_root(struct bch_fs *c) { - int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_root_trans(trans)); - bch_err_fn(c, ret); - return ret; -} - -typedef DARRAY(u32) darray_u32; - -static bool darray_u32_has(darray_u32 *d, u32 v) -{ - darray_for_each(*d, i) - if (*i == v) - return true; - return false; + CLASS(btree_trans, trans)(c); + return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_root_trans(trans)); } static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct btree_iter parent_iter = {}; - darray_u32 subvol_path = {}; - struct printbuf buf = PRINTBUF; + CLASS(darray_u32, subvol_path)(); + CLASS(printbuf, buf)(); int ret = 0; if (k.k->type != KEY_TYPE_subvolume) return 0; + CLASS(btree_iter, parent_iter)(trans, BTREE_ID_subvolumes, POS_MIN, 0); + + subvol_inum start = { + .subvol = k.k->p.offset, + .inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode), + }; + while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) { ret = darray_push(&subvol_path, k.k->p.offset); if (ret) - goto err; + return ret; struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); @@ -2482,87 +2623,85 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, u32 parent = le32_to_cpu(s.v->fs_path_parent); - if (darray_u32_has(&subvol_path, parent)) { - if (fsck_err(trans, subvol_loop, "subvolume loop")) + if (darray_find(subvol_path, parent)) { + printbuf_reset(&buf); + prt_printf(&buf, "subvolume loop: "); + + ret = bch2_inum_to_path(trans, start, &buf); + if (ret) + return ret; + + if (fsck_err(trans, subvol_loop, "%s", buf.buf)) ret = reattach_subvol(trans, s); break; } - bch2_trans_iter_exit(trans, &parent_iter); - bch2_trans_iter_init(trans, &parent_iter, - BTREE_ID_subvolumes, POS(0, parent), 0); - k = bch2_btree_iter_peek_slot(trans, &parent_iter); + bch2_btree_iter_set_pos(&parent_iter, POS(0, parent)); + k = bch2_btree_iter_peek_slot(&parent_iter); ret = bkey_err(k); if (ret) - goto err; + return ret; if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, trans, subvol_unreachable, "unreachable subvolume %s", - (bch2_bkey_val_to_text(&buf, c, s.s_c), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = reattach_subvol(trans, s); - break; + return reattach_subvol(trans, s); } } fsck_err: -err: - printbuf_exit(&buf); - darray_exit(&subvol_path); - bch2_trans_iter_exit(trans, &parent_iter); return ret; } int bch2_check_subvolume_structure(struct bch_fs *c) { - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_subvol_path(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; -} + CLASS(btree_trans, trans)(c); -struct pathbuf_entry { - u64 inum; - u32 snapshot; -}; + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_subvolumes)); -typedef DARRAY(struct pathbuf_entry) pathbuf; + return for_each_btree_key_commit(trans, iter, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + progress_update_iter(trans, &progress, &iter); + check_subvol_path(trans, &iter, k); + })); +} -static int 
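check_subvol_path() drops the bespoke darray_u32_has() helper in favour of the generic darray_find(), which returns a pointer to the matching element or NULL, exactly what a loop-detection membership test needs. The equivalent scan written out for a plain u32 array:

    #include <stddef.h>
    #include <stdio.h>

    static unsigned *find_u32(unsigned *v, int nr, unsigned needle)
    {
            for (int i = 0; i < nr; i++)
                    if (v[i] == needle)
                            return &v[i];
            return NULL;
    }

    int main(void)
    {
            unsigned path[] = { 3, 7, 12 };     /* subvolumes seen so far */

            printf("%d\n", find_u32(path, 3, 7) != NULL);   /* 1: loop */
            printf("%d\n", find_u32(path, 3, 99) != NULL);  /* 0: keep walking */
            return 0;
    }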
bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p, +static int bch2_bi_depth_renumber_one(struct btree_trans *trans, + u64 inum, u32 snapshot, u32 new_depth) { - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, p->inum, p->snapshot), 0); + CLASS(btree_iter, iter)(trans, BTREE_ID_inodes, SPOS(0, inum, snapshot), 0); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); struct bch_inode_unpacked inode; int ret = bkey_err(k) ?: !bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode : bch2_inode_unpack(k, &inode); if (ret) - goto err; + return ret; if (inode.bi_depth != new_depth) { inode.bi_depth = new_depth; - ret = __bch2_fsck_write_inode(trans, &inode) ?: - bch2_trans_commit(trans, NULL, NULL, 0); + return __bch2_fsck_write_inode(trans, &inode) ?: + bch2_trans_commit(trans, NULL, NULL, 0); } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + + return 0; } -static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth) +static int bch2_bi_depth_renumber(struct btree_trans *trans, darray_u64 *path, + u32 snapshot, u32 new_bi_depth) { u32 restart_count = trans->restart_count; int ret = 0; darray_for_each_reverse(*path, i) { ret = nested_lockrestart_do(trans, - bch2_bi_depth_renumber_one(trans, i, new_bi_depth)); + bch2_bi_depth_renumber_one(trans, *i, snapshot, new_bi_depth)); bch_err_fn(trans->c, ret); if (ret) break; @@ -2573,43 +2712,43 @@ static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 return ret ?: trans_was_restarted(trans, restart_count); } -static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) -{ - darray_for_each(*p, i) - if (i->inum == inum && - i->snapshot == snapshot) - return true; - return false; -} - static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) { struct bch_fs *c = trans->c; - struct btree_iter inode_iter = {}; - pathbuf path = {}; - struct printbuf buf = PRINTBUF; + CLASS(darray_u64, path)(); + CLASS(printbuf, buf)(); u32 snapshot = inode_k.k->p.snapshot; bool redo_bi_depth = false; u32 min_bi_depth = U32_MAX; int ret = 0; + struct bpos start = inode_k.k->p; + struct bch_inode_unpacked inode; ret = bch2_inode_unpack(inode_k, &inode); if (ret) return ret; - while (!inode.bi_subvol) { + CLASS(btree_iter, inode_iter)(trans, BTREE_ID_inodes, POS_MIN, 0); + + /* + * If we're running full fsck, check_dirents() will have already run, + * and we shouldn't see any missing backpointers here - otherwise that's + * handled separately, by check_unreachable_inodes + */ + while (!inode.bi_subvol && + bch2_inode_has_backpointer(&inode)) { struct btree_iter dirent_iter; struct bkey_s_c_dirent d; - u32 parent_snapshot = snapshot; - d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); + d = dirent_get_by_pos(trans, &dirent_iter, + SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot)); ret = bkey_err(d.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) goto out; if (!ret && (ret = dirent_points_to_inode(c, d, &inode))) - bch2_trans_iter_exit(trans, &dirent_iter); + bch2_trans_iter_exit(&dirent_iter); if (bch2_err_matches(ret, ENOENT)) { printbuf_reset(&buf); @@ -2619,20 +2758,14 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) goto out; } - bch2_trans_iter_exit(trans, &dirent_iter); + bch2_trans_iter_exit(&dirent_iter); - ret = darray_push(&path, ((struct pathbuf_entry) { - .inum = inode.bi_inum, - .snapshot = snapshot, - })); + ret = darray_push(&path, inode.bi_inum);
if (ret) return ret; - snapshot = parent_snapshot; - - bch2_trans_iter_exit(trans, &inode_iter); - inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, - SPOS(0, inode.bi_dir, snapshot), 0); + bch2_btree_iter_set_pos(&inode_iter, SPOS(0, inode.bi_dir, snapshot)); + inode_k = bch2_btree_iter_peek_slot(&inode_iter); struct bch_inode_unpacked parent_inode; ret = bkey_err(inode_k) ?: @@ -2651,22 +2784,28 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) break; inode = parent_inode; - snapshot = inode_k.k->p.snapshot; redo_bi_depth = true; - if (path_is_dup(&path, inode.bi_inum, snapshot)) { - /* XXX print path */ - bch_err(c, "directory structure loop"); + if (darray_find(path, inode.bi_inum)) { + printbuf_reset(&buf); + prt_printf(&buf, "directory structure loop in snapshot %u: ", + snapshot); + + ret = bch2_inum_snapshot_to_path(trans, start.offset, start.snapshot, NULL, &buf); + if (ret) + goto out; - darray_for_each(path, i) - pr_err("%llu:%u", i->inum, i->snapshot); - pr_err("%llu:%u", inode.bi_inum, snapshot); + if (c->opts.verbose) { + prt_newline(&buf); + darray_for_each(path, i) + prt_printf(&buf, "%llu ", *i); + } - if (fsck_err(trans, dir_loop, "directory structure loop")) { + if (fsck_err(trans, dir_loop, "%s", buf.buf)) { ret = remove_backpointer(trans, &inode); bch_err_msg(c, ret, "removing dirent"); if (ret) - break; + goto out; ret = reattach_inode(trans, &inode); bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); @@ -2680,12 +2819,9 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) min_bi_depth = 0; if (redo_bi_depth) - ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth); + ret = bch2_bi_depth_renumber(trans, &path, snapshot, min_bi_depth); out: fsck_err: - bch2_trans_iter_exit(trans, &inode_iter); - darray_exit(&path); - printbuf_exit(&buf); bch_err_fn(c, ret); return ret; } @@ -2696,8 +2832,8 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) */ int bch2_check_directory_structure(struct bch_fs *c) { - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, + CLASS(btree_trans, trans)(c); + return for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_intent| BTREE_ITER_prefetch| BTREE_ITER_all_snapshots, k, @@ -2709,10 +2845,7 @@ int bch2_check_directory_structure(struct bch_fs *c) continue; check_path_loop(trans, k); - }))); - - bch_err_fn(c, ret); - return ret; + })); } struct nlink_table { @@ -2736,7 +2869,7 @@ static int add_nlink(struct bch_fs *c, struct nlink_table *t, if (!d) { bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", new_size); - return -BCH_ERR_ENOMEM_fsck_add_nlink; + return bch_err_throw(c, ENOMEM_fsck_add_nlink); } if (t->d) @@ -2796,8 +2929,8 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, struct nlink_table *t, u64 start, u64 *end) { - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_inodes, + CLASS(btree_trans, trans)(c); + int ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, start), BTREE_ITER_intent| BTREE_ITER_prefetch| @@ -2832,7 +2965,7 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, break; } 0; - }))); + })); bch_err_fn(c, ret); return ret; @@ -2842,12 +2975,10 @@ noinline_for_stack static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, u64 range_start, u64 range_end) { - struct snapshots_seen s; - - snapshots_seen_init(&s); + CLASS(btree_trans, 
trans)(c); + CLASS(snapshots_seen, s)(); - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, + int ret = for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, BTREE_ITER_intent| BTREE_ITER_prefetch| BTREE_ITER_all_snapshots, k, ({ @@ -2864,9 +2995,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links le64_to_cpu(d.v->d_inum), d.k->p.snapshot); } 0; - }))); - - snapshots_seen_exit(&s); + })); bch_err_fn(c, ret); return ret; @@ -2920,14 +3049,14 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, struct nlink_table *links, u64 range_start, u64 range_end) { + CLASS(btree_trans, trans)(c); size_t idx = 0; - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, + int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS(0, range_start), BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); + check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)); if (ret < 0) { bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret)); return ret; @@ -2966,7 +3095,6 @@ int bch2_check_nlinks(struct bch_fs *c) } while (next_iter_range_start != U64_MAX); kvfree(links.d); - bch_err_fn(c, ret); return ret; } @@ -3001,15 +3129,13 @@ int bch2_fix_reflink_p(struct bch_fs *c) if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) return 0; - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, + CLASS(btree_trans, trans)(c); + return for_each_btree_key_commit(trans, iter, BTREE_ID_extents, POS_MIN, BTREE_ITER_intent|BTREE_ITER_prefetch| BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - fix_reflink_p_key(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; + fix_reflink_p_key(trans, &iter, k)); } #ifndef NO_BCACHEFS_CHARDEV @@ -3035,6 +3161,8 @@ static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) if (ret) return ret; + thr->c->recovery_task = current; + ret = bch2_fs_start(thr->c); if (ret) goto err; @@ -3061,7 +3189,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) { struct bch_ioctl_fsck_offline arg; struct fsck_thread *thr = NULL; - darray_str(devs) = {}; + darray_const_str devs = {}; long ret = 0; if (copy_from_user(&arg, user_arg, sizeof(arg))) @@ -3119,7 +3247,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); - thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); + thr->c = bch2_fs_open(&devs, &thr->opts); if (!IS_ERR(thr->c) && thr->c->opts.errors == BCH_ON_ERROR_panic) @@ -3156,19 +3284,18 @@ static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) c->opts.fix_errors = FSCK_FIX_ask; c->opts.fsck = true; - set_bit(BCH_FS_fsck_running, &c->flags); + set_bit(BCH_FS_in_fsck, &c->flags); - c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; - int ret = bch2_run_online_recovery_passes(c); + int ret = bch2_run_online_recovery_passes(c, ~0ULL); - clear_bit(BCH_FS_fsck_running, &c->flags); + clear_bit(BCH_FS_in_fsck, &c->flags); bch_err_fn(c, ret); c->stdio = NULL; c->stdio_filter = NULL; c->opts.fix_errors = old_fix_errors; - up(&c->online_fsck_mutex); + up(&c->recovery.run_lock); bch2_ro_ref_put(c); return ret; } @@ -3192,7 +3319,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) if 
(!bch2_ro_ref_tryget(c)) return -EROFS; - if (down_trylock(&c->online_fsck_mutex)) { + if (down_trylock(&c->recovery.run_lock)) { bch2_ro_ref_put(c); return -EAGAIN; } @@ -3224,7 +3351,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) bch_err_fn(c, ret); if (thr) bch2_fsck_thread_exit(&thr->thr); - up(&c->online_fsck_mutex); + up(&c->recovery.run_lock); bch2_ro_ref_put(c); } return ret; } diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index 574948278cd4..e5fe7cf7b251 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -4,6 +4,12 @@ #include "str_hash.h" +/* records snapshot IDs of overwrites at @pos */ +struct snapshots_seen { + struct bpos pos; + snapshot_id_list ids; +}; + int bch2_fsck_update_backpointers(struct btree_trans *, struct snapshots_seen *, const struct bch_hash_desc, diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 490b85841de9..d5e5190f0663 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -38,6 +38,7 @@ static const char * const bch2_inode_flag_strs[] = { #undef x static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos); +static int may_delete_deleted_inum(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *); static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; @@ -241,6 +242,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k, u64 v[2]; unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_snapshot = inode.k->p.snapshot; unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); unpacked->bi_hash_seed = inode.v->bi_hash_seed; unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); @@ -285,13 +287,12 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, { memset(unpacked, 0, sizeof(*unpacked)); - unpacked->bi_snapshot = k.k->p.snapshot; - switch (k.k->type) { case KEY_TYPE_inode: { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_snapshot = inode.k->p.snapshot; unpacked->bi_journal_seq= 0; unpacked->bi_hash_seed = inode.v->bi_hash_seed; unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); @@ -310,6 +311,7 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_snapshot = inode.k->p.snapshot; unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); unpacked->bi_hash_seed = inode.v->bi_hash_seed; unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); @@ -327,8 +329,6 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, int bch2_inode_unpack(struct bkey_s_c k, struct bch_inode_unpacked *unpacked) { - unpacked->bi_snapshot = k.k->p.snapshot; - return likely(k.k->type == KEY_TYPE_inode_v3) ? bch2_inode_unpack_v3(k, unpacked) : bch2_inode_unpack_slowpath(k, unpacked); @@ -345,12 +345,12 @@ int __bch2_inode_peek(struct btree_trans *trans, if (ret) return ret; - struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes, - SPOS(0, inum.inum, snapshot), - flags|BTREE_ITER_cached); + bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, SPOS(0, inum.inum, snapshot), + flags|BTREE_ITER_cached); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) - return ret; + goto err; ret = bkey_is_inode(k.k) ?
0 : -BCH_ERR_ENOENT_inode; if (ret) @@ -364,7 +364,75 @@ int __bch2_inode_peek(struct btree_trans *trans, err: if (warn) bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum); - bch2_trans_iter_exit(trans, iter); + bch2_trans_iter_exit(iter); + return ret; +} + +int bch2_inode_find_by_inum_snapshot(struct btree_trans *trans, + u64 inode_nr, u32 snapshot, + struct bch_inode_unpacked *inode, + unsigned flags) +{ + CLASS(btree_iter, iter)(trans, BTREE_ID_inodes, SPOS(0, inode_nr, snapshot), flags); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (ret) + return ret; + + return bkey_is_inode(k.k) + ? bch2_inode_unpack(k, inode) + : -BCH_ERR_ENOENT_inode; +} + +int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + int ret; + + ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0); + if (!ret) + bch2_trans_iter_exit(&iter); + return ret; +} + +int bch2_inode_find_by_inum_trans(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + int ret; + + ret = bch2_inode_peek(trans, &iter, inode, inum, 0); + if (!ret) + bch2_trans_iter_exit(&iter); + return ret; +} + +int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + CLASS(btree_trans, trans)(c); + return lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, inode)); +} + +int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, + struct bch_inode_unpacked *root) +{ + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, + SPOS(0, inum, U32_MAX), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inum) + break; + if (bkey_is_inode(k.k)) + return bch2_inode_unpack(k, root); + } + /* We're only called when we know we have an inode for @inum */ + BUG_ON(!ret); return ret; } @@ -395,9 +463,10 @@ int __bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked bch2_inode_pack(inode_p, inode); inode_p->inode.k.p.snapshot = inode->bi_snapshot; - return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, - &inode_p->inode.k_i, - BTREE_UPDATE_internal_snapshot_node); + return bch2_btree_insert_trans(trans, BTREE_ID_inodes, + &inode_p->inode.k_i, + BTREE_ITER_cached| + BTREE_UPDATE_internal_snapshot_node); } int bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) @@ -619,14 +688,15 @@ bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter struct bkey_s_c k; int ret = 0; - for_each_btree_key_max_norestart(trans, *iter, btree, - bpos_successor(pos), - SPOS(pos.inode, pos.offset, U32_MAX), - flags|BTREE_ITER_all_snapshots, k, ret) + bch2_trans_iter_init(trans, iter, btree, bpos_successor(pos), + flags|BTREE_ITER_all_snapshots); + + for_each_btree_key_max_continue_norestart(*iter, SPOS(pos.inode, pos.offset, U32_MAX), + flags|BTREE_ITER_all_snapshots, k, ret) if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot)) return k; - bch2_trans_iter_exit(trans, iter); + bch2_trans_iter_exit(iter); return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; } @@ -642,7 +712,7 @@ bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter bkey_is_inode(k.k)) return k; - bch2_trans_iter_exit(trans, iter); + bch2_trans_iter_exit(iter); pos = k.k->p; goto again; } @@ -650,7 +720,6 @@ bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) { struct bch_fs *c = trans->c; - struct btree_iter iter; struct bkey_s_c k; int ret = 0; @@ -663,7 +732,6 @@ int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) ret = 1; break; } - bch2_trans_iter_exit(trans, &iter); return ret; } @@ -715,7 +783,7 @@ static int update_parent_inode_has_children(struct btree_trans *trans, struct bp bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot); } err: - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } @@ -833,7 +901,8 @@ void bch2_inode_init_early(struct bch_fs *c, get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); } -void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, +void bch2_inode_init_late(struct bch_fs *c, + struct bch_inode_unpacked *inode_u, u64 now, uid_t uid, gid_t gid, umode_t mode, dev_t rdev, struct bch_inode_unpacked *parent) { @@ -857,6 +926,12 @@ void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, BCH_INODE_OPTS() #undef x } + + if (!S_ISDIR(mode)) + inode_u->bi_casefold = 0; + + if (bch2_inode_casefold(c, inode_u)) + inode_u->bi_flags |= BCH_INODE_has_case_insensitive; } void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, @@ -864,7 +939,7 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, struct bch_inode_unpacked *parent) { bch2_inode_init_early(c, inode_u); - bch2_inode_init_late(inode_u, bch2_current_time(c), + bch2_inode_init_late(c, inode_u, bch2_current_time(c), uid, gid, mode, rdev, parent); } @@ -877,11 +952,10 @@ bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *m cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits); - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, - BTREE_ID_logged_ops, - POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx), - BTREE_ITER_cached); + CLASS(btree_iter, iter)(trans, BTREE_ID_logged_ops, + POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx), + BTREE_ITER_cached); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); int ret = bkey_err(k); if (ret) return ERR_PTR(ret); @@ -890,9 +964,8 @@ bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *m k.k->type == KEY_TYPE_inode_alloc_cursor ? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor) : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor); - ret = PTR_ERR_OR_ZERO(cursor); - if (ret) - goto err; + if (IS_ERR(cursor)) + return cursor; if (c->opts.inodes_32bit) { *min = BLOCKDEV_INODE_MAX; @@ -913,9 +986,8 @@ bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *m cursor->v.idx = cpu_to_le64(*min); le32_add_cpu(&cursor->v.gen, 1); } -err: - bch2_trans_iter_exit(trans, &iter); - return ret ? 
ERR_PTR(ret) : cursor; + + return cursor; } /* @@ -935,53 +1007,60 @@ int bch2_inode_create(struct btree_trans *trans, u64 start = le64_to_cpu(cursor->v.idx); u64 pos = start; + u64 gen = 0; bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), BTREE_ITER_all_snapshots| BTREE_ITER_intent); struct bkey_s_c k; again: - while ((k = bch2_btree_iter_peek(trans, iter)).k && + while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && bkey_lt(k.k->p, POS(0, max))) { if (pos < iter->pos.offset) goto found_slot; + if (bch2_snapshot_is_ancestor(trans->c, snapshot, k.k->p.snapshot) && + k.k->type == KEY_TYPE_inode_generation) { + gen = le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); + goto found_slot; + } + /* * We don't need to iterate over keys in every snapshot once * we've found just one: */ pos = iter->pos.offset + 1; - bch2_btree_iter_set_pos(trans, iter, POS(0, pos)); + bch2_btree_iter_set_pos(iter, POS(0, pos)); } if (!ret && pos < max) goto found_slot; if (!ret && start == min) - ret = -BCH_ERR_ENOSPC_inode_create; + ret = bch_err_throw(trans->c, ENOSPC_inode_create); if (ret) { - bch2_trans_iter_exit(trans, iter); + bch2_trans_iter_exit(iter); return ret; } /* Retry from start */ pos = start = min; - bch2_btree_iter_set_pos(trans, iter, POS(0, pos)); + bch2_btree_iter_set_pos(iter, POS(0, pos)); le32_add_cpu(&cursor->v.gen, 1); goto again; found_slot: - bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, snapshot)); - k = bch2_btree_iter_peek_slot(trans, iter); + bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); + k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) { - bch2_trans_iter_exit(trans, iter); + bch2_trans_iter_exit(iter); return ret; } inode_u->bi_inum = k.k->p.offset; - inode_u->bi_generation = le64_to_cpu(cursor->v.gen); + inode_u->bi_generation = max(gen, le64_to_cpu(cursor->v.gen)); cursor->v.idx = cpu_to_le64(k.k->p.offset + 1); return 0; } @@ -989,7 +1068,6 @@ int bch2_inode_create(struct btree_trans *trans, static int bch2_inode_delete_keys(struct btree_trans *trans, subvol_inum inum, enum btree_id id) { - struct btree_iter iter; struct bkey_s_c k; struct bkey_i delete; struct bpos end = POS(inum.inum, U64_MAX); @@ -1000,8 +1078,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, * We're never going to be deleting partial extents, no need to use an * extent iterator: */ - bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), - BTREE_ITER_intent); + CLASS(btree_iter, iter)(trans, id, POS(inum.inum, 0), BTREE_ITER_intent); while (1) { bch2_trans_begin(trans); @@ -1010,9 +1087,9 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, if (ret) goto err; - bch2_btree_iter_set_snapshot(trans, &iter, snapshot); + bch2_btree_iter_set_snapshot(&iter, snapshot); - k = bch2_btree_iter_peek_max(trans, &iter, end); + k = bch2_btree_iter_peek_max(&iter, end); ret = bkey_err(k); if (ret) goto err; @@ -1036,31 +1113,36 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, break; } - bch2_trans_iter_exit(trans, &iter); return ret; } int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) { - struct btree_trans *trans = bch2_trans_get(c); + CLASS(btree_trans, trans)(c); struct btree_iter iter = {}; struct bkey_s_c k; + struct bch_inode_unpacked inode; u32 snapshot; int ret; + ret = lockrestart_do(trans, may_delete_deleted_inum(trans, inum, &inode)); + if (ret) + return ret; + /* * If this was a directory, there shouldn't be any real dirents left - * but there could be whiteouts (from hash collisions) that 
we should * delete: * - * XXX: the dirent could ideally would delete whiteouts when they're no + * XXX: the dirent code ideally would delete whiteouts when they're no * longer needed */ - ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?: - bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?: - bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents); + ret = (!S_ISDIR(inode.bi_mode) + ? bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) + : bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents)) ?: + bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs); if (ret) - goto err; + return ret; retry: bch2_trans_begin(trans); @@ -1079,7 +1161,7 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum.inum, snapshot); - ret = -BCH_ERR_ENOENT_inode; + ret = bch_err_throw(c, ENOENT_inode); goto err; } @@ -1087,49 +1169,14 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); err: - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (ret) - goto err2; - - ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot)); -err2: - bch2_trans_put(trans); - return ret; -} - -int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - int ret; - - ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0); - if (!ret) - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_find_by_inum_trans(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - int ret; - - ret = bch2_inode_peek(trans, &iter, inode, inum, 0); - if (!ret) - bch2_trans_iter_exit(trans, &iter); - return ret; -} + return ret; -int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode)); + return delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot)); } int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) @@ -1210,11 +1257,15 @@ int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum, { struct bch_fs *c = trans->c; -#ifdef CONFIG_UNICODE - int ret = 0; + int ret = bch2_fs_casefold_enabled(c); + if (ret) { + bch_err_ratelimited(c, "Cannot enable casefolding: %s", bch2_err_str(ret)); + return ret; + } + /* Not supported on individual files. 
*/ if (!S_ISDIR(bi->bi_mode)) - return -EOPNOTSUPP; + return bch_err_throw(c, casefold_opt_is_dir_only); /* * Make sure the dir is empty, as otherwise we'd need to @@ -1233,20 +1284,13 @@ int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum, bi->bi_casefold = v + 1; bi->bi_fields_set |= BIT(Inode_opt_casefold); - return 0; -#else - bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE"); - return -EOPNOTSUPP; -#endif + return bch2_maybe_propagate_has_case_insensitive(trans, inum, bi); } static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) { struct bch_fs *c = trans->c; - struct btree_iter iter = {}; - struct bkey_i_inode_generation delete; - struct bch_inode_unpacked inode_u; - struct bkey_s_c k; + struct btree_iter iter = { NULL }; int ret; do { @@ -1262,14 +1306,14 @@ static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum SPOS(inum, 0, snapshot), SPOS(inum, U64_MAX, snapshot), 0, NULL); - } while (ret == -BCH_ERR_transaction_restart_nested); + } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); if (ret) goto err; retry: bch2_trans_begin(trans); - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inum, snapshot), BTREE_ITER_intent); + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, inum, snapshot), BTREE_ITER_intent); ret = bkey_err(k); if (ret) goto err; @@ -1278,16 +1322,18 @@ static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum, snapshot); - ret = -BCH_ERR_ENOENT_inode; + ret = bch_err_throw(c, ENOENT_inode); goto err; } + struct bch_inode_unpacked inode_u; bch2_inode_unpack(k, &inode_u); /* Subvolume root? 
*/ if (inode_u.bi_subvol) bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); + struct bkey_i_inode_generation delete; bkey_inode_generation_init(&delete.k_i); delete.k.p = iter.pos; delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); @@ -1296,11 +1342,11 @@ static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); err: - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - return ret ?: -BCH_ERR_transaction_restart_nested; + return ret ?: bch_err_throw(c, transaction_restart_nested); } /* @@ -1321,7 +1367,7 @@ static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpo bool unlinked = bkey_is_unlinked_inode(k); pos = k.k->p; - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); if (!unlinked) return 0; @@ -1342,122 +1388,133 @@ int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot)); } -static int may_delete_deleted_inode(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos pos, - bool *need_another_pass) +static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, + struct bch_inode_unpacked *inode, + bool from_deleted_inodes) { struct bch_fs *c = trans->c; - struct btree_iter inode_iter; - struct bkey_s_c k; - struct bch_inode_unpacked inode; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret; - k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached); + CLASS(btree_iter, inode_iter)(trans, BTREE_ID_inodes, pos, BTREE_ITER_cached); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&inode_iter); ret = bkey_err(k); if (ret) return ret; - ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; - if (fsck_err_on(!bkey_is_inode(k.k), + ret = bkey_is_inode(k.k) ? 0 : bch_err_throw(c, ENOENT_inode); + if (fsck_err_on(from_deleted_inodes && ret, trans, deleted_inode_missing, "nonexistent inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; + if (ret) + return ret; - ret = bch2_inode_unpack(k, &inode); + ret = bch2_inode_unpack(k, inode); if (ret) - goto out; + return ret; - if (S_ISDIR(inode.bi_mode)) { + if (S_ISDIR(inode->bi_mode)) { ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot); - if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY), + if (fsck_err_on(from_deleted_inodes && + bch2_err_matches(ret, ENOTEMPTY), trans, deleted_inode_is_dir, "non empty directory %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; if (ret) - goto out; + return ret; } - if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), + ret = inode->bi_flags & BCH_INODE_unlinked ? 0 : bch_err_throw(c, inode_not_unlinked); + if (fsck_err_on(from_deleted_inodes && ret, trans, deleted_inode_not_unlinked, "non-deleted inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; + if (ret) + return ret; - if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot, + ret = !(inode->bi_flags & BCH_INODE_has_child_snapshot) + ? 
0 : bch_err_throw(c, inode_has_child_snapshot); + + if (fsck_err_on(from_deleted_inodes && ret, trans, deleted_inode_has_child_snapshots, "inode with child snapshots %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; + if (ret) + return ret; ret = bch2_inode_has_child_snapshots(trans, k.k->p); if (ret < 0) - goto out; + return ret; if (ret) { if (fsck_err(trans, inode_has_child_snapshots_wrong, "inode has_child_snapshots flag wrong (should be set)\n%s", (printbuf_reset(&buf), - bch2_inode_unpacked_to_text(&buf, &inode), + bch2_inode_unpacked_to_text(&buf, inode), buf.buf))) { - inode.bi_flags |= BCH_INODE_has_child_snapshot; - ret = __bch2_fsck_write_inode(trans, &inode); + inode->bi_flags |= BCH_INODE_has_child_snapshot; + ret = __bch2_fsck_write_inode(trans, inode); if (ret) - goto out; + return ret; } + + if (!from_deleted_inodes) { + return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + bch_err_throw(c, inode_has_child_snapshot); + } + goto delete; } - if (test_bit(BCH_FS_clean_recovery, &c->flags) && - !fsck_err(trans, deleted_inode_but_clean, - "filesystem marked as clean but have deleted inode %llu:%u", - pos.offset, pos.snapshot)) { - ret = 0; - goto out; - } + if (from_deleted_inodes) { + if (test_bit(BCH_FS_clean_recovery, &c->flags) && + !fsck_err(trans, deleted_inode_but_clean, + "filesystem marked as clean but have deleted inode %llu:%u", + pos.offset, pos.snapshot)) + return 0; - ret = 1; -out: + ret = 1; + } fsck_err: - bch2_trans_iter_exit(trans, &inode_iter); - printbuf_exit(&buf); return ret; delete: - ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false); - goto out; + return bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false); +} + +static int may_delete_deleted_inum(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + u32 snapshot; + + return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: + may_delete_deleted_inode(trans, SPOS(0, inum.inum, snapshot), inode, false); } int bch2_delete_dead_inodes(struct bch_fs *c) { - struct btree_trans *trans = bch2_trans_get(c); - bool need_another_pass; - int ret; -again: + CLASS(btree_trans, trans)(c); /* * if we ran check_inodes() unlinked inodes will have already been * cleaned up but the write buffer will be out of sync; therefore we * always need a write buffer flush - */ - ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; - - need_another_pass = false; - - /* + * * Weird transaction restart handling here because on successful delete, * bch2_inode_rm_snapshot() will return a nested transaction restart, * but we can't retry because the btree write buffer won't have been * flushed and we'd spin: */ - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, + return bch2_btree_write_buffer_flush_sync(trans) ?: + for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass); + struct bch_inode_unpacked inode; + int ret = may_delete_deleted_inode(trans, k.k->p, &inode, true); if (ret > 0) { bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot); @@ -1478,10 +1535,4 @@ int bch2_delete_dead_inodes(struct bch_fs *c) ret; })); - - if (!ret && need_another_pass) - goto again; -err: - bch2_trans_put(trans); - return ret; } diff --git
a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 5cfba9e98966..b8ec3e628d90 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -134,10 +134,21 @@ static inline int bch2_inode_peek(struct btree_trans *trans, subvol_inum inum, unsigned flags) { return __bch2_inode_peek(trans, iter, inode, inum, flags, true); - int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags); - return ret; } +int bch2_inode_find_by_inum_snapshot(struct btree_trans *, u64, u32, + struct bch_inode_unpacked *, unsigned); +int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *, + subvol_inum, + struct bch_inode_unpacked *); +int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *); +int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, + struct bch_inode_unpacked *); + +int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, + struct bch_inode_unpacked *root); + int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags); @@ -153,7 +164,7 @@ int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *); void bch2_inode_init_early(struct bch_fs *, struct bch_inode_unpacked *); -void bch2_inode_init_late(struct bch_inode_unpacked *, u64, +void bch2_inode_init_late(struct bch_fs *, struct bch_inode_unpacked *, u64, uid_t, gid_t, umode_t, dev_t, struct bch_inode_unpacked *); void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, @@ -165,14 +176,6 @@ int bch2_inode_create(struct btree_trans *, struct btree_iter *, int bch2_inode_rm(struct bch_fs *, subvol_inum); -int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *, - subvol_inum, - struct bch_inode_unpacked *); -int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *); -int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, - struct bch_inode_unpacked *); - #define inode_opt_get(_c, _inode, _name) \ ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name) @@ -245,12 +248,17 @@ static inline unsigned bkey_inode_mode(struct bkey_s_c k) static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_unpacked *bi) { - /* inode apts are stored with a +1 bias: 0 means "unset, use fs opt" */ + /* inode opts are stored with a +1 bias: 0 means "unset, use fs opt" */ return bi->bi_casefold ? 
bi->bi_casefold - 1 : c->opts.casefold; } +static inline bool bch2_inode_has_backpointer(const struct bch_inode_unpacked *bi) +{ + return bi->bi_dir || bi->bi_dir_offset; +} + /* i_nlink: */ static inline unsigned nlink_bias(umode_t mode) @@ -280,15 +288,6 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, int bch2_inode_nlink_inc(struct bch_inode_unpacked *); void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); -static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *inode) -{ - bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; - - return S_ISDIR(inode->bi_mode) || - inode->bi_subvol || - (!inode->bi_nlink && inode_has_bp); -} - struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); @@ -306,6 +305,14 @@ bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode return io_opts_to_rebalance_opts(c, &io_opts); } +#define BCACHEFS_ROOT_SUBVOL_INUM \ + ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) + +static inline bool subvol_inum_eq(subvol_inum a, subvol_inum b) +{ + return a.subvol == b.subvol && a.inum == b.inum; +} + int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); int bch2_delete_dead_inodes(struct bch_fs *); diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index 87e193e8ed25..1f00938b1bdc 100644 --- a/fs/bcachefs/inode_format.h +++ b/fs/bcachefs/inode_format.h @@ -129,6 +129,10 @@ enum inode_opt_id { Inode_opt_nr, }; +/* + * BCH_INODE_has_case_insensitive is set if any descendant is case insensitive - + * for overlayfs + */ #define BCH_INODE_FLAGS() \ x(sync, 0) \ x(immutable, 1) \ @@ -139,7 +143,8 @@ enum inode_opt_id { x(i_sectors_dirty, 6) \ x(unlinked, 7) \ x(backptr_untrusted, 8) \ - x(has_child_snapshot, 9) + x(has_child_snapshot, 9) \ + x(has_case_insensitive, 10) /* bits 20+ reserved for packed fields below: */ diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index cc07729a4b62..fa0b06e17d17 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -43,7 +43,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, bch2_bkey_buf_init(&new); closure_init_stack(&cl); - k = bch2_btree_iter_peek_slot(trans, iter); + k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) return ret; @@ -91,7 +91,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, opts.data_replicas, BCH_WATERMARK_normal, 0, &cl, &wp); if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) - ret = -BCH_ERR_transaction_restart_nested; + ret = bch_err_throw(c, transaction_restart_nested); if (ret) goto err; @@ -114,12 +114,11 @@ int bch2_extent_fallocate(struct btree_trans *trans, if (!ret && sectors_allocated) bch2_increment_clock(c, sectors_allocated, WRITE); if (should_print_err(ret)) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9)); prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); } err_noprint: bch2_open_buckets_put(c, &open_buckets); @@ -135,6 +134,33 @@ int bch2_extent_fallocate(struct btree_trans *trans, return ret; } +/* For fsck */ +int bch2_fpunch_snapshot(struct btree_trans *trans, struct bpos start, struct bpos end) +{ + u32 restart_count = trans->restart_count; + struct bch_fs *c = trans->c; + struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0); + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct bkey_i delete; + + int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, + start, end, 0, k, + &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + bkey_init(&delete.k); + delete.k.p = iter.pos; + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end, &delete); + + bch2_extent_trim_atomic(trans, &iter, &delete) ?: + bch2_trans_update(trans, &iter, &delete, 0); + })); + + bch2_disk_reservation_put(c, &disk_res); + return ret ?: trans_was_restarted(trans, restart_count); +} + /* * Returns -BCH_ERR_transaction_restart if we had to drop locks: */ @@ -164,12 +190,12 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, if (ret) continue; - bch2_btree_iter_set_snapshot(trans, iter, snapshot); + bch2_btree_iter_set_snapshot(iter, snapshot); /* * peek_max() doesn't have ideal semantics for extents: */ - k = bch2_btree_iter_peek_max(trans, iter, end_pos); + k = bch2_btree_iter_peek_max(iter, end_pos); if (!k.k) break; @@ -195,23 +221,13 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, s64 *i_sectors_delta) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - POS(inum.inum, start), - BTREE_ITER_intent); + CLASS(btree_trans, trans)(c); + CLASS(btree_iter, iter)(trans, BTREE_ID_extents, POS(inum.inum, start), + BTREE_ITER_intent); - ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta); + int ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta); - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - - return ret; + return bch2_err_matches(ret, BCH_ERR_transaction_restart) ?
0 : ret; } /* truncate: */ @@ -230,7 +246,7 @@ static int truncate_set_isize(struct btree_trans *trans, u64 new_i_size, bool warn) { - struct btree_iter iter = {}; + struct btree_iter iter = { NULL }; struct bch_inode_unpacked inode_u; int ret; @@ -238,7 +254,7 @@ static int truncate_set_isize(struct btree_trans *trans, (inode_u.bi_size = new_i_size, 0) ?: bch2_inode_write(trans, &iter, &inode_u); - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } @@ -247,7 +263,6 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, u64 *i_sectors_delta) { struct bch_fs *c = trans->c; - struct btree_iter fpunch_iter; struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k); subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; u64 new_i_size = le64_to_cpu(op->v.new_i_size); @@ -259,14 +274,15 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, if (ret) goto err; - bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents, - POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9), - BTREE_ITER_intent); - ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta); - bch2_trans_iter_exit(trans, &fpunch_iter); + { + CLASS(btree_iter, fpunch_iter)(trans, BTREE_ID_extents, + POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9), + BTREE_ITER_intent); + ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + } err: if (warn_errors) bch_err_fn(c, ret); @@ -292,17 +308,13 @@ int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sec * snapshot while they're in progress, then crashing, will result in the * resume only proceeding in one of the snapshots */ - down_read(&c->snapshot_create_lock); - struct btree_trans *trans = bch2_trans_get(c); + guard(rwsem_read)(&c->snapshot_create_lock); + CLASS(btree_trans, trans)(c); int ret = bch2_logged_op_start(trans, &op.k_i); if (ret) - goto out; + return ret; ret = __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta); ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret; -out: - bch2_trans_put(trans); - up_read(&c->snapshot_create_lock); - return ret; } @@ -349,7 +361,7 @@ static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, ret = bch2_inode_write(trans, &iter, &inode_u); err: - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } @@ -399,7 +411,7 @@ case LOGGED_OP_FINSERT_start: if (ret) goto err; } else { - bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, src_offset)); + bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset)); ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -425,12 +437,12 @@ case LOGGED_OP_FINSERT_shift_extents: if (ret) goto btree_err; - bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - bch2_btree_iter_set_pos(trans, &iter, SPOS(inum.inum, pos, snapshot)); + bch2_btree_iter_set_snapshot(&iter, snapshot); + bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot)); k = insert - ? bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum.inum, 0)) - : bch2_btree_iter_peek_max(trans, &iter, POS(inum.inum, U64_MAX)); + ? 
bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0)) + : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX)); if ((ret = bkey_err(k))) goto btree_err; @@ -498,7 +510,7 @@ case LOGGED_OP_FINSERT_finish: break; } err: - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); if (warn_errors) bch_err_fn(c, ret); return ret; @@ -528,16 +540,12 @@ int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum, * snapshot while they're in progress, then crashing, will result in the * resume only proceeding in one of the snapshots */ - down_read(&c->snapshot_create_lock); - struct btree_trans *trans = bch2_trans_get(c); + guard(rwsem_read)(&c->snapshot_create_lock); + CLASS(btree_trans, trans)(c); int ret = bch2_logged_op_start(trans, &op.k_i); if (ret) - goto out; + return ret; ret = __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta); ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret; -out: - bch2_trans_put(trans); - up_read(&c->snapshot_create_lock); - return ret; } diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h index 9cb44a7c43c1..b93e4d4b3c0c 100644 --- a/fs/bcachefs/io_misc.h +++ b/fs/bcachefs/io_misc.h @@ -5,6 +5,8 @@ int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, u64, struct bch_io_opts, s64 *, struct write_point_specifier); + +int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos); int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, subvol_inum, u64, s64 *); int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index def4a26a3b45..c4f0f9d8f959 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -9,6 +9,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "async_objs.h" #include "btree_update.h" #include "buckets.h" #include "checksum.h" @@ -17,6 +18,7 @@ #include "data_update.h" #include "disk_groups.h" #include "ec.h" +#include "enumerated_ref.h" #include "error.h" #include "io_read.h" #include "io_misc.h" @@ -25,6 +27,7 @@ #include "subvolume.h" #include "trace.h" +#include #include #include @@ -34,41 +37,81 @@ module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644); MODULE_PARM_DESC(read_corrupt_ratio, ""); #endif +static bool bch2_poison_extents_on_checksum_error; +module_param_named(poison_extents_on_checksum_error, + bch2_poison_extents_on_checksum_error, bool, 0644); +MODULE_PARM_DESC(poison_extents_on_checksum_error, + "Extents with checksum errors are marked as poisoned - unsafe without read fua support"); + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +static inline u32 bch2_dev_congested_read(struct bch_dev *ca, u64 now) +{ + s64 congested = atomic_read(&ca->congested); + u64 last = READ_ONCE(ca->congested_last); + if (time_after64(now, last)) + congested -= (now - last) >> 12; + + return clamp(congested, 0LL, CONGESTED_MAX); +} + static bool bch2_target_congested(struct bch_fs *c, u16 target) { const struct bch_devs_mask *devs; unsigned d, nr = 0, total = 0; - u64 now = local_clock(), last; - s64 congested; - struct bch_dev *ca; - - if (!target) - return false; + u64 now = local_clock(); - rcu_read_lock(); + guard(rcu)(); devs = bch2_target_to_mask(c, target) ?: &c->rw_devs[BCH_DATA_user]; for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { - ca = rcu_dereference(c->devs[d]); + struct bch_dev *ca = rcu_dereference(c->devs[d]); if (!ca) continue; - congested = atomic_read(&ca->congested); - last = READ_ONCE(ca->congested_last); - if 
(time_after64(now, last)) - congested -= (now - last) >> 12; - - total += max(congested, 0LL); + total += bch2_dev_congested_read(ca, now); nr++; } - rcu_read_unlock(); return get_random_u32_below(nr * CONGESTED_MAX) < total; } +void bch2_dev_congested_to_text(struct printbuf *out, struct bch_dev *ca) +{ + printbuf_tabstop_push(out, 32); + + prt_printf(out, "current:\t%u%%\n", + bch2_dev_congested_read(ca, local_clock()) * + 100 / CONGESTED_MAX); + + prt_printf(out, "raw:\t%i/%u\n", atomic_read(&ca->congested), CONGESTED_MAX); + + prt_printf(out, "last io over threshold:\t"); + bch2_pr_time_units(out, local_clock() - ca->congested_last); + prt_newline(out); + + prt_printf(out, "read latency threshold:\t"); + bch2_pr_time_units(out, + ca->io_latency[READ].quantiles.entries[QUANTILE_IDX(1)].m << 2); + prt_newline(out); + + prt_printf(out, "median read latency:\t"); + bch2_pr_time_units(out, + ca->io_latency[READ].quantiles.entries[QUANTILE_IDX(7)].m); + prt_newline(out); + + prt_printf(out, "write latency threshold:\t"); + bch2_pr_time_units(out, + ca->io_latency[WRITE].quantiles.entries[QUANTILE_IDX(1)].m << 3); + prt_newline(out); + + prt_printf(out, "median write latency:\t"); + bch2_pr_time_units(out, + ca->io_latency[WRITE].quantiles.entries[QUANTILE_IDX(7)].m); + prt_newline(out); +} + #else static bool bch2_target_congested(struct bch_fs *c, u16 target) @@ -80,18 +123,6 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) /* Cache promotion on read */ -struct promote_op { - struct rcu_head rcu; - u64 start_time; - - struct rhash_head hash; - struct bpos pos; - - struct work_struct work; - struct data_update write; - struct bio_vec bi_inline_vecs[]; /* must be last */ -}; - static const struct rhashtable_params bch_promote_params = { .head_offset = offsetof(struct promote_op, hash), .key_offset = offsetof(struct promote_op, pos), @@ -140,22 +171,32 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, if (!have_io_error(failed)) { BUG_ON(!opts.promote_target); - if (!(flags & BCH_READ_may_promote)) - return -BCH_ERR_nopromote_may_not; + if (!(flags & BCH_READ_may_promote)) { + count_event(c, io_read_nopromote_may_not); + return bch_err_throw(c, nopromote_may_not); + } - if (bch2_bkey_has_target(c, k, opts.promote_target)) - return -BCH_ERR_nopromote_already_promoted; + if (bch2_bkey_has_target(c, k, opts.promote_target)) { + count_event(c, io_read_nopromote_already_promoted); + return bch_err_throw(c, nopromote_already_promoted); + } - if (bkey_extent_is_unwritten(k)) - return -BCH_ERR_nopromote_unwritten; + if (bkey_extent_is_unwritten(k)) { + count_event(c, io_read_nopromote_unwritten); + return bch_err_throw(c, nopromote_unwritten); + } - if (bch2_target_congested(c, opts.promote_target)) - return -BCH_ERR_nopromote_congested; + if (bch2_target_congested(c, opts.promote_target)) { + count_event(c, io_read_nopromote_congested); + return bch_err_throw(c, nopromote_congested); + } } if (rhashtable_lookup_fast(&c->promote_table, &pos, - bch_promote_params)) - return -BCH_ERR_nopromote_in_flight; + bch_promote_params)) { + count_event(c, io_read_nopromote_in_flight); + return bch_err_throw(c, nopromote_in_flight); + } return 0; } @@ -169,9 +210,12 @@ static noinline void promote_free(struct bch_read_bio *rbio) bch_promote_params); BUG_ON(ret); + async_object_list_del(c, promote, op->list_idx); + async_object_list_del(c, rbio, rbio->list_idx); + bch2_data_update_exit(&op->write); - bch2_write_ref_put(c, BCH_WRITE_REF_promote); + 
enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); kfree_rcu(op, rcu); } @@ -236,12 +280,12 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, return NULL; } - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_promote)) return ERR_PTR(-BCH_ERR_nopromote_no_writes); struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL); if (!op) { - ret = -BCH_ERR_nopromote_enomem; + ret = bch_err_throw(c, nopromote_enomem); goto err_put; } @@ -250,10 +294,14 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, bch_promote_params)) { - ret = -BCH_ERR_nopromote_in_flight; + ret = bch_err_throw(c, nopromote_in_flight); goto err; } + ret = async_object_list_add(c, promote, op, &op->list_idx); + if (ret < 0) + goto err_remove_hash; + ret = bch2_data_update_init(trans, NULL, NULL, &op->write, writepoint_hashed((unsigned long) current), &orig->opts, @@ -265,7 +313,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, * -BCH_ERR_ENOSPC_disk_reservation: */ if (ret) - goto err_remove_hash; + goto err_remove_list; rbio_init_fragment(&op->write.rbio.bio, orig); op->write.rbio.bounce = true; @@ -273,6 +321,8 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, op->write.op.end_io = promote_done; return &op->write.rbio; +err_remove_list: + async_object_list_del(c, promote, op->list_idx); err_remove_hash: BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, bch_promote_params)); @@ -281,7 +331,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, /* We may have added to the rhashtable and thus need rcu freeing: */ kfree_rcu(op, rcu); err_put: - bch2_write_ref_put(c, BCH_WRITE_REF_promote); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); return ERR_PTR(ret); } @@ -296,6 +346,13 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans, bool *read_full, struct bch_io_failures *failed) { + /* + * We're in the retry path, but we don't know what to repair yet, and we + * don't want to do a promote here: + */ + if (failed && !failed->nr) + return NULL; + struct bch_fs *c = trans->c; /* * if failed != NULL we're not actually doing a promote, we're @@ -332,12 +389,39 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans, *bounce = true; *read_full = promote_full; + + if (have_io_error(failed)) + orig->self_healing = true; + return promote; nopromote: - trace_io_read_nopromote(c, ret); + if (trace_io_read_nopromote_enabled()) { + CLASS(printbuf, buf)(); + printbuf_indent_add_nextline(&buf, 2); + prt_printf(&buf, "%s\n", bch2_err_str(ret)); + bch2_bkey_val_to_text(&buf, c, k); + + trace_io_read_nopromote(c, buf.buf); + } + count_event(c, io_read_nopromote); + return NULL; } +void bch2_promote_op_to_text(struct printbuf *out, + struct bch_fs *c, + struct promote_op *op) +{ + if (!op->write.read_done) { + prt_printf(out, "parent read: %px\n", op->write.rbio.parent); + printbuf_indent_add(out, 2); + bch2_read_bio_to_text(out, c, op->write.rbio.parent); + printbuf_indent_sub(out, 2); + } + + bch2_data_update_to_text(out, &op->write); +} + /* Read */ static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, @@ -359,7 +443,8 @@ static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *o static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, struct bch_read_bio *rbio, struct bpos read_pos) { - 
bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos)); + CLASS(btree_trans, trans)(c); + bch2_read_err_msg_trans(trans, out, rbio, read_pos); } enum rbio_context { @@ -394,7 +479,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) if (rbio->have_ioref) { struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); } if (rbio->split) { @@ -406,6 +491,8 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) else promote_free(rbio); } else { + async_object_list_del(rbio->c, rbio, rbio->list_idx); + if (rbio->bounce) bch2_bio_free_pages_pool(rbio->c, &rbio->bio); @@ -427,9 +514,80 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) if (rbio->start_time) bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], rbio->start_time); +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + if (rbio->list_idx) + async_object_list_del(rbio->c, rbio, rbio->list_idx); +#endif bio_endio(&rbio->bio); } +static void get_rbio_extent(struct btree_trans *trans, + struct bch_read_bio *rbio, + struct bkey_buf *sk) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = lockrestart_do(trans, + bkey_err(k = bch2_bkey_get_iter(trans, &iter, + rbio->data_btree, rbio->data_pos, 0))); + if (ret) + return; + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr) + if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) { + bch2_bkey_buf_reassemble(sk, trans->c, k); + break; + } + + bch2_trans_iter_exit(&iter); +} + +static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio, + enum btree_id btree, struct bkey_s_c read_k) +{ + if (!bch2_poison_extents_on_checksum_error) + return 0; + + struct bch_fs *c = trans->c; + + struct data_update *u = rbio_data_update(rbio); + if (u) + read_k = bkey_i_to_s_c(u->k.k); + + u64 flags = bch2_bkey_extent_flags(read_k); + if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 0; + + CLASS(btree_iter, iter)(trans, btree, bkey_start_pos(read_k.k), BTREE_ITER_intent); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (ret) + return ret; + + if (!bkey_and_val_eq(k, read_k)) + return 0; + + struct bkey_i *new = bch2_trans_kmalloc(trans, + bkey_bytes(k.k) + sizeof(struct bch_extent_flags)); + ret = PTR_ERR_OR_ZERO(new) ?: + (bkey_reassemble(new, k), 0) ?: + bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?: + bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + if (ret) + return ret; + + /* + * Propagate key change back to data update path, in particular so it + * knows the extent has been poisoned and it's safe to change the + * checksum + */ + if (u) + bch2_bkey_buf_copy(&u->k, c, new); + return 0; +} + static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, @@ -451,7 +609,7 @@ static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { /* extent we wanted to read no longer exists: */ - rbio->ret = -BCH_ERR_data_read_key_overwritten; + rbio->ret = bch_err_throw(trans->c, data_read_key_overwritten); goto err; } @@ -461,9 +619,10 @@ static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, bkey_i_to_s_c(u->k.k), 0, failed, flags, -1); err: - bch2_trans_iter_exit(trans, &iter); + 
bch2_trans_iter_exit(&iter); - if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, BCH_ERR_data_read_retry)) goto retry; if (ret) { @@ -487,15 +646,21 @@ static void bch2_rbio_retry(struct work_struct *work) .inum = rbio->read_pos.inode, }; struct bch_io_failures failed = { .nr = 0 }; - int orig_error = rbio->ret; - struct btree_trans *trans = bch2_trans_get(c); + CLASS(btree_trans, trans)(c); + + struct bkey_buf sk; + bch2_bkey_buf_init(&sk); + bkey_init(&sk.k->k); trace_io_read_retry(&rbio->bio); this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], bvec_iter_sectors(rbio->bvec_iter)); - if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) + get_rbio_extent(trans, rbio, &sk); + + if (!bkey_deleted(&sk.k->k) && + bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) bch2_mark_io_failure(&failed, &rbio->pick, rbio->ret == -BCH_ERR_data_read_retry_csum_err); @@ -516,15 +681,16 @@ static void bch2_rbio_retry(struct work_struct *work) int ret = rbio->data_update ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags) - : __bch2_read(trans, rbio, iter, inum, &failed, flags); + : __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags); if (ret) { rbio->ret = ret; rbio->bio.bi_status = BLK_STS_IOERR; - } else if (orig_error != -BCH_ERR_data_read_retry_csum_err_maybe_userspace && - orig_error != -BCH_ERR_data_read_ptr_stale_race && - !failed.nr) { - struct printbuf buf = PRINTBUF; + } + + if (failed.nr || ret) { + CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, &buf, @@ -532,14 +698,29 @@ static void bch2_rbio_retry(struct work_struct *work) read_pos.offset << 9)); if (rbio->data_update) prt_str(&buf, "(internal move) "); - prt_str(&buf, "successful retry"); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); + prt_str(&buf, "data read error, "); + if (!ret) { + prt_str(&buf, "successful retry"); + if (rbio->self_healing) + prt_str(&buf, ", self healing"); + } else + prt_str(&buf, bch2_err_str(ret)); + prt_newline(&buf); + + + if (!bkey_deleted(&sk.k->k)) { + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k)); + prt_newline(&buf); + } + + bch2_io_failures_to_text(&buf, c, &failed); + + bch2_print_str_ratelimited(c, KERN_ERR, buf.buf); } bch2_rbio_done(rbio); - bch2_trans_put(trans); + bch2_bkey_buf_exit(&sk, c); } static void bch2_rbio_error(struct bch_read_bio *rbio, @@ -568,113 +749,60 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, } } -static void bch2_read_io_err(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bio *bio = &rbio->bio; - struct bch_fs *c = rbio->c; - struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - struct printbuf buf = PRINTBUF; - - bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); - prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status)); - - if (ca) - bch_err_ratelimited(ca, "%s", buf.buf); - else - bch_err_ratelimited(c, "%s", buf.buf); - - printbuf_exit(&buf); - bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status); -} - static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, struct bch_read_bio *rbio) { struct bch_fs *c = rbio->c; u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; - struct bch_extent_crc_unpacked new_crc; - struct btree_iter iter; - struct bkey_i *new; - struct bkey_s_c k; int ret = 0; if (crc_is_compressed(rbio->pick.crc)) return 0; - k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, - BTREE_ITER_slots|BTREE_ITER_intent); + CLASS(btree_iter, iter)(trans, rbio->data_btree, rbio->data_pos, BTREE_ITER_intent); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); if ((ret = bkey_err(k))) - goto out; + return ret; if (bversion_cmp(k.k->bversion, rbio->version) || !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) - goto out; + return 0; /* Extent was merged? */ if (bkey_start_offset(k.k) < data_offset || k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) - goto out; + return 0; + struct bch_extent_crc_unpacked new_crc; if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, rbio->pick.crc, NULL, &new_crc, bkey_start_offset(k.k) - data_offset, k.k->size, rbio->pick.crc.csum_type)) { bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); - ret = 0; - goto out; + return 0; } /* * going to be temporarily appending another checksum entry: */ - new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + - sizeof(struct bch_extent_crc128)); + struct bkey_i *new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + + sizeof(struct bch_extent_crc128)); if ((ret = PTR_ERR_OR_ZERO(new))) - goto out; + return ret; bkey_reassemble(new, k); if (!bch2_bkey_narrow_crcs(new, new_crc)) - goto out; + return 0; - ret = bch2_trans_update(trans, &iter, new, - BTREE_UPDATE_internal_snapshot_node); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; + return bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node); } static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) { - bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __bch2_rbio_narrow_crcs(trans, rbio)); -} - -static void bch2_read_csum_err(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bio *src = &rbio->bio; - struct bch_extent_crc_unpacked crc = rbio->pick.crc; - struct nonce nonce = extent_nonce(rbio->version, crc); - struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); - struct printbuf buf = PRINTBUF; - - bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); - prt_str(&buf, "data "); - bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); - - struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - if (ca) - bch_err_ratelimited(ca, "%s", buf.buf); - else - bch_err_ratelimited(c, "%s", buf.buf); - - bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR); - printbuf_exit(&buf); + CLASS(btree_trans, trans)(rbio->c); + commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + __bch2_rbio_narrow_crcs(trans, rbio)); } static void bch2_read_decompress_err(struct work_struct *work) @@ -682,7 +810,7 @@ static void bch2_read_decompress_err(struct work_struct *work) struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); prt_str(&buf, "decompression error"); @@ -694,7 +822,6 @@ static void bch2_read_decompress_err(struct work_struct *work) bch_err_ratelimited(c, "%s", buf.buf); bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR); - printbuf_exit(&buf); } static void bch2_read_decrypt_err(struct work_struct *work) @@ -702,7 +829,7 @@ static void bch2_read_decrypt_err(struct work_struct *work) struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); prt_str(&buf, "decrypt error"); @@ -714,7 +841,6 @@ static void bch2_read_decrypt_err(struct work_struct *work) bch_err_ratelimited(c, "%s", buf.buf); bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR); - printbuf_exit(&buf); } /* Inner part that may run in process context */ @@ -837,7 +963,7 @@ static void __bch2_read_endio(struct work_struct *work) memalloc_nofs_restore(nofs_flags); return; csum_err: - bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR); goto out; decompression_err: bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); @@ -863,7 +989,7 @@ static void bch2_read_endio(struct bio *bio) rbio->bio.bi_end_io = rbio->end_io; if (unlikely(bio->bi_status)) { - bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status); return; } @@ -895,13 +1021,10 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, struct bch_extent_ptr ptr) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct printbuf buf = PRINTBUF; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - PTR_BUCKET_POS(ca, &ptr), - BTREE_ITER_cached); + CLASS(printbuf, buf)(); + CLASS(btree_iter, iter)(trans, BTREE_ID_alloc, + PTR_BUCKET_POS(ca, &ptr), + BTREE_ITER_cached); int gen = bucket_gen_get(ca, iter.pos.offset); if (gen >= 0) { @@ -913,7 +1036,7 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, prt_printf(&buf, "memory gen: %u", gen); - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter))); + int ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); if (!ret) { prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, k); @@ -931,9 +1054,6 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, } bch2_fs_inconsistent(c, "%s", buf.buf); - - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); } int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, @@ -963,6 
+1083,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, bvec_iter_sectors(iter)); goto out_read_done; } + + if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) && + !orig->data_update) + return bch_err_throw(c, extent_poisoned); retry_pick: ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); @@ -971,30 +1095,38 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto hole; if (unlikely(ret < 0)) { - struct printbuf buf = PRINTBUF; + if (ret == -BCH_ERR_data_read_csum_err) { + int ret2 = maybe_poison_extent(trans, orig, data_btree, k); + if (ret2) { + ret = ret2; + goto err; + } + + trace_and_count(c, io_read_fail_and_poison, &orig->bio); + } + + CLASS(printbuf, buf)(); bch2_read_err_msg_trans(trans, &buf, orig, read_pos); prt_printf(&buf, "%s\n ", bch2_err_str(ret)); bch2_bkey_val_to_text(&buf, c, k); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); goto err; } if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20_key_set) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_read_err_msg_trans(trans, &buf, orig, read_pos); prt_printf(&buf, "attempting to read encrypted data without encryption key\n "); bch2_bkey_val_to_text(&buf, c, k); bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - ret = -BCH_ERR_data_read_no_encryption_key; + ret = bch_err_throw(c, data_read_no_encryption_key); goto err; } - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, + BCH_DEV_READ_REF_io_read); /* * Stale dirty pointers are treated as IO errors, but @failed isn't @@ -1008,7 +1140,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, unlikely(dev_ptr_stale(ca, &pick.ptr))) { read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); bch2_mark_io_failure(failed, &pick, false); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); goto retry_pick; } @@ -1041,8 +1173,9 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, */ if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { if (ca) - percpu_ref_put(&ca->io_ref[READ]); - rbio->ret = -BCH_ERR_data_read_buffer_too_small; + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_io_read); + rbio->ret = bch_err_throw(c, data_read_buffer_too_small); goto out_read_done; } @@ -1138,6 +1271,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; + async_object_list_add(c, rbio, rbio, &rbio->list_idx); + if (rbio->bounce) trace_and_count(c, io_read_bounce, &rbio->bio); @@ -1171,14 +1306,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, if (likely(!rbio->pick.do_ec_reconstruct)) { if (unlikely(!rbio->have_ioref)) { - struct printbuf buf = PRINTBUF; - bch2_read_err_msg_trans(trans, &buf, rbio, read_pos); - prt_printf(&buf, "no device to read from:\n "); - bch2_bkey_val_to_text(&buf, c, k); - - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_device_offline, BLK_STS_IOERR); @@ -1253,7 +1380,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, * have to signal that: */ if (u) - orig->ret = -BCH_ERR_data_read_key_overwritten; + orig->ret = bch_err_throw(c, data_read_key_overwritten); 
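/*
 * Note on the bch_err_throw() conversions throughout this series: the macro
 * is assumed to behave like the bare -BCH_ERR_* constant it replaces, while
 * also recording the error against @c for debugging. A minimal sketch of the
 * assumed shape - the counting helper is hypothetical, named only for
 * illustration:
 *
 *	#define bch_err_throw(_c, _err) ({			\
 *		bch2_fs_count_error(_c, BCH_ERR_##_err);	\
 *		-BCH_ERR_##_err;				\
 *	})
 */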
zero_fill_bio_iter(&orig->bio, iter); out_read_done: @@ -1265,23 +1392,25 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, subvol_inum inum, - struct bch_io_failures *failed, unsigned flags) + struct bch_io_failures *failed, + struct bkey_buf *prev_read, + unsigned flags) { struct bch_fs *c = trans->c; - struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; + enum btree_id data_btree; int ret; EBUG_ON(rbio->data_update); bch2_bkey_buf_init(&sk); - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - POS(inum.inum, bvec_iter.bi_sector), - BTREE_ITER_slots); + CLASS(btree_iter, iter)(trans, BTREE_ID_extents, + POS(inum.inum, bvec_iter.bi_sector), + BTREE_ITER_slots); while (1) { - enum btree_id data_btree = BTREE_ID_extents; + data_btree = BTREE_ID_extents; bch2_trans_begin(trans); @@ -1290,12 +1419,12 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, if (ret) goto err; - bch2_btree_iter_set_snapshot(trans, &iter, snapshot); + bch2_btree_iter_set_snapshot(&iter, snapshot); - bch2_btree_iter_set_pos(trans, &iter, + bch2_btree_iter_set_pos(&iter, POS(inum.inum, bvec_iter.bi_sector)); - k = bch2_btree_iter_peek_slot(trans, &iter); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; @@ -1313,6 +1442,12 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, k = bkey_i_to_s_c(sk.k); + if (unlikely(flags & BCH_READ_in_retry)) { + if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k))) + failed->nr = 0; + bch2_bkey_buf_copy(prev_read, c, sk.k); + } + /* * With indirect extents, the amount of data to read is the min * of the original extent and the indirect extent: @@ -1347,17 +1482,14 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, break; } - bch2_trans_iter_exit(trans, &iter); - if (unlikely(ret)) { if (ret != -BCH_ERR_extent_poisoned) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9)); prt_printf(&buf, "data read error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); } rbio->bio.bi_status = BLK_STS_IOERR; @@ -1371,26 +1503,90 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, return ret; } +static const char * const bch2_read_bio_flags[] = { +#define x(n) #n, + BCH_READ_FLAGS() +#undef x + NULL +}; + +void bch2_read_bio_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_read_bio *rbio) +{ + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 20); + + bch2_read_err_msg(c, out, rbio, rbio->read_pos); + prt_newline(out); + + /* Are we in a retry? 
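 * (not tracked in a dedicated field; the BCH_READ_in_retry bit in the
 * flags bitmask printed below is presumably the closest indicator)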
*/ + + printbuf_indent_add(out, 2); + + u64 now = local_clock(); + prt_printf(out, "start_time:\t"); + bch2_pr_time_units(out, max_t(s64, 0, now - rbio->start_time)); + prt_newline(out); + + prt_printf(out, "submit_time:\t"); + bch2_pr_time_units(out, max_t(s64, 0, now - rbio->submit_time)); + prt_newline(out); + + if (!rbio->split) + prt_printf(out, "end_io:\t%ps\n", rbio->end_io); + else + prt_printf(out, "parent:\t%px\n", rbio->parent); + + prt_printf(out, "promote:\t%u\n", rbio->promote); + prt_printf(out, "bounce:\t%u\n", rbio->bounce); + prt_printf(out, "split:\t%u\n", rbio->split); + prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref); + prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs); + prt_printf(out, "context:\t%u\n", rbio->context); + + int ret = READ_ONCE(rbio->ret); + if (ret < 0) + prt_printf(out, "ret:\t%s\n", bch2_err_str(ret)); + else + prt_printf(out, "ret:\t%i\n", ret); + + prt_printf(out, "flags:\t"); + bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags); + prt_newline(out); + + bch2_bio_to_text(out, &rbio->bio); + printbuf_indent_sub(out, 2); +} + void bch2_fs_io_read_exit(struct bch_fs *c) { if (c->promote_table.tbl) rhashtable_destroy(&c->promote_table); bioset_exit(&c->bio_read_split); bioset_exit(&c->bio_read); + mempool_exit(&c->bio_bounce_pages); } int bch2_fs_io_read_init(struct bch_fs *c) { + if (mempool_init_page_pool(&c->bio_bounce_pages, + max_t(unsigned, + c->opts.btree_node_size, + c->opts.encoded_extent_max) / + PAGE_SIZE, 0)) + return bch_err_throw(c, ENOMEM_bio_bounce_pages_init); + if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_bio_read_init; + return bch_err_throw(c, ENOMEM_bio_read_init); if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_bio_read_split_init; + return bch_err_throw(c, ENOMEM_bio_read_split_init); if (rhashtable_init(&c->promote_table, &bch_promote_params)) - return -BCH_ERR_ENOMEM_promote_table_init; + return bch_err_throw(c, ENOMEM_promote_table_init); return 0; } diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index c78025d863e0..1e1c0476bd03 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -4,8 +4,13 @@ #include "bkey_buf.h" #include "btree_iter.h" +#include "extents_types.h" #include "reflink.h" +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +void bch2_dev_congested_to_text(struct printbuf *, struct bch_dev *); +#endif + struct bch_read_bio { struct bch_fs *c; u64 start_time; @@ -43,11 +48,15 @@ struct bch_read_bio { have_ioref:1, narrow_crcs:1, saw_error:1, + self_healing:1, context:2; }; u16 _state; }; s16 ret; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + unsigned list_idx; +#endif struct extent_ptr_decoded pick; @@ -87,6 +96,8 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans, return 0; *data_btree = BTREE_ID_reflink; + + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, offset_into_extent, @@ -97,12 +108,12 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans, return ret; if (bkey_deleted(k.k)) { - bch2_trans_iter_exit(trans, &iter); - return -BCH_ERR_missing_indirect_extent; + bch2_trans_iter_exit(&iter); + return bch_err_throw(c, missing_indirect_extent); } - bch2_bkey_buf_reassemble(extent, trans->c, k); - bch2_trans_iter_exit(trans, &iter); + bch2_bkey_buf_reassemble(extent, c, k); + bch2_trans_iter_exit(&iter); return 0; } @@ -140,11 +151,12 
@@ static inline void bch2_read_extent(struct btree_trans *trans, int ret = __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, data_btree, k, offset_into_extent, NULL, flags, -1); /* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */ - WARN(ret, "unhandled error from __bch2_read_extent()"); + WARN(ret, "unhandled error from __bch2_read_extent(): %s", bch2_err_str(ret)); } int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, - subvol_inum, struct bch_io_failures *, unsigned flags); + subvol_inum, + struct bch_io_failures *, struct bkey_buf *, unsigned flags); static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, subvol_inum inum) @@ -153,11 +165,11 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, rbio->subvol = inum.subvol; - bch2_trans_run(c, - __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, - BCH_READ_retry_if_stale| - BCH_READ_may_promote| - BCH_READ_user_mapped)); + CLASS(btree_trans, trans)(c); + __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, NULL, + BCH_READ_retry_if_stale| + BCH_READ_may_promote| + BCH_READ_user_mapped); } static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, @@ -172,6 +184,9 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, rbio->split = true; rbio->parent = orig; rbio->opts = orig->opts; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + rbio->list_idx = 0; +#endif return rbio; } @@ -189,9 +204,16 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio, rbio->ret = 0; rbio->opts = opts; rbio->bio.bi_end_io = end_io; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + rbio->list_idx = 0; +#endif return rbio; } +struct promote_op; +void bch2_promote_op_to_text(struct printbuf *, struct bch_fs *, struct promote_op *); +void bch2_read_bio_to_text(struct printbuf *, struct bch_fs *, struct bch_read_bio *); + void bch2_fs_io_read_exit(struct bch_fs *); int bch2_fs_io_read_init(struct bch_fs *); diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index c1237da079ed..1d83dcc9731e 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -6,6 +6,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "async_objs.h" #include "bkey_buf.h" #include "bset.h" #include "btree_update.h" @@ -15,6 +16,7 @@ #include "compress.h" #include "debug.h" #include "ec.h" +#include "enumerated_ref.h" #include "error.h" #include "extent_update.h" #include "inode.h" @@ -30,6 +32,7 @@ #include "trace.h" #include +#include #include #include #include @@ -52,14 +55,9 @@ static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, s64 latency_over = io_latency - latency_threshold; if (latency_threshold && latency_over > 0) { - /* - * bump up congested by approximately latency_over * 4 / - * latency_threshold - we don't need much accuracy here so don't - * bother with the divide: - */ if (atomic_read(&ca->congested) < CONGESTED_MAX) - atomic_add(latency_over >> - max_t(int, ilog2(latency_threshold) - 2, 0), + atomic_add((u32) min(U32_MAX, io_latency * 2) / + (u32) min(U32_MAX, latency_threshold), &ca->congested); ca->congested_last = now; @@ -91,7 +89,12 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) new = ewma_add(old, io_latency, 5); } while (!atomic64_try_cmpxchg(latency, &old, new)); - bch2_congested_acct(ca, io_latency, now, rw); + /* + * Only track read latency for congestion accounting: writes are subject + * to heavy queuing delays from page cache writeback: + */ + if (rw 
== READ) + bch2_congested_acct(ca, io_latency, now, rw); __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now); } @@ -168,9 +171,9 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, *i_sectors_delta = 0; *disk_sectors_delta = 0; - bch2_trans_copy_iter(trans, &iter, extent_iter); + bch2_trans_copy_iter(&iter, extent_iter); - for_each_btree_key_max_continue_norestart(trans, iter, + for_each_btree_key_max_continue_norestart(iter, new->k.p, BTREE_ITER_slots, old, ret) { s64 sectors = min(new->k.p.offset, old.k->p.offset) - max(bkey_start_offset(&new->k), @@ -195,7 +198,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, break; } - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } @@ -217,13 +220,13 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, */ unsigned inode_update_flags = BTREE_UPDATE_nojournal; - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, - extent_iter->pos.inode, - extent_iter->snapshot), - BTREE_ITER_intent| - BTREE_ITER_cached); + CLASS(btree_iter, iter)(trans, BTREE_ID_inodes, + SPOS(0, + extent_iter->pos.inode, + extent_iter->snapshot), + BTREE_ITER_intent| + BTREE_ITER_cached); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); int ret = bkey_err(k); if (unlikely(ret)) return ret; @@ -235,7 +238,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8); ret = PTR_ERR_OR_ZERO(k_mut); if (unlikely(ret)) - goto err; + return ret; bkey_reassemble(k_mut, k); @@ -243,7 +246,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, k_mut = bch2_inode_to_v3(trans, k_mut); ret = PTR_ERR_OR_ZERO(k_mut); if (unlikely(ret)) - goto err; + return ret; } struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut); @@ -258,17 +261,14 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, s64 bi_sectors = le64_to_cpu(inode->v.bi_sectors); if (unlikely(bi_sectors + i_sectors_delta < 0)) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_log_msg_start(c, &buf); prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0", extent_iter->pos.inode, bi_sectors, i_sectors_delta); - bool repeat = false, print = false, suppress = false; - bch2_count_fsck_err(c, inode_i_sectors_underflow, buf.buf, - &repeat, &print, &suppress); + bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf); if (print) - bch2_print_str(c, buf.buf); - printbuf_exit(&buf); + bch2_print_str(c, KERN_ERR, buf.buf); if (i_sectors_delta < 0) i_sectors_delta = -bi_sectors; @@ -280,17 +280,20 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, inode_update_flags = 0; } + /* + * extents, dirents and xattrs updates require that an inode update also + * happens - to ensure that if a key exists in one of those btrees with + * a given snapshot ID an inode is also present - so we may have to skip + * the nojournal optimization: + */ if (inode->k.p.snapshot != iter.snapshot) { inode->k.p.snapshot = iter.snapshot; inode_update_flags = 0; } - ret = bch2_trans_update(trans, &iter, &inode->k_i, - BTREE_UPDATE_internal_snapshot_node| - inode_update_flags); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + return bch2_trans_update(trans, &iter, &inode->k_i, + BTREE_UPDATE_internal_snapshot_node| + inode_update_flags); } int 
bch2_extent_update(struct btree_trans *trans, @@ -313,7 +316,7 @@ int bch2_extent_update(struct btree_trans *trans, * path already traversed at iter->pos because * bch2_trans_extent_update() will use it to attempt extent merging */ - ret = __bch2_btree_iter_traverse(trans, iter); + ret = __bch2_btree_iter_traverse(iter); if (ret) return ret; @@ -358,7 +361,7 @@ int bch2_extent_update(struct btree_trans *trans, if (i_sectors_delta_total) *i_sectors_delta_total += i_sectors_delta; - bch2_btree_iter_set_pos(trans, iter, next_pos); + bch2_btree_iter_set_pos(iter, next_pos); return 0; } @@ -368,8 +371,6 @@ static int bch2_write_index_default(struct bch_write_op *op) struct bkey_buf sk; struct keylist *keys = &op->insert_keys; struct bkey_i *k = bch2_keylist_front(keys); - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; subvol_inum inum = { .subvol = op->subvol, .inum = k->k.p.inode, @@ -378,6 +379,7 @@ static int bch2_write_index_default(struct bch_write_op *op) BUG_ON(!inum.subvol); + CLASS(btree_trans, trans)(c); bch2_bkey_buf_init(&sk); do { @@ -393,16 +395,14 @@ static int bch2_write_index_default(struct bch_write_op *op) if (ret) break; - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - bkey_start_pos(&sk.k->k), - BTREE_ITER_slots|BTREE_ITER_intent); + CLASS(btree_iter, iter)(trans, BTREE_ID_extents, + bkey_start_pos(&sk.k->k), + BTREE_ITER_slots|BTREE_ITER_intent); - ret = bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) ?: - bch2_extent_update(trans, inum, &iter, sk.k, + ret = bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, op->flags & BCH_WRITE_check_enospc); - bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -415,7 +415,6 @@ static int bch2_write_index_default(struct bch_write_op *op) bch2_cut_front(iter.pos, k); } while (!bch2_keylist_empty(keys)); - bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); return ret; @@ -425,7 +424,7 @@ static int bch2_write_index_default(struct bch_write_op *op) void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); if (op->subvol) { bch2_inum_offset_err_msg(op->c, &buf, @@ -452,7 +451,6 @@ void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, . } bch_err_ratelimited(op->c, "%s", buf.buf); - printbuf_exit(&buf); } void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, @@ -462,9 +460,17 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); struct bch_write_bio *n; + unsigned ref_rw = type == BCH_DATA_btree ? READ : WRITE; + unsigned ref_idx = type == BCH_DATA_btree + ? (unsigned) BCH_DEV_READ_REF_btree_node_write + : (unsigned) BCH_DEV_WRITE_REF_io_write; BUG_ON(c->opts.nochanges); + const struct bch_extent_ptr *last = NULL; + bkey_for_each_ptr(ptrs, ptr) + last = ptr; + bkey_for_each_ptr(ptrs, ptr) { /* * XXX: btree writes should be using io_ref[WRITE], but we @@ -473,9 +479,9 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, */ struct bch_dev *ca = nocow ? bch2_dev_have_ref(c, ptr->dev) - : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? 
READ : WRITE); + : bch2_dev_get_ioref(c, ptr->dev, ref_rw, ref_idx); - if (to_entry(ptr + 1) < ptrs.end) { + if (ptr != last) { n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set)); n->bio.bi_end_io = wbio->bio.bi_end_io; @@ -533,17 +539,19 @@ static void bch2_write_done(struct closure *cl) bch2_disk_reservation_put(c, &op->res); if (!(op->flags & BCH_WRITE_move)) - bch2_write_ref_put(c, BCH_WRITE_REF_write); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_write); bch2_keylist_free(&op->insert_keys, op->inline_keys); EBUG_ON(cl->parent); closure_debug_destroy(cl); + async_object_list_del(c, write_op, op->list_idx); if (op->end_io) op->end_io(op); } static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) { + struct bch_fs *c = op->c; struct keylist *keys = &op->insert_keys; struct bkey_i *src, *dst = keys->keys, *n; @@ -555,7 +563,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) test_bit(ptr->dev, op->failed.d)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) - return -BCH_ERR_data_write_io; + return bch_err_throw(c, data_write_io); } if (dst != src) @@ -748,7 +756,8 @@ static void bch2_write_endio(struct bio *bio) } if (wbio->have_ioref) - percpu_ref_put(&ca->io_ref[WRITE]); + enumerated_ref_put(&ca->io_ref[WRITE], + BCH_DEV_WRITE_REF_io_write); if (wbio->bounce) bch2_bio_free_pages_pool(c, bio); @@ -784,6 +793,9 @@ static void init_append_extent(struct bch_write_op *op, bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, op->flags & BCH_WRITE_cached); + if (!(op->flags & BCH_WRITE_move)) + bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i); + bch2_keylist_push(&op->insert_keys); } @@ -958,7 +970,7 @@ static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct op->crc.csum_type < BCH_CSUM_NR ? 
__bch2_csum_types[op->crc.csum_type] : "(unknown)"); - return -BCH_ERR_data_write_csum; + return bch_err_throw(c, data_write_csum); } static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, @@ -1190,22 +1202,20 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op, e = bkey_s_c_to_extent(k); - rcu_read_lock(); + guard(rcu)(); extent_for_each_ptr_decode(e, p, entry) { - if (crc_is_encoded(p.crc) || p.has_ec) { - rcu_read_unlock(); + if (crc_is_encoded(p.crc) || p.has_ec) return false; - } replicas += bch2_extent_ptr_durability(c, &p); } - rcu_read_unlock(); return replicas >= op->opts.data_replicas; } static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, struct btree_iter *iter, + struct bch_write_op *op, struct bkey_i *orig, struct bkey_s_c k, u64 new_i_size) @@ -1215,11 +1225,13 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, return 0; } - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + struct bkey_i *new = bch2_trans_kmalloc_nomemzero(trans, + bkey_bytes(k.k) + sizeof(struct bch_extent_rebalance)); int ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; + bkey_reassemble(new, k); bch2_cut_front(bkey_start_pos(&orig->k), new); bch2_cut_back(orig->k.p, new); @@ -1227,6 +1239,8 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, bkey_for_each_ptr(ptrs, ptr) ptr->unwritten = 0; + bch2_bkey_set_needs_rebalance(op->c, &op->opts, new); + /* * Note that we're not calling bch2_subvol_get_snapshot() in this path - * that was done when we kicked off the write, and here it's important @@ -1251,7 +1265,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) bkey_start_pos(&orig->k), orig->k.p, BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); + bch2_nocow_write_convert_one_unwritten(trans, &iter, op, orig, k, op->new_i_size); })); if (ret) break; @@ -1272,7 +1286,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) static void __bch2_nocow_write_done(struct bch_write_op *op) { if (unlikely(op->flags & BCH_WRITE_io_error)) { - op->error = -BCH_ERR_data_write_io; + op->error = bch_err_throw(op->c, data_write_io); } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) bch2_nocow_write_convert_unwritten(op); } @@ -1326,7 +1340,7 @@ static void bch2_nocow_write(struct bch_write_op *op) if (ret) break; - k = bch2_btree_iter_peek_slot(trans, &iter); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) break; @@ -1345,7 +1359,8 @@ static void bch2_nocow_write(struct bch_write_op *op) /* Get iorefs before dropping btree locks: */ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE, + BCH_DEV_WRITE_REF_io_write); if (unlikely(!ca)) goto err_get_ioref; @@ -1410,10 +1425,10 @@ static void bch2_nocow_write(struct bch_write_op *op) bch2_keylist_push(&op->insert_keys); if (op->flags & BCH_WRITE_submitted) break; - bch2_btree_iter_advance(trans, &iter); + bch2_btree_iter_advance(&iter); } out: - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; @@ -1447,7 +1462,8 @@ static void bch2_nocow_write(struct bch_write_op *op) return; err_get_ioref: darray_for_each(buckets, i) - 
percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE]); + enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE], + BCH_DEV_WRITE_REF_io_write); /* Fall back to COW path: */ goto out; @@ -1458,17 +1474,16 @@ static void bch2_nocow_write(struct bch_write_op *op) break; } - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); if (bch2_fs_inconsistent_on(stale < 0, c, "pointer to invalid bucket in nocow path on device %llu\n %s", stale_at->b.inode, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -BCH_ERR_data_write_invalid_ptr; + ret = bch_err_throw(c, data_write_invalid_ptr); } else { /* We can retry this: */ - ret = -BCH_ERR_transaction_restart; + ret = bch_err_throw(c, transaction_restart); } - printbuf_exit(&buf); goto err_get_ioref; } @@ -1512,7 +1527,7 @@ static void __bch2_write(struct bch_write_op *op) * freeing up space on specific disks, which means that * allocations for specific disks may hang arbitrarily long: */ - ret = bch2_trans_run(c, lockrestart_do(trans, + ret = bch2_trans_do(c, bch2_alloc_sectors_start_trans(trans, op->target, op->opts.erasure_code && !(op->flags & BCH_WRITE_cached), @@ -1522,7 +1537,7 @@ static void __bch2_write(struct bch_write_op *op) op->nr_replicas_required, op->watermark, op->flags, - &op->cl, &wp))); + &op->cl, &wp)); if (unlikely(ret)) { if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) break; @@ -1661,6 +1676,8 @@ CLOSURE_CALLBACK(bch2_write) BUG_ON(!op->write_point.v); BUG_ON(bkey_eq(op->pos, POS_MAX)); + async_object_list_add(c, write_op, op, &op->list_idx); + if (op->flags & BCH_WRITE_only_specified_devs) op->flags |= BCH_WRITE_alloc_nowait; @@ -1671,18 +1688,18 @@ CLOSURE_CALLBACK(bch2_write) if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { bch2_write_op_error(op, op->pos.offset, "misaligned write"); - op->error = -BCH_ERR_data_write_misaligned; + op->error = bch_err_throw(c, data_write_misaligned); goto err; } if (c->opts.nochanges) { - op->error = -BCH_ERR_erofs_no_writes; + op->error = bch_err_throw(c, erofs_no_writes); goto err; } if (!(op->flags & BCH_WRITE_move) && - !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { - op->error = -BCH_ERR_erofs_no_writes; + !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) { + op->error = bch_err_throw(c, erofs_no_writes); goto err; } @@ -1705,6 +1722,7 @@ CLOSURE_CALLBACK(bch2_write) bch2_disk_reservation_put(c, &op->res); closure_debug_destroy(&op->cl); + async_object_list_del(c, write_op, op->list_idx); if (op->end_io) op->end_io(op); } @@ -1738,13 +1756,13 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required); prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl)); + prt_printf(out, "ret\t%s\n", bch2_err_str(op->error)); printbuf_indent_sub(out, 2); } void bch2_fs_io_write_exit(struct bch_fs *c) { - mempool_exit(&c->bio_bounce_pages); bioset_exit(&c->replica_set); bioset_exit(&c->bio_write); } @@ -1753,14 +1771,7 @@ int bch2_fs_io_write_init(struct bch_fs *c) { if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) || bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0)) - return -BCH_ERR_ENOMEM_bio_write_init; - - if (mempool_init_page_pool(&c->bio_bounce_pages, - max_t(unsigned, - c->opts.btree_node_size, - c->opts.encoded_extent_max) / - PAGE_SIZE, 0)) - return -BCH_ERR_ENOMEM_bio_bounce_pages_init; + return bch_err_throw(c, ENOMEM_bio_write_init); return 0; } diff --git 
a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index b8ab19a1e1da..2c0a8f35ee1f 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -17,34 +17,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, __printf(3, 4) void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...); -#define BCH_WRITE_FLAGS() \ - x(alloc_nowait) \ - x(cached) \ - x(data_encoded) \ - x(pages_stable) \ - x(pages_owned) \ - x(only_specified_devs) \ - x(wrote_data_inline) \ - x(check_enospc) \ - x(sync) \ - x(move) \ - x(in_worker) \ - x(submitted) \ - x(io_error) \ - x(convert_unwritten) - -enum __bch_write_flags { -#define x(f) __BCH_WRITE_##f, - BCH_WRITE_FLAGS() -#undef x -}; - -enum bch_write_flags { -#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), - BCH_WRITE_FLAGS() -#undef x -}; - static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { return op->watermark == BCH_WATERMARK_copygc diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h index 3ef6df9145ef..5da4eb8bb6f6 100644 --- a/fs/bcachefs/io_write_types.h +++ b/fs/bcachefs/io_write_types.h @@ -13,6 +13,34 @@ #include #include +#define BCH_WRITE_FLAGS() \ + x(alloc_nowait) \ + x(cached) \ + x(data_encoded) \ + x(pages_stable) \ + x(pages_owned) \ + x(only_specified_devs) \ + x(wrote_data_inline) \ + x(check_enospc) \ + x(sync) \ + x(move) \ + x(in_worker) \ + x(submitted) \ + x(io_error) \ + x(convert_unwritten) + +enum __bch_write_flags { +#define x(f) __BCH_WRITE_##f, + BCH_WRITE_FLAGS() +#undef x +}; + +enum bch_write_flags { +#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), + BCH_WRITE_FLAGS() +#undef x +}; + struct bch_write_bio { struct_group(wbio, struct bch_fs *c; @@ -43,6 +71,10 @@ struct bch_write_op { void (*end_io)(struct bch_write_op *); u64 start_time; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + unsigned list_idx; +#endif + unsigned written; /* sectors */ u16 flags; s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index bb45d3634194..07869436a964 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -12,6 +12,7 @@ #include "btree_update.h" #include "btree_write_buffer.h" #include "buckets.h" +#include "enumerated_ref.h" #include "error.h" #include "journal.h" #include "journal_io.h" @@ -87,7 +88,7 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) { lockdep_assert_held(&j->lock); - out->atomic++; + guard(printbuf_atomic)(out); if (!out->nr_tabstops) printbuf_tabstop_push(out, 24); @@ -97,8 +98,6 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) seq++) bch2_journal_buf_to_text(out, j, seq); prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ?
"open" : "closed"); - - --out->atomic; } static inline struct journal_buf * @@ -139,9 +138,9 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) { struct bch_fs *c = container_of(j, struct bch_fs, journal); bool stuck = false; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); - buf.atomic++; + guard(printbuf_atomic)(&buf); if (!(error == -BCH_ERR_journal_full || error == -BCH_ERR_journal_pin_full) || @@ -149,36 +148,31 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) return stuck; - spin_lock(&j->lock); + scoped_guard(spinlock, &j->lock) { + if (j->can_discard) + return stuck; - if (j->can_discard) { - spin_unlock(&j->lock); - return stuck; - } + stuck = true; - stuck = true; + /* + * The journal shutdown path will set ->err_seq, but do it here first to + * serialize against concurrent failures and avoid duplicate error + * reports. + */ + if (j->err_seq) + return stuck; - /* - * The journal shutdown path will set ->err_seq, but do it here first to - * serialize against concurrent failures and avoid duplicate error - * reports. - */ - if (j->err_seq) { - spin_unlock(&j->lock); - return stuck; - } - j->err_seq = journal_cur_seq(j); + j->err_seq = journal_cur_seq(j); - __bch2_journal_debug_to_text(&buf, j); - spin_unlock(&j->lock); + __bch2_journal_debug_to_text(&buf, j); + } prt_printf(&buf, bch2_fmt(c, "Journal stuck! Have a pre-reservation but journal full (error %s)"), bch2_err_str(error)); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_reset(&buf); bch2_journal_pins_to_text(&buf, j); bch_err(c, "Journal pins:\n%s", buf.buf); - printbuf_exit(&buf); bch2_fatal_error(c); dump_stack(); @@ -188,6 +182,8 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) void bch2_journal_do_writes(struct journal *j) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + for (u64 seq = journal_last_unwritten_seq(j); seq <= journal_cur_seq(j); seq++) { @@ -202,6 +198,7 @@ void bch2_journal_do_writes(struct journal *j) if (!journal_state_seq_count(j, j->reservations, seq)) { j->seq_write_started = seq; w->write_started = true; + closure_get(&c->cl); closure_call(&w->io, bch2_journal_write, j->wq, NULL); } @@ -268,22 +265,21 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t buf->data->u64s = cpu_to_le32(old.cur_entry_offset); if (trace_journal_entry_close_enabled() && trace) { - struct printbuf pbuf = PRINTBUF; - pbuf.atomic++; - - prt_str(&pbuf, "entry size: "); - prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data)); - prt_newline(&pbuf); - bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT); - trace_journal_entry_close(c, pbuf.buf); - printbuf_exit(&pbuf); + CLASS(printbuf, err)(); + guard(printbuf_atomic)(&err); + + prt_str(&err, "entry size: "); + prt_human_readable_u64(&err, vstruct_bytes(buf->data)); + prt_newline(&err); + bch2_prt_task_backtrace(&err, current, 1, GFP_NOWAIT); + trace_journal_entry_close(c, err.buf); } sectors = vstruct_blocks_plus(buf->data, c->block_bits, buf->u64s_reserved) << c->block_bits; if (unlikely(sectors > buf->sectors)) { - struct printbuf err = PRINTBUF; - err.atomic++; + CLASS(printbuf, err)(); + guard(printbuf_atomic)(&err); prt_printf(&err, "journal entry overran reserved space: %u > %u\n", sectors, buf->sectors); @@ -295,7 +291,6 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
bch2_journal_halt_locked(j); bch_err(c, "%s", err.buf); - printbuf_exit(&err); return; } @@ -331,16 +326,6 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq)); } -void bch2_journal_halt(struct journal *j) -{ - spin_lock(&j->lock); - __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); - if (!j->err_seq) - j->err_seq = journal_cur_seq(j); - journal_wake(j); - spin_unlock(&j->lock); -} - void bch2_journal_halt_locked(struct journal *j) { lockdep_assert_held(&j->lock); @@ -351,6 +336,12 @@ void bch2_journal_halt_locked(struct journal *j) journal_wake(j); } +void bch2_journal_halt(struct journal *j) +{ + guard(spinlock)(&j->lock); + bch2_journal_halt_locked(j); +} + static bool journal_entry_want_write(struct journal *j) { bool ret = !journal_entry_is_open(j) || @@ -373,13 +364,8 @@ static bool journal_entry_want_write(struct journal *j) bool bch2_journal_entry_close(struct journal *j) { - bool ret; - - spin_lock(&j->lock); - ret = journal_entry_want_write(j); - spin_unlock(&j->lock); - - return ret; + guard(spinlock)(&j->lock); + return journal_entry_want_write(j); } /* @@ -396,10 +382,10 @@ static int journal_entry_open(struct journal *j) lockdep_assert_held(&j->lock); BUG_ON(journal_entry_is_open(j)); - BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + BUG_ON(c->sb.clean); if (j->blocked) - return -BCH_ERR_journal_blocked; + return bch_err_throw(c, journal_blocked); if (j->cur_entry_error) return j->cur_entry_error; @@ -409,23 +395,23 @@ static int journal_entry_open(struct journal *j) return ret; if (!fifo_free(&j->pin)) - return -BCH_ERR_journal_pin_full; + return bch_err_throw(c, journal_pin_full); if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) - return -BCH_ERR_journal_max_in_flight; + return bch_err_throw(c, journal_max_in_flight); if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR) - return -BCH_ERR_journal_max_open; + return bch_err_throw(c, journal_max_open); - if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) { + if (unlikely(journal_cur_seq(j) >= JOURNAL_SEQ_MAX)) { bch_err(c, "cannot start: journal seq overflow"); if (bch2_fs_emergency_read_only_locked(c)) bch_err(c, "fatal error - emergency read only"); - return -BCH_ERR_journal_shutdown; + return bch_err_throw(c, journal_shutdown); } if (!j->free_buf && !buf->data) - return -BCH_ERR_journal_buf_enomem; /* will retry after write completion frees up a buf */ + return bch_err_throw(c, journal_buf_enomem); /* will retry after write completion frees up a buf */ BUG_ON(!j->cur_entry_sectors); @@ -449,7 +435,7 @@ static int journal_entry_open(struct journal *j) u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); if (u64s <= (ssize_t) j->early_journal_entries.nr) - return -BCH_ERR_journal_full; + return bch_err_throw(c, journal_full); if (fifo_empty(&j->pin) && j->reclaim_thread) wake_up_process(j->reclaim_thread); @@ -461,6 +447,14 @@ static int journal_entry_open(struct journal *j) atomic64_inc(&j->seq); journal_pin_list_init(fifo_push_ref(&j->pin), 1); + if (unlikely(bch2_journal_seq_is_blacklisted(c, journal_cur_seq(j), false))) { + bch_err(c, "attempting to open blacklisted journal seq %llu", + journal_cur_seq(j)); + if (bch2_fs_emergency_read_only_locked(c)) + bch_err(c, "fatal error - emergency read only"); + return bch_err_throw(c, journal_shutdown); + } + BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); @@ -536,7 +530,7 @@ static void 
journal_write_work(struct work_struct *work) { struct journal *j = container_of(work, struct journal, write_work.work); - spin_lock(&j->lock); + guard(spinlock)(&j->lock); if (__journal_entry_is_open(j->reservations)) { long delta = journal_cur_buf(j)->expires - jiffies; @@ -545,7 +539,6 @@ static void journal_write_work(struct work_struct *work) else __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); } - spin_unlock(&j->lock); } static void journal_buf_prealloc(struct journal *j) @@ -591,16 +584,16 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, return ret; if (j->blocked) - return -BCH_ERR_journal_blocked; + return bch_err_throw(c, journal_blocked); if ((flags & BCH_WATERMARK_MASK) < j->watermark) { - ret = -BCH_ERR_journal_full; + ret = bch_err_throw(c, journal_full); can_discard = j->can_discard; goto out; } if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { - ret = -BCH_ERR_journal_max_in_flight; + ret = bch_err_throw(c, journal_max_in_flight); goto out; } @@ -641,39 +634,37 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, goto retry; if (journal_error_check_stuck(j, ret, flags)) - ret = -BCH_ERR_journal_stuck; + ret = bch_err_throw(c, journal_stuck); if (ret == -BCH_ERR_journal_max_in_flight && track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) && trace_journal_entry_full_enabled()) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_printbuf_make_room(&buf, 4096); - spin_lock(&j->lock); - prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); - bch2_journal_bufs_to_text(&buf, j); - spin_unlock(&j->lock); + scoped_guard(spinlock, &j->lock) { + prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); + bch2_journal_bufs_to_text(&buf, j); + } trace_journal_entry_full(c, buf.buf); - printbuf_exit(&buf); count_event(c, journal_entry_full); } if (ret == -BCH_ERR_journal_max_open && track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) && trace_journal_entry_full_enabled()) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_printbuf_make_room(&buf, 4096); - spin_lock(&j->lock); - prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); - bch2_journal_bufs_to_text(&buf, j); - spin_unlock(&j->lock); + scoped_guard(spinlock, &j->lock) { + prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); + bch2_journal_bufs_to_text(&buf, j); + } trace_journal_entry_full(c, buf.buf); - printbuf_exit(&buf); count_event(c, journal_entry_full); } @@ -702,7 +693,8 @@ static unsigned max_dev_latency(struct bch_fs *c) { u64 nsecs = 0; - for_each_rw_member(c, ca) + guard(rcu)(); + for_each_rw_member_rcu(c, ca) nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); return nsecs_to_jiffies(nsecs); @@ -744,11 +736,10 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, remaining_wait)) return ret; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_journal_debug_to_text(&buf, j); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); prt_printf(&buf, bch2_fmt(c, "Journal stuck? 
Waited for 10 seconds, err %s"), bch2_err_str(ret)); - printbuf_exit(&buf); closure_wait_event(&j->async_wait, !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || @@ -765,11 +756,13 @@ void bch2_journal_entry_res_resize(struct journal *j, union journal_res_state state; int d = new_u64s - res->u64s; - spin_lock(&j->lock); + guard(spinlock)(&j->lock); + + j->entry_u64s_reserved += d; + res->u64s += d; - j->entry_u64s_reserved += d; if (d <= 0) - goto out; + return; j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); state = READ_ONCE(j->reservations); @@ -784,9 +777,6 @@ void bch2_journal_entry_res_resize(struct journal *j, } else { journal_cur_buf(j)->u64s_reserved += d; } -out: - spin_unlock(&j->lock); - res->u64s += d; } /* journal flushing: */ @@ -805,6 +795,7 @@ void bch2_journal_entry_res_resize(struct journal *j, int bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf; int ret = 0; @@ -820,7 +811,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, /* Recheck under lock: */ if (j->err_seq && seq >= j->err_seq) { - ret = -BCH_ERR_journal_flush_err; + ret = bch_err_throw(c, journal_flush_err); goto out; } @@ -936,7 +927,6 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end) { struct bch_fs *c = container_of(j, struct bch_fs, journal); u64 unwritten_seq; - bool ret = false; if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) return false; @@ -944,9 +934,10 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end) if (c->journal.flushed_seq_ondisk >= start) return false; - spin_lock(&j->lock); + guard(spinlock)(&j->lock); + if (c->journal.flushed_seq_ondisk >= start) - goto out; + return false; for (unwritten_seq = journal_last_unwritten_seq(j); unwritten_seq < end; @@ -955,15 +946,12 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end) /* journal flush already in flight, or flush requested */ if (buf->must_flush) - goto out; + return false; buf->noflush = true; } - ret = true; -out: - spin_unlock(&j->lock); - return ret; + return true; } static int __bch2_journal_meta(struct journal *j) @@ -990,11 +978,11 @@ int bch2_journal_meta(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal)) - return -BCH_ERR_erofs_no_writes; + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal)) + return bch_err_throw(c, erofs_no_writes); int ret = __bch2_journal_meta(j); - bch2_write_ref_put(c, BCH_WRITE_REF_journal); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal); return ret; } @@ -1002,19 +990,18 @@ int bch2_journal_meta(struct journal *j) void bch2_journal_unblock(struct journal *j) { - spin_lock(&j->lock); - if (!--j->blocked && - j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL && - j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) { - union journal_res_state old, new; - - old.v = atomic64_read(&j->reservations.counter); - do { - new.v = old.v; - new.cur_entry_offset = j->cur_entry_offset_if_blocked; - } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); - } - spin_unlock(&j->lock); + scoped_guard(spinlock, &j->lock) + if (!--j->blocked && + j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL && + j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) { + union journal_res_state old, new; + + old.v =
atomic64_read(&j->reservations.counter); + do { + new.v = old.v; + new.cur_entry_offset = j->cur_entry_offset_if_blocked; + } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); + } journal_wake(j); } @@ -1042,9 +1029,8 @@ static void __bch2_journal_block(struct journal *j) void bch2_journal_block(struct journal *j) { - spin_lock(&j->lock); - __bch2_journal_block(j); - spin_unlock(&j->lock); + scoped_guard(spinlock, &j->lock) + __bch2_journal_block(j); journal_quiesce(j); } @@ -1057,7 +1043,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou /* We're inside wait_event(), but using mutex_lock(): */ sched_annotate_sleep(); mutex_lock(&j->buf_lock); - spin_lock(&j->lock); + guard(spinlock)(&j->lock); max_seq = min(max_seq, journal_cur_seq(j)); for (u64 seq = journal_last_unwritten_seq(j); @@ -1074,6 +1060,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou if (open && !*blocked) { __bch2_journal_block(j); + s.v = atomic64_read_acquire(&j->reservations.counter); *blocked = true; } @@ -1084,7 +1071,6 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou } } - spin_unlock(&j->lock); if (IS_ERR_OR_NULL(ret)) mutex_unlock(&j->buf_lock); return ret; @@ -1124,7 +1110,7 @@ static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr, new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL); new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL); if (!bu || !ob || !new_buckets || !new_bucket_seq) { - ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets; + ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets); goto err_free; } @@ -1139,16 +1125,14 @@ static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr, if (ret) break; - if (!new_fs) { - ret = bch2_trans_run(c, - bch2_trans_mark_metadata_bucket(trans, ca, - ob[nr_got]->bucket, BCH_DATA_journal, - ca->mi.bucket_size, BTREE_TRIGGER_transactional)); - if (ret) { - bch2_open_bucket_put(c, ob[nr_got]); - bch_err_msg(c, ret, "marking new journal buckets"); - break; - } + CLASS(btree_trans, trans)(c); + ret = bch2_trans_mark_metadata_bucket(trans, ca, + ob[nr_got]->bucket, BCH_DATA_journal, + ca->mi.bucket_size, BTREE_TRIGGER_transactional); + if (ret) { + bch2_open_bucket_put(c, ob[nr_got]); + bch_err_msg(c, ret, "marking new journal buckets"); + break; } bu[nr_got] = ob[nr_got]->bucket; @@ -1218,12 +1202,13 @@ static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr, mutex_unlock(&c->sb_lock); } - if (ret && !new_fs) + if (ret) { + CLASS(btree_trans, trans)(c); for (i = 0; i < nr_got; i++) - bch2_trans_run(c, - bch2_trans_mark_metadata_bucket(trans, ca, + bch2_trans_mark_metadata_bucket(trans, ca, bu[i], BCH_DATA_free, 0, - BTREE_TRIGGER_transactional)); + BTREE_TRIGGER_transactional); + } err_free: for (i = 0; i < nr_got; i++) bch2_open_bucket_put(c, ob[i]); @@ -1275,7 +1260,7 @@ static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca ret = 0; /* wait and retry */ bch2_disk_reservation_put(c, &disk_res); - closure_sync(&cl); + bch2_wait_on_allocator(c, &cl); } return ret; @@ -1288,21 +1273,89 @@ static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, unsigned nr) { - down_write(&c->state_lock); + guard(rwsem_write)(&c->state_lock); int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false); - up_write(&c->state_lock); - bch_err_fn(c, ret); return ret; } int
bch2_dev_journal_bucket_delete(struct bch_dev *ca, u64 b) +{ + struct bch_fs *c = ca->fs; + struct journal *j = &c->journal; + struct journal_device *ja = &ca->journal; + + guard(mutex)(&c->sb_lock); + unsigned pos; + for (pos = 0; pos < ja->nr; pos++) + if (ja->buckets[pos] == b) + break; + + if (pos == ja->nr) { + bch_err(ca, "journal bucket %llu not found when deleting", b); + return -EINVAL; + } + + u64 *new_buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); + if (!new_buckets) + return bch_err_throw(c, ENOMEM_set_nr_journal_buckets); + + memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); + memmove(&new_buckets[pos], + &new_buckets[pos + 1], + (ja->nr - 1 - pos) * sizeof(new_buckets[0])); + + int ret = bch2_journal_buckets_to_sb(c, ca, ja->buckets, ja->nr - 1) ?: + bch2_write_super(c); + if (ret) { + kfree(new_buckets); + return ret; + } + + scoped_guard(spinlock, &j->lock) { + if (pos < ja->discard_idx) + --ja->discard_idx; + if (pos < ja->dirty_idx_ondisk) + --ja->dirty_idx_ondisk; + if (pos < ja->dirty_idx) + --ja->dirty_idx; + if (pos < ja->cur_idx) + --ja->cur_idx; + + ja->nr--; + + memmove(&ja->buckets[pos], + &ja->buckets[pos + 1], + (ja->nr - pos) * sizeof(ja->buckets[0])); + + memmove(&ja->bucket_seq[pos], + &ja->bucket_seq[pos + 1], + (ja->nr - pos) * sizeof(ja->bucket_seq[0])); + + bch2_journal_space_available(j); + } + + kfree(new_buckets); + return 0; +} + int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) { + struct bch_fs *c = ca->fs; + + if (!(ca->mi.data_allowed & BIT(BCH_DATA_journal))) + return 0; + + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { + bch_err(c, "cannot allocate journal, filesystem is an unresized image file"); + return bch_err_throw(c, erofs_filesystem_full); + } + unsigned nr; int ret; if (dynamic_fault("bcachefs:add:journal_alloc")) { - ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets; + ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets); goto err; } @@ -1318,7 +1371,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) min(1 << 13, (1 << 24) / ca->mi.bucket_size)); - ret = bch2_set_nr_journal_buckets_loop(ca->fs, ca, nr, new_fs); + ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs); err: bch_err_fn(ca, ret); return ret; @@ -1326,13 +1379,14 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) int bch2_fs_journal_alloc(struct bch_fs *c) { - for_each_online_member(c, ca) { + for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_journal_alloc) { if (ca->journal.nr) continue; int ret = bch2_dev_journal_alloc(ca, true); if (ret) { - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_fs_journal_alloc); return ret; } } @@ -1344,21 +1398,18 @@ int bch2_fs_journal_alloc(struct bch_fs *c) static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) { - bool ret = false; - u64 seq; + guard(spinlock)(&j->lock); - spin_lock(&j->lock); - for (seq = journal_last_unwritten_seq(j); - seq <= journal_cur_seq(j) && !ret; + for (u64 seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); seq++) { struct journal_buf *buf = journal_seq_to_buf(j, seq); if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx)) - ret = true; + return true; } - spin_unlock(&j->lock); - return ret; + return false; } void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) @@ -1395,32 +1446,36 @@ void bch2_fs_journal_stop(struct journal *j) clear_bit(JOURNAL_running, &j->flags); } -int bch2_fs_journal_start(struct journal *j, u64 cur_seq) +int 
bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_entry_pin_list *p; struct journal_replay *i, **_i; struct genradix_iter iter; bool had_entries = false; - u64 last_seq = cur_seq, nr, seq; + + /* + * XXX: pick the most recent non-blacklisted sequence number + */ + + cur_seq = max(cur_seq, bch2_journal_last_blacklisted_seq(c)); if (cur_seq >= JOURNAL_SEQ_MAX) { bch_err(c, "cannot start: journal seq overflow"); return -EINVAL; } - genradix_for_each_reverse(&c->journal_entries, iter, _i) { - i = *_i; + /* Clean filesystem? */ + if (!last_seq) + last_seq = cur_seq; - if (journal_replay_ignore(i)) - continue; - - last_seq = le64_to_cpu(i->j.last_seq); - break; + u64 nr = cur_seq - last_seq; + if (nr * sizeof(struct journal_entry_pin_list) > 1U << 30) { + bch_err(c, "journal pin fifo too large (%llu open entries)", nr); + return bch_err_throw(c, ENOMEM_journal_pin_fifo); } - nr = cur_seq - last_seq; - /* * Extra fudge factor, in case we crashed when the journal pin fifo was * nearly or completely full. We'll need to be able to open additional @@ -1429,13 +1484,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) */ nr += nr / 4; - if (nr + 1 > j->pin.size) { - free_fifo(&j->pin); - init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); - if (!j->pin.data) { - bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); - return -BCH_ERR_ENOMEM_journal_pin_fifo; - } + nr = max(nr, JOURNAL_PIN); + init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); + if (!j->pin.data) { + bch_err(c, "error allocating journal fifo (%llu open entries)", nr); + return bch_err_throw(c, ENOMEM_journal_pin_fifo); } j->replay_journal_seq = last_seq; @@ -1448,6 +1501,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) j->pin.back = cur_seq; atomic64_set(&j->seq, cur_seq - 1); + u64 seq; fifo_for_each_entry_ptr(p, &j->pin, seq) journal_pin_list_init(p, 1); @@ -1478,13 +1532,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) if (!had_entries) j->last_empty_seq = cur_seq - 1; /* to match j->seq */ - spin_lock(&j->lock); - j->last_flush_write = jiffies; - - j->reservations.idx = journal_cur_seq(j); - - c->last_bucket_seq_cleanup = journal_cur_seq(j); - spin_unlock(&j->lock); + scoped_guard(spinlock, &j->lock) { + j->last_flush_write = jiffies; + j->reservations.idx = journal_cur_seq(j); + c->last_bucket_seq_cleanup = journal_cur_seq(j); + } return 0; } @@ -1495,13 +1547,12 @@ void bch2_journal_set_replay_done(struct journal *j) * journal_space_available must happen before setting JOURNAL_running * JOURNAL_running must happen before JOURNAL_replay_done */ - spin_lock(&j->lock); + guard(spinlock)(&j->lock); bch2_journal_space_available(j); set_bit(JOURNAL_need_flush_write, &j->flags); set_bit(JOURNAL_running, &j->flags); set_bit(JOURNAL_replay_done, &j->flags); - spin_unlock(&j->lock); } /* init/exit: */ @@ -1511,7 +1562,7 @@ void bch2_dev_journal_exit(struct bch_dev *ca) struct journal_device *ja = &ca->journal; for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { - kfree(ja->bio[i]); + kvfree(ja->bio[i]); ja->bio[i] = NULL; } @@ -1523,6 +1574,7 @@ void bch2_dev_journal_exit(struct bch_dev *ca) int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) { + struct bch_fs *c = ca->fs; struct journal_device *ja = &ca->journal; struct bch_sb_field_journal *journal_buckets = bch2_sb_field_get(sb, journal); @@ -1542,15 +1594,24 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct 
bch_sb *sb) ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->bucket_seq) - return -BCH_ERR_ENOMEM_dev_journal_init; + return bch_err_throw(c, ENOMEM_dev_journal_init); unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { - ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, + /* + * kvzalloc() is not what we want to be using here: + * JOURNAL_ENTRY_SIZE_MAX is probably quite a bit bigger than it + * needs to be. + * + * But changing that will require performance testing - + * performance can be sensitive to anything that affects journal + * pipelining. + */ + ja->bio[i] = kvzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, nr_bvecs), GFP_KERNEL); if (!ja->bio[i]) - return -BCH_ERR_ENOMEM_dev_journal_init; + return bch_err_throw(c, ENOMEM_dev_journal_init); ja->bio[i]->ca = ca; ja->bio[i]->buf_idx = i; @@ -1559,7 +1620,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->buckets) - return -BCH_ERR_ENOMEM_dev_journal_init; + return bch_err_throw(c, ENOMEM_dev_journal_init); if (journal_buckets_v2) { unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); @@ -1590,7 +1651,7 @@ void bch2_fs_journal_exit(struct journal *j) free_fifo(&j->pin); } -int bch2_fs_journal_init(struct journal *j) +void bch2_fs_journal_init_early(struct journal *j) { static struct lock_class_key res_key; @@ -1609,24 +1670,24 @@ int bch2_fs_journal_init(struct journal *j) atomic64_set(&j->reservations.counter, ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); +} - if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) - return -BCH_ERR_ENOMEM_journal_pin_fifo; +int bch2_fs_journal_init(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN; j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL); if (!j->free_buf) - return -BCH_ERR_ENOMEM_journal_buf; + return bch_err_throw(c, ENOMEM_journal_buf); for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) j->buf[i].idx = i; - j->pin.front = j->pin.back = 1; - j->wq = alloc_workqueue("bcachefs_journal", WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512); if (!j->wq) - return -BCH_ERR_ENOMEM_fs_other_alloc; + return bch_err_throw(c, ENOMEM_fs_other_alloc); return 0; } @@ -1648,9 +1709,10 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 28); - out->atomic++; - rcu_read_lock(); + guard(printbuf_atomic)(out); + guard(rcu)(); + s = READ_ONCE(j->reservations); prt_printf(out, "flags:\t"); @@ -1740,15 +1802,10 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) } prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required); - - rcu_read_unlock(); - - --out->atomic; } void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) { - spin_lock(&j->lock); + guard(spinlock)(&j->lock); __bch2_journal_debug_to_text(out, j); - spin_unlock(&j->lock); } diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 641e20c05a14..b46b9718d841 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -297,9 +297,8 @@ static inline void bch2_journal_buf_put(struct journal *j, u64 seq) s = journal_state_buf_put(j, idx); if (!journal_state_count(s, idx)) { - spin_lock(&j->lock); + guard(spinlock)(&j->lock); 
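+		/* lock dropped automatically at scope exit, per <linux/cleanup.h> */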
bch2_journal_buf_put_final(j, seq); - spin_unlock(&j->lock); } else if (unlikely(s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)) wake_up(&j->wait); } @@ -426,8 +425,8 @@ int bch2_journal_flush(struct journal *); bool bch2_journal_noflush_seq(struct journal *, u64, u64); int bch2_journal_meta(struct journal *); -void bch2_journal_halt(struct journal *); void bch2_journal_halt_locked(struct journal *); +void bch2_journal_halt(struct journal *); static inline int bch2_journal_error(struct journal *j) { @@ -444,20 +443,22 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); -int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, - unsigned nr); +int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned); +int bch2_dev_journal_bucket_delete(struct bch_dev *, u64); + int bch2_dev_journal_alloc(struct bch_dev *, bool); int bch2_fs_journal_alloc(struct bch_fs *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); void bch2_fs_journal_stop(struct journal *); -int bch2_fs_journal_start(struct journal *, u64); +int bch2_fs_journal_start(struct journal *, u64, u64); void bch2_journal_set_replay_done(struct journal *); void bch2_dev_journal_exit(struct bch_dev *); int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); void bch2_fs_journal_exit(struct journal *); +void bch2_fs_journal_init_early(struct journal *); int bch2_fs_journal_init(struct journal *); #endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index ded18a94ed02..093e4acad085 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -35,7 +35,8 @@ void bch2_journal_pos_from_member_info_set(struct bch_fs *c) void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) { - mutex_lock(&c->sb_lock); + guard(mutex)(&c->sb_lock); + for_each_member_device(c, ca) { struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); @@ -46,28 +47,28 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) if (offset <= ca->mi.bucket_size) ca->journal.sectors_free = ca->mi.bucket_size - offset; } - mutex_unlock(&c->sb_lock); } -void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) +static void bch2_journal_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct journal_ptr *p) +{ + CLASS(bch2_dev_tryget_noerror, ca)(c, p->dev); + prt_printf(out, "%s %u:%u:%u (sector %llu)", + ca ? 
ca->name : "(invalid dev)", + p->dev, p->bucket, p->bucket_offset, p->sector); +} + +void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j) { darray_for_each(j->ptrs, i) { if (i != j->ptrs.data) prt_printf(out, " "); - prt_printf(out, "%u:%u:%u (sector %llu)", - i->dev, i->bucket, i->bucket_offset, i->sector); + bch2_journal_ptr_to_text(out, c, i); } } -static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) +static void bch2_journal_datetime_to_text(struct printbuf *out, struct jset *j) { - prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); - - bch2_journal_ptrs_to_text(out, c, j); - - for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { + for_each_jset_entry_type(entry, j, BCH_JSET_ENTRY_datetime) { struct jset_entry_datetime *datetime = container_of(entry, struct jset_entry_datetime, entry); bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); @@ -75,6 +76,15 @@ static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, } } +static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); + bch2_journal_datetime_to_text(out, &j->j); + prt_char(out, ' '); + bch2_journal_ptrs_to_text(out, c, j); +} + static struct nonce journal_nonce(const struct jset *jset) { return (struct nonce) {{ @@ -146,9 +156,12 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, struct journal_replay **_i, *i, *dup; size_t bytes = vstruct_bytes(j); u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = JOURNAL_ENTRY_ADD_OK; + if (last_seq && c->opts.journal_rewind) + last_seq = min(last_seq, c->opts.journal_rewind); + if (!c->journal.oldest_seq_found_ondisk || le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk) c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq); @@ -188,7 +201,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, journal_entry_radix_idx(c, le64_to_cpu(j->seq)), GFP_KERNEL); if (!_i) - return -BCH_ERR_ENOMEM_journal_entry_add; + return bch_err_throw(c, ENOMEM_journal_entry_add); /* * Duplicate journal entries? If so we want the one that didn't have a @@ -209,7 +222,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, ret = darray_push(&dup->ptrs, entry_ptr); if (ret) - goto out; + return ret; bch2_journal_replay_to_text(&buf, c, dup); @@ -226,12 +239,12 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, if (entry_ptr.csum_good && !identical) goto replace; - goto out; + return ret; } replace: i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); if (!i) - return -BCH_ERR_ENOMEM_journal_entry_add; + return bch_err_throw(c, ENOMEM_journal_entry_add); darray_init(&i->ptrs); i->csum_good = entry_ptr.csum_good; @@ -249,9 +262,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, } *_i = i; -out: fsck_err: - printbuf_exit(&buf); return ret; } @@ -298,7 +309,7 @@ static void journal_entry_err_msg(struct printbuf *out, #define journal_entry_err(c, version, jset, entry, _err, msg, ...) 
\ ({ \ - struct printbuf _buf = PRINTBUF; \ + CLASS(printbuf, _buf)(); \ \ journal_entry_err_msg(&_buf, version, jset, entry); \ prt_printf(&_buf, msg, ##__VA_ARGS__); \ @@ -311,13 +322,12 @@ static void journal_entry_err_msg(struct printbuf *out, bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ if (bch2_fs_inconsistent(c, \ "corrupt metadata before write: %s\n", _buf.buf)) {\ - ret = -BCH_ERR_fsck_errors_not_fixed; \ + ret = bch_err_throw(c, fsck_errors_not_fixed); \ goto fsck_err; \ } \ break; \ } \ \ - printbuf_exit(&_buf); \ true; \ }) @@ -423,6 +433,17 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs bch2_prt_jset_entry_type(out, entry->type); prt_str(out, ": "); } + /* We may be called on entries that haven't been validated: */ + if (!k->k.u64s) { + prt_str(out, "(invalid, k->u64s 0)"); + break; + } + + if (bkey_next(k) > vstruct_last(entry)) { + prt_str(out, "(invalid, bkey overruns jset_entry)"); + break; + } + bch2_btree_id_level_to_text(out, entry->btree_id, entry->level); prt_char(out, ' '); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); @@ -599,7 +620,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); - struct printbuf err = PRINTBUF; + CLASS(printbuf, err)(); int ret = 0; if (journal_entry_err_on(bytes < sizeof(*u) || @@ -608,7 +629,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, journal_entry_data_usage_bad_size, "invalid journal entry usage: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); - goto out; + return 0; } if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err), @@ -616,11 +637,9 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, journal_entry_data_usage_bad_size, "invalid journal entry usage: %s", err.buf)) { journal_entry_null_range(entry, vstruct_next(entry)); - goto out; + return 0; } -out: fsck_err: - printbuf_exit(&err); return ret; } @@ -1005,19 +1024,19 @@ struct journal_read_buf { size_t size; }; -static int journal_read_buf_realloc(struct journal_read_buf *b, +static int journal_read_buf_realloc(struct bch_fs *c, struct journal_read_buf *b, size_t new_size) { void *n; /* the bios are sized for this many pages, max: */ if (new_size > JOURNAL_ENTRY_SIZE_MAX) - return -BCH_ERR_ENOMEM_journal_read_buf_realloc; + return bch_err_throw(c, ENOMEM_journal_read_buf_realloc); new_size = roundup_pow_of_two(new_size); n = kvmalloc(new_size, GFP_KERNEL); if (!n) - return -BCH_ERR_ENOMEM_journal_read_buf_realloc; + return bch_err_throw(c, ENOMEM_journal_read_buf_realloc); kvfree(b->data); b->data = n; @@ -1037,7 +1056,6 @@ static int journal_read_bucket(struct bch_dev *ca, u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), end = offset + ca->mi.bucket_size; bool saw_bad = false, csum_good; - struct printbuf err = PRINTBUF; int ret = 0; pr_debug("reading %u", bucket); @@ -1053,7 +1071,7 @@ static int journal_read_bucket(struct bch_dev *ca, bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); if (!bio) - return -BCH_ERR_ENOMEM_journal_read_bucket; + return bch_err_throw(c, ENOMEM_journal_read_bucket); bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); bio->bi_iter.bi_sector = offset; @@ -1064,7 +1082,7 @@ static int journal_read_bucket(struct bch_dev *ca, kfree(bio); if (!ret && bch2_meta_read_fault("journal")) - ret = -BCH_ERR_EIO_fault_injected; + ret = bch_err_throw(c, 
EIO_fault_injected); bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !ret); @@ -1078,7 +1096,7 @@ static int journal_read_bucket(struct bch_dev *ca, * found on a different device, and missing or * no journal entries will be handled later */ - goto out; + return 0; } j = buf->data; @@ -1092,15 +1110,15 @@ static int journal_read_bucket(struct bch_dev *ca, break; case JOURNAL_ENTRY_REREAD: if (vstruct_bytes(j) > buf->size) { - ret = journal_read_buf_realloc(buf, + ret = journal_read_buf_realloc(c, buf, vstruct_bytes(j)); if (ret) - goto err; + return ret; } goto reread; case JOURNAL_ENTRY_NONE: if (!saw_bad) - goto out; + return 0; /* * On checksum error we don't really trust the size * field of the journal entry we read, so try reading @@ -1109,7 +1127,7 @@ static int journal_read_bucket(struct bch_dev *ca, sectors = block_sectors(c); goto next_block; default: - goto err; + return ret; } if (le64_to_cpu(j->seq) > ja->highest_seq_found) { @@ -1126,22 +1144,20 @@ static int journal_read_bucket(struct bch_dev *ca, * bucket: */ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) - goto out; + return 0; ja->bucket_seq[bucket] = le64_to_cpu(j->seq); - enum bch_csum_type csum_type = JSET_CSUM_TYPE(j); struct bch_csum csum; csum_good = jset_csum_good(c, j, &csum); bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); if (!csum_good) { - bch_err_dev_ratelimited(ca, "%s", - (printbuf_reset(&err), - prt_str(&err, "journal "), - bch2_csum_err_msg(&err, csum_type, j->csum, csum), - err.buf)); + /* + * Don't print an error here, we'll print the error + * later if we need this journal entry + */ saw_bad = true; } @@ -1150,16 +1166,16 @@ static int journal_read_bucket(struct bch_dev *ca, vstruct_end(j) - (void *) j->encrypted_start); bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)); - mutex_lock(&jlist->lock); - ret = journal_entry_add(c, ca, (struct journal_ptr) { - .csum_good = csum_good, - .dev = ca->dev_idx, - .bucket = bucket, - .bucket_offset = offset - - bucket_to_sector(ca, ja->buckets[bucket]), - .sector = offset, - }, jlist, j); - mutex_unlock(&jlist->lock); + scoped_guard(mutex, &jlist->lock) + ret = journal_entry_add(c, ca, (struct journal_ptr) { + .csum_good = csum_good, + .csum = csum, + .dev = ca->dev_idx, + .bucket = bucket, + .bucket_offset = offset - + bucket_to_sector(ca, ja->buckets[bucket]), + .sector = offset, + }, jlist, j); switch (ret) { case JOURNAL_ENTRY_ADD_OK: @@ -1167,7 +1183,7 @@ static int journal_read_bucket(struct bch_dev *ca, case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: break; default: - goto err; + return ret; } next_block: pr_debug("next"); @@ -1176,11 +1192,7 @@ static int journal_read_bucket(struct bch_dev *ca, j = ((void *) j) + (sectors << 9); } -out: - ret = 0; -err: - printbuf_exit(&err); - return ret; + return 0; } static CLOSURE_CALLBACK(bch2_journal_read_device) @@ -1197,7 +1209,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) if (!ja->nr) goto out; - ret = journal_read_buf_realloc(&buf, PAGE_SIZE); + ret = journal_read_buf_realloc(c, &buf, PAGE_SIZE); if (ret) goto err; @@ -1219,25 +1231,125 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) out: bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); kvfree(buf.data); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_journal_read); closure_return(cl); return; err: - mutex_lock(&jlist->lock); - jlist->ret = ret; - mutex_unlock(&jlist->lock); + scoped_guard(mutex, &jlist->lock) + 
jlist->ret = ret; goto out; } +noinline_for_stack +static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j) +{ + CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); + + enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j); + bool have_good = false; + + prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq)); + bch2_journal_datetime_to_text(&buf, &j->j); + prt_newline(&buf); + + darray_for_each(j->ptrs, ptr) + if (!ptr->csum_good) { + bch2_journal_ptr_to_text(&buf, c, ptr); + prt_char(&buf, ' '); + bch2_csum_to_text(&buf, csum_type, ptr->csum); + prt_newline(&buf); + } else { + have_good = true; + } + + prt_printf(&buf, "should be "); + bch2_csum_to_text(&buf, csum_type, j->j.csum); + + if (have_good) + prt_printf(&buf, "\n(had good copy on another device)"); + + bch2_print_str(c, KERN_ERR, buf.buf); +} + +struct u64_range bch2_journal_entry_missing_range(struct bch_fs *c, u64 start, u64 end) +{ + BUG_ON(start > end); + + if (start == end) + return (struct u64_range) {}; + + start = bch2_journal_seq_next_nonblacklisted(c, start); + if (start >= end) + return (struct u64_range) {}; + + struct u64_range missing = { + .start = start, + .end = min(end, bch2_journal_seq_next_blacklisted(c, start)), + }; + + if (missing.start == missing.end) + return (struct u64_range) {}; + + return missing; +} + +noinline_for_stack +static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq) +{ + int ret = 0; + + struct genradix_iter radix_iter; + struct journal_replay *i, **_i, *prev = NULL; + /* Sequence number we expect to find next, to check for missing entries */ + u64 seq = start_seq; + + genradix_for_each(&c->journal_entries, radix_iter, _i) { + i = *_i; + + if (journal_replay_ignore(i)) + continue; + + BUG_ON(seq > le64_to_cpu(i->j.seq)); + + struct u64_range missing; + + while ((missing = bch2_journal_entry_missing_range(c, seq, le64_to_cpu(i->j.seq))).start) { + CLASS(printbuf, buf)(); + prt_printf(&buf, "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", + missing.start, missing.end - 1, + start_seq, end_seq); + + if (prev) { + prt_printf(&buf, "\n%llu at ", le64_to_cpu(prev->j.seq)); + bch2_journal_ptrs_to_text(&buf, c, prev); + prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); + } + + prt_printf(&buf, "\n%llu at ", le64_to_cpu(i->j.seq)); + bch2_journal_ptrs_to_text(&buf, c, i); + prt_printf(&buf, ", continue?"); + + fsck_err(c, journal_entries_missing, "%s", buf.buf); + + seq = missing.end; + } + + prev = i; + seq = le64_to_cpu(i->j.seq) + 1; + } +fsck_err: + return ret; +} + int bch2_journal_read(struct bch_fs *c, u64 *last_seq, u64 *blacklist_seq, u64 *start_seq) { struct journal_list jlist; - struct journal_replay *i, **_i, *prev = NULL; + struct journal_replay *i, **_i; struct genradix_iter radix_iter; - struct printbuf buf = PRINTBUF; bool degraded = false, last_write_torn = false; u64 seq; int ret = 0; @@ -1254,7 +1366,8 @@ int bch2_journal_read(struct bch_fs *c, if ((ca->mi.state == BCH_MEMBER_STATE_rw || ca->mi.state == BCH_MEMBER_STATE_ro) && - percpu_ref_tryget(&ca->io_ref[READ])) + enumerated_ref_tryget(&ca->io_ref[READ], + BCH_DEV_READ_REF_journal_read)) closure_call(&ca->journal.read, bch2_journal_read_device, system_unbound_wq, @@ -1325,14 +1438,27 @@ int bch2_journal_read(struct bch_fs *c, return 0; } - bch_info(c, "journal read done, replaying entries %llu-%llu", - *last_seq, *blacklist_seq - 1); + u64 drop_before = *last_seq; + { + CLASS(printbuf, buf)(); + prt_printf(&buf, "journal read done, replaying entries %llu-%llu", + *last_seq, *blacklist_seq - 1); + + /* + * Drop blacklisted entries and entries older than last_seq (or start of + * journal rewind: + */ + if (c->opts.journal_rewind) { + drop_before = min(drop_before, c->opts.journal_rewind); + prt_printf(&buf, " (rewinding from %llu)", c->opts.journal_rewind); + } - if (*start_seq != *blacklist_seq) - bch_info(c, "dropped unflushed entries %llu-%llu", - *blacklist_seq, *start_seq - 1); + *last_seq = drop_before; + if (*start_seq != *blacklist_seq) + prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1); + bch_info(c, "%s", buf.buf); + } - /* Drop blacklisted entries and entries older than last_seq: */ genradix_for_each(&c->journal_entries, radix_iter, _i) { i = *_i; @@ -1340,7 +1466,7 @@ int bch2_journal_read(struct bch_fs *c, continue; seq = le64_to_cpu(i->j.seq); - if (seq < *last_seq) { + if (seq < drop_before) { journal_replay_free(c, i, false); continue; } @@ -1353,59 +1479,12 @@ int bch2_journal_read(struct bch_fs *c, } } - /* Check for missing entries: */ - seq = *last_seq; - genradix_for_each(&c->journal_entries, radix_iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - BUG_ON(seq > le64_to_cpu(i->j.seq)); - - while (seq < le64_to_cpu(i->j.seq)) { - u64 missing_start, missing_end; - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; - - while (seq < le64_to_cpu(i->j.seq) && - bch2_journal_seq_is_blacklisted(c, seq, false)) - seq++; - - if (seq == le64_to_cpu(i->j.seq)) - break; - - missing_start = seq; - - while (seq < le64_to_cpu(i->j.seq) && - !bch2_journal_seq_is_blacklisted(c, seq, false)) - seq++; - - if (prev) { - bch2_journal_ptrs_to_text(&buf1, c, prev); - prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); - } else - prt_printf(&buf1, "(none)"); - bch2_journal_ptrs_to_text(&buf2, c, i); - - missing_end = seq - 1; - fsck_err(c, journal_entries_missing, - "journal entries %llu-%llu missing! 
(replaying %llu-%llu)\n" - "prev at %s\n" - "next at %s, continue?", - missing_start, missing_end, - *last_seq, *blacklist_seq - 1, - buf1.buf, buf2.buf); - - printbuf_exit(&buf1); - printbuf_exit(&buf2); - } - - prev = i; - seq++; - } + ret = bch2_journal_check_for_missing(c, drop_before, *blacklist_seq - 1); + if (ret) + return ret; genradix_for_each(&c->journal_entries, radix_iter, _i) { - struct bch_replicas_padded replicas = { + union bch_replicas_padded replicas = { .e.data_type = BCH_DATA_journal, .e.nr_devs = 0, .e.nr_required = 1, @@ -1415,15 +1494,15 @@ int bch2_journal_read(struct bch_fs *c, if (journal_replay_ignore(i)) continue; - darray_for_each(i->ptrs, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - - if (!ptr->csum_good) - bch_err_dev_offset(ca, ptr->sector, - "invalid journal checksum, seq %llu%s", - le64_to_cpu(i->j.seq), - i->csum_good ? " (had good copy on another device)" : ""); - } + /* + * Don't print checksum errors until we know we're going to use + * a given journal entry: + */ + darray_for_each(i->ptrs, ptr) + if (!ptr->csum_good) { + bch2_journal_print_checksum_error(c, i); + break; + } ret = jset_validate(c, bch2_dev_have_ref(c, i->ptrs.data[0].dev), @@ -1431,14 +1510,14 @@ int bch2_journal_read(struct bch_fs *c, i->ptrs.data[0].sector, READ); if (ret) - goto err; + return ret; darray_for_each(i->ptrs, ptr) replicas_entry_add_dev(&replicas.e, ptr->dev); bch2_replicas_entry_sort(&replicas.e); - printbuf_reset(&buf); + CLASS(printbuf, buf)(); bch2_replicas_entry_to_text(&buf, &replicas.e); if (!degraded && @@ -1449,12 +1528,10 @@ int bch2_journal_read(struct bch_fs *c, le64_to_cpu(i->j.seq), buf.buf))) { ret = bch2_mark_replicas(c, &replicas.e); if (ret) - goto err; + return ret; } } -err: fsck_err: - printbuf_exit(&buf); return ret; } @@ -1466,6 +1543,7 @@ static void journal_advance_devs_to_next_bucket(struct journal *j, { struct bch_fs *c = container_of(j, struct bch_fs, journal); + guard(rcu)(); darray_for_each(*devs, i) { struct bch_dev *ca = rcu_dereference(c->devs[*i]); if (!ca) @@ -1499,7 +1577,8 @@ static void __journal_write_alloc(struct journal *j, struct bch_fs *c = container_of(j, struct bch_fs, journal); darray_for_each(*devs, i) { - struct bch_dev *ca = rcu_dereference(c->devs[*i]); + struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE, + BCH_DEV_WRITE_REF_journal_write); if (!ca) continue; @@ -1513,8 +1592,10 @@ static void __journal_write_alloc(struct journal *j, ca->mi.state != BCH_MEMBER_STATE_rw || !ja->nr || bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || - sectors > ja->sectors_free) + sectors > ja->sectors_free) { + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); continue; + } bch2_dev_stripe_increment(ca, &j->wp.stripe); @@ -1537,15 +1618,8 @@ static void __journal_write_alloc(struct journal *j, } } -/** - * journal_write_alloc - decide where to write next journal entry - * - * @j: journal object - * @w: journal buf (entry to be written) - * - * Returns: 0 on success, or -BCH_ERR_insufficient_devices on failure - */ -static int journal_write_alloc(struct journal *j, struct journal_buf *w) +static int journal_write_alloc(struct journal *j, struct journal_buf *w, + unsigned *replicas) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_devs_mask devs; @@ -1553,29 +1627,18 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) unsigned sectors = vstruct_sectors(w->data, c->block_bits); unsigned target = c->opts.metadata_target ?: 
c->opts.foreground_target; - unsigned replicas = 0, replicas_want = - READ_ONCE(c->opts.metadata_replicas); + unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas); unsigned replicas_need = min_t(unsigned, replicas_want, READ_ONCE(c->opts.metadata_replicas_required)); bool advance_done = false; - rcu_read_lock(); - - /* We might run more than once if we have to stop and do discards: */ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key)); - bkey_for_each_ptr(ptrs, p) { - struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev); - if (ca) - replicas += ca->mi.durability; - } - retry_target: devs = target_rw_devs(c, BCH_DATA_journal, target); - devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); + bch2_dev_alloc_list(c, &j->wp.stripe, &devs, &devs_sorted); retry_alloc: - __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); + __journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want); - if (likely(replicas >= replicas_want)) + if (likely(*replicas >= replicas_want)) goto done; if (!advance_done) { @@ -1584,18 +1647,26 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) goto retry_alloc; } - if (replicas < replicas_want && target) { + if (*replicas < replicas_want && target) { /* Retry from all devices: */ target = 0; advance_done = false; goto retry_target; } done: - rcu_read_unlock(); - BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); - return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; +#if 0 + /* + * XXX: we need a way to alert the user when we go degraded for any + * reason + */ + if (*replicas < min(replicas_want, + dev_mask_nr(&c->rw_devs[BCH_DATA_free]))) { + } +#endif + + return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; } static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) @@ -1620,10 +1691,10 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) memcpy(new_buf, buf->data, buf->buf_size); - spin_lock(&j->lock); - swap(buf->data, new_buf); - swap(buf->buf_size, new_size); - spin_unlock(&j->lock); + scoped_guard(spinlock, &j->lock) { + swap(buf->data, new_buf); + swap(buf->buf_size, new_size); + } kvfree(new_buf); } @@ -1633,7 +1704,7 @@ static CLOSURE_CALLBACK(journal_write_done) closure_type(w, struct journal_buf, io); struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_replicas_padded replicas; + union bch_replicas_padded replicas; u64 seq = le64_to_cpu(w->data->seq); int err = 0; @@ -1642,17 +1713,27 @@ static CLOSURE_CALLBACK(journal_write_done) : j->noflush_write_time, j->write_start_time); if (!w->devs_written.nr) { - if (!bch2_journal_error(j)) - bch_err(c, "unable to write journal to sufficient devices"); - err = -BCH_ERR_journal_write_err; + err = bch_err_throw(c, journal_write_err); } else { bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, w->devs_written); err = bch2_mark_replicas(c, &replicas.e); } - if (err) - bch2_fatal_error(c); + if (err && !bch2_journal_error(j)) { + CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); + + if (err == -BCH_ERR_journal_write_err) + prt_printf(&buf, "unable to write journal to sufficient devices\n"); + else + prt_printf(&buf, "journal write error marking replicas: %s\n", + bch2_err_str(err)); + + bch2_fs_emergency_read_only2(c, &buf); + + bch2_print_str(c, KERN_ERR, buf.buf); + } closure_debug_destroy(cl); @@ -1694,6 +1775,7 @@ static 
CLOSURE_CALLBACK(journal_write_done) closure_wake_up(&c->freelist_wait); bch2_reset_alloc_cursors(c); + do_discards = true; } j->seq_ondisk = seq; @@ -1745,6 +1827,8 @@ static CLOSURE_CALLBACK(journal_write_done) if (do_discards) bch2_do_discards(c); + + closure_put(&c->cl); } static void journal_write_endio(struct bio *bio) @@ -1770,7 +1854,7 @@ static void journal_write_endio(struct bio *bio) } closure_put(&w->io); - percpu_ref_put(&ca->io_ref[WRITE]); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); } static CLOSURE_CALLBACK(journal_write_submit) @@ -1781,12 +1865,7 @@ static CLOSURE_CALLBACK(journal_write_submit) unsigned sectors = vstruct_sectors(w->data, c->block_bits); extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); - if (!ca) { - /* XXX: fix this */ - bch_err(c, "missing device %u for journal write", ptr->dev); - continue; - } + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], sectors); @@ -1797,7 +1876,11 @@ static CLOSURE_CALLBACK(journal_write_submit) jbio->submit_time = local_clock(); - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); + /* + * blk-wbt.c throttles all writes except those that have both + * REQ_SYNC and REQ_IDLE set... + */ + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_IDLE|REQ_META); bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; bio->bi_private = ca; @@ -1844,8 +1927,9 @@ static CLOSURE_CALLBACK(journal_write_preflush) } if (w->separate_flush) { - for_each_rw_member(c, ca) { - percpu_ref_get(&ca->io_ref[WRITE]); + for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_write) { + enumerated_ref_get(&ca->io_ref[WRITE], + BCH_DEV_WRITE_REF_journal_write); struct journal_device *ja = &ca->journal; struct bio *bio = &ja->bio[w->idx]->bio; @@ -1872,9 +1956,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) struct jset_entry *start, *end; struct jset *jset = w->data; struct journal_keys_to_wb wb = { NULL }; - unsigned sectors, bytes, u64s; + unsigned u64s; unsigned long btree_roots_have = 0; - bool validate_before_checksum = false; u64 seq = le64_to_cpu(jset->seq); int ret; @@ -1937,9 +2020,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) } } - spin_lock(&c->journal.lock); - w->need_flush_to_write_buffer = false; - spin_unlock(&c->journal.lock); + scoped_guard(spinlock, &c->journal.lock) + w->need_flush_to_write_buffer = false; start = end = vstruct_last(jset); @@ -1957,8 +2039,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) le32_add_cpu(&jset->u64s, u64s); - sectors = vstruct_sectors(jset, c->block_bits); - bytes = vstruct_bytes(jset); + unsigned sectors = vstruct_sectors(jset, c->block_bits); if (sectors > w->sectors) { bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)", @@ -1967,6 +2048,17 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) return -EINVAL; } + return 0; +} + +static int bch2_journal_write_checksum(struct journal *j, struct journal_buf *w) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct jset *jset = w->data; + u64 seq = le64_to_cpu(jset->seq); + bool validate_before_checksum = false; + int ret = 0; + jset->magic = cpu_to_le64(jset_magic(c)); jset->version = cpu_to_le32(c->sb.version); @@ -1989,7 +2081,7 @@ static int 
bch2_journal_write_prep(struct journal *j, struct journal_buf *w) ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset->encrypted_start, vstruct_end(jset) - (void *) jset->encrypted_start); - if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret))) + if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret))) return ret; jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), @@ -1999,6 +2091,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) (ret = jset_validate(c, NULL, jset, 0, WRITE))) return ret; + unsigned sectors = vstruct_sectors(jset, c->block_bits); + unsigned bytes = vstruct_bytes(jset); memset((void *) jset + bytes, 0, (sectors << 9) - bytes); return 0; } @@ -2054,13 +2148,10 @@ CLOSURE_CALLBACK(bch2_journal_write) closure_type(w, struct journal_buf, io); struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_replicas_padded replicas; - unsigned nr_rw_members = 0; + union bch_replicas_padded replicas; + unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]); int ret; - for_each_rw_member(c, ca) - nr_rw_members++; - BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); BUG_ON(!w->write_started); BUG_ON(w->write_allocated); @@ -2068,71 +2159,60 @@ CLOSURE_CALLBACK(bch2_journal_write) j->write_start_time = local_clock(); - spin_lock(&j->lock); - if (nr_rw_members > 1) - w->separate_flush = true; + scoped_guard(spinlock, &j->lock) { + if (nr_rw_members > 1) + w->separate_flush = true; - ret = bch2_journal_write_pick_flush(j, w); - spin_unlock(&j->lock); - if (ret) + ret = bch2_journal_write_pick_flush(j, w); + } + + if (unlikely(ret)) goto err; - mutex_lock(&j->buf_lock); - journal_buf_realloc(j, w); + scoped_guard(mutex, &j->buf_lock) { + journal_buf_realloc(j, w); - ret = bch2_journal_write_prep(j, w); - mutex_unlock(&j->buf_lock); - if (ret) - goto err; + ret = bch2_journal_write_prep(j, w); + } - j->entry_bytes_written += vstruct_bytes(w->data); + if (unlikely(ret)) + goto err; + unsigned replicas_allocated = 0; while (1) { - spin_lock(&j->lock); - ret = journal_write_alloc(j, w); + ret = journal_write_alloc(j, w, &replicas_allocated); if (!ret || !j->can_discard) break; - spin_unlock(&j->lock); bch2_journal_do_discards(j); } - if (ret && !bch2_journal_error(j)) { - struct printbuf buf = PRINTBUF; - buf.atomic++; + if (unlikely(ret)) + goto err_allocate_write; - __bch2_journal_debug_to_text(&buf, j); - spin_unlock(&j->lock); - prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), - le64_to_cpu(w->data->seq), - vstruct_sectors(w->data, c->block_bits), - bch2_err_str(ret)); - bch2_print_string_as_lines(KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - if (ret) + ret = bch2_journal_write_checksum(j, w); + if (unlikely(ret)) goto err; - /* - * write is allocated, no longer need to account for it in - * bch2_journal_space_available(): - */ - w->sectors = 0; - w->write_allocated = true; + scoped_guard(spinlock, &j->lock) { + /* + * write is allocated, no longer need to account for it in + * bch2_journal_space_available(): + */ + w->sectors = 0; + w->write_allocated = true; + j->entry_bytes_written += vstruct_bytes(w->data); - /* - * journal entry has been compacted and allocated, recalculate space - * available: - */ - bch2_journal_space_available(j); - bch2_journal_do_writes(j); - spin_unlock(&j->lock); + /* + * journal entry has been compacted and allocated, recalculate 
space + * available: + */ + bch2_journal_space_available(j); + bch2_journal_do_writes(j); + } w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); - if (c->opts.nochanges) - goto no_io; - /* * Mark journal replicas before we submit the write to guarantee * recovery will find the journal entries after a crash. @@ -2143,15 +2223,32 @@ CLOSURE_CALLBACK(bch2_journal_write) if (ret) goto err; + if (c->opts.nochanges) + goto no_io; + if (!JSET_NO_FLUSH(w->data)) continue_at(cl, journal_write_preflush, j->wq); else continue_at(cl, journal_write_submit, j->wq); return; -no_io: - continue_at(cl, journal_write_done, j->wq); - return; +err_allocate_write: + if (!bch2_journal_error(j)) { + CLASS(printbuf, buf)(); + + bch2_journal_debug_to_text(&buf, j); + prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), + le64_to_cpu(w->data->seq), + vstruct_sectors(w->data, c->block_bits), + bch2_err_str(ret)); + bch2_print_str(c, KERN_ERR, buf.buf); + } err: bch2_fatal_error(c); +no_io: + extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); + } + continue_at(cl, journal_write_done, j->wq); } diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 12b39fcb4424..f53c5c81d137 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -9,6 +9,7 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *); struct journal_ptr { bool csum_good; + struct bch_csum csum; u8 dev; u32 bucket; u32 bucket_offset; @@ -70,6 +71,13 @@ void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, struct journal_replay *); +struct u64_range { + u64 start; + u64 end; +}; + +struct u64_range bch2_journal_entry_missing_range(struct bch_fs *, u64, u64); + int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); CLOSURE_CALLBACK(bch2_journal_write); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index cc00b0fc40d8..f23e5ee9ad75 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -83,18 +83,20 @@ static struct journal_space journal_dev_space_available(struct journal *j, struct bch_dev *ca, enum journal_space_from from) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_device *ja = &ca->journal; unsigned sectors, buckets, unwritten; + unsigned bucket_size_aligned = round_down(ca->mi.bucket_size, block_sectors(c)); u64 seq; if (from == journal_space_total) return (struct journal_space) { - .next_entry = ca->mi.bucket_size, - .total = ca->mi.bucket_size * ja->nr, + .next_entry = bucket_size_aligned, + .total = bucket_size_aligned * ja->nr, }; buckets = bch2_journal_dev_buckets_available(j, ja, from); - sectors = ja->sectors_free; + sectors = round_down(ja->sectors_free, block_sectors(c)); /* * We that we don't allocate the space for a journal entry @@ -109,7 +111,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, continue; /* entry won't fit on this device, skip: */ - if (unwritten > ca->mi.bucket_size) + if (unwritten > bucket_size_aligned) continue; if (unwritten >= sectors) { @@ -119,7 +121,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, } buckets--; - sectors = ca->mi.bucket_size; + sectors = bucket_size_aligned; } sectors -= unwritten; @@ -127,12 +129,12 @@ journal_dev_space_available(struct journal *j, struct bch_dev 
*ca, if (sectors < ca->mi.bucket_size && buckets) { buckets--; - sectors = ca->mi.bucket_size; + sectors = bucket_size_aligned; } return (struct journal_space) { .next_entry = sectors, - .total = sectors + buckets * ca->mi.bucket_size, + .total = sectors + buckets * bucket_size_aligned, }; } @@ -146,7 +148,6 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); - rcu_read_lock(); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { if (!ca->journal.nr || !ca->mi.durability) @@ -164,11 +165,16 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne array_insert_item(dev_space, nr_devs, pos, space); } - rcu_read_unlock(); if (nr_devs < nr_devs_want) return (struct journal_space) { 0, 0 }; + /* + * It's possible for bucket size to be misaligned w.r.t. the filesystem + * block size: + */ + min_bucket_size = round_down(min_bucket_size, block_sectors(c)); + /* * We sorted largest to smallest, and we want the smallest out of the * @nr_devs_want largest devices: @@ -189,8 +195,8 @@ void bch2_journal_space_available(struct journal *j) int ret = 0; lockdep_assert_held(&j->lock); + guard(rcu)(); - rcu_read_lock(); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; @@ -210,24 +216,22 @@ void bch2_journal_space_available(struct journal *j) max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); nr_online++; } - rcu_read_unlock(); j->can_discard = can_discard; if (nr_online < metadata_replicas_required(c)) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" - "rw journal devs:", nr_online, metadata_replicas_required(c)); - - rcu_read_lock(); - for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) - prt_printf(&buf, " %s", ca->name); - rcu_read_unlock(); - - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); - ret = -BCH_ERR_insufficient_journal_devices; + if (!(c->sb.features & BIT_ULL(BCH_FEATURE_small_image))) { + CLASS(printbuf, buf)(); + guard(printbuf_atomic)(&buf); + prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" + "rw journal devs:", nr_online, metadata_replicas_required(c)); + + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) + prt_printf(&buf, " %s", ca->name); + + bch_err(c, "%s", buf.buf); + } + ret = bch_err_throw(c, insufficient_journal_devices); goto out; } @@ -241,7 +245,7 @@ void bch2_journal_space_available(struct journal *j) total = j->space[journal_space_total].total; if (!j->space[journal_space_discarded].next_entry) - ret = -BCH_ERR_journal_full; + ret = bch_err_throw(c, journal_full); if ((j->space[journal_space_clean_ondisk].next_entry < j->space[journal_space_clean_ondisk].total) && @@ -254,8 +258,7 @@ void bch2_journal_space_available(struct journal *j) bch2_journal_set_watermark(j); out: j->cur_entry_sectors = !ret - ? round_down(j->space[journal_space_discarded].next_entry, - block_sectors(c)) + ? 
j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; @@ -276,11 +279,8 @@ static bool __should_discard_bucket(struct journal *j, struct journal_device *ja static bool should_discard_bucket(struct journal *j, struct journal_device *ja) { - spin_lock(&j->lock); - bool ret = __should_discard_bucket(j, ja); - spin_unlock(&j->lock); - - return ret; + guard(spinlock)(&j->lock); + return __should_discard_bucket(j, ja); } /* @@ -291,29 +291,26 @@ void bch2_journal_do_discards(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - mutex_lock(&j->discard_lock); + guard(mutex)(&j->discard_lock); - for_each_rw_member(c, ca) { + for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_do_discards) { struct journal_device *ja = &ca->journal; while (should_discard_bucket(j, ja)) { if (!c->opts.nochanges && - ca->mi.discard && + bch2_discard_opt_enabled(c, ca) && bdev_max_discard_sectors(ca->disk_sb.bdev)) blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, ja->buckets[ja->discard_idx]), ca->mi.bucket_size, GFP_NOFS); - spin_lock(&j->lock); - ja->discard_idx = (ja->discard_idx + 1) % ja->nr; - - bch2_journal_space_available(j); - spin_unlock(&j->lock); + scoped_guard(spinlock, &j->lock) { + ja->discard_idx = (ja->discard_idx + 1) % ja->nr; + bch2_journal_space_available(j); + } } } - - mutex_unlock(&j->discard_lock); } /* @@ -354,9 +351,8 @@ bool __bch2_journal_pin_put(struct journal *j, u64 seq) void bch2_journal_pin_put(struct journal *j, u64 seq) { if (__bch2_journal_pin_put(j, seq)) { - spin_lock(&j->lock); + guard(spinlock)(&j->lock); bch2_journal_reclaim_fast(j); - spin_unlock(&j->lock); } } @@ -389,10 +385,9 @@ static inline bool __journal_pin_drop(struct journal *j, void bch2_journal_pin_drop(struct journal *j, struct journal_entry_pin *pin) { - spin_lock(&j->lock); + guard(spinlock)(&j->lock); if (__journal_pin_drop(j, pin)) bch2_journal_reclaim_fast(j); - spin_unlock(&j->lock); } static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin, @@ -439,7 +434,7 @@ void bch2_journal_pin_copy(struct journal *j, struct journal_entry_pin *src, journal_pin_flush_fn flush_fn) { - spin_lock(&j->lock); + guard(spinlock)(&j->lock); u64 seq = READ_ONCE(src->seq); @@ -450,7 +445,6 @@ void bch2_journal_pin_copy(struct journal *j, * longer to exist, but that means there's no longer anything to * copy and we can bail out here: */ - spin_unlock(&j->lock); return; } @@ -467,31 +461,32 @@ void bch2_journal_pin_copy(struct journal *j, */ if (seq == journal_last_seq(j)) journal_wake(j); - spin_unlock(&j->lock); } void bch2_journal_pin_set(struct journal *j, u64 seq, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) { - spin_lock(&j->lock); + bool wake; - BUG_ON(seq < journal_last_seq(j)); + scoped_guard(spinlock, &j->lock) { + BUG_ON(seq < journal_last_seq(j)); - bool reclaim = __journal_pin_drop(j, pin); + bool reclaim = __journal_pin_drop(j, pin); - bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn)); + bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn)); - if (reclaim) - bch2_journal_reclaim_fast(j); - /* - * If the journal is currently full, we might want to call flush_fn - * immediately: - */ - if (seq == journal_last_seq(j)) - journal_wake(j); + if (reclaim) + bch2_journal_reclaim_fast(j); + /* + * If the journal is currently full, we might want to call flush_fn + * immediately: + */ + wake = seq == journal_last_seq(j); + } - spin_unlock(&j->lock); + if (wake) + 
journal_wake(j); } /** @@ -576,17 +571,17 @@ static size_t journal_flush_pins(struct journal *j, j->last_flushed = jiffies; - spin_lock(&j->lock); - pin = journal_get_next_pin(j, seq_to_flush, - allowed_below, - allowed_above, &seq); - if (pin) { - BUG_ON(j->flush_in_progress); - j->flush_in_progress = pin; - j->flush_in_progress_dropped = false; - flush_fn = pin->flush; + scoped_guard(spinlock, &j->lock) { + pin = journal_get_next_pin(j, seq_to_flush, + allowed_below, + allowed_above, &seq); + if (pin) { + BUG_ON(j->flush_in_progress); + j->flush_in_progress = pin; + j->flush_in_progress_dropped = false; + flush_fn = pin->flush; + } } - spin_unlock(&j->lock); if (!pin) break; @@ -599,13 +594,13 @@ static size_t journal_flush_pins(struct journal *j, err = flush_fn(j, pin, seq); - spin_lock(&j->lock); - /* Pin might have been dropped or rearmed: */ - if (likely(!err && !j->flush_in_progress_dropped)) - list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]); - j->flush_in_progress = NULL; - j->flush_in_progress_dropped = false; - spin_unlock(&j->lock); + scoped_guard(spinlock, &j->lock) { + /* Pin might have been dropped or rearmed: */ + if (likely(!err && !j->flush_in_progress_dropped)) + list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]); + j->flush_in_progress = NULL; + j->flush_in_progress_dropped = false; + } wake_up(&j->pin_flush_wait); @@ -623,9 +618,10 @@ static u64 journal_seq_to_flush(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); u64 seq_to_flush = 0; - spin_lock(&j->lock); + guard(spinlock)(&j->lock); + guard(rcu)(); - for_each_rw_member(c, ca) { + for_each_rw_member_rcu(c, ca) { struct journal_device *ja = &ca->journal; unsigned nr_buckets, bucket_to_flush; @@ -635,20 +631,15 @@ static u64 journal_seq_to_flush(struct journal *j) /* Try to keep the journal at most half full: */ nr_buckets = ja->nr / 2; - nr_buckets = min(nr_buckets, ja->nr); - bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; seq_to_flush = max(seq_to_flush, ja->bucket_seq[bucket_to_flush]); } /* Also flush if the pin fifo is more than half full */ - seq_to_flush = max_t(s64, seq_to_flush, - (s64) journal_cur_seq(j) - - (j->pin.size >> 1)); - spin_unlock(&j->lock); - - return seq_to_flush; + return max_t(s64, seq_to_flush, + (s64) journal_cur_seq(j) - + (j->pin.size >> 1)); } /** @@ -699,6 +690,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) if (ret) break; + /* XXX shove journal discards off to another thread */ bch2_journal_do_discards(j); seq_to_flush = journal_seq_to_flush(j); @@ -769,9 +761,8 @@ static int bch2_journal_reclaim_thread(void *arg) j->reclaim_kicked = false; - mutex_lock(&j->reclaim_lock); - ret = __bch2_journal_reclaim(j, false, kicked); - mutex_unlock(&j->reclaim_lock); + scoped_guard(mutex, &j->reclaim_lock) + ret = __bch2_journal_reclaim(j, false, kicked); now = jiffies; delay = msecs_to_jiffies(c->opts.journal_reclaim_delay); @@ -787,9 +778,8 @@ static int bch2_journal_reclaim_thread(void *arg) if (j->reclaim_kicked) break; - spin_lock(&j->lock); - journal_empty = fifo_empty(&j->pin); - spin_unlock(&j->lock); + scoped_guard(spinlock, &j->lock) + journal_empty = fifo_empty(&j->pin); long timeout = j->next_reclaim - jiffies; @@ -843,10 +833,10 @@ int bch2_journal_reclaim_start(struct journal *j) static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush, unsigned types) { + guard(spinlock)(&j->lock); + struct journal_entry_pin_list 
*pin_list; u64 seq; - - spin_lock(&j->lock); fifo_for_each_entry_ptr(pin_list, &j->pin, seq) { if (seq > seq_to_flush) break; @@ -854,12 +844,9 @@ static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush, for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) if ((BIT(i) & types) && (!list_empty(&pin_list->unflushed[i]) || - !list_empty(&pin_list->flushed[i]))) { - spin_unlock(&j->lock); + !list_empty(&pin_list->flushed[i]))) return true; - } } - spin_unlock(&j->lock); return false; } @@ -880,32 +867,54 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, if (ret) return ret; - mutex_lock(&j->reclaim_lock); + guard(mutex)(&j->reclaim_lock); for (int type = JOURNAL_PIN_TYPE_NR - 1; type >= 0; --type) if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) { *did_work = true; - goto unlock; + + /* + * Question from Dan Carpenter, on the early return: + * + * If journal_flush_pins_or_still_flushing() returns + * true, then the flush hasn't completed and we must + * return 0; we want the outer closure_wait_event() in + * journal_flush_pins() to continue. + * + * The early return is there because we don't want to + * call journal_entry_close() until we've finished + * flushing all outstanding journal pins - otherwise + * seq_to_flush can be U64_MAX, and we'll close a bunch + * of journal entries and write tiny ones completely + * unnecessarily. + * + * Having the early return be in the loop where we loop + * over types is important, because flushing one journal + * pin can cause new journal pins to be added (even of + * the same type, btree node writes may generate more + * btree node writes, when updating the parent pointer + * hits a full node and has to trigger a split/compact). + * + * This is part of our shutdown sequence, where order of + * flushing is important in order to make sure that it + * terminates... + */ + return 0; } if (seq_to_flush > journal_cur_seq(j)) bch2_journal_entry_close(j); - spin_lock(&j->lock); /* * If journal replay hasn't completed, the unreplayed journal entries * hold refs on their corresponding sequence numbers */ + guard(spinlock)(&j->lock); ret = !test_bit(JOURNAL_replay_done, &j->flags) || journal_last_seq(j) > seq_to_flush || !fifo_used(&j->pin); - - spin_unlock(&j->lock); -unlock: - mutex_unlock(&j->reclaim_lock); - return ret; } @@ -930,13 +939,12 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) u64 iter, seq = 0; int ret = 0; - spin_lock(&j->lock); - fifo_for_each_entry_ptr(p, &j->pin, iter) - if (dev_idx >= 0 - ? bch2_dev_list_has_dev(p->devs, dev_idx) - : p->devs.nr < c->opts.metadata_replicas) - seq = iter; - spin_unlock(&j->lock); + scoped_guard(spinlock, &j->lock) + fifo_for_each_entry_ptr(p, &j->pin, iter) + if (dev_idx >= 0 + ?
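The control flow defended by the Dan Carpenter comment above reduces to the following shape. This is a simplified sketch: flush_one_type() is a hypothetical stand-in for journal_flush_pins_or_still_flushing(), and the entry-close and final recheck under j->lock are omitted:

	guard(mutex)(&j->reclaim_lock);

	/*
	 * Highest-priority pin types first; if anything was flushed or is
	 * still in flight, return 0 so the caller's closure_wait_event()
	 * loop comes back around - the journal entry must not be closed
	 * until every outstanding pin has been flushed.
	 */
	for (int type = JOURNAL_PIN_TYPE_NR - 1; type >= 0; --type)
		if (flush_one_type(j, seq_to_flush, BIT(type))) {
			*did_work = true;
			return 0;	/* guard() drops reclaim_lock here */
		}

Note that the early return is only safe because reclaim_lock is held by a guard(): the old goto unlock label existed solely to reach the explicit mutex_unlock().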
bch2_dev_list_has_dev(p->devs, dev_idx) + : p->devs.nr < c->opts.metadata_replicas) + seq = iter; bch2_journal_flush_pins(j, seq); @@ -944,7 +952,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) if (ret) return ret; - mutex_lock(&c->replicas_gc_lock); + guard(mutex)(&c->replicas_gc_lock); bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); /* @@ -959,29 +967,25 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) goto err; seq = 0; - spin_lock(&j->lock); - while (!ret) { - struct bch_replicas_padded replicas; + scoped_guard(spinlock, &j->lock) + while (!ret) { + union bch_replicas_padded replicas; - seq = max(seq, journal_last_seq(j)); - if (seq >= j->pin.back) - break; - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, - journal_seq_pin(j, seq)->devs); - seq++; + seq = max(seq, journal_last_seq(j)); + if (seq >= j->pin.back) + break; + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, + journal_seq_pin(j, seq)->devs); + seq++; - if (replicas.e.nr_devs) { - spin_unlock(&j->lock); - ret = bch2_mark_replicas(c, &replicas.e); - spin_lock(&j->lock); + if (replicas.e.nr_devs) { + spin_unlock(&j->lock); + ret = bch2_mark_replicas(c, &replicas.e); + spin_lock(&j->lock); + } } - } - spin_unlock(&j->lock); err: - ret = bch2_replicas_gc_end(c, ret); - mutex_unlock(&c->replicas_gc_lock); - - return ret; + return bch2_replicas_gc_end(c, ret); } bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) @@ -989,20 +993,16 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 struct journal_entry_pin_list *pin_list; struct journal_entry_pin *pin; - spin_lock(&j->lock); - if (!test_bit(JOURNAL_running, &j->flags)) { - spin_unlock(&j->lock); + guard(spinlock)(&j->lock); + guard(printbuf_atomic)(out); + + if (!test_bit(JOURNAL_running, &j->flags)) return true; - } *seq = max(*seq, j->pin.front); - if (*seq >= j->pin.back) { - spin_unlock(&j->lock); + if (*seq >= j->pin.back) return true; - } - - out->atomic++; pin_list = journal_seq_pin(j, *seq); @@ -1021,9 +1021,6 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 printbuf_indent_sub(out, 2); - --out->atomic; - spin_unlock(&j->lock); - return false; } diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index 62b910f2fb27..0cb9b93f13e7 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -210,7 +210,7 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, j = bch2_sb_field_resize(&ca->disk_sb, journal_v2, (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64)); if (!j) - return -BCH_ERR_ENOSPC_sb_journal; + return bch_err_throw(c, ENOSPC_sb_journal); bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index e463d2d95359..399db5b77d9f 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -49,7 +49,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) unsigned i = 0, nr; int ret = 0; - mutex_lock(&c->sb_lock); + guard(mutex)(&c->sb_lock); bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); nr = blacklist_nr_entries(bl); @@ -77,10 +77,8 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, sb_blacklist_u64s(nr + 1)); - if (!bl) { - ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist; - goto out; - } + if (!bl) + return 
bch_err_throw(c, ENOSPC_sb_journal_seq_blacklist); array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) { .start = cpu_to_le64(start), @@ -89,8 +87,6 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); ret = bch2_write_super(c); -out: - mutex_unlock(&c->sb_lock); return ret ?: bch2_blacklist_table_initialize(c); } @@ -103,6 +99,52 @@ static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r) return cmp_int(l->start, r->start); } +static int journal_seq_blacklist_table_end_cmp(const void *_l, const void *_r) +{ + const struct journal_seq_blacklist_table_entry *l = _l; + const struct journal_seq_blacklist_table_entry *r = _r; + + return cmp_int(l->end, r->end); +} + +u64 bch2_journal_seq_next_blacklisted(struct bch_fs *c, u64 seq) +{ + struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; + + if (!t) + return U64_MAX; + + struct journal_seq_blacklist_table_entry search = { .end = seq }; + int idx = eytzinger0_find_gt(t->entries, t->nr, + sizeof(t->entries[0]), + journal_seq_blacklist_table_end_cmp, + &search); + if (idx < 0) + return U64_MAX; + + return max(seq, t->entries[idx].start); +} + +u64 bch2_journal_seq_next_nonblacklisted(struct bch_fs *c, u64 seq) +{ + struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; + + if (!t) + return seq; + + while (true) { + struct journal_seq_blacklist_table_entry search = { .start = seq }; + int idx = eytzinger0_find_le(t->entries, t->nr, + sizeof(t->entries[0]), + journal_seq_blacklist_table_cmp, + &search); + if (idx < 0 || t->entries[idx].end <= seq) + return seq; + + seq = t->entries[idx].end; + } +} + bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, bool dirty) { @@ -130,6 +172,16 @@ bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, return true; } +u64 bch2_journal_last_blacklisted_seq(struct bch_fs *c) +{ + struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; + + if (!t || !t->nr) + return 0; + + return t->entries[eytzinger0_last(t->nr)].end - 1; +} + int bch2_blacklist_table_initialize(struct bch_fs *c) { struct bch_sb_field_journal_seq_blacklist *bl = @@ -142,7 +194,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL); if (!t) - return -BCH_ERR_ENOMEM_blacklist_table_init; + return bch_err_throw(c, ENOMEM_blacklist_table_init); t->nr = nr; diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h index d47636f96fdc..389b789b26f4 100644 --- a/fs/bcachefs/journal_seq_blacklist.h +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -11,7 +11,11 @@ blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) : 0; } +u64 bch2_journal_seq_next_blacklisted(struct bch_fs *, u64); +u64 bch2_journal_seq_next_nonblacklisted(struct bch_fs *, u64); + bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); +u64 bch2_journal_last_blacklisted_seq(struct bch_fs *); int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); int bch2_blacklist_table_initialize(struct bch_fs *); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 8e0eba776b9d..51104bbb99da 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -151,8 +151,6 @@ enum journal_flags { #undef x }; -typedef DARRAY(u64) darray_u64; - struct journal_bio { struct bch_dev *ca; unsigned buf_idx; diff --git 
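One subtlety in the new bch2_journal_seq_next_nonblacklisted() above: blacklist entries are not guaranteed to be coalesced, so skipping to the end of one range can land inside the next, hence the while loop. A worked example with hypothetical table contents:

	/*
	 * Entries (start, end), treated as half-open [start, end):
	 *
	 *	(10, 20), (20, 25)
	 *
	 * bch2_journal_seq_next_nonblacklisted(c, 12):
	 *	pass 1: 12 falls in [10, 20) -> seq = 20
	 *	pass 2: 20 falls in [20, 25) -> seq = 25
	 *	pass 3: no entry contains 25 -> return 25
	 */

The exclusive end is consistent with bch2_journal_last_blacklisted_seq() returning entries[last].end - 1, and with bch2_journal_seq_next_blacklisted() searching on end via eytzinger0_find_gt() over the end-sorted comparator.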
a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index 75f27ec26f85..38cdacc6b067 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -35,7 +35,7 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, { struct bch_fs *c = trans->c; u32 restart_count = trans->restart_count; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = 0; fsck_err_on(test_bit(BCH_FS_clean_recovery, &c->flags), @@ -56,21 +56,18 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, bch2_bkey_buf_exit(&sk, c); fsck_err: - printbuf_exit(&buf); return ret ?: trans_was_restarted(trans, restart_count); } int bch2_resume_logged_ops(struct bch_fs *c) { - int ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, + CLASS(btree_trans, trans)(c); + return for_each_btree_key_max(trans, iter, BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, 0), POS(LOGGED_OPS_INUM_logged_ops, U64_MAX), BTREE_ITER_prefetch, k, - resume_logged_op(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; + resume_logged_op(trans, &iter, k)); } static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) @@ -84,7 +81,7 @@ static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) k->k.p = iter.pos; ret = bch2_trans_update(trans, &iter, k, 0); - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } @@ -107,12 +104,11 @@ int bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k) */ if (ret) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); bch2_fs_fatal_error(c, "deleting logged operation %s: %s", buf.buf, bch2_err_str(ret)); - printbuf_exit(&buf); } return ret; diff --git a/fs/bcachefs/logged_ops.h b/fs/bcachefs/logged_ops.h index 30ae9ef737dd..6dea6e2ac7a8 100644 --- a/fs/bcachefs/logged_ops.h +++ b/fs/bcachefs/logged_ops.h @@ -10,7 +10,7 @@ static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op) { - return bch2_btree_insert_nonextent(trans, BTREE_ID_logged_ops, op, 0); + return bch2_btree_insert_trans(trans, BTREE_ID_logged_ops, op, BTREE_ITER_cached); } int bch2_resume_logged_ops(struct bch_fs *); diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 2f63fc6d456f..b9c0834498dd 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -9,6 +9,7 @@ #include "ec.h" #include "error.h" #include "lru.h" +#include "progress.h" #include "recovery.h" /* KEY_TYPE_lru is obsolete: */ @@ -86,11 +87,9 @@ int bch2_lru_check_set(struct btree_trans *trans, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct btree_iter lru_iter; - struct bkey_s_c lru_k = - bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, - lru_pos(lru_id, dev_bucket, time), 0); + CLASS(printbuf, buf)(); + CLASS(btree_iter, lru_iter)(trans, BTREE_ID_lru, lru_pos(lru_id, dev_bucket, time), 0); + struct bkey_s_c lru_k = bch2_btree_iter_peek_slot(&lru_iter); int ret = bkey_err(lru_k); if (ret) return ret; @@ -98,7 +97,7 @@ int bch2_lru_check_set(struct btree_trans *trans, if (lru_k.k->type != KEY_TYPE_set) { ret = bch2_btree_write_buffer_maybe_flush(trans, referring_k, last_flushed); if (ret) - goto err; + return ret; if (fsck_err(trans, alloc_key_to_missing_lru_entry, "missing %s lru entry\n%s", @@ -106,13 +105,10 @@ int bch2_lru_check_set(struct btree_trans *trans, (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) { ret = 
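The CLASS(printbuf, buf)() conversions in logged_ops.c rely on the DEFINE_CLASS machinery from linux/cleanup.h: the declaration pairs a constructor expression with a __cleanup destructor, so every return path tears the buffer down implicitly. Schematically, and assuming the class definition wires printbuf_exit() as the destructor (which is what the removed explicit calls suggest):

	/* what the declaration amounts to, conceptually: */
	struct printbuf buf __cleanup(printbuf_exit) = PRINTBUF;

	/* ...so early returns no longer leak the buffer: */
	if (ret)
		return ret;	/* printbuf_exit(&buf) runs automatically */

This is why the fsck_err: path in resume_logged_op() and the fatal-error path in bch2_logged_op_finish() can drop their printbuf_exit() calls without leaking.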
bch2_lru_set(trans, lru_id, dev_bucket, time); if (ret) - goto err; + return ret; } } -err: fsck_err: - bch2_trans_iter_exit(trans, &lru_iter); - printbuf_exit(&buf); return ret; } @@ -145,13 +141,11 @@ static u64 bkey_lru_type_idx(struct bch_fs *c, case BCH_LRU_fragmentation: { a = bch2_alloc_to_v4(k, &a_convert); - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); - u64 idx = ca + return ca ? alloc_lru_idx_fragmentation(*a, ca) : 0; - rcu_read_unlock(); - return idx; } case BCH_LRU_stripes: return k.k->type == KEY_TYPE_stripe @@ -168,16 +162,16 @@ static int bch2_check_lru_key(struct btree_trans *trans, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; + CLASS(printbuf, buf1)(); + CLASS(printbuf, buf2)(); struct bbpos bp = lru_pos_to_bp(lru_k); - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, bp.btree, bp.pos, 0); + CLASS(btree_iter, iter)(trans, bp.btree, bp.pos, 0); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); int ret = bkey_err(k); if (ret) - goto err; + return ret; enum bch_lru_type type = lru_type(lru_k); u64 idx = bkey_lru_type_idx(c, type, k); @@ -185,7 +179,7 @@ static int bch2_check_lru_key(struct btree_trans *trans, if (lru_pos_time(lru_k.k->p) != idx) { ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed); if (ret) - goto err; + return ret; if (fsck_err(trans, lru_entry_bad, "incorrect lru entry: lru %s time %llu\n" @@ -195,13 +189,9 @@ static int bch2_check_lru_key(struct btree_trans *trans, lru_pos_time(lru_k.k->p), (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) - ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); + return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); } -err: fsck_err: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf2); - printbuf_exit(&buf1); return ret; } @@ -212,14 +202,18 @@ int bch2_check_lrus(struct bch_fs *c) bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_lru)); + + CLASS(btree_trans, trans)(c); + int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_lru_key(trans, &iter, k, &last_flushed))); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + progress_update_iter(trans, &progress, &iter); + bch2_check_lru_key(trans, &iter, k, &last_flushed); + })); bch2_bkey_buf_exit(&last_flushed, c); - bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 90dcf80bd64a..a66d01d04e57 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -4,10 +4,13 @@ */ #include "bcachefs.h" +#include "backpointers.h" #include "bkey_buf.h" #include "btree_update.h" #include "btree_update_interior.h" +#include "btree_write_buffer.h" #include "buckets.h" +#include "ec.h" #include "errcode.h" #include "extents.h" #include "io_write.h" @@ -20,7 +23,7 @@ #include "super-io.h" static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, - unsigned dev_idx, int flags, bool metadata) + unsigned dev_idx, unsigned flags, bool metadata) { unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; unsigned lost = metadata ? 
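bch2_check_lrus() also gains user-visible progress reporting. The pattern, mirroring the hunk above, is to initialize the indicator with a bitmask of the btrees the pass will visit and then poke it once per key from inside the commit loop:

	struct progress_indicator_state progress;
	bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_lru));

	/* inside for_each_btree_key_commit(...): */
	progress_update_iter(trans, &progress, &iter);

Since the LRU check is a single-btree scan, the BIT_ULL(BTREE_ID_lru) mask is all the indicator needs to estimate total work.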
BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; @@ -32,16 +35,33 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, nr_good = bch2_bkey_durability(c, k.s_c); if ((!nr_good && !(flags & lost)) || (nr_good < replicas && !(flags & degraded))) - return -BCH_ERR_remove_would_lose_data; + return bch_err_throw(c, remove_would_lose_data); return 0; } +static int drop_btree_ptrs(struct btree_trans *trans, struct btree_iter *iter, + struct btree *b, unsigned dev_idx, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_buf k; + + bch2_bkey_buf_init(&k); + bch2_bkey_buf_copy(&k, c, &b->key); + + int ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true) ?: + bch2_btree_node_update_key(trans, iter, b, k.k, 0, false); + + bch_err_fn(c, ret); + bch2_bkey_buf_exit(&k, c); + return ret; +} + static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, unsigned dev_idx, - int flags) + unsigned flags) { struct bch_fs *c = trans->c; struct bkey_i *n; @@ -77,38 +97,51 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, return 0; } +static int bch2_dev_btree_drop_key(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + unsigned dev_idx, + struct bkey_buf *last_flushed, + unsigned flags) +{ + struct btree_iter iter; + struct btree *b = bch2_backpointer_get_node(trans, bp, &iter, last_flushed); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + return ret == -BCH_ERR_backpointer_to_overwritten_btree_node ? 0 : ret; + + ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags); + + bch2_trans_iter_exit(&iter); + return ret; +} + static int bch2_dev_usrdata_drop(struct bch_fs *c, struct progress_indicator_state *progress, - unsigned dev_idx, int flags) + unsigned dev_idx, unsigned flags) { - struct btree_trans *trans = bch2_trans_get(c); - enum btree_id id; - int ret = 0; + CLASS(btree_trans, trans)(c); - for (id = 0; id < BTREE_ID_NR; id++) { + for (unsigned id = 0; id < BTREE_ID_NR; id++) { if (!btree_type_has_ptrs(id)) continue; - ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, + int ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ bch2_progress_update_iter(trans, progress, &iter, "dropping user data"); bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); })); if (ret) - break; + return ret; } - bch2_trans_put(trans); - - return ret; + return 0; } static int bch2_dev_metadata_drop(struct bch_fs *c, struct progress_indicator_state *progress, - unsigned dev_idx, int flags) + unsigned dev_idx, unsigned flags) { - struct btree_trans *trans; struct btree_iter iter; struct closure cl; struct btree *b; @@ -118,9 +151,9 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, /* don't handle this yet: */ if (flags & BCH_FORCE_IF_METADATA_LOST) - return -BCH_ERR_remove_with_metadata_missing_unimplemented; + return bch_err_throw(c, remove_with_metadata_missing_unimplemented); - trans = bch2_trans_get(c); + CLASS(btree_trans, trans)(c); bch2_bkey_buf_init(&k); closure_init_stack(&cl); @@ -130,36 +163,28 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, retry: ret = 0; while (bch2_trans_begin(trans), - (b = bch2_btree_iter_peek_node(trans, &iter)) && + (b = bch2_btree_iter_peek_node(&iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { bch2_progress_update_iter(trans, progress, &iter, "dropping metadata"); if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) goto next; - bch2_bkey_buf_copy(&k, c, &b->key); - - 
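The flags type change (int to unsigned) above is mechanical, but drop_dev_ptrs()'s refusal logic is worth spelling out, since the new drop_btree_ptrs() helper builds on it. Restated from the patch, same names, condensed:

	/*
	 * nr_good = durability of the key after dropping dev_idx's pointers:
	 *
	 *   nr_good == 0           -> data would be lost entirely; requires
	 *                             BCH_FORCE_IF_{DATA,METADATA}_LOST
	 *   0 < nr_good < replicas -> still readable but under-replicated;
	 *                             requires
	 *                             BCH_FORCE_IF_{DATA,METADATA}_DEGRADED
	 */
	if ((!nr_good && !(flags & lost)) ||
	    (nr_good < replicas && !(flags & degraded)))
		return bch_err_throw(c, remove_would_lose_data);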
ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), - dev_idx, flags, true); - if (ret) - break; - - ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false); + ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; } - bch_err_msg(c, ret, "updating btree node key"); if (ret) break; next: - bch2_btree_iter_next_node(trans, &iter); + bch2_btree_iter_next_node(&iter); } if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); if (ret) goto err; @@ -169,14 +194,71 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, ret = 0; err: bch2_bkey_buf_exit(&k, c); - bch2_trans_put(trans); BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); return ret; } -int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) +static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx, + struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed, + unsigned flags) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, + last_flushed); + int ret = bkey_err(k); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) + return 0; + if (ret) + return ret; + + if (!k.k || !bch2_bkey_has_device_c(k, dev_idx)) + goto out; + + /* + * XXX: pass flags arg to invalidate_stripe_to_dev and handle it + * properly + */ + + if (bkey_is_btree_ptr(k.k)) + ret = bch2_dev_btree_drop_key(trans, bp, dev_idx, last_flushed, flags); + else if (k.k->type == KEY_TYPE_stripe) + ret = bch2_invalidate_stripe_to_dev(trans, &iter, k, dev_idx, flags); + else + ret = bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); +out: + bch2_trans_iter_exit(&iter); + return ret; +} + +int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsigned flags) +{ + CLASS(btree_trans, trans)(c); + + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + int ret = bch2_btree_write_buffer_flush_sync(trans) ?: + for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers, + POS(dev_idx, 0), + POS(dev_idx, U64_MAX), 0, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + if (k.k->type != KEY_TYPE_backpointer) + continue; + + data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k), + &last_flushed, flags); + + })); + + bch2_bkey_buf_exit(&last_flushed, trans->c); + bch_err_fn(c, ret); + return ret; +} + +int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, unsigned flags) { struct progress_indicator_state progress; bch2_progress_init(&progress, c, diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h index 027efaa0d575..30018140711b 100644 --- a/fs/bcachefs/migrate.h +++ b/fs/bcachefs/migrate.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_MIGRATE_H #define _BCACHEFS_MIGRATE_H -int bch2_dev_data_drop(struct bch_fs *, unsigned, int); +int bch2_dev_data_drop_by_backpointers(struct bch_fs *, unsigned, unsigned); +int bch2_dev_data_drop(struct bch_fs *, unsigned, unsigned); #endif /* _BCACHEFS_MIGRATE_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index dfdbb9259985..30fe269d531d 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -38,36 +38,77 @@ const char * const bch2_data_ops_strs[] = { NULL }; -static void trace_io_move2(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) +struct evacuate_bucket_arg { + struct bpos bucket; + int gen; + struct data_update_opts data_opts; +}; + 
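bch2_dev_data_drop_by_backpointers() is new: because backpointers are keyed by (device index, offset within device), removing one device's data becomes a single range scan of BTREE_ID_backpointers from POS(dev_idx, 0) to POS(dev_idx, U64_MAX), rather than a walk of every btree. Each backpointer is resolved and dispatched to the matching drop helper; condensed from data_drop_bp() above:

	/* resolve the backpointer, then pick the right drop helper: */
	if (bkey_is_btree_ptr(k.k))
		ret = bch2_dev_btree_drop_key(trans, bp, dev_idx,
					      last_flushed, flags);
	else if (k.k->type == KEY_TYPE_stripe)
		ret = bch2_invalidate_stripe_to_dev(trans, &iter, k,
						    dev_idx, flags);
	else
		ret = bch2_dev_usrdata_drop_key(trans, &iter, k,
						dev_idx, flags);

The write buffer flush beforehand matters: backpointer updates go through the write buffer, so without the flush the scan could miss extents whose backpointers haven't landed yet.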
+static bool evacuate_bucket_pred(struct bch_fs *, void *, + enum btree_id, struct bkey_s_c, + struct bch_io_opts *, + struct data_update_opts *); + +static noinline void +trace_io_move2(struct bch_fs *c, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { - if (trace_io_move_enabled()) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); - trace_io_move(c, buf.buf); - printbuf_exit(&buf); - } + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); + trace_io_move(c, buf.buf); } -static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) +static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) { - if (trace_io_move_read_enabled()) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); - bch2_bkey_val_to_text(&buf, c, k); - trace_io_move_read(c, buf.buf); - printbuf_exit(&buf); + bch2_bkey_val_to_text(&buf, c, k); + trace_io_move_read(c, buf.buf); +} + +static noinline void +trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts, + move_pred_fn pred, void *_arg, bool p) +{ + CLASS(printbuf, buf)(); + + prt_printf(&buf, "%ps: %u", pred, p); + + if (pred == evacuate_bucket_pred) { + struct evacuate_bucket_arg *arg = _arg; + prt_printf(&buf, " gen=%u", arg->gen); } + + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); + trace_io_move_pred(c, buf.buf); +} + +static noinline void +trace_io_move_evacuate_bucket2(struct bch_fs *c, struct bpos bucket, int gen) +{ + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "bucket: "); + bch2_bpos_to_text(&buf, bucket); + prt_printf(&buf, " gen: %i\n", gen); + + trace_io_move_evacuate_bucket(c, buf.buf); + printbuf_exit(&buf); } struct moving_io { struct list_head read_list; struct list_head io_list; - struct move_bucket_in_flight *b; + struct move_bucket *b; struct closure cl; bool read_completed; @@ -84,10 +125,9 @@ static void move_free(struct moving_io *io) if (io->b) atomic_dec(&io->b->count); - mutex_lock(&ctxt->lock); - list_del(&io->io_list); + scoped_guard(mutex, &ctxt->lock) + list_del(&io->io_list); wake_up(&ctxt->wait); - mutex_unlock(&ctxt->lock); if (!io->write.data_opts.scrub) { bch2_data_update_exit(&io->write); @@ -106,12 +146,9 @@ static void move_write_done(struct bch_write_op *op) if (op->error) { if (trace_io_move_write_fail_enabled()) { - struct printbuf buf = PRINTBUF; - + CLASS(printbuf, buf)(); bch2_write_op_to_text(&buf, op); - prt_printf(&buf, "ret\t%s\n", bch2_err_str(op->error)); trace_io_move_write_fail(c, buf.buf); - printbuf_exit(&buf); } this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]); @@ -126,31 +163,43 @@ static void move_write_done(struct bch_write_op *op) static void move_write(struct moving_io *io) { + struct bch_fs *c = io->write.op.c; struct moving_context *ctxt = io->write.ctxt; + struct bch_read_bio *rbio = &io->write.rbio; if (ctxt->stats) { - if (io->write.rbio.bio.bi_status) + if (rbio->bio.bi_status) atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, &ctxt->stats->sectors_error_uncorrected); - else if (io->write.rbio.saw_error) + else if (rbio->saw_error) atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, &ctxt->stats->sectors_error_corrected); } - if 
(unlikely(io->write.rbio.ret || - io->write.rbio.bio.bi_status || - io->write.data_opts.scrub)) { + /* + * If the extent has been bitrotted, we're going to have to give it a + * new checksum in order to move it - but the poison bit will ensure + * that userspace still gets the appropriate error. + */ + if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err && + (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) { + struct bch_extent_crc_unpacked crc = rbio->pick.crc; + struct nonce nonce = extent_nonce(rbio->version, crc); + + rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type, + nonce, &rbio->bio); + rbio->ret = 0; + } + + if (unlikely(rbio->ret || io->write.data_opts.scrub)) { move_free(io); return; } if (trace_io_move_write_enabled()) { - struct bch_fs *c = io->write.op.c; - struct printbuf buf = PRINTBUF; - + CLASS(printbuf, buf)(); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); trace_io_move_write(c, buf.buf); - printbuf_exit(&buf); } closure_get(&io->write.ctxt->cl); @@ -219,9 +268,8 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt) EBUG_ON(atomic_read(&ctxt->read_sectors)); EBUG_ON(atomic_read(&ctxt->read_ios)); - mutex_lock(&c->moving_context_lock); - list_del(&ctxt->list); - mutex_unlock(&c->moving_context_lock); + scoped_guard(mutex, &c->moving_context_lock) + list_del(&ctxt->list); /* * Generally, releasing a transaction within a transaction restart means @@ -257,9 +305,8 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt, INIT_LIST_HEAD(&ctxt->ios); init_waitqueue_head(&ctxt->wait); - mutex_lock(&c->moving_context_lock); - list_add(&ctxt->list, &c->moving_context_list); - mutex_unlock(&c->moving_context_lock); + scoped_guard(mutex, &c->moving_context_lock) + list_add(&ctxt->list, &c->moving_context_list); } void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c) @@ -275,7 +322,7 @@ void bch2_move_stats_init(struct bch_move_stats *stats, const char *name) } int bch2_move_extent(struct moving_context *ctxt, - struct move_bucket_in_flight *bucket_in_flight, + struct move_bucket *bucket_in_flight, struct btree_iter *iter, struct bkey_s_c k, struct bch_io_opts io_opts, @@ -283,9 +330,10 @@ int bch2_move_extent(struct moving_context *ctxt, { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; - int ret = -ENOMEM; + int ret = 0; - trace_io_move2(c, k, &io_opts, &data_opts); + if (trace_io_move_enabled()) + trace_io_move2(c, k, &io_opts, &data_opts); this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); if (ctxt->stats) @@ -296,19 +344,20 @@ int bch2_move_extent(struct moving_context *ctxt, if (!data_opts.rewrite_ptrs && !data_opts.extra_replicas && !data_opts.scrub) { - if (data_opts.kill_ptrs) + if (data_opts.kill_ptrs) { + this_cpu_add(c->counters[BCH_COUNTER_io_move_drop_only], k.k->size); return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts); - return 0; + } else { + this_cpu_add(c->counters[BCH_COUNTER_io_move_noop], k.k->size); + return 0; + } } - /* - * Before memory allocations & taking nocow locks in - * bch2_data_update_init(): - */ - bch2_trans_unlock(trans); - - struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL); - if (!io) + struct moving_io *io = allocate_dropping_locks(trans, ret, + kzalloc(sizeof(struct moving_io), _gfp)); + if (!io && !ret) + ret = bch_err_throw(c, ENOMEM_move_extent); + if (ret) goto err; INIT_LIST_HEAD(&io->io_list); @@ -320,7 +369,7 @@ int bch2_move_extent(struct moving_context *ctxt, ret = 
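The poisoned-extent branch in move_write() resolves a real conflict: an extent whose checksum no longer verifies cannot be rewritten as-is, because the write path would reject it, but the poison flag already guarantees userspace keeps seeing the read error. Regenerating the checksum over the (known-bad) data lets the move complete without masking the corruption. Condensed from the hunk above; note the fresh checksum reuses the extent's existing nonce and checksum type:

	struct nonce nonce = extent_nonce(rbio->version, rbio->pick.crc);

	rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type,
						nonce, &rbio->bio);
	rbio->ret = 0;	/* the error stays recorded via the poison bit */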
bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, &io_opts, data_opts, iter->btree_id, k); if (ret) - goto err_free; + goto err; io->write.op.end_io = move_write_done; } else { @@ -330,9 +379,11 @@ int bch2_move_extent(struct moving_context *ctxt, io->write.op.c = c; io->write.data_opts = data_opts; + bch2_trans_unlock(trans); + ret = bch2_data_update_bios_init(&io->write, c, &io_opts); if (ret) - goto err_free; + goto err; } io->write.rbio.bio.bi_end_io = move_read_endio; @@ -351,15 +402,16 @@ int bch2_move_extent(struct moving_context *ctxt, atomic_inc(&io->b->count); } - trace_io_move_read2(c, k); + if (trace_io_move_read_enabled()) + trace_io_move_read2(c, k); - mutex_lock(&ctxt->lock); - atomic_add(io->read_sectors, &ctxt->read_sectors); - atomic_inc(&ctxt->read_ios); + scoped_guard(mutex, &ctxt->lock) { + atomic_add(io->read_sectors, &ctxt->read_sectors); + atomic_inc(&ctxt->read_ios); - list_add_tail(&io->read_list, &ctxt->reads); - list_add_tail(&io->io_list, &ctxt->ios); - mutex_unlock(&ctxt->lock); + list_add_tail(&io->read_list, &ctxt->reads); + list_add_tail(&io->io_list, &ctxt->ios); + } /* * dropped by move_read_endio() - guards against use after free of @@ -374,12 +426,8 @@ int bch2_move_extent(struct moving_context *ctxt, BCH_READ_last_fragment, data_opts.scrub ? data_opts.read_dev : -1); return 0; -err_free: - kfree(io); err: - if (bch2_err_matches(ret, BCH_ERR_data_update_done)) - return 0; - + kfree(io); if (bch2_err_matches(ret, EROFS) || bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; @@ -387,18 +435,19 @@ int bch2_move_extent(struct moving_context *ctxt, count_event(c, io_move_start_fail); if (trace_io_move_start_fail_enabled()) { - struct printbuf buf = PRINTBUF; - + CLASS(printbuf, buf)(); bch2_bkey_val_to_text(&buf, c, k); prt_str(&buf, ": "); prt_str(&buf, bch2_err_str(ret)); trace_io_move_start_fail(c, buf.buf); - printbuf_exit(&buf); } + + if (bch2_err_matches(ret, BCH_ERR_data_update_done)) + return 0; return ret; } -static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, +struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct per_snapshot_io_opts *io_opts, struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ struct btree_iter *extent_iter, @@ -409,6 +458,9 @@ static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct bch_io_opts *opts_ret = &io_opts->fs_io_opts; int ret = 0; + if (btree_iter_path(trans, extent_iter)->level) + return opts_ret; + if (extent_k.k->type == KEY_TYPE_reflink_v) goto out; @@ -463,24 +515,22 @@ int bch2_move_get_io_opts_one(struct btree_trans *trans, *io_opts = bch2_opts_to_inode_opts(c->opts); /* reflink btree? 
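The allocation change in bch2_move_extent() replaces the unconditional bch2_trans_unlock() with allocate_dropping_locks(): the allocation is first attempted without blocking, and only if that fails are the transaction's btree locks dropped for a sleeping retry (behaviour per the usual bcachefs helper; _gfp is the macro's substitution parameter). Usage shape as in the patch:

	struct moving_io *io = allocate_dropping_locks(trans, ret,
				kzalloc(sizeof(struct moving_io), _gfp));
	if (!io && !ret)
		ret = bch_err_throw(c, ENOMEM_move_extent);
	if (ret)
		goto err;

This avoids the relock/restart traffic in the common case where the allocation succeeds immediately, and it also lets the error paths collapse: a NULL io with no error becomes an explicit ENOMEM_move_extent rather than the old blanket ret = -ENOMEM.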
*/ - if (!extent_k.k->p.inode) - goto out; - - struct btree_iter inode_iter; - struct bkey_s_c inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, - SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), - BTREE_ITER_cached); - int ret = bkey_err(inode_k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; + if (extent_k.k->p.inode) { + CLASS(btree_iter, inode_iter)(trans, BTREE_ID_inodes, + SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), + BTREE_ITER_cached); + struct bkey_s_c inode_k = bch2_btree_iter_peek_slot(&inode_iter); + int ret = bkey_err(inode_k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; - if (!ret && bkey_is_inode(inode_k.k)) { - struct bch_inode_unpacked inode; - bch2_inode_unpack(inode_k, &inode); - bch2_inode_opts_get(io_opts, c, &inode); + if (!ret && bkey_is_inode(inode_k.k)) { + struct bch_inode_unpacked inode; + bch2_inode_unpack(inode_k, &inode); + bch2_inode_opts_get(io_opts, c, &inode); + } } - bch2_trans_iter_exit(trans, &inode_iter); -out: + return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k); } @@ -545,25 +595,25 @@ static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans * BTREE_ID_reflink, reflink_pos, BTREE_ITER_not_extents); - struct bkey_s_c k = bch2_btree_iter_peek(trans, iter); + struct bkey_s_c k = bch2_btree_iter_peek(iter); if (!k.k || bkey_err(k)) { - bch2_trans_iter_exit(trans, iter); + bch2_trans_iter_exit(iter); return k; } if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) { - bch2_trans_iter_exit(trans, iter); + bch2_trans_iter_exit(iter); return bkey_s_c_null; } return k; } -static int bch2_move_data_btree(struct moving_context *ctxt, - struct bpos start, - struct bpos end, - move_pred_fn pred, void *arg, - enum btree_id btree_id) +int bch2_move_data_btree(struct moving_context *ctxt, + struct bpos start, + struct bpos end, + move_pred_fn pred, void *arg, + enum btree_id btree_id, unsigned level) { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; @@ -589,11 +639,55 @@ static int bch2_move_data_btree(struct moving_context *ctxt, ctxt->stats->pos = BBPOS(btree_id, start); } +retry_root: bch2_trans_begin(trans); - bch2_trans_iter_init(trans, &iter, btree_id, start, - BTREE_ITER_prefetch| - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots); + + if (level == bch2_btree_id_root(c, btree_id)->level + 1) { + bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level - 1, + BTREE_ITER_prefetch| + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots); + struct btree *b = bch2_btree_iter_peek_node(&iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto root_err; + + if (b != btree_node_root(c, b)) { + bch2_trans_iter_exit(&iter); + goto retry_root; + } + + k = bkey_i_to_s_c(&b->key); + + io_opts = &snapshot_io_opts.fs_io_opts; + ret = PTR_ERR_OR_ZERO(io_opts); + if (ret) + goto root_err; + + memset(&data_opts, 0, sizeof(data_opts)); + if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts)) + goto out; + + + if (!data_opts.scrub) + ret = bch2_btree_node_rewrite_pos(trans, btree_id, level, + k.k->p, data_opts.target, 0); + else + ret = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev); + +root_err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + bch2_trans_iter_exit(&iter); + goto retry_root; + } + + goto out; + } + + bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level, + BTREE_ITER_prefetch| + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots); if (ctxt->rate) 
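bch2_move_data_btree() growing a level parameter is what lets data moves rewrite interior nodes, and the retry_root block above handles the one level that has no parent: every other node's key lives in its parent, but the root's key lives in bch2_btree_id_root() itself, so it is fetched by peeking the root node and rewritten (or scrubbed) directly. Schematically, with hypothetical depths:

	/*
	 *	root (level 3)    <- key not stored in any node; handled by
	 *	  interior (2..1)    the retry_root path when
	 *	    leaves (0)       level == bch2_btree_id_root()->level + 1
	 *
	 * All other levels use a node iterator at (level) and go through
	 * bch2_btree_node_rewrite_pos() or bch2_btree_node_scrub().
	 */

The b != btree_node_root() recheck is the usual race guard: if the root was split or rewritten while we were acquiring it, the iterator is thrown away and the whole root read retried.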
bch2_ratelimit_reset(ctxt->rate); @@ -603,7 +697,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_trans_begin(trans); - k = bch2_btree_iter_peek(trans, &iter); + k = bch2_btree_iter_peek(&iter); if (!k.k) break; @@ -613,7 +707,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, if (ret) break; - if (bkey_ge(bkey_start_pos(k.k), end)) + if (bkey_gt(bkey_start_pos(k.k), end)) break; if (ctxt->stats) @@ -624,7 +718,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - bch2_trans_iter_exit(trans, &reflink_iter); + bch2_trans_iter_exit(&reflink_iter); k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -653,7 +747,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, continue; memset(&data_opts, 0, sizeof(data_opts)); - if (!pred(c, arg, k, io_opts, &data_opts)) + if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts)) goto next; /* @@ -663,7 +757,14 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); + if (!level) + ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); + else if (!data_opts.scrub) + ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level, + k.k->p, data_opts.target, 0); + else + ret2 = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev); + if (ret2) { if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) continue; @@ -681,83 +782,86 @@ static int bch2_move_data_btree(struct moving_context *ctxt, if (ctxt->stats) atomic64_add(k.k->size, &ctxt->stats->sectors_seen); next_nondata: - bch2_btree_iter_advance(trans, &iter); + if (!bch2_btree_iter_advance(&iter)) + break; } - - bch2_trans_iter_exit(trans, &reflink_iter); - bch2_trans_iter_exit(trans, &iter); +out: + bch2_trans_iter_exit(&reflink_iter); + bch2_trans_iter_exit(&iter); bch2_bkey_buf_exit(&sk, c); per_snapshot_io_opts_exit(&snapshot_io_opts); return ret; } -int __bch2_move_data(struct moving_context *ctxt, - struct bbpos start, - struct bbpos end, - move_pred_fn pred, void *arg) +static int bch2_move_data(struct bch_fs *c, + struct bbpos start, + struct bbpos end, + unsigned min_depth, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc, + move_pred_fn pred, void *arg) { - struct bch_fs *c = ctxt->trans->c; - enum btree_id id; int ret = 0; - for (id = start.btree; + struct moving_context ctxt; + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); + + for (enum btree_id id = start.btree; id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); id++) { - ctxt->stats->pos = BBPOS(id, POS_MIN); + ctxt.stats->pos = BBPOS(id, POS_MIN); - if (!btree_type_has_ptrs(id) || - !bch2_btree_id_root(c, id)->b) + if (!bch2_btree_id_root(c, id)->b) continue; - ret = bch2_move_data_btree(ctxt, - id == start.btree ? start.pos : POS_MIN, - id == end.btree ? end.pos : POS_MAX, - pred, arg, id); + unsigned min_depth_this_btree = min_depth; + + if (!btree_type_has_ptrs(id)) + min_depth_this_btree = max(min_depth_this_btree, 1); + + for (unsigned level = min_depth_this_btree; + level < BTREE_MAX_DEPTH; + level++) { + ret = bch2_move_data_btree(&ctxt, + id == start.btree ? 
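bch2_move_data() now takes min_depth and iterates levels itself, which is how BCH_DATA_OP_rereplicate drops its separate bch2_move_btree() pass in the bch2_data_job() hunk further down. One detail worth noting: btrees whose leaf keys carry no pointers still have interior nodes that do, so their walk is clamped to start at level 1. Condensed from the patch (start/end positions elided):

	unsigned min_depth_this_btree = min_depth;

	if (!btree_type_has_ptrs(id))
		min_depth_this_btree = max(min_depth_this_btree, 1);

	for (unsigned level = min_depth_this_btree;
	     level < BTREE_MAX_DEPTH;
	     level++) {
		ret = bch2_move_data_btree(&ctxt, start_pos, end_pos,
					   pred, arg, id, level);
		if (ret)
			break;
	}

With min_depth == 0 this covers both user data and all metadata for every alive btree, so a single pass replaces the old data-then-btree double pass.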
start.pos : POS_MIN, + id == end.btree ? end.pos : POS_MAX, + pred, arg, id, level); + if (ret) + break; + } + if (ret) break; } - return ret; -} - -int bch2_move_data(struct bch_fs *c, - struct bbpos start, - struct bbpos end, - struct bch_ratelimit *rate, - struct bch_move_stats *stats, - struct write_point_specifier wp, - bool wait_on_copygc, - move_pred_fn pred, void *arg) -{ - struct moving_context ctxt; - - bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - int ret = __bch2_move_data(&ctxt, start, end, pred, arg); bch2_moving_ctxt_exit(&ctxt); - return ret; } static int __bch2_move_data_phys(struct moving_context *ctxt, - struct move_bucket_in_flight *bucket_in_flight, + struct move_bucket *bucket_in_flight, unsigned dev, u64 bucket_start, u64 bucket_end, unsigned data_types, + bool copygc, move_pred_fn pred, void *arg) { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; bool is_kthread = current->flags & PF_KTHREAD; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - struct btree_iter iter = {}, bp_iter = {}; + struct btree_iter iter = {}; struct bkey_buf sk; struct bkey_s_c k; struct bkey_buf last_flushed; + u64 check_mismatch_done = bucket_start; int ret = 0; - struct bch_dev *ca = bch2_dev_tryget(c, dev); + CLASS(bch2_dev_tryget, ca)(c, dev); if (!ca) return 0; @@ -765,8 +869,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start)); struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end)); - bch2_dev_put(ca); - ca = NULL; bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); @@ -777,11 +879,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, */ bch2_trans_begin(trans); - bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0); - - bch_err_msg(c, ret, "looking up alloc key"); - if (ret) - goto err; + CLASS(btree_iter, bp_iter)(trans, BTREE_ID_backpointers, bp_start, 0); ret = bch2_btree_write_buffer_tryflush(trans); if (!bch2_err_matches(ret, EROFS)) @@ -795,7 +893,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, bch2_trans_begin(trans); - k = bch2_btree_iter_peek(trans, &bp_iter); + k = bch2_btree_iter_peek(&bp_iter); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -805,6 +903,14 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (!k.k || bkey_gt(k.k->p, bp_end)) break; + if (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) { + while (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) { + bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++, + copygc, &last_flushed); + } + continue; + } + if (k.k->type != KEY_TYPE_backpointer) goto next; @@ -831,21 +937,27 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (!bp.v->level) { ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); if (ret) { - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); continue; } } struct data_update_opts data_opts = {}; - if (!pred(c, arg, k, &io_opts, &data_opts)) { - bch2_trans_iter_exit(trans, &iter); + bool p = pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts); + + if (trace_io_move_pred_enabled()) + trace_io_move_pred2(c, k, &io_opts, &data_opts, + pred, arg, p); + + if (!p) { + bch2_trans_iter_exit(&iter); goto next; } if (data_opts.scrub && !bch2_dev_idx_is_online(c, data_opts.read_dev)) { - bch2_trans_iter_exit(trans, &iter); - ret = -BCH_ERR_device_offline; + 
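__bch2_move_data_phys() now interleaves backpointer-mismatch checking with the scan: check_mismatch_done trails the iterator, every bucket the scan passes over is checked before the next backpointer is processed, and a final catch-up loop covers the tail of the range. Condensed from the patch:

	if (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) {
		while (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset)
			bch2_check_bucket_backpointer_mismatch(trans, ca,
					check_mismatch_done++, copygc,
					&last_flushed);
		continue;
	}

	/* ...and after the scan: */
	while (check_mismatch_done < bucket_end)
		bch2_check_bucket_backpointer_mismatch(trans, ca,
				check_mismatch_done++, copygc, &last_flushed);

The new copygc bool threaded through here distinguishes the copygc caller, which pairs with the bucket_backpointer_mismatch bitmap test added to bch2_bucket_is_movable() in the movinggc.c hunks below.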
bch2_trans_iter_exit(&iter); + ret = bch_err_throw(c, device_offline); break; } @@ -858,11 +970,12 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (!bp.v->level) ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts); else if (!data_opts.scrub) - ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0); + ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, + k.k->p, data_opts.target, 0); else ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev); - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -877,47 +990,48 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (ctxt->stats) atomic64_add(sectors, &ctxt->stats->sectors_seen); next: - bch2_btree_iter_advance(trans, &bp_iter); + bch2_btree_iter_advance(&bp_iter); } + + while (check_mismatch_done < bucket_end) + bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++, + copygc, &last_flushed); err: - bch2_trans_iter_exit(trans, &bp_iter); bch2_bkey_buf_exit(&sk, c); bch2_bkey_buf_exit(&last_flushed, c); return ret; } -static int bch2_move_data_phys(struct bch_fs *c, - unsigned dev, - u64 start, - u64 end, - unsigned data_types, - struct bch_ratelimit *rate, - struct bch_move_stats *stats, - struct write_point_specifier wp, - bool wait_on_copygc, - move_pred_fn pred, void *arg) +int bch2_move_data_phys(struct bch_fs *c, + unsigned dev, + u64 start, + u64 end, + unsigned data_types, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc, + move_pred_fn pred, void *arg) { struct moving_context ctxt; - bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans)); - bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - ctxt.stats->phys = true; - ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys; + bch2_btree_write_buffer_flush_sync(ctxt.trans); + + if (ctxt.stats) { + ctxt.stats->phys = true; + ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys; + } - int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg); + int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, + data_types, false, pred, arg); bch2_moving_ctxt_exit(&ctxt); return ret; } -struct evacuate_bucket_arg { - struct bpos bucket; - int gen; - struct data_update_opts data_opts; -}; - -static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k, +static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { @@ -938,17 +1052,23 @@ static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k } int bch2_evacuate_bucket(struct moving_context *ctxt, - struct move_bucket_in_flight *bucket_in_flight, - struct bpos bucket, int gen, - struct data_update_opts data_opts) + struct move_bucket *bucket_in_flight, + struct bpos bucket, int gen, + struct data_update_opts data_opts) { + struct bch_fs *c = ctxt->trans->c; struct evacuate_bucket_arg arg = { bucket, gen, data_opts, }; + count_event(c, io_move_evacuate_bucket); + if (trace_io_move_evacuate_bucket_enabled()) + trace_io_move_evacuate_bucket2(c, bucket, gen); + return __bch2_move_data_phys(ctxt, bucket_in_flight, bucket.inode, bucket.offset, bucket.offset + 1, ~0, + true, evacuate_bucket_pred, &arg); } @@ -992,7 +1112,7 @@ static int 
bch2_move_btree(struct bch_fs *c, retry: ret = 0; while (bch2_trans_begin(trans), - (b = bch2_btree_iter_peek_node(trans, &iter)) && + (b = bch2_btree_iter_peek_node(&iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { if (kthread && kthread_should_stop()) break; @@ -1006,18 +1126,18 @@ static int bch2_move_btree(struct bch_fs *c, if (!pred(c, arg, b, &io_opts, &data_opts)) goto next; - ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret; + ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0) ?: ret; if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; next: - bch2_btree_iter_next_node(trans, &iter); + bch2_btree_iter_next_node(&iter); } if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); if (kthread && kthread_should_stop()) break; @@ -1031,7 +1151,7 @@ static int bch2_move_btree(struct bch_fs *c, } static bool rereplicate_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { @@ -1040,7 +1160,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, ? c->opts.metadata_replicas : io_opts->data_replicas; - rcu_read_lock(); + guard(rcu)(); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); unsigned i = 0; bkey_for_each_ptr(ptrs, ptr) { @@ -1050,7 +1170,6 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, data_opts->kill_ptrs |= BIT(i); i++; } - rcu_read_unlock(); if (!data_opts->kill_ptrs && (!nr_good || nr_good >= replicas)) @@ -1063,7 +1182,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, } static bool migrate_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { @@ -1085,14 +1204,6 @@ static bool migrate_pred(struct bch_fs *c, void *arg, return data_opts->rewrite_ptrs != 0; } -static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); -} - /* * Ancient versions of bcachefs produced packed formats which could represent * keys that the in memory format cannot represent; this checks for those @@ -1133,12 +1244,11 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) BBPOS_MAX, rewrite_old_nodes_pred, c, stats); if (!ret) { - mutex_lock(&c->sb_lock); + guard(mutex)(&c->sb_lock); c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); c->disk_sb.sb->version_min = c->disk_sb.sb->version; bch2_write_super(c); - mutex_unlock(&c->sb_lock); } bch_err_fn(c, ret); @@ -1146,7 +1256,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) } static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { @@ -1158,7 +1268,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, struct extent_ptr_decoded p; unsigned i = 0; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { unsigned d = bch2_extent_ptr_durability(c, &p); @@ -1169,21 +1279,12 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, i++; } - 
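The predicates above communicate per-pointer decisions through bitmasks in struct data_update_opts: bit i of kill_ptrs (or rewrite_ptrs) refers to the i'th pointer of the key in iteration order. A worked example under assumed device states, roughly matching rereplicate_pred():

	/*
	 * Key with pointers on devices 0, 2, 3; device 2 has failed
	 * (durability 0); replica target is 2:
	 *
	 *	i:		0	1	2
	 *	device:		0	2	3
	 *	durability:	1	0	1	-> nr_good = 2
	 *
	 * kill_ptrs = BIT(1), dropping the dead pointer; nr_good already
	 * meets the target, so no extra replicas are requested.
	 */

The guard(rcu)() here is the same cleanup-idiom change as elsewhere: the device lookups inside bkey_for_each_ptr() are RCU-protected, and the explicit rcu_read_unlock() calls disappear.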
rcu_read_unlock(); return data_opts->kill_ptrs != 0; } -static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); -} - static bool scrub_pred(struct bch_fs *c, void *_arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { @@ -1208,18 +1309,18 @@ static bool scrub_pred(struct bch_fs *c, void *_arg, int bch2_data_job(struct bch_fs *c, struct bch_move_stats *stats, - struct bch_ioctl_data op) + struct bch_ioctl_data *op) { - struct bbpos start = BBPOS(op.start_btree, op.start_pos); - struct bbpos end = BBPOS(op.end_btree, op.end_pos); + struct bbpos start = BBPOS(op->start_btree, op->start_pos); + struct bbpos end = BBPOS(op->end_btree, op->end_pos); int ret = 0; - if (op.op >= BCH_DATA_OP_NR) + if (op->op >= BCH_DATA_OP_NR) return -EINVAL; - bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]); + bch2_move_stats_init(stats, bch2_data_ops_strs[op->op]); - switch (op.op) { + switch (op->op) { case BCH_DATA_OP_scrub: /* * prevent tests from spuriously failing, make sure we see all @@ -1227,41 +1328,38 @@ int bch2_data_job(struct bch_fs *c, */ bch2_btree_interior_updates_flush(c); - ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX, - op.scrub.data_types, + ret = bch2_move_data_phys(c, op->scrub.dev, 0, U64_MAX, + op->scrub.data_types, NULL, stats, writepoint_hashed((unsigned long) current), false, - scrub_pred, &op) ?: ret; + scrub_pred, op) ?: ret; break; case BCH_DATA_OP_rereplicate: stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); - ret = bch2_move_btree(c, start, end, - rereplicate_btree_pred, c, stats) ?: ret; - ret = bch2_move_data(c, start, end, - NULL, - stats, + ret = bch2_move_data(c, start, end, 0, NULL, stats, writepoint_hashed((unsigned long) current), true, rereplicate_pred, c) ?: ret; + bch2_btree_interior_updates_flush(c); ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_migrate: - if (op.migrate.dev >= c->sb.nr_devices) + if (op->migrate.dev >= c->sb.nr_devices) return -EINVAL; stats->data_type = BCH_DATA_journal; - ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); - ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX, + ret = bch2_journal_flush_device_pins(&c->journal, op->migrate.dev); + ret = bch2_move_data_phys(c, op->migrate.dev, 0, U64_MAX, ~0, NULL, stats, writepoint_hashed((unsigned long) current), true, - migrate_pred, &op) ?: ret; + migrate_pred, op) ?: ret; bch2_btree_interior_updates_flush(c); ret = bch2_replicas_gc2(c) ?: ret; break; @@ -1269,12 +1367,10 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_scan_old_btree_nodes(c, stats); break; case BCH_DATA_OP_drop_extra_replicas: - ret = bch2_move_btree(c, start, end, - drop_extra_replicas_btree_pred, c, stats) ?: ret; - ret = bch2_move_data(c, start, end, NULL, stats, - writepoint_hashed((unsigned long) current), - true, - drop_extra_replicas_pred, c) ?: ret; + ret = bch2_move_data(c, start, end, 0, NULL, stats, + writepoint_hashed((unsigned long) current), + true, + drop_extra_replicas_pred, c) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; break; default: @@ -1333,11 +1429,11 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str printbuf_indent_add(out, 2); - mutex_lock(&ctxt->lock); - struct moving_io *io; - list_for_each_entry(io, 
&ctxt->ios, io_list) - bch2_data_update_inflight_to_text(out, &io->write); - mutex_unlock(&ctxt->lock); + scoped_guard(mutex, &ctxt->lock) { + struct moving_io *io; + list_for_each_entry(io, &ctxt->ios, io_list) + bch2_data_update_inflight_to_text(out, &io->write); + } printbuf_indent_sub(out, 4); } @@ -1346,10 +1442,9 @@ void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c) { struct moving_context *ctxt; - mutex_lock(&c->moving_context_lock); - list_for_each_entry(ctxt, &c->moving_context_list, list) - bch2_moving_ctxt_to_text(out, c, ctxt); - mutex_unlock(&c->moving_context_lock); + scoped_guard(mutex, &c->moving_context_lock) + list_for_each_entry(ctxt, &c->moving_context_list, list) + bch2_moving_ctxt_to_text(out, c, ctxt); } void bch2_fs_move_init(struct bch_fs *c) diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 51e0505a8156..481026ff99ab 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -72,7 +72,7 @@ do { \ break; \ } while (1) -typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, +typedef bool (*move_pred_fn)(struct bch_fs *, void *, enum btree_id, struct bkey_s_c, struct bch_io_opts *, struct data_update_opts *); extern const char * const bch2_data_ops_strs[]; @@ -116,32 +116,31 @@ int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); int bch2_move_extent(struct moving_context *, - struct move_bucket_in_flight *, + struct move_bucket *, struct btree_iter *, struct bkey_s_c, struct bch_io_opts, struct data_update_opts); -int __bch2_move_data(struct moving_context *, - struct bbpos, - struct bbpos, - move_pred_fn, void *); -int bch2_move_data(struct bch_fs *, - struct bbpos start, - struct bbpos end, - struct bch_ratelimit *, - struct bch_move_stats *, - struct write_point_specifier, - bool, - move_pred_fn, void *); +struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, + struct per_snapshot_io_opts *, struct bpos, + struct btree_iter *, struct bkey_s_c); + +int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos, + move_pred_fn, void *, enum btree_id, unsigned); + +int bch2_move_data_phys(struct bch_fs *, unsigned, u64, u64, unsigned, + struct bch_ratelimit *, struct bch_move_stats *, + struct write_point_specifier, bool, + move_pred_fn, void *); int bch2_evacuate_bucket(struct moving_context *, - struct move_bucket_in_flight *, + struct move_bucket *, struct bpos, int, struct data_update_opts); int bch2_data_job(struct bch_fs *, struct bch_move_stats *, - struct bch_ioctl_data); + struct bch_ioctl_data *); void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *); void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *); diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h index 807f779f6f76..c5c62cd600de 100644 --- a/fs/bcachefs/move_types.h +++ b/fs/bcachefs/move_types.h @@ -36,14 +36,10 @@ struct move_bucket_key { }; struct move_bucket { + struct move_bucket *next; + struct rhash_head hash; struct move_bucket_key k; unsigned sectors; -}; - -struct move_bucket_in_flight { - struct move_bucket_in_flight *next; - struct rhash_head hash; - struct move_bucket bucket; atomic_t count; }; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 96873372b516..b0cbe3c1aab6 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -8,6 +8,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "backpointers.h" 
#include "btree_iter.h" #include "btree_update.h" #include "btree_write_buffer.h" @@ -27,47 +28,32 @@ #include struct buckets_in_flight { - struct rhashtable table; - struct move_bucket_in_flight *first; - struct move_bucket_in_flight *last; - size_t nr; - size_t sectors; + struct rhashtable *table; + struct move_bucket *first; + struct move_bucket *last; + size_t nr; + size_t sectors; + + DARRAY(struct move_bucket *) to_evacuate; }; static const struct rhashtable_params bch_move_bucket_params = { - .head_offset = offsetof(struct move_bucket_in_flight, hash), - .key_offset = offsetof(struct move_bucket_in_flight, bucket.k), + .head_offset = offsetof(struct move_bucket, hash), + .key_offset = offsetof(struct move_bucket, k), .key_len = sizeof(struct move_bucket_key), .automatic_shrinking = true, }; -static struct move_bucket_in_flight * -move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b) +static void move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket *b) { - struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL); - int ret; - - if (!new) - return ERR_PTR(-ENOMEM); - - new->bucket = b; - - ret = rhashtable_lookup_insert_fast(&list->table, &new->hash, - bch_move_bucket_params); - if (ret) { - kfree(new); - return ERR_PTR(ret); - } - if (!list->first) - list->first = new; + list->first = b; else - list->last->next = new; + list->last->next = b; - list->last = new; + list->last = b; list->nr++; - list->sectors += b.sectors; - return new; + list->sectors += b->sectors; } static int bch2_bucket_is_movable(struct btree_trans *trans, @@ -78,20 +64,22 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset)) return 0; - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, - b->k.bucket, BTREE_ITER_cached); + CLASS(btree_iter, iter)(trans, BTREE_ID_alloc, b->k.bucket, BTREE_ITER_cached); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); int ret = bkey_err(k); if (ret) return ret; - struct bch_dev *ca = bch2_dev_tryget(c, k.k->p.inode); + CLASS(bch2_dev_bucket_tryget, ca)(c, k.k->p); if (!ca) - goto out; + return 0; + + if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset)) + return 0; if (ca->mi.state != BCH_MEMBER_STATE_rw || !bch2_dev_is_online(ca)) - goto out_put; + return 0; struct bch_alloc_v4 _a; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); @@ -99,20 +87,23 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, b->sectors = bch2_bucket_sectors_dirty(*a); u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); - ret = lru_idx && lru_idx <= time; -out_put: - bch2_dev_put(ca); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; + return lru_idx && lru_idx <= time; +} + +static void move_bucket_free(struct buckets_in_flight *list, + struct move_bucket *b) +{ + int ret = rhashtable_remove_fast(list->table, &b->hash, + bch_move_bucket_params); + BUG_ON(ret); + kfree(b); } static void move_buckets_wait(struct moving_context *ctxt, struct buckets_in_flight *list, bool flush) { - struct move_bucket_in_flight *i; - int ret; + struct move_bucket *i; while ((i = list->first)) { if (flush) @@ -126,12 +117,9 @@ static void move_buckets_wait(struct moving_context *ctxt, list->last = NULL; list->nr--; - list->sectors -= i->bucket.sectors; + list->sectors -= i->sectors; - ret = rhashtable_remove_fast(&list->table, &i->hash, - bch_move_bucket_params); - BUG_ON(ret); - 
kfree(i); + move_bucket_free(list, i); } bch2_trans_unlock_long(ctxt->trans); @@ -140,14 +128,11 @@ static void move_buckets_wait(struct moving_context *ctxt, static bool bucket_in_flight(struct buckets_in_flight *list, struct move_bucket_key k) { - return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params); + return rhashtable_lookup_fast(list->table, &k, bch_move_bucket_params); } -typedef DARRAY(struct move_bucket) move_buckets; - static int bch2_copygc_get_buckets(struct moving_context *ctxt, - struct buckets_in_flight *buckets_in_flight, - move_buckets *buckets) + struct buckets_in_flight *buckets_in_flight) { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; @@ -164,8 +149,6 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret))) return ret; - bch2_trans_begin(trans); - ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0), lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX), @@ -184,20 +167,34 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, else if (bucket_in_flight(buckets_in_flight, b.k)) in_flight++; else { - ret2 = darray_push(buckets, b); + struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL); + ret2 = b_i ? 0 : -ENOMEM; if (ret2) goto err; + + *b_i = b; + + ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i); + if (ret2) { + kfree(b_i); + goto err; + } + + ret2 = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash, + bch_move_bucket_params); + BUG_ON(ret2); + sectors += b.sectors; } - ret2 = buckets->nr >= nr_to_get; + ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get; err: ret2; })); pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i", buckets_in_flight->nr, buckets_in_flight->sectors, - saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret); + saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret); return ret < 0 ? 
ret : 0; } @@ -212,40 +209,30 @@ static int bch2_copygc(struct moving_context *ctxt, struct data_update_opts data_opts = { .btree_insert_flags = BCH_WATERMARK_copygc, }; - move_buckets buckets = { 0 }; - struct move_bucket_in_flight *f; u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen); u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; - ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets); + ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight); if (ret) goto err; - darray_for_each(buckets, i) { + darray_for_each(buckets_in_flight->to_evacuate, i) { if (kthread_should_stop() || freezing(current)) break; - f = move_bucket_in_flight_add(buckets_in_flight, *i); - ret = PTR_ERR_OR_ZERO(f); - if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */ - ret = 0; - continue; - } - if (ret == -ENOMEM) { /* flush IO, continue later */ - ret = 0; - break; - } + struct move_bucket *b = *i; + *i = NULL; + + move_bucket_in_flight_add(buckets_in_flight, b); - ret = bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket, - f->bucket.k.gen, data_opts); + ret = bch2_evacuate_bucket(ctxt, b, b->k.bucket, b->k.gen, data_opts); if (ret) goto err; *did_work = true; } err: - /* no entries in LRU btree found, or got to end: */ if (bch2_err_matches(ret, ENOENT)) ret = 0; @@ -255,12 +242,34 @@ static int bch2_copygc(struct moving_context *ctxt, sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen; sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved; - trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved); + trace_and_count(c, copygc, c, buckets_in_flight->to_evacuate.nr, sectors_seen, sectors_moved); - darray_exit(&buckets); + darray_for_each(buckets_in_flight->to_evacuate, i) + if (*i) + move_bucket_free(buckets_in_flight, *i); + darray_exit(&buckets_in_flight->to_evacuate); return ret; } +static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca) +{ + struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca); + struct bch_dev_usage usage; + + for (unsigned i = 0; i < BCH_DATA_NR; i++) + usage.buckets[i] = usage_full.d[i].buckets; + + s64 fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * + ca->mi.bucket_size) >> 1); + s64 fragmented = 0; + + for (unsigned i = 0; i < BCH_DATA_NR; i++) + if (data_type_movable(i)) + fragmented += usage_full.d[i].fragmented; + + return max(0LL, fragmented_allowed - fragmented); +} + /* * Copygc runs when the amount of fragmented data is above some arbitrary * threshold: @@ -275,28 +284,13 @@ static int bch2_copygc(struct moving_context *ctxt, * often and continually reduce the amount of fragmented space as the device * fills up. So, we increase the threshold by half the current free space. 
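 *
 * As a worked example (all numbers made up): with 1000 free buckets of
 * 512 sectors each, fragmented_allowed comes to (1000 * 512) >> 1 =
 * 256000 sectors; if the movable data types currently account for
 * 200000 fragmented sectors, copygc keeps waiting until another
 * 256000 - 200000 = 56000 sectors of fragmentation accumulate.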
*/ -unsigned long bch2_copygc_wait_amount(struct bch_fs *c) +u64 bch2_copygc_wait_amount(struct bch_fs *c) { - s64 wait = S64_MAX, fragmented_allowed, fragmented; - - for_each_rw_member(c, ca) { - struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca); - struct bch_dev_usage usage; - - for (unsigned i = 0; i < BCH_DATA_NR; i++) - usage.buckets[i] = usage_full.d[i].buckets; - - fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * - ca->mi.bucket_size) >> 1); - fragmented = 0; - - for (unsigned i = 0; i < BCH_DATA_NR; i++) - if (data_type_movable(i)) - fragmented += usage_full.d[i].fragmented; - - wait = min(wait, max(0LL, fragmented_allowed - fragmented)); - } + u64 wait = U64_MAX; + guard(rcu)(); + for_each_rw_member_rcu(c, ca) + wait = min(wait, bch2_copygc_dev_wait_amount(ca)); return wait; } @@ -318,15 +312,22 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) c->copygc_wait_at) << 9); prt_newline(out); - prt_printf(out, "Currently calculated wait:\t"); - prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); - prt_newline(out); + bch2_printbuf_make_room(out, 4096); - rcu_read_lock(); - struct task_struct *t = rcu_dereference(c->copygc_thread); - if (t) - get_task_struct(t); - rcu_read_unlock(); + struct task_struct *t; + scoped_guard(rcu) { + guard(printbuf_atomic)(out); + prt_printf(out, "Currently calculated wait:\n"); + for_each_rw_member_rcu(c, ca) { + prt_printf(out, " %s:\t", ca->name); + prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca)); + prt_newline(out); + } + + t = rcu_dereference(c->copygc_thread); + if (t) + get_task_struct(t); + } if (t) { bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); @@ -340,19 +341,16 @@ static int bch2_copygc_thread(void *arg) struct moving_context ctxt; struct bch_move_stats move_stats; struct io_clock *clock = &c->io_clock[WRITE]; - struct buckets_in_flight *buckets; + struct buckets_in_flight buckets = {}; u64 last, wait; - int ret = 0; - buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL); - if (!buckets) - return -ENOMEM; - ret = rhashtable_init(&buckets->table, &bch_move_bucket_params); + buckets.table = kzalloc(sizeof(*buckets.table), GFP_KERNEL); + int ret = !buckets.table + ? -ENOMEM + : rhashtable_init(buckets.table, &bch_move_bucket_params); bch_err_msg(c, ret, "allocating copygc buckets in flight"); - if (ret) { - kfree(buckets); - return ret; - } + if (ret) + goto err; set_freezable(); @@ -360,7 +358,7 @@ static int bch2_copygc_thread(void *arg) * Data move operations can't run until after check_snapshots has * completed, and bch2_snapshot_is_ancestor() is available. 
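 *
 * In sketch form, the wait below is the usual freezable-kthread idiom
 * (kthread_wait_freezable() is a bcachefs helper macro, not core kthread
 * API; the condition name here is illustrative):
 *
 *	kthread_wait_freezable(pass_done(c) || kthread_should_stop());
 *
 * i.e. sleep freezably until the recovery pass has finished or the thread
 * is being torn down, re-checking the condition after every wakeup or thaw.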
*/ - kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots || + kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots || kthread_should_stop()); bch2_move_stats_init(&move_stats, "copygc"); @@ -375,13 +373,13 @@ static int bch2_copygc_thread(void *arg) cond_resched(); if (!c->opts.copygc_enabled) { - move_buckets_wait(&ctxt, buckets, true); + move_buckets_wait(&ctxt, &buckets, true); kthread_wait_freezable(c->opts.copygc_enabled || kthread_should_stop()); } if (unlikely(freezing(current))) { - move_buckets_wait(&ctxt, buckets, true); + move_buckets_wait(&ctxt, &buckets, true); __refrigerator(false); continue; } @@ -392,7 +390,7 @@ static int bch2_copygc_thread(void *arg) if (wait > clock->max_slop) { c->copygc_wait_at = last; c->copygc_wait = last + wait; - move_buckets_wait(&ctxt, buckets, true); + move_buckets_wait(&ctxt, &buckets, true); trace_and_count(c, copygc_wait, c, wait, last + wait); bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); @@ -402,7 +400,7 @@ static int bch2_copygc_thread(void *arg) c->copygc_wait = 0; c->copygc_running = true; - ret = bch2_copygc(&ctxt, buckets, &did_work); + ret = bch2_copygc(&ctxt, &buckets, &did_work); c->copygc_running = false; wake_up(&c->copygc_running_wq); @@ -413,20 +411,19 @@ static int bch2_copygc_thread(void *arg) if (min_member_capacity == U64_MAX) min_member_capacity = 128 * 2048; - move_buckets_wait(&ctxt, buckets, true); + move_buckets_wait(&ctxt, &buckets, true); bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), MAX_SCHEDULE_TIMEOUT); } } - move_buckets_wait(&ctxt, buckets, true); - - rhashtable_destroy(&buckets->table); - kfree(buckets); + move_buckets_wait(&ctxt, &buckets, true); + rhashtable_destroy(buckets.table); bch2_moving_ctxt_exit(&ctxt); bch2_move_stats_exit(&move_stats, c); - - return 0; +err: + kfree(buckets.table); + return ret; } void bch2_copygc_stop(struct bch_fs *c) diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h index d1885cf67a45..f615910d6f98 100644 --- a/fs/bcachefs/movinggc.h +++ b/fs/bcachefs/movinggc.h @@ -2,16 +2,15 @@ #ifndef _BCACHEFS_MOVINGGC_H #define _BCACHEFS_MOVINGGC_H -unsigned long bch2_copygc_wait_amount(struct bch_fs *); +u64 bch2_copygc_wait_amount(struct bch_fs *); void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *); static inline void bch2_copygc_wakeup(struct bch_fs *c) { - rcu_read_lock(); + guard(rcu)(); struct task_struct *p = rcu_dereference(c->copygc_thread); if (p) wake_up_process(p); - rcu_read_unlock(); } void bch2_copygc_stop(struct bch_fs *); diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index 9136a9097789..d1019052f182 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -11,6 +11,14 @@ #include +static inline subvol_inum parent_inum(subvol_inum inum, struct bch_inode_unpacked *inode) +{ + return (subvol_inum) { + .subvol = inode->bi_parent_subvol ?: inum.subvol, + .inum = inode->bi_dir, + }; +} + static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode) { return S_ISDIR(inode->bi_mode) && !inode->bi_subvol; @@ -28,8 +36,8 @@ int bch2_create_trans(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct btree_iter dir_iter = {}; - struct btree_iter inode_iter = {}; + struct btree_iter dir_iter = { NULL }; + struct btree_iter inode_iter = { NULL }; subvol_inum new_inum = dir; u64 now = bch2_current_time(c); u64 cpu = raw_smp_processor_id(); @@ -49,7 +57,7 @@ int bch2_create_trans(struct btree_trans *trans, if 
(!(flags & BCH_CREATE_SNAPSHOT)) { /* Normal create path - allocate a new inode: */ - bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); + bch2_inode_init_late(c, new_inode, now, uid, gid, mode, rdev, dir_u); if (flags & BCH_CREATE_TMPFILE) new_inode->bi_flags |= BCH_INODE_unlinked; @@ -91,7 +99,9 @@ int bch2_create_trans(struct btree_trans *trans, * If we're not root, we have to own the subvolume being * snapshotted: */ - if (uid && new_inode->bi_uid != uid) { + if (uid && + !capable(CAP_FOWNER) && + new_inode->bi_uid != uid) { ret = -EPERM; goto err; } @@ -123,8 +133,8 @@ int bch2_create_trans(struct btree_trans *trans, if (ret) goto err; - bch2_btree_iter_set_snapshot(trans, &dir_iter, dir_snapshot); - ret = bch2_btree_iter_traverse(trans, &dir_iter); + bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot); + ret = bch2_btree_iter_traverse(&dir_iter); if (ret) goto err; } @@ -167,18 +177,28 @@ int bch2_create_trans(struct btree_trans *trans, new_inode->bi_dir_offset = dir_offset; } + if (S_ISDIR(mode)) { + ret = bch2_maybe_propagate_has_case_insensitive(trans, + (subvol_inum) { + new_inode->bi_subvol ?: dir.subvol, + new_inode->bi_inum }, + new_inode); + if (ret) + goto err; + } + if (S_ISDIR(mode) && !new_inode->bi_subvol) new_inode->bi_depth = dir_u->bi_depth + 1; inode_iter.flags &= ~BTREE_ITER_all_snapshots; - bch2_btree_iter_set_snapshot(trans, &inode_iter, snapshot); + bch2_btree_iter_set_snapshot(&inode_iter, snapshot); - ret = bch2_btree_iter_traverse(trans, &inode_iter) ?: + ret = bch2_btree_iter_traverse(&inode_iter) ?: bch2_inode_write(trans, &inode_iter, new_inode); err: - bch2_trans_iter_exit(trans, &inode_iter); - bch2_trans_iter_exit(trans, &dir_iter); + bch2_trans_iter_exit(&inode_iter); + bch2_trans_iter_exit(&dir_iter); return ret; } @@ -188,8 +208,8 @@ int bch2_link_trans(struct btree_trans *trans, const struct qstr *name) { struct bch_fs *c = trans->c; - struct btree_iter dir_iter = {}; - struct btree_iter inode_iter = {}; + struct btree_iter dir_iter = { NULL }; + struct btree_iter inode_iter = { NULL }; struct bch_hash_info dir_hash; u64 now = bch2_current_time(c); u64 dir_offset = 0; @@ -234,8 +254,8 @@ int bch2_link_trans(struct btree_trans *trans, ret = bch2_inode_write(trans, &dir_iter, dir_u) ?: bch2_inode_write(trans, &inode_iter, inode_u); err: - bch2_trans_iter_exit(trans, &dir_iter); - bch2_trans_iter_exit(trans, &inode_iter); + bch2_trans_iter_exit(&dir_iter); + bch2_trans_iter_exit(&inode_iter); return ret; } @@ -247,9 +267,9 @@ int bch2_unlink_trans(struct btree_trans *trans, bool deleting_subvol) { struct bch_fs *c = trans->c; - struct btree_iter dir_iter = {}; - struct btree_iter dirent_iter = {}; - struct btree_iter inode_iter = {}; + struct btree_iter dir_iter = { NULL }; + struct btree_iter dirent_iter = { NULL }; + struct btree_iter inode_iter = { NULL }; struct bch_hash_info dir_hash; subvol_inum inum; u64 now = bch2_current_time(c); @@ -279,7 +299,7 @@ int bch2_unlink_trans(struct btree_trans *trans, } if (deleting_subvol && !inode_u->bi_subvol) { - ret = -BCH_ERR_ENOENT_not_subvol; + ret = bch_err_throw(c, ENOENT_not_subvol); goto err; } @@ -295,7 +315,7 @@ int bch2_unlink_trans(struct btree_trans *trans, if (ret) goto err; - k = bch2_btree_iter_peek_slot(trans, &dirent_iter); + k = bch2_btree_iter_peek_slot(&dirent_iter); ret = bkey_err(k); if (ret) goto err; @@ -304,8 +324,8 @@ int bch2_unlink_trans(struct btree_trans *trans, * If we're deleting a subvolume, we need to really delete the * dirent, not just emit a whiteout in 
the current snapshot: */ - bch2_btree_iter_set_snapshot(trans, &dirent_iter, k.k->p.snapshot); - ret = bch2_btree_iter_traverse(trans, &dirent_iter); + bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot); + ret = bch2_btree_iter_traverse(&dirent_iter); if (ret) goto err; } else { @@ -327,9 +347,9 @@ int bch2_unlink_trans(struct btree_trans *trans, bch2_inode_write(trans, &dir_iter, dir_u) ?: bch2_inode_write(trans, &inode_iter, inode_u); err: - bch2_trans_iter_exit(trans, &inode_iter); - bch2_trans_iter_exit(trans, &dirent_iter); - bch2_trans_iter_exit(trans, &dir_iter); + bch2_trans_iter_exit(&inode_iter); + bch2_trans_iter_exit(&dirent_iter); + bch2_trans_iter_exit(&dir_iter); return ret; } @@ -363,9 +383,8 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent) { - struct btree_iter iter; struct bkey_i_subvolume *s = - bch2_bkey_get_mut_typed(trans, &iter, + bch2_bkey_get_mut_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), BTREE_ITER_cached, subvolume); int ret = PTR_ERR_OR_ZERO(s); @@ -373,7 +392,6 @@ static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_p return ret; s->v.fs_path_parent = cpu_to_le32(new_parent); - bch2_trans_iter_exit(trans, &iter); return 0; } @@ -387,10 +405,10 @@ int bch2_rename_trans(struct btree_trans *trans, enum bch_rename_mode mode) { struct bch_fs *c = trans->c; - struct btree_iter src_dir_iter = {}; - struct btree_iter dst_dir_iter = {}; - struct btree_iter src_inode_iter = {}; - struct btree_iter dst_inode_iter = {}; + struct btree_iter src_dir_iter = { NULL }; + struct btree_iter dst_dir_iter = { NULL }; + struct btree_iter src_inode_iter = { NULL }; + struct btree_iter dst_inode_iter = { NULL }; struct bch_hash_info src_hash, dst_hash; subvol_inum src_inum, dst_inum; u64 src_offset, dst_offset; @@ -404,8 +422,7 @@ int bch2_rename_trans(struct btree_trans *trans, src_hash = bch2_hash_info_init(c, src_dir_u); - if (dst_dir.inum != src_dir.inum || - dst_dir.subvol != src_dir.subvol) { + if (!subvol_inum_eq(dst_dir, src_dir)) { ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, BTREE_ITER_intent); if (ret) @@ -418,8 +435,8 @@ int bch2_rename_trans(struct btree_trans *trans, } ret = bch2_dirent_rename(trans, - src_dir, &src_hash, &src_dir_u->bi_size, - dst_dir, &dst_hash, &dst_dir_u->bi_size, + src_dir, &src_hash, + dst_dir, &dst_hash, src_name, &src_inum, &src_offset, dst_name, &dst_inum, &dst_offset, mode); @@ -497,32 +514,41 @@ int bch2_rename_trans(struct btree_trans *trans, } } - if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && - S_ISDIR(src_inode_u->bi_mode)) { - ret = -EXDEV; - goto err; - } + if (!subvol_inum_eq(dst_dir, src_dir)) { + if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && + S_ISDIR(src_inode_u->bi_mode)) { + ret = -EXDEV; + goto err; + } - if (mode == BCH_RENAME_EXCHANGE && - bch2_reinherit_attrs(dst_inode_u, src_dir_u) && - S_ISDIR(dst_inode_u->bi_mode)) { - ret = -EXDEV; - goto err; - } + if (mode == BCH_RENAME_EXCHANGE && + bch2_reinherit_attrs(dst_inode_u, src_dir_u) && + S_ISDIR(dst_inode_u->bi_mode)) { + ret = -EXDEV; + goto err; + } - if (is_subdir_for_nlink(src_inode_u)) { - src_dir_u->bi_nlink--; - dst_dir_u->bi_nlink++; - } + ret = bch2_maybe_propagate_has_case_insensitive(trans, src_inum, src_inode_u) ?: + (mode == BCH_RENAME_EXCHANGE + ? 
bch2_maybe_propagate_has_case_insensitive(trans, dst_inum, dst_inode_u) + : 0); + if (ret) + goto err; - if (S_ISDIR(src_inode_u->bi_mode) && - !src_inode_u->bi_subvol) - src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; + if (is_subdir_for_nlink(src_inode_u)) { + src_dir_u->bi_nlink--; + dst_dir_u->bi_nlink++; + } - if (mode == BCH_RENAME_EXCHANGE && - S_ISDIR(dst_inode_u->bi_mode) && - !dst_inode_u->bi_subvol) - dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; + if (S_ISDIR(src_inode_u->bi_mode) && + !src_inode_u->bi_subvol) + src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; + + if (mode == BCH_RENAME_EXCHANGE && + S_ISDIR(dst_inode_u->bi_mode) && + !dst_inode_u->bi_subvol) + dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; + } if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { dst_dir_u->bi_nlink--; @@ -554,15 +580,31 @@ int bch2_rename_trans(struct btree_trans *trans, ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) : 0); err: - bch2_trans_iter_exit(trans, &dst_inode_iter); - bch2_trans_iter_exit(trans, &src_inode_iter); - bch2_trans_iter_exit(trans, &dst_dir_iter); - bch2_trans_iter_exit(trans, &src_dir_iter); + bch2_trans_iter_exit(&dst_inode_iter); + bch2_trans_iter_exit(&src_inode_iter); + bch2_trans_iter_exit(&dst_dir_iter); + bch2_trans_iter_exit(&src_dir_iter); return ret; } /* inum_to_path */ +static inline void reverse_bytes(void *b, size_t n) +{ + char *e = b + n, *s = b; + + while (s < e) { + --e; + swap(*s, *e); + s++; + } +} + +static inline void printbuf_reverse_from(struct printbuf *out, unsigned pos) +{ + reverse_bytes(out->buf + pos, out->pos - pos); +} + static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n) { bch2_printbuf_make_room(out, n); @@ -582,78 +624,116 @@ static inline void prt_str_reversed(struct printbuf *out, const char *s) prt_bytes_reversed(out, s, strlen(s)); } -static inline void reverse_bytes(void *b, size_t n) +__printf(2, 3) +static inline void prt_printf_reversed(struct printbuf *out, const char *fmt, ...) 
{ - char *e = b + n, *s = b; + unsigned orig_pos = out->pos; - while (s < e) { - --e; - swap(*s, *e); - s++; - } + va_list args; + va_start(args, fmt); + prt_vprintf(out, fmt, args); + va_end(args); + + printbuf_reverse_from(out, orig_pos); } -/* XXX: we don't yet attempt to print paths when we don't know the subvol */ -int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printbuf *path) +static int __bch2_inum_to_path(struct btree_trans *trans, + u32 subvol, u64 inum, u32 snapshot, + struct printbuf *path) { unsigned orig_pos = path->pos; int ret = 0; + DARRAY(subvol_inum) inums = {}; - while (!(inum.subvol == BCACHEFS_ROOT_SUBVOL && - inum.inum == BCACHEFS_ROOT_INO)) { - struct bch_inode_unpacked inode; - ret = bch2_inode_find_by_inum_trans(trans, inum, &inode); + if (!snapshot) { + ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot); if (ret) goto disconnected; + } - if (!inode.bi_dir && !inode.bi_dir_offset) { - ret = -BCH_ERR_ENOENT_inode_no_backpointer; - goto disconnected; + while (true) { + subvol_inum n = (subvol_inum) { subvol ?: snapshot, inum }; + + if (darray_find_p(inums, i, i->subvol == n.subvol && i->inum == n.inum)) { + prt_printf_reversed(path, "(loop at %llu:%u)", inum, snapshot); + break; } - inum.subvol = inode.bi_parent_subvol ?: inum.subvol; - inum.inum = inode.bi_dir; + ret = darray_push(&inums, n); + if (ret) + goto err; - u32 snapshot; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + struct bch_inode_unpacked inode; + ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0); if (ret) goto disconnected; - struct btree_iter d_iter; - struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter, - BTREE_ID_dirents, SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot), - 0, dirent); + if (inode.bi_subvol == BCACHEFS_ROOT_SUBVOL && + inode.bi_inum == BCACHEFS_ROOT_INO) + break; + + if (!inode.bi_dir && !inode.bi_dir_offset) { + ret = bch_err_throw(trans->c, ENOENT_inode_no_backpointer); + goto disconnected; + } + + inum = inode.bi_dir; + if (inode.bi_parent_subvol) { + subvol = inode.bi_parent_subvol; + ret = bch2_subvolume_get_snapshot(trans, inode.bi_parent_subvol, &snapshot); + if (ret) + goto disconnected; + } + + CLASS(btree_iter, d_iter)(trans, BTREE_ID_dirents, + SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot), 0); + struct bkey_s_c_dirent d = bch2_bkey_get_typed(&d_iter, dirent); ret = bkey_err(d.s_c); if (ret) goto disconnected; struct qstr dirent_name = bch2_dirent_get_name(d); + prt_bytes_reversed(path, dirent_name.name, dirent_name.len); prt_char(path, '/'); - - bch2_trans_iter_exit(trans, &d_iter); } if (orig_pos == path->pos) prt_char(path, '/'); out: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto err; + ret = path->allocation_failure ? 
-ENOMEM : 0; if (ret) goto err; - reverse_bytes(path->buf + orig_pos, path->pos - orig_pos); + printbuf_reverse_from(path, orig_pos); + darray_exit(&inums); return 0; err: + darray_exit(&inums); return ret; disconnected: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto err; - - prt_str_reversed(path, "(disconnected)"); + prt_printf_reversed(path, "(disconnected at %llu.%u)", inum, snapshot); goto out; } +int bch2_inum_to_path(struct btree_trans *trans, + subvol_inum inum, + struct printbuf *path) +{ + return __bch2_inum_to_path(trans, inum.subvol, inum.inum, 0, path); +} + +int bch2_inum_snapshot_to_path(struct btree_trans *trans, u64 inum, u32 snapshot, + snapshot_id_list *snapshot_overwrites, + struct printbuf *path) +{ + return __bch2_inum_to_path(trans, 0, inum, snapshot, path); +} + /* fsck */ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, @@ -662,15 +742,14 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, bool in_fsck) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); struct btree_iter bp_iter = {}; int ret = 0; if (inode_points_to_dirent(target, d)) return 0; - if (!target->bi_dir && - !target->bi_dir_offset) { + if (!bch2_inode_has_backpointer(target)) { fsck_err_on(S_ISDIR(target->bi_mode), trans, inode_dir_missing_backpointer, "directory with missing backpointer\n%s", @@ -695,19 +774,9 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, return __bch2_fsck_write_inode(trans, target); } - if (bch2_inode_should_have_single_bp(target) && - !fsck_err(trans, inode_wrong_backpointer, - "dirent points to inode that does not point back:\n%s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_newline(&buf), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf))) - goto err; - - struct bkey_s_c_dirent bp_dirent = - bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents, - SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot), - 0, dirent); + bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_dirents, + SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot), 0); + struct bkey_s_c_dirent bp_dirent = bch2_bkey_get_typed(&bp_iter, dirent); ret = bkey_err(bp_dirent); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; @@ -730,6 +799,7 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, ret = __bch2_fsck_write_inode(trans, target); } } else { + printbuf_reset(&buf); bch2_bkey_val_to_text(&buf, c, d.s_c); prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); @@ -778,8 +848,7 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, out: err: fsck_err: - bch2_trans_iter_exit(trans, &bp_iter); - printbuf_exit(&buf); + bch2_trans_iter_exit(&bp_iter); bch_err_fn(c, ret); return ret; } @@ -791,7 +860,7 @@ int __bch2_check_dirent_target(struct btree_trans *trans, bool in_fsck) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = 0; ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck); @@ -819,13 +888,157 @@ int __bch2_check_dirent_target(struct btree_trans *trans, n->v.d_inum = cpu_to_le64(target->bi_inum); } - ret = bch2_trans_update(trans, dirent_iter, &n->k_i, 0); + ret = bch2_trans_update(trans, dirent_iter, &n->k_i, + BTREE_UPDATE_internal_snapshot_node); if (ret) goto err; } err: fsck_err: - printbuf_exit(&buf); bch_err_fn(c, ret); return ret; } + +/* + * BCH_INODE_has_case_insensitive: + * We have to track whether directories have any descendant
directory that is + * casefolded - for overlayfs: + */ + +static int bch2_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum) +{ + struct btree_iter iter = {}; + int ret = 0; + + while (true) { + struct bch_inode_unpacked inode; + ret = bch2_inode_peek(trans, &iter, &inode, inum, + BTREE_ITER_intent|BTREE_ITER_with_updates); + if (ret) + break; + + if (inode.bi_flags & BCH_INODE_has_case_insensitive) + break; + + inode.bi_flags |= BCH_INODE_has_case_insensitive; + ret = bch2_inode_write(trans, &iter, &inode); + if (ret) + break; + + bch2_trans_iter_exit(&iter); + if (subvol_inum_eq(inum, BCACHEFS_ROOT_SUBVOL_INUM)) + break; + + inum = parent_inum(inum, &inode); + } + + bch2_trans_iter_exit(&iter); + return ret; +} + +int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + if (!bch2_inode_casefold(trans->c, inode)) + return 0; + + inode->bi_flags |= BCH_INODE_has_case_insensitive; + + return bch2_propagate_has_case_insensitive(trans, parent_inum(inum, inode)); +} + +int bch2_check_inode_has_case_insensitive(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + snapshot_id_list *snapshot_overwrites, + bool *do_update) +{ + CLASS(printbuf, buf)(); + bool repairing_parents = false; + int ret = 0; + + if (!S_ISDIR(inode->bi_mode)) { + /* + * Old versions set bi_casefold for non dirs, but that's + * unnecessary and wasteful + */ + if (inode->bi_casefold) { + inode->bi_casefold = 0; + *do_update = true; + } + return 0; + } + + if (trans->c->sb.version < bcachefs_metadata_version_inode_has_case_insensitive) + return 0; + + if (bch2_inode_casefold(trans->c, inode) && + !(inode->bi_flags & BCH_INODE_has_case_insensitive)) { + prt_printf(&buf, "casefolded dir with has_case_insensitive not set\ninum %llu:%u ", + inode->bi_inum, inode->bi_snapshot); + + ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, inode->bi_snapshot, + snapshot_overwrites, &buf); + if (ret) + return ret; + + if (fsck_err(trans, inode_has_case_insensitive_not_set, "%s", buf.buf)) { + inode->bi_flags |= BCH_INODE_has_case_insensitive; + *do_update = true; + } + } + + if (!(inode->bi_flags & BCH_INODE_has_case_insensitive)) + goto out; + + struct bch_inode_unpacked dir = *inode; + u32 snapshot = dir.bi_snapshot; + + while (!(dir.bi_inum == BCACHEFS_ROOT_INO && + dir.bi_subvol == BCACHEFS_ROOT_SUBVOL)) { + if (dir.bi_parent_subvol) { + ret = bch2_subvolume_get_snapshot(trans, dir.bi_parent_subvol, &snapshot); + if (ret) + return ret; + + snapshot_overwrites = NULL; + } + + ret = bch2_inode_find_by_inum_snapshot(trans, dir.bi_dir, snapshot, &dir, 0); + if (ret) + return ret; + + if (!(dir.bi_flags & BCH_INODE_has_case_insensitive)) { + prt_printf(&buf, "parent of casefolded dir with has_case_insensitive not set\n"); + + ret = bch2_inum_snapshot_to_path(trans, dir.bi_inum, dir.bi_snapshot, + snapshot_overwrites, &buf); + if (ret) + return ret; + + if (fsck_err(trans, inode_parent_has_case_insensitive_not_set, "%s", buf.buf)) { + dir.bi_flags |= BCH_INODE_has_case_insensitive; + ret = __bch2_fsck_write_inode(trans, &dir); + if (ret) + return ret; + } + } + + /* + * We only need to check the first parent, unless we find an + * inconsistency + */ + if (!repairing_parents) + break; + } +out: +fsck_err: + if (ret) + return ret; + + if (repairing_parents) { + return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + bch_err_throw(trans->c, transaction_restart_nested); + } + + return 0; +} diff --git 
a/fs/bcachefs/namei.h b/fs/bcachefs/namei.h index 2e6f6364767f..ae6ebc2d0785 100644 --- a/fs/bcachefs/namei.h +++ b/fs/bcachefs/namei.h @@ -43,6 +43,8 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *, struct bch_inode_unpacked *); int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); +int bch2_inum_snapshot_to_path(struct btree_trans *, u64, u32, + snapshot_id_list *, struct printbuf *); int __bch2_check_dirent_target(struct btree_trans *, struct btree_iter *, @@ -69,4 +71,9 @@ static inline int bch2_check_dirent_target(struct btree_trans *trans, return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck); } +int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *); +int bch2_check_inode_has_case_insensitive(struct btree_trans *, struct bch_inode_unpacked *, + snapshot_id_list *, bool *); + #endif /* _BCACHEFS_NAMEI_H */ diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c index 3c21981a4a1c..58cfd540c6d6 100644 --- a/fs/bcachefs/nocow_locking.c +++ b/fs/bcachefs/nocow_locking.c @@ -47,7 +47,7 @@ bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l, int v, lock_val = flags ? 1 : -1; unsigned i; - spin_lock(&l->lock); + guard(spinlock)(&l->lock); for (i = 0; i < ARRAY_SIZE(l->b); i++) if (l->b[i] == dev_bucket) @@ -58,21 +58,19 @@ bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l, l->b[i] = dev_bucket; goto take_lock; } -fail: - spin_unlock(&l->lock); + return false; got_entry: v = atomic_read(&l->l[i]); if (lock_val > 0 ? v < 0 : v > 0) - goto fail; + return false; take_lock: v = atomic_read(&l->l[i]); /* Overflow? */ if (v && sign(v + lock_val) != sign(v)) - goto fail; + return false; atomic_add(lock_val, &l->l[i]); - spin_unlock(&l->lock); return true; } @@ -133,12 +131,10 @@ void bch2_fs_nocow_locking_exit(struct bch_fs *c) BUG_ON(atomic_read(&l->l[j])); } -int bch2_fs_nocow_locking_init(struct bch_fs *c) +void bch2_fs_nocow_locking_init_early(struct bch_fs *c) { struct bucket_nocow_lock_table *t = &c->nocow_locks; for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) spin_lock_init(&l->lock); - - return 0; } diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h index f9d6a426a960..48b8a003c0d2 100644 --- a/fs/bcachefs/nocow_locking.h +++ b/fs/bcachefs/nocow_locking.h @@ -45,6 +45,6 @@ static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t, void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *); void bch2_fs_nocow_locking_exit(struct bch_fs *); -int bch2_fs_nocow_locking_init(struct bch_fs *); +void bch2_fs_nocow_locking_init_early(struct bch_fs *); #endif /* _BCACHEFS_NOCOW_LOCKING_H */ diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index af3258814822..921f9049912d 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -7,7 +7,9 @@ #include "compress.h" #include "disk_groups.h" #include "error.h" +#include "movinggc.h" #include "opts.h" +#include "rebalance.h" #include "recovery_passes.h" #include "super-io.h" #include "util.h" @@ -19,6 +21,11 @@ const char * const bch2_error_actions[] = { NULL }; +const char * const bch2_degraded_actions[] = { + BCH_DEGRADED_ACTIONS() + NULL +}; + const char * const bch2_fsck_fix_opts[] = { BCH_FIX_ERRORS_OPTS() NULL @@ -273,20 +280,20 @@ int bch2_opt_lookup(const char *name) return -1; } -struct synonym { +struct opt_synonym { const char *s1, *s2; }; -static const struct synonym bch_opt_synonyms[] = { +static const 
struct opt_synonym bch2_opt_synonyms[] = { { "quota", "usrquota" }, }; static int bch2_mount_opt_lookup(const char *name) { - const struct synonym *i; + const struct opt_synonym *i; - for (i = bch_opt_synonyms; - i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); + for (i = bch2_opt_synonyms; + i < bch2_opt_synonyms + ARRAY_SIZE(bch2_opt_synonyms); i++) if (!strcmp(name, i->s1)) name = i->s2; @@ -294,6 +301,30 @@ static int bch2_mount_opt_lookup(const char *name) return bch2_opt_lookup(name); } +struct opt_val_synonym { + const char *opt, *v1, *v2; +}; + +static const struct opt_val_synonym bch2_opt_val_synonyms[] = { + { "degraded", "true", "yes" }, + { "degraded", "false", "no" }, + { "degraded", "1", "yes" }, + { "degraded", "0", "no" }, +}; + +static const char *bch2_opt_val_synonym_lookup(const char *opt, const char *val) +{ + const struct opt_val_synonym *i; + + for (i = bch2_opt_val_synonyms; + i < bch2_opt_val_synonyms + ARRAY_SIZE(bch2_opt_val_synonyms); + i++) + if (!strcmp(opt, i->opt) && !strcmp(val, i->v1)) + return i->v2; + + return val; +} + int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) { if (v < opt->min) { @@ -337,21 +368,22 @@ int bch2_opt_parse(struct bch_fs *c, { ssize_t ret; + if (err) + printbuf_indent_add_nextline(err, 2); + switch (opt->type) { case BCH_OPT_BOOL: - if (val) { - ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); - if (ret != -BCH_ERR_option_not_bool) { - *res = ret; - } else { - if (err) - prt_printf(err, "%s: must be bool", opt->attr.name); - return ret; - } + if (!val) + val = "1"; + + ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); + if (ret != -BCH_ERR_option_not_bool) { + *res = ret; } else { - *res = 1; + if (err) + prt_printf(err, "%s: must be bool", opt->attr.name); + return ret; } - break; case BCH_OPT_UINT: if (!val) { @@ -360,9 +392,15 @@ int bch2_opt_parse(struct bch_fs *c, return -EINVAL; } - ret = opt->flags & OPT_HUMAN_READABLE - ? bch2_strtou64_h(val, res) - : kstrtou64(val, 10, res); + if (*val != '-') { + ret = opt->flags & OPT_HUMAN_READABLE + ? 
bch2_strtou64_h(val, res) + : kstrtou64(val, 10, res); + } else { + prt_printf(err, "%s: must be a non-negative number", opt->attr.name); + return -BCH_ERR_option_negative; + } + if (ret < 0) { if (err) prt_printf(err, "%s: must be a number", @@ -480,7 +518,7 @@ void bch2_opts_to_text(struct printbuf *out, } } -int bch2_opt_check_may_set(struct bch_fs *c, struct bch_dev *ca, int id, u64 v) +int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id id, u64 v) { int ret = 0; @@ -498,15 +536,17 @@ int bch2_opt_check_may_set(struct bch_fs *c, struct bch_dev *ca, int id, u64 v) if (v) bch2_check_set_feature(c, BCH_FEATURE_ec); break; + default: + break; } return ret; } -int bch2_opts_check_may_set(struct bch_fs *c) +int bch2_opts_hooks_pre_set(struct bch_fs *c) { for (unsigned i = 0; i < bch2_opts_nr; i++) { - int ret = bch2_opt_check_may_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i)); + int ret = bch2_opt_hook_pre_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i)); if (ret) return ret; } @@ -514,11 +554,64 @@ int bch2_opts_check_may_set(struct bch_fs *c) return 0; } +void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, + struct bch_opts *new_opts, enum bch_opt_id id) +{ + switch (id) { + case Opt_foreground_target: + if (new_opts->foreground_target && + !new_opts->background_target) + bch2_set_rebalance_needs_scan(c, inum); + break; + case Opt_compression: + if (new_opts->compression && + !new_opts->background_compression) + bch2_set_rebalance_needs_scan(c, inum); + break; + case Opt_background_target: + if (new_opts->background_target) + bch2_set_rebalance_needs_scan(c, inum); + break; + case Opt_background_compression: + if (new_opts->background_compression) + bch2_set_rebalance_needs_scan(c, inum); + break; + case Opt_rebalance_enabled: + bch2_rebalance_wakeup(c); + break; + case Opt_copygc_enabled: + bch2_copygc_wakeup(c); + break; + case Opt_discard: + if (!ca) { + guard(mutex)(&c->sb_lock); + for_each_member_device(c, ca) { + struct bch_member *m = + bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx); + SET_BCH_MEMBER_DISCARD(m, c->opts.discard); + } + + bch2_write_super(c); + } + break; + case Opt_version_upgrade: + /* + * XXX: in the future we'll likely want to do compatible + * upgrades at runtime as well, but right now there's nothing + * that does that: + */ + if (new_opts->version_upgrade == BCH_VERSION_UPGRADE_incompatible) + bch2_sb_upgrade_incompat(c); + break; + default: + break; + } +} + int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, struct printbuf *parse_later, const char *name, const char *val) { - struct printbuf err = PRINTBUF; u64 v; int ret, id; @@ -536,47 +629,43 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, if (id < 0) return 0; + /* must have a value for synonym lookup - but OPT_FN is weird */ + if (!val && bch2_opt_table[id].type != BCH_OPT_FN) + val = "1"; + + val = bch2_opt_val_synonym_lookup(name, val); + if (!(bch2_opt_table[id].flags & OPT_MOUNT)) - goto bad_opt; + return -BCH_ERR_option_name; if (id == Opt_acl && !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) - goto bad_opt; + return -BCH_ERR_option_name; if ((id == Opt_usrquota || id == Opt_grpquota) && !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) - goto bad_opt; + return -BCH_ERR_option_name; + CLASS(printbuf, err)(); ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); if (ret == -BCH_ERR_option_needs_open_fs) { - ret = 0; - if (parse_later) { prt_printf(parse_later, "%s=%s,", name, val); if 
(parse_later->allocation_failure) - ret = -ENOMEM; + return -ENOMEM; } - goto out; + return 0; } if (ret < 0) - goto bad_val; + return -BCH_ERR_option_value; if (opts) bch2_opt_set_by_id(opts, id, v); - ret = 0; -out: - printbuf_exit(&err); - return ret; -bad_opt: - ret = -BCH_ERR_option_name; - goto out; -bad_val: - ret = -BCH_ERR_option_value; - goto out; + return 0; } int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, @@ -667,9 +756,11 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) return 0; } -void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, +bool __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, const struct bch_option *opt, u64 v) { + bool changed = false; + if (opt->flags & OPT_SB_FIELD_SECTORS) v >>= 9; @@ -679,26 +770,34 @@ void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, if (opt->flags & OPT_SB_FIELD_ONE_BIAS) v++; - if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) + if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) { + changed = v != opt->get_sb(sb); + opt->set_sb(sb, v); + } if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) { if (WARN(!bch2_member_exists(sb, dev_idx), "tried to set device option %s on nonexistent device %i", opt->attr.name, dev_idx)) - return; + return false; - opt->set_member(bch2_members_v2_get_mut(sb, dev_idx), v); + struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx); + changed = v != opt->get_member(m); + opt->set_member(m, v); } + + return changed; } -void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, +bool bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, const struct bch_option *opt, u64 v) { - mutex_lock(&c->sb_lock); - __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); + guard(mutex)(&c->sb_lock); + bool changed = __bch2_opt_set_sb(c->disk_sb.sb, ca ? 
ca->dev_idx : -1, opt, v); + if (changed) + bch2_write_super(c); + return changed; } /* io opts: */ diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index dfb14810124c..84ce69a7f131 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -11,6 +11,7 @@ struct bch_fs; extern const char * const bch2_error_actions[]; +extern const char * const bch2_degraded_actions[]; extern const char * const bch2_fsck_fix_opts[]; extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; @@ -149,12 +150,12 @@ enum fsck_err_opts { NULL, "Number of consecutive write errors allowed before kicking out a device")\ x(metadata_replicas, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ + OPT_UINT(1, BCH_REPLICAS_MAX + 1), \ BCH_SB_META_REPLICAS_WANT, 1, \ "#", "Number of metadata replicas") \ x(data_replicas, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ + OPT_UINT(1, BCH_REPLICAS_MAX + 1), \ BCH_SB_DATA_REPLICAS_WANT, 1, \ "#", "Number of data replicas") \ x(metadata_replicas_required, u8, \ @@ -164,7 +165,7 @@ enum fsck_err_opts { "#", NULL) \ x(data_replicas_required, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ + OPT_UINT(1, BCH_REPLICAS_MAX + 1), \ BCH_SB_DATA_REPLICAS_REQ, 1, \ "#", NULL) \ x(encoded_extent_max, u32, \ @@ -233,6 +234,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH_SB_CASEFOLD, false, \ NULL, "Dirent lookups are casefolded") \ + x(casefold_disabled, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Disable casefolding filesystem wide") \ x(inodes_32bit, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ @@ -307,14 +313,9 @@ enum fsck_err_opts { NULL, "Enable project quotas") \ x(degraded, u8, \ OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + OPT_STR(bch2_degraded_actions), \ + BCH_SB_DEGRADED_ACTION, BCH_DEGRADED_ask, \ NULL, "Allow mounting in degraded mode") \ - x(very_degraded, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Allow mounting in when data will be missing") \ x(no_splitbrain_check, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ @@ -383,6 +384,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Exit recovery immediately prior to journal replay")\ + x(journal_rewind, u64, \ + OPT_FS|OPT_MOUNT, \ + OPT_UINT(0, U64_MAX), \ + BCH2_NO_SB_OPT, 0, \ + NULL, "Rewind journal") \ x(recovery_passes, u64, \ OPT_FS|OPT_MOUNT, \ OPT_BITFIELD(bch2_recovery_passes), \ @@ -454,7 +460,7 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, false, \ NULL, "Reconstruct alloc btree") \ x(version_upgrade, u8, \ - OPT_FS|OPT_MOUNT, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_version_upgrade_opts), \ BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \ NULL, "Set superblock to latest version,\n" \ @@ -494,6 +500,17 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, true, \ NULL, "Enable rebalance: disable for debugging, or to\n"\ "quiet the system when doing performance testing\n")\ + x(rebalance_on_ac_only, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_REBALANCE_AC_ONLY, false, \ + NULL, "Enable rebalance while on mains power only\n") \ + x(auto_snapshot_deletion, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable automatic snapshot deletion: disable for debugging, or to\n"\ + "quiet the system when doing performance testing\n")\ x(no_data_io, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ @@ 
-512,7 +529,7 @@ enum fsck_err_opts { "size", "Specifies the bucket size; must be greater than the btree node size")\ x(durability, u8, \ OPT_DEVICE|OPT_RUNTIME|OPT_SB_FIELD_ONE_BIAS, \ - OPT_UINT(0, BCH_REPLICAS_MAX), \ + OPT_UINT(0, BCH_REPLICAS_MAX + 1), \ BCH_MEMBER_DURABILITY, 1, \ "n", "Data written to this device will be considered\n"\ "to have already been replicated n times") \ @@ -522,7 +539,7 @@ enum fsck_err_opts { BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ "types", "Allowed data types for this device: journal, btree, and/or user")\ x(discard, u8, \ - OPT_MOUNT|OPT_DEVICE|OPT_RUNTIME, \ + OPT_MOUNT|OPT_FS|OPT_DEVICE|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_MEMBER_DISCARD, true, \ NULL, "Enable discard/TRIM support") \ @@ -530,7 +547,7 @@ enum fsck_err_opts { OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ - NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\ + NULL, "BTREE_ITER_prefetch causes btree nodes to be\n"\ " prefetched sequentially") struct bch_opts { @@ -616,10 +633,10 @@ void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int); int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); -void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); +bool __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); struct bch_dev; -void bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64); +bool bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64); int bch2_opt_lookup(const char *); int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); @@ -636,8 +653,11 @@ void bch2_opts_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, unsigned, unsigned, unsigned); -int bch2_opt_check_may_set(struct bch_fs *, struct bch_dev *, int, u64); -int bch2_opts_check_may_set(struct bch_fs *); +int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, enum bch_opt_id, u64); +int bch2_opts_hooks_pre_set(struct bch_fs *); +void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64, + struct bch_opts *, enum bch_opt_id); + int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *, struct printbuf *, const char *, const char *); int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *, diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h index 1ca476adbf6f..907e5c97550b 100644 --- a/fs/bcachefs/printbuf.h +++ b/fs/bcachefs/printbuf.h @@ -140,6 +140,14 @@ void bch2_prt_bitflags_vector(struct printbuf *, const char * const[], .size = _size, \ }) +static inline struct printbuf bch2_printbuf_init(void) +{ + return PRINTBUF; +} + +DEFINE_CLASS(printbuf, struct printbuf, + bch2_printbuf_exit(&_T), bch2_printbuf_init(), void) + /* * Returns size remaining of output buffer: */ @@ -287,4 +295,8 @@ static inline void printbuf_atomic_dec(struct printbuf *buf) buf->atomic--; } +DEFINE_GUARD(printbuf_atomic, struct printbuf *, + printbuf_atomic_inc(_T), + printbuf_atomic_dec(_T)); + #endif /* _BCACHEFS_PRINTBUF_H */ diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c index d09898566abe..792fc6fef270 100644 --- a/fs/bcachefs/progress.c +++ b/fs/bcachefs/progress.c @@ -46,16 +46,16 @@ void bch2_progress_update_iter(struct btree_trans *trans, s->last_node = b; if (progress_update_p(s)) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); unsigned percent = s->nodes_total ? 
div64_u64(s->nodes_seen * 100, s->nodes_total) : 0; prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", - msg, percent, s->nodes_seen, s->nodes_total); + strip_bch2(msg), + percent, s->nodes_seen, s->nodes_total); bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); } } diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h index 23fb1811f943..972a73087ffe 100644 --- a/fs/bcachefs/progress.h +++ b/fs/bcachefs/progress.h @@ -26,4 +26,7 @@ void bch2_progress_update_iter(struct btree_trans *, struct btree_iter *, const char *); +#define progress_update_iter(trans, p, iter) \ + bch2_progress_update_iter(trans, p, iter, __func__) + #endif /* _BCACHEFS_PROGRESS_H */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 3d4755d73af7..eaa43ad9baa6 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -394,12 +394,10 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k, dq = bkey_s_c_to_quota(k); q = &c->quotas[k.k->p.inode]; - mutex_lock(&q->lock); + guard(mutex)(&q->lock); mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL); - if (!mq) { - mutex_unlock(&q->lock); + if (!mq) return -ENOMEM; - } for (i = 0; i < Q_COUNTERS; i++) { mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit); @@ -414,8 +412,6 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k, mq->c[Q_INO].timer = qdq->d_ino_timer; if (qdq && qdq->d_fieldmask & QC_INO_WARNS) mq->c[Q_INO].warns = qdq->d_ino_warns; - - mutex_unlock(&q->lock); } return 0; @@ -516,30 +512,27 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans, bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, KEY_TYPE_QUOTA_NOCHECK); advance: - bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos)); + bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); return 0; } int bch2_fs_quota_read(struct bch_fs *c) { + scoped_guard(mutex, &c->sb_lock) { + struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); + if (!sb_quota) + return bch_err_throw(c, ENOSPC_sb_quota); - mutex_lock(&c->sb_lock); - struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); - if (!sb_quota) { - mutex_unlock(&c->sb_lock); - return -BCH_ERR_ENOSPC_sb_quota; + bch2_sb_quota_read(c); } - bch2_sb_quota_read(c); - mutex_unlock(&c->sb_lock); - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN, + CLASS(btree_trans, trans)(c); + int ret = for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN, BTREE_ITER_prefetch, k, __bch2_quota_set(c, k, NULL)) ?: for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - bch2_fs_quota_read_inode(trans, &iter, k))); + bch2_fs_quota_read_inode(trans, &iter, k)); bch_err_fn(c, ret); return ret; } @@ -550,7 +543,6 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags) { struct bch_fs *c = sb->s_fs_info; struct bch_sb_field_quota *sb_quota; - int ret = 0; if (sb->s_flags & SB_RDONLY) return -EROFS; @@ -569,11 +561,12 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags) if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota) return -EINVAL; - mutex_lock(&c->sb_lock); + guard(mutex)(&c->sb_lock); sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); if (!sb_quota) { - ret = -BCH_ERR_ENOSPC_sb_quota; - goto unlock; + int ret = bch_err_throw(c, ENOSPC_sb_quota); + bch_err_fn(c, ret); + return ret; } if (uflags & FS_QUOTA_UDQ_ENFD) @@ -586,10 +579,7 @@ static int 
bch2_quota_enable(struct super_block *sb, unsigned uflags) SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); bch2_write_super(c); -unlock: - mutex_unlock(&c->sb_lock); - - return bch2_err_class(ret); + return 0; } static int bch2_quota_disable(struct super_block *sb, unsigned uflags) @@ -599,7 +589,7 @@ static int bch2_quota_disable(struct super_block *sb, unsigned uflags) if (sb->s_flags & SB_RDONLY) return -EROFS; - mutex_lock(&c->sb_lock); + guard(mutex)(&c->sb_lock); if (uflags & FS_QUOTA_UDQ_ENFD) SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false); @@ -610,8 +600,6 @@ static int bch2_quota_disable(struct super_block *sb, unsigned uflags) SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false); bch2_write_super(c); - mutex_unlock(&c->sb_lock); - return 0; } @@ -700,14 +688,12 @@ static int bch2_quota_set_info(struct super_block *sb, int type, { struct bch_fs *c = sb->s_fs_info; struct bch_sb_field_quota *sb_quota; - int ret = 0; if (0) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); qc_info_to_text(&buf, info); pr_info("setting:\n%s", buf.buf); - printbuf_exit(&buf); } if (sb->s_flags & SB_RDONLY) @@ -723,11 +709,12 @@ static int bch2_quota_set_info(struct super_block *sb, int type, ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) return -EINVAL; - mutex_lock(&c->sb_lock); + guard(mutex)(&c->sb_lock); sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); if (!sb_quota) { - ret = -BCH_ERR_ENOSPC_sb_quota; - goto unlock; + int ret = bch_err_throw(c, ENOSPC_sb_quota); + bch_err_fn(c, ret); + return bch2_err_class(ret); } if (info->i_fieldmask & QC_SPC_TIMER) @@ -749,10 +736,7 @@ static int bch2_quota_set_info(struct super_block *sb, int type, bch2_sb_quota_read(c); bch2_write_super(c); -unlock: - mutex_unlock(&c->sb_lock); - - return bch2_err_class(ret); + return 0; } /* Get/set individual quotas: */ @@ -778,15 +762,13 @@ static int bch2_get_quota(struct super_block *sb, struct kqid kqid, struct bch_fs *c = sb->s_fs_info; struct bch_memquota_type *q = &c->quotas[kqid.type]; qid_t qid = from_kqid(&init_user_ns, kqid); - struct bch_memquota *mq; memset(qdq, 0, sizeof(*qdq)); - mutex_lock(&q->lock); - mq = genradix_ptr(&q->table, qid); + guard(mutex)(&q->lock); + struct bch_memquota *mq = genradix_ptr(&q->table, qid); if (mq) __bch2_quota_get(qdq, mq); - mutex_unlock(&q->lock); return 0; } @@ -799,34 +781,27 @@ static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, qid_t qid = from_kqid(&init_user_ns, *kqid); struct genradix_iter iter; struct bch_memquota *mq; - int ret = 0; - mutex_lock(&q->lock); + guard(mutex)(&q->lock); genradix_for_each_from(&q->table, iter, mq, qid) if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) { __bch2_quota_get(qdq, mq); *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); - goto found; + return 0; } - ret = -ENOENT; -found: - mutex_unlock(&q->lock); - return bch2_err_class(ret); + return -ENOENT; } static int bch2_set_quota_trans(struct btree_trans *trans, struct bkey_i_quota *new_quota, struct qc_dqblk *qdq) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p, - BTREE_ITER_slots|BTREE_ITER_intent); - ret = bkey_err(k); + CLASS(btree_iter, iter)(trans, BTREE_ID_quotas, new_quota->k.p, + BTREE_ITER_slots|BTREE_ITER_intent); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); if (unlikely(ret)) return ret; @@ -843,33 +818,29 @@ static int bch2_set_quota_trans(struct btree_trans *trans, if (qdq->d_fieldmask & QC_INO_HARD) 
new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); - ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0); - bch2_trans_iter_exit(trans, &iter); - return ret; + return bch2_trans_update(trans, &iter, &new_quota->k_i, 0); } static int bch2_set_quota(struct super_block *sb, struct kqid qid, struct qc_dqblk *qdq) { struct bch_fs *c = sb->s_fs_info; - struct bkey_i_quota new_quota; - int ret; if (0) { - struct printbuf buf = PRINTBUF; - + CLASS(printbuf, buf)(); qc_dqblk_to_text(&buf, qdq); pr_info("setting:\n%s", buf.buf); - printbuf_exit(&buf); } if (sb->s_flags & SB_RDONLY) return -EROFS; + struct bkey_i_quota new_quota; bkey_quota_init(&new_quota.k_i); new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); - ret = bch2_trans_commit_do(c, NULL, NULL, 0, + CLASS(btree_trans, trans)(c); + int ret = commit_do(trans, NULL, NULL, 0, bch2_set_quota_trans(trans, &new_quota, qdq)) ?: __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq); diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c index bef2aa1b8bcd..b1438be9d690 100644 --- a/fs/bcachefs/rcu_pending.c +++ b/fs/bcachefs/rcu_pending.c @@ -182,11 +182,6 @@ static inline void kfree_bulk(size_t nr, void ** p) while (nr--) kfree(*p); } - -#define local_irq_save(flags) \ -do { \ - flags = 0; \ -} while (0) #endif static noinline void __process_finished_items(struct rcu_pending *pending, @@ -429,9 +424,15 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN)); - local_irq_save(flags); - p = this_cpu_ptr(pending->p); - spin_lock(&p->lock); + /* We could technically be scheduled before taking the lock and end up + * using a different cpu's rcu_pending_pcpu: that's ok, it needs a lock + * anyways + * + * And we have to do it this way to avoid breaking PREEMPT_RT, which + * redefines how spinlocks work: + */ + p = raw_cpu_ptr(pending->p); + spin_lock_irqsave(&p->lock, flags); rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu); restart: if (may_sleep && @@ -520,9 +521,8 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, goto free_node; } - local_irq_save(flags); - p = this_cpu_ptr(pending->p); - spin_lock(&p->lock); + p = raw_cpu_ptr(pending->p); + spin_lock_irqsave(&p->lock, flags); goto restart; } diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 623273556aa9..c0c5fe961a83 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -15,6 +15,7 @@ #include "inode.h" #include "io_write.h" #include "move.h" +#include "progress.h" #include "rebalance.h" #include "subvolume.h" #include "super-io.h" @@ -80,6 +81,7 @@ static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, unsigned ptr_bit = 1; unsigned rewrite_ptrs = 0; + guard(rcu)(); bkey_for_each_ptr(ptrs, ptr) { if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) rewrite_ptrs |= ptr_bit; @@ -95,6 +97,9 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 0; + return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | bch2_bkey_ptrs_need_move(c, opts, ptrs); } @@ -107,6 +112,9 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) if (!opts) return 0; + if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 0; + const union bch_extent_entry *entry; struct 
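/*
 * The rcu_pending change above swaps "local_irq_save(); this_cpu_ptr();
 * spin_lock()" for "raw_cpu_ptr(); spin_lock_irqsave()".  Being migrated
 * between picking the bucket and locking it is harmless because the
 * per-bucket lock, not CPU pinning, protects the data; and on PREEMPT_RT,
 * where spinlocks become sleeping locks, taking one inside a bare
 * local_irq_save() region would not be legal anyway.  A user-space sketch of
 * the same reasoning, with a random bucket standing in for raw_cpu_ptr():
 */
#include <pthread.h>
#include <stdlib.h>

struct bucket {
	pthread_mutex_t lock;
	int pending;
};

static struct bucket buckets[4] = {
	{ PTHREAD_MUTEX_INITIALIZER }, { PTHREAD_MUTEX_INITIALIZER },
	{ PTHREAD_MUTEX_INITIALIZER }, { PTHREAD_MUTEX_INITIALIZER },
};

static void enqueue(int v)
{
	/* like raw_cpu_ptr(): only a locality hint, any bucket is correct */
	struct bucket *b = &buckets[rand() & 3];

	/* like spin_lock_irqsave(): the lock alone makes the access safe */
	pthread_mutex_lock(&b->lock);
	b->pending += v;
	pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	enqueue(1);
	return 0;
}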
extent_ptr_decoded p; u64 sectors = 0; @@ -126,10 +134,13 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) } } incompressible: - if (opts->background_target) + if (opts->background_target) { + guard(rcu)(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) + if (!p.ptr.cached && + !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) sectors += p.crc.compressed_size; + } return sectors; } @@ -210,7 +221,7 @@ int bch2_get_update_rebalance_opts(struct btree_trans *trans, return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?: bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_nested; + bch_err_throw(trans->c, transaction_restart_nested); } #define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) @@ -224,43 +235,34 @@ static const char * const bch2_rebalance_state_strs[] = { int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum) { - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_i_cookie *cookie; - u64 v; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, - SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), - BTREE_ITER_intent); - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); + CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_work, + SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), + BTREE_ITER_intent); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); if (ret) - goto err; + return ret; - v = k.k->type == KEY_TYPE_cookie + u64 v = k.k->type == KEY_TYPE_cookie ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) : 0; - cookie = bch2_trans_kmalloc(trans, sizeof(*cookie)); + struct bkey_i_cookie *cookie = bch2_trans_kmalloc(trans, sizeof(*cookie)); ret = PTR_ERR_OR_ZERO(cookie); if (ret) - goto err; + return ret; bkey_cookie_init(&cookie->k_i); cookie->k.p = iter.pos; cookie->v.cookie = cpu_to_le64(v + 1); - ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + return bch2_trans_update(trans, &iter, &cookie->k_i, 0); } int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) { - int ret = bch2_trans_commit_do(c, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, + CLASS(btree_trans, trans)(c); + int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_set_rebalance_needs_scan_trans(trans, inum)); bch2_rebalance_wakeup(c); return ret; @@ -273,35 +275,28 @@ int bch2_set_fs_needs_rebalance(struct bch_fs *c) static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie) { - struct btree_iter iter; - struct bkey_s_c k; - u64 v; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, - SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), - BTREE_ITER_intent); - k = bch2_btree_iter_peek_slot(trans, &iter); - ret = bkey_err(k); + CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_work, + SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), + BTREE_ITER_intent); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); if (ret) - goto err; + return ret; - v = k.k->type == KEY_TYPE_cookie + u64 v = k.k->type == KEY_TYPE_cookie ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) : 0; - if (v == cookie) - ret = bch2_btree_delete_at(trans, &iter, 0); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + return v == cookie + ? 
bch2_btree_delete_at(trans, &iter, 0) + : 0; } static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans, struct btree_iter *work_iter) { return !kthread_should_stop() - ? bch2_btree_iter_peek(trans, work_iter) + ? bch2_btree_iter_peek(work_iter) : bkey_s_c_null; } @@ -330,12 +325,12 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, { struct bch_fs *c = trans->c; - bch2_trans_iter_exit(trans, extent_iter); + bch2_trans_iter_exit(extent_iter); bch2_trans_iter_init(trans, extent_iter, work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, work_pos, BTREE_ITER_all_snapshots); - struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(extent_iter); if (bkey_err(k)) return k; @@ -363,7 +358,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, } if (trace_rebalance_extent_enabled()) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_bkey_val_to_text(&buf, c, k); prt_newline(&buf); @@ -389,7 +384,6 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, } trace_rebalance_extent(c, buf.buf); - printbuf_exit(&buf); } return k; @@ -433,7 +427,7 @@ static int do_rebalance_extent(struct moving_context *ctxt, if (bch2_err_matches(ret, ENOMEM)) { /* memory allocation failure, wait for some IO to finish */ bch2_move_ctxt_wait_for_io(ctxt); - ret = -BCH_ERR_transaction_restart_nested; + ret = bch_err_throw(c, transaction_restart_nested); } if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -447,22 +441,11 @@ static int do_rebalance_extent(struct moving_context *ctxt, return ret; } -static bool rebalance_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); - data_opts->target = io_opts->background_target; - data_opts->write_flags |= BCH_WRITE_only_specified_devs; - return data_opts->rewrite_ptrs != 0; -} - static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) { struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; struct bch_fs_rebalance *r = &trans->c->rebalance; - int ret; bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); ctxt->stats = &r->scan_stats; @@ -477,11 +460,34 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) r->state = BCH_REBALANCE_scanning; - ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?: - commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_clear_rebalance_needs_scan(trans, inum, cookie)); + struct per_snapshot_io_opts snapshot_io_opts; + per_snapshot_io_opts_init(&snapshot_io_opts, c); + + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, + r->scan_start.pos, r->scan_end.pos, + BTREE_ITER_all_snapshots| + BTREE_ITER_not_extents| + BTREE_ITER_prefetch, k, ({ + ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); + struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans, + &snapshot_io_opts, iter.pos, &iter, k); + PTR_ERR_OR_ZERO(io_opts); + })) ?: + commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_clear_rebalance_needs_scan(trans, inum, cookie)); + + per_snapshot_io_opts_exit(&snapshot_io_opts); bch2_move_stats_exit(&r->scan_stats, trans->c); + + /* + * Ensure that the rebalance_work entries we created are seen by the + * next iteration of do_rebalance(), so we don't end up stuck in + * rebalance_wait(): + */ + 
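/*
 * The scan trigger being rewritten above is a versioned token: requesting a
 * scan bumps a cookie stored in the rebalance_work btree, and when the scan
 * finishes, the token is deleted only if the cookie still matches the value
 * sampled at scan start, so a request that races in mid-scan is not lost.
 * Self-contained sketch of the idea (names are illustrative):
 */
#include <stdio.h>

struct scan_token {
	unsigned long long cookie;	/* bumped on every request */
	int present;
};

/* like bch2_set_rebalance_needs_scan_trans(): (re)arm and version the token */
static unsigned long long request_scan(struct scan_token *t)
{
	t->present = 1;
	return ++t->cookie;
}

/* like bch2_clear_rebalance_needs_scan(): clear only if no request raced in */
static void scan_done(struct scan_token *t, unsigned long long seen)
{
	if (t->cookie == seen)
		t->present = 0;
}

int main(void)
{
	struct scan_token t = { 0, 0 };
	unsigned long long seen = request_scan(&t);

	request_scan(&t);		/* second request arrives mid-scan */
	scan_done(&t, seen);		/* stale cookie: token survives */
	printf("%d\n", t.present);	/* prints 1: the new request is kept */
	return 0;
}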
atomic64_inc(&r->scan_stats.sectors_seen); + bch2_btree_write_buffer_flush_sync(trans); + return ret; } @@ -503,7 +509,14 @@ static void rebalance_wait(struct bch_fs *c) r->state = BCH_REBALANCE_waiting; } - bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); + bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); +} + +static bool bch2_rebalance_enabled(struct bch_fs *c) +{ + return c->opts.rebalance_enabled && + !(c->opts.rebalance_on_ac_only && + c->rebalance.on_battery); } static int do_rebalance(struct moving_context *ctxt) @@ -511,8 +524,9 @@ static int do_rebalance(struct moving_context *ctxt) struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; struct bch_fs_rebalance *r = &c->rebalance; - struct btree_iter rebalance_work_iter, extent_iter = {}; + struct btree_iter extent_iter = { NULL }; struct bkey_s_c k; + u32 kick = r->kick; int ret = 0; bch2_trans_begin(trans); @@ -520,14 +534,14 @@ static int do_rebalance(struct moving_context *ctxt) bch2_move_stats_init(&r->work_stats, "rebalance_work"); bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); - bch2_trans_iter_init(trans, &rebalance_work_iter, - BTREE_ID_rebalance_work, POS_MIN, - BTREE_ITER_all_snapshots); + CLASS(btree_iter, rebalance_work_iter)(trans, + BTREE_ID_rebalance_work, POS_MIN, + BTREE_ITER_all_snapshots); while (!bch2_move_ratelimit(ctxt)) { - if (!c->opts.rebalance_enabled) { + if (!bch2_rebalance_enabled(c)) { bch2_moving_ctxt_flush_all(ctxt); - kthread_wait_freezable(c->opts.rebalance_enabled || + kthread_wait_freezable(bch2_rebalance_enabled(c) || kthread_should_stop()); } @@ -552,17 +566,17 @@ static int do_rebalance(struct moving_context *ctxt) if (ret) break; - bch2_btree_iter_advance(trans, &rebalance_work_iter); + bch2_btree_iter_advance(&rebalance_work_iter); } - bch2_trans_iter_exit(trans, &extent_iter); - bch2_trans_iter_exit(trans, &rebalance_work_iter); + bch2_trans_iter_exit(&extent_iter); bch2_move_stats_exit(&r->scan_stats, c); if (!ret && !kthread_should_stop() && !atomic64_read(&r->work_stats.sectors_seen) && - !atomic64_read(&r->scan_stats.sectors_seen)) { + !atomic64_read(&r->scan_stats.sectors_seen) && + kick == r->kick) { bch2_moving_ctxt_flush_all(ctxt); bch2_trans_unlock_long(trans); rebalance_wait(c); @@ -585,7 +599,7 @@ static int bch2_rebalance_thread(void *arg) * Data move operations can't run until after check_snapshots has * completed, and bch2_snapshot_is_ancestor() is available. 
*/ - kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots || + kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots || kthread_should_stop()); bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats, @@ -646,11 +660,12 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) } prt_newline(out); - rcu_read_lock(); - struct task_struct *t = rcu_dereference(c->rebalance.thread); - if (t) - get_task_struct(t); - rcu_read_unlock(); + struct task_struct *t; + scoped_guard(rcu) { + t = rcu_dereference(c->rebalance.thread); + if (t) + get_task_struct(t); + } if (t) { bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); @@ -681,17 +696,15 @@ void bch2_rebalance_stop(struct bch_fs *c) int bch2_rebalance_start(struct bch_fs *c) { - struct task_struct *p; - int ret; - if (c->rebalance.thread) return 0; if (c->opts.nochanges) return 0; - p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); - ret = PTR_ERR_OR_ZERO(p); + struct task_struct *p = + kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); + int ret = PTR_ERR_OR_ZERO(p); bch_err_msg(c, ret, "creating rebalance thread"); if (ret) return ret; @@ -702,7 +715,152 @@ int bch2_rebalance_start(struct bch_fs *c) return 0; } -void bch2_fs_rebalance_init(struct bch_fs *c) +#ifdef CONFIG_POWER_SUPPLY +#include <linux/power_supply.h> + +static int bch2_rebalance_power_notifier(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier); + + c->rebalance.on_battery = !power_supply_is_system_supplied(); + bch2_rebalance_wakeup(c); + return NOTIFY_OK; +} +#endif + +void bch2_fs_rebalance_exit(struct bch_fs *c) +{ +#ifdef CONFIG_POWER_SUPPLY + power_supply_unreg_notifier(&c->rebalance.power_notifier); +#endif +} + +int bch2_fs_rebalance_init(struct bch_fs *c) +{ + struct bch_fs_rebalance *r = &c->rebalance; + + bch2_pd_controller_init(&r->pd); + +#ifdef CONFIG_POWER_SUPPLY + r->power_notifier.notifier_call = bch2_rebalance_power_notifier; + int ret = power_supply_reg_notifier(&r->power_notifier); + if (ret) + return ret; + + r->on_battery = !power_supply_is_system_supplied(); +#endif + return 0; +} + +static int check_rebalance_work_one(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct btree_iter *rebalance_iter, + struct bkey_buf *last_flushed) { - bch2_pd_controller_init(&c->rebalance.pd); + struct bch_fs *c = trans->c; + struct bkey_s_c extent_k, rebalance_k; + CLASS(printbuf, buf)(); + + int ret = bkey_err(extent_k = bch2_btree_iter_peek(extent_iter)) ?: + bkey_err(rebalance_k = bch2_btree_iter_peek(rebalance_iter)); + if (ret) + return ret; + + if (!extent_k.k && + extent_iter->btree_id == BTREE_ID_reflink && + (!rebalance_k.k || + rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) { + bch2_trans_iter_exit(extent_iter); + bch2_trans_iter_init(trans, extent_iter, + BTREE_ID_extents, POS_MIN, + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots); + return bch_err_throw(c, transaction_restart_nested); + } + + if (!extent_k.k && !rebalance_k.k) + return 1; + + int cmp = bpos_cmp(extent_k.k ? extent_k.k->p : SPOS_MAX, + rebalance_k.k ? 
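/*
 * The r->kick counter added to do_rebalance() above is a wakeup sequence
 * number: bch2_rebalance_wakeup() increments it, the thread samples it at the
 * top of its loop, and it only sleeps if the counter is unchanged
 * ("kick == r->kick"), so a wakeup that arrives while it is working is never
 * lost.  Minimal sketch of the pattern:
 */
static unsigned kick;			/* bumped by wakers */

static void kick_worker(void)
{
	kick++;				/* then wake the thread, as the wakeup helper does */
}

static int worker_should_sleep(unsigned seen)
{
	/*
	 * seen was sampled before this pass of work started; if it still
	 * matches, nothing new arrived and sleeping cannot miss a request
	 */
	return seen == kick;
}

int main(void)
{
	unsigned seen = kick;

	kick_worker();			/* request racing with the work pass */
	return worker_should_sleep(seen);	/* 0: go around again */
}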
rebalance_k.k->p : SPOS_MAX); + + struct bkey deleted; + bkey_init(&deleted); + + if (cmp < 0) { + deleted.p = extent_k.k->p; + rebalance_k.k = &deleted; + } else if (cmp > 0) { + deleted.p = rebalance_k.k->p; + extent_k.k = &deleted; + } + + bool should_have_rebalance = + bch2_bkey_sectors_need_rebalance(c, extent_k) != 0; + bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set; + + if (should_have_rebalance != have_rebalance) { + ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed); + if (ret) + return ret; + + bch2_bkey_val_to_text(&buf, c, extent_k); + } + + if (fsck_err_on(!should_have_rebalance && have_rebalance, + trans, rebalance_work_incorrectly_set, + "rebalance work incorrectly set\n%s", buf.buf)) { + ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, + extent_k.k->p, false); + if (ret) + return ret; + } + + if (fsck_err_on(should_have_rebalance && !have_rebalance, + trans, rebalance_work_incorrectly_unset, + "rebalance work incorrectly unset\n%s", buf.buf)) { + ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, + extent_k.k->p, true); + if (ret) + return ret; + } + + if (cmp <= 0) + bch2_btree_iter_advance(extent_iter); + if (cmp >= 0) + bch2_btree_iter_advance(rebalance_iter); +fsck_err: + return ret; +} + +int bch2_check_rebalance_work(struct bch_fs *c) +{ + CLASS(btree_trans, trans)(c); + CLASS(btree_iter, extent_iter)(trans, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_prefetch); + CLASS(btree_iter, rebalance_iter)(trans, BTREE_ID_rebalance_work, POS_MIN, + BTREE_ITER_prefetch); + + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_rebalance_work)); + + int ret = 0; + while (!ret) { + progress_update_iter(trans, &progress, &rebalance_iter); + + bch2_trans_begin(trans); + + ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + } + + bch2_bkey_buf_exit(&last_flushed, c); + return ret < 0 ? 
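/*
 * check_rebalance_work_one() above is one step of a classic merge-join over
 * two sorted keyspaces (extents and rebalance_work).  The side with the
 * smaller position is paired against a synthesized deleted key on the other
 * side, an exhausted iterator compares as SPOS_MAX, and the "cmp <= 0 /
 * cmp >= 0" pair advances both sides on a tie.  Self-contained sketch over
 * two sorted arrays:
 */
#include <limits.h>
#include <stdio.h>

static void merge_join(const int *a, int na, const int *b, int nb)
{
	int i = 0, j = 0;

	while (i < na || j < nb) {
		/* an exhausted side compares as +infinity, like SPOS_MAX */
		int ka = i < na ? a[i] : INT_MAX;
		int kb = j < nb ? b[j] : INT_MAX;
		int cmp = (ka > kb) - (ka < kb);

		if (cmp < 0)
			printf("%d: extent without rebalance entry\n", ka);
		else if (cmp > 0)
			printf("%d: rebalance entry without extent\n", kb);

		if (cmp <= 0)
			i++;		/* advance left on <= ... */
		if (cmp >= 0)
			j++;		/* ... and right on >=, so ties advance both */
	}
}

int main(void)
{
	const int extents[] = { 1, 3, 4 }, work[] = { 3, 5 };

	merge_join(extents, 3, work, 2);
	return 0;
}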
ret : 0; } diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index e5e8eb4a2dd1..7a565ea7dbfc 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -39,19 +39,21 @@ int bch2_set_fs_needs_rebalance(struct bch_fs *); static inline void bch2_rebalance_wakeup(struct bch_fs *c) { - struct task_struct *p; - - rcu_read_lock(); - p = rcu_dereference(c->rebalance.thread); + c->rebalance.kick++; + guard(rcu)(); + struct task_struct *p = rcu_dereference(c->rebalance.thread); if (p) wake_up_process(p); - rcu_read_unlock(); } void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *); void bch2_rebalance_stop(struct bch_fs *); int bch2_rebalance_start(struct bch_fs *); -void bch2_fs_rebalance_init(struct bch_fs *); + +void bch2_fs_rebalance_exit(struct bch_fs *); +int bch2_fs_rebalance_init(struct bch_fs *); + +int bch2_check_rebalance_work(struct bch_fs *); #endif /* _BCACHEFS_REBALANCE_H */ diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h index fe5098c17dfc..c659da149fa3 100644 --- a/fs/bcachefs/rebalance_types.h +++ b/fs/bcachefs/rebalance_types.h @@ -18,6 +18,7 @@ enum bch_rebalance_states { struct bch_fs_rebalance { struct task_struct __rcu *thread; + u32 kick; struct bch_pd_controller pd; enum bch_rebalance_states state; @@ -30,6 +31,11 @@ struct bch_fs_rebalance { struct bbpos scan_start; struct bbpos scan_end; struct bch_move_stats scan_stats; + + bool on_battery; +#ifdef CONFIG_POWER_SUPPLY + struct notifier_block power_notifier; +#endif }; #endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d6c4ef819d40..c57ff235a97a 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -33,77 +33,86 @@ #include #include - -int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) +int bch2_btree_lost_data(struct bch_fs *c, + struct printbuf *msg, + enum btree_id btree) { - u64 b = BIT_ULL(btree); int ret = 0; - mutex_lock(&c->sb_lock); + guard(mutex)(&c->sb_lock); + bool write_sb = false; struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - if (!(c->sb.btrees_lost_data & b)) { - struct printbuf buf = PRINTBUF; - bch2_btree_id_to_text(&buf, btree); - bch_err(c, "flagging btree %s lost data", buf.buf); - printbuf_exit(&buf); - ext->btrees_lost_data |= cpu_to_le64(b); + if (!(c->sb.btrees_lost_data & BIT_ULL(btree))) { + prt_printf(msg, "flagging btree "); + bch2_btree_id_to_text(msg, btree); + prt_printf(msg, " lost data\n"); + + write_sb |= !__test_and_set_bit_le64(btree, &ext->btrees_lost_data); } /* Once we have runtime self healing for topology errors we won't need this: */ - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0, &write_sb) ?: ret; /* Btree node accounting will be off: */ - __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; + write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0, &write_sb) ?: ret; #ifdef CONFIG_BCACHEFS_DEBUG /* * These are much more minor, and don't need to be corrected right away, * but in debug mode we want the next fsck run to be clean: */ - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: 
ret; - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0, &write_sb) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0, &write_sb) ?: ret; #endif + write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent); + write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_backpointer_to_missing_ptr, ext->errors_silent); + write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); + write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); + switch (btree) { case BTREE_ID_alloc: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; - - __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); - __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret; + + write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); + write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); + write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); + write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); goto out; case BTREE_ID_backpointers: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0, &write_sb) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0, &write_sb) ?: ret; goto out; case BTREE_ID_need_discard: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret; goto out; case BTREE_ID_freespace: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret; goto out; case BTREE_ID_bucket_gens: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret; goto out; case BTREE_ID_lru: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret; goto out; case BTREE_ID_accounting: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; + ret = 
__bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0, &write_sb) ?: ret; + goto out; + case BTREE_ID_snapshots: + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0, &write_sb) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0, &write_sb) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0, &write_sb) ?: ret; goto out; default: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0, &write_sb) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0, &write_sb) ?: ret; goto out; } out: - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - + if (write_sb) + bch2_write_super(c); return ret; } @@ -114,12 +123,9 @@ static void kill_btree(struct bch_fs *c, enum btree_id btree) } /* for -o reconstruct_alloc: */ -static void bch2_reconstruct_alloc(struct bch_fs *c) +void bch2_reconstruct_alloc(struct bch_fs *c) { - bch2_journal_log_msg(c, "dropping alloc info"); - bch_info(c, "dropping and reconstructing all alloc info"); - - mutex_lock(&c->sb_lock); + guard(mutex)(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required); @@ -160,8 +166,9 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info)); + bch2_write_super(c); - mutex_unlock(&c->sb_lock); for (unsigned i = 0; i < btree_id_nr_alive(c); i++) if (btree_id_is_alloc(i)) @@ -199,7 +206,7 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans, bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, k->level, BTREE_ITER_intent); - int ret = bch2_btree_iter_traverse(trans, &iter); + int ret = bch2_btree_iter_traverse(&iter); if (ret) goto out; @@ -227,7 +234,7 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans, ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun); out: - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } @@ -262,16 +269,38 @@ static int bch2_journal_replay_key(struct btree_trans *trans, bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, k->level, iter_flags); - ret = bch2_btree_iter_traverse(trans, &iter); + ret = bch2_btree_iter_traverse(&iter); if (ret) goto out; struct btree_path *path = btree_iter_path(trans, &iter); if (unlikely(!btree_path_node(path, k->level))) { - bch2_trans_iter_exit(trans, &iter); + struct bch_fs *c = trans->c; + + CLASS(printbuf, buf)(); + prt_str(&buf, "btree="); + bch2_btree_id_to_text(&buf, k->btree_id); + prt_printf(&buf, " level=%u ", k->level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k)); + + if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)| + BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) { + bch_err(c, "have key in journal replay for btree depth that does not exist, confused\n%s", + buf.buf); + ret = -EINVAL; + } + + if (!k->allocated) { + bch_notice(c, "dropping key in journal replay for depth that does not exist because we're recovering from scan\n%s", + buf.buf); + 
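/*
 * Throughout the rewritten bch2_btree_lost_data() above, superblock updates
 * are batched with "write_sb |= !__test_and_set_bit_le64(...)": each call
 * reports whether it actually changed a bit, and the expensive superblock
 * write happens once at the end, and only if something flipped.  Sketch of
 * the idiom:
 */
#include <stdbool.h>
#include <stdio.h>

/* like __test_and_set_bit_le64(): set the bit, return its old value */
static bool test_and_set(unsigned long long *bits, unsigned nr)
{
	bool old = *bits & (1ULL << nr);

	*bits |= 1ULL << nr;
	return old;
}

static void flag_errors(unsigned long long *errors_silent)
{
	bool dirty = false;

	dirty |= !test_and_set(errors_silent, 3);
	dirty |= !test_and_set(errors_silent, 5);
	dirty |= !test_and_set(errors_silent, 3);	/* already set: stays clean */

	if (dirty)
		printf("write super (once)\n");
}

int main(void)
{
	unsigned long long errors = 0;

	flag_errors(&errors);
	return 0;
}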
k->overwritten = true; + goto out; + } + + bch2_trans_iter_exit(&iter); bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, 0, iter_flags); - ret = bch2_btree_iter_traverse(trans, &iter) ?: + ret = bch2_btree_iter_traverse(&iter) ?: bch2_btree_increase_depth(trans, iter.path, 0) ?: -BCH_ERR_transaction_restart_nested; goto out; @@ -282,13 +311,18 @@ static int bch2_journal_replay_key(struct btree_trans *trans, goto out; if (k->k->k.type == KEY_TYPE_accounting) { - ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k); + struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto out; + + bkey_copy(n, k->k); goto out; } ret = bch2_trans_update(trans, &iter, k->k, update_flags); out: - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } @@ -306,14 +340,15 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) return cmp_int(l->journal_seq - 1, r->journal_seq - 1); } +DEFINE_DARRAY_NAMED(darray_journal_keys, struct journal_key *) + int bch2_journal_replay(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; - DARRAY(struct journal_key *) keys_sorted = { 0 }; + CLASS(darray_journal_keys, keys_sorted)(); struct journal *j = &c->journal; u64 start_seq = c->journal_replay_seq_start; u64 end_seq = c->journal_replay_seq_start; - struct btree_trans *trans = NULL; bool immediate_flush = false; int ret = 0; @@ -321,13 +356,13 @@ int bch2_journal_replay(struct bch_fs *c) ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", keys->nr, start_seq, end_seq); if (ret) - goto err; + return ret; } BUG_ON(!atomic_read(&keys->ref)); move_gap(keys, keys->nr); - trans = bch2_trans_get(c); + CLASS(btree_trans, trans)(c); /* * Replay accounting keys first: we can't allow the write buffer to @@ -347,7 +382,7 @@ int bch2_journal_replay(struct bch_fs *c) BCH_WATERMARK_reclaim, bch2_journal_replay_accounting_key(trans, k)); if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret))) - goto err; + return ret; k->overwritten = true; } @@ -381,7 +416,7 @@ int bch2_journal_replay(struct bch_fs *c) if (ret) { ret = darray_push(&keys_sorted, k); if (ret) - goto err; + return ret; } } @@ -412,25 +447,19 @@ int bch2_journal_replay(struct bch_fs *c) : 0), bch2_journal_replay_key(trans, k)); if (ret) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_btree_id_level_to_text(&buf, k->btree_id, k->level); bch_err_msg(c, ret, "while replaying key at %s:", buf.buf); - printbuf_exit(&buf); - goto err; + return ret; } BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten); } - /* - * We need to put our btree_trans before calling flush_all_pins(), since - * that will use a btree_trans internally - */ - bch2_trans_put(trans); - trans = NULL; + bch2_trans_unlock_long(trans); if (!c->opts.retain_recovery_info && - c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) + c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) bch2_journal_keys_put_initial(c); replay_now_at(j, j->replay_journal_seq_end); @@ -446,12 +475,7 @@ int bch2_journal_replay(struct bch_fs *c) if (keys->nr) bch2_journal_log_msg(c, "journal replay finished"); -err: - if (trans) - bch2_trans_put(trans); - darray_exit(&keys_sorted); - bch_err_fn(c, ret); - return ret; + return 0; } /* journal replay early: */ @@ -563,7 +587,7 @@ static int journal_replay_early(struct bch_fs *c, static int read_btree_roots(struct 
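/*
 * A note on journal_sort_seq_cmp() above: it compares journal_seq - 1, not
 * journal_seq.  In unsigned arithmetic the subtraction maps 0 to U64_MAX, so
 * keys with journal_seq == 0 (which I take to be keys synthesized by repair
 * rather than read from the journal) sort after every real sequence number,
 * while all other keys keep their relative order:
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t seqs[] = { 0, 1, 7 };

	for (unsigned i = 0; i < 3; i++)	/* 0 sorts last: it becomes U64_MAX */
		printf("%" PRIu64 " -> %" PRIu64 "\n", seqs[i], seqs[i] - 1);
	return 0;
}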
bch_fs *c) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = 0; for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { @@ -585,9 +609,7 @@ static int read_btree_roots(struct bch_fs *c) buf.buf, bch2_err_str(ret))) { if (btree_id_is_alloc(i)) r->error = 0; - - ret = bch2_btree_lost_data(c, i); - BUG_ON(ret); + ret = 0; } } @@ -601,7 +623,6 @@ static int read_btree_roots(struct bch_fs *c) } } fsck_err: - printbuf_exit(&buf); return ret; } @@ -635,7 +656,7 @@ static bool check_version_upgrade(struct bch_fs *c) } if (new_version > old_version) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); if (old_version < bcachefs_metadata_required_upgrade_below) prt_str(&buf, "Version upgrade required:\n"); @@ -667,15 +688,13 @@ static bool check_version_upgrade(struct bch_fs *c) bch2_recovery_passes_from_stable(le64_to_cpu(passes))); } - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - + bch_notice(c, "%s", buf.buf); ret = true; } - if (new_version > c->sb.version_incompat && + if (new_version > c->sb.version_incompat_allowed && c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); prt_str(&buf, "Now allowing incompatible features up to "); bch2_version_to_text(&buf, new_version); @@ -683,9 +702,7 @@ static bool check_version_upgrade(struct bch_fs *c) bch2_version_to_text(&buf, c->sb.version_incompat_allowed); prt_newline(&buf); - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - + bch_notice(c, "%s", buf.buf); ret = true; } @@ -733,7 +750,24 @@ int bch2_fs_recovery(struct bch_fs *c) ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read) : BCH_RECOVERY_PASS_snapshots_read; c->opts.nochanges = true; + } + + if (c->opts.nochanges) c->opts.read_only = true; + + if (c->opts.journal_rewind) { + bch_info(c, "rewinding journal, fsck required"); + c->opts.fsck = true; + } + + if (go_rw_in_recovery(c)) { + /* + * start workqueues/kworkers early - kthread creation checks for + * pending signals, which is _very_ annoying + */ + ret = bch2_fs_init_rw(c); + if (ret) + goto err; } mutex_lock(&c->sb_lock); @@ -748,15 +782,14 @@ int bch2_fs_recovery(struct bch_fs *c) u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); if (sb_passes) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); prt_str(&buf, "superblock requires following recovery passes to be run:\n "); prt_bitflags(&buf, bch2_recovery_passes, sb_passes); bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); } if (bch2_check_version_downgrade(c)) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); prt_str(&buf, "Version downgrade required:"); @@ -772,7 +805,6 @@ int bch2_fs_recovery(struct bch_fs *c) } bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); write_sb = true; } @@ -790,11 +822,11 @@ int bch2_fs_recovery(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - if (c->opts.fsck) - set_bit(BCH_FS_fsck_running, &c->flags); if (c->sb.clean) set_bit(BCH_FS_clean_recovery, &c->flags); - set_bit(BCH_FS_recovery_running, &c->flags); + if (c->opts.fsck) + set_bit(BCH_FS_in_fsck, &c->flags); + set_bit(BCH_FS_in_recovery, &c->flags); ret = bch2_blacklist_table_initialize(c); if (ret) { @@ -873,7 +905,7 @@ int bch2_fs_recovery(struct bch_fs *c) use_clean: if (!clean) { bch_err(c, "no superblock clean section found"); - ret = -BCH_ERR_fsck_repair_impossible; + ret = bch_err_throw(c, fsck_repair_impossible); goto err; } @@ -889,8 +921,36 @@ int bch2_fs_recovery(struct bch_fs 
*c) if (ret) goto err; - if (c->opts.reconstruct_alloc) + scoped_guard(rwsem_write, &c->state_lock) + ret = bch2_fs_resize_on_mount(c); + if (ret) + goto err; + + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { + bch_info(c, "filesystem is an unresized image file, mounting ro"); + c->opts.read_only = true; + } + + if (!c->opts.read_only && + (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) { + bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); + + bch2_reconstruct_alloc(c); + } else if (c->opts.reconstruct_alloc) { + bch2_journal_log_msg(c, "dropping alloc info"); + bch_info(c, "dropping and reconstructing all alloc info"); + bch2_reconstruct_alloc(c); + } + + if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { + /* We can't go RW to fix errors without alloc info */ + if (c->opts.fix_errors == FSCK_FIX_yes || + c->opts.fix_errors == FSCK_FIX_ask) + c->opts.fix_errors = FSCK_FIX_no; + if (c->opts.errors == BCH_ON_ERROR_fix_safe) + c->opts.errors = BCH_ON_ERROR_continue; + } /* * After an unclean shutdown, skip then next few journal sequence @@ -915,7 +975,7 @@ int bch2_fs_recovery(struct bch_fs *c) ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu", journal_seq, last_seq, blacklist_seq - 1) ?: - bch2_fs_journal_start(&c->journal, journal_seq); + bch2_fs_journal_start(&c->journal, last_seq, journal_seq); if (ret) goto err; @@ -933,8 +993,10 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_btree_running, &c->flags); ret = bch2_sb_set_upgrade_extra(c); + if (ret) + goto err; - ret = bch2_run_recovery_passes(c); + ret = bch2_run_recovery_passes(c, 0); if (ret) goto err; @@ -945,8 +1007,7 @@ int bch2_fs_recovery(struct bch_fs *c) * multithreaded use: */ set_bit(BCH_FS_may_go_rw, &c->flags); - clear_bit(BCH_FS_fsck_running, &c->flags); - clear_bit(BCH_FS_recovery_running, &c->flags); + clear_bit(BCH_FS_in_fsck, &c->flags); /* in case we don't run journal replay, i.e. 
norecovery mode */ set_bit(BCH_FS_accounting_replay_done, &c->flags); @@ -969,9 +1030,8 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); clear_bit(BCH_FS_errors_fixed, &c->flags); - c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; - - ret = bch2_run_recovery_passes(c); + ret = bch2_run_recovery_passes(c, + BCH_RECOVERY_PASS_check_alloc_info); if (ret) goto err; @@ -1015,7 +1075,7 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->opts.fsck && !test_bit(BCH_FS_error, &c->flags) && - c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 && + c->recovery.pass_done == BCH_RECOVERY_PASS_NR - 1 && ext->btrees_lost_data) { ext->btrees_lost_data = 0; write_sb = true; @@ -1042,10 +1102,9 @@ int bch2_fs_recovery(struct bch_fs *c) bch2_move_stats_init(&stats, "recovery"); - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_version_to_text(&buf, c->sb.version_min); bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf); - printbuf_exit(&buf); ret = bch2_fs_read_write_early(c) ?: bch2_scan_old_btree_nodes(c, &stats); @@ -1058,13 +1117,6 @@ int bch2_fs_recovery(struct bch_fs *c) out: bch2_flush_fsck_errs(c); - if (!c->opts.retain_recovery_info) { - bch2_journal_keys_put_initial(c); - bch2_find_btree_nodes_exit(&c->found_btree_nodes); - } - if (!IS_ERR(clean)) - kfree(clean); - if (!ret && test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) && !c->opts.nochanges) { @@ -1073,11 +1125,22 @@ int bch2_fs_recovery(struct bch_fs *c) } bch_err_fn(c, ret); +final_out: + if (!IS_ERR(clean)) + kfree(clean); return ret; err: fsck_err: - bch2_fs_emergency_read_only(c); - goto out; + { + CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "error in recovery: %s\n", bch2_err_str(ret)); + bch2_fs_emergency_read_only2(c, &buf); + + bch2_print_str(c, KERN_ERR, buf.buf); + } + goto final_out; } int bch2_fs_initialize(struct bch_fs *c) @@ -1085,58 +1148,36 @@ int bch2_fs_initialize(struct bch_fs *c) struct bch_inode_unpacked root_inode, lostfound_inode; struct bkey_inode_buf packed_inode; struct qstr lostfound = QSTR("lost+found"); - struct bch_member *m; int ret; bch_notice(c, "initializing new filesystem"); set_bit(BCH_FS_new_fs, &c->flags); - mutex_lock(&c->sb_lock); - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); + scoped_guard(mutex, &c->sb_lock) { + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); - bch2_check_version_downgrade(c); + bch2_check_version_downgrade(c); - if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { - bch2_sb_upgrade(c, bcachefs_metadata_version_current, false); - SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); - bch2_write_super(c); - } + if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { + bch2_sb_upgrade(c, bcachefs_metadata_version_current, false); + SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); + bch2_write_super(c); + } - for_each_member_device(c, ca) { - m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false); - ca->mi = bch2_mi_to_cpu(m); - } + for_each_member_device(c, ca) { + struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + 
SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false); + } - bch2_write_super(c); - mutex_unlock(&c->sb_lock); + bch2_write_super(c); + } set_bit(BCH_FS_btree_running, &c->flags); - set_bit(BCH_FS_may_go_rw, &c->flags); for (unsigned i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc_fake(c, i, 0); - ret = bch2_fs_journal_alloc(c); - if (ret) - goto err; - - /* - * journal_res_get() will crash if called before this has - * set up the journal.pin FIFO and journal.cur pointer: - */ - ret = bch2_fs_journal_start(&c->journal, 1); - if (ret) - goto err; - - ret = bch2_fs_read_write_early(c); - if (ret) - goto err; - - set_bit(BCH_FS_accounting_replay_done, &c->flags); - bch2_journal_set_replay_done(&c->journal); - for_each_member_device(c, ca) { ret = bch2_dev_usage_init(ca, false); if (ret) { @@ -1155,6 +1196,27 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; + ret = bch2_fs_journal_alloc(c); + if (ret) + goto err; + + /* + * journal_res_get() will crash if called before this has + * set up the journal.pin FIFO and journal.cur pointer: + */ + ret = bch2_fs_journal_start(&c->journal, 1, 1); + if (ret) + goto err; + + set_bit(BCH_FS_may_go_rw, &c->flags); + ret = bch2_fs_read_write_early(c); + if (ret) + goto err; + + ret = bch2_journal_replay(c); + if (ret) + goto err; + ret = bch2_fs_freespace_init(c); if (ret) goto err; @@ -1193,7 +1255,7 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1; + c->recovery.pass_done = BCH_RECOVERY_PASS_NR - 1; bch2_copygc_wakeup(c); bch2_rebalance_wakeup(c); @@ -1209,14 +1271,13 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - mutex_lock(&c->sb_lock); - SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); + scoped_guard(mutex, &c->sb_lock) { + SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + bch2_write_super(c); + } - c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; + c->recovery.curr_pass = BCH_RECOVERY_PASS_NR; return 0; err: bch_err_fn(c, ret); diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index b0d55754b21b..c023f52fc2d6 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -2,7 +2,8 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H -int bch2_btree_lost_data(struct bch_fs *, enum btree_id); +int bch2_btree_lost_data(struct bch_fs *, struct printbuf *, enum btree_id); +void bch2_reconstruct_alloc(struct bch_fs *); int bch2_journal_replay(struct bch_fs *); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 22f72bb5b853..bd442652d0f5 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -28,6 +28,176 @@ const char * const bch2_recovery_passes[] = { NULL }; +static const u8 passes_to_stable_map[] = { +#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, + BCH_RECOVERY_PASSES() +#undef x +}; + +static const u8 passes_from_stable_map[] = { +#define x(n, id, ...) 
[BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, + BCH_RECOVERY_PASSES() +#undef x +}; + +static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) +{ + return passes_to_stable_map[pass]; +} + +u64 bch2_recovery_passes_to_stable(u64 v) +{ + u64 ret = 0; + for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) + if (v & BIT_ULL(i)) + ret |= BIT_ULL(passes_to_stable_map[i]); + return ret; +} + +static enum bch_recovery_pass bch2_recovery_pass_from_stable(enum bch_recovery_pass_stable pass) +{ + return pass < ARRAY_SIZE(passes_from_stable_map) + ? passes_from_stable_map[pass] + : 0; +} + +u64 bch2_recovery_passes_from_stable(u64 v) +{ + u64 ret = 0; + for (unsigned i = 0; i < ARRAY_SIZE(passes_from_stable_map); i++) + if (v & BIT_ULL(i)) + ret |= BIT_ULL(passes_from_stable_map[i]); + return ret; +} + +static int bch2_sb_recovery_passes_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) +{ + return 0; +} + +static void bch2_sb_recovery_passes_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_recovery_passes *r = + field_to_type(f, recovery_passes); + unsigned nr = recovery_passes_nr_entries(r); + + if (out->nr_tabstops < 1) + printbuf_tabstop_push(out, 32); + if (out->nr_tabstops < 2) + printbuf_tabstop_push(out, 16); + + prt_printf(out, "Pass\tLast run\tLast runtime\n"); + + for (struct recovery_pass_entry *i = r->start; i < r->start + nr; i++) { + if (!i->last_run) + continue; + + unsigned idx = i - r->start; + + prt_printf(out, "%s\t", bch2_recovery_passes[bch2_recovery_pass_from_stable(idx)]); + + bch2_prt_datetime(out, le64_to_cpu(i->last_run)); + prt_tab(out); + + bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC); + + if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) + prt_str(out, " (no ratelimit)"); + + prt_newline(out); + } +} + +static struct recovery_pass_entry *bch2_sb_recovery_pass_entry(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); + + lockdep_assert_held(&c->sb_lock); + + struct bch_sb_field_recovery_passes *r = + bch2_sb_field_get(c->disk_sb.sb, recovery_passes); + + if (stable >= recovery_passes_nr_entries(r)) { + unsigned u64s = struct_size(r, start, stable + 1) / sizeof(u64); + + r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s); + if (!r) { + bch_err(c, "error creating recovery_passes sb section"); + return NULL; + } + } + + return r->start + stable; +} + +static void bch2_sb_recovery_pass_complete(struct bch_fs *c, + enum bch_recovery_pass pass, + s64 start_time) +{ + guard(mutex)(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + __clear_bit_le64(bch2_recovery_pass_to_stable(pass), + ext->recovery_passes_required); + + struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass); + if (e) { + s64 end_time = ktime_get_real_seconds(); + e->last_run = cpu_to_le64(end_time); + e->last_runtime = cpu_to_le32(max(0, end_time - start_time)); + SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false); + } + + bch2_write_super(c); +} + +void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + guard(mutex)(&c->sb_lock); + + struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass); + if (e && !BCH_RECOVERY_PASS_NO_RATELIMIT(e)) { + SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, true); + bch2_write_super(c); + } +} + +static bool 
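/*
 * The passes_to_stable_map[] and passes_from_stable_map[] tables above are
 * built from one x-macro list, so the in-memory pass enum (free to be
 * renumbered) and the on-disk "stable" numbering (frozen forever) are
 * generated from a single source of truth.  Minimal sketch of the technique
 * with an invented list; the reverse map is built the same way with the
 * roles swapped:
 */
#define COLORS()	\
	x(red,   10)	\
	x(green, 20)	\
	x(blue,  30)

enum color {		/* in-memory: dense, may be reordered */
#define x(n, id) COLOR_##n,
	COLORS()
#undef x
};

enum color_stable {	/* on-disk: explicit, never changes */
#define x(n, id) COLOR_STABLE_##n = id,
	COLORS()
#undef x
};

static const unsigned char color_to_stable[] = {
#define x(n, id) [COLOR_##n] = COLOR_STABLE_##n,
	COLORS()
#undef x
};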
bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass) +{ + enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); + bool ret = false; + + lockdep_assert_held(&c->sb_lock); + + struct bch_sb_field_recovery_passes *r = + bch2_sb_field_get(c->disk_sb.sb, recovery_passes); + + if (stable < recovery_passes_nr_entries(r)) { + struct recovery_pass_entry *i = r->start + stable; + + /* + * Ratelimit if the last runtime was more than 1% of the time + * since we last ran + */ + ret = (u64) le32_to_cpu(i->last_runtime) * 100 > + ktime_get_real_seconds() - le64_to_cpu(i->last_run); + + if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) + ret = false; + } + + return ret; +} + +const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = { + .validate = bch2_sb_recovery_passes_validate, + .to_text = bch2_sb_recovery_passes_to_text +}; + /* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */ static int bch2_recovery_pass_empty(struct bch_fs *c) { @@ -47,268 +217,436 @@ static int bch2_set_may_go_rw(struct bch_fs *c) set_bit(BCH_FS_may_go_rw, &c->flags); - if (keys->nr || !c->opts.read_only || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) + if (go_rw_in_recovery(c)) { + if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { + bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); + bch2_reconstruct_alloc(c); + } + return bch2_fs_read_write_early(c); + } return 0; } +/* + * Make sure root inode is readable while we're still in recovery and can rewind + * for repair: + */ +static int bch2_lookup_root_inode(struct bch_fs *c) +{ + subvol_inum inum = BCACHEFS_ROOT_SUBVOL_INUM; + struct bch_inode_unpacked inode_u; + struct bch_subvolume subvol; + CLASS(btree_trans, trans)(c); + + return lockrestart_do(trans, + bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: + bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); +} + struct recovery_pass_fn { int (*fn)(struct bch_fs *); + const char *name; unsigned when; }; static struct recovery_pass_fn recovery_pass_fns[] = { -#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when }, +#define x(_fn, _id, _when) { .fn = bch2_##_fn, .name = #_fn, .when = _when }, BCH_RECOVERY_PASSES() #undef x }; -static const u8 passes_to_stable_map[] = { -#define x(n, id, ...) 
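/*
 * The heuristic in bch2_recovery_pass_want_ratelimit() above caps a pass's
 * duty cycle at roughly 1%: a pass whose last run took 60 seconds stays
 * ratelimited until 60 * 100 = 6000 seconds have elapsed since last_run.
 * The predicate, restated on its own:
 */
#include <stdbool.h>

static bool want_ratelimit(unsigned last_runtime_s,
			   unsigned long long now_s,
			   unsigned long long last_run_s)
{
	/* true while the last runtime exceeds 1% of the elapsed interval */
	return (unsigned long long)last_runtime_s * 100 > now_s - last_run_s;
}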
[BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, - BCH_RECOVERY_PASSES() -#undef x -}; +static u64 bch2_recovery_passes_match(unsigned flags) +{ + u64 ret = 0; -static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) + if (recovery_pass_fns[i].when & flags) + ret |= BIT_ULL(i); + return ret; +} + +u64 bch2_fsck_recovery_passes(void) { - return passes_to_stable_map[pass]; + return bch2_recovery_passes_match(PASS_FSCK); } -u64 bch2_recovery_passes_to_stable(u64 v) +static void bch2_run_async_recovery_passes(struct bch_fs *c) { - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(passes_to_stable_map[i]); - return ret; + if (!down_trylock(&c->recovery.run_lock)) + return; + + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_async_recovery_passes)) + goto unlock; + + if (queue_work(system_long_wq, &c->recovery.work)) + return; + + enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes); +unlock: + up(&c->recovery.run_lock); } -u64 bch2_recovery_passes_from_stable(u64 v) +static bool recovery_pass_needs_set(struct bch_fs *c, + enum bch_recovery_pass pass, + enum bch_run_recovery_pass_flags *flags) { - static const u8 map[] = { -#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, - BCH_RECOVERY_PASSES() -#undef x - }; + struct bch_fs_recovery *r = &c->recovery; - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(map[i]); - return ret; + /* + * Never run scan_for_btree_nodes persistently: check_topology will run + * it if required + */ + if (pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) + *flags |= RUN_RECOVERY_PASS_nopersistent; + + if ((*flags & RUN_RECOVERY_PASS_ratelimit) && + !bch2_recovery_pass_want_ratelimit(c, pass)) + *flags &= ~RUN_RECOVERY_PASS_ratelimit; + + /* + * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do + * anything if the pass has already run: these mean we need a prior pass + * to run before we continue to repair, we don't expect that pass to fix + * the damage we encountered. + * + * Otherwise, we run run_explicit_recovery_pass when we find damage, so + * it should run again even if it's already run: + */ + bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); + bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent); + bool rewind = in_recovery && + r->curr_pass > pass && + !(r->passes_complete & BIT_ULL(pass)); + + if (persistent + ? 
!(c->sb.recovery_passes_required & BIT_ULL(pass)) + : !((r->passes_to_run|r->passes_complete) & BIT_ULL(pass))) + return true; + + if (!(*flags & RUN_RECOVERY_PASS_ratelimit) && + (r->passes_ratelimiting & BIT_ULL(pass))) + return true; + + if (rewind) + return true; + + return false; } /* * For when we need to rewind recovery passes and run a pass we skipped: */ -static int __bch2_run_explicit_recovery_pass(struct bch_fs *c, - enum bch_recovery_pass pass) +int __bch2_run_explicit_recovery_pass(struct bch_fs *c, + struct printbuf *out, + enum bch_recovery_pass pass, + enum bch_run_recovery_pass_flags flags, + bool *write_sb) { - if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns)) - return -BCH_ERR_not_in_recovery; + struct bch_fs_recovery *r = &c->recovery; + int ret = 0; + + lockdep_assert_held(&c->sb_lock); - if (c->recovery_passes_complete & BIT_ULL(pass)) + bch2_printbuf_make_room(out, 1024); + guard(printbuf_atomic)(out); + guard(spinlock_irq)(&r->lock); + + if (!recovery_pass_needs_set(c, pass, &flags)) return 0; - bool print = !(c->opts.recovery_passes & BIT_ULL(pass)); + bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); + bool rewind = in_recovery && + r->curr_pass > pass && + !(r->passes_complete & BIT_ULL(pass)); + bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit; + + if (!(flags & RUN_RECOVERY_PASS_nopersistent)) { + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + *write_sb |= !__test_and_set_bit_le64(bch2_recovery_pass_to_stable(pass), + ext->recovery_passes_required); + } if (pass < BCH_RECOVERY_PASS_set_may_go_rw && - c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { - if (print) - bch_info(c, "need recovery pass %s (%u), but already rw", - bch2_recovery_passes[pass], pass); - return -BCH_ERR_cannot_rewind_recovery; + (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) { + prt_printf(out, "need recovery pass %s (%u), but already rw\n", + bch2_recovery_passes[pass], pass); + return bch_err_throw(c, cannot_rewind_recovery); } - if (print) - bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", - bch2_recovery_passes[pass], pass, - bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + if (ratelimit) + r->passes_ratelimiting |= BIT_ULL(pass); + else + r->passes_ratelimiting &= ~BIT_ULL(pass); + + if (in_recovery && !ratelimit) { + prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n", + bch2_recovery_passes[pass], pass, + bch2_recovery_passes[r->curr_pass], r->curr_pass, + rewind ? " - rewinding" : ""); - c->opts.recovery_passes |= BIT_ULL(pass); + r->passes_to_run |= BIT_ULL(pass); - if (c->curr_recovery_pass > pass) { - c->next_recovery_pass = pass; - c->recovery_passes_complete &= (1ULL << pass) >> 1; - return -BCH_ERR_restart_recovery; + if (rewind) { + r->next_pass = pass; + r->passes_complete &= (1ULL << pass) >> 1; + ret = bch_err_throw(c, restart_recovery); + } } else { - return 0; + prt_printf(out, "scheduling recovery pass %s (%u)%s\n", + bch2_recovery_passes[pass], pass, + ratelimit ? 
" - ratelimiting" : ""); + + struct recovery_pass_fn *p = recovery_pass_fns + pass; + if (p->when & PASS_ONLINE) + bch2_run_async_recovery_passes(c); } + + return ret; } int bch2_run_explicit_recovery_pass(struct bch_fs *c, - enum bch_recovery_pass pass) + struct printbuf *out, + enum bch_recovery_pass pass, + enum bch_run_recovery_pass_flags flags) { - unsigned long flags; - spin_lock_irqsave(&c->recovery_pass_lock, flags); - int ret = __bch2_run_explicit_recovery_pass(c, pass); - spin_unlock_irqrestore(&c->recovery_pass_lock, flags); + /* + * With RUN_RECOVERY_PASS_ratelimit, recovery_pass_needs_set needs + * sb_lock + */ + if (!(flags & RUN_RECOVERY_PASS_ratelimit) && + !recovery_pass_needs_set(c, pass, &flags)) + return 0; + + guard(mutex)(&c->sb_lock); + bool write_sb = false; + int ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags, &write_sb); + if (write_sb) + bch2_write_super(c); return ret; } -int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c, - enum bch_recovery_pass pass) +/* + * Returns 0 if @pass has run recently, otherwise one of + * -BCH_ERR_restart_recovery + * -BCH_ERR_recovery_pass_will_run + */ +int bch2_require_recovery_pass(struct bch_fs *c, + struct printbuf *out, + enum bch_recovery_pass pass) { - lockdep_assert_held(&c->sb_lock); - - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); + if (test_bit(BCH_FS_in_recovery, &c->flags) && + c->recovery.passes_complete & BIT_ULL(pass)) + return 0; - return bch2_run_explicit_recovery_pass(c, pass); -} + guard(mutex)(&c->sb_lock); -int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); + if (bch2_recovery_pass_want_ratelimit(c, pass)) + return 0; - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + enum bch_run_recovery_pass_flags flags = 0; - if (!test_bit_le64(s, ext->recovery_passes_required)) { - __set_bit_le64(s, ext->recovery_passes_required); + bool write_sb = false; + int ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags, &write_sb) ?: + bch_err_throw(c, recovery_pass_will_run); + if (write_sb) bch2_write_super(c); - } - mutex_unlock(&c->sb_lock); - - return bch2_run_explicit_recovery_pass(c, pass); + return ret; } -static void bch2_clear_recovery_pass_required(struct bch_fs *c, - enum bch_recovery_pass pass) +int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); + enum bch_run_recovery_pass_flags flags = 0; - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + if (!recovery_pass_needs_set(c, pass, &flags)) + return 0; - if (test_bit_le64(s, ext->recovery_passes_required)) { - __clear_bit_le64(s, ext->recovery_passes_required); - bch2_write_super(c); - } - mutex_unlock(&c->sb_lock); -} + CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); -u64 bch2_fsck_recovery_passes(void) -{ - u64 ret = 0; + guard(mutex)(&c->sb_lock); + bool write_sb = false; + int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass, + RUN_RECOVERY_PASS_nopersistent, + &write_sb); - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) - if (recovery_pass_fns[i].when & PASS_FSCK) - ret |= BIT_ULL(i); + bch2_print_str(c, KERN_NOTICE, buf.buf); return ret; } -static bool 
-{
-	struct recovery_pass_fn *p = recovery_pass_fns + pass;
-
-	if (c->opts.recovery_passes_exclude & BIT_ULL(pass))
-		return false;
-	if (c->opts.recovery_passes & BIT_ULL(pass))
-		return true;
-	if ((p->when & PASS_FSCK) && c->opts.fsck)
-		return true;
-	if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
-		return true;
-	if (p->when & PASS_ALWAYS)
-		return true;
-	return false;
-}
-
 static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
 {
+	struct bch_fs_recovery *r = &c->recovery;
 	struct recovery_pass_fn *p = recovery_pass_fns + pass;
-	int ret;
 
 	if (!(p->when & PASS_SILENT))
 		bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
 			   bch2_recovery_passes[pass]);
-	ret = p->fn(c);
-	if (ret)
+
+	s64 start_time = ktime_get_real_seconds();
+	int ret = p->fn(c);
+
+	r->passes_to_run &= ~BIT_ULL(pass);
+
+	if (ret) {
+		bch_err(c, "%s(): error %s", p->name, bch2_err_str(ret));
+		r->passes_failing |= BIT_ULL(pass);
 		return ret;
+	}
+
+	r->passes_failing = 0;
+
+	if (!test_bit(BCH_FS_error, &c->flags))
+		bch2_sb_recovery_pass_complete(c, pass, start_time);
+
 	if (!(p->when & PASS_SILENT))
 		bch2_print(c, KERN_CONT " done\n");
 
 	return 0;
 }
 
-int bch2_run_online_recovery_passes(struct bch_fs *c)
+static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run,
+				      bool online)
 {
-	for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
-		struct recovery_pass_fn *p = recovery_pass_fns + i;
+	struct bch_fs_recovery *r = &c->recovery;
+	int ret = 0;
 
-		if (!(p->when & PASS_ONLINE))
-			continue;
+	spin_lock_irq(&r->lock);
 
-		int ret = bch2_run_recovery_pass(c, i);
-		if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
-			i = c->curr_recovery_pass;
-			continue;
-		}
-		if (ret)
-			return ret;
-	}
+	if (online)
+		orig_passes_to_run &= bch2_recovery_passes_match(PASS_ONLINE);
 
-	return 0;
-}
-
-int bch2_run_recovery_passes(struct bch_fs *c)
-{
-	int ret = 0;
+	if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
+		orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC);
 
 	/*
-	 * We can't allow set_may_go_rw to be excluded; that would cause us to
-	 * use the journal replay keys for updates where it's not expected.
+	 * A failed recovery pass will be retried after another pass succeeds -
+	 * but not this iteration.
+	 *
+	 * This is because some passes depend on repair done by other passes: we
+	 * may want to retry, but we don't want to loop on failing passes.
 	 */
-	c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw;
-	spin_lock_irq(&c->recovery_pass_lock);
+	orig_passes_to_run &= ~r->passes_failing;
 
-	while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) {
-		unsigned prev_done = c->recovery_pass_done;
-		unsigned pass = c->curr_recovery_pass;
+	r->passes_to_run = orig_passes_to_run;
 
-		c->next_recovery_pass = pass + 1;
+	while (r->passes_to_run) {
+		unsigned prev_done = r->pass_done;
+		unsigned pass = __ffs64(r->passes_to_run);
+		r->curr_pass = pass;
+		r->next_pass = r->curr_pass + 1;
+		r->passes_to_run &= ~BIT_ULL(pass);
 
-		if (c->opts.recovery_pass_last &&
-		    c->curr_recovery_pass > c->opts.recovery_pass_last)
-			break;
+		spin_unlock_irq(&r->lock);
+
+		int ret2 = bch2_run_recovery_pass(c, pass) ?:
+			bch2_journal_flush(&c->journal);
 
-		if (should_run_recovery_pass(c, pass)) {
-			spin_unlock_irq(&c->recovery_pass_lock);
-			ret = bch2_run_recovery_pass(c, pass) ?:
-				bch2_journal_flush(&c->journal);
-
-			if (!ret && !test_bit(BCH_FS_error, &c->flags))
-				bch2_clear_recovery_pass_required(c, pass);
-			spin_lock_irq(&c->recovery_pass_lock);
-
-			if (c->next_recovery_pass < c->curr_recovery_pass) {
-				/*
-				 * bch2_run_explicit_recovery_pass() was called: we
-				 * can't always catch -BCH_ERR_restart_recovery because
-				 * it may have been called from another thread (btree
-				 * node read completion)
-				 */
-				ret = 0;
-				c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass);
-			} else {
-				c->recovery_passes_complete |= BIT_ULL(pass);
-				c->recovery_pass_done = max(c->recovery_pass_done, pass);
-			}
+		spin_lock_irq(&r->lock);
+
+		if (r->next_pass < r->curr_pass) {
+			/* Rewind: */
+			r->passes_to_run |= orig_passes_to_run & (~0ULL << r->next_pass);
+		} else if (!ret2) {
+			r->pass_done = max(r->pass_done, pass);
+			r->passes_complete |= BIT_ULL(pass);
+		} else {
+			ret = ret2;
 		}
 
-		c->curr_recovery_pass = c->next_recovery_pass;
+		if (ret && !online)
+			break;
 
 		if (prev_done <= BCH_RECOVERY_PASS_check_snapshots &&
-		    c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots) {
+		    r->pass_done > BCH_RECOVERY_PASS_check_snapshots) {
 			bch2_copygc_wakeup(c);
 			bch2_rebalance_wakeup(c);
 		}
 	}
 
-	spin_unlock_irq(&c->recovery_pass_lock);
+	clear_bit(BCH_FS_in_recovery, &c->flags);
+	spin_unlock_irq(&r->lock);
+
+	return ret;
+}
+
+static void bch2_async_recovery_passes_work(struct work_struct *work)
+{
+	struct bch_fs *c = container_of(work, struct bch_fs, recovery.work);
+	struct bch_fs_recovery *r = &c->recovery;
+
+	__bch2_run_recovery_passes(c,
+		c->sb.recovery_passes_required & ~r->passes_ratelimiting,
+		true);
+
+	up(&r->run_lock);
+	enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes);
+}
+
+int bch2_run_online_recovery_passes(struct bch_fs *c, u64 passes)
+{
+	return __bch2_run_recovery_passes(c, c->sb.recovery_passes_required|passes, true);
+}
+
+int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from)
+{
+	u64 passes =
+		bch2_recovery_passes_match(PASS_ALWAYS) |
+		(!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) |
+		(c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) |
+		c->opts.recovery_passes |
+		c->sb.recovery_passes_required;
+
+	if (c->opts.recovery_pass_last)
+		passes &= BIT_ULL(c->opts.recovery_pass_last + 1) - 1;
+
+	/*
+	 * We can't allow set_may_go_rw to be excluded; that would cause us to
+	 * use the journal replay keys for updates where it's not expected.
+	 */
+	c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw;
+	passes &= ~c->opts.recovery_passes_exclude;
+
+	passes &= ~(BIT_ULL(from) - 1);
+
+	down(&c->recovery.run_lock);
+	int ret = __bch2_run_recovery_passes(c, passes, false);
+	up(&c->recovery.run_lock);
 
 	return ret;
 }
+
+static void prt_passes(struct printbuf *out, const char *msg, u64 passes)
+{
+	prt_printf(out, "%s:\t", msg);
+	prt_bitflags(out, bch2_recovery_passes, passes);
+	prt_newline(out);
+}
+
+void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	struct bch_fs_recovery *r = &c->recovery;
+
+	printbuf_tabstop_push(out, 32);
+	prt_passes(out, "Scheduled passes", c->sb.recovery_passes_required);
+	prt_passes(out, "Scheduled online passes", c->sb.recovery_passes_required &
+		   bch2_recovery_passes_match(PASS_ONLINE));
+	prt_passes(out, "Complete passes", r->passes_complete);
+	prt_passes(out, "Failing passes", r->passes_failing);
+
+	if (r->curr_pass) {
+		prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]);
+		prt_passes(out, "Current passes", r->passes_to_run);
+	}
+
+	prt_printf(out, "Pass done:\t%s\n", bch2_recovery_passes[r->pass_done]);
+}
+
+void bch2_fs_recovery_passes_init(struct bch_fs *c)
+{
+	spin_lock_init(&c->recovery.lock);
+	sema_init(&c->recovery.run_lock, 1);
+
+	INIT_WORK(&c->recovery.work, bch2_async_recovery_passes_work);
+}
diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h
index 7d7339c8fa29..95e3612bb96c 100644
--- a/fs/bcachefs/recovery_passes.h
+++ b/fs/bcachefs/recovery_passes.h
@@ -3,16 +3,53 @@
 
 extern const char * const bch2_recovery_passes[];
 
+extern const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes;
+
 u64 bch2_recovery_passes_to_stable(u64 v);
 u64 bch2_recovery_passes_from_stable(u64 v);
 
 u64 bch2_fsck_recovery_passes(void);
 
-int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
-int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass);
-int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass);
+void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *, enum bch_recovery_pass);
+
+enum bch_run_recovery_pass_flags {
+	RUN_RECOVERY_PASS_nopersistent	= BIT(0),
+	RUN_RECOVERY_PASS_ratelimit	= BIT(1),
+};
+
+static inline bool go_rw_in_recovery(struct bch_fs *c)
+{
+	return (c->journal_keys.nr ||
+		!c->opts.read_only ||
+		!c->sb.clean ||
+		c->opts.recovery_passes ||
+		(c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))));
+}
+
+static inline bool recovery_pass_will_run(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+	return unlikely(test_bit(BCH_FS_in_recovery, &c->flags) &&
+			c->recovery.passes_to_run & BIT_ULL(pass));
+}
+
+int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
+
+int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
+				      enum bch_recovery_pass,
+				      enum bch_run_recovery_pass_flags,
+				      bool *);
+int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
+				    enum bch_recovery_pass,
+				    enum bch_run_recovery_pass_flags);
+
+int bch2_require_recovery_pass(struct bch_fs *, struct printbuf *,
+			       enum bch_recovery_pass);
+
+int bch2_run_online_recovery_passes(struct bch_fs *, u64);
+int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass);
+
+void bch2_recovery_pass_status_to_text(struct printbuf *, struct bch_fs *);
 
-int bch2_run_online_recovery_passes(struct bch_fs *);
-int bch2_run_recovery_passes(struct bch_fs *);
+void bch2_fs_recovery_passes_init(struct bch_fs *);
 
 #endif /* _BCACHEFS_RECOVERY_PASSES_H */
diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h
new file mode 100644
index 000000000000..b63c20558d3d
--- /dev/null
+++ b/fs/bcachefs/recovery_passes_format.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_RECOVERY_PASSES_FORMAT_H
+#define _BCACHEFS_RECOVERY_PASSES_FORMAT_H
+
+#define PASS_SILENT		BIT(0)
+#define PASS_FSCK		BIT(1)
+#define PASS_UNCLEAN		BIT(2)
+#define PASS_ALWAYS		BIT(3)
+#define PASS_ONLINE		BIT(4)
+#define PASS_ALLOC		BIT(5)
+#define PASS_FSCK_ALLOC		(PASS_FSCK|PASS_ALLOC)
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define PASS_FSCK_DEBUG		BIT(1)
+#else
+#define PASS_FSCK_DEBUG		0
+#endif
+
+/*
+ * Passes may be reordered, but the second field is a persistent identifier and
+ * must never change:
+ */
+#define BCH_RECOVERY_PASSES()							\
+	x(recovery_pass_empty,			41, PASS_SILENT)		\
+	x(scan_for_btree_nodes,			37, 0)				\
+	x(check_topology,			 4, 0)				\
+	x(accounting_read,			39, PASS_ALWAYS)		\
+	x(alloc_read,				 0, PASS_ALWAYS)		\
+	x(stripes_read,				 1, 0)				\
+	x(initialize_subvolumes,		 2, 0)				\
+	x(snapshots_read,			 3, PASS_ALWAYS)		\
+	x(check_allocations,			 5, PASS_FSCK_ALLOC)		\
+	x(trans_mark_dev_sbs,			 6, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \
+	x(fs_journal_alloc,			 7, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \
+	x(set_may_go_rw,			 8, PASS_ALWAYS|PASS_SILENT)	\
+	x(journal_replay,			 9, PASS_ALWAYS)		\
+	x(check_alloc_info,			10, PASS_ONLINE|PASS_FSCK_ALLOC) \
+	x(check_lrus,				11, PASS_ONLINE|PASS_FSCK_ALLOC) \
+	x(check_btree_backpointers,		12, PASS_ONLINE|PASS_FSCK_ALLOC) \
+	x(check_backpointers_to_extents,	13, PASS_ONLINE|PASS_FSCK_DEBUG) \
+	x(check_extents_to_backpointers,	14, PASS_ONLINE|PASS_FSCK_ALLOC) \
+	x(check_alloc_to_lru_refs,		15, PASS_ONLINE|PASS_FSCK_ALLOC) \
+	x(fs_freespace_init,			16, PASS_ALWAYS|PASS_SILENT)	\
+	x(bucket_gens_init,			17, 0)				\
+	x(reconstruct_snapshots,		38, 0)				\
+	x(check_snapshot_trees,			18, PASS_ONLINE|PASS_FSCK)	\
+	x(check_snapshots,			19, PASS_ONLINE|PASS_FSCK)	\
+	x(check_subvols,			20, PASS_ONLINE|PASS_FSCK)	\
+	x(check_subvol_children,		35, PASS_ONLINE|PASS_FSCK)	\
+	x(delete_dead_snapshots,		21, PASS_ONLINE|PASS_FSCK)	\
+	x(fs_upgrade_for_subvolumes,		22, 0)				\
+	x(check_inodes,				24, PASS_FSCK)			\
+	x(check_extents,			25, PASS_FSCK)			\
+	x(check_indirect_extents,		26, PASS_ONLINE|PASS_FSCK)	\
+	x(check_dirents,			27, PASS_FSCK)			\
+	x(check_xattrs,				28, PASS_FSCK)			\
+	x(check_root,				29, PASS_ONLINE|PASS_FSCK)	\
+	x(check_unreachable_inodes,		40, PASS_FSCK)			\
+	x(check_subvolume_structure,		36, PASS_ONLINE|PASS_FSCK)	\
+	x(check_directory_structure,		30, PASS_ONLINE|PASS_FSCK)	\
+	x(check_nlinks,				31, PASS_FSCK)			\
+	x(check_rebalance_work,			43, PASS_ONLINE|PASS_FSCK)	\
+	x(resume_logged_ops,			23, PASS_ALWAYS)		\
+	x(delete_dead_inodes,			32, PASS_ALWAYS)		\
+	x(fix_reflink_p,			33, 0)				\
+	x(set_fs_needs_rebalance,		34, 0)				\
+	x(lookup_root_inode,			42, PASS_ALWAYS|PASS_SILENT)
+
+/* We normally enumerate recovery passes in the order we run them: */
+enum bch_recovery_pass {
+#define x(n, id, when)	BCH_RECOVERY_PASS_##n,
+	BCH_RECOVERY_PASSES()
+#undef x
+	BCH_RECOVERY_PASS_NR
+};
+
+/* But we also need stable identifiers that can be used in the superblock */
+enum bch_recovery_pass_stable {
+#define x(n, id, when)	BCH_RECOVERY_PASS_STABLE_##n = id,
+	BCH_RECOVERY_PASSES()
+#undef x
+};
+
+struct recovery_pass_entry {
+	__le64			last_run;
+	__le32			last_runtime;
+	__le32			flags;
+};
+
+LE32_BITMASK(BCH_RECOVERY_PASS_NO_RATELIMIT,	struct recovery_pass_entry, flags, 0, 1)
+
+struct bch_sb_field_recovery_passes {
+	struct bch_sb_field	field;
+	struct recovery_pass_entry start[];
+};
+
+static inline unsigned
+recovery_passes_nr_entries(struct bch_sb_field_recovery_passes *r)
+{
+	return r
+		? ((vstruct_end(&r->field) - (void *) &r->start[0]) /
+		   sizeof(struct recovery_pass_entry))
+		: 0;
+}
+
+#endif /* _BCACHEFS_RECOVERY_PASSES_FORMAT_H */
diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h
index e89b9c783285..aa9526938cc3 100644
--- a/fs/bcachefs/recovery_passes_types.h
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -2,79 +2,26 @@
 #ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H
 #define _BCACHEFS_RECOVERY_PASSES_TYPES_H
 
-#define PASS_SILENT		BIT(0)
-#define PASS_FSCK		BIT(1)
-#define PASS_UNCLEAN		BIT(2)
-#define PASS_ALWAYS		BIT(3)
-#define PASS_ONLINE		BIT(4)
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define PASS_FSCK_DEBUG		BIT(1)
-#else
-#define PASS_FSCK_DEBUG		0
-#endif
-
-/*
- * Passes may be reordered, but the second field is a persistent identifier and
- * must never change:
- */
-#define BCH_RECOVERY_PASSES()						\
-	x(recovery_pass_empty,			41, PASS_SILENT)	\
-	x(scan_for_btree_nodes,			37, 0)			\
-	x(check_topology,			 4, 0)			\
-	x(accounting_read,			39, PASS_ALWAYS)	\
-	x(alloc_read,				 0, PASS_ALWAYS)	\
-	x(stripes_read,				 1, 0)			\
-	x(initialize_subvolumes,		 2, 0)			\
-	x(snapshots_read,			 3, PASS_ALWAYS)	\
-	x(check_allocations,			 5, PASS_FSCK)		\
-	x(trans_mark_dev_sbs,			 6, PASS_ALWAYS|PASS_SILENT) \
-	x(fs_journal_alloc,			 7, PASS_ALWAYS|PASS_SILENT) \
-	x(set_may_go_rw,			 8, PASS_ALWAYS|PASS_SILENT) \
-	x(journal_replay,			 9, PASS_ALWAYS)	\
-	x(check_alloc_info,			10, PASS_ONLINE|PASS_FSCK) \
-	x(check_lrus,				11, PASS_ONLINE|PASS_FSCK) \
-	x(check_btree_backpointers,		12, PASS_ONLINE|PASS_FSCK) \
-	x(check_backpointers_to_extents,	13, PASS_ONLINE|PASS_FSCK_DEBUG) \
-	x(check_extents_to_backpointers,	14, PASS_ONLINE|PASS_FSCK) \
-	x(check_alloc_to_lru_refs,		15, PASS_ONLINE|PASS_FSCK) \
-	x(fs_freespace_init,			16, PASS_ALWAYS|PASS_SILENT) \
-	x(bucket_gens_init,			17, 0)			\
-	x(reconstruct_snapshots,		38, 0)			\
-	x(check_snapshot_trees,			18, PASS_ONLINE|PASS_FSCK) \
-	x(check_snapshots,			19, PASS_ONLINE|PASS_FSCK) \
-	x(check_subvols,			20, PASS_ONLINE|PASS_FSCK) \
-	x(check_subvol_children,		35, PASS_ONLINE|PASS_FSCK) \
-	x(delete_dead_snapshots,		21, PASS_ONLINE|PASS_FSCK) \
-	x(fs_upgrade_for_subvolumes,		22, 0)			\
-	x(check_inodes,				24, PASS_FSCK)		\
-	x(check_extents,			25, PASS_FSCK)		\
-	x(check_indirect_extents,		26, PASS_ONLINE|PASS_FSCK) \
-	x(check_dirents,			27, PASS_FSCK)		\
-	x(check_xattrs,				28, PASS_FSCK)		\
-	x(check_root,				29, PASS_ONLINE|PASS_FSCK) \
-	x(check_unreachable_inodes,		40, PASS_FSCK)		\
-	x(check_subvolume_structure,		36, PASS_ONLINE|PASS_FSCK) \
-	x(check_directory_structure,		30, PASS_ONLINE|PASS_FSCK) \
-	x(check_nlinks,				31, PASS_FSCK)		\
-	x(resume_logged_ops,			23, PASS_ALWAYS)	\
-	x(delete_dead_inodes,			32, PASS_ALWAYS)	\
-	x(fix_reflink_p,			33, 0)			\
-	x(set_fs_needs_rebalance,		34, 0)
-
-/* We normally enumerate recovery passes in the order we run them: */
-enum bch_recovery_pass {
-#define x(n, id, when)	BCH_RECOVERY_PASS_##n,
-	BCH_RECOVERY_PASSES()
-#undef x
-	BCH_RECOVERY_PASS_NR
-};
-
-/* But we also need stable identifiers that can be used in the superblock */
-enum bch_recovery_pass_stable {
-#define x(n, id, when)	BCH_RECOVERY_PASS_STABLE_##n = id,
-	BCH_RECOVERY_PASSES()
-#undef x
+struct bch_fs_recovery {
+	/*
+	 * Two different uses:
+	 * "Has this fsck pass run?" - i.e. should this type of error trigger
+	 * an emergency read-only?
+	 * And, in certain situations fsck will rewind to an earlier pass: used
+	 * for signaling to the toplevel code which pass we want to run now.
+	 */
+	enum bch_recovery_pass	curr_pass;
+	enum bch_recovery_pass	next_pass;
+	/* never rewinds version of curr_pass */
+	enum bch_recovery_pass	pass_done;
+	u64			passes_to_run;
+	/* bitmask of recovery passes that we actually ran */
+	u64			passes_complete;
+	u64			passes_failing;
+	u64			passes_ratelimiting;
+	spinlock_t		lock;
+	struct semaphore	run_lock;
+	struct work_struct	work;
 };
 
 #endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 710178e3da4c..238a362de19e 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -3,6 +3,7 @@
 #include "bkey_buf.h"
 #include "btree_update.h"
 #include "buckets.h"
+#include "enumerated_ref.h"
 #include "error.h"
 #include "extents.h"
 #include "inode.h"
@@ -63,6 +64,9 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
 		   REFLINK_P_IDX(p.v),
 		   le32_to_cpu(p.v->front_pad),
 		   le32_to_cpu(p.v->back_pad));
+
+	if (REFLINK_P_ERROR(p.v))
+		prt_str(out, " error");
 }
 
 bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
@@ -163,7 +167,7 @@ static int bch2_indirect_extent_not_missing(struct btree_trans *trans, struct bk
 		return 0;
 
 	return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
-		-BCH_ERR_transaction_restart_nested;
+		bch_err_throw(trans->c, transaction_restart_nested);
 }
 
 static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
@@ -179,7 +183,7 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
 	u64 live_end	= REFLINK_P_IDX(p.v) + p.k->size;
 	u64 refd_start	= live_start - le32_to_cpu(p.v->front_pad);
 	u64 refd_end	= live_end + le32_to_cpu(p.v->back_pad);
-	struct printbuf buf = PRINTBUF;
+	CLASS(printbuf, buf)();
 	int ret = 0;
 
 	BUG_ON(missing_start < refd_start);
@@ -191,7 +195,7 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
 		prt_printf(&buf, "pointer to missing indirect extent in ");
 		ret = bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos);
 		if (ret)
-			goto err;
+			return ret;
 
 		prt_printf(&buf, "-%llu\n", (missing_pos.offset + (missing_end - missing_start)) << 9);
 		bch2_bkey_val_to_text(&buf, c, p.s_c);
@@ -203,7 +207,7 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
 		struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
 		ret = PTR_ERR_OR_ZERO(new);
 		if (ret)
-			goto err;
+			return ret;
 
 		/*
 		 * Is the missing range not actually needed?
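(As a standalone sketch of the padding arithmetic used by bch2_indirect_extent_missing_error() above - the numeric values here are hypothetical and only illustrate how front_pad/back_pad widen a reflink pointer's live range; this is an editorial illustration, not code from the patch:)

	#include <inttypes.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t idx = 1000, size = 128;	/* REFLINK_P_IDX(p.v), p.k->size (example values) */
		uint32_t front_pad = 16, back_pad = 8;	/* p.v->front_pad, p.v->back_pad (example values) */

		uint64_t live_start = idx;			/* 1000 */
		uint64_t live_end   = idx + size;		/* 1128 */
		uint64_t refd_start = live_start - front_pad;	/*  984 */
		uint64_t refd_end   = live_end + back_pad;	/* 1136 */

		/*
		 * A missing indirect extent lying entirely inside the padding
		 * can be dropped by shrinking front_pad/back_pad; one that
		 * overlaps [live_start, live_end) must instead be flagged
		 * with REFLINK_P_ERROR, per the repair path above.
		 */
		printf("live [%" PRIu64 ",%" PRIu64 "), referenced [%" PRIu64 ",%" PRIu64 ")\n",
		       live_start, live_end, refd_start, refd_end);
		return 0;
	}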
@@ -234,15 +238,13 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
 
 		ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun);
 		if (ret)
-			goto err;
+			return ret;
 
 		if (should_commit)
 			ret =   bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
-				-BCH_ERR_transaction_restart_nested;
+				bch_err_throw(c, transaction_restart_nested);
 	}
-err:
 fsck_err:
-	printbuf_exit(&buf);
 	return ret;
 }
 
@@ -262,33 +264,32 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans,
 
 	u64 reflink_offset = REFLINK_P_IDX(p.v) + *offset_into_extent;
 
-	struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_reflink,
-					       POS(0, reflink_offset), iter_flags);
-	if (bkey_err(k))
-		return k;
+	bch2_trans_iter_init(trans, iter, BTREE_ID_reflink, POS(0, reflink_offset), iter_flags);
+	struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+	int ret = bkey_err(k);
+	if (ret)
+		goto err;
 
 	if (unlikely(!bkey_extent_is_reflink_data(k.k))) {
-		unsigned size = min((u64) k.k->size,
-				    REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) -
-				    reflink_offset);
-		bch2_key_resize(&iter->k, size);
-
-		int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset,
-							     k.k->p.offset, should_commit);
-		if (ret) {
-			bch2_trans_iter_exit(trans, iter);
-			return bkey_s_c_err(ret);
-		}
+		u64 missing_end = min(k.k->p.offset,
+				      REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad));
+		BUG_ON(reflink_offset == missing_end);
+
+		ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset,
+							 missing_end, should_commit);
+		if (ret)
+			goto err;
 	} else if (unlikely(REFLINK_P_ERROR(p.v))) {
-		int ret = bch2_indirect_extent_not_missing(trans, p, should_commit);
-		if (ret) {
-			bch2_trans_iter_exit(trans, iter);
-			return bkey_s_c_err(ret);
-		}
+		ret = bch2_indirect_extent_not_missing(trans, p, should_commit);
+		if (ret)
+			goto err;
 	}
 
 	*offset_into_extent = reflink_offset - bkey_start_offset(k.k);
 	return k;
+err:
+	bch2_trans_iter_exit(iter);
+	return bkey_s_c_err(ret);
 }
 
 /* reflink pointer trigger */
@@ -298,7 +299,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
 	enum btree_iter_update_trigger_flags flags)
 {
 	struct bch_fs *c = trans->c;
-	struct printbuf buf = PRINTBUF;
+	CLASS(printbuf, buf)();
 
 	s64 offset_into_extent = *idx - REFLINK_P_IDX(p.v);
 	struct btree_iter iter;
@@ -311,7 +312,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
 
 	if (!bkey_refcount_c(k)) {
 		if (!(flags & BTREE_TRIGGER_overwrite))
-			ret = -BCH_ERR_missing_indirect_extent;
+			ret = bch_err_throw(c, missing_indirect_extent);
 		goto next;
 	}
 
@@ -356,8 +357,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
 	*idx = k.k->p.offset;
 err:
 fsck_err:
-	bch2_trans_iter_exit(trans, &iter);
-	printbuf_exit(&buf);
+	bch2_trans_iter_exit(&iter);
 	return ret;
 }
 
@@ -371,7 +371,7 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
 	int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1;
 	u64 next_idx	= REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad);
 	s64 ret		= 0;
-	struct printbuf buf = PRINTBUF;
+	CLASS(printbuf, buf)();
 
 	if (r_idx >= c->reflink_gc_nr)
 		goto not_found;
@@ -391,12 +391,10 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
 	if (flags & BTREE_TRIGGER_check_repair) {
 		ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false);
 		if (ret)
-			goto err;
+			return ret;
 	}
 
 	*idx = next_idx;
-err:
-	printbuf_exit(&buf);
 	return ret;
 }
 
@@ -495,22 +493,16 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
 	bool reflink_p_may_update_opts_field)
 {
 	struct bch_fs *c = trans->c;
-	struct btree_iter reflink_iter = {};
-	struct bkey_s_c k;
-	struct bkey_i *r_v;
-	struct bkey_i_reflink_p *r_p;
-	__le64 *refcount;
-	int ret;
 
 	if (orig->k.type == KEY_TYPE_inline_data)
 		bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data);
 
-	bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX,
-			     BTREE_ITER_intent);
-	k = bch2_btree_iter_peek_prev(trans, &reflink_iter);
-	ret = bkey_err(k);
+	CLASS(btree_iter, reflink_iter)(trans, BTREE_ID_reflink, POS_MAX,
+					BTREE_ITER_intent);
+	struct bkey_s_c k = bch2_btree_iter_peek_prev(&reflink_iter);
+	int ret = bkey_err(k);
 	if (ret)
-		goto err;
+		return ret;
 
 	/*
 	 * XXX: we're assuming that 56 bits will be enough for the life of the
@@ -520,10 +512,10 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
 	if (bkey_ge(reflink_iter.pos, POS(0, REFLINK_P_IDX_MAX - orig->k.size)))
 		return -ENOSPC;
 
-	r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k));
+	struct bkey_i *r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k));
 	ret = PTR_ERR_OR_ZERO(r_v);
 	if (ret)
-		goto err;
+		return ret;
 
 	bkey_init(&r_v->k);
 	r_v->k.type	= bkey_type_to_indirect(&orig->k);
@@ -533,20 +525,21 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
 
 	set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
 
-	refcount	= bkey_refcount(bkey_i_to_s(r_v));
+	__le64 *refcount = bkey_refcount(bkey_i_to_s(r_v));
 	*refcount	= 0;
 	memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));
 
 	ret = bch2_trans_update(trans, &reflink_iter, r_v, 0);
 	if (ret)
-		goto err;
+		return ret;
 
 	/*
 	 * orig is in a bkey_buf which statically allocates 5 64s for the val,
 	 * so we know it will be big enough:
 	 */
 	orig->k.type = KEY_TYPE_reflink_p;
-	r_p = bkey_i_to_reflink_p(orig);
+
+	struct bkey_i_reflink_p *r_p = bkey_i_to_reflink_p(orig);
 	set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
 
 	/* FORTIFY_SOURCE is broken here, and doesn't provide unsafe_memset() */
@@ -561,21 +554,16 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
 	if (reflink_p_may_update_opts_field)
 		SET_REFLINK_P_MAY_UPDATE_OPTIONS(&r_p->v, true);
 
-	ret = bch2_trans_update(trans, extent_iter, &r_p->k_i,
-				BTREE_UPDATE_internal_snapshot_node);
-err:
-	bch2_trans_iter_exit(trans, &reflink_iter);
-
-	return ret;
+	return bch2_trans_update(trans, extent_iter, &r_p->k_i,
+				 BTREE_UPDATE_internal_snapshot_node);
 }
 
-static struct bkey_s_c get_next_src(struct btree_trans *trans,
-				    struct btree_iter *iter, struct bpos end)
+static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
 {
 	struct bkey_s_c k;
 	int ret;
 
-	for_each_btree_key_max_continue_norestart(trans, *iter, end, 0, k, ret) {
+	for_each_btree_key_max_continue_norestart(*iter, end, 0, k, ret) {
 		if (bkey_extent_is_unwritten(k))
 			continue;
 
@@ -584,7 +572,7 @@ static struct bkey_s_c get_next_src(struct btree_trans *trans,
 	}
 
 	if (bkey_ge(iter->pos, end))
-		bch2_btree_iter_set_pos(trans, iter, end);
+		bch2_btree_iter_set_pos(iter, end);
 
 	return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
 }
@@ -595,7 +583,6 @@ s64 bch2_remap_range(struct bch_fs *c,
 		     u64 new_i_size, s64 *i_sectors_delta,
 		     bool may_change_src_io_path_opts)
 {
-	struct btree_trans *trans;
 	struct btree_iter dst_iter, src_iter;
 	struct bkey_s_c src_k;
 	struct bkey_buf new_dst, new_src;
@@ -610,8 +597,8 @@ s64 bch2_remap_range(struct bch_fs *c,
 		!bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
 	int ret = 0, ret2 = 0;
 
-	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink))
-		return -BCH_ERR_erofs_no_writes;
+	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_reflink))
+		return bch_err_throw(c, erofs_no_writes);
 
 	bch2_check_set_feature(c, BCH_FEATURE_reflink);
 
@@ -620,7 +607,7 @@ s64 bch2_remap_range(struct bch_fs *c,
 	bch2_bkey_buf_init(&new_dst);
 	bch2_bkey_buf_init(&new_src);
 
-	trans = bch2_trans_get(c);
+	CLASS(btree_trans, trans)(c);
 
 	ret = bch2_inum_opts_get(trans, src_inum, &opts);
 	if (ret)
@@ -648,27 +635,27 @@ s64 bch2_remap_range(struct bch_fs *c,
 		if (ret)
 			continue;
 
-		bch2_btree_iter_set_snapshot(trans, &src_iter, src_snapshot);
+		bch2_btree_iter_set_snapshot(&src_iter, src_snapshot);
 
 		ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol,
 						  &dst_snapshot);
 		if (ret)
 			continue;
 
-		bch2_btree_iter_set_snapshot(trans, &dst_iter, dst_snapshot);
+		bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
 
 		if (dst_inum.inum < src_inum.inum) {
 			/* Avoid some lock cycle transaction restarts */
-			ret = bch2_btree_iter_traverse(trans, &dst_iter);
+			ret = bch2_btree_iter_traverse(&dst_iter);
 			if (ret)
 				continue;
 		}
 
 		dst_done = dst_iter.pos.offset - dst_start.offset;
 		src_want = POS(src_start.inode, src_start.offset + dst_done);
-		bch2_btree_iter_set_pos(trans, &src_iter, src_want);
+		bch2_btree_iter_set_pos(&src_iter, src_want);
 
-		src_k = get_next_src(trans, &src_iter, src_end);
+		src_k = get_next_src(&src_iter, src_end);
 		ret = bkey_err(src_k);
 		if (ret)
 			continue;
@@ -710,7 +697,8 @@ s64 bch2_remap_range(struct bch_fs *c,
 			SET_REFLINK_P_IDX(&dst_p->v, offset);
 
 			if (reflink_p_may_update_opts_field &&
-			    may_change_src_io_path_opts)
+			    may_change_src_io_path_opts &&
+			    REFLINK_P_MAY_UPDATE_OPTIONS(src_p.v))
 				SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true);
 		} else {
 			BUG();
@@ -728,8 +716,8 @@ s64 bch2_remap_range(struct bch_fs *c,
 				true);
 		bch2_disk_reservation_put(c, &disk_res);
 	}
-	bch2_trans_iter_exit(trans, &dst_iter);
-	bch2_trans_iter_exit(trans, &src_iter);
+	bch2_trans_iter_exit(&dst_iter);
+	bch2_trans_iter_exit(&src_iter);
 
 	BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end));
 	BUG_ON(bkey_gt(dst_iter.pos, dst_end));
@@ -739,7 +727,7 @@ s64 bch2_remap_range(struct bch_fs *c,
 
 	do {
 		struct bch_inode_unpacked inode_u;
-		struct btree_iter inode_iter = {};
+		struct btree_iter inode_iter = { NULL };
 
 		bch2_trans_begin(trans);
 
@@ -754,14 +742,13 @@ s64 bch2_remap_range(struct bch_fs *c,
 					    BCH_TRANS_COMMIT_no_enospc);
 		}
 
-		bch2_trans_iter_exit(trans, &inode_iter);
+		bch2_trans_iter_exit(&inode_iter);
 	} while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
 err:
-	bch2_trans_put(trans);
 	bch2_bkey_buf_exit(&new_src, c);
 	bch2_bkey_buf_exit(&new_dst, c);
-	bch2_write_ref_put(c, BCH_WRITE_REF_reflink);
+	enumerated_ref_put(&c->writes, BCH_WRITE_REF_reflink);
 
 	return dst_done ?: ret ?: ret2;
 }
@@ -775,7 +762,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
 {
 	struct bch_fs *c = trans->c;
 	const __le64 *refcount = bkey_refcount_c(k);
-	struct printbuf buf = PRINTBUF;
+	CLASS(printbuf, buf)();
 	struct reflink_gc *r;
 	int ret = 0;
 
@@ -803,7 +790,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
 		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
 		ret = PTR_ERR_OR_ZERO(new);
 		if (ret)
-			goto out;
+			return ret;
 
 		if (!r->refcount)
 			new->k.type = KEY_TYPE_deleted;
@@ -811,32 +798,30 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
 			*bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
 		ret = bch2_trans_update(trans, iter, new, 0);
 	}
-out:
 fsck_err:
-	printbuf_exit(&buf);
 	return ret;
 }
 
 int bch2_gc_reflink_done(struct bch_fs *c)
 {
+	CLASS(btree_trans, trans)(c);
 	size_t idx = 0;
 
-	int ret = bch2_trans_run(c,
-		for_each_btree_key_commit(trans, iter,
+	int ret = for_each_btree_key_commit(trans, iter,
 				BTREE_ID_reflink, POS_MIN,
 				BTREE_ITER_prefetch, k,
 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			bch2_gc_write_reflink_key(trans, &iter, k, &idx)));
+			bch2_gc_write_reflink_key(trans, &iter, k, &idx));
 	c->reflink_gc_nr = 0;
 	return ret;
 }
 
 int bch2_gc_reflink_start(struct bch_fs *c)
 {
+	CLASS(btree_trans, trans)(c);
 	c->reflink_gc_nr = 0;
 
-	int ret = bch2_trans_run(c,
-		for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
+	int ret = for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
 				   BTREE_ITER_prefetch, k, ({
 			const __le64 *refcount = bkey_refcount_c(k);
@@ -846,7 +831,7 @@ int bch2_gc_reflink_start(struct bch_fs *c)
 			struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
 							c->reflink_gc_nr++, GFP_KERNEL);
 			if (!r) {
-				ret = -BCH_ERR_ENOMEM_gc_reflink_start;
+				ret = bch_err_throw(c, ENOMEM_gc_reflink_start);
 				break;
 			}
 
@@ -854,7 +839,7 @@ int bch2_gc_reflink_start(struct bch_fs *c)
 			r->size		= k.k->size;
 			r->refcount	= 0;
 			0;
-		})));
+		}));
 
 	bch_err_fn(c, ret);
 	return ret;
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 477ef0997949..0784283ce78c 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -119,7 +119,7 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
 	return 0;
 bad:
 	bch2_replicas_entry_to_text(err, r);
-	return -BCH_ERR_invalid_replicas_entry;
+	return bch_err_throw(c, invalid_replicas_entry);
 }
 
 void bch2_cpu_replicas_to_text(struct printbuf *out,
@@ -286,11 +286,8 @@ bool bch2_replicas_marked_locked(struct bch_fs *c,
 bool bch2_replicas_marked(struct bch_fs *c,
 			  struct bch_replicas_entry_v1 *search)
 {
-	percpu_down_read(&c->mark_lock);
-	bool ret = bch2_replicas_marked_locked(c, search);
-	percpu_up_read(&c->mark_lock);
-
-	return ret;
+	guard(percpu_read)(&c->mark_lock);
+	return bch2_replicas_marked_locked(c, search);
 }
 
 noinline
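(The conversion above - and throughout this patch - replaces explicit lock/unlock pairs with the scope-based guard()/scoped_guard() helpers from the kernel's <linux/cleanup.h>, so the unlock runs automatically on every return path. A userspace approximation of that pattern, using the compiler's cleanup attribute; this is an editorial sketch, not kernel code, and the names guard_mutex/mark_lock here are invented for illustration:)

	#include <pthread.h>

	static pthread_mutex_t mark_lock = PTHREAD_MUTEX_INITIALIZER;

	static void unlock_cleanup(pthread_mutex_t **m)
	{
		pthread_mutex_unlock(*m);	/* runs when the guard variable leaves scope */
	}

	#define guard_mutex(m) \
		pthread_mutex_t *__guard __attribute__((cleanup(unlock_cleanup))) = \
			(pthread_mutex_lock(m), (m))

	static int replicas_marked(int has_entry)
	{
		guard_mutex(&mark_lock);
		return has_entry;	/* early return cannot leak the lock */
	}

	int main(void)
	{
		return replicas_marked(1) ? 0 : 1;
	}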
@@ -305,27 +302,27 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
 	memset(&new_r, 0, sizeof(new_r));
 	memset(&new_gc, 0, sizeof(new_gc));
 
-	mutex_lock(&c->sb_lock);
+	guard(mutex)(&c->sb_lock);
 
 	if (c->replicas_gc.entries &&
 	    !__replicas_has_entry(&c->replicas_gc, new_entry)) {
 		new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
 		if (!new_gc.entries) {
-			ret = -BCH_ERR_ENOMEM_cpu_replicas;
-			goto err;
+			ret = bch_err_throw(c, ENOMEM_cpu_replicas);
+			goto out;
 		}
 	}
 
 	if (!__replicas_has_entry(&c->replicas, new_entry)) {
 		new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
 		if (!new_r.entries) {
-			ret = -BCH_ERR_ENOMEM_cpu_replicas;
-			goto err;
+			ret = bch_err_throw(c, ENOMEM_cpu_replicas);
+			goto out;
 		}
 
 		ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
 		if (ret)
-			goto err;
+			goto out;
 	}
 
 	if (!new_r.entries &&
@@ -338,22 +335,18 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
 	bch2_write_super(c);
 
 	/* don't update in memory replicas until changes are persistent */
-	percpu_down_write(&c->mark_lock);
-	if (new_r.entries)
-		swap(c->replicas, new_r);
-	if (new_gc.entries)
-		swap(new_gc, c->replicas_gc);
-	percpu_up_write(&c->mark_lock);
+	scoped_guard(percpu_write, &c->mark_lock) {
+		if (new_r.entries)
+			swap(c->replicas, new_r);
+		if (new_gc.entries)
+			swap(new_gc, c->replicas_gc);
+	}
 out:
-	mutex_unlock(&c->sb_lock);
-
 	kfree(new_r.entries);
 	kfree(new_gc.entries);
 
-	return ret;
-err:
 	bch_err_msg(c, ret, "adding replicas entry");
-	goto out;
+	return ret;
 }
 
 int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
@@ -371,24 +364,20 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
 {
 	lockdep_assert_held(&c->replicas_gc_lock);
 
-	mutex_lock(&c->sb_lock);
-	percpu_down_write(&c->mark_lock);
-
-	ret =   ret ?:
-		bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
-	if (!ret)
-		swap(c->replicas, c->replicas_gc);
-
-	kfree(c->replicas_gc.entries);
-	c->replicas_gc.entries = NULL;
+	guard(mutex)(&c->sb_lock);
+	scoped_guard(percpu_write, &c->mark_lock) {
+		ret =   ret ?:
+			bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
+		if (!ret)
+			swap(c->replicas, c->replicas_gc);
 
-	percpu_up_write(&c->mark_lock);
+		kfree(c->replicas_gc.entries);
+		c->replicas_gc.entries = NULL;
+	}
 
 	if (!ret)
 		bch2_write_super(c);
 
-	mutex_unlock(&c->sb_lock);
-
 	return ret;
 }
 
@@ -399,7 +388,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
 
 	lockdep_assert_held(&c->replicas_gc_lock);
 
-	mutex_lock(&c->sb_lock);
+	guard(mutex)(&c->sb_lock);
 	BUG_ON(c->replicas_gc.entries);
 
 	c->replicas_gc.nr		= 0;
@@ -420,9 +409,8 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
 					 c->replicas_gc.entry_size,
 					 GFP_KERNEL);
 	if (!c->replicas_gc.entries) {
-		mutex_unlock(&c->sb_lock);
 		bch_err(c, "error allocating c->replicas_gc");
-		return -BCH_ERR_ENOMEM_replicas_gc;
+		return bch_err_throw(c, ENOMEM_replicas_gc);
 	}
 
 	for_each_cpu_replicas_entry(&c->replicas, e)
@@ -432,8 +420,6 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
 			       e, c->replicas_gc.entry_size);
 
 	bch2_cpu_replicas_sort(&c->replicas_gc);
-	mutex_unlock(&c->sb_lock);
-
 	return 0;
 }
 
@@ -458,58 +444,51 @@ int bch2_replicas_gc2(struct bch_fs *c)
 	new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL);
 	if (!new.entries) {
 		bch_err(c, "error allocating c->replicas_gc");
-		return -BCH_ERR_ENOMEM_replicas_gc;
-	}
-
-	mutex_lock(&c->sb_lock);
-	percpu_down_write(&c->mark_lock);
-
-	if (nr			!= c->replicas.nr ||
-	    new.entry_size	!= c->replicas.entry_size) {
-		percpu_up_write(&c->mark_lock);
-		mutex_unlock(&c->sb_lock);
-		kfree(new.entries);
-		goto retry;
+		return bch_err_throw(c, ENOMEM_replicas_gc);
 	}
 
-	for (unsigned i = 0; i < c->replicas.nr; i++) {
-		struct bch_replicas_entry_v1 *e =
-			cpu_replicas_entry(&c->replicas, i);
+	guard(mutex)(&c->sb_lock);
+	scoped_guard(percpu_write, &c->mark_lock) {
+		if (nr			!= c->replicas.nr ||
+		    new.entry_size	!= c->replicas.entry_size) {
+			kfree(new.entries);
+			goto retry;
+		}
 
-		struct disk_accounting_pos k = {
-			.type = BCH_DISK_ACCOUNTING_replicas,
-		};
+		for (unsigned i = 0; i < c->replicas.nr; i++) {
+			struct bch_replicas_entry_v1 *e =
+				cpu_replicas_entry(&c->replicas, i);
 
-		unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e),
-			      "embedded variable length struct");
variable length struct"); - struct bch_accounting_mem *acc = &c->accounting; - bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &p) >= acc->k.nr; + struct bpos p = disk_accounting_pos_to_bpos(&k); - if (e->data_type == BCH_DATA_journal || !kill) - memcpy(cpu_replicas_entry(&new, new.nr++), - e, new.entry_size); - } + struct bch_accounting_mem *acc = &c->accounting; + bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &p) >= acc->k.nr; - bch2_cpu_replicas_sort(&new); + if (e->data_type == BCH_DATA_journal || !kill) + memcpy(cpu_replicas_entry(&new, new.nr++), + e, new.entry_size); + } - ret = bch2_cpu_replicas_to_sb_replicas(c, &new); + bch2_cpu_replicas_sort(&new); - if (!ret) - swap(c->replicas, new); + ret = bch2_cpu_replicas_to_sb_replicas(c, &new); - kfree(new.entries); + if (!ret) + swap(c->replicas, new); - percpu_up_write(&c->mark_lock); + kfree(new.entries); + } if (!ret) bch2_write_super(c); - - mutex_unlock(&c->sb_lock); - return ret; } @@ -597,9 +576,8 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) bch2_cpu_replicas_sort(&new_r); - percpu_down_write(&c->mark_lock); + guard(percpu_write)(&c->mark_lock); swap(c->replicas, new_r); - percpu_up_write(&c->mark_lock); kfree(new_r.entries); @@ -622,7 +600,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) - return -BCH_ERR_ENOSPC_sb_replicas; + return bch_err_throw(c, ENOSPC_sb_replicas); bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0); @@ -667,7 +645,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, sb_r = bch2_sb_field_resize(&c->disk_sb, replicas, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) - return -BCH_ERR_ENOSPC_sb_replicas; + return bch_err_throw(c, ENOSPC_sb_replicas); bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas); @@ -809,9 +787,8 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, unsigned flags, bool print) { struct bch_replicas_entry_v1 *e; - bool ret = true; - percpu_down_read(&c->mark_lock); + guard(percpu_read)(&c->mark_lock); for_each_cpu_replicas_entry(&c->replicas, e) { unsigned nr_online = 0, nr_failed = 0, dflags = 0; bool metadata = e->data_type < BCH_DATA_user; @@ -819,19 +796,18 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, if (e->data_type == BCH_DATA_cached) continue; - rcu_read_lock(); - for (unsigned i = 0; i < e->nr_devs; i++) { - if (e->devs[i] == BCH_SB_MEMBER_INVALID) { - nr_failed++; - continue; - } + scoped_guard(rcu) + for (unsigned i = 0; i < e->nr_devs; i++) { + if (e->devs[i] == BCH_SB_MEMBER_INVALID) { + nr_failed++; + continue; + } - nr_online += test_bit(e->devs[i], devs.d); + nr_online += test_bit(e->devs[i], devs.d); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]); - nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed; - } - rcu_read_unlock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]); + nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed; + } if (nr_online + nr_failed == e->nr_devs) continue; @@ -848,21 +824,18 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, if (dflags & ~flags) { if (print) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); bch2_replicas_entry_to_text(&buf, e); bch_err(c, 
"insufficient devices online (%u) for replicas entry %s", nr_online, buf.buf); - printbuf_exit(&buf); } - ret = false; - break; + return false; } } - percpu_up_read(&c->mark_lock); - return ret; + return true; } unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) @@ -905,11 +878,8 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) { - mutex_lock(&c->sb_lock); - unsigned ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); - mutex_unlock(&c->sb_lock); - - return ret; + guard(mutex)(&c->sb_lock); + return bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); } void bch2_fs_replicas_exit(struct bch_fs *c) diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index 59c8770e4a0e..a5916984565e 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -89,8 +89,8 @@ int bch2_verify_superblock_clean(struct bch_fs *c, { unsigned i; struct bch_sb_field_clean *clean = *cleanp; - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; + CLASS(printbuf, buf1)(); + CLASS(printbuf, buf2)(); int ret = 0; if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, @@ -140,8 +140,6 @@ int bch2_verify_superblock_clean(struct bch_fs *c, l2, buf2.buf); } fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); return ret; } @@ -150,7 +148,7 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) struct bch_sb_field_clean *clean, *sb_clean; int ret; - mutex_lock(&c->sb_lock); + guard(mutex)(&c->sb_lock); sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean); if (fsck_err_on(!sb_clean, c, @@ -158,29 +156,22 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) "superblock marked clean but clean section not present")) { SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->sb.clean = false; - mutex_unlock(&c->sb_lock); return ERR_PTR(-BCH_ERR_invalid_sb_clean); } clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), GFP_KERNEL); - if (!clean) { - mutex_unlock(&c->sb_lock); + if (!clean) return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean); - } ret = bch2_sb_clean_validate_late(c, clean, READ); if (ret) { kfree(clean); - mutex_unlock(&c->sb_lock); return ERR_PTR(ret); } - mutex_unlock(&c->sb_lock); - return clean; fsck_err: - mutex_unlock(&c->sb_lock); return ERR_PTR(ret); } @@ -265,21 +256,16 @@ const struct bch_sb_field_ops bch_sb_field_ops_clean = { int bch2_fs_mark_dirty(struct bch_fs *c) { - int ret; - /* * Unconditionally write superblock, to verify it hasn't changed before * we go rw: */ - mutex_lock(&c->sb_lock); + guard(mutex)(&c->sb_lock); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); - ret = bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return ret; + return bch2_write_super(c); } void bch2_fs_mark_clean(struct bch_fs *c) @@ -289,9 +275,9 @@ void bch2_fs_mark_clean(struct bch_fs *c) unsigned u64s; int ret; - mutex_lock(&c->sb_lock); + guard(mutex)(&c->sb_lock); if (BCH_SB_CLEAN(c->disk_sb.sb)) - goto out; + return; SET_BCH_SB_CLEAN(c->disk_sb.sb, true); @@ -305,7 +291,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) sb_clean = bch2_sb_field_resize(&c->disk_sb, clean, u64s); if (!sb_clean) { bch_err(c, "error resizing superblock while setting filesystem clean"); - goto out; + return; } sb_clean->flags = 0; @@ -329,12 +315,10 @@ void bch2_fs_mark_clean(struct bch_fs *c) ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); if (ret) { bch_err(c, "error writing marking filesystem clean: validate 
error"); - goto out; + return; } bch2_journal_pos_from_member_info_set(c); bch2_write_super(c); -out: - mutex_unlock(&c->sb_lock); } diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h index fa27ec59a647..f3ea53a55384 100644 --- a/fs/bcachefs/sb-counters_format.h +++ b/fs/bcachefs/sb-counters_format.h @@ -12,10 +12,17 @@ enum counters_flags { x(io_read_inline, 80, TYPE_SECTORS) \ x(io_read_hole, 81, TYPE_SECTORS) \ x(io_read_promote, 30, TYPE_COUNTER) \ + x(io_read_nopromote, 85, TYPE_COUNTER) \ + x(io_read_nopromote_may_not, 86, TYPE_COUNTER) \ + x(io_read_nopromote_already_promoted, 87, TYPE_COUNTER) \ + x(io_read_nopromote_unwritten, 88, TYPE_COUNTER) \ + x(io_read_nopromote_congested, 89, TYPE_COUNTER) \ + x(io_read_nopromote_in_flight, 90, TYPE_COUNTER) \ x(io_read_bounce, 31, TYPE_COUNTER) \ x(io_read_split, 33, TYPE_COUNTER) \ x(io_read_reuse_race, 34, TYPE_COUNTER) \ x(io_read_retry, 32, TYPE_COUNTER) \ + x(io_read_fail_and_poison, 82, TYPE_COUNTER) \ x(io_write, 1, TYPE_SECTORS) \ x(io_move, 2, TYPE_SECTORS) \ x(io_move_read, 35, TYPE_SECTORS) \ @@ -24,6 +31,10 @@ enum counters_flags { x(io_move_fail, 38, TYPE_COUNTER) \ x(io_move_write_fail, 82, TYPE_COUNTER) \ x(io_move_start_fail, 39, TYPE_COUNTER) \ + x(io_move_drop_only, 91, TYPE_COUNTER) \ + x(io_move_noop, 92, TYPE_COUNTER) \ + x(io_move_created_rebalance, 83, TYPE_COUNTER) \ + x(io_move_evacuate_bucket, 84, TYPE_COUNTER) \ x(bucket_invalidate, 3, TYPE_COUNTER) \ x(bucket_discard, 4, TYPE_COUNTER) \ x(bucket_discard_fast, 79, TYPE_COUNTER) \ diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index badd0e17ada5..de56a1ee79db 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -100,7 +100,11 @@ BCH_FSCK_ERR_ptr_to_missing_backpointer) \ x(stripe_backpointers, \ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ - BCH_FSCK_ERR_ptr_to_missing_backpointer) + BCH_FSCK_ERR_ptr_to_missing_backpointer) \ + x(inode_has_case_insensitive, \ + BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ + BCH_FSCK_ERR_inode_has_case_insensitive_not_set, \ + BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ @@ -187,7 +191,7 @@ int bch2_sb_set_upgrade_extra(struct bch_fs *c) bool write_sb = false; int ret = 0; - mutex_lock(&c->sb_lock); + guard(mutex)(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); if (old_version < bcachefs_metadata_version_bucket_stripe_sectors && @@ -201,7 +205,6 @@ int bch2_sb_set_upgrade_extra(struct bch_fs *c) if (write_sb) bch2_write_super(c); - mutex_unlock(&c->sb_lock); return ret < 0 ? 
@@ -249,6 +252,7 @@ DOWNGRADE_TABLE()
 
 static int downgrade_table_extra(struct bch_fs *c, darray_char *table)
 {
+	unsigned dst_offset = table->nr;
 	struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table);
 	unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
 	int ret = 0;
@@ -264,6 +268,9 @@ static int downgrade_table_extra(struct bch_fs *c, darray_char *table)
 		if (ret)
 			return ret;
 
+		dst = (void *) &table->data[dst_offset];
+		dst->nr_errors = cpu_to_le16(nr_errors + 1);
+
 		/* open coded __set_bit_le64, as dst is packed and
 		 * dst->recovery_passes is misaligned */
 		unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations;
@@ -274,7 +281,6 @@ static int downgrade_table_extra(struct bch_fs *c, darray_char *table)
 		break;
 	}
 
-	dst->nr_errors = cpu_to_le16(nr_errors);
 	return ret;
 }
 
@@ -365,7 +371,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
 	if (!test_bit(BCH_FS_btree_running, &c->flags))
 		return 0;
 
-	darray_char table = {};
+	CLASS(darray_char, table)();
 	int ret = 0;
 
 	for (const struct upgrade_downgrade_entry *src = downgrade_table;
@@ -374,12 +380,15 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
 		if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
 			continue;
 
+		if (src->version < c->sb.version_incompat)
+			continue;
+
 		struct bch_sb_field_downgrade_entry *dst;
 		unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors;
 
 		ret = darray_make_room(&table, bytes);
 		if (ret)
-			goto out;
+			return ret;
 
 		dst = (void *) &darray_top(table);
 		dst->version = cpu_to_le16(src->version);
@@ -391,7 +400,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
 
 		ret = downgrade_table_extra(c, &table);
 		if (ret)
-			goto out;
+			return ret;
 
 		if (!dst->recovery_passes[0] &&
 		    !dst->recovery_passes[1] &&
@@ -406,18 +415,14 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
 	unsigned sb_u64s = DIV_ROUND_UP(sizeof(*d) + table.nr, sizeof(u64));
 
 	if (d && le32_to_cpu(d->field.u64s) > sb_u64s)
-		goto out;
+		return 0;
 
 	d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s);
-	if (!d) {
-		ret = -BCH_ERR_ENOSPC_sb_downgrade;
-		goto out;
-	}
+	if (!d)
+		return bch_err_throw(c, ENOSPC_sb_downgrade);
 
 	memcpy(d->entries, table.data, table.nr);
 	memset_u64s_tail(d->entries, 0, table.nr);
-out:
-	darray_exit(&table);
 	return ret;
 }
diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
index 013a96883b4e..41a259eab4fb 100644
--- a/fs/bcachefs/sb-errors.c
+++ b/fs/bcachefs/sb-errors.c
@@ -78,6 +78,28 @@ const struct bch_sb_field_ops bch_sb_field_ops_errors = {
 	.to_text	= bch2_sb_errors_to_text,
 };
 
+void bch2_fs_errors_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	if (out->nr_tabstops < 1)
+		printbuf_tabstop_push(out, 48);
+	if (out->nr_tabstops < 2)
+		printbuf_tabstop_push(out, 8);
+	if (out->nr_tabstops < 3)
+		printbuf_tabstop_push(out, 16);
+
+	guard(mutex)(&c->fsck_error_counts_lock);
+
+	bch_sb_errors_cpu *e = &c->fsck_error_counts;
+	darray_for_each(*e, i) {
+		bch2_sb_error_id_to_text(out, i->id);
+		prt_tab(out);
+		prt_u64(out, i->nr);
+		prt_tab(out);
+		bch2_prt_datetime(out, i->last_error_time);
+		prt_newline(out);
+	}
+}
+
 void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err)
 {
 	bch_sb_errors_cpu *e = &c->fsck_error_counts;
@@ -88,75 +110,66 @@ void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err)
 	};
 	unsigned i;
 
-	mutex_lock(&c->fsck_error_counts_lock);
+	guard(mutex)(&c->fsck_error_counts_lock);
+
 	for (i = 0; i < e->nr; i++) {
 		if (err == e->data[i].id) {
 			e->data[i].nr++;
 			e->data[i].last_error_time = n.last_error_time;
-			goto out;
+			return;
 		}
 		if (err < e->data[i].id)
 			break;
 	}
 
 	if (darray_make_room(e, 1))
-		goto out;
+		return;
 
 	darray_insert_item(e, i, n);
-out:
-	mutex_unlock(&c->fsck_error_counts_lock);
 }
 
 void bch2_sb_errors_from_cpu(struct bch_fs *c)
 {
-	bch_sb_errors_cpu *src = &c->fsck_error_counts;
-	struct bch_sb_field_errors *dst;
-	unsigned i;
-
-	mutex_lock(&c->fsck_error_counts_lock);
-
-	dst = bch2_sb_field_resize(&c->disk_sb, errors,
-				   bch2_sb_field_errors_u64s(src->nr));
+	guard(mutex)(&c->fsck_error_counts_lock);
 
+	bch_sb_errors_cpu *src = &c->fsck_error_counts;
+	struct bch_sb_field_errors *dst =
+		bch2_sb_field_resize(&c->disk_sb, errors,
+				     bch2_sb_field_errors_u64s(src->nr));
 	if (!dst)
-		goto err;
+		return;
 
-	for (i = 0; i < src->nr; i++) {
+	for (unsigned i = 0; i < src->nr; i++) {
 		SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id);
 		SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr);
 		dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time);
 	}
-
-err:
-	mutex_unlock(&c->fsck_error_counts_lock);
 }
 
 static int bch2_sb_errors_to_cpu(struct bch_fs *c)
 {
+	guard(mutex)(&c->fsck_error_counts_lock);
+
 	struct bch_sb_field_errors *src = bch2_sb_field_get(c->disk_sb.sb, errors);
 	bch_sb_errors_cpu *dst = &c->fsck_error_counts;
-	unsigned i, nr = bch2_sb_field_errors_nr_entries(src);
-	int ret;
+	unsigned nr = bch2_sb_field_errors_nr_entries(src);
 
 	if (!nr)
 		return 0;
 
-	mutex_lock(&c->fsck_error_counts_lock);
-	ret = darray_make_room(dst, nr);
+	int ret = darray_make_room(dst, nr);
 	if (ret)
-		goto err;
+		return ret;
 
 	dst->nr = nr;
 
-	for (i = 0; i < nr; i++) {
+	for (unsigned i = 0; i < nr; i++) {
 		dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]);
 		dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]);
 		dst->data[i].last_error_time = le64_to_cpu(src->entries[i].last_error_time);
 	}
-err:
-	mutex_unlock(&c->fsck_error_counts_lock);
 
-	return ret;
+	return 0;
 }
 
 void bch2_fs_sb_errors_exit(struct bch_fs *c)
diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h
index b2357b8e6107..e86267264692 100644
--- a/fs/bcachefs/sb-errors.h
+++ b/fs/bcachefs/sb-errors.h
@@ -7,6 +7,7 @@
 extern const char * const bch2_sb_error_strs[];
 
 void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id);
+void bch2_fs_errors_to_text(struct printbuf *, struct bch_fs *);
 
 extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
 
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 4036a20c6adc..5317b1bfe2e5 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -3,9 +3,11 @@
 #define _BCACHEFS_SB_ERRORS_FORMAT_H
 
 enum bch_fsck_flags {
-	FSCK_CAN_FIX		= 1 << 0,
-	FSCK_CAN_IGNORE		= 1 << 1,
-	FSCK_AUTOFIX		= 1 << 2,
+	FSCK_CAN_FIX		= BIT(0),
+	FSCK_CAN_IGNORE		= BIT(1),
+	FSCK_AUTOFIX		= BIT(2),
+	FSCK_ERR_NO_LOG		= BIT(3),
+	FSCK_ERR_SILENT		= BIT(4),
 };
 
 #define BCH_SB_ERRS()							\
@@ -74,6 +76,8 @@ enum bch_fsck_flags {
 	x(btree_node_read_error,			62,	FSCK_AUTOFIX)	\
 	x(btree_node_topology_bad_min_key,		63,	FSCK_AUTOFIX)	\
 	x(btree_node_topology_bad_max_key,		64,	FSCK_AUTOFIX)	\
+	x(btree_node_topology_bad_root_min_key,		323,	FSCK_AUTOFIX)	\
+	x(btree_node_topology_bad_root_max_key,		324,	FSCK_AUTOFIX)	\
 	x(btree_node_topology_overwritten_by_prev_node,	65,	FSCK_AUTOFIX)	\
 	x(btree_node_topology_overwritten_by_next_node,	66,	FSCK_AUTOFIX)	\
 	x(btree_node_topology_interior_node_empty,	67,	FSCK_AUTOFIX)	\
@@ -134,7 +138,7 @@ enum bch_fsck_flags {
 	x(bucket_gens_to_invalid_buckets,		121,	FSCK_AUTOFIX)	\
 	x(bucket_gens_nonzero_for_invalid_buckets,	122,	FSCK_AUTOFIX)	\
 	x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0)	\
-	x(need_discard_freespace_key_bad,		124,	0)	\
+	x(need_discard_freespace_key_bad,		124,	FSCK_AUTOFIX)	\
 	x(discarding_bucket_not_in_need_discard_btree,	291,	0)	\
 	x(backpointer_bucket_offset_wrong,		125,	0)	\
 	x(backpointer_level_bad,			294,	0)	\
@@ -156,6 +160,7 @@ enum bch_fsck_flags {
 	x(extent_ptrs_unwritten,			140,	0)	\
 	x(extent_ptrs_written_and_unwritten,		141,	0)	\
 	x(ptr_to_invalid_device,			142,	0)	\
+	x(ptr_to_removed_device,			322,	0)	\
 	x(ptr_to_duplicate_device,			143,	0)	\
 	x(ptr_after_last_bucket,			144,	0)	\
 	x(ptr_before_first_bucket,			145,	0)	\
@@ -165,7 +170,7 @@ enum bch_fsck_flags {
 	x(ptr_to_missing_replicas_entry,		149,	FSCK_AUTOFIX)	\
 	x(ptr_to_missing_stripe,			150,	0)	\
 	x(ptr_to_incorrect_stripe,			151,	0)	\
-	x(ptr_gen_newer_than_bucket_gen,		152,	0)	\
+	x(ptr_gen_newer_than_bucket_gen,		152,	FSCK_AUTOFIX)	\
 	x(ptr_too_stale,				153,	0)	\
 	x(stale_dirty_ptr,				154,	FSCK_AUTOFIX)	\
 	x(ptr_bucket_data_type_mismatch,		155,	0)	\
@@ -209,7 +214,7 @@ enum bch_fsck_flags {
 	x(subvol_to_missing_root,			188,	0)	\
 	x(subvol_root_wrong_bi_subvol,			189,	FSCK_AUTOFIX)	\
 	x(bkey_in_missing_snapshot,			190,	0)	\
-	x(bkey_in_deleted_snapshot,			315,	0)	\
+	x(bkey_in_deleted_snapshot,			315,	FSCK_AUTOFIX)	\
 	x(inode_pos_inode_nonzero,			191,	0)	\
 	x(inode_pos_blockdev_range,			192,	0)	\
 	x(inode_alloc_cursor_inode_bad,			301,	0)	\
@@ -217,7 +222,7 @@ enum bch_fsck_flags {
 	x(inode_str_hash_invalid,			194,	0)	\
 	x(inode_v3_fields_start_bad,			195,	0)	\
 	x(inode_snapshot_mismatch,			196,	0)	\
-	x(snapshot_key_missing_inode_snapshot,		314,	0)	\
+	x(snapshot_key_missing_inode_snapshot,		314,	FSCK_AUTOFIX)	\
 	x(inode_unlinked_but_clean,			197,	0)	\
 	x(inode_unlinked_but_nlink_nonzero,		198,	0)	\
 	x(inode_unlinked_and_not_open,			281,	0)	\
@@ -232,10 +237,11 @@ enum bch_fsck_flags {
 	x(inode_dir_multiple_links,			206,	FSCK_AUTOFIX)	\
 	x(inode_dir_missing_backpointer,		284,	FSCK_AUTOFIX)	\
 	x(inode_dir_unlinked_but_not_empty,		286,	FSCK_AUTOFIX)	\
+	x(inode_dir_has_nonzero_i_size,			319,	FSCK_AUTOFIX)	\
 	x(inode_multiple_links_but_nlink_0,		207,	FSCK_AUTOFIX)	\
 	x(inode_wrong_backpointer,			208,	FSCK_AUTOFIX)	\
 	x(inode_wrong_nlink,				209,	FSCK_AUTOFIX)	\
-	x(inode_has_child_snapshots_wrong,		287,	0)	\
+	x(inode_has_child_snapshots_wrong,		287,	FSCK_AUTOFIX)	\
 	x(inode_unreachable,				210,	FSCK_AUTOFIX)	\
 	x(inode_journal_seq_in_future,			299,	FSCK_AUTOFIX)	\
 	x(inode_i_sectors_underflow,			312,	FSCK_AUTOFIX)	\
@@ -243,26 +249,27 @@ enum bch_fsck_flags {
 	x(inode_parent_has_case_insensitive_not_set,	317,	FSCK_AUTOFIX)	\
 	x(vfs_inode_i_blocks_underflow,			311,	FSCK_AUTOFIX)	\
 	x(vfs_inode_i_blocks_not_zero_at_truncate,	313,	FSCK_AUTOFIX)	\
+	x(vfs_bad_inode_rm,				320,	0)	\
 	x(deleted_inode_but_clean,			211,	FSCK_AUTOFIX)	\
 	x(deleted_inode_missing,			212,	FSCK_AUTOFIX)	\
 	x(deleted_inode_is_dir,				213,	FSCK_AUTOFIX)	\
 	x(deleted_inode_not_unlinked,			214,	FSCK_AUTOFIX)	\
 	x(deleted_inode_has_child_snapshots,		288,	FSCK_AUTOFIX)	\
 	x(extent_overlapping,				215,	0)	\
-	x(key_in_missing_inode,				216,	0)	\
+	x(key_in_missing_inode,				216,	FSCK_AUTOFIX)	\
 	x(key_in_wrong_inode_type,			217,	0)	\
-	x(extent_past_end_of_inode,			218,	0)	\
+	x(extent_past_end_of_inode,			218,	FSCK_AUTOFIX)	\
 	x(dirent_empty_name,				219,	0)	\
 	x(dirent_val_too_big,				220,	0)	\
 	x(dirent_name_too_long,				221,	0)	\
 	x(dirent_name_embedded_nul,			222,	0)	\
 	x(dirent_name_dot_or_dotdot,			223,	0)	\
 	x(dirent_name_has_slash,			224,	0)	\
-	x(dirent_d_type_wrong,				225,	0)	\
+	x(dirent_d_type_wrong,				225,	FSCK_AUTOFIX)	\
 	x(inode_bi_parent_wrong,			226,	0)	\
 	x(dirent_in_missing_dir_inode,			227,	0)	\
 	x(dirent_in_non_dir_inode,			228,	0)	\
-	x(dirent_to_missing_inode,			229,	0)	\
+	x(dirent_to_missing_inode,			229,	FSCK_AUTOFIX)	\
 	x(dirent_to_overwritten_inode,			302,	0)	\
 	x(dirent_to_missing_subvol,			230,	0)	\
 	x(dirent_to_itself,				231,	0)	\
@@ -276,9 +283,9 @@ enum bch_fsck_flags {
 	x(root_subvol_missing,				238,	0)	\
 	x(root_dir_missing,				239,	0)	\
 	x(root_inode_not_dir,				240,	0)	\
-	x(dir_loop,					241,	0)	\
-	x(hash_table_key_duplicate,			242,	0)	\
-	x(hash_table_key_wrong_offset,			243,	0)	\
+	x(dir_loop,					241,	FSCK_AUTOFIX)	\
+	x(hash_table_key_duplicate,			242,	FSCK_AUTOFIX)	\
+	x(hash_table_key_wrong_offset,			243,	FSCK_AUTOFIX)	\
 	x(unlinked_inode_not_on_deleted_list,		244,	FSCK_AUTOFIX)	\
 	x(reflink_p_front_pad_bad,			245,	0)	\
 	x(journal_entry_dup_same_device,		246,	0)	\
@@ -287,18 +294,19 @@ enum bch_fsck_flags {
 	x(inode_points_to_missing_dirent,		249,	FSCK_AUTOFIX)	\
 	x(inode_points_to_wrong_dirent,			250,	FSCK_AUTOFIX)	\
 	x(inode_bi_parent_nonzero,			251,	0)	\
+	x(missing_inode_with_contents,			321,	FSCK_AUTOFIX)	\
 	x(dirent_to_missing_parent_subvol,		252,	0)	\
 	x(dirent_not_visible_in_parent_subvol,		253,	0)	\
 	x(subvol_fs_path_parent_wrong,			254,	0)	\
 	x(subvol_root_fs_path_parent_nonzero,		255,	0)	\
 	x(subvol_children_not_set,			256,	0)	\
 	x(subvol_children_bad,				257,	0)	\
-	x(subvol_loop,					258,	0)	\
+	x(subvol_loop,					258,	FSCK_AUTOFIX)	\
 	x(subvol_unreachable,				259,	FSCK_AUTOFIX)	\
 	x(btree_node_bkey_bad_u64s,			260,	0)	\
 	x(btree_node_topology_empty_interior_node,	261,	0)	\
 	x(btree_ptr_v2_min_key_bad,			262,	0)	\
-	x(btree_root_unreadable_and_scan_found_nothing,	263,	FSCK_AUTOFIX)	\
+	x(btree_root_unreadable_and_scan_found_nothing,	263,	0)	\
 	x(snapshot_node_missing,			264,	FSCK_AUTOFIX)	\
 	x(dup_backpointer_to_bad_csum_extent,		265,	0)	\
 	x(btree_bitmap_not_marked,			266,	FSCK_AUTOFIX)	\
@@ -311,7 +319,7 @@ enum bch_fsck_flags {
 	x(accounting_mismatch,				272,	FSCK_AUTOFIX)	\
 	x(accounting_replicas_not_marked,		273,	0)	\
 	x(accounting_to_invalid_device,			289,	0)	\
-	x(invalid_btree_id,				274,	0)	\
+	x(invalid_btree_id,				274,	FSCK_AUTOFIX)	\
 	x(alloc_key_io_time_bad,			275,	0)	\
 	x(alloc_key_fragmentation_lru_wrong,		276,	FSCK_AUTOFIX)	\
 	x(accounting_key_junk_at_end,			277,	FSCK_AUTOFIX)	\
@@ -328,7 +336,7 @@ enum bch_fsck_flags {
 	x(dirent_stray_data_after_cf_name,		305,	0)	\
 	x(rebalance_work_incorrectly_set,		309,	FSCK_AUTOFIX)	\
 	x(rebalance_work_incorrectly_unset,		310,	FSCK_AUTOFIX)	\
-	x(MAX,						319,	0)
+	x(MAX,						325,	0)
 
 enum bch_sb_error_id {
 #define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index 72779912939b..e3c73d903898 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -5,14 +5,41 @@
 #include "disk_groups.h"
 #include "error.h"
 #include "opts.h"
+#include "recovery_passes.h"
 #include "replicas.h"
 #include "sb-members.h"
 #include "super-io.h"
 
-void bch2_dev_missing(struct bch_fs *c, unsigned dev)
+int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev)
+{
+	CLASS(printbuf, buf)();
+	bch2_log_msg_start(c, &buf);
+
+	bool removed = test_bit(dev, c->devs_removed.d);
+
+	prt_printf(&buf, "pointer to %s device %u in key\n",
+		   removed ? "removed" : "nonexistent", dev);
+	bch2_bkey_val_to_text(&buf, c, k);
+	prt_newline(&buf);
+
+	bool print = removed
bch2_count_fsck_err(c, ptr_to_removed_device, &buf) + : bch2_count_fsck_err(c, ptr_to_invalid_device, &buf); + + int ret = bch2_run_explicit_recovery_pass(c, &buf, + BCH_RECOVERY_PASS_check_allocations, 0); + + if (print) + bch2_print_str(c, KERN_ERR, buf.buf); + return ret; +} + +void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev) { if (dev != BCH_SB_MEMBER_INVALID) - bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); + bch2_fs_inconsistent(c, "pointer to %s device %u", + test_bit(dev, c->devs_removed.d) + ? "removed" : "nonexistent", dev); } void bch2_dev_bucket_missing(struct bch_dev *ca, u64 bucket) @@ -41,34 +68,13 @@ struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i) return __bch2_members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i); } -static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i) -{ - struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i); - memset(&ret, 0, sizeof(ret)); - memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret))); - return ret; -} - -static struct bch_member *members_v1_get_mut(struct bch_sb_field_members_v1 *mi, int i) -{ - return (void *) mi->_members + (i * BCH_MEMBER_V1_BYTES); -} - -static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int i) -{ - struct bch_member ret, *p = members_v1_get_mut(mi, i); - memset(&ret, 0, sizeof(ret)); - memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret))); - return ret; -} - struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i) { struct bch_sb_field_members_v2 *mi2 = bch2_sb_field_get(sb, members_v2); if (mi2) - return members_v2_get(mi2, i); + return bch2_members_v2_get(mi2, i); struct bch_sb_field_members_v1 *mi1 = bch2_sb_field_get(sb, members_v1); - return members_v1_get(mi1, i); + return bch2_members_v1_get(mi1, i); } static int sb_members_v2_resize_entries(struct bch_fs *c) @@ -81,7 +87,7 @@ static int sb_members_v2_resize_entries(struct bch_fs *c) mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); if (!mi) - return -BCH_ERR_ENOSPC_sb_members_v2; + return bch_err_throw(c, ENOSPC_sb_members_v2); for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) { void *dst = (void *) mi->_members + (i * sizeof(struct bch_member)); @@ -119,6 +125,11 @@ int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) struct bch_sb_field_members_v1 *mi1; struct bch_sb_field_members_v2 *mi2; + if (BCH_SB_VERSION_INCOMPAT(disk_sb->sb) > bcachefs_metadata_version_extent_flags) { + bch2_sb_field_resize(disk_sb, members_v1, 0); + return 0; + } + mi1 = bch2_sb_field_resize(disk_sb, members_v1, DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES * disk_sb->sb->nr_devices, sizeof(u64))); @@ -170,42 +181,34 @@ static int validate_member(struct printbuf *err, return -BCH_ERR_invalid_sb_members; } + if (BCH_MEMBER_FREESPACE_INITIALIZED(&m) && + sb->features[0] & cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info))) { + prt_printf(err, "device %u: freespace initialized but fs has no alloc info", i); + return -BCH_ERR_invalid_sb_members; + } + return 0; } -static void member_to_text(struct printbuf *out, - struct bch_member m, - struct bch_sb_field_disk_groups *gi, - struct bch_sb *sb, - int i) +void bch2_member_to_text(struct printbuf *out, + struct bch_member *m, + struct bch_sb_field_disk_groups *gi, + struct bch_sb *sb, + unsigned idx) { - unsigned data_have = bch2_sb_dev_has_data(sb, i); - u64 bucket_size = le16_to_cpu(m.bucket_size); - u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size; - - if 
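
The new bch2_dev_missing_bkey() above folds the removed/nonexistent distinction into one diagnostic path: it consults the devs_removed bitmap, counts the matching fsck error, and queues BCH_RECOVERY_PASS_check_allocations. A minimal sketch of that decision as a standalone userspace model, not part of the patch; devs_removed and MAX_DEVS here are stand-ins for the kernel types:

#include <stdbool.h>
#include <stdio.h>

#define MAX_DEVS 64
static bool devs_removed[MAX_DEVS];	/* set from superblock tombstones */

static void report_bad_device_ptr(unsigned dev)
{
	const char *why = dev < MAX_DEVS && devs_removed[dev]
		? "removed" : "nonexistent";

	/* the kernel version also counts the fsck error and queues
	 * BCH_RECOVERY_PASS_check_allocations */
	fprintf(stderr, "pointer to %s device %u in key\n", why, dev);
}
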
(!bch2_member_alive(&m)) - return; - - prt_printf(out, "Device:\t%u\n", i); - - printbuf_indent_add(out, 2); + u64 bucket_size = le16_to_cpu(m->bucket_size); + u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; prt_printf(out, "Label:\t"); - if (BCH_MEMBER_GROUP(&m)) { - unsigned idx = BCH_MEMBER_GROUP(&m) - 1; - - if (idx < disk_groups_nr(gi)) - prt_printf(out, "%s (%u)", - gi->entries[idx].label, idx); - else - prt_printf(out, "(bad disk labels section)"); - } else { + if (BCH_MEMBER_GROUP(m)) + bch2_disk_path_to_text_sb(out, sb, + BCH_MEMBER_GROUP(m) - 1); + else prt_printf(out, "(none)"); - } prt_newline(out); prt_printf(out, "UUID:\t"); - pr_uuid(out, m.uuid.b); + pr_uuid(out, m->uuid.b); prt_newline(out); prt_printf(out, "Size:\t"); @@ -213,40 +216,41 @@ static void member_to_text(struct printbuf *out, prt_newline(out); for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) - prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m.errors[i])); + prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m->errors[i])); for (unsigned i = 0; i < BCH_IOPS_NR; i++) - prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m.iops[i])); + prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m->iops[i])); prt_printf(out, "Bucket size:\t"); prt_units_u64(out, bucket_size << 9); prt_newline(out); - prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m.first_bucket)); - prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m.nbuckets)); + prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m->first_bucket)); + prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m->nbuckets)); prt_printf(out, "Last mount:\t"); - if (m.last_mount) - bch2_prt_datetime(out, le64_to_cpu(m.last_mount)); + if (m->last_mount) + bch2_prt_datetime(out, le64_to_cpu(m->last_mount)); else prt_printf(out, "(never)"); prt_newline(out); - prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m.seq)); + prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m->seq)); prt_printf(out, "State:\t%s\n", - BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR - ? bch2_member_states[BCH_MEMBER_STATE(&m)] + BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR + ? bch2_member_states[BCH_MEMBER_STATE(m)] : "unknown"); prt_printf(out, "Data allowed:\t"); - if (BCH_MEMBER_DATA_ALLOWED(&m)) - prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); + if (BCH_MEMBER_DATA_ALLOWED(m)) + prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m)); else prt_printf(out, "(none)"); prt_newline(out); prt_printf(out, "Has data:\t"); + unsigned data_have = bch2_sb_dev_has_data(sb, idx); if (data_have) prt_bitflags(out, __bch2_data_types, data_have); else @@ -254,21 +258,36 @@ static void member_to_text(struct printbuf *out, prt_newline(out); prt_printf(out, "Btree allocated bitmap blocksize:\t"); - if (m.btree_bitmap_shift < 64) - prt_units_u64(out, 1ULL << m.btree_bitmap_shift); + if (m->btree_bitmap_shift < 64) + prt_units_u64(out, 1ULL << m->btree_bitmap_shift); else - prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift); + prt_printf(out, "(invalid shift %u)", m->btree_bitmap_shift); prt_newline(out); prt_printf(out, "Btree allocated bitmap:\t"); - bch2_prt_u64_base2_nbits(out, le64_to_cpu(m.btree_allocated_bitmap), 64); + bch2_prt_u64_base2_nbits(out, le64_to_cpu(m->btree_allocated_bitmap), 64); prt_newline(out); - prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(&m) ? 
BCH_MEMBER_DURABILITY(&m) - 1 : 1); + prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(m) ? BCH_MEMBER_DURABILITY(m) - 1 : 1); + + prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(m)); + prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(m)); + prt_printf(out, "Resize on mount:\t%llu\n", BCH_MEMBER_RESIZE_ON_MOUNT(m)); +} + +static void member_to_text(struct printbuf *out, + struct bch_member m, + struct bch_sb_field_disk_groups *gi, + struct bch_sb *sb, + unsigned idx) +{ + if (!bch2_member_alive(&m)) + return; - prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m)); - prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); + prt_printf(out, "Device:\t%u\n", idx); + printbuf_indent_add(out, 2); + bch2_member_to_text(out, &m, gi, sb, idx); printbuf_indent_sub(out, 2); } @@ -284,7 +303,7 @@ static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f } for (i = 0; i < sb->nr_devices; i++) { - struct bch_member m = members_v1_get(mi, i); + struct bch_member m = bch2_members_v1_get(mi, i); int ret = validate_member(err, m, sb, i); if (ret) @@ -299,10 +318,18 @@ static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb, { struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); - unsigned i; - for (i = 0; i < sb->nr_devices; i++) - member_to_text(out, members_v1_get(mi, i), gi, sb, i); + if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) { + prt_printf(out, "field ends before start of entries"); + return; + } + + unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / sizeof(mi->_members[0]); + if (nr != sb->nr_devices) + prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices); + + for (unsigned i = 0; i < min(sb->nr_devices, nr); i++) + member_to_text(out, bch2_members_v1_get(mi, i), gi, sb, i); } const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = { @@ -315,10 +342,28 @@ static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb, { struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); - unsigned i; - for (i = 0; i < sb->nr_devices; i++) - member_to_text(out, members_v2_get(mi, i), gi, sb, i); + if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) { + prt_printf(out, "field ends before start of entries"); + return; + } + + if (!le16_to_cpu(mi->member_bytes)) { + prt_printf(out, "member_bytes 0"); + return; + } + + unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / le16_to_cpu(mi->member_bytes); + if (nr != sb->nr_devices) + prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices); + + /* + * We call to_text() on superblock sections that haven't passed + * validate, so we can't trust sb->nr_devices. 
+ */ + + for (unsigned i = 0; i < min(sb->nr_devices, nr); i++) + member_to_text(out, bch2_members_v2_get(mi, i), gi, sb, i); } static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f, @@ -335,7 +380,7 @@ static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f } for (unsigned i = 0; i < sb->nr_devices; i++) { - int ret = validate_member(err, members_v2_get(mi, i), sb, i); + int ret = validate_member(err, bch2_members_v2_get(mi, i), sb, i); if (ret) return ret; } @@ -352,14 +397,29 @@ void bch2_sb_members_from_cpu(struct bch_fs *c) { struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - rcu_read_lock(); + guard(rcu)(); for_each_member_device_rcu(c, ca, NULL) { struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx); for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++) m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e])); } - rcu_read_unlock(); +} + +void bch2_sb_members_to_cpu(struct bch_fs *c) +{ + for_each_member_device(c, ca) { + struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); + ca->mi = bch2_mi_to_cpu(&m); + } + + struct bch_sb_field_members_v2 *mi2 = bch2_sb_field_get(c->disk_sb.sb, members_v2); + if (mi2) + for (unsigned i = 0; i < c->sb.nr_devices; i++) { + struct bch_member m = bch2_members_v2_get(mi2, i); + bool removed = uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID); + mod_bit(i, c->devs_removed.d, removed); + } } void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) @@ -367,9 +427,8 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) struct bch_fs *c = ca->fs; struct bch_member m; - mutex_lock(&ca->fs->sb_lock); - m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); - mutex_unlock(&ca->fs->sb_lock); + scoped_guard(mutex, &ca->fs->sb_lock) + m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); printbuf_tabstop_push(out, 12); @@ -396,16 +455,15 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) void bch2_dev_errors_reset(struct bch_dev *ca) { struct bch_fs *c = ca->fs; - struct bch_member *m; - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + guard(mutex)(&c->sb_lock); + + struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++) m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i])); m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds()); bch2_write_super(c); - mutex_unlock(&c->sb_lock); } /* @@ -417,20 +475,14 @@ void bch2_dev_errors_reset(struct bch_dev *ca) bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k) { - bool ret = true; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ca) - continue; - - if (!bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) { - ret = false; - break; - } + if (ca && + !bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) + return false; } - rcu_read_unlock(); - return ret; + return true; } static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev, @@ -493,6 +545,7 @@ int bch2_sb_member_alloc(struct bch_fs *c) unsigned u64s; int best = -1; u64 best_last_mount = 0; + unsigned nr_deleted = 0; if (dev_idx < BCH_SB_MEMBERS_MAX) goto have_slot; @@ -503,7 +556,10 @@ int bch2_sb_member_alloc(struct bch_fs *c) continue; struct bch_member m = 
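
Both to_text hunks above follow the same defensive pattern: derive the entry count from the section's own size, warn when it disagrees with sb->nr_devices, and iterate only over the minimum of the two, since to_text can run on sections that failed validation. A compact sketch of the pattern, assuming a flat entry array as a stand-in for the vstruct layout:

#include <stdio.h>

struct entry { unsigned val; };

static void entries_to_text(const struct entry *e, size_t field_bytes,
			    unsigned nr_devices)
{
	if (field_bytes < sizeof(*e)) {
		printf("field ends before start of entries\n");
		return;
	}

	unsigned nr = field_bytes / sizeof(*e);
	if (nr != nr_devices)
		printf("nr_devices mismatch: have %u entries, should be %u\n",
		       nr, nr_devices);

	/* never walk past what the section actually contains */
	for (unsigned i = 0; i < (nr < nr_devices ? nr : nr_devices); i++)
		printf("entry %u: %u\n", i, e[i].val);
}
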
bch2_sb_member_get(c->disk_sb.sb, dev_idx); - if (bch2_member_alive(&m)) + + nr_deleted += uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID); + + if (!bch2_is_zero(&m.uuid, sizeof(m.uuid))) continue; u64 last_mount = le64_to_cpu(m.last_mount); @@ -517,6 +573,10 @@ int bch2_sb_member_alloc(struct bch_fs *c) goto have_slot; } + if (nr_deleted) + bch_err(c, "unable to allocate new member, but have %u deleted: run fsck", + nr_deleted); + return -BCH_ERR_ENOSPC_sb_members; have_slot: nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); @@ -532,3 +592,21 @@ int bch2_sb_member_alloc(struct bch_fs *c) c->disk_sb.sb->nr_devices = nr_devices; return dev_idx; } + +void bch2_sb_members_clean_deleted(struct bch_fs *c) +{ + guard(mutex)(&c->sb_lock); + bool write_sb = false; + + for (unsigned i = 0; i < c->sb.nr_devices; i++) { + struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, i); + + if (uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID)) { + memset(&m->uuid, 0, sizeof(m->uuid)); + write_sb = true; + } + } + + if (write_sb) + bch2_write_super(c); +} diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 42786657522c..6de999cf71cb 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -4,6 +4,7 @@ #include "darray.h" #include "bkey_types.h" +#include "enumerated_ref.h" extern char * const bch2_member_error_strs[]; @@ -13,26 +14,48 @@ __bch2_members_v2_get_mut(struct bch_sb_field_members_v2 *mi, unsigned i) return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes)); } +static inline struct bch_member bch2_members_v2_get(struct bch_sb_field_members_v2 *mi, int i) +{ + struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i); + memset(&ret, 0, sizeof(ret)); + memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret))); + return ret; +} + +static inline struct bch_member *members_v1_get_mut(struct bch_sb_field_members_v1 *mi, int i) +{ + return (void *) mi->_members + (i * BCH_MEMBER_V1_BYTES); +} + +static inline struct bch_member bch2_members_v1_get(struct bch_sb_field_members_v1 *mi, int i) +{ + struct bch_member ret, *p = members_v1_get_mut(mi, i); + memset(&ret, 0, sizeof(ret)); + memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret))); + return ret; +} + int bch2_sb_members_v2_init(struct bch_fs *c); int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb); struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i); struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i); +void bch2_member_to_text(struct printbuf *, struct bch_member *, + struct bch_sb_field_disk_groups *, + struct bch_sb *, unsigned); + static inline bool bch2_dev_is_online(struct bch_dev *ca) { - return !percpu_ref_is_zero(&ca->io_ref[READ]); + return !enumerated_ref_is_zero(&ca->io_ref[READ]); } static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned); static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev) { - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu(c, dev); - bool ret = ca && bch2_dev_is_online(ca); - rcu_read_unlock(); - - return ret; + return ca && bch2_dev_is_online(ca); } static inline bool bch2_dev_is_healthy(struct bch_dev *ca) @@ -104,6 +127,12 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev * for (struct bch_dev *_ca = NULL; \ (_ca = __bch2_next_dev((_c), _ca, (_mask)));) +#define for_each_online_member_rcu(_c, _ca) \ + for_each_member_device_rcu(_c, _ca, &(_c)->online_devs) + +#define for_each_rw_member_rcu(_c, _ca) \ + 
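
bch2_sb_member_alloc() above now distinguishes three slot states: live members are skipped, tombstoned members (uuid equal to the BCH_SB_MEMBER_DELETED_UUID sentinel defined later in this patch) are counted but never reused, and only an all-zero uuid is free; bch2_sb_members_clean_deleted() is what eventually zeroes the tombstones. A sketch of that policy with the uuid states reduced to an enum, all names illustrative:

enum slot_state { SLOT_FREE, SLOT_DELETED, SLOT_LIVE };

static int alloc_member_slot(const enum slot_state *slots, unsigned nr,
			     unsigned *nr_deleted)
{
	*nr_deleted = 0;

	for (unsigned i = 0; i < nr; i++) {
		if (slots[i] == SLOT_DELETED) {
			(*nr_deleted)++;	/* reusable only after fsck cleans it */
			continue;
		}
		if (slots[i] == SLOT_FREE)
			return (int) i;
	}

	/* caller logs nr_deleted and suggests running fsck */
	return -1;
}

Keeping the tombstone until fsck has run prevents a new device from silently inheriting an index that stale pointers may still reference.
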
for_each_member_device_rcu(_c, _ca, &(_c)->rw_devs[BCH_DATA_free]) + static inline void bch2_dev_get(struct bch_dev *ca) { #ifdef CONFIG_BCACHEFS_DEBUG @@ -129,18 +158,16 @@ static inline void __bch2_dev_put(struct bch_dev *ca) static inline void bch2_dev_put(struct bch_dev *ca) { - if (ca) + if (!IS_ERR_OR_NULL(ca)) __bch2_dev_put(ca); } static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca) { - rcu_read_lock(); + guard(rcu)(); bch2_dev_put(ca); if ((ca = __bch2_next_dev(c, ca, NULL))) bch2_dev_get(ca); - rcu_read_unlock(); - return ca; } @@ -157,33 +184,32 @@ static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, struct bch_dev *ca, unsigned state_mask, - int rw) + int rw, unsigned ref_idx) { - rcu_read_lock(); + guard(rcu)(); if (ca) - percpu_ref_put(&ca->io_ref[rw]); + enumerated_ref_put(&ca->io_ref[rw], ref_idx); while ((ca = __bch2_next_dev(c, ca, NULL)) && (!((1 << ca->mi.state) & state_mask) || - !percpu_ref_tryget(&ca->io_ref[rw]))) + !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))) ; - rcu_read_unlock(); return ca; } -#define __for_each_online_member(_c, _ca, state_mask, rw) \ +#define __for_each_online_member(_c, _ca, state_mask, rw, ref_idx) \ for (struct bch_dev *_ca = NULL; \ - (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw));) + (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw, ref_idx));) -#define for_each_online_member(c, ca) \ - __for_each_online_member(c, ca, ~0, READ) +#define for_each_online_member(c, ca, ref_idx) \ + __for_each_online_member(c, ca, ~0, READ, ref_idx) -#define for_each_rw_member(c, ca) \ - __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE) +#define for_each_rw_member(c, ca, ref_idx) \ + __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE, ref_idx) -#define for_each_readable_member(c, ca) \ - __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ) +#define for_each_readable_member(c, ca, ref_idx) \ + __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ, ref_idx) static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev) { @@ -218,34 +244,43 @@ static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned de : NULL; } -void bch2_dev_missing(struct bch_fs *, unsigned); +int bch2_dev_missing_bkey(struct bch_fs *, struct bkey_s_c, unsigned); + +void bch2_dev_missing_atomic(struct bch_fs *, unsigned); static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev) { struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); if (unlikely(!ca)) - bch2_dev_missing(c, dev); + bch2_dev_missing_atomic(c, dev); return ca; } static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev) { - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); if (ca) bch2_dev_get(ca); - rcu_read_unlock(); return ca; } +DEFINE_CLASS(bch2_dev_tryget_noerror, struct bch_dev *, + bch2_dev_put(_T), bch2_dev_tryget_noerror(c, dev), + struct bch_fs *c, unsigned dev); + static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev) { struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); if (unlikely(!ca)) - bch2_dev_missing(c, dev); + bch2_dev_missing_atomic(c, dev); return ca; } +DEFINE_CLASS(bch2_dev_tryget, struct bch_dev *, + bch2_dev_put(_T), bch2_dev_tryget(c, dev), + struct bch_fs *c, unsigned dev); + static inline struct bch_dev 
*bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket) { struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode); @@ -256,6 +291,10 @@ static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, s return ca; } +DEFINE_CLASS(bch2_dev_bucket_tryget_noerror, struct bch_dev *, + bch2_dev_put(_T), bch2_dev_bucket_tryget_noerror(c, bucket), + struct bch_fs *c, struct bpos bucket); + void bch2_dev_bucket_missing(struct bch_dev *, u64); static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket) @@ -269,6 +308,10 @@ static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bp return ca; } +DEFINE_CLASS(bch2_dev_bucket_tryget, struct bch_dev *, + bch2_dev_put(_T), bch2_dev_bucket_tryget(c, bucket), + struct bch_fs *c, struct bpos bucket); + static inline struct bch_dev *bch2_dev_iterate_noerror(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx) { if (ca && ca->dev_idx == dev_idx) @@ -285,43 +328,31 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev return bch2_dev_tryget(c, dev_idx); } -static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw) +static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, + int rw, unsigned ref_idx) { might_sleep(); - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu(c, dev); - if (ca && !percpu_ref_tryget(&ca->io_ref[rw])) - ca = NULL; - rcu_read_unlock(); + if (!ca || !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)) + return NULL; - if (ca && - (ca->mi.state == BCH_MEMBER_STATE_rw || - (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))) + if (ca->mi.state == BCH_MEMBER_STATE_rw || + (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) return ca; - if (ca) - percpu_ref_put(&ca->io_ref[rw]); + enumerated_ref_put(&ca->io_ref[rw], ref_idx); return NULL; } -/* XXX kill, move to struct bch_fs */ -static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) -{ - struct bch_devs_mask devs; - - memset(&devs, 0, sizeof(devs)); - for_each_online_member(c, ca) - __set_bit(ca->dev_idx, devs.d); - return devs; -} - extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1; extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; static inline bool bch2_member_alive(struct bch_member *m) { - return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); + return !bch2_is_zero(&m->uuid, sizeof(m->uuid)) && + !uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID); } static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev) @@ -351,6 +382,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) ? 
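
The io_ref conversions above replace percpu refs with enumerated refs: every get and put carries a ref_idx naming the subsystem that took the reference, so a stuck shutdown can report which holder leaked. A single-threaded userspace model of the idea (enumerated_ref_model and REF_NR are stand-ins; the real implementation is percpu and lives in fs/bcachefs/enumerated_ref.c):

#include <assert.h>
#include <stdbool.h>

#define REF_NR 8	/* one slot per enumerated getter */

struct enumerated_ref_model {
	unsigned long	count[REF_NR];
	bool		dying;
};

static bool eref_tryget(struct enumerated_ref_model *r, unsigned idx)
{
	assert(idx < REF_NR);
	if (r->dying)
		return false;
	r->count[idx]++;	/* records which caller took the ref */
	return true;
}

static void eref_put(struct enumerated_ref_model *r, unsigned idx)
{
	assert(idx < REF_NR && r->count[idx]);
	r->count[idx]--;
}
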
BCH_MEMBER_DURABILITY(mi) - 1 : 1, .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), + .resize_on_mount = BCH_MEMBER_RESIZE_ON_MOUNT(mi), .valid = bch2_member_alive(mi), .btree_bitmap_shift = mi->btree_bitmap_shift, .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap), @@ -358,6 +390,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) } void bch2_sb_members_from_cpu(struct bch_fs *); +void bch2_sb_members_to_cpu(struct bch_fs *); void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *); void bch2_dev_errors_reset(struct bch_dev *); @@ -381,5 +414,6 @@ bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c); void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c); int bch2_sb_member_alloc(struct bch_fs *); +void bch2_sb_members_clean_deleted(struct bch_fs *); #endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h index 3affec823b3f..b2b892687cdd 100644 --- a/fs/bcachefs/sb-members_format.h +++ b/fs/bcachefs/sb-members_format.h @@ -13,7 +13,11 @@ */ #define BCH_SB_MEMBER_INVALID 255 -#define BCH_MIN_NR_NBUCKETS (1 << 6) +#define BCH_SB_MEMBER_DELETED_UUID \ + UUID_INIT(0xffffffff, 0xffff, 0xffff, \ + 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) + +#define BCH_MIN_NR_NBUCKETS (1 << 9) #define BCH_IOPS_MEASUREMENTS() \ x(seqread, 0) \ @@ -88,6 +92,8 @@ LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, struct bch_member, flags, 30, 31) +LE64_BITMASK(BCH_MEMBER_RESIZE_ON_MOUNT, + struct bch_member, flags, 31, 32) #if 0 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h index c0eda888fe39..d6443e186872 100644 --- a/fs/bcachefs/sb-members_types.h +++ b/fs/bcachefs/sb-members_types.h @@ -13,6 +13,7 @@ struct bch_member_cpu { u8 data_allowed; u8 durability; u8 freespace_initialized; + u8 resize_on_mount; u8 valid; u8 btree_bitmap_shift; u64 btree_allocated_bitmap; diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 7c403427fbdb..08083d6ca8bc 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -152,16 +152,16 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, * here. */ if (type == SIX_LOCK_read && lock->readers) { - preempt_disable(); - this_cpu_inc(*lock->readers); /* signal that we own lock */ + scoped_guard(preempt) { + this_cpu_inc(*lock->readers); /* signal that we own lock */ - smp_mb(); + smp_mb(); - old = atomic_read(&lock->state); - ret = !(old & l[type].lock_fail); + old = atomic_read(&lock->state); + ret = !(old & l[type].lock_fail); - this_cpu_sub(*lock->readers, !ret); - preempt_enable(); + this_cpu_sub(*lock->readers, !ret); + } if (!ret) { smp_mb(); @@ -339,12 +339,9 @@ static inline bool six_owner_running(struct six_lock *lock) * acquiring the lock and setting the owner field. If we're an RT task * that will live-lock because we won't let the owner complete. */ - rcu_read_lock(); + guard(rcu)(); struct task_struct *owner = READ_ONCE(lock->owner); - bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current); - rcu_read_unlock(); - - return ret; + return owner ? 
owner_on_cpu(owner) : !rt_or_dl_task(current); } static inline bool six_optimistic_spin(struct six_lock *lock, @@ -363,7 +360,7 @@ static inline bool six_optimistic_spin(struct six_lock *lock, if (atomic_read(&lock->state) & SIX_LOCK_NOSPIN) return false; - preempt_disable(); + guard(preempt)(); end_time = sched_clock() + 10 * NSEC_PER_USEC; while (!need_resched() && six_owner_running(lock)) { @@ -372,10 +369,8 @@ static inline bool six_optimistic_spin(struct six_lock *lock, * wait->lock_acquired: pairs with the smp_store_release in * __six_lock_wakeup */ - if (smp_load_acquire(&wait->lock_acquired)) { - preempt_enable(); + if (smp_load_acquire(&wait->lock_acquired)) return true; - } if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { six_set_bitmask(lock, SIX_LOCK_NOSPIN); @@ -391,7 +386,6 @@ static inline bool six_optimistic_spin(struct six_lock *lock, cpu_relax(); } - preempt_enable(); return false; } diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index fec569c7deb1..84f987d3a02a 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1,14 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bbpos.h" #include "bkey_buf.h" #include "btree_cache.h" #include "btree_key_cache.h" #include "btree_update.h" #include "buckets.h" +#include "enumerated_ref.h" #include "errcode.h" #include "error.h" #include "fs.h" +#include "progress.h" #include "recovery_passes.h" #include "snapshot.h" @@ -52,7 +55,7 @@ int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, BTREE_ITER_with_updates, snapshot_tree, s); if (bch2_err_matches(ret, ENOENT)) - ret = -BCH_ERR_ENOENT_snapshot_tree; + ret = bch_err_throw(trans->c, ENOENT_snapshot_tree); return ret; } @@ -65,13 +68,13 @@ __bch2_snapshot_tree_create(struct btree_trans *trans) struct bkey_i_snapshot_tree *s_t; if (ret == -BCH_ERR_ENOSPC_btree_slot) - ret = -BCH_ERR_ENOSPC_snapshot_tree; + ret = bch_err_throw(trans->c, ENOSPC_snapshot_tree); if (ret) return ERR_PTR(ret); s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree); ret = PTR_ERR_OR_ZERO(s_t); - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret ? 
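
The six.c hunks above convert open-coded preempt_disable()/rcu_read_lock() pairs to the scope-based guard()/scoped_guard() helpers from linux/cleanup.h, so each early return in six_optimistic_spin() drops its critical section automatically. The same idea in portable C, using the compiler cleanup attribute with a pthread mutex as a stand-in lock:

#include <pthread.h>

static void unlock_cleanup(pthread_mutex_t **m)
{
	pthread_mutex_unlock(*m);
}

#define scoped_lock(m)						\
	pthread_mutex_t *_guard					\
	__attribute__((cleanup(unlock_cleanup), unused)) =	\
		(pthread_mutex_lock(m), (m))

static int find_first_positive(pthread_mutex_t *lock, const int *v, int n)
{
	scoped_lock(lock);

	for (int i = 0; i < n; i++)
		if (v[i] > 0)
			return v[i];	/* unlock still runs on this path */
	return -1;			/* ...and on this one */
}
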
ERR_PTR(ret) : s_t; } @@ -103,11 +106,8 @@ static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) { - rcu_read_lock(); - bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor); - rcu_read_unlock(); - - return ret; + guard(rcu)(); + return __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor); } static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) @@ -136,28 +136,25 @@ static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor) bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) { - bool ret; +#ifdef CONFIG_BCACHEFS_DEBUG + u32 orig_id = id; +#endif - rcu_read_lock(); + guard(rcu)(); struct snapshot_table *t = rcu_dereference(c->snapshots); - if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) { - ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor); - goto out; - } + if (unlikely(recovery_pass_will_run(c, BCH_RECOVERY_PASS_check_snapshots))) + return __bch2_snapshot_is_ancestor_early(t, id, ancestor); if (likely(ancestor >= IS_ANCESTOR_BITMAP)) while (id && id < ancestor - IS_ANCESTOR_BITMAP) id = get_ancestor_below(t, id, ancestor); - ret = id && id < ancestor + bool ret = id && id < ancestor ? test_ancestor_bitmap(t, id, ancestor) : id == ancestor; - EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor)); -out: - rcu_read_unlock(); - + EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, orig_id, ancestor)); return ret; } @@ -209,9 +206,14 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", - BCH_SNAPSHOT_SUBVOL(s.v), - BCH_SNAPSHOT_DELETED(s.v), + if (BCH_SNAPSHOT_SUBVOL(s.v)) + prt_str(out, "subvol "); + if (BCH_SNAPSHOT_WILL_DELETE(s.v)) + prt_str(out, "will_delete "); + if (BCH_SNAPSHOT_DELETED(s.v)) + prt_str(out, "deleted "); + + prt_printf(out, "parent %10u children %10u %10u subvol %u tree %u", le32_to_cpu(s.v->parent), le32_to_cpu(s.v->children[0]), le32_to_cpu(s.v->children[1]), @@ -281,6 +283,14 @@ int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, return ret; } +static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id) +{ + guard(mutex)(&c->snapshot_table_lock); + return snapshot_t_mut(c, id) + ? 0 + : bch_err_throw(c, ENOMEM_mark_snapshot); +} + static int __bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, @@ -289,20 +299,19 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, struct bch_fs *c = trans->c; struct snapshot_t *t; u32 id = new.k->p.offset; - int ret = 0; - mutex_lock(&c->snapshot_table_lock); + guard(mutex)(&c->snapshot_table_lock); t = snapshot_t_mut(c, id); - if (!t) { - ret = -BCH_ERR_ENOMEM_mark_snapshot; - goto err; - } + if (!t) + return bch_err_throw(c, ENOMEM_mark_snapshot); if (new.k->type == KEY_TYPE_snapshot) { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); - t->live = true; + t->state = !BCH_SNAPSHOT_DELETED(s.v) + ? 
SNAPSHOT_ID_live + : SNAPSHOT_ID_deleted; t->parent = le32_to_cpu(s.v->parent); t->children[0] = le32_to_cpu(s.v->children[0]); t->children[1] = le32_to_cpu(s.v->children[1]); @@ -327,17 +336,16 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, parent - id - 1 < IS_ANCESTOR_BITMAP) __set_bit(parent - id - 1, t->is_ancestor); - if (BCH_SNAPSHOT_DELETED(s.v)) { + if (BCH_SNAPSHOT_WILL_DELETE(s.v)) { set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) + if (c->recovery.pass_done > BCH_RECOVERY_PASS_delete_dead_snapshots) bch2_delete_dead_snapshots_async(c); } } else { memset(t, 0, sizeof(*t)); } -err: - mutex_unlock(&c->snapshot_table_lock); - return ret; + + return 0; } int bch2_mark_snapshot(struct btree_trans *trans, @@ -357,31 +365,32 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, /* fsck: */ -static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) +static u32 bch2_snapshot_child(struct snapshot_table *t, + u32 id, unsigned child) { - return snapshot_t(c, id)->children[child]; + return __snapshot_t(t, id)->children[child]; } -static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id) +static u32 bch2_snapshot_left_child(struct snapshot_table *t, u32 id) { - return bch2_snapshot_child(c, id, 0); + return bch2_snapshot_child(t, id, 0); } -static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id) +static u32 bch2_snapshot_right_child(struct snapshot_table *t, u32 id) { - return bch2_snapshot_child(c, id, 1); + return bch2_snapshot_child(t, id, 1); } -static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) +static u32 bch2_snapshot_tree_next(struct snapshot_table *t, u32 id) { u32 n, parent; - n = bch2_snapshot_left_child(c, id); + n = bch2_snapshot_left_child(t, id); if (n) return n; - while ((parent = bch2_snapshot_parent(c, id))) { - n = bch2_snapshot_right_child(c, parent); + while ((parent = __bch2_snapshot_parent(t, id))) { + n = bch2_snapshot_right_child(t, parent); if (n && n != id) return n; id = parent; @@ -390,21 +399,30 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) return 0; } -u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) +u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root, + snapshot_id_list *skip) { - u32 id = snapshot_root; - u32 subvol = 0, s; - - rcu_read_lock(); - while (id && bch2_snapshot_exists(c, id)) { - s = snapshot_t(c, id)->subvol; - - if (s && (!subvol || s < subvol)) - subvol = s; + guard(rcu)(); + struct snapshot_table *t = rcu_dereference(c->snapshots); + u32 id, subvol = 0, s; +retry: + id = snapshot_root; + while (id && __bch2_snapshot_exists(t, id)) { + if (!(skip && snapshot_list_has_id(skip, id))) { + s = __snapshot_t(t, id)->subvol; + + if (s && (!subvol || s < subvol)) + subvol = s; + } + id = bch2_snapshot_tree_next(t, id); + if (id == snapshot_root) + break; + } - id = bch2_snapshot_tree_next(c, id); + if (!subvol && skip) { + skip = NULL; + goto retry; } - rcu_read_unlock(); return subvol; } @@ -413,9 +431,7 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, u32 snapshot_root, u32 *subvol_id) { struct bch_fs *c = trans->c; - struct btree_iter iter; struct bkey_s_c k; - bool found = false; int ret; for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, @@ -428,28 +444,23 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, continue; if (!BCH_SUBVOLUME_SNAP(s.v)) { *subvol_id = s.k->p.offset; - found = 
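
bch2_snapshot_oldest_subvol() above walks the whole snapshot tree looking for the lowest subvolume id, first honouring the caller's skip list and then, if that leaves nothing, retrying without it. The same select-then-retry shape over a plain array, with the tree traversal elided and all names illustrative:

#include <stdbool.h>
#include <stddef.h>

static bool on_list(const unsigned *skip, size_t nr, unsigned id)
{
	for (size_t i = 0; i < nr; i++)
		if (skip[i] == id)
			return true;
	return false;
}

static unsigned oldest_subvol(const unsigned *subvols, size_t nr,
			      const unsigned *skip, size_t nr_skip)
{
	unsigned best = 0;
retry:
	for (size_t i = 0; i < nr; i++) {
		unsigned s = subvols[i];

		if (!s || (skip && on_list(skip, nr_skip, s)))
			continue;
		if (!best || s < best)
			best = s;
	}

	if (!best && skip) {	/* nothing outside the skip list: retry without it */
		skip = NULL;
		goto retry;
	}
	return best;
}
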
true; - break; + return 0; } } - bch2_trans_iter_exit(trans, &iter); - - if (!ret && !found) { - struct bkey_i_subvolume *u; - - *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); + if (ret) + return ret; - u = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_subvolumes, POS(0, *subvol_id), - 0, subvolume); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; + *subvol_id = bch2_snapshot_oldest_subvol(c, snapshot_root, NULL); - SET_BCH_SUBVOLUME_SNAP(&u->v, false); - } + struct bkey_i_subvolume *u = + bch2_bkey_get_mut_typed(trans, BTREE_ID_subvolumes, POS(0, *subvol_id), + 0, subvolume); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; - return ret; + SET_BCH_SUBVOLUME_SNAP(&u->v, false); + return 0; } static int check_snapshot_tree(struct btree_trans *trans, @@ -457,27 +468,21 @@ static int check_snapshot_tree(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct bkey_s_c_snapshot_tree st; - struct bch_snapshot s; - struct bch_subvolume subvol; - struct printbuf buf = PRINTBUF; - struct btree_iter snapshot_iter = {}; - u32 root_id; - int ret; + CLASS(printbuf, buf)(); if (k.k->type != KEY_TYPE_snapshot_tree) return 0; - st = bkey_s_c_to_snapshot_tree(k); - root_id = le32_to_cpu(st.v->root_snapshot); + struct bkey_s_c_snapshot_tree st = bkey_s_c_to_snapshot_tree(k); + u32 root_id = le32_to_cpu(st.v->root_snapshot); - struct bkey_s_c_snapshot snapshot_k = - bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots, - POS(0, root_id), 0, snapshot); - ret = bkey_err(snapshot_k); + CLASS(btree_iter, snapshot_iter)(trans, BTREE_ID_snapshots, POS(0, root_id), 0); + struct bkey_s_c_snapshot snapshot_k = bch2_bkey_get_typed(&snapshot_iter, snapshot); + int ret = bkey_err(snapshot_k); if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; + return ret; + struct bch_snapshot s; if (!ret) bkey_val_copy(&s, snapshot_k); @@ -491,17 +496,16 @@ static int check_snapshot_tree(struct btree_trans *trans, ret ? 
prt_printf(&buf, "(%s)", bch2_err_str(ret)) : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c), - buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, 0); - goto err; - } + buf.buf))) + return bch2_btree_delete_at(trans, iter, 0); if (!st.v->master_subvol) - goto out; + return 0; + struct bch_subvolume subvol; ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol); if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; + return ret; if (fsck_err_on(ret, trans, snapshot_tree_to_missing_subvol, @@ -526,27 +530,21 @@ static int check_snapshot_tree(struct btree_trans *trans, ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); bch_err_fn(c, ret); - if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */ - ret = 0; - goto err; - } + if (bch2_err_matches(ret, ENOENT)) /* nothing to be done here */ + return 0; if (ret) - goto err; + return ret; u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree); ret = PTR_ERR_OR_ZERO(u); if (ret) - goto err; + return ret; u->v.master_subvol = cpu_to_le32(subvol_id); st = snapshot_tree_i_to_s_c(u); } -out: -err: fsck_err: - bch2_trans_iter_exit(trans, &snapshot_iter); - printbuf_exit(&buf); return ret; } @@ -559,14 +557,12 @@ static int check_snapshot_tree(struct btree_trans *trans, */ int bch2_check_snapshot_trees(struct bch_fs *c) { - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, + CLASS(btree_trans, trans)(c); + return for_each_btree_key_commit(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_snapshot_tree(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; + check_snapshot_tree(trans, &iter, k)); } /* @@ -589,18 +585,14 @@ static int snapshot_tree_ptr_good(struct btree_trans *trans, u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id) { - const struct snapshot_t *s; - if (!id) return 0; - rcu_read_lock(); - s = snapshot_t(c, id); - if (s->parent) - id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); - rcu_read_unlock(); - - return id; + guard(rcu)(); + const struct snapshot_t *s = snapshot_t(c, id); + return s->parent + ? 
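
check_snapshot_tree() above is rewritten around CLASS(btree_iter) and CLASS(printbuf): construction and destruction are paired in the declaration, so the goto out/err ladder collapses into direct returns. A userspace sketch of the same constructor/destructor pairing; printbuf_model is a stand-in, not the kernel printbuf:

#include <stdio.h>
#include <stdlib.h>

struct printbuf_model { char *buf; };

static void printbuf_model_exit(struct printbuf_model *p)
{
	free(p->buf);
}

#define PRINTBUF_CLASS(name)					\
	struct printbuf_model name				\
	__attribute__((cleanup(printbuf_model_exit))) =		\
		{ .buf = malloc(64) }

static int format_something(int fail_early)
{
	PRINTBUF_CLASS(buf);

	if (!buf.buf)
		return -1;	/* destructor still runs; free(NULL) is fine */
	if (fail_early)
		return -2;	/* no goto err ladder needed */

	snprintf(buf.buf, 64, "ok");
	return 0;
}
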
bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)) + : id; } static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s) @@ -630,22 +622,19 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans, struct bch_snapshot *s) { struct bch_fs *c = trans->c; - struct btree_iter root_iter; - struct bch_snapshot_tree s_t; - struct bkey_s_c_snapshot root; struct bkey_i_snapshot *u; - u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id; - int ret; + u32 root_id = bch2_snapshot_root(c, k.k->p.offset); - root = bch2_bkey_get_iter_typed(trans, &root_iter, - BTREE_ID_snapshots, POS(0, root_id), - BTREE_ITER_with_updates, snapshot); - ret = bkey_err(root); + CLASS(btree_iter, root_iter)(trans, BTREE_ID_snapshots, POS(0, root_id), + BTREE_ITER_with_updates); + struct bkey_s_c_snapshot root = bch2_bkey_get_typed(&root_iter, snapshot); + int ret = bkey_err(root); if (ret) - goto err; + return ret; - tree_id = le32_to_cpu(root.v->tree); + u32 tree_id = le32_to_cpu(root.v->tree); + struct bch_snapshot_tree s_t; ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; @@ -654,10 +643,10 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans, u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); ret = PTR_ERR_OR_ZERO(u) ?: bch2_snapshot_tree_create(trans, root_id, - bch2_snapshot_tree_oldest_subvol(c, root_id), + bch2_snapshot_oldest_subvol(c, root_id, NULL), &tree_id); if (ret) - goto err; + return ret; u->v.tree = cpu_to_le32(tree_id); if (k.k->p.offset == root_id) @@ -668,14 +657,13 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans, u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); if (ret) - goto err; + return ret; u->v.tree = cpu_to_le32(tree_id); *s = u->v; } -err: - bch2_trans_iter_exit(trans, &root_iter); - return ret; + + return 0; } static int check_snapshot(struct btree_trans *trans, @@ -689,7 +677,7 @@ static int check_snapshot(struct btree_trans *trans, struct bkey_i_snapshot *u; u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); u32 real_depth; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); u32 i, id; int ret = 0; @@ -699,6 +687,9 @@ static int check_snapshot(struct btree_trans *trans, memset(&s, 0, sizeof(s)); memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k))); + if (BCH_SNAPSHOT_DELETED(&s)) + return 0; + id = le32_to_cpu(s.parent); if (id) { ret = bch2_snapshot_lookup(trans, id, &v); @@ -736,7 +727,7 @@ static int check_snapshot(struct btree_trans *trans, } bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && - !BCH_SNAPSHOT_DELETED(&s); + !BCH_SNAPSHOT_WILL_DELETE(&s); if (should_have_subvol) { id = le32_to_cpu(s.subvol); @@ -819,7 +810,6 @@ static int check_snapshot(struct btree_trans *trans, ret = 0; err: fsck_err: - printbuf_exit(&buf); return ret; } @@ -829,14 +819,12 @@ int bch2_check_snapshots(struct bch_fs *c) * We iterate backwards as checking/fixing the depth field requires that * the parent's depth already be correct: */ - int ret = bch2_trans_run(c, - for_each_btree_key_reverse_commit(trans, iter, + CLASS(btree_trans, trans)(c); + return for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_snapshots, POS_MAX, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_snapshot(trans, &iter, k))); - bch_err_fn(c, ret); - return ret; + check_snapshot(trans, &iter, k)); } static int check_snapshot_exists(struct btree_trans *trans, u32 id) @@ -844,19 
+832,18 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) struct bch_fs *c = trans->c; /* Do we need to reconstruct the snapshot_tree entry as well? */ - struct btree_iter iter; struct bkey_s_c k; int ret = 0; u32 tree_id = 0; for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, 0, k, ret) { - if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { + if (k.k->type == KEY_TYPE_snapshot_tree && + le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { tree_id = k.k->p.offset; break; } } - bch2_trans_iter_exit(trans, &iter); if (ret) return ret; @@ -879,17 +866,16 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { - if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { + if (k.k->type == KEY_TYPE_subvolume && + le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { snapshot->v.subvol = cpu_to_le32(k.k->p.offset); SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true); break; } } - bch2_trans_iter_exit(trans, &iter); - return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: - bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, - bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0); + return bch2_snapshot_table_make_room(c, id) ?: + bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0); } /* Figure out which snapshot nodes belong in the same tree: */ @@ -917,10 +903,7 @@ static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpo static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r) { - darray_for_each(*l, i) - if (snapshot_list_has_id(r, *i)) - return true; - return false; + return darray_find_p(*l, i, snapshot_list_has_id(r, *i)) != NULL; } static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s) @@ -962,17 +945,21 @@ static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct int bch2_reconstruct_snapshots(struct bch_fs *c) { - struct btree_trans *trans = bch2_trans_get(c); - struct printbuf buf = PRINTBUF; + CLASS(btree_trans, trans)(c); + CLASS(printbuf, buf)(); struct snapshot_tree_reconstruct r = {}; int ret = 0; + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, btree_has_snapshots_mask); + for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { if (btree_type_has_snapshots(btree)) { r.btree = btree; ret = for_each_btree_key(trans, iter, btree, POS_MIN, BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({ + progress_update_iter(trans, &progress, &iter); get_snapshot_trees(c, &r, k.k->p); })); if (ret) @@ -987,12 +974,12 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) snapshot_id_list_to_text(&buf, t); darray_for_each(*t, id) { - if (fsck_err_on(!bch2_snapshot_exists(c, *id), + if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty, trans, snapshot_node_missing, "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { if (t->nr > 1) { bch_err(c, "cannot reconstruct snapshot trees with multiple nodes"); - ret = -BCH_ERR_fsck_repair_unimplemented; + ret = bch_err_throw(c, fsck_repair_unimplemented); goto err; } @@ -1005,31 +992,90 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) } fsck_err: err: - bch2_trans_put(trans); snapshot_tree_reconstruct_exit(&r); - printbuf_exit(&buf); - bch_err_fn(c, ret); return ret; } -int bch2_check_key_has_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) +int 
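
bch2_reconstruct_snapshots() above gains a progress indicator: bch2_progress_init() sizes the work across the snapshot-bearing btrees and progress_update_iter() logs periodically from the per-key loop. A reduced model of the reporting; the real code rate-limits by elapsed time, which is approximated here by key count:

#include <stdio.h>

struct progress_model {
	unsigned long done, total, next_print;
};

static void progress_update(struct progress_model *p)
{
	p->done++;
	if (!p->total || p->done < p->next_print)
		return;

	p->next_print = p->done + p->total / 10;	/* roughly every 10% */
	printf("scanning keys: %lu%%\n", p->done * 100 / p->total);
}
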
__bch2_check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); int ret = 0; + enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot); - if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot), - trans, bkey_in_missing_snapshot, - "key in missing snapshot %s, delete?", + /* Snapshot was definitively deleted, this error is marked autofix */ + if (fsck_err_on(state == SNAPSHOT_ID_deleted, + trans, bkey_in_deleted_snapshot, + "key in deleted snapshot %s, delete?", (bch2_btree_id_to_text(&buf, iter->btree_id), prt_char(&buf, ' '), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node) ?: 1; + BTREE_UPDATE_internal_snapshot_node) ?: 1; + + if (state == SNAPSHOT_ID_empty) { + /* + * Snapshot missing: we should have caught this with btree_lost_data and + * kicked off reconstruct_snapshots, so if we end up here we have no + * idea what happened. + * + * Do not delete unless we know that subvolumes and snapshots + * are consistent: + * + * XXX: + * + * We could be smarter here, and instead of using the generic + * recovery pass ratelimiting, track if there have been any + * changes to the snapshots or inodes btrees since those passes + * last ran. + */ + ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret; + ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret; + + if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)) + ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; + + unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? FSCK_CAN_FIX : 0); + + if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot, + "key in missing snapshot %s, delete?", + (bch2_btree_id_to_text(&buf, iter->btree_id), + prt_char(&buf, ' '), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node) ?: 1; + } + } fsck_err: - printbuf_exit(&buf); + return ret; +} + +int __bch2_get_snapshot_overwrites(struct btree_trans *trans, + enum btree_id btree, struct bpos pos, + snapshot_id_list *s) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_reverse_norestart(trans, iter, btree, bpos_predecessor(pos), + BTREE_ITER_all_snapshots, k, ret) { + if (!bkey_eq(k.k->p, pos)) + break; + + if (!bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) || + snapshot_list_has_ancestor(c, s, k.k->p.snapshot)) + continue; + + ret = snapshot_list_add(c, s, k.k->p.snapshot); + if (ret) + break; + } + if (ret) + darray_exit(s); + return ret; } @@ -1038,28 +1084,21 @@ int bch2_check_key_has_snapshot(struct btree_trans *trans, */ int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) { - struct btree_iter iter; struct bkey_i_snapshot *s = - bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshots, POS(0, id), - 0, snapshot); + bch2_bkey_get_mut_typed(trans, BTREE_ID_snapshots, POS(0, id), 0, snapshot); int ret = PTR_ERR_OR_ZERO(s); - if (unlikely(ret)) { - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), - trans->c, "missing snapshot %u", id); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, "missing snapshot %u", id); + if (unlikely(ret)) return ret; - } /* already deleted? 
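
__bch2_check_key_has_snapshot() above is the main consumer of the new tri-state snapshot table: keys in a SNAPSHOT_ID_deleted snapshot are safe to autofix-delete (matching the bkey_in_deleted_snapshot flag change earlier in this patch), while SNAPSHOT_ID_empty only permits deletion once the snapshots/subvolumes passes are known clean. The decision tree reduced to a sketch; the enum and action names are simplifications, not the kernel API:

enum snap_state { SNAP_LIVE, SNAP_DELETED, SNAP_EMPTY };
enum key_action { KEY_KEEP, KEY_DELETE, KEY_NEED_RECOVERY };

static enum key_action check_key_snapshot(enum snap_state s,
					  int recovery_needed)
{
	switch (s) {
	case SNAP_LIVE:
		return KEY_KEEP;
	case SNAP_DELETED:
		/* definitively deleted: safe to autofix-delete the key */
		return KEY_DELETE;
	case SNAP_EMPTY:
		/*
		 * id missing entirely: only delete once the snapshots
		 * and subvolumes btrees are known consistent
		 */
		return recovery_needed ? KEY_NEED_RECOVERY : KEY_DELETE;
	}
	return KEY_KEEP;
}
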
*/ - if (BCH_SNAPSHOT_DELETED(&s->v)) - goto err; + if (BCH_SNAPSHOT_WILL_DELETE(&s->v)) + return 0; - SET_BCH_SNAPSHOT_DELETED(&s->v, true); + SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true); SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); s->v.subvol = 0; -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + return 0; } static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) @@ -1071,39 +1110,33 @@ static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) { struct bch_fs *c = trans->c; - struct btree_iter iter, p_iter = {}; - struct btree_iter c_iter = {}; - struct btree_iter tree_iter = {}; - struct bkey_s_c_snapshot s; u32 parent_id, child_id; unsigned i; - int ret = 0; - s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_intent, snapshot); - ret = bkey_err(s); + struct bkey_i_snapshot *s = + bch2_bkey_get_mut_typed(trans, BTREE_ID_snapshots, POS(0, id), 0, snapshot); + int ret = PTR_ERR_OR_ZERO(s); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "missing snapshot %u", id); if (ret) - goto err; + return ret; - BUG_ON(s.v->children[1]); + BUG_ON(BCH_SNAPSHOT_DELETED(&s->v)); + BUG_ON(s->v.children[1]); - parent_id = le32_to_cpu(s.v->parent); - child_id = le32_to_cpu(s.v->children[0]); + parent_id = le32_to_cpu(s->v.parent); + child_id = le32_to_cpu(s->v.children[0]); if (parent_id) { - struct bkey_i_snapshot *parent; - - parent = bch2_bkey_get_mut_typed(trans, &p_iter, - BTREE_ID_snapshots, POS(0, parent_id), - 0, snapshot); + struct bkey_i_snapshot *parent = + bch2_bkey_get_mut_typed(trans, BTREE_ID_snapshots, POS(0, parent_id), + 0, snapshot); ret = PTR_ERR_OR_ZERO(parent); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "missing snapshot %u", parent_id); if (unlikely(ret)) - goto err; + return ret; /* find entry in parent->children for node being deleted */ for (i = 0; i < 2; i++) @@ -1113,7 +1146,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) if (bch2_fs_inconsistent_on(i == 2, c, "snapshot %u missing child pointer to %u", parent_id, id)) - goto err; + return bch_err_throw(c, ENOENT_snapshot); parent->v.children[i] = cpu_to_le32(child_id); @@ -1121,16 +1154,14 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) } if (child_id) { - struct bkey_i_snapshot *child; - - child = bch2_bkey_get_mut_typed(trans, &c_iter, - BTREE_ID_snapshots, POS(0, child_id), - 0, snapshot); + struct bkey_i_snapshot *child = + bch2_bkey_get_mut_typed(trans, BTREE_ID_snapshots, POS(0, child_id), + 0, snapshot); ret = PTR_ERR_OR_ZERO(child); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "missing snapshot %u", child_id); if (unlikely(ret)) - goto err; + return ret; child->v.parent = cpu_to_le32(parent_id); @@ -1147,32 +1178,41 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) * snapshot_tree entry to point to the new root, or delete it if * this is the last snapshot ID in this tree: */ - struct bkey_i_snapshot_tree *s_t; - BUG_ON(s.v->children[1]); + BUG_ON(s->v.children[1]); - s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, - BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), + struct bkey_i_snapshot_tree *s_t = bch2_bkey_get_mut_typed(trans, + BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)), 0, snapshot_tree); ret = PTR_ERR_OR_ZERO(s_t); if (ret) - goto err; + return ret; - if (s.v->children[0]) { - s_t->v.root_snapshot = s.v->children[0]; + if 
(s->v.children[0]) { + s_t->v.root_snapshot = s->v.children[0]; } else { s_t->k.type = KEY_TYPE_deleted; set_bkey_val_u64s(&s_t->k, 0); } } - ret = bch2_btree_delete_at(trans, &iter, 0); -err: - bch2_trans_iter_exit(trans, &tree_iter); - bch2_trans_iter_exit(trans, &p_iter); - bch2_trans_iter_exit(trans, &c_iter); - bch2_trans_iter_exit(trans, &iter); - return ret; + if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) { + SET_BCH_SNAPSHOT_DELETED(&s->v, true); + s->v.parent = 0; + s->v.children[0] = 0; + s->v.children[1] = 0; + s->v.subvol = 0; + s->v.tree = 0; + s->v.depth = 0; + s->v.skip[0] = 0; + s->v.skip[1] = 0; + s->v.skip[2] = 0; + } else { + s->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&s->k, 0); + } + + return 0; } static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, @@ -1181,35 +1221,29 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, unsigned nr_snapids) { struct bch_fs *c = trans->c; - struct btree_iter iter; struct bkey_i_snapshot *n; - struct bkey_s_c k; - unsigned i, j; u32 depth = bch2_snapshot_depth(c, parent); - int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, - POS_MIN, BTREE_ITER_intent); - k = bch2_btree_iter_peek(trans, &iter); - ret = bkey_err(k); + CLASS(btree_iter, iter)(trans, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_intent); + struct bkey_s_c k = bch2_btree_iter_peek(&iter); + int ret = bkey_err(k); if (ret) - goto err; + return ret; - for (i = 0; i < nr_snapids; i++) { - k = bch2_btree_iter_prev_slot(trans, &iter); + for (unsigned i = 0; i < nr_snapids; i++) { + k = bch2_btree_iter_prev_slot(&iter); ret = bkey_err(k); if (ret) - goto err; + return ret; if (!k.k || !k.k->p.offset) { - ret = -BCH_ERR_ENOSPC_snapshot_create; - goto err; + return bch_err_throw(c, ENOSPC_snapshot_create); } n = bch2_bkey_alloc(trans, &iter, 0, snapshot); ret = PTR_ERR_OR_ZERO(n); if (ret) - goto err; + return ret; n->v.flags = 0; n->v.parent = cpu_to_le32(parent); @@ -1219,7 +1253,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, n->v.btime.lo = cpu_to_le64(bch2_current_time(c)); n->v.btime.hi = 0; - for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) + for (unsigned j = 0; j < ARRAY_SIZE(n->v.skip); j++) n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent)); bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); @@ -1228,13 +1262,12 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); if (ret) - goto err; + return ret; new_snapids[i] = iter.pos.offset; } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + + return 0; } /* @@ -1245,14 +1278,9 @@ static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 par u32 *snapshot_subvols, unsigned nr_snapids) { - struct btree_iter iter; - struct bkey_i_snapshot *n_parent; - int ret = 0; - - n_parent = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshots, POS(0, parent), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(n_parent); + struct bkey_i_snapshot *n_parent = + bch2_bkey_get_mut_typed(trans, BTREE_ID_snapshots, POS(0, parent), 0, snapshot); + int ret = PTR_ERR_OR_ZERO(n_parent); if (unlikely(ret)) { if (bch2_err_matches(ret, ENOENT)) bch_err(trans->c, "snapshot %u not found", parent); @@ -1261,22 +1289,19 @@ static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 par if (n_parent->v.children[0] || n_parent->v.children[1]) { 
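
The tail of bch2_snapshot_node_delete() above gates on the snapshot_deletion_v2 incompat feature: when bch2_request_incompat_feature() succeeds, the node is kept as a zeroed key with only BCH_SNAPSHOT_DELETED set, which is what later lets fsck classify ids as deleted rather than missing; otherwise the key becomes a whiteout as before. A sketch with the bkey machinery stripped out; snap_model is illustrative:

#include <stdbool.h>
#include <string.h>

struct snap_model {
	bool		deleted;
	unsigned	parent, children[2], subvol, tree;
};

static void delete_node(struct snap_model *s, bool deletion_v2)
{
	memset(s, 0, sizeof(*s));

	if (deletion_v2)
		s->deleted = true;	/* tombstone: "existed, was deleted" */
	/* without the feature the key is simply removed, and a later
	 * fsck cannot tell deleted from never-existed */
}
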
bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); - ret = -EINVAL; - goto err; + return -EINVAL; } ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), new_snapids, snapshot_subvols, nr_snapids); if (ret) - goto err; + return ret; n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); n_parent->v.subvol = 0; SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + return 0; } /* @@ -1336,67 +1361,47 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, * that key to snapshot leaf nodes, where we can mutate it */ -struct snapshot_interior_delete { - u32 id; - u32 live_child; -}; -typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; - static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id) { - darray_for_each(*l, i) - if (i->id == id) - return i->live_child; - return 0; + struct snapshot_interior_delete *i = darray_find_p(*l, i, i->id == id); + return i ? i->live_child : 0; } -static unsigned __live_child(struct snapshot_table *t, u32 id, - snapshot_id_list *delete_leaves, - interior_delete_list *delete_interior) +static unsigned live_child(struct bch_fs *c, u32 start) { - struct snapshot_t *s = __snapshot_t(t, id); - if (!s) - return 0; + struct snapshot_delete *d = &c->snapshot_delete; - for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) - if (s->children[i] && - !snapshot_list_has_id(delete_leaves, s->children[i]) && - !interior_delete_has_id(delete_interior, s->children[i])) - return s->children[i]; - - for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) { - u32 live_child = s->children[i] - ? __live_child(t, s->children[i], delete_leaves, delete_interior) - : 0; - if (live_child) - return live_child; - } + guard(rcu)(); + struct snapshot_table *t = rcu_dereference(c->snapshots); + + for (u32 id = bch2_snapshot_tree_next(t, start); + id && id != start; + id = bch2_snapshot_tree_next(t, id)) + if (bch2_snapshot_is_leaf(c, id) && + !snapshot_list_has_id(&d->delete_leaves, id) && + !interior_delete_has_id(&d->delete_interior, id)) + return id; return 0; } -static unsigned live_child(struct bch_fs *c, u32 id, - snapshot_id_list *delete_leaves, - interior_delete_list *delete_interior) +static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id) { - rcu_read_lock(); - u32 ret = __live_child(rcu_dereference(c->snapshots), id, - delete_leaves, delete_interior); - rcu_read_unlock(); - return ret; + return snapshot_list_has_id(&d->delete_leaves, id) || + interior_delete_has_id(&d->delete_interior, id) != 0; } static int delete_dead_snapshots_process_key(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - snapshot_id_list *delete_leaves, - interior_delete_list *delete_interior) + struct bkey_s_c k) { - if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot)) + struct snapshot_delete *d = &trans->c->snapshot_delete; + + if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot)) return bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); - u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot); + u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot); if (live_child) { struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); int ret = PTR_ERR_OR_ZERO(new); @@ -1405,86 +1410,241 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans, new->k.p.snapshot = live_child; - 
struct btree_iter dst_iter; - struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter, - iter->btree_id, new->k.p, - BTREE_ITER_all_snapshots| - BTREE_ITER_intent); + CLASS(btree_iter, dst_iter)(trans, iter->btree_id, new->k.p, + BTREE_ITER_all_snapshots|BTREE_ITER_intent); + struct bkey_s_c dst_k = bch2_btree_iter_peek_slot(&dst_iter); ret = bkey_err(dst_k); if (ret) return ret; - ret = (bkey_deleted(dst_k.k) + return (bkey_deleted(dst_k.k) ? bch2_trans_update(trans, &dst_iter, new, BTREE_UPDATE_internal_snapshot_node) : 0) ?: bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &dst_iter); - return ret; } return 0; } +static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum) +{ + struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; + + u64 inum = iter->btree_id != BTREE_ID_inodes + ? iter->pos.inode + : iter->pos.offset; + + if (*prev_inum == inum) + return false; + + *prev_inum = inum; + + bool ret = !snapshot_list_has_id(&d->deleting_from_trees, + bch2_snapshot_tree(c, iter->pos.snapshot)); + if (unlikely(ret)) { + struct bpos pos = iter->pos; + pos.snapshot = 0; + if (iter->btree_id != BTREE_ID_inodes) + pos.offset = U64_MAX; + bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(pos)); + } + + return ret; +} + +static int delete_dead_snapshot_keys_v1(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; + + for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) { + struct disk_reservation res = { 0 }; + u64 prev_inum = 0; + + d->pos.pos = POS_MIN; + + if (!btree_type_has_snapshots(d->pos.btree)) + continue; + + int ret = for_each_btree_key_commit(trans, iter, + d->pos.btree, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + d->pos.pos = iter.pos; + + if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) + continue; + + delete_dead_snapshots_process_key(trans, &iter, k); + })); + + bch2_disk_reservation_put(c, &res); + + if (ret) + return ret; + } + + return 0; +} + +static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree, + struct bpos start, struct bpos end) +{ + struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; + struct disk_reservation res = { 0 }; + + d->pos.btree = btree; + d->pos.pos = POS_MIN; + + int ret = for_each_btree_key_max_commit(trans, iter, + btree, start, end, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + d->pos.pos = iter.pos; + delete_dead_snapshots_process_key(trans, &iter, k); + })); + + bch2_disk_reservation_put(c, &res); + return ret; +} + +static int delete_dead_snapshot_keys_v2(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; + struct disk_reservation res = { 0 }; + u64 prev_inum = 0; + int ret = 0; + + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); + + while (1) { + struct bkey_s_c k; + ret = lockrestart_do(trans, + bkey_err(k = bch2_btree_iter_peek(&iter))); + if (ret) + break; + + if (!k.k) + break; + + d->pos.btree = iter.btree_id; + d->pos.pos = iter.pos; + + if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) + continue; + + if (snapshot_id_dying(d, k.k->p.snapshot)) { + struct bpos start = POS(k.k->p.offset, 0); + struct 
bpos end = POS(k.k->p.offset, U64_MAX); + + ret = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?: + delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?: + delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end); + if (ret) + break; + + bch2_btree_iter_set_pos(&iter, POS(0, k.k->p.offset + 1)); + } else { + bch2_btree_iter_advance(&iter); + } + } + bch2_trans_iter_exit(&iter); + + if (ret) + goto err; + + prev_inum = 0; + ret = for_each_btree_key_commit(trans, iter, + BTREE_ID_inodes, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + d->pos.btree = iter.btree_id; + d->pos.pos = iter.pos; + + if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) + continue; + + delete_dead_snapshots_process_key(trans, &iter, k); + })); +err: + bch2_disk_reservation_put(c, &res); + return ret; +} + /* * For a given snapshot, if it doesn't have a subvolume that points to it, and * it doesn't have child snapshot nodes - it's now redundant and we can mark it * as deleted. */ -static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k, - snapshot_id_list *delete_leaves, - interior_delete_list *delete_interior) +static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k) { if (k.k->type != KEY_TYPE_snapshot) return 0; struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); unsigned live_children = 0; + int ret = 0; if (BCH_SNAPSHOT_SUBVOL(s.v)) return 0; + if (BCH_SNAPSHOT_DELETED(s.v)) + return 0; + + guard(mutex)(&d->progress_lock); for (unsigned i = 0; i < 2; i++) { u32 child = le32_to_cpu(s.v->children[i]); live_children += child && - !snapshot_list_has_id(delete_leaves, child); + !snapshot_list_has_id(&d->delete_leaves, child); } + u32 tree = bch2_snapshot_tree(c, s.k->p.offset); + if (live_children == 0) { - return snapshot_list_add(c, delete_leaves, s.k->p.offset); + ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?: + snapshot_list_add(c, &d->delete_leaves, s.k->p.offset); } else if (live_children == 1) { - struct snapshot_interior_delete d = { + struct snapshot_interior_delete n = { .id = s.k->p.offset, - .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior), + .live_child = live_child(c, s.k->p.offset), }; - if (!d.live_child) { - bch_err(c, "error finding live child of snapshot %u", d.id); - return -EINVAL; + if (!n.live_child) { + bch_err(c, "error finding live child of snapshot %u", n.id); + ret = -EINVAL; + } else { + ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?: + darray_push(&d->delete_interior, n); } - - return darray_push(delete_interior, d); - } else { - return 0; } + + return ret; } static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, interior_delete_list *skip) { - rcu_read_lock(); + guard(rcu)(); + struct snapshot_table *t = rcu_dereference(c->snapshots); + while (interior_delete_has_id(skip, id)) - id = __bch2_snapshot_parent(c, id); + id = __bch2_snapshot_parent(t, id); while (n--) { do { - id = __bch2_snapshot_parent(c, id); + id = __bch2_snapshot_parent(t, id); } while (interior_delete_has_id(skip, id)); } - rcu_read_unlock(); return id; } @@ -1498,6 +1658,9 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, struct bkey_i_snapshot *s; int ret; + if (!bch2_snapshot_exists(c, k.k->p.offset)) + return 0; + if (k.k->type != 
KEY_TYPE_snapshot)
 		return 0;
 
@@ -1545,69 +1708,73 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
 	return bch2_trans_update(trans, iter, &s->k_i, 0);
 }
 
-int bch2_delete_dead_snapshots(struct bch_fs *c)
+static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d)
 {
-	if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
-		return 0;
+	prt_printf(out, "deleting from trees");
+	darray_for_each(d->deleting_from_trees, i)
+		prt_printf(out, " %u", *i);
+	prt_newline(out);
+
+	prt_printf(out, "deleting leaves");
+	darray_for_each(d->delete_leaves, i)
+		prt_printf(out, " %u", *i);
+	prt_newline(out);
+
+	prt_printf(out, "interior");
+	darray_for_each(d->delete_interior, i)
+		prt_printf(out, " %u->%u", i->id, i->live_child);
+	prt_newline(out);
+}
 
-	struct btree_trans *trans = bch2_trans_get(c);
-	snapshot_id_list delete_leaves = {};
-	interior_delete_list delete_interior = {};
+int __bch2_delete_dead_snapshots(struct bch_fs *c)
+{
+	struct snapshot_delete *d = &c->snapshot_delete;
 	int ret = 0;
 
+	if (!mutex_trylock(&d->lock))
+		return 0;
+
+	if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) {
+		mutex_unlock(&d->lock);
+		return 0;
+	}
+
+	CLASS(btree_trans, trans)(c);
+
 	/*
 	 * For every snapshot node: If we have no live children and it's not
 	 * pointed to by a subvolume, delete it:
 	 */
+	d->running = true;
+	d->pos = BBPOS_MIN;
+
 	ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
-		check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior));
+		check_should_delete_snapshot(trans, k));
 	if (!bch2_err_matches(ret, EROFS))
 		bch_err_msg(c, ret, "walking snapshots");
 	if (ret)
 		goto err;
 
-	if (!delete_leaves.nr && !delete_interior.nr)
+	if (!d->delete_leaves.nr && !d->delete_interior.nr)
 		goto err;
 
 	{
-		struct printbuf buf = PRINTBUF;
-		prt_printf(&buf, "deleting leaves");
-		darray_for_each(delete_leaves, i)
-			prt_printf(&buf, " %u", *i);
-
-		prt_printf(&buf, " interior");
-		darray_for_each(delete_interior, i)
-			prt_printf(&buf, " %u->%u", i->id, i->live_child);
+		CLASS(printbuf, buf)();
+		bch2_snapshot_delete_nodes_to_text(&buf, d);
 
 		ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
-		printbuf_exit(&buf);
 		if (ret)
			goto err;
 	}
 
-	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
-		struct disk_reservation res = { 0 };
-
-		if (!btree_type_has_snapshots(btree))
-			continue;
-
-		ret = for_each_btree_key_commit(trans, iter,
-				btree, POS_MIN,
-				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
-				&res, NULL, BCH_TRANS_COMMIT_no_enospc,
-				delete_dead_snapshots_process_key(trans, &iter, k,
-								  &delete_leaves,
-								  &delete_interior));
-
-		bch2_disk_reservation_put(c, &res);
-
-		if (!bch2_err_matches(ret, EROFS))
-			bch_err_msg(c, ret, "deleting keys from dying snapshots");
-		if (ret)
-			goto err;
-	}
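+	/*
+	 * Editorial note, an inference from the two helpers above rather
+	 * than anything stated in the patch: v1 scans every snapshot-keyed
+	 * btree from POS_MIN and checks each key against the deletion
+	 * lists, while v2 walks the inodes btree once and deletes only each
+	 * dying inode's ranges in the extents, dirents and xattrs btrees,
+	 * so its cost scales with the data being deleted rather than with
	 * the whole filesystem.
+	 */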
+	ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)
+		? delete_dead_snapshot_keys_v2(trans)
+		: delete_dead_snapshot_keys_v1(trans);
+	if (!bch2_err_matches(ret, EROFS))
+		bch_err_msg(c, ret, "deleting keys from dying snapshots");
+	if (ret)
+		goto err;
 
-	darray_for_each(delete_leaves, i) {
+	darray_for_each(d->delete_leaves, i) {
 		ret = commit_do(trans, NULL, NULL, 0,
 				bch2_snapshot_node_delete(trans, *i));
 		if (!bch2_err_matches(ret, EROFS))
@@ -1624,11 +1791,11 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
 	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
 			BTREE_ITER_intent, k,
 			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-		bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior));
+		bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior));
 	if (ret)
 		goto err;
 
-	darray_for_each(delete_interior, i) {
+	darray_for_each(d->delete_interior, i) {
 		ret = commit_do(trans, NULL, NULL, 0,
 				bch2_snapshot_node_delete(trans, i->id));
 		if (!bch2_err_matches(ret, EROFS))
@@ -1637,33 +1804,64 @@
 			goto err;
 	}
 err:
-	darray_exit(&delete_interior);
-	darray_exit(&delete_leaves);
-	bch2_trans_put(trans);
-	if (!bch2_err_matches(ret, EROFS))
-		bch_err_fn(c, ret);
+	scoped_guard(mutex, &d->progress_lock) {
+		darray_exit(&d->deleting_from_trees);
+		darray_exit(&d->delete_interior);
+		darray_exit(&d->delete_leaves);
+		d->running = false;
+	}
+
+	bch2_recovery_pass_set_no_ratelimit(c, BCH_RECOVERY_PASS_check_snapshots);
+
+	mutex_unlock(&d->lock);
 	return ret;
 }
 
+int bch2_delete_dead_snapshots(struct bch_fs *c)
+{
+	if (!c->opts.auto_snapshot_deletion)
+		return 0;
+
+	return __bch2_delete_dead_snapshots(c);
+}
+
 void bch2_delete_dead_snapshots_work(struct work_struct *work)
 {
-	struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
+	struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work);
 
 	set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);
 
 	bch2_delete_dead_snapshots(c);
-	bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
+	enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
 }
 
 void bch2_delete_dead_snapshots_async(struct bch_fs *c)
 {
-	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots))
+	if (!c->opts.auto_snapshot_deletion)
+		return;
+
+	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots))
 		return;
 
 	BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
 
-	if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work))
-		bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
+	if (!queue_work(system_long_wq, &c->snapshot_delete.work))
+		enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
+}
+
+void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	struct snapshot_delete *d = &c->snapshot_delete;
+
+	if (!d->running) {
+		prt_str(out, "(not running)");
+		return;
+	}
+
+	scoped_guard(mutex, &d->progress_lock) {
+		bch2_snapshot_delete_nodes_to_text(out, d);
+		bch2_bbpos_to_text(out, d->pos);
+	}
+}
 
 int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
@@ -1671,7 +1869,6 @@
 				       struct bpos pos)
 {
 	struct bch_fs *c = trans->c;
-	struct btree_iter iter;
 	struct bkey_s_c k;
 	int ret;
 
@@ -1682,12 +1879,9 @@
 		if (!bkey_eq(pos, k.k->p))
 			break;
 
-		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
-			ret = 1;
-			break;
-		}
+		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot))
+			return 1;
 	}
-	bch2_trans_iter_exit(trans, &iter);
 
 	return ret;
 }
 
@@ -1704,7 +1898,7 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct
 		return 0;
 
 	struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
-	if (BCH_SNAPSHOT_DELETED(snap.v) ||
+	if (BCH_SNAPSHOT_WILL_DELETE(snap.v) ||
 	    interior_snapshot_needs_delete(snap))
 		set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);
 
@@ -1717,11 +1911,11 @@ int bch2_snapshots_read(struct bch_fs *c)
 	 * Initializing the is_ancestor bitmaps requires ancestors to already be
 	 * initialized - so mark in reverse:
 	 */
-	int ret = bch2_trans_run(c,
-		for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots,
+	CLASS(btree_trans, trans)(c);
+	int ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots,
 				   POS_MAX, 0, k,
 			__bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
-			bch2_check_snapshot_needs_deletion(trans, k)));
+			bch2_check_snapshot_needs_deletion(trans, k));
 	bch_err_fn(c, ret);
 
 	/*
@@ -1733,10 +1927,6 @@ int bch2_snapshots_read(struct bch_fs *c)
 	BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
 	       test_bit(BCH_FS_may_go_rw, &c->flags));
 
-	if (bch2_err_matches(ret, EIO) ||
-	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
-		ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);
-
 	return ret;
 }
 
@@ -1744,3 +1934,11 @@ void bch2_fs_snapshots_exit(struct bch_fs *c)
 {
 	kvfree(rcu_dereference_protected(c->snapshots, true));
 }
+
+void bch2_fs_snapshots_init_early(struct bch_fs *c)
+{
+	INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work);
+	mutex_init(&c->snapshot_delete.lock);
+	mutex_init(&c->snapshot_delete.progress_lock);
+	mutex_init(&c->snapshots_unlinked_lock);
+}
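[Editorial aside, not part of the patch. The snapshot.h hunks below convert open-coded rcu_read_lock()/rcu_read_unlock() pairs to guard(rcu)(), which drops the lock automatically on every path out of the scope; the kernel builds guard() and CLASS() on __attribute__((cleanup)). A minimal userspace sketch of the same idiom follows, compilable with GCC or Clang; guard_lock(), guard_unlock() and lookup() are illustrative names, not kernel or bcachefs APIs:

#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

static void guard_unlock(pthread_mutex_t **m)
{
	pthread_mutex_unlock(*m);
}

/* Take the lock now; the cleanup handler drops it at every scope exit: */
#define guard_lock(m)							\
	pthread_mutex_t *_guard						\
	__attribute__((cleanup(guard_unlock), unused)) =		\
		(pthread_mutex_lock(m), (m))

static int lookup(int id)
{
	guard_lock(&table_lock);

	if (id < 0)
		return -1;	/* early return: the unlock still runs */

	return id * 2;		/* and on the normal path as well */
}

int main(void)
{
	printf("%d %d\n", lookup(21), lookup(-1));
	return 0;
}

The CLASS(btree_iter, ...) and CLASS(printbuf, ...) conversions elsewhere in this patch pair a constructor with a destructor the same way, which is what lets the rewritten error paths return directly instead of jumping to an err label that frees resources by hand. End of aside.]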
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index 81180181d7c9..fef32a0118c4 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -46,12 +46,9 @@ static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
 
 static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
 {
-	rcu_read_lock();
+	guard(rcu)();
 	const struct snapshot_t *s = snapshot_t(c, id);
-	id = s ? s->tree : 0;
-	rcu_read_unlock();
-
-	return id;
+	return s ? s->tree : 0;
 }
 
 static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
@@ -62,87 +59,84 @@ static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
 static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
 {
-	rcu_read_lock();
-	id = __bch2_snapshot_parent_early(c, id);
-	rcu_read_unlock();
-
-	return id;
+	guard(rcu)();
+	return __bch2_snapshot_parent_early(c, id);
 }
 
-static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
+static inline u32 __bch2_snapshot_parent(struct snapshot_table *t, u32 id)
 {
-	const struct snapshot_t *s = snapshot_t(c, id);
+	const struct snapshot_t *s = __snapshot_t(t, id);
 	if (!s)
 		return 0;
 
 	u32 parent = s->parent;
 	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && parent &&
-	    s->depth != snapshot_t(c, parent)->depth + 1)
+	    s->depth != __snapshot_t(t, parent)->depth + 1)
 		panic("id %u depth=%u parent %u depth=%u\n",
-		      id, snapshot_t(c, id)->depth,
-		      parent, snapshot_t(c, parent)->depth);
+		      id, __snapshot_t(t, id)->depth,
+		      parent, __snapshot_t(t, parent)->depth);
 
 	return parent;
 }
 
 static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
 {
-	rcu_read_lock();
-	id = __bch2_snapshot_parent(c, id);
-	rcu_read_unlock();
-
-	return id;
+	guard(rcu)();
+	return __bch2_snapshot_parent(rcu_dereference(c->snapshots), id);
 }
 
 static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
 {
-	rcu_read_lock();
-	while (n--)
-		id = __bch2_snapshot_parent(c, id);
-	rcu_read_unlock();
+	guard(rcu)();
+	struct snapshot_table *t = rcu_dereference(c->snapshots);
+	while (n--)
+		id = __bch2_snapshot_parent(t, id);
 
 	return id;
 }
 
-u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *, u32);
+u32 bch2_snapshot_oldest_subvol(struct bch_fs *, u32, snapshot_id_list *);
 u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32);
 
 static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
 {
-	u32 parent;
+	guard(rcu)();
+	struct snapshot_table *t = rcu_dereference(c->snapshots);
 
-	rcu_read_lock();
-	while ((parent = __bch2_snapshot_parent(c, id)))
+	u32 parent;
+	while ((parent = __bch2_snapshot_parent(t, id)))
 		id = parent;
-	rcu_read_unlock();
-
 	return id;
 }
 
-static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id)
+static inline enum snapshot_id_state __bch2_snapshot_id_state(struct snapshot_table *t, u32 id)
 {
-	const struct snapshot_t *s = snapshot_t(c, id);
-	return s ? s->live : 0;
+	const struct snapshot_t *s = __snapshot_t(t, id);
+	return s ? s->state : SNAPSHOT_ID_empty;
 }
 
-static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id)
+static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id)
 {
-	rcu_read_lock();
-	bool ret = __bch2_snapshot_exists(c, id);
-	rcu_read_unlock();
+	guard(rcu)();
+	return __bch2_snapshot_id_state(rcu_dereference(c->snapshots), id);
+}
 
-	return ret;
+static inline bool __bch2_snapshot_exists(struct snapshot_table *t, u32 id)
+{
+	return __bch2_snapshot_id_state(t, id) == SNAPSHOT_ID_live;
+}
+
+static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id)
+{
+	return bch2_snapshot_id_state(c, id) == SNAPSHOT_ID_live;
 }
 
 static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
 {
-	rcu_read_lock();
+	guard(rcu)();
 	const struct snapshot_t *s = snapshot_t(c, id);
-	int ret = s ? s->children[0] : -BCH_ERR_invalid_snapshot_node;
-	rcu_read_unlock();
-
-	return ret;
+	return s ?
s->children[0] : bch_err_throw(c, invalid_snapshot_node); } static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) @@ -155,13 +149,8 @@ static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) { - u32 depth; - - rcu_read_lock(); - depth = parent ? snapshot_t(c, parent)->depth + 1 : 0; - rcu_read_unlock(); - - return depth; + guard(rcu)(); + return parent ? snapshot_t(c, parent)->depth + 1 : 0; } bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); @@ -175,20 +164,14 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) { - rcu_read_lock(); + guard(rcu)(); const struct snapshot_t *t = snapshot_t(c, id); - bool ret = t && (t->children[0]|t->children[1]) != 0; - rcu_read_unlock(); - - return ret; + return t && (t->children[0]|t->children[1]) != 0; } static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) { - darray_for_each(*s, i) - if (*i == id) - return true; - return false; + return darray_find(*s, id) != NULL; } static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) @@ -241,10 +224,38 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, int bch2_check_snapshot_trees(struct bch_fs *); int bch2_check_snapshots(struct bch_fs *); int bch2_reconstruct_snapshots(struct bch_fs *); -int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); + +int __bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); + +static inline int bch2_check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + return likely(bch2_snapshot_exists(trans->c, k.k->p.snapshot)) + ? 0 + : __bch2_check_key_has_snapshot(trans, iter, k); +} + +int __bch2_get_snapshot_overwrites(struct btree_trans *, + enum btree_id, struct bpos, + snapshot_id_list *); + +/* + * Get a list of snapshot IDs that have overwritten a given key: + */ +static inline int bch2_get_snapshot_overwrites(struct btree_trans *trans, + enum btree_id btree, struct bpos pos, + snapshot_id_list *s) +{ + darray_init(s); + + return bch2_snapshot_has_children(trans->c, pos.snapshot) + ? 
__bch2_get_snapshot_overwrites(trans, btree, pos, s) + : 0; + +} int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); -void bch2_delete_dead_snapshots_work(struct work_struct *); int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos); @@ -259,7 +270,14 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return __bch2_key_has_snapshot_overwrites(trans, id, pos); } +int __bch2_delete_dead_snapshots(struct bch_fs *); +int bch2_delete_dead_snapshots(struct bch_fs *); +void bch2_delete_dead_snapshots_work(struct work_struct *); +void bch2_delete_dead_snapshots_async(struct bch_fs *); +void bch2_snapshot_delete_status_to_text(struct printbuf *, struct bch_fs *); + int bch2_snapshots_read(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); +void bch2_fs_snapshots_init_early(struct bch_fs *); #endif /* _BCACHEFS_SNAPSHOT_H */ diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h index aabcd3a74cd9..9bccae1f3590 100644 --- a/fs/bcachefs/snapshot_format.h +++ b/fs/bcachefs/snapshot_format.h @@ -15,10 +15,10 @@ struct bch_snapshot { bch_le128 btime; }; -LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) - +LE32_BITMASK(BCH_SNAPSHOT_WILL_DELETE, struct bch_snapshot, flags, 0, 1) /* True if a subvolume points to this snapshot node: */ LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) +LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 2, 3) /* * Snapshot trees: diff --git a/fs/bcachefs/snapshot_types.h b/fs/bcachefs/snapshot_types.h new file mode 100644 index 000000000000..a826c9c83c11 --- /dev/null +++ b/fs/bcachefs/snapshot_types.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SNAPSHOT_TYPES_H +#define _BCACHEFS_SNAPSHOT_TYPES_H + +#include "bbpos_types.h" +#include "darray.h" +#include "subvolume_types.h" + +DEFINE_DARRAY_NAMED(snapshot_id_list, u32); + +#define IS_ANCESTOR_BITMAP 128 + +struct snapshot_t { + enum snapshot_id_state { + SNAPSHOT_ID_empty, + SNAPSHOT_ID_live, + SNAPSHOT_ID_deleted, + } state; + u32 parent; + u32 skip[3]; + u32 depth; + u32 children[2]; + u32 subvol; /* Nonzero only if a subvolume points to this node: */ + u32 tree; + unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; +}; + +struct snapshot_table { + struct rcu_head rcu; + size_t nr; +#ifndef RUST_BINDGEN + DECLARE_FLEX_ARRAY(struct snapshot_t, s); +#else + struct snapshot_t s[0]; +#endif +}; + +struct snapshot_interior_delete { + u32 id; + u32 live_child; +}; +typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; + +struct snapshot_delete { + struct mutex lock; + struct work_struct work; + + struct mutex progress_lock; + snapshot_id_list deleting_from_trees; + snapshot_id_list delete_leaves; + interior_delete_list delete_interior; + + bool running; + struct bbpos pos; +}; + +#endif /* _BCACHEFS_SNAPSHOT_TYPES_H */ diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index a90bf7b8a2b4..ce2a54902a64 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -18,27 +18,27 @@ static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dir return ret; return !ret; } else { - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + CLASS(btree_iter, iter)(trans, BTREE_ID_inodes, SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); int ret = bkey_err(k); if (ret) return ret; - ret = 
bkey_is_inode(k.k); - bch2_trans_iter_exit(trans, &iter); - return ret; + return bkey_is_inode(k.k); } } -static noinline int fsck_rename_dirent(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c_dirent old) +static int bch2_fsck_rename_dirent(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c_dirent old, + bool *updated_before_k_pos) { + struct bch_fs *c = trans->c; struct qstr old_name = bch2_dirent_get_name(old); - struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); + struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64)); int ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; @@ -47,28 +47,39 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans, dirent_copy_target(new, old); new->k.p = old.k->p; + char *renamed_buf = bch2_trans_kmalloc(trans, old_name.len + 20); + ret = PTR_ERR_OR_ZERO(renamed_buf); + if (ret) + return ret; + for (unsigned i = 0; i < 1000; i++) { - unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", - old_name.len, old_name.name, i); - unsigned u64s = BKEY_U64s + dirent_val_u64s(len, 0); + new->k.u64s = BKEY_U64s_MAX; - if (u64s > U8_MAX) - return -EINVAL; + struct qstr renamed_name = (struct qstr) QSTR_INIT(renamed_buf, + sprintf(renamed_buf, "%.*s.fsck_renamed-%u", + old_name.len, old_name.name, i)); - new->k.u64s = u64s; + ret = bch2_dirent_init_name(c, new, hash_info, &renamed_name, NULL); + if (ret) + return ret; ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, (subvol_inum) { 0, old.k->p.inode }, old.k->p.snapshot, &new->k_i, - BTREE_UPDATE_internal_snapshot_node); - if (!bch2_err_matches(ret, EEXIST)) + BTREE_UPDATE_internal_snapshot_node| + STR_HASH_must_create); + if (ret && !bch2_err_matches(ret, EEXIST)) break; + if (!ret) { + if (bpos_lt(new->k.p, old.k->p)) + *updated_before_k_pos = true; + break; + } } - if (ret) - return ret; - - return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); + ret = ret ?: bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); + bch_err_fn(c, ret); + return ret; } static noinline int hash_pick_winner(struct btree_trans *trans, @@ -101,17 +112,24 @@ static noinline int hash_pick_winner(struct btree_trans *trans, } } -static int repair_inode_hash_info(struct btree_trans *trans, - struct bch_inode_unpacked *snapshot_root) +/* + * str_hash lookups across snapshots break in wild ways if hash_info in + * different snapshot versions doesn't match - so if we find one mismatch, check + * them all + */ +int bch2_repair_inode_hash_info(struct btree_trans *trans, + struct bch_inode_unpacked *snapshot_root) { - struct btree_iter iter; + struct bch_fs *c = trans->c; struct bkey_s_c k; + CLASS(printbuf, buf)(); + bool need_commit = false; int ret = 0; - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, - SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 1), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != snapshot_root->bi_inum) + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, + POS(0, snapshot_root->bi_inum), + BTREE_ITER_all_snapshots, k, ret) { + if (bpos_ge(k.k->p, SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot))) break; if (!bkey_is_inode(k.k)) continue; @@ -121,20 +139,68 @@ static int repair_inode_hash_info(struct btree_trans *trans, if (ret) break; - if 
(fsck_err_on(inode.bi_hash_seed != snapshot_root->bi_hash_seed || - INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root), - trans, inode_snapshot_mismatch, - "inode hash info in different snapshots don't match")) { + if (inode.bi_hash_seed == snapshot_root->bi_hash_seed && + INODE_STR_HASH(&inode) == INODE_STR_HASH(snapshot_root)) { +#ifdef CONFIG_BCACHEFS_DEBUG + struct bch_hash_info hash1 = bch2_hash_info_init(c, snapshot_root); + struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); + + BUG_ON(hash1.type != hash2.type || + memcmp(&hash1.siphash_key, + &hash2.siphash_key, + sizeof(hash1.siphash_key))); +#endif + continue; + } + + printbuf_reset(&buf); + prt_printf(&buf, "inode %llu hash info in snapshots %u %u don't match\n", + snapshot_root->bi_inum, + inode.bi_snapshot, + snapshot_root->bi_snapshot); + + bch2_prt_str_hash_type(&buf, INODE_STR_HASH(&inode)); + prt_printf(&buf, " %llx\n", inode.bi_hash_seed); + + bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root)); + prt_printf(&buf, " %llx", snapshot_root->bi_hash_seed); + + if (fsck_err(trans, inode_snapshot_mismatch, "%s", buf.buf)) { inode.bi_hash_seed = snapshot_root->bi_hash_seed; SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root)); - ret = __bch2_fsck_write_inode(trans, &inode) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; - break; + + ret = __bch2_fsck_write_inode(trans, &inode); + if (ret) + break; + need_commit = true; } } + + if (ret) + return ret; + + if (!need_commit) { + printbuf_reset(&buf); + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n", + snapshot_root->bi_inum); + + prt_printf(&buf, "root snapshot %u ", snapshot_root->bi_snapshot); + bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root)); + prt_printf(&buf, " %llx\n", snapshot_root->bi_hash_seed); +#if 0 + prt_printf(&buf, "vs snapshot %u ", hash_info->inum_snapshot); + bch2_prt_str_hash_type(&buf, hash_info->type); + prt_printf(&buf, " %llx %llx", hash_info->siphash_key.k0, hash_info->siphash_key.k1); +#endif + bch2_print_str(c, KERN_ERR, buf.buf); + return bch_err_throw(c, fsck_repair_unimplemented); + } + + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + bch_err_throw(c, transaction_restart_nested); fsck_err: - bch2_trans_iter_exit(trans, &iter); return ret; } @@ -144,47 +210,121 @@ static int repair_inode_hash_info(struct btree_trans *trans, */ static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, struct bch_hash_info *hash_info) +{ + struct bch_inode_unpacked snapshot_root; + int ret = bch2_inode_find_snapshot_root(trans, inum, &snapshot_root); + if (ret) + return ret; + + struct bch_hash_info hash_root = bch2_hash_info_init(trans->c, &snapshot_root); + if (hash_info->type != hash_root.type || + memcmp(&hash_info->siphash_key, + &hash_root.siphash_key, + sizeof(hash_root.siphash_key))) + ret = bch2_repair_inode_hash_info(trans, &snapshot_root); + + return ret; +} + +/* Put a str_hash key in its proper location, checking for duplicates */ +int bch2_str_hash_repair_key(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc *desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c k, + struct btree_iter *dup_iter, struct bkey_s_c dup_k, + bool *updated_before_k_pos) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; + CLASS(printbuf, buf)(); + bool 
free_snapshots_seen = false; int ret = 0; - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; - if (bkey_is_inode(k.k)) - goto found; + if (!s) { + s = bch2_trans_kmalloc(trans, sizeof(*s)); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + goto out; + + s->pos = k_iter->pos; + darray_init(&s->ids); + + ret = bch2_get_snapshot_overwrites(trans, desc->btree_id, k_iter->pos, &s->ids); + if (ret) + goto out; + + free_snapshots_seen = true; } - bch_err(c, "%s(): inum %llu not found", __func__, inum); - ret = -BCH_ERR_fsck_repair_unimplemented; - goto err; -found:; - struct bch_inode_unpacked inode; - ret = bch2_inode_unpack(k, &inode); - if (ret) - goto err; - struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); - if (hash_info->type != hash2.type || - memcmp(&hash_info->siphash_key, &hash2.siphash_key, sizeof(hash2.siphash_key))) { - ret = repair_inode_hash_info(trans, &inode); - if (!ret) { - bch_err(c, "inode hash info mismatch with root, but mismatch not found\n" - "%u %llx %llx\n" - "%u %llx %llx", - hash_info->type, - hash_info->siphash_key.k0, - hash_info->siphash_key.k1, - hash2.type, - hash2.siphash_key.k0, - hash2.siphash_key.k1); - ret = -BCH_ERR_fsck_repair_unimplemented; + if (!dup_k.k) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto out; + + dup_k = bch2_hash_set_or_get_in_snapshot(trans, dup_iter, *desc, hash_info, + (subvol_inum) { 0, new->k.p.inode }, + new->k.p.snapshot, new, + STR_HASH_must_create| + BTREE_ITER_with_updates| + BTREE_UPDATE_internal_snapshot_node); + ret = bkey_err(dup_k); + if (ret) + goto out; + if (dup_k.k) + goto duplicate_entries; + + if (bpos_lt(new->k.p, k.k->p)) + *updated_before_k_pos = true; + + ret = bch2_insert_snapshot_whiteouts(trans, desc->btree_id, + k_iter->pos, new->k.p) ?: + bch2_hash_delete_at(trans, *desc, hash_info, k_iter, + BTREE_ITER_with_updates| + BTREE_UPDATE_internal_snapshot_node) ?: + bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + bch_err_throw(c, transaction_restart_commit); + } else { +duplicate_entries: + ret = hash_pick_winner(trans, *desc, hash_info, k, dup_k); + if (ret < 0) + goto out; + + if (!fsck_err(trans, hash_table_key_duplicate, + "duplicate hash table keys%s:\n%s", + ret != 2 ? 
"" : ", both point to valid inodes", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), + prt_newline(&buf), + bch2_bkey_val_to_text(&buf, c, dup_k), + buf.buf))) + goto out; + + switch (ret) { + case 0: + ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); + break; + case 1: + ret = bch2_hash_delete_at(trans, *desc, hash_info, dup_iter, 0); + break; + case 2: + ret = bch2_fsck_rename_dirent(trans, s, *desc, hash_info, + bkey_s_c_to_dirent(k), + updated_before_k_pos) ?: + bch2_hash_delete_at(trans, *desc, hash_info, k_iter, + BTREE_ITER_with_updates); + goto out; } + + ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: + bch_err_throw(c, transaction_restart_commit); } -err: - bch2_trans_iter_exit(trans, &iter); +out: +fsck_err: + bch2_trans_iter_exit(dup_iter); + if (free_snapshots_seen) + darray_exit(&s->ids); return ret; } @@ -192,11 +332,12 @@ int __bch2_str_hash_check_key(struct btree_trans *trans, struct snapshots_seen *s, const struct bch_hash_desc *desc, struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k) + struct btree_iter *k_iter, struct bkey_s_c hash_k, + bool *updated_before_k_pos) { struct bch_fs *c = trans->c; struct btree_iter iter = {}; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); struct bkey_s_c k; int ret = 0; @@ -204,92 +345,49 @@ int __bch2_str_hash_check_key(struct btree_trans *trans, if (hash_k.k->p.offset < hash) goto bad_hash; - for_each_btree_key_norestart(trans, iter, desc->btree_id, - SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), - BTREE_ITER_slots, k, ret) { + bch2_trans_iter_init(trans, &iter, desc->btree_id, + SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), + BTREE_ITER_slots| + BTREE_ITER_with_updates); + + for_each_btree_key_continue_norestart(iter, + BTREE_ITER_slots| + BTREE_ITER_with_updates, k, ret) { if (bkey_eq(k.k->p, hash_k.k->p)) break; if (k.k->type == desc->key_type && - !desc->cmp_bkey(k, hash_k)) - goto duplicate_entries; + !desc->cmp_bkey(k, hash_k)) { + ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, + hash_info) ?: + bch2_str_hash_repair_key(trans, s, desc, hash_info, + k_iter, hash_k, + &iter, k, updated_before_k_pos); + break; + } - if (bkey_deleted(k.k)) { - bch2_trans_iter_exit(trans, &iter); + if (bkey_deleted(k.k)) goto bad_hash; - } } -out: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); + bch2_trans_iter_exit(&iter); +fsck_err: return ret; bad_hash: + bch2_trans_iter_exit(&iter); /* * Before doing any repair, check hash_info itself: */ ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info); if (ret) - goto out; + return ret; if (fsck_err(trans, hash_table_key_wrong_offset, - "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", - bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); - if (IS_ERR(new)) - return PTR_ERR(new); - - k = bch2_hash_set_or_get_in_snapshot(trans, &iter, *desc, hash_info, - (subvol_inum) { 0, hash_k.k->p.inode }, - hash_k.k->p.snapshot, new, - STR_HASH_must_create| - BTREE_ITER_with_updates| - BTREE_UPDATE_internal_snapshot_node); - ret = bkey_err(k); - if (ret) - goto out; - if (k.k) - goto duplicate_entries; - - ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_fsck_update_backpointers(trans, s, *desc, hash_info, 
new) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; - goto out; - } -fsck_err: - goto out; -duplicate_entries: - ret = hash_pick_winner(trans, *desc, hash_info, hash_k, k); - if (ret < 0) - goto out; - - if (!fsck_err(trans, hash_table_key_duplicate, - "duplicate hash table keys%s:\n%s", - ret != 2 ? "" : ", both point to valid inodes", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), - prt_newline(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) - goto out; - - switch (ret) { - case 0: - ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); - break; - case 1: - ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0); - break; - case 2: - ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: - bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); - goto out; - } - - ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_nested; - goto out; + "hash table key at wrong offset: should be at %llu\n%s", + hash, + (bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) + ret = bch2_str_hash_repair_key(trans, s, desc, hash_info, + k_iter, hash_k, + &iter, bkey_s_c_null, + updated_before_k_pos); + return ret; } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 0c1a00539bd1..8c0fb44929cc 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -32,6 +32,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) } struct bch_hash_info { + u32 inum_snapshot; u8 type; struct unicode_map *cf_encoding; /* @@ -45,11 +46,10 @@ static inline struct bch_hash_info bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) { struct bch_hash_info info = { - .type = INODE_STR_HASH(bi), -#ifdef CONFIG_UNICODE - .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL, -#endif - .siphash_key = { .k0 = bi->bi_hash_seed } + .inum_snapshot = bi->bi_snapshot, + .type = INODE_STR_HASH(bi), + .cf_encoding = bch2_inode_casefold(c, bi) ? 
c->cf_encoding : NULL, + .siphash_key = { .k0 = bi->bi_hash_seed } }; if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { @@ -159,8 +159,11 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans, struct bkey_s_c k; int ret; - for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, - SPOS(inum.inum, desc.hash_key(info, key), snapshot), + bch2_trans_iter_init(trans, iter, + desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), + BTREE_ITER_slots|flags); + + for_each_btree_key_max_continue_norestart(*iter, POS(inum.inum, U64_MAX), BTREE_ITER_slots|flags, k, ret) { if (is_visible_key(desc, inum, k)) { @@ -173,9 +176,9 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans, break; } } - bch2_trans_iter_exit(trans, iter); + bch2_trans_iter_exit(iter); - return bkey_s_c_err(ret ?: -BCH_ERR_ENOENT_str_hash_lookup); + return bkey_s_c_err(ret ?: bch_err_throw(trans->c, ENOENT_str_hash_lookup)); } static __always_inline struct bkey_s_c @@ -209,15 +212,18 @@ bch2_hash_hole(struct btree_trans *trans, if (ret) return ret; - for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, - SPOS(inum.inum, desc.hash_key(info, key), snapshot), + bch2_trans_iter_init(trans, iter, desc.btree_id, + SPOS(inum.inum, desc.hash_key(info, key), snapshot), + BTREE_ITER_slots|BTREE_ITER_intent); + + for_each_btree_key_max_continue_norestart(*iter, POS(inum.inum, U64_MAX), BTREE_ITER_slots|BTREE_ITER_intent, k, ret) if (!is_visible_key(desc, inum, k)) return 0; - bch2_trans_iter_exit(trans, iter); + bch2_trans_iter_exit(iter); - return ret ?: -BCH_ERR_ENOSPC_str_hash_create; + return ret ?: bch_err_throw(trans->c, ENOSPC_str_hash_create); } static __always_inline @@ -230,11 +236,11 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, struct bkey_s_c k; int ret; - bch2_trans_copy_iter(trans, &iter, start); + bch2_trans_copy_iter(&iter, start); - bch2_btree_iter_advance(trans, &iter); + bch2_btree_iter_advance(&iter); - for_each_btree_key_continue_norestart(trans, iter, BTREE_ITER_slots, k, ret) { + for_each_btree_key_continue_norestart(iter, BTREE_ITER_slots, k, ret) { if (k.k->type != desc.key_type && k.k->type != KEY_TYPE_hash_whiteout) break; @@ -246,7 +252,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, } } - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } @@ -259,15 +265,19 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, struct bkey_i *insert, enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; struct btree_iter slot = {}; struct bkey_s_c k; bool found = false; int ret; - for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, + bch2_trans_iter_init(trans, iter, desc.btree_id, SPOS(insert->k.p.inode, desc.hash_bkey(info, bkey_i_to_s_c(insert)), snapshot), + BTREE_ITER_slots|BTREE_ITER_intent|flags); + + for_each_btree_key_max_continue_norestart(*iter, POS(insert->k.p.inode, U64_MAX), BTREE_ITER_slots|BTREE_ITER_intent|flags, k, ret) { if (is_visible_key(desc, inum, k)) { @@ -279,26 +289,26 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, } if (!slot.path && !(flags & STR_HASH_must_replace)) - bch2_trans_copy_iter(trans, &slot, iter); + bch2_trans_copy_iter(&slot, iter); if (k.k->type != KEY_TYPE_hash_whiteout) goto not_found; } if (!ret) - ret = -BCH_ERR_ENOSPC_str_hash_create; + ret = bch_err_throw(c, ENOSPC_str_hash_create); out: - bch2_trans_iter_exit(trans, &slot); - bch2_trans_iter_exit(trans, iter); + bch2_trans_iter_exit(&slot); + 
bch2_trans_iter_exit(iter); return ret ? bkey_s_c_err(ret) : bkey_s_c_null; found: found = true; not_found: if (found && (flags & STR_HASH_must_create)) { - bch2_trans_iter_exit(trans, &slot); + bch2_trans_iter_exit(&slot); return k; } else if (!found && (flags & STR_HASH_must_replace)) { - ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; + ret = bch_err_throw(c, ENOENT_str_hash_set_must_replace); } else { if (!found && slot.path) swap(*iter, slot); @@ -325,8 +335,8 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, if (ret) return ret; if (k.k) { - bch2_trans_iter_exit(trans, &iter); - return -BCH_ERR_EEXIST_str_hash_set; + bch2_trans_iter_exit(&iter); + return bch_err_throw(trans->c, EEXIST_str_hash_set); } return 0; @@ -388,22 +398,34 @@ int bch2_hash_delete(struct btree_trans *trans, return ret; ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } +int bch2_repair_inode_hash_info(struct btree_trans *, struct bch_inode_unpacked *); + struct snapshots_seen; +int bch2_str_hash_repair_key(struct btree_trans *, + struct snapshots_seen *, + const struct bch_hash_desc *, + struct bch_hash_info *, + struct btree_iter *, struct bkey_s_c, + struct btree_iter *, struct bkey_s_c, + bool *); + int __bch2_str_hash_check_key(struct btree_trans *, struct snapshots_seen *, const struct bch_hash_desc *, struct bch_hash_info *, - struct btree_iter *, struct bkey_s_c); + struct btree_iter *, struct bkey_s_c, + bool *); static inline int bch2_str_hash_check_key(struct btree_trans *trans, struct snapshots_seen *s, const struct bch_hash_desc *desc, struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k) + struct btree_iter *k_iter, struct bkey_s_c hash_k, + bool *updated_before_k_pos) { if (hash_k.k->type != desc->key_type) return 0; @@ -411,7 +433,8 @@ static inline int bch2_str_hash_check_key(struct btree_trans *trans, if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset)) return 0; - return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k); + return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k, + updated_before_k_pos); } #endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index d0209f7658bb..6023ae46ca72 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "btree_key_cache.h" #include "btree_update.h" +#include "enumerated_ref.h" #include "errcode.h" #include "error.h" #include "fs.h" @@ -14,6 +15,21 @@ static int bch2_subvolume_delete(struct btree_trans *, u32); +static int bch2_subvolume_missing(struct bch_fs *c, u32 subvolid) +{ + CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "missing subvolume %u", subvolid); + bool print = bch2_count_fsck_err(c, subvol_missing, &buf); + + int ret = bch2_run_explicit_recovery_pass(c, &buf, + BCH_RECOVERY_PASS_check_inodes, 0); + if (print) + bch2_print_str(c, KERN_ERR, buf.buf); + return ret; +} + static struct bpos subvolume_children_pos(struct bkey_s_c k) { if (k.k->type != KEY_TYPE_subvolume) @@ -30,144 +46,142 @@ static int check_subvol(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct bkey_s_c_subvolume subvol; - struct btree_iter subvol_children_iter = {}; + struct bch_subvolume subvol; struct bch_snapshot snapshot; - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); unsigned snapid; int ret = 0; if 
(k.k->type != KEY_TYPE_subvolume)
		return 0;

-	subvol = bkey_s_c_to_subvolume(k);
-	snapid = le32_to_cpu(subvol.v->snapshot);
+	bkey_val_copy(&subvol, bkey_s_c_to_subvolume(k));
+	snapid = le32_to_cpu(subvol.snapshot);

 	ret = bch2_snapshot_lookup(trans, snapid, &snapshot);
 	if (bch2_err_matches(ret, ENOENT))
-		return bch2_run_explicit_recovery_pass(c,
+		return bch2_run_print_explicit_recovery_pass(c,
					BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;
 	if (ret)
 		return ret;

-	if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
+	if (BCH_SUBVOLUME_UNLINKED(&subvol)) {
 		ret = bch2_subvolume_delete(trans, iter->pos.offset);
 		bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
-		return ret ?: -BCH_ERR_transaction_restart_nested;
+		return ret ?: bch_err_throw(c, transaction_restart_nested);
 	}

-	if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL &&
-			subvol.v->fs_path_parent,
+	if (fsck_err_on(k.k->p.offset == BCACHEFS_ROOT_SUBVOL &&
+			subvol.fs_path_parent,
			trans, subvol_root_fs_path_parent_nonzero,
			"root subvolume has nonzero fs_path_parent\n%s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
 		struct bkey_i_subvolume *n =
-			bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
+			bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
 		ret = PTR_ERR_OR_ZERO(n);
 		if (ret)
-			goto err;
+			return ret;

 		n->v.fs_path_parent = 0;
 	}

-	if (subvol.v->fs_path_parent) {
-		struct bpos pos = subvolume_children_pos(k);
-
-		struct bkey_s_c subvol_children_k =
-			bch2_bkey_get_iter(trans, &subvol_children_iter,
-					   BTREE_ID_subvolume_children, pos, 0);
+	if (subvol.fs_path_parent) {
+		CLASS(btree_iter, subvol_children_iter)(trans,
+				BTREE_ID_subvolume_children, subvolume_children_pos(k), 0);
+		struct bkey_s_c subvol_children_k = bch2_btree_iter_peek_slot(&subvol_children_iter);
 		ret = bkey_err(subvol_children_k);
 		if (ret)
-			goto err;
+			return ret;

 		if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set,
				trans, subvol_children_not_set,
				"subvolume not set in subvolume_children btree at %llu:%llu\n%s",
-				pos.inode, pos.offset,
+				subvol_children_iter.pos.inode, subvol_children_iter.pos.offset,
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-			ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true);
+			ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, subvol_children_iter.pos, true);
			if (ret)
-				goto err;
+				return ret;
		}
	}

 	struct bch_inode_unpacked inode;
 	ret = bch2_inode_find_by_inum_nowarn_trans(trans,
-				(subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) },
+				(subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.inode) },
				&inode);
 	if (!ret) {
-		if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
+		if (fsck_err_on(inode.bi_subvol != k.k->p.offset,
				trans, subvol_root_wrong_bi_subvol,
				"subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
				inode.bi_inum, inode.bi_snapshot,
-				inode.bi_subvol, subvol.k->p.offset)) {
-			inode.bi_subvol	= subvol.k->p.offset;
-			inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot);
+				inode.bi_subvol, k.k->p.offset)) {
+			inode.bi_subvol	= k.k->p.offset;
+			inode.bi_snapshot = le32_to_cpu(subvol.snapshot);
			ret = __bch2_fsck_write_inode(trans, &inode);
			if (ret)
-				goto err;
+				return ret;
		}
 	} else if (bch2_err_matches(ret, ENOENT)) {
 		if (fsck_err(trans, subvol_to_missing_root,
			     "subvolume %llu points to missing subvolume root %llu:%u",
-			     k.k->p.offset, le64_to_cpu(subvol.v->inode),
-			     le32_to_cpu(subvol.v->snapshot))) {
-			ret = bch2_subvolume_delete(trans, iter->pos.offset);
-			bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
-			ret = ret ?: -BCH_ERR_transaction_restart_nested;
-			goto err;
+			     k.k->p.offset, le64_to_cpu(subvol.inode),
+			     le32_to_cpu(subvol.snapshot))) {
+			/*
+			 * Recreate - any contents that are still disconnected
+			 * will then get reattached under lost+found
+			 */
+			bch2_inode_init_early(c, &inode);
+			bch2_inode_init_late(c, &inode, bch2_current_time(c),
+					     0, 0, S_IFDIR|0700, 0, NULL);
+			inode.bi_inum		= le64_to_cpu(subvol.inode);
+			inode.bi_snapshot	= le32_to_cpu(subvol.snapshot);
+			inode.bi_subvol		= k.k->p.offset;
+			inode.bi_parent_subvol	= le32_to_cpu(subvol.fs_path_parent);
+			ret = __bch2_fsck_write_inode(trans, &inode);
+			if (ret)
+				return ret;
		}
 	} else {
-		goto err;
+		return ret;
 	}

-	if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
-		u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
-		u32 snapshot_tree;
-		struct bch_snapshot_tree st;
-
-		rcu_read_lock();
-		snapshot_tree = snapshot_t(c, snapshot_root)->tree;
-		rcu_read_unlock();
+	if (!BCH_SUBVOLUME_SNAP(&subvol)) {
+		u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.snapshot));
+		u32 snapshot_tree = bch2_snapshot_tree(c, snapshot_root);
+		struct bch_snapshot_tree st;

 		ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st);

 		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
				"%s: snapshot tree %u not found", __func__, snapshot_tree);

 		if (ret)
-			goto err;
+			return ret;

-		if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset,
+		if (fsck_err_on(le32_to_cpu(st.master_subvol) != k.k->p.offset,
				trans, subvol_not_master_and_not_snapshot,
				"subvolume %llu is not set as snapshot but is not master subvolume",
				k.k->p.offset)) {
			struct bkey_i_subvolume *s =
-				bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
+				bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
			ret = PTR_ERR_OR_ZERO(s);
			if (ret)
-				goto err;
+				return ret;

			SET_BCH_SUBVOLUME_SNAP(&s->v, true);
		}
	}
-err:
 fsck_err:
-	bch2_trans_iter_exit(trans, &subvol_children_iter);
-	printbuf_exit(&buf);
	return ret;
 }

 int bch2_check_subvols(struct bch_fs *c)
 {
-	int ret = bch2_trans_run(c,
-		for_each_btree_key_commit(trans, iter,
+	CLASS(btree_trans, trans)(c);
+	return for_each_btree_key_commit(trans, iter,
			BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			check_subvol(trans, &iter, k)));
-	bch_err_fn(c, ret);
-	return ret;
+			check_subvol(trans, &iter, k));
 }

 static int check_subvol_child(struct btree_trans *trans,
@@ -196,13 +210,11 @@ static int check_subvol_child(struct btree_trans *trans,

 int bch2_check_subvol_children(struct bch_fs *c)
 {
-	int ret = bch2_trans_run(c,
-		for_each_btree_key_commit(trans, iter,
+	CLASS(btree_trans, trans)(c);
+	return for_each_btree_key_commit(trans, iter,
			BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			check_subvol_child(trans, &iter, k)));
-	bch_err_fn(c, ret);
-	return 0;
+			check_subvol_child(trans, &iter, k));
 }

 /* Subvolumes: */
@@ -242,6 +254,13 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
 		prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent));
 		prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent));
 	}
+
+	if (BCH_SUBVOLUME_RO(s.v))
+		prt_printf(out, " ro");
+	if (BCH_SUBVOLUME_SNAP(s.v))
+		prt_printf(out, " snapshot");
+	if (BCH_SUBVOLUME_UNLINKED(s.v))
+		prt_printf(out, " unlinked");
 }

 static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set)
@@ -273,14 +292,11 @@ int bch2_subvolume_trigger(struct btree_trans *trans,

 int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol)
 {
-	struct btree_iter iter;
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0);
-	struct bkey_s_c k = bch2_btree_iter_peek(trans, &iter);
-	bch2_trans_iter_exit(trans, &iter);
+	CLASS(btree_iter, iter)(trans, BTREE_ID_subvolume_children, POS(subvol, 0), 0);
+	struct bkey_s_c k = bch2_btree_iter_peek(&iter);

 	return bkey_err(k) ?: k.k && k.k->p.inode == subvol
-		? -BCH_ERR_ENOTEMPTY_subvol_not_empty
+		? bch_err_throw(trans->c, ENOTEMPTY_subvol_not_empty)
		: 0;
 }

@@ -292,9 +308,8 @@ bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
 	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol),
					  BTREE_ITER_cached|
					  BTREE_ITER_with_updates, subvolume, s);
-	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) &&
-				inconsistent_if_not_found,
-				trans->c, "missing subvolume %u", subvol);
+	if (bch2_err_matches(ret, ENOENT) && inconsistent_if_not_found)
+		ret = bch2_subvolume_missing(trans->c, subvol) ?: ret;
	return ret;
 }

@@ -319,7 +334,8 @@ int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol)

 int bch2_subvol_is_ro(struct bch_fs *c, u32 subvol)
 {
-	return bch2_trans_do(c, bch2_subvol_is_ro_trans(trans, subvol));
+	CLASS(btree_trans, trans)(c);
+	return lockrestart_do(trans, bch2_subvol_is_ro_trans(trans, subvol));
 }

 int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
@@ -334,22 +350,16 @@ int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
 int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
				  u32 *snapid, bool warn)
 {
-	struct btree_iter iter;
-	struct bkey_s_c_subvolume subvol;
-	int ret;
-
-	subvol = bch2_bkey_get_iter_typed(trans, &iter,
-					  BTREE_ID_subvolumes, POS(0, subvolid),
-					  BTREE_ITER_cached|BTREE_ITER_with_updates,
-					  subvolume);
-	ret = bkey_err(subvol);
+	CLASS(btree_iter, iter)(trans, BTREE_ID_subvolumes, POS(0, subvolid),
+				BTREE_ITER_cached|BTREE_ITER_with_updates);
+	struct bkey_s_c_subvolume subvol = bch2_bkey_get_typed(&iter, subvolume);
+	int ret = bkey_err(subvol);

-	bch2_fs_inconsistent_on(warn && bch2_err_matches(ret, ENOENT), trans->c,
-				"missing subvolume %u", subvolid);
+	if (bch2_err_matches(ret, ENOENT))
+		ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;

 	if (likely(!ret))
		*snapid = le32_to_cpu(subvol.v->snapshot);
-	bch2_trans_iter_exit(trans, &iter);
	return ret;
 }

@@ -410,42 +420,35 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
  */
 static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
 {
-	struct btree_iter subvol_iter = {}, snapshot_iter = {}, snapshot_tree_iter = {};
-
-	struct bkey_s_c_subvolume subvol =
-		bch2_bkey_get_iter_typed(trans, &subvol_iter,
-				BTREE_ID_subvolumes, POS(0, subvolid),
-				BTREE_ITER_cached|BTREE_ITER_intent,
-				subvolume);
+	CLASS(btree_iter, subvol_iter)(trans, BTREE_ID_subvolumes, POS(0, subvolid),
+				       BTREE_ITER_cached|BTREE_ITER_intent);
+	struct bkey_s_c_subvolume subvol = bch2_bkey_get_typed(&subvol_iter, subvolume);
	int ret = bkey_err(subvol);
-	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
-				"missing subvolume %u", subvolid);
+	if (bch2_err_matches(ret, ENOENT))
+		ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
	if (ret)
-		goto err;
+		return ret;

	u32 snapid = le32_to_cpu(subvol.v->snapshot);

-	struct bkey_s_c_snapshot snapshot =
-		bch2_bkey_get_iter_typed(trans, &snapshot_iter,
-					 BTREE_ID_snapshots, POS(0, snapid),
-					 0, snapshot);
+	CLASS(btree_iter, snapshot_iter)(trans, BTREE_ID_snapshots, POS(0, snapid), 0);
+	struct bkey_s_c_snapshot snapshot = bch2_bkey_get_typed(&snapshot_iter, snapshot);
	ret = bkey_err(snapshot);
	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
				"missing snapshot %u", snapid);
	if (ret)
-		goto err;
+		return ret;

	u32 treeid = le32_to_cpu(snapshot.v->tree);

+	CLASS(btree_iter, snapshot_tree_iter)(trans, BTREE_ID_snapshot_trees, POS(0, treeid), 0);
	struct bkey_s_c_snapshot_tree snapshot_tree =
-		bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter,
-				BTREE_ID_snapshot_trees, POS(0, treeid),
-				0, snapshot_tree);
+		bch2_bkey_get_typed(&snapshot_tree_iter, snapshot_tree);
	ret = bkey_err(snapshot_tree);
	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
				"missing snapshot tree %u", treeid);
	if (ret)
-		goto err;
+		return ret;

	if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) {
		struct bkey_i_snapshot_tree *snapshot_tree_mut =
@@ -454,48 +457,48 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
				0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(snapshot_tree_mut);
		if (ret)
-			goto err;
+			return ret;

		snapshot_tree_mut->v.master_subvol = 0;
	}

-	ret = bch2_btree_delete_at(trans, &subvol_iter, 0) ?:
+	return bch2_btree_delete_at(trans, &subvol_iter, 0) ?:
		bch2_snapshot_node_set_deleted(trans, snapid);
-err:
-	bch2_trans_iter_exit(trans, &snapshot_tree_iter);
-	bch2_trans_iter_exit(trans, &snapshot_iter);
-	bch2_trans_iter_exit(trans, &subvol_iter);
-	return ret;
 }

 static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
 {
-	return bch2_subvolumes_reparent(trans, subvolid) ?:
+	int ret = bch2_subvolumes_reparent(trans, subvolid) ?:
		commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			  __bch2_subvolume_delete(trans, subvolid));
+
+	bch2_recovery_pass_set_no_ratelimit(trans->c, BCH_RECOVERY_PASS_check_subvols);
+	return ret;
 }

 static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
 {
	struct bch_fs *c = container_of(work, struct bch_fs,
				snapshot_wait_for_pagecache_and_delete_work);
-	snapshot_id_list s;
-	u32 *id;
	int ret = 0;

	while (!ret) {
-		mutex_lock(&c->snapshots_unlinked_lock);
-		s = c->snapshots_unlinked;
-		darray_init(&c->snapshots_unlinked);
-		mutex_unlock(&c->snapshots_unlinked_lock);
+		snapshot_id_list s;
+
+		scoped_guard(mutex, &c->snapshots_unlinked_lock) {
+			s = c->snapshots_unlinked;
+			darray_init(&c->snapshots_unlinked);
+		}

		if (!s.nr)
			break;

		bch2_evict_subvolume_inodes(c, &s);

-		for (id = s.data; id < s.data + s.nr; id++) {
-			ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
+		CLASS(btree_trans, trans)(c);
+
+		darray_for_each(s, id) {
+			ret = bch2_subvolume_delete(trans, *id);
			bch_err_msg(c, ret, "deleting subvolume %u", *id);
			if (ret)
				break;
@@ -504,7 +507,7 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor
		darray_exit(&s);
	}

-	bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
+	enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache);
 }

 struct subvolume_unlink_hook {
@@ -519,31 +522,25 @@ static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans
	struct bch_fs *c = trans->c;
	int ret = 0;

-	mutex_lock(&c->snapshots_unlinked_lock);
-	if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
-		ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol);
-	mutex_unlock(&c->snapshots_unlinked_lock);
+	scoped_guard(mutex, &c->snapshots_unlinked_lock)
+		if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
+			ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol);

	if (ret)
		return ret;

-	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache))
+	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache))
		return -EROFS;

	if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
-		bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
+		enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache);
	return 0;
 }

 int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
 {
-	struct btree_iter iter;
-	struct bkey_i_subvolume *n;
-	struct subvolume_unlink_hook *h;
-	int ret = 0;
-
-	h = bch2_trans_kmalloc(trans, sizeof(*h));
-	ret = PTR_ERR_OR_ZERO(h);
+	struct subvolume_unlink_hook *h = bch2_trans_kmalloc(trans, sizeof(*h));
+	int ret = PTR_ERR_OR_ZERO(h);
	if (ret)
		return ret;

@@ -551,19 +548,17 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
	h->subvol = subvolid;
	bch2_trans_commit_hook(trans, &h->h);

-	n = bch2_bkey_get_mut_typed(trans, &iter,
-			BTREE_ID_subvolumes, POS(0, subvolid),
-			BTREE_ITER_cached, subvolume);
+	struct bkey_i_subvolume *n =
+		bch2_bkey_get_mut_typed(trans, BTREE_ID_subvolumes, POS(0, subvolid),
+					BTREE_ITER_cached, subvolume);
	ret = PTR_ERR_OR_ZERO(n);
-	if (unlikely(ret)) {
-		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
-					"missing subvolume %u", subvolid);
+	if (bch2_err_matches(ret, ENOENT))
+		ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
+	if (unlikely(ret))
		return ret;
-	}

	SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
	n->v.fs_path_parent = 0;
-	bch2_trans_iter_exit(trans, &iter);
	return ret;
 }

@@ -575,7 +570,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
			  bool ro)
 {
	struct bch_fs *c = trans->c;
-	struct btree_iter dst_iter, src_iter = {};
+	struct btree_iter dst_iter;
	struct bkey_i_subvolume *new_subvol = NULL;
	struct bkey_i_subvolume *src_subvol = NULL;
	u32 parent = 0, new_nodes[2], snapshot_subvols[2];
@@ -584,7 +579,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
	ret = bch2_bkey_get_empty_slot(trans, &dst_iter,
				BTREE_ID_subvolumes, POS(0, U32_MAX));
	if (ret == -BCH_ERR_ENOSPC_btree_slot)
-		ret = -BCH_ERR_ENOSPC_subvolume_create;
+		ret = bch_err_throw(c, ENOSPC_subvolume_create);
	if (ret)
		return ret;

@@ -594,15 +589,13 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
	if (src_subvolid) {
		/* Creating a snapshot: */

-		src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter,
-				BTREE_ID_subvolumes, POS(0, src_subvolid),
-				BTREE_ITER_cached, subvolume);
+		src_subvol = bch2_bkey_get_mut_typed(trans, BTREE_ID_subvolumes, POS(0, src_subvolid),
+						     BTREE_ITER_cached, subvolume);
		ret = PTR_ERR_OR_ZERO(src_subvol);
-		if (unlikely(ret)) {
-			bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
-						"subvolume %u not found", src_subvolid);
+		if (bch2_err_matches(ret, ENOENT))
+			ret = bch2_subvolume_missing(trans->c, src_subvolid) ?: ret;
+		if (unlikely(ret))
			goto err;
-		}

		parent = le32_to_cpu(src_subvol->v.snapshot);
	}

@@ -613,12 +606,8 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
	if (ret)
		goto err;

-	if (src_subvolid) {
+	if (src_subvolid)
		src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
-		ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
-		if (ret)
-			goto err;
-	}

	new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume);
	ret = PTR_ERR_OR_ZERO(new_subvol);
@@ -639,8 +628,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
	*new_subvolid	= new_subvol->k.p.offset;
	*new_snapshotid	= new_nodes[0];
 err:
-	bch2_trans_iter_exit(trans, &src_iter);
-	bch2_trans_iter_exit(trans, &dst_iter);
+	bch2_trans_iter_exit(&dst_iter);
	return ret;
 }

@@ -649,7 +637,6 @@ int bch2_initialize_subvolumes(struct bch_fs *c)
	struct bkey_i_snapshot_tree	root_tree;
	struct bkey_i_snapshot		root_snapshot;
	struct bkey_i_subvolume		root_volume;
-	int ret;

	bkey_snapshot_tree_init(&root_tree.k_i);
	root_tree.k.p.offset = 1;
@@ -670,57 +657,44 @@ int bch2_initialize_subvolumes(struct bch_fs *c)
	root_volume.v.snapshot	= cpu_to_le32(U32_MAX);
	root_volume.v.inode	= cpu_to_le64(BCACHEFS_ROOT_INO);

-	ret =   bch2_btree_insert(c, BTREE_ID_snapshot_trees,	&root_tree.k_i, NULL, 0, 0) ?:
+	return  bch2_btree_insert(c, BTREE_ID_snapshot_trees,	&root_tree.k_i, NULL, 0, 0) ?:
		bch2_btree_insert(c, BTREE_ID_snapshots,	&root_snapshot.k_i, NULL, 0, 0) ?:
		bch2_btree_insert(c, BTREE_ID_subvolumes,	&root_volume.k_i, NULL, 0, 0);
-	bch_err_fn(c, ret);
-	return ret;
 }

 static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
 {
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	struct bch_inode_unpacked inode;
-	int ret;
-
-	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
-			       SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
-	ret = bkey_err(k);
+	CLASS(btree_iter, iter)(trans, BTREE_ID_inodes, SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
+	struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
+	int ret = bkey_err(k);
	if (ret)
		return ret;

	if (!bkey_is_inode(k.k)) {
-		bch_err(trans->c, "root inode not found");
-		ret = -BCH_ERR_ENOENT_inode;
-		goto err;
+		struct bch_fs *c = trans->c;
+		bch_err(c, "root inode not found");
+		return bch_err_throw(c, ENOENT_inode);
	}

+	struct bch_inode_unpacked inode;
	ret = bch2_inode_unpack(k, &inode);
	BUG_ON(ret);

	inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;

-	ret = bch2_inode_write(trans, &iter, &inode);
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
+	return bch2_inode_write(trans, &iter, &inode);
 }

 /* set bi_subvol on root inode */
 int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
 {
-	int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-				__bch2_fs_upgrade_for_subvolumes(trans));
-	bch_err_fn(c, ret);
-	return ret;
+	CLASS(btree_trans, trans)(c);
+	return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			 __bch2_fs_upgrade_for_subvolumes(trans));
 }

-int bch2_fs_subvolumes_init(struct bch_fs *c)
+void bch2_fs_subvolumes_init_early(struct bch_fs *c)
 {
-	INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
	INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
		  bch2_subvolume_wait_for_pagecache_and_delete);
-	mutex_init(&c->snapshots_unlinked_lock);
-	return 0;
 }
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index f640c1e3d639..b6d7c1f4a256 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -33,59 +33,52 @@ int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
 int bch2_subvol_is_ro(struct bch_fs *, u32);

 static inline struct bkey_s_c
-bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btree_iter *iter,
-					   struct bpos end, u32 subvolid, unsigned flags)
+bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos end,
+					   u32 subvolid, unsigned flags)
 {
	u32 snapshot;
-	int ret = bch2_subvolume_get_snapshot(trans, subvolid, &snapshot);
+	int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot);
	if (ret)
		return bkey_s_c_err(ret);

-	bch2_btree_iter_set_snapshot(trans, iter, snapshot);
-	return bch2_btree_iter_peek_max_type(trans, iter, end, flags);
+	bch2_btree_iter_set_snapshot(iter, snapshot);
+	return bch2_btree_iter_peek_max_type(iter, end, flags);
 }

 #define for_each_btree_key_in_subvolume_max_continue(_trans, _iter,		\
				_end, _subvolid, _flags, _k, _do)		\
 ({										\
-	struct bkey_s_c _k;							\
	int _ret3 = 0;								\
										\
	do {									\
		_ret3 = lockrestart_do(_trans, ({				\
-			(_k) = bch2_btree_iter_peek_in_subvolume_max_type(trans, &(_iter),\
+			struct bkey_s_c _k = bch2_btree_iter_peek_in_subvolume_max_type(&(_iter),\
						_end, _subvolid, (_flags));	\
			if (!(_k).k)						\
				break;						\
										\
			bkey_err(_k) ?: (_do);					\
		}));								\
-	} while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter)));		\
+	} while (!_ret3 && bch2_btree_iter_advance(&(_iter)));			\
										\
-	bch2_trans_iter_exit((_trans), &(_iter));				\
	_ret3;									\
 })

 #define for_each_btree_key_in_subvolume_max(_trans, _iter, _btree_id,		\
				_start, _end, _subvolid, _flags, _k, _do)	\
 ({										\
-	struct btree_iter _iter;						\
-	bch2_trans_iter_init((_trans), &(_iter), (_btree_id),			\
-			     (_start), (_flags));				\
+	CLASS(btree_iter, _iter)((_trans), (_btree_id), (_start), (_flags));	\
										\
	for_each_btree_key_in_subvolume_max_continue(_trans, _iter,		\
				_end, _subvolid, _flags, _k, _do);		\
 })

-int bch2_delete_dead_snapshots(struct bch_fs *);
-void bch2_delete_dead_snapshots_async(struct bch_fs *);
-
 int bch2_subvolume_unlink(struct btree_trans *, u32);
 int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool);

 int bch2_initialize_subvolumes(struct bch_fs *);
 int bch2_fs_upgrade_for_subvolumes(struct bch_fs *);

-int bch2_fs_subvolumes_init(struct bch_fs *);
+void bch2_fs_subvolumes_init_early(struct bch_fs *);

 #endif /* _BCACHEFS_SUBVOLUME_H */
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
index 1549d6daf7af..9d634b906dcd 100644
--- a/fs/bcachefs/subvolume_types.h
+++ b/fs/bcachefs/subvolume_types.h
@@ -2,33 +2,6 @@
 #ifndef _BCACHEFS_SUBVOLUME_TYPES_H
 #define _BCACHEFS_SUBVOLUME_TYPES_H

-#include "darray.h"
-
-typedef DARRAY(u32) snapshot_id_list;
-
-#define IS_ANCESTOR_BITMAP	128
-
-struct snapshot_t {
-	bool			live;
-	u32			parent;
-	u32			skip[3];
-	u32			depth;
-	u32			children[2];
-	u32			subvol; /* Nonzero only if a subvolume points to this node: */
-	u32			tree;
-	unsigned long		is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)];
-};
-
-struct snapshot_table {
-	struct rcu_head		rcu;
-	size_t			nr;
-#ifndef RUST_BINDGEN
-	DECLARE_FLEX_ARRAY(struct snapshot_t, s);
-#else
-	struct snapshot_t	s[0];
-#endif
-};
-
 typedef struct {
	/* we can't have padding in this struct: */
	u64		subvol;
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index cb5d960aed92..be7ed612d28f 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -68,36 +68,35 @@ enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_meta

 int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
 {
-	int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) &&
-		   version <= c->sb.version_incompat_allowed)
-		? 0
-		: -BCH_ERR_may_not_use_incompat_feature;
-
-	mutex_lock(&c->sb_lock);
-	if (!ret) {
-		SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
-			max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
-		bch2_write_super(c);
+	if (((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) &&
+	     version <= c->sb.version_incompat_allowed)) {
+		guard(mutex)(&c->sb_lock);
+
+		if (version > c->sb.version_incompat) {
+			SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
+				max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
+			bch2_write_super(c);
+		}
+		return 0;
	} else {
-		darray_for_each(c->incompat_versions_requested, i)
-			if (version == *i)
-				goto out;
-
-		darray_push(&c->incompat_versions_requested, version);
-		struct printbuf buf = PRINTBUF;
-		prt_str(&buf, "requested incompat feature ");
-		bch2_version_to_text(&buf, version);
-		prt_str(&buf, " currently not enabled");
-		prt_printf(&buf, "\n  set version_upgrade=incompat to enable");
-
-		bch_notice(c, "%s", buf.buf);
-		printbuf_exit(&buf);
-	}
+		BUILD_BUG_ON(BCH_VERSION_MAJOR(bcachefs_metadata_version_current) != 1);

-out:
-	mutex_unlock(&c->sb_lock);
+		unsigned minor = BCH_VERSION_MINOR(version);

-	return ret;
+		if (!test_bit(minor, c->incompat_versions_requested) &&
+		    !test_and_set_bit(minor, c->incompat_versions_requested)) {
+			CLASS(printbuf, buf)();
+			prt_str(&buf, "requested incompat feature ");
+			bch2_version_to_text(&buf, version);
+			prt_str(&buf, " currently not enabled, allowed up to ");
+			bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
+			prt_printf(&buf, "\n  set version_upgrade=incompat to enable");
+
+			bch_notice(c, "%s", buf.buf);
+		}
+
+		return bch_err_throw(c, may_not_use_incompat_feature);
+	}
 }

 const char * const bch2_sb_fields[] = {
@@ -202,12 +201,11 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
		u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;

		if (new_bytes > max_bytes) {
-			struct printbuf buf = PRINTBUF;
+			CLASS(printbuf, buf)();

			prt_bdevname(&buf, sb->bdev);
			prt_printf(&buf, ": superblock too big: want %zu but have %llu", new_bytes, max_bytes);
			pr_err("%s", buf.buf);
-			printbuf_exit(&buf);
			return -BCH_ERR_ENOSPC_sb;
		}
	}
@@ -260,11 +258,11 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,

		/* XXX: we're not checking that offline device have enough space */

-		for_each_online_member(c, ca) {
+		for_each_online_member(c, ca, BCH_DEV_READ_REF_sb_field_resize) {
			struct bch_sb_handle *dev_sb = &ca->disk_sb;

			if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
-				percpu_ref_put(&ca->io_ref[READ]);
+				enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_sb_field_resize);
				return NULL;
			}
		}
@@ -384,7 +382,6 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
 int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
		     enum bch_validate_flags flags, struct printbuf *out)
 {
-	struct bch_sb_field_members_v1 *mi;
	enum bch_opt_id opt_id;
	int ret;

@@ -468,6 +465,9 @@ int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
		SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb));
	}

+	if (sb->nr_devices > 1)
+		SET_BCH_SB_MULTI_DEVICE(sb, true);
+
	if (!flags) {
		/*
		 * Been seeing a bug where these are getting inexplicably
@@ -536,14 +536,17 @@ int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
		}
	}

+	struct bch_sb_field *mi =
+		bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v2) ?:
+		bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v1);
+
	/* members must be validated first: */
-	mi = bch2_sb_field_get(sb, members_v1);
	if (!mi) {
		prt_printf(out, "Invalid superblock: member info area missing");
		return -BCH_ERR_invalid_sb_members_missing;
	}

-	ret = bch2_sb_field_validate(sb, &mi->field, flags, out);
+	ret = bch2_sb_field_validate(sb, mi, flags, out);
	if (ret)
		return ret;

@@ -612,20 +615,21 @@ static void bch2_sb_update(struct bch_fs *c)
	c->sb.features		= le64_to_cpu(src->features[0]);
	c->sb.compat		= le64_to_cpu(src->compat[0]);
+	c->sb.multi_device	= BCH_SB_MULTI_DEVICE(src);

	memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent));

	struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext);
	if (ext) {
+		c->sb.recovery_passes_required =
+			bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+
		le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
				    sizeof(c->sb.errors_silent) * 8);
		c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);
	}

-	for_each_member_device(c, ca) {
-		struct bch_member m = bch2_sb_member_get(src, ca->dev_idx);
-		ca->mi = bch2_mi_to_cpu(&m);
-	}
+	bch2_sb_members_to_cpu(c);
 }

 static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
@@ -776,8 +780,8 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts,
 {
	u64 offset = opt_get(*opts, sb);
	struct bch_sb_layout layout;
-	struct printbuf err = PRINTBUF;
-	struct printbuf err2 = PRINTBUF;
+	CLASS(printbuf, err)();
+	CLASS(printbuf, err2)();
	__le64 *i;
	int ret;
 #ifndef __KERNEL__
@@ -852,7 +856,6 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts,
		else
			bch2_print_opts(opts, KERN_ERR "%s", err2.buf);

-		printbuf_exit(&err2);
		printbuf_reset(&err);

		/*
@@ -918,15 +921,14 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts,
		       path, err.buf);
		goto err_no_print;
	}
-out:
-	printbuf_exit(&err);
-	return ret;
+
+	return 0;
 err:
	bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
			path, err.buf);
 err_no_print:
	bch2_free_super(sb);
-	goto out;
+	return ret;
 }

 int bch2_read_super(const char *path, struct bch_opts *opts,
@@ -961,7 +963,7 @@ static void write_super_endio(struct bio *bio)
	}

	closure_put(&ca->fs->sb_write);
-	percpu_ref_put(&ca->io_ref[READ]);
+	enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
 }

 static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
@@ -979,7 +981,7 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca)

	this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio));

-	percpu_ref_get(&ca->io_ref[READ]);
+	enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
	closure_bio_submit(bio, &c->sb_write);
 }

@@ -994,7 +996,12 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
	sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
				null_nonce(), sb);

-	bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
+	/*
+	 * blk-wbt.c throttles all writes except those that have both REQ_SYNC
+	 * and REQ_IDLE set...
+	 */
+
+	bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_IDLE|REQ_META);
	bio->bi_iter.bi_sector	= le64_to_cpu(sb->offset);
	bio->bi_end_io		= write_super_endio;
	bio->bi_private		= ca;
@@ -1005,14 +1012,14 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
	this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb],
		     bio_sectors(bio));

-	percpu_ref_get(&ca->io_ref[READ]);
+	enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
	closure_bio_submit(bio, &c->sb_write);
 }

 int bch2_write_super(struct bch_fs *c)
 {
	struct closure *cl = &c->sb_write;
-	struct printbuf err = PRINTBUF;
+	CLASS(printbuf, err)();
	unsigned sb = 0, nr_wrote;
	struct bch_devs_mask sb_written;
	bool wrote, can_mount_without_written, can_mount_with_written;
@@ -1022,7 +1029,7 @@ int bch2_write_super(struct bch_fs *c)

	trace_and_count(c, write_super, c, _RET_IP_);

-	if (c->opts.very_degraded)
+	if (c->opts.degraded == BCH_DEGRADED_very)
		degraded_flags |= BCH_FORCE_IF_LOST;

	lockdep_assert_held(&c->sb_lock);
@@ -1037,13 +1044,13 @@ int bch2_write_super(struct bch_fs *c)
	 * For now, we expect to be able to call write_super() when we're not
	 * yet RW:
	 */
-	for_each_online_member(c, ca) {
+	for_each_online_member(c, ca, BCH_DEV_READ_REF_write_super) {
		ret = darray_push(&online_devices, ca);
		if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) {
-			percpu_ref_put(&ca->io_ref[READ]);
+			enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
			goto out;
		}
-		percpu_ref_get(&ca->io_ref[READ]);
+		enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
	}

	/* Make sure we're using the new magic numbers: */
@@ -1094,15 +1101,14 @@ int bch2_write_super(struct bch_fs *c)
		goto out;

	if (le16_to_cpu(c->disk_sb.sb->version) > bcachefs_metadata_version_current) {
-		struct printbuf buf = PRINTBUF;
+		CLASS(printbuf, buf)();
		prt_printf(&buf, "attempting to write superblock that wasn't version downgraded (");
		bch2_version_to_text(&buf, le16_to_cpu(c->disk_sb.sb->version));
		prt_str(&buf, " > ");
		bch2_version_to_text(&buf, bcachefs_metadata_version_current);
		prt_str(&buf, ")");
		bch2_fs_fatal_error(c, ": %s", buf.buf);
-		printbuf_exit(&buf);
-		ret = -BCH_ERR_sb_not_downgraded;
+		ret = bch_err_throw(c, sb_not_downgraded);
		goto out;
	}

@@ -1122,7 +1128,7 @@ int bch2_write_super(struct bch_fs *c)
			continue;

		if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
-			struct printbuf buf = PRINTBUF;
+			CLASS(printbuf, buf)();
			prt_char(&buf, ' ');
			prt_bdevname(&buf, ca->disk_sb.bdev);
			prt_printf(&buf,
@@ -1132,17 +1138,15 @@ int bch2_write_super(struct bch_fs *c)

			if (c->opts.errors != BCH_ON_ERROR_continue &&
			    c->opts.errors != BCH_ON_ERROR_fix_safe) {
-				ret = -BCH_ERR_erofs_sb_err;
+				ret = bch_err_throw(c, erofs_sb_err);
				bch2_fs_fatal_error(c, "%s", buf.buf);
			} else {
				bch_err(c, "%s", buf.buf);
			}
-
-			printbuf_exit(&buf);
		}

		if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
-			struct printbuf buf = PRINTBUF;
+			CLASS(printbuf, buf)();
			prt_char(&buf, ' ');
			prt_bdevname(&buf, ca->disk_sb.bdev);
			prt_printf(&buf,
@@ -1150,8 +1154,7 @@ int bch2_write_super(struct bch_fs *c)
				   le64_to_cpu(ca->sb_read_scratch->seq),
				   ca->disk_sb.seq);
			bch2_fs_fatal_error(c, "%s", buf.buf);
-			printbuf_exit(&buf);
-			ret = -BCH_ERR_erofs_sb_err;
+			ret = bch_err_throw(c, erofs_sb_err);
		}
	}

@@ -1205,26 +1208,24 @@ int bch2_write_super(struct bch_fs *c)
			 !can_mount_with_written), c,
		": Unable to write superblock to sufficient devices (from %ps)",
		(void *) _RET_IP_))
-		ret = -BCH_ERR_erofs_sb_err;
+		ret = bch_err_throw(c, erofs_sb_err);
 out:
	/* Make new options visible after they're persistent: */
	bch2_sb_update(c);

	darray_for_each(online_devices, ca)
-		percpu_ref_put(&(*ca)->io_ref[READ]);
+		enumerated_ref_put(&(*ca)->io_ref[READ], BCH_DEV_READ_REF_write_super);
	darray_exit(&online_devices);

-	printbuf_exit(&err);
	return ret;
 }

 void __bch2_check_set_feature(struct bch_fs *c, unsigned feat)
 {
-	mutex_lock(&c->sb_lock);
-	if (!(c->sb.features & (1ULL << feat))) {
-		c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat);
+	guard(mutex)(&c->sb_lock);
+	if (!(c->sb.features & BIT_ULL(feat))) {
+		c->disk_sb.sb->features[0] |= cpu_to_le64(BIT_ULL(feat));

		bch2_write_super(c);
	}
-	mutex_unlock(&c->sb_lock);
 }

 /* Downgrade if superblock is at a higher version than currently supported: */
@@ -1270,6 +1271,29 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat)
	}
 }

+void bch2_sb_upgrade_incompat(struct bch_fs *c)
+{
+	guard(mutex)(&c->sb_lock);
+
+	if (c->sb.version == c->sb.version_incompat_allowed)
+		return;
+
+	CLASS(printbuf, buf)();
+
+	prt_str(&buf, "Now allowing incompatible features up to ");
+	bch2_version_to_text(&buf, c->sb.version);
+	prt_str(&buf, ", previously allowed up to ");
+	bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
+	prt_newline(&buf);
+
+	bch_notice(c, "%s", buf.buf);
+
+	c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
+	SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
+			max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), c->sb.version));
+	bch2_write_super(c);
+}
+
 static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f,
				enum bch_validate_flags flags, struct printbuf *err)
 {
@@ -1333,7 +1357,7 @@ static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
				  enum bch_validate_flags flags, struct printbuf *err)
 {
	unsigned type = le32_to_cpu(f->type);
-	struct printbuf field_err = PRINTBUF;
+	CLASS(printbuf, field_err)();
	const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
	int ret;

@@ -1345,7 +1369,6 @@ static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
		bch2_sb_field_to_text(err, sb, f);
	}

-	printbuf_exit(&field_err);
	return ret;
 }

diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index 78f708a6fbcd..a3b7a90f2533 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -107,6 +107,7 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)

 bool bch2_check_version_downgrade(struct bch_fs *);
 void bch2_sb_upgrade(struct bch_fs *, unsigned, bool);
+void bch2_sb_upgrade_incompat(struct bch_fs *);

 void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
			     struct bch_sb_field *);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 84a37d971ffd..b0019488f586 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -10,6 +10,8 @@
 #include "bcachefs.h"
 #include "alloc_background.h"
 #include "alloc_foreground.h"
+#include "async_objs.h"
+#include "backpointers.h"
 #include "bkey_sort.h"
 #include "btree_cache.h"
 #include "btree_gc.h"
@@ -28,6 +30,7 @@
 #include "disk_accounting.h"
 #include "disk_groups.h"
 #include "ec.h"
+#include "enumerated_ref.h"
 #include "errcode.h"
 #include "error.h"
 #include "fs.h"
@@ -48,6 +51,7 @@
 #include "quota.h"
 #include "rebalance.h"
 #include "recovery.h"
+#include "recovery_passes.h"
 #include "replicas.h"
 #include "sb-clean.h"
 #include "sb-counters.h"
@@ -75,15 +79,56 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Kent Overstreet <kent.overstreet@linux.dev>");
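[Editor's note on the guard()/CLASS() conversions above: the patch replaces paired
mutex_lock()/mutex_unlock() and printbuf_exit() calls with the kernel's scope-based
cleanup helpers, so every early return unwinds automatically. A minimal userspace
sketch of the underlying cleanup-attribute mechanism — hypothetical example_* names,
not the real include/linux/cleanup.h implementation:]

	#include <pthread.h>

	static pthread_mutex_t example_lock = PTHREAD_MUTEX_INITIALIZER;

	/* cleanup callback: receives a pointer to the guarded variable */
	static void example_unlock(pthread_mutex_t **m)
	{
		pthread_mutex_unlock(*m);
	}

	/* lock now, unlock automatically at every exit from the scope */
	#define EXAMPLE_GUARD(m)					\
		pthread_mutex_t *_guard					\
		__attribute__((cleanup(example_unlock))) =		\
		({ pthread_mutex_lock(m); (m); })

	int protected_increment(int *counter)
	{
		EXAMPLE_GUARD(&example_lock);

		if (*counter < 0)
			return -1;	/* early return: unlock still runs */
		return ++(*counter);
	}

[This is why the goto-out/err unlock labels disappear throughout the diff.]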
 MODULE_DESCRIPTION("bcachefs filesystem");

-const char * const bch2_fs_flag_strs[] = {
+typedef DARRAY(struct bch_sb_handle) bch_sb_handles;
+
 #define x(n)		#n,
+const char * const bch2_fs_flag_strs[] = {
	BCH_FS_FLAGS()
-#undef x
	NULL
 };

-void bch2_print_str(struct bch_fs *c, const char *str)
+const char * const bch2_write_refs[] = {
+	BCH_WRITE_REFS()
+	NULL
+};
+
+const char * const bch2_dev_read_refs[] = {
+	BCH_DEV_READ_REFS()
+	NULL
+};
+
+const char * const bch2_dev_write_refs[] = {
+	BCH_DEV_WRITE_REFS()
+	NULL
+};
+#undef x
+
+static bool should_print_loglevel(struct bch_fs *c, const char *fmt)
+{
+	unsigned loglevel_opt = c->loglevel ?: c->opts.verbose ? 7 : 6;
+
+	bool have_soh	   = fmt[0] == KERN_SOH[0];
+	bool have_loglevel = have_soh && fmt[1] >= '0' && fmt[1] <= '9';
+
+	unsigned loglevel = have_loglevel
+		? fmt[1] - '0'
+		: c->prev_loglevel;
+
+	if (have_loglevel)
+		c->prev_loglevel = loglevel;
+
+	return loglevel <= loglevel_opt;
+}
+
+void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str)
 {
+	if (!should_print_loglevel(c, prefix))
+		return;
+
+#ifndef __KERNEL__
+	prefix = "";
+#endif
+
 #ifdef __KERNEL__
	struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);

@@ -92,7 +137,7 @@ void bch2_print_str(struct bch_fs *c, const char *str)
		return;
	}
 #endif
-	bch2_print_string_as_lines(KERN_ERR, str);
+	bch2_print_string_as_lines(prefix, str);
 }

 __printf(2, 0)
@@ -122,6 +167,14 @@ void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)

 void __bch2_print(struct bch_fs *c, const char *fmt, ...)
 {
+	if (!should_print_loglevel(c, fmt))
+		return;
+
+#ifndef __KERNEL__
+	if (fmt[0] == KERN_SOH[0])
+		fmt += 2;
+#endif
+
	struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);

	va_list args;
@@ -186,23 +239,17 @@ static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);

 struct bch_fs *bch2_dev_to_fs(dev_t dev)
 {
-	struct bch_fs *c;
-
-	mutex_lock(&bch_fs_list_lock);
-	rcu_read_lock();
+	guard(mutex)(&bch_fs_list_lock);
+	guard(rcu)();

+	struct bch_fs *c;
	list_for_each_entry(c, &bch_fs_list, list)
		for_each_member_device_rcu(c, ca, NULL)
			if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
				closure_get(&c->cl);
-				goto found;
+				return c;
			}
-	c = NULL;
-found:
-	rcu_read_unlock();
-	mutex_unlock(&bch_fs_list_lock);
-
-	return c;
+	return NULL;
 }

 static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
@@ -220,14 +267,11 @@ static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)

 struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
 {
-	struct bch_fs *c;
+	guard(mutex)(&bch_fs_list_lock);

-	mutex_lock(&bch_fs_list_lock);
-	c = __bch2_uuid_to_fs(uuid);
+	struct bch_fs *c = __bch2_uuid_to_fs(uuid);
	if (c)
		closure_get(&c->cl);
-	mutex_unlock(&bch_fs_list_lock);
-
	return c;
 }

@@ -297,15 +341,13 @@ static void __bch2_fs_read_only(struct bch_fs *c)
	}
 }

-#ifndef BCH_WRITE_REF_DEBUG
-static void bch2_writes_disabled(struct percpu_ref *writes)
+static void bch2_writes_disabled(struct enumerated_ref *writes)
 {
	struct bch_fs *c = container_of(writes, struct bch_fs, writes);

	set_bit(BCH_FS_write_disable_complete, &c->flags);
	wake_up(&bch2_read_only_wait);
 }
-#endif

 void bch2_fs_read_only(struct bch_fs *c)
 {
@@ -323,12 +365,7 @@ void bch2_fs_read_only(struct bch_fs *c)
	 * writes will return -EROFS:
	 */
	set_bit(BCH_FS_going_ro, &c->flags);
-#ifndef BCH_WRITE_REF_DEBUG
-	percpu_ref_kill(&c->writes);
-#else
-	for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
-		bch2_write_ref_put(c, i);
-#endif
+	enumerated_ref_stop_async(&c->writes);

	/*
	 * If we're not doing an emergency shutdown, we want to wait on
@@ -366,7 +403,7 @@ void bch2_fs_read_only(struct bch_fs *c)
	    !test_bit(BCH_FS_emergency_ro, &c->flags) &&
	    test_bit(BCH_FS_started, &c->flags) &&
	    test_bit(BCH_FS_clean_shutdown, &c->flags) &&
-	    c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) {
+	    c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) {
		BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
		BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));
		BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
@@ -378,9 +415,8 @@ void bch2_fs_read_only(struct bch_fs *c)
		bch2_fs_mark_clean(c);
	} else {
		/* Make sure error counts/counters are persisted */
-		mutex_lock(&c->sb_lock);
+		guard(mutex)(&c->sb_lock);
		bch2_write_super(c);
-		mutex_unlock(&c->sb_lock);

		bch_verbose(c, "done going read-only, filesystem not clean");
	}
@@ -391,9 +427,8 @@ static void bch2_fs_read_only_work(struct work_struct *work)
	struct bch_fs *c =
		container_of(work, struct bch_fs, read_only_work);

-	down_write(&c->state_lock);
+	guard(rwsem_write)(&c->state_lock);
	bch2_fs_read_only(c);
-	up_write(&c->state_lock);
 }

 static void bch2_fs_read_only_async(struct bch_fs *c)
@@ -412,6 +447,30 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c)
	return ret;
 }

+static bool __bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out,
+					   bool locked)
+{
+	bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
+
+	if (!locked)
+		bch2_journal_halt(&c->journal);
+	else
+		bch2_journal_halt_locked(&c->journal);
+	bch2_fs_read_only_async(c);
+	wake_up(&bch2_read_only_wait);
+
+	if (ret)
+		prt_printf(out, "emergency read only at seq %llu\n",
+			   journal_cur_seq(&c->journal));
+
+	return ret;
+}
+
+bool bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out)
+{
+	return __bch2_fs_emergency_read_only2(c, out, false);
+}
+
 bool bch2_fs_emergency_read_only_locked(struct bch_fs *c)
 {
	bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
@@ -429,9 +488,17 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)

	BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));

+	if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))
+		return bch_err_throw(c, erofs_no_alloc_info);
+
	if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
		bch_err(c, "cannot go rw, unfixed btree errors");
-		return -BCH_ERR_erofs_unfixed_errors;
+		return bch_err_throw(c, erofs_unfixed_errors);
+	}
+
+	if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
+		bch_err(c, "cannot go rw, filesystem is an unresized image file");
+		return bch_err_throw(c, erofs_filesystem_full);
	}

	if (test_bit(BCH_FS_rw, &c->flags))
@@ -439,16 +506,27 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)

	bch_info(c, "going read-write");

+	ret = bch2_fs_init_rw(c);
+	if (ret)
+		return ret;
+
	ret = bch2_sb_members_v2_init(c);
	if (ret)
-		goto err;
+		return ret;
+
+	ret = bch2_fs_mark_dirty(c);
+	if (ret)
+		return ret;

	clear_bit(BCH_FS_clean_shutdown, &c->flags);

-	__for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) {
-		bch2_dev_allocator_add(c, ca);
-		percpu_ref_reinit(&ca->io_ref[WRITE]);
-	}
+	scoped_guard(rcu)
+		for_each_online_member_rcu(c, ca)
+			if (ca->mi.state == BCH_MEMBER_STATE_rw) {
+				bch2_dev_allocator_add(c, ca);
+				enumerated_ref_start(&ca->io_ref[WRITE]);
+			}
+
	bch2_recalc_capacity(c);

	/*
@@ -457,15 +535,16 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
	 * overwriting whatever was there previously, and there must always be
	 * at least one non-flush write in the journal or recovery will fail:
	 */
-	spin_lock(&c->journal.lock);
-	set_bit(JOURNAL_need_flush_write, &c->journal.flags);
-	set_bit(JOURNAL_running, &c->journal.flags);
-	bch2_journal_space_available(&c->journal);
-	spin_unlock(&c->journal.lock);
+	scoped_guard(spinlock, &c->journal.lock) {
+		set_bit(JOURNAL_need_flush_write, &c->journal.flags);
+		set_bit(JOURNAL_running, &c->journal.flags);
+		bch2_journal_space_available(&c->journal);
+	}

-	ret = bch2_fs_mark_dirty(c);
-	if (ret)
-		goto err;
+	/*
+	 * Don't jump to our error path, and call bch2_fs_read_only(), unless we
+	 * successfully marked the filesystem dirty
+	 */

	ret = bch2_journal_reclaim_start(&c->journal);
	if (ret)
@@ -474,14 +553,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
	set_bit(BCH_FS_rw, &c->flags);
	set_bit(BCH_FS_was_rw, &c->flags);

-#ifndef BCH_WRITE_REF_DEBUG
-	percpu_ref_reinit(&c->writes);
-#else
-	for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) {
-		BUG_ON(atomic_long_read(&c->writes[i]));
-		atomic_long_inc(&c->writes[i]);
-	}
-#endif
+	enumerated_ref_start(&c->writes);

	ret = bch2_copygc_start(c);
	if (ret) {
@@ -512,21 +584,21 @@ int bch2_fs_read_write(struct bch_fs *c)
 {
	if (c->opts.recovery_pass_last &&
	    c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
-		return -BCH_ERR_erofs_norecovery;
+		return bch_err_throw(c, erofs_norecovery);

	if (c->opts.nochanges)
-		return -BCH_ERR_erofs_nochanges;
+		return bch_err_throw(c, erofs_nochanges);
+
+	if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
+		return bch_err_throw(c, erofs_no_alloc_info);

	return __bch2_fs_read_write(c, false);
 }

 int bch2_fs_read_write_early(struct bch_fs *c)
 {
-	down_write(&c->state_lock);
-	int ret = __bch2_fs_read_write(c, true);
-	up_write(&c->state_lock);
-
-	return ret;
+	guard(rwsem_write)(&c->state_lock);
+	return __bch2_fs_read_write(c, true);
 }

 /* Filesystem startup/shutdown: */

@@ -536,42 +608,44 @@ static void __bch2_fs_free(struct bch_fs *c)
	for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++)
		bch2_time_stats_exit(&c->times[i]);

-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
	utf8_unload(c->cf_encoding);
 #endif

	bch2_find_btree_nodes_exit(&c->found_btree_nodes);
	bch2_free_pending_node_rewrites(c);
	bch2_free_fsck_errs(c);
-	bch2_fs_accounting_exit(c);
-	bch2_fs_sb_errors_exit(c);
-	bch2_fs_counters_exit(c);
+	bch2_fs_vfs_exit(c);
	bch2_fs_snapshots_exit(c);
+	bch2_fs_sb_errors_exit(c);
+	bch2_fs_replicas_exit(c);
+	bch2_fs_rebalance_exit(c);
	bch2_fs_quota_exit(c);
+	bch2_fs_nocow_locking_exit(c);
+	bch2_fs_journal_exit(&c->journal);
	bch2_fs_fs_io_direct_exit(c);
	bch2_fs_fs_io_buffered_exit(c);
	bch2_fs_fsio_exit(c);
-	bch2_fs_vfs_exit(c);
-	bch2_fs_ec_exit(c);
-	bch2_fs_encryption_exit(c);
-	bch2_fs_nocow_locking_exit(c);
	bch2_fs_io_write_exit(c);
	bch2_fs_io_read_exit(c);
+	bch2_fs_encryption_exit(c);
+	bch2_fs_ec_exit(c);
+	bch2_fs_counters_exit(c);
+	bch2_fs_compress_exit(c);
+	bch2_io_clock_exit(&c->io_clock[WRITE]);
+	bch2_io_clock_exit(&c->io_clock[READ]);
	bch2_fs_buckets_waiting_for_journal_exit(c);
-	bch2_fs_btree_interior_update_exit(c);
+	bch2_fs_btree_write_buffer_exit(c);
	bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
-	bch2_fs_btree_cache_exit(c);
	bch2_fs_btree_iter_exit(c);
-	bch2_fs_replicas_exit(c);
-	bch2_fs_journal_exit(&c->journal);
-	bch2_io_clock_exit(&c->io_clock[WRITE]);
-	bch2_io_clock_exit(&c->io_clock[READ]);
-	bch2_fs_compress_exit(c);
-	bch2_fs_btree_gc_exit(c);
+	bch2_fs_btree_interior_update_exit(c);
+	bch2_fs_btree_cache_exit(c);
+	bch2_fs_accounting_exit(c);
+	bch2_fs_async_obj_exit(c);
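[Editor's note on the percpu_ref -> enumerated_ref conversions above: instead of an
anonymous refcount, every get/put names its user, so a stuck shutdown can report
which reference is still held. A toy model of the idea — illustrative only; the real
fs/bcachefs/enumerated_ref.c also supports percpu operation and stop/stop_async like
the percpu_ref it replaces:]

	#include <stdatomic.h>
	#include <stdio.h>

	enum example_ref { REF_write_super, REF_sb_field_resize, REF_NR };

	static const char * const example_ref_names[] = {
		"write_super", "sb_field_resize",
	};

	struct example_enumerated_ref {
		atomic_int count[REF_NR];	/* one counter per named user */
	};

	static void example_ref_get(struct example_enumerated_ref *r, enum example_ref i)
	{
		atomic_fetch_add(&r->count[i], 1);
	}

	static void example_ref_put(struct example_enumerated_ref *r, enum example_ref i)
	{
		atomic_fetch_sub(&r->count[i], 1);
	}

	static void example_ref_report_leaks(struct example_enumerated_ref *r)
	{
		/* on shutdown: name any reference that was never released */
		for (int i = 0; i < REF_NR; i++)
			if (atomic_load(&r->count[i]))
				printf("ref '%s' still held\n", example_ref_names[i]);
	}

[This is also why BCH_DEV_READ_REF_* / BCH_DEV_WRITE_REF_* enum arguments appear on
every io_ref get/put in this patch.]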
bch2_journal_keys_put_initial(c); bch2_find_btree_nodes_exit(&c->found_btree_nodes); + BUG_ON(atomic_read(&c->journal_keys.ref)); - bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); if (c->online_reserved) { u64 v = percpu_u64_get(c->online_reserved); @@ -579,7 +653,6 @@ static void __bch2_fs_free(struct bch_fs *c) free_percpu(c->online_reserved); } - darray_exit(&c->incompat_versions_requested); darray_exit(&c->btree_roots_extra); free_percpu(c->pcpu); free_percpu(c->usage); @@ -587,9 +660,7 @@ static void __bch2_fs_free(struct bch_fs *c) mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); mempool_exit(&c->fill_iter); -#ifndef BCH_WRITE_REF_DEBUG - percpu_ref_exit(&c->writes); -#endif + enumerated_ref_exit(&c->writes); kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); @@ -601,8 +672,8 @@ static void __bch2_fs_free(struct bch_fs *c) destroy_workqueue(c->btree_read_complete_wq); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); - if (c->btree_io_complete_wq) - destroy_workqueue(c->btree_io_complete_wq); + if (c->btree_write_complete_wq) + destroy_workqueue(c->btree_write_complete_wq); if (c->btree_update_wq) destroy_workqueue(c->btree_update_wq); @@ -624,9 +695,14 @@ void __bch2_fs_stop(struct bch_fs *c) set_bit(BCH_FS_stopping, &c->flags); - down_write(&c->state_lock); - bch2_fs_read_only(c); - up_write(&c->state_lock); + scoped_guard(rwsem_write, &c->state_lock) + bch2_fs_read_only(c); + + for (unsigned i = 0; i < c->sb.nr_devices; i++) { + struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); + if (ca) + bch2_dev_io_ref_stop(ca, READ); + } for_each_member_device(c, ca) bch2_dev_unlink(ca); @@ -652,20 +728,19 @@ void __bch2_fs_stop(struct bch_fs *c) cancel_work_sync(&ca->io_error_work); cancel_work_sync(&c->read_only_work); + + flush_work(&c->btree_interior_update_work); } void bch2_fs_free(struct bch_fs *c) { - unsigned i; - - mutex_lock(&bch_fs_list_lock); - list_del(&c->list); - mutex_unlock(&bch_fs_list_lock); + scoped_guard(mutex, &bch_fs_list_lock) + list_del(&c->list); closure_sync(&c->cl); closure_debug_destroy(&c->cl); - for (i = 0; i < c->sb.nr_devices; i++) { + for (unsigned i = 0; i < c->sb.nr_devices; i++) { struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); if (ca) { @@ -693,9 +768,10 @@ static int bch2_fs_online(struct bch_fs *c) lockdep_assert_held(&bch_fs_list_lock); - if (__bch2_uuid_to_fs(c->sb.uuid)) { + if (c->sb.multi_device && + __bch2_uuid_to_fs(c->sb.uuid)) { bch_err(c, "filesystem UUID already open"); - return -EINVAL; + return bch_err_throw(c, filesystem_uuid_already_open); } ret = bch2_fs_chardev_init(c); @@ -706,7 +782,9 @@ static int bch2_fs_online(struct bch_fs *c) bch2_fs_debug_init(c); - ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: + ret = (c->sb.multi_device + ? 
kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) + : kobject_add(&c->kobj, NULL, "%s", c->name)) ?: kobject_add(&c->internal, &c->kobj, "internal") ?: kobject_add(&c->opts_dir, &c->kobj, "options") ?: #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT @@ -719,29 +797,57 @@ static int bch2_fs_online(struct bch_fs *c) return ret; } - down_write(&c->state_lock); + guard(rwsem_write)(&c->state_lock); for_each_member_device(c, ca) { ret = bch2_dev_sysfs_online(c, ca); if (ret) { bch_err(c, "error creating sysfs objects"); bch2_dev_put(ca); - goto err; + return ret; } } BUG_ON(!list_empty(&c->list)); list_add(&c->list, &bch_fs_list); -err: - up_write(&c->state_lock); return ret; } -static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) +int bch2_fs_init_rw(struct bch_fs *c) +{ + if (test_bit(BCH_FS_rw_init_done, &c->flags)) + return 0; + + if (!(c->btree_update_wq = alloc_workqueue("bcachefs", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || + !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", + WQ_FREEZABLE, 0))) + return bch_err_throw(c, ENOMEM_fs_other_alloc); + + int ret = bch2_fs_btree_interior_update_init(c) ?: + bch2_fs_btree_write_buffer_init(c) ?: + bch2_fs_fs_io_buffered_init(c) ?: + bch2_fs_io_write_init(c) ?: + bch2_fs_journal_init(&c->journal); + if (ret) + return ret; + + set_bit(BCH_FS_rw_init_done, &c->flags); + return 0; +} + +static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, + bch_sb_handles *sbs) { struct bch_fs *c; - struct printbuf name = PRINTBUF; unsigned i, iter_size; + CLASS(printbuf, name)(); int ret = 0; c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); @@ -750,7 +856,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto out; } - c->stdio = (void *)(unsigned long) opts.stdio; + c->stdio = (void *)(unsigned long) opts->stdio; __module_get(THIS_MODULE); @@ -774,24 +880,29 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) refcount_set(&c->ro_ref, 1); init_waitqueue_head(&c->ro_ref_wait); - spin_lock_init(&c->recovery_pass_lock); - sema_init(&c->online_fsck_mutex, 1); for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); - bch2_fs_copygc_init(c); - bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); - bch2_fs_btree_iter_init_early(c); - bch2_fs_btree_interior_update_init_early(c); - bch2_fs_journal_keys_init(c); bch2_fs_allocator_background_init(c); bch2_fs_allocator_foreground_init(c); - bch2_fs_rebalance_init(c); - bch2_fs_quota_init(c); + bch2_fs_btree_cache_init_early(&c->btree_cache); + bch2_fs_btree_gc_init_early(c); + bch2_fs_btree_interior_update_init_early(c); + bch2_fs_btree_iter_init_early(c); + bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); + bch2_fs_btree_write_buffer_init_early(c); + bch2_fs_copygc_init(c); bch2_fs_ec_init_early(c); + bch2_fs_journal_init_early(&c->journal); + bch2_fs_journal_keys_init(c); bch2_fs_move_init(c); + bch2_fs_nocow_locking_init_early(c); + bch2_fs_quota_init(c); + bch2_fs_recovery_passes_init(c); bch2_fs_sb_errors_init_early(c); + bch2_fs_snapshots_init_early(c); + 
bch2_fs_subvolumes_init_early(c); INIT_LIST_HEAD(&c->list); @@ -817,29 +928,18 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; - bch2_fs_btree_cache_init_early(&c->btree_cache); - mutex_init(&c->sectors_available_lock); ret = percpu_init_rwsem(&c->mark_lock); if (ret) goto err; - mutex_lock(&c->sb_lock); - ret = bch2_sb_to_fs(c, sb); - mutex_unlock(&c->sb_lock); - - if (ret) - goto err; + scoped_guard(mutex, &c->sb_lock) + ret = bch2_sb_to_fs(c, sb); - pr_uuid(&name, c->sb.user_uuid.b); - ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; if (ret) goto err; - strscpy(c->name, name.buf, sizeof(c->name)); - printbuf_exit(&name); - /* Compat: */ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) @@ -854,7 +954,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; - bch2_opts_apply(&c->opts, opts); + bch2_opts_apply(&c->opts, *opts); + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + c->opts.block_size > PAGE_SIZE) { + bch_err(c, "cannot mount bs > ps filesystem without CONFIG_TRANSPARENT_HUGEPAGE"); + ret = -EINVAL; + goto err; + } c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; if (c->opts.inodes_use_key_cache) @@ -870,26 +977,25 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; } + if (c->sb.multi_device) + pr_uuid(&name, c->sb.user_uuid.b); + else + prt_bdevname(&name, sbs->data[0].bdev); + + ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; + if (ret) + goto err; + + strscpy(c->name, name.buf, sizeof(c->name)); + iter_size = sizeof(struct sort_iter) + (btree_blocks(c) + 1) * 2 * sizeof(struct sort_iter_set); - if (!(c->btree_update_wq = alloc_workqueue("bcachefs", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || - !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || - !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", + if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || - !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || - !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", - WQ_FREEZABLE, 0)) || -#ifndef BCH_WRITE_REF_DEBUG - percpu_ref_init(&c->writes, bch2_writes_disabled, - PERCPU_REF_INIT_DEAD, GFP_KERNEL) || -#endif + enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR, + bch2_writes_disabled) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || bioset_init(&c->btree_bio, 1, max(offsetof(struct btree_read_bio, bio), @@ -901,51 +1007,54 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, c->opts.btree_node_size) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) { - ret = -BCH_ERR_ENOMEM_fs_other_alloc; + ret = bch_err_throw(c, ENOMEM_fs_other_alloc); goto err; } - ret = bch2_fs_counters_init(c) ?: - bch2_fs_sb_errors_init(c) ?: - bch2_io_clock_init(&c->io_clock[READ]) ?: - bch2_io_clock_init(&c->io_clock[WRITE]) ?: - bch2_fs_journal_init(&c->journal) ?: - 
bch2_fs_btree_iter_init(c) ?: + ret = + bch2_fs_async_obj_init(c) ?: bch2_fs_btree_cache_init(c) ?: + bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: - bch2_fs_btree_interior_update_init(c) ?: - bch2_fs_btree_gc_init(c) ?: bch2_fs_buckets_waiting_for_journal_init(c) ?: - bch2_fs_btree_write_buffer_init(c) ?: - bch2_fs_subvolumes_init(c) ?: - bch2_fs_io_read_init(c) ?: - bch2_fs_io_write_init(c) ?: - bch2_fs_nocow_locking_init(c) ?: - bch2_fs_encryption_init(c) ?: + bch2_io_clock_init(&c->io_clock[READ]) ?: + bch2_io_clock_init(&c->io_clock[WRITE]) ?: bch2_fs_compress_init(c) ?: + bch2_fs_counters_init(c) ?: bch2_fs_ec_init(c) ?: - bch2_fs_vfs_init(c) ?: + bch2_fs_encryption_init(c) ?: bch2_fs_fsio_init(c) ?: - bch2_fs_fs_io_buffered_init(c) ?: - bch2_fs_fs_io_direct_init(c); + bch2_fs_fs_io_direct_init(c) ?: + bch2_fs_io_read_init(c) ?: + bch2_fs_rebalance_init(c) ?: + bch2_fs_sb_errors_init(c) ?: + bch2_fs_vfs_init(c); if (ret) goto err; -#ifdef CONFIG_UNICODE - /* Default encoding until we can potentially have more as an option. */ - c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); - if (IS_ERR(c->cf_encoding)) { - printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u", - unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); - ret = -EINVAL; - goto err; + if (go_rw_in_recovery(c)) { + /* + * start workqueues/kworkers early - kthread creation checks for + * pending signals, which is _very_ annoying + */ + ret = bch2_fs_init_rw(c); + if (ret) + goto err; + } + +#if IS_ENABLED(CONFIG_UNICODE) + if (!bch2_fs_casefold_enabled(c)) { + /* Default encoding until we can potentially have more as an option. */ + c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); + if (IS_ERR(c->cf_encoding)) { + printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. 
Version: %u.%u.%u", + unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); + ret = -EINVAL; + goto err; + } } - bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", - unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); #else if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) { printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n"); @@ -969,12 +1078,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) &c->clock_journal_res, (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); - mutex_lock(&bch_fs_list_lock); - ret = bch2_fs_online(c); - mutex_unlock(&bch_fs_list_lock); + scoped_guard(mutex, &bch_fs_list_lock) + ret = bch2_fs_online(c); if (ret) goto err; + + c->recovery_task = current; out: return c; err: @@ -987,12 +1097,13 @@ noinline_for_stack static void print_mount_opts(struct bch_fs *c) { enum bch_opt_id i; - struct printbuf p = PRINTBUF; - bool first = true; + CLASS(printbuf, p)(); + bch2_log_msg_start(c, &p); prt_str(&p, "starting version "); bch2_version_to_text(&p, c->sb.version); + bool first = true; for (i = 0; i < bch2_opts_nr; i++) { const struct bch_option *opt = &bch2_opt_table[i]; u64 v = bch2_opt_get_by_id(&c->opts, i); @@ -1009,30 +1120,41 @@ static void print_mount_opts(struct bch_fs *c) } if (c->sb.version_incompat_allowed != c->sb.version) { - prt_printf(&p, "\n allowing incompatible features above "); + prt_printf(&p, "\nallowing incompatible features above "); bch2_version_to_text(&p, c->sb.version_incompat_allowed); } - bch_info(c, "%s", p.buf); - printbuf_exit(&p); + if (c->opts.verbose) { + prt_printf(&p, "\nfeatures: "); + prt_bitflags(&p, bch2_sb_features, c->sb.features); + } + + if (c->sb.multi_device) { + prt_printf(&p, "\nwith devices"); + for_each_online_member(c, ca, BCH_DEV_READ_REF_bch2_online_devs) { + prt_char(&p, ' '); + prt_str(&p, ca->name); + } + } + + bch2_print_str(c, KERN_INFO, p.buf); } static bool bch2_fs_may_start(struct bch_fs *c) { struct bch_dev *ca; - unsigned i, flags = 0; + unsigned flags = 0; - if (c->opts.very_degraded) + switch (c->opts.degraded) { + case BCH_DEGRADED_very: flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; - - if (c->opts.degraded) + break; + case BCH_DEGRADED_yes: flags |= BCH_FORCE_IF_DEGRADED; - - if (!c->opts.degraded && - !c->opts.very_degraded) { - mutex_lock(&c->sb_lock); - - for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { + break; + default: { + guard(mutex)(&c->sb_lock); + for (unsigned i = 0; i < c->disk_sb.sb->nr_devices; i++) { if (!bch2_member_exists(c->disk_sb.sb, i)) continue; @@ -1040,15 +1162,14 @@ static bool bch2_fs_may_start(struct bch_fs *c) if (!bch2_dev_is_online(ca) && (ca->mi.state == BCH_MEMBER_STATE_rw || - ca->mi.state == BCH_MEMBER_STATE_ro)) { - mutex_unlock(&c->sb_lock); + ca->mi.state == BCH_MEMBER_STATE_ro)) return false; - } } - mutex_unlock(&c->sb_lock); + break; + } } - return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); + return bch2_have_enough_devs(c, c->online_devs, flags, true); } int bch2_fs_start(struct bch_fs *c) @@ -1056,42 +1177,42 @@ int bch2_fs_start(struct bch_fs *c) time64_t now = ktime_get_real_seconds(); int ret = 0; + BUG_ON(test_bit(BCH_FS_started, &c->flags)); + print_mount_opts(c); + if (c->cf_encoding) + bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", + 
unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); + if (!bch2_fs_may_start(c)) - return -BCH_ERR_insufficient_devices_to_start; + return bch_err_throw(c, insufficient_devices_to_start); - down_write(&c->state_lock); - mutex_lock(&c->sb_lock); + scoped_guard(rwsem_write, &c->state_lock) { + guard(mutex)(&c->sb_lock); + if (!bch2_sb_field_get_minsize(&c->disk_sb, ext, + sizeof(struct bch_sb_field_ext) / sizeof(u64))) { + ret = bch_err_throw(c, ENOSPC_sb); + goto err; + } - BUG_ON(test_bit(BCH_FS_started, &c->flags)); + ret = bch2_sb_members_v2_init(c); + if (ret) + goto err; - if (!bch2_sb_field_get_minsize(&c->disk_sb, ext, - sizeof(struct bch_sb_field_ext) / sizeof(u64))) { - mutex_unlock(&c->sb_lock); - up_write(&c->state_lock); - ret = -BCH_ERR_ENOSPC_sb; - goto err; - } + scoped_guard(rcu) + for_each_online_member_rcu(c, ca) { + bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = + cpu_to_le64(now); + if (ca->mi.state == BCH_MEMBER_STATE_rw) + bch2_dev_allocator_add(c, ca); + } - ret = bch2_sb_members_v2_init(c); - if (ret) { - mutex_unlock(&c->sb_lock); - up_write(&c->state_lock); - goto err; + bch2_recalc_capacity(c); } - for_each_online_member(c, ca) - bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); - - mutex_unlock(&c->sb_lock); - - for_each_rw_member(c, ca) - bch2_dev_allocator_add(c, ca); - bch2_recalc_capacity(c); - up_write(&c->state_lock); - - c->recovery_task = current; ret = BCH_SB_INITIALIZED(c->disk_sb.sb) ? bch2_fs_recovery(c) : bch2_fs_initialize(c); @@ -1100,25 +1221,24 @@ int bch2_fs_start(struct bch_fs *c) if (ret) goto err; - ret = bch2_opts_check_may_set(c); + ret = bch2_opts_hooks_pre_set(c); if (ret) goto err; if (bch2_fs_init_fault("fs_start")) { - ret = -BCH_ERR_injected_fs_start; + ret = bch_err_throw(c, injected_fs_start); goto err; } set_bit(BCH_FS_started, &c->flags); wake_up(&c->ro_ref_wait); - down_write(&c->state_lock); - if (c->opts.read_only) - bch2_fs_read_only(c); - else if (!test_bit(BCH_FS_rw, &c->flags)) - ret = bch2_fs_read_write(c); - up_write(&c->state_lock); - + scoped_guard(rwsem_write, &c->state_lock) { + if (c->opts.read_only) + bch2_fs_read_only(c); + else if (!test_bit(BCH_FS_rw, &c->flags)) + ret = bch2_fs_read_write(c); + } err: if (ret) bch_err_msg(c, ret, "starting filesystem"); @@ -1132,11 +1252,11 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); if (le16_to_cpu(sb->block_size) != block_sectors(c)) - return -BCH_ERR_mismatched_block_size; + return bch_err_throw(c, mismatched_block_size); if (le16_to_cpu(m.bucket_size) < BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) - return -BCH_ERR_bucket_size_too_small; + return bch_err_throw(c, bucket_size_too_small); return 0; } @@ -1163,7 +1283,7 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, if (fs->sb->seq == sb->sb->seq && fs->sb->write_time != sb->sb->write_time) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); prt_str(&buf, "Split brain detected between "); prt_bdevname(&buf, sb->bdev); @@ -1188,7 +1308,6 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, prt_printf(&buf, "Not using older sb"); pr_err("%s", buf.buf); - printbuf_exit(&buf); if (!opts->no_splitbrain_check) return -BCH_ERR_device_splitbrain; @@ -1199,7 +1318,7 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, u64 seq_from_member = le64_to_cpu(sb->sb->seq); if (seq_from_fs && seq_from_fs < 
seq_from_member) { - struct printbuf buf = PRINTBUF; + CLASS(printbuf, buf)(); prt_str(&buf, "Split brain detected between "); prt_bdevname(&buf, sb->bdev); @@ -1221,7 +1340,6 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, } pr_err("%s", buf.buf); - printbuf_exit(&buf); if (!opts->no_splitbrain_check) return -BCH_ERR_device_splitbrain; @@ -1234,11 +1352,14 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw) { - if (!percpu_ref_is_zero(&ca->io_ref[rw])) { - reinit_completion(&ca->io_ref_completion[rw]); - percpu_ref_kill(&ca->io_ref[rw]); - wait_for_completion(&ca->io_ref_completion[rw]); - } + if (rw == READ) + clear_bit(ca->dev_idx, ca->fs->online_devs.d); + + if (!enumerated_ref_is_zero(&ca->io_ref[rw])) + enumerated_ref_stop(&ca->io_ref[rw], + rw == READ + ? bch2_dev_read_refs + : bch2_dev_write_refs); } static void bch2_dev_release(struct kobject *kobj) @@ -1250,8 +1371,8 @@ static void bch2_dev_release(struct kobject *kobj) static void bch2_dev_free(struct bch_dev *ca) { - WARN_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE])); - WARN_ON(!percpu_ref_is_zero(&ca->io_ref[READ])); + WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE])); + WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[READ])); cancel_work_sync(&ca->io_error_work); @@ -1260,6 +1381,9 @@ static void bch2_dev_free(struct bch_dev *ca) if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); + bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch); + bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty); + bch2_free_super(&ca->disk_sb); bch2_dev_allocator_background_exit(ca); bch2_dev_journal_exit(ca); @@ -1271,8 +1395,8 @@ static void bch2_dev_free(struct bch_dev *ca) bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); - percpu_ref_exit(&ca->io_ref[WRITE]); - percpu_ref_exit(&ca->io_ref[READ]); + enumerated_ref_exit(&ca->io_ref[WRITE]); + enumerated_ref_exit(&ca->io_ref[READ]); #ifndef CONFIG_BCACHEFS_DEBUG percpu_ref_exit(&ca->ref); #endif @@ -1284,7 +1408,7 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) lockdep_assert_held(&c->state_lock); - if (percpu_ref_is_zero(&ca->io_ref[READ])) + if (enumerated_ref_is_zero(&ca->io_ref[READ])) return; __bch2_dev_read_only(c, ca); @@ -1306,20 +1430,6 @@ static void bch2_dev_ref_complete(struct percpu_ref *ref) } #endif -static void bch2_dev_io_ref_read_complete(struct percpu_ref *ref) -{ - struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[READ]); - - complete(&ca->io_ref_completion[READ]); -} - -static void bch2_dev_io_ref_write_complete(struct percpu_ref *ref) -{ - struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[WRITE]); - - complete(&ca->io_ref_completion[WRITE]); -} - static void bch2_dev_unlink(struct bch_dev *ca) { struct kobject *b; @@ -1381,8 +1491,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, kobject_init(&ca->kobj, &bch2_dev_ktype); init_completion(&ca->ref_completion); - init_completion(&ca->io_ref_completion[READ]); - init_completion(&ca->io_ref_completion[WRITE]); INIT_WORK(&ca->io_error_work, bch2_io_error_work); @@ -1406,12 +1514,13 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, atomic_long_set(&ca->ref, 1); #endif + mutex_init(&ca->bucket_backpointer_mismatch.lock); + mutex_init(&ca->bucket_backpointer_empty.lock); + bch2_dev_allocator_background_init(ca); - if (percpu_ref_init(&ca->io_ref[READ], bch2_dev_io_ref_read_complete, - PERCPU_REF_INIT_DEAD, GFP_KERNEL) || - 
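/*
 * The io_ref percpu_refs become "enumerated refs" here: each get/put
 * names its owner with an enum constant, so a ref that never reaches
 * zero can be dumped by holder name (see enumerated_ref_to_text() in
 * the sysfs hunk) instead of being an anonymous count. Usage shape,
 * taken from the read_fua_test added later in this patch:
 *
 *	if (bch2_dev_get_ioref(c, ca->dev_idx, READ,
 *			       BCH_DEV_READ_REF_read_fua_test)) {
 *		// ... do I/O ...
 *		enumerated_ref_put(&ca->io_ref[READ],
 *				   BCH_DEV_READ_REF_read_fua_test);
 *	}
 */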
percpu_ref_init(&ca->io_ref[WRITE], bch2_dev_io_ref_write_complete, - PERCPU_REF_INIT_DEAD, GFP_KERNEL) || + if (enumerated_ref_init(&ca->io_ref[READ], BCH_DEV_READ_REF_NR, NULL) || + enumerated_ref_init(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_NR, NULL) || !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) || bch2_dev_buckets_alloc(c, ca) || !(ca->io_done = alloc_percpu(*ca->io_done))) @@ -1428,7 +1537,9 @@ static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, { ca->dev_idx = dev_idx; __set_bit(ca->dev_idx, ca->self.d); - scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); + + if (!ca->name[0]) + scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); ca->fs = c; rcu_assign_pointer(c->devs[ca->dev_idx], ca); @@ -1443,18 +1554,16 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) struct bch_dev *ca = NULL; if (bch2_fs_init_fault("dev_alloc")) - goto err; + return bch_err_throw(c, ENOMEM_dev_alloc); ca = __bch2_dev_alloc(c, &member); if (!ca) - goto err; + return bch_err_throw(c, ENOMEM_dev_alloc); ca->fs = c; bch2_dev_attach(c, ca, dev_idx); return 0; -err: - return -BCH_ERR_ENOMEM_dev_alloc; } static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) @@ -1464,22 +1573,29 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) if (bch2_dev_is_online(ca)) { bch_err(ca, "already have device online in slot %u", sb->sb->dev_idx); - return -BCH_ERR_device_already_online; + return bch_err_throw(ca->fs, device_already_online); } if (get_capacity(sb->bdev->bd_disk) < ca->mi.bucket_size * ca->mi.nbuckets) { - bch_err(ca, "cannot online: device too small"); - return -BCH_ERR_device_size_too_small; + bch_err(ca, "cannot online: device too small (capacity %llu filesystem size %llu nbuckets %llu)", + get_capacity(sb->bdev->bd_disk), + ca->mi.bucket_size * ca->mi.nbuckets, + ca->mi.nbuckets); + return bch_err_throw(ca->fs, device_size_too_small); } - BUG_ON(!percpu_ref_is_zero(&ca->io_ref[READ])); - BUG_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE])); + BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ])); + BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE])); ret = bch2_dev_journal_init(ca, sb->sb); if (ret) return ret; + CLASS(printbuf, name)(); + prt_bdevname(&name, sb->bdev); + strscpy(ca->name, name.buf, sizeof(ca->name)); + /* Commit: */ ca->disk_sb = *sb; memset(sb, 0, sizeof(*sb)); @@ -1493,7 +1609,7 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ca->dev = ca->disk_sb.bdev->bd_dev; - percpu_ref_reinit(&ca->io_ref[READ]); + enumerated_ref_start(&ca->io_ref[READ]); return 0; } @@ -1517,16 +1633,9 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) if (ret) return ret; - bch2_dev_sysfs_online(c, ca); - - struct printbuf name = PRINTBUF; - prt_bdevname(&name, ca->disk_sb.bdev); - - if (c->sb.nr_devices == 1) - strscpy(c->name, name.buf, sizeof(c->name)); - strscpy(ca->name, name.buf, sizeof(ca->name)); + set_bit(ca->dev_idx, c->online_devs.d); - printbuf_exit(&name); + bch2_dev_sysfs_online(c, ca); bch2_rebalance_wakeup(c); return 0; @@ -1578,7 +1687,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, return true; /* do we have enough devices to read from? 
*/ - new_online_devs = bch2_online_devs(c); + new_online_devs = c->online_devs; __clear_bit(ca->dev_idx, new_online_devs.d); return bch2_have_enough_devs(c, new_online_devs, flags, false); @@ -1608,8 +1717,8 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - if (percpu_ref_is_zero(&ca->io_ref[WRITE])) - percpu_ref_reinit(&ca->io_ref[WRITE]); + if (enumerated_ref_is_zero(&ca->io_ref[WRITE])) + enumerated_ref_start(&ca->io_ref[WRITE]); bch2_dev_do_discards(ca); } @@ -1617,25 +1726,24 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { - struct bch_member *m; int ret = 0; if (ca->mi.state == new_state) return 0; if (!bch2_dev_state_allowed(c, ca, new_state, flags)) - return -BCH_ERR_device_state_not_allowed; + return bch_err_throw(c, device_state_not_allowed); if (new_state != BCH_MEMBER_STATE_rw) __bch2_dev_read_only(c, ca); bch_notice(ca, "%s", bch2_member_states[new_state]); - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_STATE(m, new_state); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); + scoped_guard(mutex, &c->sb_lock) { + struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + SET_BCH_MEMBER_STATE(m, new_state); + bch2_write_super(c); + } if (new_state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); @@ -1648,24 +1756,20 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { - int ret; - - down_write(&c->state_lock); - ret = __bch2_dev_set_state(c, ca, new_state, flags); - up_write(&c->state_lock); - - return ret; + guard(rwsem_write)(&c->state_lock); + return __bch2_dev_set_state(c, ca, new_state, flags); } /* Device add/removal: */ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) { - struct bch_member *m; unsigned dev_idx = ca->dev_idx, data; + bool fast_device_removal = !bch2_request_incompat_feature(c, + bcachefs_metadata_version_fast_device_removal); int ret; - down_write(&c->state_lock); + guard(rwsem_write)(&c->state_lock); /* * We consume a reference to ca->ref, regardless of whether we succeed @@ -1675,17 +1779,31 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot remove without losing data"); - ret = -BCH_ERR_device_state_not_allowed; + ret = bch_err_throw(c, device_state_not_allowed); goto err; } __bch2_dev_read_only(c, ca); - ret = bch2_dev_data_drop(c, ca->dev_idx, flags); - bch_err_msg(ca, ret, "bch2_dev_data_drop()"); + ret = fast_device_removal + ? 
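/*
 * c->online_devs is now a maintained bitmap -- set in
 * bch2_dev_attach_bdev() and device add, cleared in
 * bch2_dev_io_ref_stop() -- so callers read it directly instead of
 * recomputing the mask via bch2_online_devs(), e.g. in
 * bch2_dev_state_allowed() above:
 *
 *	struct bch_devs_mask new_online_devs = c->online_devs;
 *	__clear_bit(ca->dev_idx, new_online_devs.d);
 *	return bch2_have_enough_devs(c, new_online_devs, flags, false);
 */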
bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags) + : (bch2_dev_data_drop(c, ca->dev_idx, flags) ?: + bch2_dev_remove_stripes(c, ca->dev_idx, flags)); if (ret) goto err; + /* Check if device still has data before blowing away alloc info */ + struct bch_dev_usage usage = bch2_dev_usage_read(ca); + for (unsigned i = 0; i < BCH_DATA_NR; i++) + if (!data_type_is_empty(i) && + !data_type_is_hidden(i) && + usage.buckets[i]) { + bch_err(ca, "Remove failed: still has data (%s, %llu buckets)", + __bch2_data_types[i], usage.buckets[i]); + ret = -EBUSY; + goto err; + } + ret = bch2_dev_remove_alloc(c, ca); bch_err_msg(ca, ret, "bch2_dev_remove_alloc()"); if (ret) @@ -1718,20 +1836,17 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) data = bch2_dev_has_data(c, ca); if (data) { - struct printbuf data_has = PRINTBUF; - + CLASS(printbuf, data_has)(); prt_bitflags(&data_has, __bch2_data_types, data); bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); - printbuf_exit(&data_has); ret = -EBUSY; goto err; } __bch2_dev_offline(c, ca); - mutex_lock(&c->sb_lock); - rcu_assign_pointer(c->devs[ca->dev_idx], NULL); - mutex_unlock(&c->sb_lock); + scoped_guard(mutex, &c->sb_lock) + rcu_assign_pointer(c->devs[ca->dev_idx], NULL); #ifndef CONFIG_BCACHEFS_DEBUG percpu_ref_kill(&ca->ref); @@ -1747,21 +1862,23 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) * Free this device's slot in the bch_member array - all pointers to * this device must be gone: */ - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); - memset(&m->uuid, 0, sizeof(m->uuid)); + scoped_guard(mutex, &c->sb_lock) { + struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); - bch2_write_super(c); + if (fast_device_removal) + m->uuid = BCH_SB_MEMBER_DELETED_UUID; + else + memset(&m->uuid, 0, sizeof(m->uuid)); + + bch2_write_super(c); + } - mutex_unlock(&c->sb_lock); - up_write(&c->state_lock); return 0; err: if (test_bit(BCH_FS_rw, &c->flags) && ca->mi.state == BCH_MEMBER_STATE_rw && - !percpu_ref_is_zero(&ca->io_ref[READ])) + !enumerated_ref_is_zero(&ca->io_ref[READ])) __bch2_dev_read_write(c, ca); - up_write(&c->state_lock); return ret; } @@ -1769,11 +1886,10 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) int bch2_dev_add(struct bch_fs *c, const char *path) { struct bch_opts opts = bch2_opts_empty(); - struct bch_sb_handle sb; + struct bch_sb_handle sb = {}; struct bch_dev *ca = NULL; - struct printbuf errbuf = PRINTBUF; - struct printbuf label = PRINTBUF; - int ret; + CLASS(printbuf, label)(); + int ret = 0; ret = bch2_read_super(path, &opts, &sb); bch_err_msg(c, ret, "reading super"); @@ -1790,6 +1906,20 @@ int bch2_dev_add(struct bch_fs *c, const char *path) } } + if (list_empty(&c->list)) { + scoped_guard(mutex, &bch_fs_list_lock) { + if (__bch2_uuid_to_fs(c->sb.uuid)) + ret = bch_err_throw(c, filesystem_uuid_already_open); + else + list_add(&c->list, &bch_fs_list); + } + + if (ret) { + bch_err(c, "filesystem UUID already open"); + goto err; + } + } + ret = bch2_dev_may_add(sb.sb, c); if (ret) goto err; @@ -1804,81 +1934,95 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (ret) goto err; - down_write(&c->state_lock); - mutex_lock(&c->sb_lock); + scoped_guard(rwsem_write, &c->state_lock) { + scoped_guard(mutex, &c->sb_lock) { + SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true); - ret = bch2_sb_from_fs(c, ca); - bch_err_msg(c, ret, "setting up new superblock"); - if (ret) - goto err_unlock; - - if 
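/*
 * Note on the removal path just above: fast device removal (gated on
 * bcachefs_metadata_version_fast_device_removal; the `!` works because
 * bch2_request_incompat_feature() returns 0 when the superblock's
 * version_incompat_allowed covers the feature) drops the device's data
 * by walking its backpointers instead of scanning every btree, and
 * marks the freed member slot with BCH_SB_MEMBER_DELETED_UUID rather
 * than zeroing it. The bucket-usage loop serves as a safety net for
 * both paths: bail out with -EBUSY, while alloc info is still intact,
 * if any non-empty, non-hidden data type still holds buckets.
 */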
(dynamic_fault("bcachefs:add:no_slot")) - goto err_unlock; + ret = bch2_sb_from_fs(c, ca); + bch_err_msg(c, ret, "setting up new superblock"); + if (ret) + goto err; - ret = bch2_sb_member_alloc(c); - if (ret < 0) { - bch_err_msg(c, ret, "setting up new superblock"); - goto err_unlock; - } - unsigned dev_idx = ret; + if (dynamic_fault("bcachefs:add:no_slot")) + goto err; - /* success: */ + ret = bch2_sb_member_alloc(c); + if (ret < 0) { + bch_err_msg(c, ret, "setting up new superblock"); + goto err; + } + unsigned dev_idx = ret; + ret = 0; - dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds()); - *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi; + /* success: */ - ca->disk_sb.sb->dev_idx = dev_idx; - bch2_dev_attach(c, ca, dev_idx); + dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds()); + *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi; - if (BCH_MEMBER_GROUP(&dev_mi)) { - ret = __bch2_dev_group_set(c, ca, label.buf); - bch_err_msg(c, ret, "creating new label"); - if (ret) - goto err_unlock; - } + ca->disk_sb.sb->dev_idx = dev_idx; + bch2_dev_attach(c, ca, dev_idx); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); + set_bit(ca->dev_idx, c->online_devs.d); - ret = bch2_dev_usage_init(ca, false); - if (ret) - goto err_late; + if (BCH_MEMBER_GROUP(&dev_mi)) { + ret = __bch2_dev_group_set(c, ca, label.buf); + bch_err_msg(c, ret, "creating new label"); + if (ret) + goto err_late; + } - ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); - bch_err_msg(ca, ret, "marking new superblock"); - if (ret) - goto err_late; + bch2_write_super(c); + } - ret = bch2_fs_freespace_init(c); - bch_err_msg(ca, ret, "initializing free space"); - if (ret) - goto err_late; + ret = bch2_dev_usage_init(ca, false); + if (ret) + goto err_late; + + if (test_bit(BCH_FS_started, &c->flags)) { + ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); + bch_err_msg(ca, ret, "marking new superblock"); + if (ret) + goto err_late; + + ret = bch2_fs_freespace_init(c); + bch_err_msg(ca, ret, "initializing free space"); + if (ret) + goto err_late; + + if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); + + ret = bch2_dev_journal_alloc(ca, false); + bch_err_msg(c, ret, "allocating journal"); + if (ret) + goto err_late; + } - if (ca->mi.state == BCH_MEMBER_STATE_rw) - __bch2_dev_read_write(c, ca); + /* + * We just changed the superblock UUID, invalidate cache and send a + * uevent to update /dev/disk/by-uuid + */ + invalidate_bdev(ca->disk_sb.bdev); - ret = bch2_dev_journal_alloc(ca, false); - bch_err_msg(c, ret, "allocating journal"); - if (ret) - goto err_late; + char uuid_str[37]; + snprintf(uuid_str, sizeof(uuid_str), "UUID=%pUb", &c->sb.uuid); - up_write(&c->state_lock); + char *envp[] = { + "CHANGE=uuid", + uuid_str, + NULL, + }; + kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp); + } out: - printbuf_exit(&label); - printbuf_exit(&errbuf); bch_err_fn(c, ret); return ret; - -err_unlock: - mutex_unlock(&c->sb_lock); - up_write(&c->state_lock); err: if (ca) bch2_dev_free(ca); bch2_free_super(&sb); goto out; err_late: - up_write(&c->state_lock); ca = NULL; goto err; } @@ -1892,13 +2036,11 @@ int bch2_dev_online(struct bch_fs *c, const char *path) unsigned dev_idx; int ret; - down_write(&c->state_lock); + guard(rwsem_write)(&c->state_lock); ret = bch2_read_super(path, &opts, &sb); - if (ret) { - up_write(&c->state_lock); + if (ret) return ret; - } dev_idx = sb.sb->dev_idx; @@ -1935,104 +2077,139 @@ int bch2_dev_online(struct bch_fs 
*c, const char *path) goto err; } - mutex_lock(&c->sb_lock); - bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = - cpu_to_le64(ktime_get_real_seconds()); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); + scoped_guard(mutex, &c->sb_lock) { + bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = + cpu_to_le64(ktime_get_real_seconds()); + bch2_write_super(c); + } - up_write(&c->state_lock); return 0; err: - up_write(&c->state_lock); bch2_free_super(&sb); return ret; } int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) { - down_write(&c->state_lock); + guard(rwsem_write)(&c->state_lock); if (!bch2_dev_is_online(ca)) { bch_err(ca, "Already offline"); - up_write(&c->state_lock); return 0; } if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot offline required disk"); - up_write(&c->state_lock); - return -BCH_ERR_device_state_not_allowed; + return bch_err_throw(c, device_state_not_allowed); } __bch2_dev_offline(c, ca); - - up_write(&c->state_lock); return 0; } +static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new_nbuckets) +{ + struct bch_fs *c = ca->fs; + u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 }; + + return bch2_trans_commit_do(ca->fs, NULL, NULL, 0, + bch2_disk_accounting_mod2(trans, false, v, dev_data_type, + .dev = ca->dev_idx, + .data_type = BCH_DATA_free)) ?: + bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets); +} + int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { - struct bch_member *m; u64 old_nbuckets; int ret = 0; - down_write(&c->state_lock); + guard(rwsem_write)(&c->state_lock); old_nbuckets = ca->mi.nbuckets; if (nbuckets < ca->mi.nbuckets) { bch_err(ca, "Cannot shrink yet"); - ret = -EINVAL; - goto err; + return -EINVAL; } if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) { bch_err(ca, "New device size too big (%llu greater than max %u)", nbuckets, BCH_MEMBER_NBUCKETS_MAX); - ret = -BCH_ERR_device_size_too_big; - goto err; + return bch_err_throw(c, device_size_too_big); } if (bch2_dev_is_online(ca) && get_capacity(ca->disk_sb.bdev->bd_disk) < ca->mi.bucket_size * nbuckets) { bch_err(ca, "New size larger than device"); - ret = -BCH_ERR_device_size_too_small; - goto err; + return bch_err_throw(c, device_size_too_small); } ret = bch2_dev_buckets_resize(c, ca, nbuckets); bch_err_msg(ca, ret, "resizing buckets"); if (ret) - goto err; + return ret; ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); if (ret) - goto err; + return ret; - mutex_lock(&c->sb_lock); - m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - m->nbuckets = cpu_to_le64(nbuckets); + scoped_guard(mutex, &c->sb_lock) { + struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + m->nbuckets = cpu_to_le64(nbuckets); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); + bch2_write_super(c); + } if (ca->mi.freespace_initialized) { - u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; - - ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0, - bch2_disk_accounting_mod2(trans, false, v, dev_data_type, - .dev = ca->dev_idx, - .data_type = BCH_DATA_free)) ?: - bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); + ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets); if (ret) - goto err; + return ret; } bch2_recalc_capacity(c); -err: - up_write(&c->state_lock); - return ret; + return 0; +} + +int bch2_fs_resize_on_mount(struct bch_fs *c) +{ + for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) { + u64 old_nbuckets = 
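/*
 * __bch2_dev_resize_alloc() is the piece both resize paths (manual
 * resize above, resize-on-mount below) share when a member grows:
 * credit the disk accounting with (new - old) free buckets for this
 * device, then initialize the freespace btree over the new range:
 *
 *	u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 };
 *	// accounting key: dev_data_type(.dev = ca->dev_idx,
 *	//                               .data_type = BCH_DATA_free)
 */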
ca->mi.nbuckets; + u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk), + ca->mi.bucket_size); + + if (ca->mi.resize_on_mount && + new_nbuckets > ca->mi.nbuckets) { + bch_info(ca, "resizing to size %llu", new_nbuckets * ca->mi.bucket_size); + int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets); + bch_err_fn(ca, ret); + if (ret) { + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_fs_resize_on_mount); + return ret; + } + + scoped_guard(mutex, &c->sb_lock) { + struct bch_member *m = + bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + m->nbuckets = cpu_to_le64(new_nbuckets); + SET_BCH_MEMBER_RESIZE_ON_MOUNT(m, false); + + c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_small_image)); + bch2_write_super(c); + } + + if (ca->mi.freespace_initialized) { + ret = __bch2_dev_resize_alloc(ca, old_nbuckets, new_nbuckets); + if (ret) { + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_fs_resize_on_mount); + return ret; + } + } + } + } + return 0; } /* return with ref on ca->ref: */ @@ -2065,6 +2242,10 @@ static struct bch_fs *bdev_get_fs(struct block_device *bdev) return c; } +DEFINE_CLASS(bdev_get_fs, struct bch_fs *, + bch2_ro_ref_put(_T), bdev_get_fs(bdev), + struct block_device *bdev); + /* returns with ref on ca->ref */ static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev) { @@ -2076,7 +2257,7 @@ static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bd static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise) { - struct bch_fs *c = bdev_get_fs(bdev); + CLASS(bdev_get_fs, c)(bdev); if (!c) return; @@ -2090,36 +2271,45 @@ static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise) down_read(&sb->s_umount); } - down_write(&c->state_lock); + guard(rwsem_write)(&c->state_lock); + struct bch_dev *ca = bdev_to_bch_dev(c, bdev); - if (!ca) - goto unlock; + if (ca) { + bool dev = bch2_dev_state_allowed(c, ca, + BCH_MEMBER_STATE_failed, + BCH_FORCE_IF_DEGRADED); - if (bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, BCH_FORCE_IF_DEGRADED)) { - __bch2_dev_offline(c, ca); - } else { - if (sb) { + if (!dev && sb) { if (!surprise) sync_filesystem(sb); shrink_dcache_sb(sb); evict_inodes(sb); } - bch2_journal_flush(&c->journal); - bch2_fs_emergency_read_only(c); + CLASS(printbuf, buf)(); + __bch2_log_msg_start(ca->name, &buf); + + prt_printf(&buf, "offline from block layer"); + + if (dev) { + __bch2_dev_offline(c, ca); + } else { + bch2_journal_flush(&c->journal); + bch2_fs_emergency_read_only2(c, &buf); + } + + bch2_print_str(c, KERN_ERR, buf.buf); + + bch2_dev_put(ca); } - bch2_dev_put(ca); -unlock: if (sb) up_read(&sb->s_umount); - up_write(&c->state_lock); - bch2_ro_ref_put(c); } static void bch2_fs_bdev_sync(struct block_device *bdev) { - struct bch_fs *c = bdev_get_fs(bdev); + CLASS(bdev_get_fs, c)(bdev); if (!c) return; @@ -2130,12 +2320,9 @@ static void bch2_fs_bdev_sync(struct block_device *bdev) * unmounted - we only take this to avoid a warning in * sync_filesystem: */ - down_read(&sb->s_umount); + guard(rwsem_read)(&sb->s_umount); sync_filesystem(sb); - up_read(&sb->s_umount); } - - bch2_ro_ref_put(c); } const struct blk_holder_ops bch2_sb_handle_bdev_ops = { @@ -2151,39 +2338,38 @@ static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time)); } -struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, - struct bch_opts opts) +struct bch_fs *bch2_fs_open(darray_const_str 
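/*
 * DEFINE_CLASS(bdev_get_fs, ...) above pairs the constructor
 * bdev_get_fs() with the destructor bch2_ro_ref_put(), so
 * `CLASS(bdev_get_fs, c)(bdev);` in the holder ops both acquires the
 * ro ref and schedules its release at end of scope. It's the same
 * generic macro the kernel uses elsewhere, e.g. in <linux/file.h>:
 *
 *	DEFINE_CLASS(fd, struct fd, fdput(_T), fdget(fd), int fd)
 *	...
 *	CLASS(fd, f)(fd);	// struct fd f = fdget(fd);
 *				// fdput(f) runs when f leaves scope
 */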
*devices, + struct bch_opts *opts) { - DARRAY(struct bch_sb_handle) sbs = { 0 }; + bch_sb_handles sbs = {}; struct bch_fs *c = NULL; struct bch_sb_handle *best = NULL; - struct printbuf errbuf = PRINTBUF; int ret = 0; if (!try_module_get(THIS_MODULE)) return ERR_PTR(-ENODEV); - if (!nr_devices) { + if (!devices->nr) { ret = -EINVAL; goto err; } - ret = darray_make_room(&sbs, nr_devices); + ret = darray_make_room(&sbs, devices->nr); if (ret) goto err; - for (unsigned i = 0; i < nr_devices; i++) { + darray_for_each(*devices, i) { struct bch_sb_handle sb = { NULL }; - ret = bch2_read_super(devices[i], &opts, &sb); + ret = bch2_read_super(*i, opts, &sb); if (ret) goto err; BUG_ON(darray_push(&sbs, sb)); } - if (opts.nochanges && !opts.read_only) { - ret = -BCH_ERR_erofs_nochanges; + if (opts->nochanges && !opts->read_only) { + ret = bch_err_throw(c, erofs_nochanges); goto err_print; } @@ -2192,7 +2378,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, best = sb; darray_for_each_reverse(sbs, sb) { - ret = bch2_dev_in_fs(best, sb, &opts); + ret = bch2_dev_in_fs(best, sb, opts); if (ret == -BCH_ERR_device_has_been_removed || ret == -BCH_ERR_device_splitbrain) { @@ -2207,20 +2393,17 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, goto err_print; } - c = bch2_fs_alloc(best->sb, opts); + c = bch2_fs_alloc(best->sb, opts, &sbs); ret = PTR_ERR_OR_ZERO(c); if (ret) goto err; - down_write(&c->state_lock); - darray_for_each(sbs, sb) { - ret = bch2_dev_attach_bdev(c, sb); - if (ret) { - up_write(&c->state_lock); - goto err; + scoped_guard(rwsem_write, &c->state_lock) + darray_for_each(sbs, sb) { + ret = bch2_dev_attach_bdev(c, sb); + if (ret) + goto err; } - } - up_write(&c->state_lock); if (!c->opts.nostart) { ret = bch2_fs_start(c); @@ -2231,12 +2414,11 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, darray_for_each(sbs, sb) bch2_free_super(sb); darray_exit(&sbs); - printbuf_exit(&errbuf); module_put(THIS_MODULE); return c; err_print: pr_err("bch_fs_open err opening %s: %s", - devices[0], bch2_err_str(ret)); + devices->data[0], bch2_err_str(ret)); err: if (!IS_ERR_OR_NULL(c)) bch2_fs_stop(c); @@ -2273,9 +2455,47 @@ static int __init bcachefs_init(void) return -ENOMEM; } -#define BCH_DEBUG_PARAM(name, description) \ - bool bch2_##name; \ - module_param_named(name, bch2_##name, bool, 0644); \ +#define BCH_DEBUG_PARAM(name, description) DEFINE_STATIC_KEY_FALSE(bch2_##name); +BCH_DEBUG_PARAMS_ALL() +#undef BCH_DEBUG_PARAM + +static int bch2_param_set_static_key_t(const char *val, const struct kernel_param *kp) +{ + /* Match bool exactly, by re-using it. */ + struct static_key *key = kp->arg; + struct kernel_param boolkp = *kp; + bool v; + int ret; + + boolkp.arg = &v; + + ret = param_set_bool(val, &boolkp); + if (ret) + return ret; + if (v) + static_key_enable(key); + else + static_key_disable(key); + return 0; +} + +static int bch2_param_get_static_key_t(char *buffer, const struct kernel_param *kp) +{ + struct static_key *key = kp->arg; + return sprintf(buffer, "%c\n", static_key_enabled(key) ? 
'Y' : 'N'); +} + +/* this is unused in userspace - silence the warning */ +__maybe_unused +static const struct kernel_param_ops bch2_param_ops_static_key_t = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = bch2_param_set_static_key_t, + .get = bch2_param_get_static_key_t, +}; + +#define BCH_DEBUG_PARAM(name, description) \ + module_param_cb(name, &bch2_param_ops_static_key_t, &bch2_##name.key, 0644);\ + __MODULE_PARM_TYPE(name, "static_key_t"); \ MODULE_PARM_DESC(name, description); BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 23533bce5709..e90bab9afe78 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -9,6 +9,9 @@ #include extern const char * const bch2_fs_flag_strs[]; +extern const char * const bch2_write_refs[]; +extern const char * const bch2_dev_read_refs[]; +extern const char * const bch2_dev_write_refs[]; struct bch_fs *bch2_dev_to_fs(dev_t); struct bch_fs *bch2_uuid_to_fs(__uuid_t); @@ -29,18 +32,23 @@ int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); bool bch2_fs_emergency_read_only(struct bch_fs *); +bool bch2_fs_emergency_read_only2(struct bch_fs *, struct printbuf *); + bool bch2_fs_emergency_read_only_locked(struct bch_fs *); void bch2_fs_read_only(struct bch_fs *); int bch2_fs_read_write(struct bch_fs *); int bch2_fs_read_write_early(struct bch_fs *); +int bch2_fs_resize_on_mount(struct bch_fs *); + void __bch2_fs_stop(struct bch_fs *); void bch2_fs_free(struct bch_fs *); void bch2_fs_stop(struct bch_fs *); +int bch2_fs_init_rw(struct bch_fs *); int bch2_fs_start(struct bch_fs *); -struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); +struct bch_fs *bch2_fs_open(darray_const_str *, struct bch_opts *); extern const struct blk_holder_ops bch2_sb_handle_bdev_ops; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 82ee333ddd21..bd3fa9c3372d 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -18,6 +18,7 @@ #include "btree_key_cache.h" #include "btree_update.h" #include "btree_update_interior.h" +#include "btree_write_buffer.h" #include "btree_gc.h" #include "buckets.h" #include "clock.h" @@ -25,6 +26,8 @@ #include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" +#include "enumerated_ref.h" +#include "error.h" #include "inode.h" #include "journal.h" #include "journal_reclaim.h" @@ -34,12 +37,15 @@ #include "nocow_locking.h" #include "opts.h" #include "rebalance.h" +#include "recovery_passes.h" #include "replicas.h" +#include "sb-errors.h" #include "super-io.h" #include "tests.h" #include #include +#include #include #include "util.h" @@ -57,7 +63,7 @@ static ssize_t fn ## _to_text(struct printbuf *, \ static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ char *buf) \ { \ - struct printbuf out = PRINTBUF; \ + CLASS(printbuf, out)(); \ ssize_t ret = fn ## _to_text(&out, kobj, attr); \ \ if (out.pos && out.buf[out.pos - 1] != '\n') \ @@ -70,7 +76,6 @@ static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \ memcpy(buf, out.buf, ret); \ } \ - printbuf_exit(&out); \ return bch2_err_class(ret); \ } \ \ @@ -141,13 +146,19 @@ do { \ write_attribute(trigger_gc); write_attribute(trigger_discards); write_attribute(trigger_invalidates); +write_attribute(trigger_journal_commit); write_attribute(trigger_journal_flush); write_attribute(trigger_journal_writes); write_attribute(trigger_btree_cache_shrink); 
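/*
 * On the BCH_DEBUG_PARAM rework above: each debug knob is now a
 * DEFINE_STATIC_KEY_FALSE() static branch rather than a plain bool, so
 * the check sites are patched-out nops until the module param flips
 * the key. Consumer shape (param name from bcachefs's debug params
 * list; the callee is hypothetical):
 *
 *	if (static_branch_unlikely(&bch2_expensive_debug_checks))
 *		do_expensive_checks();	// hypothetical
 */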
write_attribute(trigger_btree_key_cache_shrink); -write_attribute(trigger_freelist_wakeup); +write_attribute(trigger_btree_write_buffer_flush); write_attribute(trigger_btree_updates); +write_attribute(trigger_freelist_wakeup); +write_attribute(trigger_recalc_capacity); +write_attribute(trigger_delete_dead_snapshots); +write_attribute(trigger_emergency_read_only); read_attribute(gc_gens_pos); +__sysfs_attribute(read_fua_test, 0400); read_attribute(uuid); read_attribute(minor); @@ -162,12 +173,15 @@ read_attribute(io_latency_read); read_attribute(io_latency_write); read_attribute(io_latency_stats_read); read_attribute(io_latency_stats_write); +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT read_attribute(congested); +#endif read_attribute(btree_write_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); +read_attribute(errors); read_attribute(journal_debug); read_attribute(btree_cache); read_attribute(btree_key_cache); @@ -176,25 +190,9 @@ read_attribute(open_buckets); read_attribute(open_buckets_partial); read_attribute(nocow_lock_table); -#ifdef BCH_WRITE_REF_DEBUG +read_attribute(read_refs); read_attribute(write_refs); -static const char * const bch2_write_refs[] = { -#define x(n) #n, - BCH_WRITE_REFS() -#undef x - NULL -}; - -static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c) -{ - bch2_printbuf_tabstop_push(out, 24); - - for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) - prt_printf(out, "%s\t%li\n", bch2_write_refs[i], atomic_long_read(&c->writes[i])); -} -#endif - read_attribute(internal_uuid); read_attribute(disk_groups); @@ -212,6 +210,8 @@ read_attribute(copy_gc_wait); sysfs_pd_controller_attribute(rebalance); read_attribute(rebalance_status); +read_attribute(snapshot_delete_status); +read_attribute(recovery_status); read_attribute(new_stripes); @@ -236,14 +236,13 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) size_t ret = 0; struct btree *b; - mutex_lock(&bc->lock); + guard(mutex)(&bc->lock); list_for_each_entry(b, &bc->live[0].list, list) ret += btree_buf_bytes(b); list_for_each_entry(b, &bc->live[1].list, list) ret += btree_buf_bytes(b); list_for_each_entry(b, &bc->freeable, list) ret += btree_buf_bytes(b); - mutex_unlock(&bc->lock); return ret; } @@ -308,6 +307,116 @@ static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c) prt_printf(out, "nr_inodes:\t%llu\n", b.nr_inodes); } +static int bch2_read_fua_test(struct printbuf *out, struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bio *bio = NULL; + void *buf = NULL; + unsigned bs = c->opts.block_size, iters; + u64 end, test_duration = NSEC_PER_SEC * 2; + struct bch2_time_stats stats_nofua, stats_fua, stats_random; + int ret = 0; + + bch2_time_stats_init_no_pcpu(&stats_nofua); + bch2_time_stats_init_no_pcpu(&stats_fua); + bch2_time_stats_init_no_pcpu(&stats_random); + + if (!bch2_dev_get_ioref(c, ca->dev_idx, READ, BCH_DEV_READ_REF_read_fua_test)) { + prt_str(out, "offline\n"); + return 0; + } + + struct block_device *bdev = ca->disk_sb.bdev; + + bio = bio_kmalloc(1, GFP_KERNEL); + if (!bio) { + ret = -ENOMEM; + goto err; + } + + buf = kmalloc(bs, GFP_KERNEL); + if (!buf) + goto err; + + end = ktime_get_ns() + test_duration; + for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { + bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ); + bch2_bio_map(bio, buf, bs); + + u64 submit_time = ktime_get_ns(); + ret = submit_bio_wait(bio); + bch2_time_stats_update(&stats_nofua, submit_time); + + if (ret) + goto err; + } + + end = 
ktime_get_ns() + test_duration; + for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { + bio_init(bio, bdev, bio->bi_inline_vecs, 1, REQ_FUA|READ); + bch2_bio_map(bio, buf, bs); + + u64 submit_time = ktime_get_ns(); + ret = submit_bio_wait(bio); + bch2_time_stats_update(&stats_fua, submit_time); + + if (ret) + goto err; + } + + u64 dev_size = ca->mi.nbuckets * bucket_bytes(ca); + + end = ktime_get_ns() + test_duration; + for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { + bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ); + bio->bi_iter.bi_sector = (bch2_get_random_u64_below(dev_size) & ~((u64) bs - 1)) >> 9; + bch2_bio_map(bio, buf, bs); + + u64 submit_time = ktime_get_ns(); + ret = submit_bio_wait(bio); + bch2_time_stats_update(&stats_random, submit_time); + + if (ret) + goto err; + } + + u64 ns_nofua = mean_and_variance_get_mean(stats_nofua.duration_stats); + u64 ns_fua = mean_and_variance_get_mean(stats_fua.duration_stats); + u64 ns_rand = mean_and_variance_get_mean(stats_random.duration_stats); + + u64 stddev_nofua = mean_and_variance_get_stddev(stats_nofua.duration_stats); + u64 stddev_fua = mean_and_variance_get_stddev(stats_fua.duration_stats); + u64 stddev_rand = mean_and_variance_get_stddev(stats_random.duration_stats); + + printbuf_tabstop_push(out, 8); + printbuf_tabstop_push(out, 12); + printbuf_tabstop_push(out, 12); + prt_printf(out, "This test must be run on an idle drive for accurate results\n"); + prt_printf(out, "%s\n", dev_name(&ca->disk_sb.bdev->bd_device)); + prt_printf(out, "fua support advertized: %s\n", str_yes_no(bdev_fua(bdev))); + prt_newline(out); + prt_printf(out, "ns:\tlatency\rstddev\r\n"); + prt_printf(out, "nofua\t%llu\r%llu\r\n", ns_nofua, stddev_nofua); + prt_printf(out, "fua\t%llu\r%llu\r\n", ns_fua, stddev_fua); + prt_printf(out, "random\t%llu\r%llu\r\n", ns_rand, stddev_rand); + + bool read_cache = ns_nofua * 2 < ns_rand; + bool fua_cached = read_cache && ns_fua < (ns_nofua + ns_rand) / 2; + + if (!read_cache) + prt_str(out, "reads don't appear to be cached - safe\n"); + else if (!fua_cached) + prt_str(out, "fua reads don't appear to be cached - safe\n"); + else + prt_str(out, "fua reads appear to be cached - unsafe\n"); +err: + kfree(buf); + kfree(bio); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_read_fua_test); + bch_err_fn(c, ret); + return ret; +} + SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); @@ -334,6 +443,12 @@ SHOW(bch2_fs) if (attr == &sysfs_rebalance_status) bch2_rebalance_status_to_text(out, c); + if (attr == &sysfs_snapshot_delete_status) + bch2_snapshot_delete_status_to_text(out, c); + + if (attr == &sysfs_recovery_status) + bch2_recovery_pass_status_to_text(out, c); + /* Debugging: */ if (attr == &sysfs_journal_debug) @@ -357,6 +472,9 @@ SHOW(bch2_fs) if (attr == &sysfs_compression_stats) bch2_compression_stats_to_text(out, c); + if (attr == &sysfs_errors) + bch2_fs_errors_to_text(out, c); + if (attr == &sysfs_new_stripes) bch2_new_stripes_to_text(out, c); @@ -369,10 +487,8 @@ SHOW(bch2_fs) if (attr == &sysfs_moving_ctxts) bch2_fs_moving_ctxts_to_text(out, c); -#ifdef BCH_WRITE_REF_DEBUG if (attr == &sysfs_write_refs) - bch2_write_refs_to_text(out, c); -#endif + enumerated_ref_to_text(out, &c->writes, bch2_write_refs); if (attr == &sysfs_nocow_lock_table) bch2_nocow_locks_to_text(out, &c->nocow_locks); @@ -405,7 +521,7 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_btree_updates) queue_work(c->btree_interior_update_worker, 
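/*
 * How read_fua_test() above classifies a drive, with round numbers:
 * suppose repeated sequential reads average 100us, random reads
 * 1000us, FUA reads 150us. read_cache = (100 * 2 < 1000) = true, so
 * the drive is serving repeated reads from its cache; fua_cached =
 * (150 < (100 + 1000) / 2 = 550) = true, so FUA reads are coming from
 * cache as well -- "unsafe", since FUA is supposed to bypass the cache
 * and hit media. Had FUA latency landed near the random-read latency,
 * FUA reads would be uncached -- "safe".
 */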
&c->btree_interior_update_work); - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs)) return -EROFS; if (attr == &sysfs_trigger_btree_cache_shrink) { @@ -425,6 +541,11 @@ STORE(bch2_fs) c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc); } + if (attr == &sysfs_trigger_btree_write_buffer_flush) + bch2_trans_do(c, + (bch2_btree_write_buffer_flush_sync(trans), + bch2_trans_begin(trans))); + if (attr == &sysfs_trigger_gc) bch2_gc_gens(c); @@ -434,6 +555,9 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_invalidates) bch2_do_invalidates(c); + if (attr == &sysfs_trigger_journal_commit) + bch2_journal_flush(&c->journal); + if (attr == &sysfs_trigger_journal_flush) { bch2_journal_flush_all_pins(&c->journal); bch2_journal_meta(&c->journal); @@ -445,6 +569,24 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_freelist_wakeup) closure_wake_up(&c->freelist_wait); + if (attr == &sysfs_trigger_recalc_capacity) { + guard(rwsem_read)(&c->state_lock); + bch2_recalc_capacity(c); + } + + if (attr == &sysfs_trigger_delete_dead_snapshots) + __bch2_delete_dead_snapshots(c); + + if (attr == &sysfs_trigger_emergency_read_only) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "shutdown by sysfs\n"); + bch2_fs_emergency_read_only2(c, &buf); + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -465,7 +607,7 @@ STORE(bch2_fs) size = ret; } #endif - bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs); return size; } SYSFS_OPS(bch2_fs); @@ -476,8 +618,11 @@ struct attribute *bch2_fs_files[] = { &sysfs_btree_write_stats, &sysfs_rebalance_status, + &sysfs_snapshot_delete_status, + &sysfs_recovery_status, &sysfs_compression_stats, + &sysfs_errors, #ifdef CONFIG_BCACHEFS_TESTS &sysfs_perf_test, @@ -558,9 +703,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_new_stripes, &sysfs_open_buckets, &sysfs_open_buckets_partial, -#ifdef BCH_WRITE_REF_DEBUG &sysfs_write_refs, -#endif &sysfs_nocow_lock_table, &sysfs_io_timers_read, &sysfs_io_timers_write, @@ -568,12 +711,17 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_gc, &sysfs_trigger_discards, &sysfs_trigger_invalidates, + &sysfs_trigger_journal_commit, &sysfs_trigger_journal_flush, &sysfs_trigger_journal_writes, &sysfs_trigger_btree_cache_shrink, &sysfs_trigger_btree_key_cache_shrink, - &sysfs_trigger_freelist_wakeup, + &sysfs_trigger_btree_write_buffer_flush, &sysfs_trigger_btree_updates, + &sysfs_trigger_freelist_wakeup, + &sysfs_trigger_recalc_capacity, + &sysfs_trigger_delete_dead_snapshots, + &sysfs_trigger_emergency_read_only, &sysfs_gc_gens_pos, @@ -626,7 +774,7 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, * We don't need to take c->writes for correctness, but it eliminates an * unsightly error message in the dmesg log when we're RO: */ - if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))) + if (unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs))) return -EROFS; char *tmp = kstrdup(buf, GFP_KERNEL); @@ -637,40 +785,34 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, u64 v; ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?: - bch2_opt_check_may_set(c, ca, id, v); + bch2_opt_hook_pre_set(c, ca, id, v); kfree(tmp); if (ret < 0) goto err; - bch2_opt_set_sb(c, ca, opt, v); - bch2_opt_set_by_id(&c->opts, id, v); - - if (v && - (id 
== Opt_background_target || - (id == Opt_foreground_target && !c->opts.background_target) || - id == Opt_background_compression || - (id == Opt_compression && !c->opts.background_compression))) - bch2_set_rebalance_needs_scan(c, 0); - - if (v && id == Opt_rebalance_enabled) - bch2_rebalance_wakeup(c); + bool is_sb = opt->get_sb || opt->get_member; + bool changed = false; - if (v && id == Opt_copygc_enabled) - bch2_copygc_wakeup(c); + if (is_sb) { + changed = bch2_opt_set_sb(c, ca, opt, v); + } else if (!ca) { + changed = bch2_opt_get_by_id(&c->opts, id) != v; + } else { + /* device options that aren't superblock options aren't + * supported */ + BUG(); + } - if (id == Opt_discard && !ca) { - mutex_lock(&c->sb_lock); - for_each_member_device(c, ca) - opt->set_member(bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx), v); + if (!ca) + bch2_opt_set_by_id(&c->opts, id, v); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } + if (changed) + bch2_opt_hook_post_set(c, ca, 0, &c->opts, id); ret = size; err: - bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs); return ret; } @@ -807,9 +949,10 @@ SHOW(bch2_dev) if (attr == &sysfs_io_latency_stats_write) bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats); - sysfs_printf(congested, "%u%%", - clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) - * 100 / CONGESTED_MAX); +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT + if (attr == &sysfs_congested) + bch2_dev_congested_to_text(out, ca); +#endif if (attr == &sysfs_alloc_debug) bch2_dev_alloc_debug_to_text(out, ca); @@ -817,10 +960,19 @@ SHOW(bch2_dev) if (attr == &sysfs_open_buckets) bch2_open_buckets_to_text(out, c, ca); + if (attr == &sysfs_read_fua_test) + return bch2_read_fua_test(out, ca); + int opt_id = bch2_opt_lookup(attr->name); if (opt_id >= 0) return sysfs_opt_show(c, ca, opt_id, out); + if (attr == &sysfs_read_refs) + enumerated_ref_to_text(out, &ca->io_ref[READ], bch2_dev_read_refs); + + if (attr == &sysfs_write_refs) + enumerated_ref_to_text(out, &ca->io_ref[WRITE], bch2_dev_write_refs); + return 0; } @@ -871,11 +1023,18 @@ struct attribute *bch2_dev_files[] = { &sysfs_io_latency_write, &sysfs_io_latency_stats_read, &sysfs_io_latency_stats_write, +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT &sysfs_congested, +#endif + + &sysfs_read_fua_test, /* debug: */ &sysfs_alloc_debug, &sysfs_open_buckets, + + &sysfs_read_refs, + &sysfs_write_refs, NULL }; diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 782a05fe7656..baaaedf68422 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -31,78 +31,66 @@ static void delete_test_keys(struct bch_fs *c) static int test_delete(struct bch_fs *c, u64 nr) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; struct bkey_i_cookie k; - int ret; - bkey_cookie_init(&k.k_i); k.k.p.snapshot = U32_MAX; - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_intent); + CLASS(btree_trans, trans)(c); + CLASS(btree_iter, iter)(trans, BTREE_ID_xattrs, k.k.p, BTREE_ITER_intent); - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(trans, &iter) ?: + int ret = commit_do(trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, &k.k_i, 0)); bch_err_msg(c, ret, "update error"); if (ret) - goto err; + return ret; pr_info("deleting once"); ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(trans, &iter) ?: + bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(trans, &iter, 0)); bch_err_msg(c, ret, "delete 
error (first)"); if (ret) - goto err; + return ret; pr_info("deleting twice"); ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(trans, &iter) ?: + bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(trans, &iter, 0)); bch_err_msg(c, ret, "delete error (second)"); if (ret) - goto err; -err: - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; + return ret; + + return 0; } static int test_delete_written(struct bch_fs *c, u64 nr) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; struct bkey_i_cookie k; - int ret; - bkey_cookie_init(&k.k_i); k.k.p.snapshot = U32_MAX; - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_intent); + CLASS(btree_trans, trans)(c); + CLASS(btree_iter, iter)(trans, BTREE_ID_xattrs, k.k.p, BTREE_ITER_intent); - ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(trans, &iter) ?: + int ret = commit_do(trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, &k.k_i, 0)); bch_err_msg(c, ret, "update error"); if (ret) - goto err; + return ret; bch2_trans_unlock(trans); bch2_journal_flush_all_pins(&c->journal); ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(trans, &iter) ?: + bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(trans, &iter, 0)); bch_err_msg(c, ret, "delete error"); if (ret) - goto err; -err: - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; + return ret; + + return 0; } static int test_iterate(struct bch_fs *c, u64 nr) @@ -130,13 +118,14 @@ static int test_iterate(struct bch_fs *c, u64 nr) pr_info("iterating forwards"); i = 0; - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ + CLASS(btree_trans, trans)(c); + + ret = for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ BUG_ON(k.k->p.offset != i++); 0; - }))); + })); bch_err_msg(c, ret, "error iterating forwards"); if (ret) return ret; @@ -145,12 +134,11 @@ static int test_iterate(struct bch_fs *c, u64 nr) pr_info("iterating backwards"); - ret = bch2_trans_run(c, - for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs, + ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs, SPOS(0, U64_MAX, U32_MAX), 0, k, ({ BUG_ON(k.k->p.offset != --i); 0; - }))); + })); bch_err_msg(c, ret, "error iterating backwards"); if (ret) return ret; @@ -185,14 +173,15 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) pr_info("iterating forwards"); i = 0; - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ + CLASS(btree_trans, trans)(c); + + ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i); i = k.k->p.offset; 0; - }))); + })); bch_err_msg(c, ret, "error iterating forwards"); if (ret) return ret; @@ -201,13 +190,12 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) pr_info("iterating backwards"); - ret = bch2_trans_run(c, - for_each_btree_key_reverse(trans, iter, BTREE_ID_extents, + ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_extents, SPOS(0, U64_MAX, U32_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i); i = bkey_start_offset(k.k); 0; - }))); + })); bch_err_msg(c, ret, "error iterating backwards"); if (ret) return ret; @@ -241,14 +229,15 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) 
pr_info("iterating forwards"); i = 0; - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ + CLASS(btree_trans, trans)(c); + + ret = for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ BUG_ON(k.k->p.offset != i); i += 2; 0; - }))); + })); bch_err_msg(c, ret, "error iterating forwards"); if (ret) return ret; @@ -258,10 +247,9 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) pr_info("iterating forwards by slots"); i = 0; - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_slots, k, ({ + ret = for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + BTREE_ITER_slots, k, ({ if (i >= nr * 2) break; @@ -270,7 +258,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i++; 0; - }))); + })); bch_err_msg(c, ret, "error iterating forwards by slots"); return ret; } @@ -301,15 +289,16 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) pr_info("iterating forwards"); i = 0; - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ + CLASS(btree_trans, trans)(c); + + ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i + 8); BUG_ON(k.k->size != 8); i += 16; 0; - }))); + })); bch_err_msg(c, ret, "error iterating forwards"); if (ret) return ret; @@ -319,10 +308,9 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) pr_info("iterating forwards by slots"); i = 0; - ret = bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_slots, k, ({ + ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + BTREE_ITER_slots, k, ({ if (i == nr) break; BUG_ON(bkey_deleted(k.k) != !(i % 16)); @@ -331,7 +319,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) BUG_ON(k.k->size != 8); i = k.k->p.offset; 0; - }))); + })); bch_err_msg(c, ret, "error iterating forwards by slots"); return ret; } @@ -344,21 +332,16 @@ static int test_peek_end(struct bch_fs *c, u64 nr) { delete_test_keys(c); - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; + CLASS(btree_trans, trans)(c); + CLASS(btree_iter, iter)(trans, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0); - - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); + struct bkey_s_c k; + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); return 0; } @@ -366,21 +349,16 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) { delete_test_keys(c); - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), 0); + CLASS(btree_trans, trans)(c); + CLASS(btree_iter, iter)(trans, BTREE_ID_extents, SPOS(0, 0, U32_MAX), 0); - 
lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); + struct bkey_s_c k; + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); return 0; } @@ -392,15 +370,13 @@ static int insert_test_extent(struct bch_fs *c, u64 start, u64 end) { struct bkey_i_cookie k; - int ret; - bkey_cookie_init(&k.k_i); k.k_i.k.p.offset = end; k.k_i.k.p.snapshot = U32_MAX; k.k_i.k.size = end - start; k.k_i.k.bversion.lo = test_version++; - ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0); + int ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0); bch_err_fn(c, ret); return ret; } @@ -446,15 +422,14 @@ static int test_extent_overwrite_all(struct bch_fs *c, u64 nr) static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, u32 len, u32 snapid) { struct bkey_i_cookie k; - int ret; - bkey_cookie_init(&k.k_i); k.k_i.k.p.inode = inum; k.k_i.k.p.offset = start + len; k.k_i.k.p.snapshot = snapid; k.k_i.k.size = len; - ret = bch2_trans_commit_do(c, NULL, NULL, 0, + CLASS(btree_trans, trans)(c); + int ret = commit_do(trans, NULL, NULL, 0, bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i, BTREE_UPDATE_internal_snapshot_node)); bch_err_fn(c, ret); @@ -477,48 +452,43 @@ static int test_extent_create_overlapping(struct bch_fs *c, u64 inum) /* Test skipping over keys in unrelated snapshots: */ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) { - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; struct bkey_i_cookie cookie; - int ret; - bkey_cookie_init(&cookie.k_i); cookie.k.p.snapshot = snapid_hi; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0); + int ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0); if (ret) return ret; - trans = bch2_trans_get(c); - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, - SPOS(0, 0, snapid_lo), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); + CLASS(btree_trans, trans)(c); + CLASS(btree_iter, iter)(trans, BTREE_ID_xattrs, SPOS(0, 0, snapid_lo), 0); + + struct bkey_s_c k; + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k->p.snapshot != U32_MAX); - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); return ret; } static int test_snapshots(struct bch_fs *c, u64 nr) { struct bkey_i_cookie cookie; - u32 snapids[2]; - u32 snapid_subvols[2] = { 1, 1 }; - int ret; - bkey_cookie_init(&cookie.k_i); cookie.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0); + + int ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0); if (ret) return ret; - ret = bch2_trans_commit_do(c, NULL, NULL, 0, - bch2_snapshot_node_create(trans, U32_MAX, - snapids, - snapid_subvols, - 2)); + u32 snapids[2]; + u32 snapid_subvols[2] = { 1, 1 }; + + CLASS(btree_trans, trans)(c); + ret = commit_do(trans, NULL, NULL, 0, + bch2_snapshot_node_create(trans, U32_MAX, + snapids, + snapid_subvols, + 2)); if (ret) return ret; @@ -542,42 +512,37 @@ static u64 test_rand(void) static int rand_insert(struct bch_fs *c, u64 nr) { - struct btree_trans *trans = bch2_trans_get(c); 
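/*
 * commit_do() and lockrestart_do() are the retry wrappers these tests
 * lean on: the expression argument is re-evaluated from the top
 * whenever the transaction restarts (lock cycles, reclaim), and
 * commit_do() additionally commits on success -- which is why loop
 * bodies here re-init their keys on every iteration instead of relying
 * on state from a previous attempt:
 *
 *	int ret = commit_do(trans, NULL, NULL, 0,
 *		bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0));
 */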
- struct bkey_i_cookie k; - int ret = 0; - u64 i; + CLASS(btree_trans, trans)(c); - for (i = 0; i < nr; i++) { + for (u64 i = 0; i < nr; i++) { + struct bkey_i_cookie k; bkey_cookie_init(&k.k_i); k.k.p.offset = test_rand(); k.k.p.snapshot = U32_MAX; - ret = commit_do(trans, NULL, NULL, 0, + int ret = commit_do(trans, NULL, NULL, 0, bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0)); if (ret) - break; + return ret; } - bch2_trans_put(trans); - return ret; + return 0; } static int rand_insert_multi(struct bch_fs *c, u64 nr) { - struct btree_trans *trans = bch2_trans_get(c); + CLASS(btree_trans, trans)(c); struct bkey_i_cookie k[8]; - int ret = 0; unsigned j; - u64 i; - for (i = 0; i < nr; i += ARRAY_SIZE(k)) { + for (u64 i = 0; i < nr; i += ARRAY_SIZE(k)) { for (j = 0; j < ARRAY_SIZE(k); j++) { bkey_cookie_init(&k[j].k_i); k[j].k.p.offset = test_rand(); k[j].k.p.snapshot = U32_MAX; } - ret = commit_do(trans, NULL, NULL, 0, + int ret = commit_do(trans, NULL, NULL, 0, bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?: bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?: bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?: @@ -587,36 +552,27 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr) bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?: bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[7].k_i, 0)); if (ret) - break; + return ret; } - bch2_trans_put(trans); - return ret; + return 0; } static int rand_lookup(struct bch_fs *c, u64 nr) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - u64 i; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0); + CLASS(btree_trans, trans)(c); + CLASS(btree_iter, iter)(trans, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); - for (i = 0; i < nr; i++) { - bch2_btree_iter_set_pos(trans, &iter, SPOS(0, test_rand(), U32_MAX)); + for (u64 i = 0; i < nr; i++) { + bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(trans, &iter))); - ret = bkey_err(k); + struct bkey_s_c k; + int ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter))); if (ret) - break; + return ret; } - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; + return 0; } static int rand_mixed_trans(struct btree_trans *trans, @@ -627,9 +583,9 @@ static int rand_mixed_trans(struct btree_trans *trans, struct bkey_s_c k; int ret; - bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, U32_MAX)); + bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX)); - k = bch2_btree_iter_peek(trans, iter); + k = bch2_btree_iter_peek(iter); ret = bkey_err(k); bch_err_msg(trans->c, ret, "lookup error"); if (ret) @@ -646,77 +602,59 @@ static int rand_mixed_trans(struct btree_trans *trans, static int rand_mixed(struct bch_fs *c, u64 nr) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_i_cookie cookie; - int ret = 0; - u64 i, rand; + CLASS(btree_trans, trans)(c); + CLASS(btree_iter, iter)(trans, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0); - - for (i = 0; i < nr; i++) { - rand = test_rand(); - ret = commit_do(trans, NULL, NULL, 0, + for (u64 i = 0; i < nr; i++) { + u64 rand = test_rand(); + struct bkey_i_cookie cookie; + int ret = commit_do(trans, NULL, NULL, 0, rand_mixed_trans(trans, &iter, &cookie, i, rand)); if (ret) - break; + return ret; } - 
bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; + return 0; } static int __do_delete(struct btree_trans *trans, struct bpos pos) { - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, - BTREE_ITER_intent); - k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)); - ret = bkey_err(k); + CLASS(btree_iter, iter)(trans, BTREE_ID_xattrs, pos, + BTREE_ITER_intent); + struct bkey_s_c k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)); + int ret = bkey_err(k); if (ret) - goto err; + return ret; if (!k.k) - goto err; + return 0; - ret = bch2_btree_delete_at(trans, &iter, 0); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + return bch2_btree_delete_at(trans, &iter, 0); } static int rand_delete(struct bch_fs *c, u64 nr) { - struct btree_trans *trans = bch2_trans_get(c); - int ret = 0; - u64 i; + CLASS(btree_trans, trans)(c); - for (i = 0; i < nr; i++) { + for (u64 i = 0; i < nr; i++) { struct bpos pos = SPOS(0, test_rand(), U32_MAX); - ret = commit_do(trans, NULL, NULL, 0, + int ret = commit_do(trans, NULL, NULL, 0, __do_delete(trans, pos)); if (ret) - break; + return ret; } - bch2_trans_put(trans); - return ret; + return 0; } static int seq_insert(struct bch_fs *c, u64 nr) { struct bkey_i_cookie insert; - bkey_cookie_init(&insert.k_i); - return bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, + CLASS(btree_trans, trans)(c); + return for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), BTREE_ITER_slots|BTREE_ITER_intent, k, NULL, NULL, 0, ({ @@ -724,22 +662,22 @@ static int seq_insert(struct bch_fs *c, u64 nr) break; insert.k.p = iter.pos; bch2_trans_update(trans, &iter, &insert.k_i, 0); - }))); + })); } static int seq_lookup(struct bch_fs *c, u64 nr) { - return bch2_trans_run(c, - for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, + CLASS(btree_trans, trans)(c); + return for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, - 0)); + 0); } static int seq_overwrite(struct bch_fs *c, u64 nr) { - return bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, + CLASS(btree_trans, trans)(c); + return for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), BTREE_ITER_intent, k, NULL, NULL, 0, ({ @@ -747,7 +685,7 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) bkey_reassemble(&u.k_i, k); bch2_trans_update(trans, &iter, &u.k_i, 0); - }))); + })); } static int seq_delete(struct bch_fs *c, u64 nr) @@ -808,8 +746,8 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, { struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; char name_buf[20]; - struct printbuf nr_buf = PRINTBUF; - struct printbuf per_sec_buf = PRINTBUF; + CLASS(printbuf, nr_buf)(); + CLASS(printbuf, per_sec_buf)(); unsigned i; u64 time; @@ -883,8 +821,6 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, div_u64(time, NSEC_PER_SEC), div_u64(time * nr_threads, nr), per_sec_buf.buf); - printbuf_exit(&per_sec_buf); - printbuf_exit(&nr_buf); return j.ret; } diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index 314a24d15d4e..c2eae0ab7765 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -60,8 +60,7 @@ int bch2_run_thread_with_file(struct thread_with_file *thr, err: if (fd >= 0) put_unused_fd(fd); - if (thr->task) - kthread_stop(thr->task); + kthread_stop(thr->task); return ret; } @@ -185,23 
+184,23 @@ static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubu break; } - spin_lock(&buf->lock); - size_t makeroom = b; - if (!buf->waiting_for_line || memchr(buf->buf.data, '\n', buf->buf.nr)) - makeroom = min_t(ssize_t, makeroom, - max_t(ssize_t, STDIO_REDIRECT_BUFSIZE - buf->buf.nr, - 0)); - darray_make_room_gfp(&buf->buf, makeroom, GFP_NOWAIT); - - b = min(len, darray_room(buf->buf)); - - if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) { - buf->buf.nr += b; - ubuf += b; - len -= b; - copied += b; + scoped_guard(spinlock, &buf->lock) { + size_t makeroom = b; + if (!buf->waiting_for_line || memchr(buf->buf.data, '\n', buf->buf.nr)) + makeroom = min_t(ssize_t, makeroom, + max_t(ssize_t, STDIO_REDIRECT_BUFSIZE - buf->buf.nr, + 0)); + darray_make_room_gfp(&buf->buf, makeroom, GFP_NOWAIT); + + b = min(len, darray_room(buf->buf)); + + if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) { + buf->buf.nr += b; + ubuf += b; + len -= b; + copied += b; + } } - spin_unlock(&buf->lock); if (b) { wake_up(&buf->wait); @@ -349,14 +348,15 @@ int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t le if (stdio->done) return -1; - spin_lock(&buf->lock); - int ret = min(len, buf->buf.nr); - buf->buf.nr -= ret; - memcpy(ubuf, buf->buf.data, ret); - memmove(buf->buf.data, - buf->buf.data + ret, - buf->buf.nr); - spin_unlock(&buf->lock); + int ret; + scoped_guard(spinlock, &buf->lock) { + ret = min(len, buf->buf.nr); + buf->buf.nr -= ret; + memcpy(ubuf, buf->buf.data, ret); + memmove(buf->buf.data, + buf->buf.data + ret, + buf->buf.nr); + } wake_up(&buf->wait); return ret; diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c index 2c34fe4be912..7b5fa44807d7 100644 --- a/fs/bcachefs/time_stats.c +++ b/fs/bcachefs/time_stats.c @@ -138,10 +138,8 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) GFP_ATOMIC); spin_unlock_irqrestore(&stats->lock, flags); } else { - struct time_stat_buffer *b; - - preempt_disable(); - b = this_cpu_ptr(stats->buffer); + guard(preempt)(); + struct time_stat_buffer *b = this_cpu_ptr(stats->buffer); BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); b->entries[b->nr++] = (struct time_stat_buffer_entry) { @@ -151,7 +149,6 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) if (unlikely(b->nr == ARRAY_SIZE(b->entries))) time_stats_clear_buffer(stats, b); - preempt_enable(); } } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 519d00d62ae7..3776a1403104 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -92,58 +92,6 @@ DECLARE_EVENT_CLASS(trans_str_nocaller, __entry->trans_fn, __get_str(str)) ); -DECLARE_EVENT_CLASS(btree_node_nofs, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u8, level ) - __field(u8, btree_id ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->level = b->c.level; - __entry->btree_id = b->c.btree_id; - TRACE_BPOS_assign(pos, b->key.k.p); - ), - - TP_printk("%d,%d %u %s %llu:%llu:%u", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->level, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) -); - -DECLARE_EVENT_CLASS(btree_node, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __array(char, trans_fn, 32 ) - __field(u8, level ) - __field(u8, btree_id ) - 
TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - __entry->dev = trans->c->dev; - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->level = b->c.level; - __entry->btree_id = b->c.btree_id; - TRACE_BPOS_assign(pos, b->key.k.p); - ), - - TP_printk("%d,%d %s %u %s %llu:%llu:%u", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn, - __entry->level, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) -); - DECLARE_EVENT_CLASS(bch_fs, TP_PROTO(struct bch_fs *c), TP_ARGS(c), @@ -199,6 +147,50 @@ DECLARE_EVENT_CLASS(bio, (unsigned long long)__entry->sector, __entry->nr_sector) ); +/* errors */ + +TRACE_EVENT(error_throw, + TP_PROTO(struct bch_fs *c, int bch_err, unsigned long ip), + TP_ARGS(c, bch_err, ip), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(int, err ) + __array(char, err_str, 32 ) + __array(char, ip, 32 ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->err = bch_err; + strscpy(__entry->err_str, bch2_err_str(bch_err), sizeof(__entry->err_str)); + snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); + ), + + TP_printk("%d,%d %s ret %s", MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ip, __entry->err_str) +); + +TRACE_EVENT(error_downcast, + TP_PROTO(int bch_err, int std_err, unsigned long ip), + TP_ARGS(bch_err, std_err, ip), + + TP_STRUCT__entry( + __array(char, bch_err, 32 ) + __array(char, std_err, 32 ) + __array(char, ip, 32 ) + ), + + TP_fast_assign( + strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err)); + strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err)); + snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); + ), + + TP_printk("%s ret %s -> %s %s", __entry->ip, + __entry->bch_err, __entry->std_err, __entry->ip) +); + /* disk_accounting.c */ TRACE_EVENT(accounting_mem_insert, @@ -300,23 +292,9 @@ DEFINE_EVENT(bio, io_read_promote, TP_ARGS(bio) ); -TRACE_EVENT(io_read_nopromote, - TP_PROTO(struct bch_fs *c, int ret), - TP_ARGS(c, ret), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __array(char, ret, 32 ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret)); - ), - - TP_printk("%d,%d ret %s", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ret) +DEFINE_EVENT(fs_str, io_read_nopromote, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(bio, io_read_bounce, @@ -339,6 +317,11 @@ DEFINE_EVENT(bio, io_read_reuse_race, TP_ARGS(bio) ); +DEFINE_EVENT(bio, io_read_fail_and_poison, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + /* ec.c */ TRACE_EVENT(stripe_create, @@ -478,9 +461,9 @@ TRACE_EVENT(btree_cache_scan, __entry->nr_to_scan, __entry->can_free, __entry->ret) ); -DEFINE_EVENT(btree_node_nofs, btree_cache_reap, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) +DEFINE_EVENT(fs_str, btree_cache_reap, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock_fail, @@ -505,39 +488,24 @@ DEFINE_EVENT(btree_trans, btree_cache_cannibalize_unlock, /* Btree */ -DEFINE_EVENT(btree_node, btree_node_read, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) +DEFINE_EVENT(fs_str, btree_node_read, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -TRACE_EVENT(btree_node_write, - TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), - TP_ARGS(b, bytes, sectors), - - TP_STRUCT__entry( - 
__field(enum btree_node_type, type) - __field(unsigned, bytes ) - __field(unsigned, sectors ) - ), - - TP_fast_assign( - __entry->type = btree_node_type(b); - __entry->bytes = bytes; - __entry->sectors = sectors; - ), - - TP_printk("bkey type %u bytes %u sectors %u", - __entry->type , __entry->bytes, __entry->sectors) +DEFINE_EVENT(fs_str, btree_node_write, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(btree_node, btree_node_alloc, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) +DEFINE_EVENT(fs_str, btree_node_alloc, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(btree_node, btree_node_free, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) +DEFINE_EVENT(fs_str, btree_node_free, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); TRACE_EVENT(btree_reserve_get_fail, @@ -568,29 +536,29 @@ TRACE_EVENT(btree_reserve_get_fail, __entry->ret) ); -DEFINE_EVENT(btree_node, btree_node_compact, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) +DEFINE_EVENT(fs_str, btree_node_set_root, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(btree_node, btree_node_merge, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) +DEFINE_EVENT(fs_str, btree_node_rewrite, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(btree_node, btree_node_split, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) +DEFINE_EVENT(fs_str, btree_node_merge, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(btree_node, btree_node_rewrite, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) +DEFINE_EVENT(fs_str, btree_node_compact, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(btree_node, btree_node_set_root, - TP_PROTO(struct btree_trans *trans, struct btree *b), - TP_ARGS(trans, b) +DEFINE_EVENT(fs_str, btree_node_split, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); TRACE_EVENT(btree_path_relock_fail, @@ -1031,34 +999,14 @@ TRACE_EVENT(trans_blocked_journal_reclaim, __entry->must_wait) ); -TRACE_EVENT(trans_restart_journal_preres_get, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - unsigned flags), - TP_ARGS(trans, caller_ip, flags), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(unsigned, flags ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->flags = flags; - ), - - TP_printk("%s %pS %x", __entry->trans_fn, - (void *) __entry->caller_ip, - __entry->flags) -); - +#if 0 +/* todo: bring back dynamic fault injection */ DEFINE_EVENT(transaction_event, trans_restart_fault_inject, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), TP_ARGS(trans, caller_ip) ); +#endif DEFINE_EVENT(transaction_event, trans_traverse_all, TP_PROTO(struct btree_trans *trans, @@ -1122,51 +1070,9 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, TP_ARGS(trans, caller_ip, path) ); -TRACE_EVENT(trans_restart_upgrade, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path, - unsigned old_locks_want, - unsigned new_locks_want, - struct get_locks_fail *f), - TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f), - - TP_STRUCT__entry( - 
__array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - __field(u8, old_locks_want ) - __field(u8, new_locks_want ) - __field(u8, level ) - __field(u32, path_seq ) - __field(u32, node_seq ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->btree_id = path->btree_id; - __entry->old_locks_want = old_locks_want; - __entry->new_locks_want = new_locks_want; - __entry->level = f->l; - __entry->path_seq = path->l[f->l].lock_seq; - __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq; - TRACE_BPOS_assign(pos, path->pos) - ), - - TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u", - __entry->trans_fn, - (void *) __entry->caller_ip, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->old_locks_want, - __entry->new_locks_want, - __entry->level, - __entry->path_seq, - __entry->node_seq) +DEFINE_EVENT(fs_str, trans_restart_upgrade, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(trans_str, trans_restart_relock, @@ -1188,19 +1094,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill, TP_ARGS(trans, caller_ip, path) ); -DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); - -DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); - DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, @@ -1222,13 +1115,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent, TP_ARGS(trans, caller_ip, path) ); -DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); - DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, @@ -1287,44 +1173,6 @@ TRACE_EVENT(trans_restart_mem_realloced, __entry->bytes) ); -TRACE_EVENT(trans_restart_key_cache_key_realloced, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path, - unsigned old_u64s, - unsigned new_u64s), - TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(enum btree_id, btree_id ) - TRACE_BPOS_entries(pos) - __field(u32, old_u64s ) - __field(u32, new_u64s ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - - __entry->btree_id = path->btree_id; - TRACE_BPOS_assign(pos, path->pos); - __entry->old_u64s = old_u64s; - __entry->new_u64s = new_u64s; - ), - - TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u", - __entry->trans_fn, - (void *) __entry->caller_ip, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->old_u64s, - __entry->new_u64s) -); - DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), @@ 
-1468,23 +1316,49 @@ DEFINE_EVENT(fs_str, data_update, TP_ARGS(c, str) ); -TRACE_EVENT(error_downcast, - TP_PROTO(int bch_err, int std_err, unsigned long ip), - TP_ARGS(bch_err, std_err, ip), +DEFINE_EVENT(fs_str, data_update_done_no_rw_devs, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); - TP_STRUCT__entry( - __array(char, bch_err, 32 ) - __array(char, std_err, 32 ) - __array(char, ip, 32 ) - ), +DEFINE_EVENT(fs_str, io_move_pred, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); - TP_fast_assign( - strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err)); - strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err)); - snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); - ), +DEFINE_EVENT(fs_str, io_move_created_rebalance, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); - TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip) +DEFINE_EVENT(fs_str, io_move_evacuate_bucket, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + +DEFINE_EVENT(fs_str, extent_trim_atomic, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + +DEFINE_EVENT(fs_str, btree_iter_peek_slot, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + +DEFINE_EVENT(fs_str, __btree_iter_peek, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + +DEFINE_EVENT(fs_str, btree_iter_peek_max, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + +DEFINE_EVENT(fs_str, btree_iter_peek_prev_min, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); #ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS @@ -1899,21 +1773,6 @@ TRACE_EVENT(btree_path_free, __entry->dup_locked) ); -TRACE_EVENT(btree_path_free_trans_begin, - TP_PROTO(btree_path_idx_t path), - TP_ARGS(path), - - TP_STRUCT__entry( - __field(btree_path_idx_t, idx ) - ), - - TP_fast_assign( - __entry->idx = path; - ), - - TP_printk(" path %3u", __entry->idx) -); - #else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ #ifndef _TRACE_BCACHEFS_H @@ -1931,7 +1790,6 @@ static inline void trace_btree_path_traverse_start(struct btree_trans *trans, st static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {} static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {} static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {} -static inline void trace_btree_path_free_trans_begin(btree_path_idx_t path) {} #endif #endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 87af551692f4..2ded7f3c835f 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -252,8 +252,17 @@ void bch2_prt_u64_base2(struct printbuf *out, u64 v) bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1); } -static void __bch2_print_string_as_lines(const char *prefix, const char *lines, - bool nonblocking) +static bool string_is_spaces(const char *str) +{ + while (*str) { + if (*str != ' ') + return false; + str++; + } + return true; +} + +void bch2_print_string_as_lines(const char *prefix, const char *lines) { bool locked = false; const char *p; @@ -263,15 +272,13 @@ static void __bch2_print_string_as_lines(const char *prefix, const char *lines, return; } - if (!nonblocking) { - console_lock(); - locked = true; - } else { - locked = console_trylock(); - } + locked = console_trylock(); while (*lines) { p = 
strchrnul(lines, '\n'); + if (!*p && string_is_spaces(lines)) + break; + printk("%s%.*s\n", prefix, (int) (p - lines), lines); if (!*p) break; @@ -281,16 +288,6 @@ static void __bch2_print_string_as_lines(const char *prefix, const char *lines, console_unlock(); } -void bch2_print_string_as_lines(const char *prefix, const char *lines) -{ - return __bch2_print_string_as_lines(prefix, lines, false); -} - -void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines) -{ - return __bch2_print_string_as_lines(prefix, lines, true); -} - int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr, gfp_t gfp) { @@ -302,17 +299,12 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigne if (ret) return ret; - if (!down_read_trylock(&task->signal->exec_update_lock)) - return -1; - do { nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1); } while (nr_entries == stack->size && !(ret = darray_make_room_gfp(stack, stack->size * 2, gfp))); stack->nr = nr_entries; - up_read(&task->signal->exec_update_lock); - return ret; #else return 0; @@ -329,11 +321,10 @@ void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack) int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp) { - bch_stacktrace stack = { 0 }; + CLASS(bch_stacktrace, stack)(); int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp); bch2_prt_backtrace(out, &stack); - darray_exit(&stack); return ret; } @@ -620,17 +611,10 @@ void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_contro void bch2_bio_map(struct bio *bio, void *base, size_t size) { - while (size) { - struct page *page = is_vmalloc_addr(base) - ? vmalloc_to_page(base) - : virt_to_page(base); - unsigned offset = offset_in_page(base); - unsigned len = min_t(size_t, PAGE_SIZE - offset, size); - - BUG_ON(!bio_add_page(bio, page, len, offset)); - size -= len; - base += len; - } + if (is_vmalloc_addr(base)) + bio_add_vmalloc(bio, base, size); + else + bio_add_virt_nofail(bio, base, size); } int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) @@ -725,6 +709,16 @@ void bch2_corrupt_bio(struct bio *bio) } #endif +void bch2_bio_to_text(struct printbuf *out, struct bio *bio) +{ + prt_printf(out, "bi_remaining:\t%u\n", + atomic_read(&bio->__bi_remaining)); + prt_printf(out, "bi_end_io:\t%ps\n", + bio->bi_end_io); + prt_printf(out, "bi_status:\t%u\n", + bio->bi_status); +} + #if 0 void eytzinger1_test(void) { @@ -987,9 +981,8 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) int cpu; /* access to pcpu vars has to be blocked by other locking */ - preempt_disable(); - ret = this_cpu_ptr(p); - preempt_enable(); + scoped_guard(preempt) + ret = this_cpu_ptr(p); for_each_possible_cpu(cpu) { u64 *i = per_cpu_ptr(p, cpu); @@ -1003,14 +996,14 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) return ret; } -void bch2_darray_str_exit(darray_str *d) +void bch2_darray_str_exit(darray_const_str *d) { darray_for_each(*d, i) kfree(*i); darray_exit(d); } -int bch2_split_devs(const char *_dev_name, darray_str *ret) +int bch2_split_devs(const char *_dev_name, darray_const_str *ret) { darray_init(ret); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 3e52c7f8ddd2..31e8a4575e4b 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -55,15 +56,16 @@ static inline size_t buf_pages(void *p, size_t len) 
PAGE_SIZE); } -static inline void *bch2_kvmalloc(size_t n, gfp_t flags) +static inline void *bch2_kvmalloc_noprof(size_t n, gfp_t flags) { void *p = unlikely(n >= INT_MAX) - ? vmalloc(n) - : kvmalloc(n, flags & ~__GFP_ZERO); + ? vmalloc_noprof(n) + : kvmalloc_noprof(n, flags & ~__GFP_ZERO); if (p && (flags & __GFP_ZERO)) memset(p, 0, n); return p; } +#define bch2_kvmalloc(...) alloc_hooks(bch2_kvmalloc_noprof(__VA_ARGS__)) #define init_heap(heap, _size, gfp) \ ({ \ @@ -211,10 +213,10 @@ u64 bch2_read_flag_list(const char *, const char * const[]); void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); void bch2_prt_u64_base2(struct printbuf *, u64); -void bch2_print_string_as_lines(const char *prefix, const char *lines); -void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines); +void bch2_print_string_as_lines(const char *, const char *); + +DEFINE_DARRAY_NAMED(bch_stacktrace, unsigned long); -typedef DARRAY(unsigned long) bch_stacktrace; int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t); void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *); int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t); @@ -419,6 +421,8 @@ static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio) #define bch2_maybe_corrupt_bio(...) do {} while (0) #endif +void bch2_bio_to_text(struct printbuf *, struct bio *); + static inline void memcpy_u64s_small(void *dst, const void *src, unsigned u64s) { @@ -688,8 +692,8 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r) return l.len == r.len && !memcmp(l.name, r.name, l.len); } -void bch2_darray_str_exit(darray_str *); -int bch2_split_devs(const char *, darray_str *); +void bch2_darray_str_exit(darray_const_str *); +int bch2_split_devs(const char *, darray_const_str *); #ifdef __KERNEL__ @@ -730,6 +734,13 @@ static inline bool test_bit_le64(size_t bit, __le64 *addr) return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0; } +static inline bool __test_and_set_bit_le64(size_t bit, __le64 *addr) +{ + bool ret = test_bit_le64(bit, addr); + __set_bit_le64(bit, addr); + return ret; +} + static inline void memcpy_swab(void *_dst, void *_src, size_t len) { u8 *dst = _dst + len; diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index e6be32003f3b..6094b568dd33 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -38,7 +38,7 @@ static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); return bch2_xattr_hash(info, - &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); + &X_SEARCH(x.v->x_type, x.v->x_name_and_value, x.v->x_name_len)); } static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) @@ -48,7 +48,7 @@ static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) return l.v->x_type != r->type || l.v->x_name_len != r->name.len || - memcmp(l.v->x_name, r->name.name, r->name.len); + memcmp(l.v->x_name_and_value, r->name.name, r->name.len); } static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) @@ -58,7 +58,7 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) return l.v->x_type != r.v->x_type || l.v->x_name_len != r.v->x_name_len || - memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); + memcmp(l.v->x_name_and_value, r.v->x_name_and_value, r.v->x_name_len); } const struct bch_hash_desc bch2_xattr_hash_desc = { @@ -96,7 +96,7 @@ int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k, c, 
xattr_invalid_type, "invalid type (%u)", xattr.v->x_type); - bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), + bkey_fsck_err_on(memchr(xattr.v->x_name_and_value, '\0', xattr.v->x_name_len), c, xattr_name_invalid_chars, "xattr name has invalid characters"); fsck_err: @@ -120,13 +120,13 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, unsigned name_len = xattr.v->x_name_len; unsigned val_len = le16_to_cpu(xattr.v->x_val_len); unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) - - offsetof(struct bch_xattr, x_name); + offsetof(struct bch_xattr, x_name_and_value); val_len = min_t(int, val_len, max_name_val_bytes - name_len); name_len = min(name_len, max_name_val_bytes); prt_printf(out, "%.*s:%.*s", - name_len, xattr.v->x_name, + name_len, xattr.v->x_name_and_value, val_len, (char *) xattr_val(xattr.v)); if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS || @@ -157,7 +157,7 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info else memcpy(buffer, xattr_val(xattr.v), ret); } - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_exit(&iter); return ret; } @@ -168,7 +168,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, int type, int flags) { struct bch_fs *c = trans->c; - struct btree_iter inode_iter = {}; + struct btree_iter inode_iter = { NULL }; int ret; ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?: @@ -176,10 +176,15 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, if (ret) return ret; + /* + * Besides the ctime update, updates to extents, dirents and xattrs also + * require an inode update - this ensures that if a key exists in one of + * those btrees with a given snapshot ID, an inode is present as well + */ inode_u->bi_ctime = bch2_current_time(c); ret = bch2_inode_write(trans, &inode_iter, inode_u); - bch2_trans_iter_exit(trans, &inode_iter); + bch2_trans_iter_exit(&inode_iter); if (ret) return ret; @@ -202,7 +207,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, xattr->v.x_type = type; xattr->v.x_name_len = namelen; xattr->v.x_val_len = cpu_to_le16(size); - memcpy(xattr->v.x_name, name, namelen); + memcpy(xattr->v.x_name_and_value, name, namelen); memcpy(xattr_val(&xattr->v), value, size); ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, @@ -270,7 +275,7 @@ static int bch2_xattr_emit(struct dentry *dentry, if (!prefix) return 0; - return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf); + return __bch2_xattr_emit(prefix, xattr->x_name_and_value, xattr->x_name_len, buf); } static int bch2_xattr_list_bcachefs(struct bch_fs *c, @@ -308,8 +313,8 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; u64 offset = 0, inum = inode->ei_inode.bi_inum; - int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_xattrs, + CLASS(btree_trans, trans)(c); + int ret = for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_xattrs, POS(inum, offset), POS(inum, U64_MAX), inode->ei_inum.subvol, 0, k, ({ @@ -317,7 +322,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) continue; bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); - }))) ?: + })) ?: bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false) ?: bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); @@ -330,9 +335,10 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler, { struct 
bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret = bch2_trans_do(c, - bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags)); + CLASS(btree_trans, trans)(c); + int ret = lockrestart_do(trans, + bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags)); if (ret < 0 && bch2_err_matches(ret, ENOENT)) ret = -ENODATA; @@ -351,12 +357,12 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_inode_unpacked inode_u; int ret; - ret = bch2_trans_run(c, - commit_do(trans, NULL, NULL, 0, + CLASS(btree_trans, trans)(c); + ret = commit_do(trans, NULL, NULL, 0, bch2_xattr_set(trans, inode_inum(inode), &inode_u, &hash, name, value, size, handler->flags, flags)) ?: - (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0)); + (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0); return bch2_err_class(ret); } @@ -413,7 +419,6 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, bch2_inode_opts_to_opts(&inode->ei_inode); const struct bch_option *opt; int id, inode_opt_id; - struct printbuf out = PRINTBUF; int ret; u64 v; @@ -434,6 +439,7 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id))) return -ENODATA; + CLASS(printbuf, out)(); v = bch2_opt_get_by_id(&opts, id); bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0); @@ -448,7 +454,6 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, memcpy(buffer, out.buf, out.pos); } - printbuf_exit(&out); return ret; } @@ -527,11 +532,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, kfree(buf); if (ret < 0) - goto err_class_exit; + goto err; - ret = bch2_opt_check_may_set(c, NULL, opt_id, v); + ret = bch2_opt_hook_pre_set(c, NULL, opt_id, v); if (ret < 0) - goto err_class_exit; + goto err; s.v = v + 1; s.defined = true; @@ -543,7 +548,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, * rename() also has to deal with keeping inherited options up * to date - see bch2_reinherit_attrs() */ - spin_lock(&dentry->d_lock); + guard(spinlock)(&dentry->d_lock); if (!IS_ROOT(dentry)) { struct bch_inode_info *dir = to_bch_ei(d_inode(dentry->d_parent)); @@ -552,26 +557,24 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, } else { s.v = 0; } - spin_unlock(&dentry->d_lock); s.defined = false; } - mutex_lock(&inode->ei_update_lock); - if (inode_opt_id == Inode_opt_project) { - /* - * inode fields accessible via the xattr interface are stored - * with a +1 bias, so that 0 means unset: - */ - ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0); - if (ret) - goto err; - } + scoped_guard(mutex, &inode->ei_update_lock) { + if (inode_opt_id == Inode_opt_project) { + /* + * inode fields accessible via the xattr interface are stored + * with a +1 bias, so that 0 means unset: + */ + ret = bch2_set_projid(c, inode, s.v ? 
s.v - 1 : 0); + if (ret) + goto err; + } - ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); + ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); + } err: - mutex_unlock(&inode->ei_update_lock); -err_class_exit: return bch2_err_class(ret); } diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 132fbbd15a66..1139bf345f70 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -18,12 +18,12 @@ void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) { - return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + + return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name_and_value) + name_len + val_len, sizeof(u64)); } #define xattr_val(_xattr) \ - ((void *) (_xattr)->x_name + (_xattr)->x_name_len) + ((void *) (_xattr)->x_name_and_value + (_xattr)->x_name_len) struct xattr_search_key { u8 type; diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h index 67426e33d04e..4121b78d9a92 100644 --- a/fs/bcachefs/xattr_format.h +++ b/fs/bcachefs/xattr_format.h @@ -16,10 +16,10 @@ struct bch_xattr { /* * x_name contains the name and value counted by * x_name_len + x_val_len. The introduction of - * __counted_by(x_name_len) caused a false positive + * __counted_by(x_name_len) previously caused a false positive * detection of an out of bounds write. */ - __u8 x_name[]; + __u8 x_name_and_value[]; } __packed __aligned(8); #endif /* _BCACHEFS_XATTR_FORMAT_H */ -- 2.49.1
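A recurring pattern in the hunks above is the replacement of manually paired resource management - bch2_trans_get()/bch2_trans_put(), bch2_trans_iter_init()/bch2_trans_iter_exit(), PRINTBUF/printbuf_exit(), spin_lock()/spin_unlock(), preempt_disable()/preempt_enable() - with the scope-based cleanup helpers from <linux/cleanup.h>: CLASS(), guard() and scoped_guard(). The destructor runs automatically on every exit path, which is why so many break/goto/manual-put sequences collapse into plain returns. A minimal sketch of the mechanism, using a toy resource rather than the real btree_trans class (whose DEFINE_CLASS() lives in the bcachefs headers and is not shown in this patch):

	#include <linux/cleanup.h>
	#include <linux/slab.h>

	struct toy_fs { int dummy; };
	struct toy_trans { struct toy_fs *fs; };

	static inline struct toy_trans *toy_trans_get(struct toy_fs *fs)
	{
		struct toy_trans *t = kzalloc(sizeof(*t), GFP_KERNEL);

		if (t)
			t->fs = fs;
		return t;
	}

	static inline void toy_trans_put(struct toy_trans *t)
	{
		kfree(t);
	}

	/* DEFINE_CLASS(name, type, destructor-expr, constructor-expr, ctor args) */
	DEFINE_CLASS(toy_trans, struct toy_trans *,
		     toy_trans_put(_T), toy_trans_get(fs), struct toy_fs *fs)

	static int toy_example(struct toy_fs *fs)
	{
		CLASS(toy_trans, trans)(fs);	/* toy_trans_put(trans) runs at scope exit */

		if (!trans)
			return -ENOMEM;		/* early return still releases trans */

		/* ... use trans; no explicit put on any return path ... */
		return 0;
	}

The scoped_guard(spinlock, &lock) { ... }, guard(spinlock)(...) and guard(preempt)() conversions in thread_with_file.c, time_stats.c, util.c and xattr.c are the same idea applied to locks: the unlock is emitted automatically at the end of the block or enclosing scope.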
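The trace.h changes retire several bespoke event classes (btree_node, btree_node_nofs, the large trans_restart_upgrade and trans_restart_key_cache_key_realloced events) in favour of the existing fs_str class, which records only a device number plus a caller-formatted string, so the formatting cost moves to the call site. A hedged sketch of the calling convention this implies - trace_btree_node_read() and its _enabled() companion are generated by the DEFINE_EVENT(fs_str, btree_node_read, ...) above, while bch2_btree_pos_to_text() and the printbuf CLASS are assumptions taken from elsewhere in the series:

	static void example_trace_node_read(struct bch_fs *c, struct btree *b)
	{
		/* every tracepoint gets a trace_<name>_enabled() predicate;
		 * skip the formatting entirely when nobody is listening */
		if (trace_btree_node_read_enabled()) {
			CLASS(printbuf, buf)();		/* freed at scope exit */

			bch2_btree_pos_to_text(&buf, c, b);
			trace_btree_node_read(c, buf.buf);
		}
	}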
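The x_name -> x_name_and_value rename in xattr.c/xattr.h/xattr_format.h changes no on-disk bytes; it documents that the flexible array holds x_name_len name bytes immediately followed by x_val_len value bytes, which is why xattr_val() is a pointer bump past the name and xattr_val_u64s() rounds the combined size up to u64 granularity. A standalone userspace model of that layout (demo types only; the real structure is the packed struct bch_xattr in xattr_format.h):

	#include <stddef.h>
	#include <stdint.h>

	struct demo_xattr {
		uint8_t		x_type;
		uint8_t		x_name_len;
		uint16_t	x_val_len;		/* __le16 on disk */
		uint8_t		x_name_and_value[];	/* name bytes, then value bytes */
	};

	/* mirrors xattr_val(): the value starts right after the name */
	static inline void *demo_xattr_val(struct demo_xattr *x)
	{
		return x->x_name_and_value + x->x_name_len;
	}

	/* mirrors xattr_val_u64s(): size of the whole key value in u64 words */
	static inline size_t demo_xattr_val_u64s(unsigned name_len, unsigned val_len)
	{
		size_t bytes = offsetof(struct demo_xattr, x_name_and_value) +
			       name_len + val_len;

		return (bytes + sizeof(uint64_t) - 1) / sizeof(uint64_t);
	}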
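util.h also gains __test_and_set_bit_le64(), composed from the test_bit_le64()/__set_bit_le64() helpers visible in the surrounding context. The double underscore marks it non-atomic, as elsewhere in the kernel bitmap API: the caller is expected to provide its own exclusion. A userspace model of the semantics, with cpu_to_le64() stubbed out for a little-endian host and the reserved double-underscore prefix dropped (both assumptions of this demo):

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>

	#define cpu_to_le64(x)	(x)	/* little-endian host assumed */

	static bool test_bit_le64(size_t bit, const uint64_t *addr)
	{
		return (addr[bit / 64] & cpu_to_le64(1ULL << (bit % 64))) != 0;
	}

	static void set_bit_le64(size_t bit, uint64_t *addr)
	{
		addr[bit / 64] |= cpu_to_le64(1ULL << (bit % 64));
	}

	/* returns the previous bit value, like test_and_set_bit() */
	static bool test_and_set_bit_le64(size_t bit, uint64_t *addr)
	{
		bool ret = test_bit_le64(bit, addr);

		set_bit_le64(bit, addr);
		return ret;
	}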