From a053ebfb8c89e023a44c365e369f4053cfc53376 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 23 Sep 2023 18:42:30 -0400
Subject: [PATCH] Update bcachefs sources to f9c612bbf82d bcachefs: Fixes for building in userspace

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 .bcachefs_revision | 2 +-
 Makefile.compiler | 8 +-
 cmd_dump.c | 10 +-
 cmd_kill_btree_node.c | 10 +-
 cmd_migrate.c | 13 +-
 include/linux/blkdev.h | 49 +-
 include/linux/compiler.h | 1 +
 include/linux/rcupdate.h | 2 +-
 libbcachefs/acl.c | 103 +-
 libbcachefs/acl.h | 6 +-
 libbcachefs/alloc_background.c | 161 +-
 libbcachefs/alloc_foreground.c | 19 +-
 libbcachefs/backpointers.c | 31 +-
 libbcachefs/bcachefs.h | 9 +-
 libbcachefs/bcachefs_format.h | 121 +-
 libbcachefs/bkey.c | 33 +-
 libbcachefs/bkey.h | 6 +-
 libbcachefs/bkey_methods.c | 17 +-
 libbcachefs/bkey_sort.h | 16 +-
 libbcachefs/bset.c | 27 +-
 libbcachefs/btree_cache.c | 37 +-
 libbcachefs/btree_gc.c | 141 +-
 libbcachefs/btree_io.c | 80 +-
 libbcachefs/btree_io.h | 2 +-
 libbcachefs/btree_iter.c | 135 +-
 libbcachefs/btree_iter.h | 83 +-
 libbcachefs/btree_key_cache.c | 28 +-
 libbcachefs/btree_locking.h | 2 +-
 libbcachefs/btree_trans_commit.c | 22 +-
 libbcachefs/btree_types.h | 38 +-
 libbcachefs/btree_update.c | 125 +-
 libbcachefs/btree_update.h | 35 +-
 libbcachefs/btree_update_interior.c | 112 +-
 libbcachefs/btree_write_buffer.c | 2 +-
 libbcachefs/buckets.c | 12 +-
 libbcachefs/buckets.h | 35 +-
 libbcachefs/buckets_waiting_for_journal.c | 2 +-
 libbcachefs/chardev.c | 55 +-
 libbcachefs/checksum.c | 27 +-
 libbcachefs/checksum.h | 7 +-
 libbcachefs/compress.c | 8 +-
 libbcachefs/counters.c | 2 +-
 libbcachefs/data_update.c | 8 +-
 libbcachefs/data_update.h | 2 +-
 libbcachefs/debug.c | 57 +-
 libbcachefs/dirent.c | 31 +-
 libbcachefs/disk_groups.c | 15 +-
 libbcachefs/ec.c | 40 +-
 libbcachefs/ec.h | 2 +-
 libbcachefs/errcode.c | 9 +-
 libbcachefs/errcode.h | 14 +-
 libbcachefs/error.c | 1 -
 libbcachefs/fs-io-buffered.c | 97 +-
 libbcachefs/fs-io-direct.c | 18 +-
 libbcachefs/fs-io-pagecache.c | 37 +-
 libbcachefs/fs-io.c | 303 +---
 libbcachefs/fs-io.h | 4 +-
 libbcachefs/fs-ioctl.c | 5 +-
 libbcachefs/fs-ioctl.h | 6 +-
 libbcachefs/fs.c | 205 +--
 libbcachefs/fs.h | 2 +-
 libbcachefs/fsck.c | 186 +--
 libbcachefs/inode.c | 76 +-
 libbcachefs/inode.h | 3 +
 libbcachefs/io.h | 202 ---
 libbcachefs/io_misc.c | 497 ++++++
 libbcachefs/io_misc.h | 34 +
 libbcachefs/io_read.c | 1210 ++++++++++++++
 libbcachefs/io_read.h | 158 ++
 libbcachefs/{io.c => io_write.c} | 1448 +----------------
 libbcachefs/io_write.h | 110 ++
 libbcachefs/{io_types.h => io_write_types.h} | 75 +-
 libbcachefs/journal.c | 35 +-
 libbcachefs/journal.h | 34 +-
 libbcachefs/journal_io.c | 34 +-
 libbcachefs/journal_reclaim.c | 24 +-
 libbcachefs/journal_reclaim.h | 3 +-
 libbcachefs/journal_seq_blacklist.c | 12 +-
 libbcachefs/logged_ops.c | 110 ++
 libbcachefs/logged_ops.h | 20 +
 libbcachefs/lru.c | 4 +-
 libbcachefs/migrate.c | 29 +-
 libbcachefs/move.c | 45 +-
 libbcachefs/move.h | 1 +
 libbcachefs/movinggc.c | 51 +-
 libbcachefs/opts.c | 3 +-
 libbcachefs/opts.h | 2 +-
 libbcachefs/printbuf.c | 66 +-
 libbcachefs/quota.c | 19 +-
 libbcachefs/rebalance.c | 4 +-
 libbcachefs/recovery.c | 25 +-
 libbcachefs/recovery_types.h | 1 +
 libbcachefs/reflink.c | 44 +-
 libbcachefs/replicas.c | 2 +-
 libbcachefs/six.c | 1 -
 libbcachefs/snapshot.c | 93 +-
 libbcachefs/snapshot.h | 2 -
 libbcachefs/subvolume.c | 13 +-
 libbcachefs/subvolume.h | 2 +-
 libbcachefs/super-io.c | 32 +-
 libbcachefs/super.c | 78 +-
 libbcachefs/super_types.h | 3 +-
 libbcachefs/sysfs.c | 42 +-
 libbcachefs/tests.c | 262 ++-
 libbcachefs/trace.h | 19 +
 libbcachefs/util.c | 25 +-
 libbcachefs/util.h | 26 +-
 libbcachefs/varint.c | 24 +-
 libbcachefs/vstructs.h | 6 +-
 libbcachefs/xattr.c | 38 +-
 linux/blkdev.c | 16 +-
 rust-src/bch_bindgen/src/bkey.rs | 4 +
 rust-src/bch_bindgen/src/btree.rs | 15 +-
 .../bch_bindgen/src/libbcachefs_wrapper.h | 8 +-
 114 files changed, 4092 insertions(+), 3674 deletions(-)
 delete mode 100644 libbcachefs/io.h
 create mode 100644 libbcachefs/io_misc.c
 create mode 100644 libbcachefs/io_misc.h
 create mode 100644 libbcachefs/io_read.c
 create mode 100644 libbcachefs/io_read.h
 rename libbcachefs/{io.c => io_write.c} (53%)
 create mode 100644 libbcachefs/io_write.h
 rename libbcachefs/{io_types.h => io_write_types.h} (54%)
 create mode 100644 libbcachefs/logged_ops.c
 create mode 100644 libbcachefs/logged_ops.h

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 57c74ea2..0c7b8559 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-e7f62157681d96386dc500609149b9685358a2b0
+f9c612bbf82da87d7d4a005310c5213db00e22de

diff --git a/Makefile.compiler b/Makefile.compiler
index 7aa1fbc4..8fcb4274 100644
--- a/Makefile.compiler
+++ b/Makefile.compiler
@@ -32,13 +32,13 @@ try-run = $(shell set -e; \
 # Usage: aflags-y += $(call as-option,-Wa$(comma)-isa=foo,)
 as-option = $(call try-run,\
-	$(CC) -Werror $(KBUILD_AFLAGS) $(1) -c -x assembler-with-cpp /dev/null -o "$$TMP",$(1),$(2))
+	$(CC) -Werror $(KBUILD_CPPFLAGS) $(KBUILD_AFLAGS) $(1) -c -x assembler-with-cpp /dev/null -o "$$TMP",$(1),$(2))

 # as-instr
 # Usage: aflags-y += $(call as-instr,instr,option1,option2)
 as-instr = $(call try-run,\
-	printf "%b\n" "$(1)" | $(CC) -Werror $(KBUILD_AFLAGS) -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3))
+	printf "%b\n" "$(1)" | $(CC) -Werror $(CLANG_FLAGS) $(KBUILD_AFLAGS) -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3))

 # __cc-option
 # Usage: MY_CFLAGS += $(call __cc-option,$(CC),$(MY_CFLAGS),-march=winchip-c6,-march=i586)
@@ -72,7 +72,3 @@ clang-min-version = $(call test-ge, $(CONFIG_CLANG_VERSION), $1)
 # ld-option
 # Usage: KBUILD_LDFLAGS += $(call ld-option, -X, -Y)
 ld-option = $(call try-run, $(LD) $(KBUILD_LDFLAGS) $(1) -v,$(1),$(2),$(3))
-
-# ld-ifversion
-# Usage: $(call ld-ifversion, -ge, 22252, y)
-ld-ifversion = $(shell [ $(CONFIG_LD_VERSION)0 $(1) $(2)0 ] && echo $(3) || echo $(4))

diff --git a/cmd_dump.c b/cmd_dump.c
index bf570dc6..0d349233 100644
--- a/cmd_dump.c
+++ b/cmd_dump.c
@@ -61,13 +61,11 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd,
 	for (i = 0; i < BTREE_ID_NR; i++) {
 		const struct bch_extent_ptr *ptr;
 		struct bkey_ptrs_c ptrs;
-		struct btree_trans trans;
+		struct btree_trans *trans = bch2_trans_get(c);
 		struct btree_iter iter;
 		struct btree *b;

-		bch2_trans_init(&trans, c, 0, 0);
-
-		__for_each_btree_node(&trans, iter, i, POS_MIN, 0, 1, 0, b, ret) {
+		__for_each_btree_node(trans, iter, i, POS_MIN, 0, 1, 0, b, ret) {
 			struct btree_node_iter iter;
 			struct bkey u;
 			struct bkey_s_c k;
@@ -97,8 +95,8 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd,
 				btree_bytes(c));
 		}

-		bch2_trans_iter_exit(&trans, &iter);
-		bch2_trans_exit(&trans);
+		bch2_trans_iter_exit(trans, &iter);
+		bch2_trans_put(trans);
 	}

 	qcow2_write_image(ca->disk_sb.bdev->bd_buffered_fd, fd, &data,

diff --git a/cmd_kill_btree_node.c b/cmd_kill_btree_node.c
index e9b8265d..83389bc4 100644
--- a/cmd_kill_btree_node.c
+++ b/cmd_kill_btree_node.c
@@ -64,7 +64,7 @@ int cmd_kill_btree_node(int argc, char
*argv[]) if (IS_ERR(c)) die("error opening %s: %s", argv[0], bch2_err_str(PTR_ERR(c))); - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct btree *b; int ret; @@ -74,9 +74,7 @@ int cmd_kill_btree_node(int argc, char *argv[]) if (ret) die("error %s from posix_memalign", bch2_err_str(ret)); - bch2_trans_init(&trans, c, 0, 0); - - __for_each_btree_node(&trans, iter, btree_id, POS_MIN, 0, level, 0, b, ret) { + __for_each_btree_node(trans, iter, btree_id, POS_MIN, 0, level, 0, b, ret) { if (b->c.level != level) continue; @@ -113,8 +111,8 @@ int cmd_kill_btree_node(int argc, char *argv[]) bch_err(c, "node at specified index not found"); ret = EXIT_FAILURE; done: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); bch2_fs_stop(c); return ret; diff --git a/cmd_migrate.c b/cmd_migrate.c index 3958ba6b..85ab96c0 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -33,7 +33,7 @@ #include "libbcachefs/errcode.h" #include "libbcachefs/fs-common.h" #include "libbcachefs/inode.h" -#include "libbcachefs/io.h" +#include "libbcachefs/io_write.h" #include "libbcachefs/replicas.h" #include "libbcachefs/str_hash.h" #include "libbcachefs/super.h" @@ -126,7 +126,7 @@ static void update_inode(struct bch_fs *c, bch2_inode_pack(&packed, inode); packed.inode.k.p.snapshot = U32_MAX; ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i, - NULL, NULL, 0); + NULL, 0); if (ret) die("error updating inode: %s", bch2_err_str(ret)); } @@ -140,7 +140,7 @@ static void create_link(struct bch_fs *c, struct bch_inode_unpacked inode; int ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_link_trans(&trans, + bch2_link_trans(trans, (subvol_inum) { 1, parent->bi_inum }, &parent_u, (subvol_inum) { 1, inum }, &inode, &qstr)); if (ret) @@ -159,7 +159,7 @@ static struct bch_inode_unpacked create_file(struct bch_fs *c, bch2_inode_init_early(c, &new_inode); int ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_create_trans(&trans, + bch2_create_trans(trans, (subvol_inum) { 1, parent->bi_inum }, parent, &new_inode, &qstr, uid, gid, mode, rdev, NULL, NULL, @@ -232,7 +232,7 @@ static void copy_xattrs(struct bch_fs *c, struct bch_inode_unpacked *dst, struct bch_inode_unpacked inode_u; int ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_xattr_set(&trans, + bch2_xattr_set(trans, (subvol_inum) { 1, dst->bi_inum }, &inode_u, &hash_info, attr, val, val_size, h->flags, 0)); @@ -339,8 +339,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst, die("error reserving space in new filesystem: %s", bch2_err_str(ret)); - ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i, - &res, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i, &res, 0); if (ret) die("btree insert error %s", bch2_err_str(ret)); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7d378ab2..39143117 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -6,6 +6,8 @@ #include <linux/kobject.h> #include <linux/types.h> +#define MAX_LFS_FILESIZE ((loff_t)LLONG_MAX) + #define BIO_MAX_VECS 256U typedef unsigned fmode_t; @@ -21,30 +23,20 @@ struct user_namespace; #define MINOR(dev) ((unsigned int) ((dev) & MINORMASK)) #define MKDEV(ma,mi) (((ma) << MINORBITS) | (mi)) -/* file is open for reading */ -#define FMODE_READ ((__force fmode_t)0x1) -/* file is open for writing */ -#define FMODE_WRITE ((__force fmode_t)0x2) -/* file is seekable */ -#define FMODE_LSEEK ((__force fmode_t)0x4) -/* file can be 
accessed using pread */ -#define FMODE_PREAD ((__force fmode_t)0x8) -/* file can be accessed using pwrite */ -#define FMODE_PWRITE ((__force fmode_t)0x10) -/* File is opened for execution with sys_execve / sys_uselib */ -#define FMODE_EXEC ((__force fmode_t)0x20) -/* File is opened with O_NDELAY (only set for block devices) */ -#define FMODE_NDELAY ((__force fmode_t)0x40) -/* File is opened with O_EXCL (only set for block devices) */ -#define FMODE_EXCL ((__force fmode_t)0x80) -/* File is opened using open(.., 3, ..) and is writeable only for ioctls - (specialy hack for floppy.c) */ -#define FMODE_WRITE_IOCTL ((__force fmode_t)0x100) -/* 32bit hashes as llseek() offset (for directories) */ -#define FMODE_32BITHASH ((__force fmode_t)0x200) -/* 64bit hashes as llseek() offset (for directories) */ -#define FMODE_64BITHASH ((__force fmode_t)0x400) -#define FMODE_BUFFERED ((__force fmode_t)0x800) +typedef unsigned int __bitwise blk_mode_t; + +/* open for reading */ +#define BLK_OPEN_READ ((__force blk_mode_t)(1 << 0)) +/* open for writing */ +#define BLK_OPEN_WRITE ((__force blk_mode_t)(1 << 1)) +/* open exclusively (vs other exclusive openers */ +#define BLK_OPEN_EXCL ((__force blk_mode_t)(1 << 2)) +/* opened with O_NDELAY */ +#define BLK_OPEN_NDELAY ((__force blk_mode_t)(1 << 3)) +/* open for "writes" only for ioctls (specialy hack for floppy.c) */ +#define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4)) + +#define BLK_OPEN_BUFFERED ((__force blk_mode_t)(1 << 5)) struct inode { unsigned long i_ino; @@ -93,9 +85,14 @@ int blkdev_issue_zeroout(struct block_device *, sector_t, sector_t, gfp_t, unsig unsigned bdev_logical_block_size(struct block_device *bdev); sector_t get_capacity(struct gendisk *disk); -void blkdev_put(struct block_device *bdev, fmode_t mode); +struct blk_holder_ops { + void (*mark_dead)(struct block_device *bdev); +}; + +void blkdev_put(struct block_device *bdev, void *holder); void bdput(struct block_device *bdev); -struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder); +struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode, + void *holder, const struct blk_holder_ops *hop); int lookup_bdev(const char *path, dev_t *); struct super_block { diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 39df1f16..b9486dbe 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -65,6 +65,7 @@ #define unreachable() __builtin_unreachable() #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) #define fallthrough __attribute__((__fallthrough__)) +#define __noreturn __attribute__((__noreturn__)) #define ___PASTE(a,b) a##b #define __PASTE(a,b) ___PASTE(a,b) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index ef032531..ec5f478f 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -12,7 +12,7 @@ #define rcu_access_pointer(p) READ_ONCE(p) #define kfree_rcu(ptr, rcu_head) kfree(ptr) /* XXX */ -#define kvfree_rcu(ptr) kfree(ptr) /* XXX */ +#define kvfree_rcu_mightsleep(ptr) kfree(ptr) /* XXX */ #define RCU_INIT_POINTER(p, v) WRITE_ONCE(p, v) diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c index b1a48886..f3809897 100644 --- a/libbcachefs/acl.c +++ b/libbcachefs/acl.c @@ -1,18 +1,71 @@ // SPDX-License-Identifier: GPL-2.0 -#ifdef CONFIG_BCACHEFS_POSIX_ACL #include "bcachefs.h" -#include <linux/fs.h> +#include "acl.h" +#include "xattr.h" + #include <linux/posix_acl.h> + +static const char * const acl_types[] = { + [ACL_USER_OBJ] = "user_obj", + 
[ACL_USER] = "user", + [ACL_GROUP_OBJ] = "group_obj", + [ACL_GROUP] = "group", + [ACL_MASK] = "mask", + [ACL_OTHER] = "other", + NULL, +}; + +void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size) +{ + const void *p, *end = value + size; + + if (!value || + size < sizeof(bch_acl_header) || + ((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION)) + return; + + p = value + sizeof(bch_acl_header); + while (p < end) { + const bch_acl_entry *in = p; + unsigned tag = le16_to_cpu(in->e_tag); + + prt_str(out, acl_types[tag]); + + switch (tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + p += sizeof(bch_acl_entry_short); + break; + case ACL_USER: + prt_printf(out, " uid %u", le32_to_cpu(in->e_id)); + p += sizeof(bch_acl_entry); + break; + case ACL_GROUP: + prt_printf(out, " gid %u", le32_to_cpu(in->e_id)); + p += sizeof(bch_acl_entry); + break; + } + + prt_printf(out, " %o", le16_to_cpu(in->e_perm)); + + if (p != end) + prt_char(out, ' '); + } +} + +#ifdef CONFIG_BCACHEFS_POSIX_ACL + +#include "fs.h" + +#include <linux/fs.h> #include <linux/posix_acl_xattr.h> #include <linux/sched.h> #include <linux/slab.h> -#include "acl.h" -#include "fs.h" -#include "xattr.h" - static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) { return sizeof(bch_acl_header) + @@ -226,18 +279,16 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bkey_s_c_xattr xattr; struct posix_acl *acl = NULL; struct bkey_s_c k; int ret; - - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc, + ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, inode_inum(inode), &search, 0); if (ret) { if (!bch2_err_matches(ret, ENOENT)) @@ -253,7 +304,7 @@ retry: } xattr = bkey_s_c_to_xattr(k); - acl = bch2_acl_from_disk(&trans, xattr_val(xattr.v), + acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); if (!IS_ERR(acl)) @@ -262,8 +313,8 @@ out: if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart)) goto retry; - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return acl; } @@ -303,7 +354,7 @@ int bch2_set_acl(struct mnt_idmap *idmap, { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct posix_acl *acl; @@ -311,12 +362,11 @@ int bch2_set_acl(struct mnt_idmap *idmap, int ret; mutex_lock(&inode->ei_update_lock); - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); acl = _acl; - ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), + ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), BTREE_ITER_INTENT); if (ret) goto btree_err; @@ -329,30 +379,30 @@ retry: goto btree_err; } - ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type); + ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type); 
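
Every file touched by this update follows the same conversion seen in bch2_set_acl() here: a stack-allocated struct btree_trans set up with bch2_trans_init() and torn down with bch2_trans_exit() becomes a pointer obtained from bch2_trans_get(c) and released with bch2_trans_put(trans). A minimal sketch of the new idiom, using only functions that appear in this patch — do_btree_work() is a placeholder for whatever lookups/updates the caller performs, not a real function:

	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter = { NULL };
	int ret;
retry:
	bch2_trans_begin(trans);

	/* placeholder for the caller's work on @trans and @iter */
	ret = do_btree_work(trans, &iter) ?:
	      bch2_trans_commit(trans, NULL, NULL, 0);

	bch2_trans_iter_exit(trans, &iter);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_trans_put(trans);

As in bch2_set_acl() above, the iterator is released with bch2_trans_iter_exit() before the restart check and the final bch2_trans_put().
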
if (ret) goto btree_err; inode_u.bi_ctime = bch2_current_time(c); inode_u.bi_mode = mode; - ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, NULL, 0); + ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(trans, NULL, NULL, 0); btree_err: - bch2_trans_iter_exit(&trans, &inode_iter); + bch2_trans_iter_exit(trans, &inode_iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (unlikely(ret)) goto err; - bch2_inode_update_after_write(&trans, inode, &inode_u, + bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME|ATTR_MODE); set_cached_acl(&inode->v, type, acl); err: - bch2_trans_exit(&trans); mutex_unlock(&inode->ei_update_lock); + bch2_trans_put(trans); return ret; } @@ -367,7 +417,7 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, struct btree_iter iter; struct bkey_s_c_xattr xattr; struct bkey_i_xattr *new; - struct posix_acl *acl; + struct posix_acl *acl = NULL; struct bkey_s_c k; int ret; @@ -377,9 +427,10 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, return bch2_err_matches(ret, ENOENT) ? 0 : ret; k = bch2_btree_iter_peek_slot(&iter); - xattr = bkey_s_c_to_xattr(k); + ret = bkey_err(k); if (ret) goto err; + xattr = bkey_s_c_to_xattr(k); acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); diff --git a/libbcachefs/acl.h b/libbcachefs/acl.h index bb21d8d6..27e7eec0 100644 --- a/libbcachefs/acl.h +++ b/libbcachefs/acl.h @@ -7,8 +7,6 @@ struct bch_hash_info; struct bch_inode_info; struct posix_acl; -#ifdef CONFIG_BCACHEFS_POSIX_ACL - #define BCH_ACL_VERSION 0x0001 typedef struct { @@ -26,6 +24,10 @@ typedef struct { __le32 a_version; } bch_acl_header; +void bch2_acl_to_text(struct printbuf *, const void *, size_t); + +#ifdef CONFIG_BCACHEFS_POSIX_ACL + struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int); int bch2_set_acl_trans(struct btree_trans *, subvol_inum, diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 069d98a8..19ef7a44 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -237,13 +237,12 @@ int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, } int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) + enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) { - prt_printf(err, "bad val size (%u > %lu)", + prt_printf(err, "bad val size (%u > %zu)", alloc_v4_u64s(a.v), bkey_val_u64s(k.k)); return -BCH_ERR_invalid_bkey; } @@ -527,7 +526,7 @@ int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) { - prt_printf(err, "bad val size (%lu != %zu)", + prt_printf(err, "bad val size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); return -BCH_ERR_invalid_bkey; } @@ -549,7 +548,7 @@ void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bke int bch2_bucket_gens_init(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bch_alloc_v4 a; @@ -560,9 +559,7 @@ int bch2_bucket_gens_init(struct bch_fs *c) u8 gen; int ret; - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + 
for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { /* * Not a fsck error because this is checked/repaired by @@ -575,10 +572,10 @@ int bch2_bucket_gens_init(struct bch_fs *c) pos = alloc_gens_pos(iter.pos, &offset); if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) { - ret = commit_do(&trans, NULL, NULL, + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0)); + bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); if (ret) break; have_bucket_gens_key = false; @@ -592,15 +589,15 @@ int bch2_bucket_gens_init(struct bch_fs *c) g.v.gens[offset] = gen; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (have_bucket_gens_key && !ret) - ret = commit_do(&trans, NULL, NULL, + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0)); + bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); @@ -609,20 +606,19 @@ int bch2_bucket_gens_init(struct bch_fs *c) int bch2_alloc_read(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bch_dev *ca; int ret; down_read(&c->gc_lock); - bch2_trans_init(&trans, c, 0, 0); if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { const struct bch_bucket_gens *g; u64 b; - for_each_btree_key(&trans, iter, BTREE_ID_bucket_gens, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; @@ -646,11 +642,11 @@ int bch2_alloc_read(struct bch_fs *c) b++) *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); } else { struct bch_alloc_v4 a; - for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { /* * Not a fsck error because this is checked/repaired by @@ -663,10 +659,10 @@ int bch2_alloc_read(struct bch_fs *c) *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); up_read(&c->gc_lock); if (ret) @@ -1201,15 +1197,15 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, } if (need_update) { - struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(g)); + struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); - ret = PTR_ERR_OR_ZERO(k); + ret = PTR_ERR_OR_ZERO(u); if (ret) goto err; - memcpy(k, &g, sizeof(g)); + memcpy(u, &g, sizeof(g)); - ret = bch2_trans_update(trans, bucket_gens_iter, k, 0); + ret = bch2_trans_update(trans, bucket_gens_iter, u, 0); if (ret) goto err; } @@ -1286,7 +1282,7 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans, if (!btree_id_is_extents(iter->btree_id)) { return __bch2_check_discard_freespace_key(trans, iter); } else { - int ret; + int ret = 0; while (!bkey_eq(iter->pos, end) && !(ret = btree_trans_too_many_iters(trans) ?: @@ -1355,15 +1351,14 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, } if (need_update) { - struct bkey_i *k; + struct bkey_i *u = 
bch2_trans_kmalloc(trans, sizeof(g)); - k = bch2_trans_kmalloc(trans, sizeof(g)); - ret = PTR_ERR_OR_ZERO(k); + ret = PTR_ERR_OR_ZERO(u); if (ret) goto out; - memcpy(k, &g, sizeof(g)); - ret = bch2_trans_update(trans, iter, k, 0); + memcpy(u, &g, sizeof(g)); + ret = bch2_trans_update(trans, iter, u, 0); } out: fsck_err: @@ -1373,27 +1368,25 @@ fsck_err: int bch2_check_alloc_info(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; struct bkey hole; struct bkey_s_c k; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH); - bch2_trans_iter_init(&trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, + bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_PREFETCH); - bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, + bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, BTREE_ITER_PREFETCH); - bch2_trans_iter_init(&trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, + bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_PREFETCH); while (1) { struct bpos next; - bch2_trans_begin(&trans); + bch2_trans_begin(trans); k = bch2_get_key_or_real_bucket_hole(&iter, &hole); ret = bkey_err(k); @@ -1406,7 +1399,7 @@ int bch2_check_alloc_info(struct bch_fs *c) if (k.k->type) { next = bpos_nosnap_successor(k.k->p); - ret = bch2_check_alloc_key(&trans, + ret = bch2_check_alloc_key(trans, k, &iter, &discard_iter, &freespace_iter, @@ -1416,11 +1409,11 @@ int bch2_check_alloc_info(struct bch_fs *c) } else { next = k.k->p; - ret = bch2_check_alloc_hole_freespace(&trans, + ret = bch2_check_alloc_hole_freespace(trans, bkey_start_pos(k.k), &next, &freespace_iter) ?: - bch2_check_alloc_hole_bucket_gens(&trans, + bch2_check_alloc_hole_bucket_gens(trans, bkey_start_pos(k.k), &next, &bucket_gens_iter); @@ -1428,7 +1421,7 @@ int bch2_check_alloc_info(struct bch_fs *c) goto bkey_err; } - ret = bch2_trans_commit(&trans, NULL, NULL, + ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW); if (ret) @@ -1441,29 +1434,29 @@ bkey_err: if (ret) break; } - bch2_trans_iter_exit(&trans, &bucket_gens_iter); - bch2_trans_iter_exit(&trans, &freespace_iter); - bch2_trans_iter_exit(&trans, &discard_iter); - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &bucket_gens_iter); + bch2_trans_iter_exit(trans, &freespace_iter); + bch2_trans_iter_exit(trans, &discard_iter); + bch2_trans_iter_exit(trans, &iter); if (ret < 0) goto err; - ret = for_each_btree_key2(&trans, iter, + ret = for_each_btree_key2(trans, iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_PREFETCH, k, - bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?: - for_each_btree_key2(&trans, iter, + bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?: + for_each_btree_key2(trans, iter, BTREE_ID_freespace, POS_MIN, BTREE_ITER_PREFETCH, k, - bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?: - for_each_btree_key_commit(&trans, iter, + bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?: + for_each_btree_key_commit(trans, iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_bucket_gens_key(&trans, &iter, k)); + bch2_check_bucket_gens_key(trans, &iter, k)); 
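
Alongside the trans-pointer conversion, two helper changes run through these hunks: __bch2_btree_insert() is renamed to bch2_btree_insert_trans(), and the non-transactional bch2_btree_insert() drops one argument (the journal-seq pointer in the old six-argument form, as in the cmd_migrate.c hunks earlier). A sketch of the renamed helper under commit_do(), with the btree id and key taken from the bch2_bucket_gens_init() call site above:

	ret = commit_do(trans, NULL, NULL,
			BTREE_INSERT_NOFAIL|
			BTREE_INSERT_LAZY_RW,
			bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));

As used throughout this patch, commit_do() evaluates the update expression and then commits, retrying the whole sequence if the transaction is restarted.
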
err: - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); return ret; @@ -1549,10 +1542,10 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) int ret = 0; ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_alloc_to_lru_ref(&trans, &iter))); + bch2_check_alloc_to_lru_ref(trans, &iter))); if (ret) bch_err_fn(c, ret); return ret; @@ -1677,29 +1670,25 @@ out: static void bch2_do_discards_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, discard_work); - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; struct bpos discard_pos_done = POS_MAX; int ret; - bch2_trans_init(&trans, c, 0, 0); - /* * We're doing the commit in bch2_discard_one_bucket instead of using * for_each_btree_key_commit() so that we can increment counters after * successful commit: */ - ret = for_each_btree_key2(&trans, iter, - BTREE_ID_need_discard, POS_MIN, 0, k, - bch2_discard_one_bucket(&trans, &iter, &discard_pos_done, - &seen, - &open, - &need_journal_commit, - &discarded)); - - bch2_trans_exit(&trans); + ret = bch2_trans_run(c, + for_each_btree_key2(trans, iter, + BTREE_ID_need_discard, POS_MIN, 0, k, + bch2_discard_one_bucket(trans, &iter, &discard_pos_done, + &seen, + &open, + &need_journal_commit, + &discarded))); if (need_journal_commit * 2 > seen) bch2_journal_flush_async(&c->journal, NULL); @@ -1805,15 +1794,13 @@ static void bch2_do_invalidates_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); struct bch_dev *ca; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; unsigned i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - ret = bch2_btree_write_buffer_flush(&trans); + ret = bch2_btree_write_buffer_flush(trans); if (ret) goto err; @@ -1821,11 +1808,11 @@ static void bch2_do_invalidates_work(struct work_struct *work) s64 nr_to_invalidate = should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_lru, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru, lru_pos(ca->dev_idx, 0, 0), lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), BTREE_ITER_INTENT, k, - invalidate_one_bucket(&trans, &iter, k, &nr_to_invalidate)); + invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate)); if (ret < 0) { percpu_ref_put(&ca->ref); @@ -1833,7 +1820,7 @@ static void bch2_do_invalidates_work(struct work_struct *work) } } err: - bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } @@ -1847,7 +1834,7 @@ void bch2_do_invalidates(struct bch_fs *c) static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, unsigned long *last_updated) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bkey hole; @@ -1855,9 +1842,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, struct bch_member *m; int ret; - bch2_trans_init(&trans, c, 0, 0); - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, ca->mi.first_bucket), BTREE_ITER_PREFETCH); /* @@ -1871,7 +1856,7 @@ static int bch2_dev_freespace_init(struct 
bch_fs *c, struct bch_dev *ca, *last_updated = jiffies; } - bch2_trans_begin(&trans); + bch2_trans_begin(trans); if (bkey_ge(iter.pos, end)) { ret = 0; @@ -1891,8 +1876,8 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - ret = bch2_bucket_do_index(&trans, k, a, true) ?: - bch2_trans_commit(&trans, NULL, NULL, + ret = bch2_bucket_do_index(trans, k, a, true) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL); if (ret) @@ -1902,7 +1887,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, } else { struct bkey_i *freespace; - freespace = bch2_trans_kmalloc(&trans, sizeof(*freespace)); + freespace = bch2_trans_kmalloc(trans, sizeof(*freespace)); ret = PTR_ERR_OR_ZERO(freespace); if (ret) goto bkey_err; @@ -1912,8 +1897,8 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, freespace->k.p = k.k->p; freespace->k.size = k.k->size; - ret = __bch2_btree_insert(&trans, BTREE_ID_freespace, freespace, 0) ?: - bch2_trans_commit(&trans, NULL, NULL, + ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL); if (ret) @@ -1928,11 +1913,11 @@ bkey_err: break; } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); if (ret < 0) { - bch_err(ca, "error initializing free space: %s", bch2_err_str(ret)); + bch_err_msg(ca, ret, "initializing free space"); return ret; } diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index e02749dd..3bc4abd3 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -25,7 +25,7 @@ #include "disk_groups.h" #include "ec.h" #include "error.h" -#include "io.h" +#include "io_write.h" #include "journal.h" #include "movinggc.h" #include "nocow_locking.h" @@ -502,9 +502,14 @@ again: } /** - * bch_bucket_alloc - allocate a single bucket from a specific device + * bch2_bucket_alloc_trans - allocate a single bucket from a specific device + * @trans: transaction object + * @ca: device to allocate from + * @watermark: how important is this allocation? + * @cl: if not NULL, closure to be used to wait if buckets not available + * @usage: for secondarily also returning the current device usage * - * Returns index of bucket on success, 0 on failure + * Returns: an open_bucket on success, or an ERR_PTR() on failure. 
*/ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct bch_dev *ca, @@ -597,7 +602,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, struct open_bucket *ob; bch2_trans_do(c, NULL, NULL, 0, - PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, watermark, + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage))); return ob; } @@ -775,7 +780,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, struct dev_alloc_list devs_sorted; struct ec_stripe_head *h; struct open_bucket *ob; - struct bch_dev *ca; unsigned i, ec_idx; int ret = 0; @@ -805,8 +809,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, } goto out_put_head; got_bucket: - ca = bch_dev_bkey_exists(c, ob->dev); - ob->ec_idx = ec_idx; ob->ec = h->s; ec_stripe_new_get(h->s, STRIPE_REF_io); @@ -1032,10 +1034,13 @@ static int open_bucket_add_buckets(struct btree_trans *trans, /** * should_drop_bucket - check if this is open_bucket should go away + * @ob: open_bucket to predicate on + * @c: filesystem handle * @ca: if set, we're killing buckets for a particular device * @ec: if true, we're shutting down erasure coding and killing all ec * open_buckets * otherwise, return true + * Returns: true if we should kill this open_bucket * * We're killing open_buckets because we're shutting down a device, erasure * coding, or the entire filesystem - check if this open_bucket matches: diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index 8747c5e1..cc856150 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -351,20 +351,17 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ { struct bch_fs *c = trans->c; struct btree_iter alloc_iter = { NULL }; - struct bch_dev *ca; struct bkey_s_c alloc_k; struct printbuf buf = PRINTBUF; int ret = 0; if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, - "backpointer for mising device:\n%s", + "backpointer for missing device:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, bp_iter, 0); goto out; } - ca = bch_dev_bkey_exists(c, k.k->p.inode); - alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bp_pos_to_bucket(c, k.k->p), 0); ret = bkey_err(alloc_k); @@ -393,10 +390,10 @@ int bch2_check_btree_backpointers(struct bch_fs *c) int ret; ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, + for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, 0, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - bch2_check_btree_backpointer(&trans, &iter, k))); + bch2_check_btree_backpointer(trans, &iter, k))); if (ret) bch_err_fn(c, ret); return ret; @@ -629,7 +626,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; enum btree_id btree_id; - struct bpos_level last_flushed = { UINT_MAX }; + struct bpos_level last_flushed = { UINT_MAX, POS_MIN }; int ret = 0; for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { @@ -706,7 +703,7 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, --btree_nodes; if (!btree_nodes) { - *end = alloc_k.k->p; + *end = alloc_k.k ? 
alloc_k.k->p : SPOS_MAX; break; } @@ -726,13 +723,12 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, int bch2_check_extents_to_backpointers(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct bpos start = POS_MIN, end; int ret; - bch2_trans_init(&trans, c, 0, 0); while (1) { - ret = bch2_get_alloc_in_memory_pos(&trans, start, &end); + ret = bch2_get_alloc_in_memory_pos(trans, start, &end); if (ret) break; @@ -752,13 +748,13 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) printbuf_exit(&buf); } - ret = bch2_check_extents_to_backpointers_pass(&trans, start, end); + ret = bch2_check_extents_to_backpointers_pass(trans, start, end); if (ret || bpos_eq(end, SPOS_MAX)) break; start = bpos_successor(end); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); @@ -827,13 +823,12 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, int bch2_check_backpointers_to_extents(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end; int ret; - bch2_trans_init(&trans, c, 0, 0); while (1) { - ret = bch2_get_btree_in_memory_pos(&trans, + ret = bch2_get_btree_in_memory_pos(trans, (1U << BTREE_ID_extents)| (1U << BTREE_ID_reflink), ~0, @@ -859,13 +854,13 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) printbuf_exit(&buf); } - ret = bch2_check_backpointers_to_extents_pass(&trans, start, end); + ret = bch2_check_backpointers_to_extents_pass(trans, start, end); if (ret || !bbpos_cmp(end, BBPOS_MAX)) break; start = bbpos_successor(end); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 30b3d7b9..9ae82254 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -454,6 +454,7 @@ enum gc_phase { GC_PHASE_BTREE_bucket_gens, GC_PHASE_BTREE_snapshot_trees, GC_PHASE_BTREE_deleted_inodes, + GC_PHASE_BTREE_logged_ops, GC_PHASE_PENDING_DELETE, }; @@ -626,8 +627,8 @@ struct journal_keys { size_t size; }; -struct btree_path_buf { - struct btree_path *path; +struct btree_trans_buf { + struct btree_trans *trans; }; #define REPLICAS_DELTA_LIST_MAX (1U << 16) @@ -786,9 +787,9 @@ struct bch_fs { /* btree_iter.c: */ struct seqmutex btree_trans_lock; struct list_head btree_trans_list; - mempool_t btree_paths_pool; + mempool_t btree_trans_pool; mempool_t btree_trans_mem_pool; - struct btree_path_buf __percpu *btree_paths_bufs; + struct btree_trans_buf __percpu *btree_trans_bufs; struct srcu_struct btree_trans_barrier; bool btree_trans_barrier_initialized; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index f17238be..f0d13044 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -83,8 +83,8 @@ typedef uuid_t __uuid_t; #endif #define BITMASK(name, type, field, offset, end) \ -static const unsigned name##_OFFSET = offset; \ -static const unsigned name##_BITS = (end - offset); \ +static const __maybe_unused unsigned name##_OFFSET = offset; \ +static const __maybe_unused unsigned name##_BITS = (end - offset); \ \ static inline __u64 name(const type *k) \ { \ @@ -98,9 +98,9 @@ static inline void SET_##name(type *k, __u64 v) \ } #define LE_BITMASK(_bits, name, type, field, offset, end) \ -static const unsigned name##_OFFSET = offset; \ -static const unsigned name##_BITS = (end - offset); \ -static const __u##_bits 
name##_MAX = (1ULL << (end - offset)) - 1; \ +static const __maybe_unused unsigned name##_OFFSET = offset; \ +static const __maybe_unused unsigned name##_BITS = (end - offset); \ +static const __maybe_unused __u##_bits name##_MAX = (1ULL << (end - offset)) - 1;\ \ static inline __u64 name(const type *k) \ { \ @@ -370,7 +370,9 @@ static inline void bkey_init(struct bkey *k) x(backpointer, 28) \ x(inode_v3, 29) \ x(bucket_gens, 30) \ - x(snapshot_tree, 31) + x(snapshot_tree, 31) \ + x(logged_op_truncate, 32) \ + x(logged_op_finsert, 33) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -723,7 +725,7 @@ struct bch_inode { __le64 bi_hash_seed; __le32 bi_flags; __le16 bi_mode; - __u8 fields[0]; + __u8 fields[]; } __packed __aligned(8); struct bch_inode_v2 { @@ -733,7 +735,7 @@ struct bch_inode_v2 { __le64 bi_hash_seed; __le64 bi_flags; __le16 bi_mode; - __u8 fields[0]; + __u8 fields[]; } __packed __aligned(8); struct bch_inode_v3 { @@ -745,7 +747,7 @@ struct bch_inode_v3 { __le64 bi_sectors; __le64 bi_size; __le64 bi_version; - __u8 fields[0]; + __u8 fields[]; } __packed __aligned(8); #define INODEv3_FIELDS_START_INITIAL 6 @@ -847,8 +849,8 @@ enum { __BCH_INODE_NODUMP = 3, __BCH_INODE_NOATIME = 4, - __BCH_INODE_I_SIZE_DIRTY = 5, - __BCH_INODE_I_SECTORS_DIRTY = 6, + __BCH_INODE_I_SIZE_DIRTY = 5, /* obsolete */ + __BCH_INODE_I_SECTORS_DIRTY = 6, /* obsolete */ __BCH_INODE_UNLINKED = 7, __BCH_INODE_BACKPTR_UNTRUSTED = 8, @@ -1097,20 +1099,20 @@ struct bch_reflink_v { struct bch_val v; __le64 refcount; union bch_extent_entry start[0]; - __u64 _data[0]; + __u64 _data[]; } __packed __aligned(8); struct bch_indirect_inline_data { struct bch_val v; __le64 refcount; - u8 data[0]; + u8 data[]; }; /* Inline data */ struct bch_inline_data { struct bch_val v; - u8 data[0]; + u8 data[]; }; /* Subvolumes: */ @@ -1183,6 +1185,33 @@ struct bch_lru { #define LRU_ID_STRIPES (1U << 16) +/* Logged operations btree: */ + +struct bch_logged_op_truncate { + struct bch_val v; + __le32 subvol; + __le32 pad; + __le64 inum; + __le64 new_i_size; +}; + +enum logged_op_finsert_state { + LOGGED_OP_FINSERT_start, + LOGGED_OP_FINSERT_shift_extents, + LOGGED_OP_FINSERT_finish, +}; + +struct bch_logged_op_finsert { + struct bch_val v; + __u8 state; + __u8 pad[3]; + __le32 subvol; + __le64 inum; + __le64 dst_offset; + __le64 src_offset; + __le64 pos; +}; + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1223,7 +1252,7 @@ enum bch_sb_field_type { struct bch_sb_field_journal { struct bch_sb_field field; - __le64 buckets[0]; + __le64 buckets[]; }; struct bch_sb_field_journal_v2 { @@ -1232,7 +1261,7 @@ struct bch_sb_field_journal_v2 { struct bch_sb_field_journal_v2_entry { __le64 start; __le64 nr; - } d[0]; + } d[]; }; /* BCH_SB_FIELD_members: */ @@ -1279,7 +1308,7 @@ enum bch_member_state { struct bch_sb_field_members { struct bch_sb_field field; - struct bch_member members[0]; + struct bch_member members[]; }; /* BCH_SB_FIELD_crypt: */ @@ -1377,19 +1406,19 @@ static inline bool data_type_is_hidden(enum bch_data_type type) struct bch_replicas_entry_v0 { __u8 data_type; __u8 nr_devs; - __u8 devs[0]; + __u8 devs[]; } __packed; struct bch_sb_field_replicas_v0 { struct bch_sb_field field; - struct bch_replicas_entry_v0 entries[0]; + struct bch_replicas_entry_v0 entries[]; } __packed __aligned(8); struct bch_replicas_entry { __u8 data_type; __u8 nr_devs; __u8 nr_required; - __u8 devs[0]; + __u8 devs[]; } __packed; #define replicas_entry_bytes(_i) \ @@ -1397,7 +1426,7 @@ struct bch_replicas_entry 
{ struct bch_sb_field_replicas { struct bch_sb_field field; - struct bch_replicas_entry entries[0]; + struct bch_replicas_entry entries[]; } __packed __aligned(8); /* BCH_SB_FIELD_quota: */ @@ -1432,7 +1461,7 @@ LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) struct bch_sb_field_disk_groups { struct bch_sb_field field; - struct bch_disk_group entries[0]; + struct bch_disk_group entries[]; } __packed __aligned(8); /* BCH_SB_FIELD_counters */ @@ -1525,7 +1554,7 @@ enum bch_persistent_counters { struct bch_sb_field_counters { struct bch_sb_field field; - __le64 d[0]; + __le64 d[]; }; /* @@ -1539,10 +1568,8 @@ struct jset_entry { __u8 type; /* designates what this jset holds */ __u8 pad[3]; - union { - struct bkey_i start[0]; - __u64 _data[0]; - }; + struct bkey_i start[0]; + __u64 _data[]; }; struct bch_sb_field_clean { @@ -1553,10 +1580,8 @@ struct bch_sb_field_clean { __le16 _write_clock; __le64 journal_seq; - union { - struct jset_entry start[0]; - __u64 _data[0]; - }; + struct jset_entry start[0]; + __u64 _data[]; }; struct journal_seq_blacklist_entry { @@ -1567,10 +1592,8 @@ struct journal_seq_blacklist_entry { struct bch_sb_field_journal_seq_blacklist { struct bch_sb_field field; - union { - struct journal_seq_blacklist_entry start[0]; - __u64 _data[0]; - }; + struct journal_seq_blacklist_entry start[0]; + __u64 _data[]; }; /* Superblock: */ @@ -1645,7 +1668,8 @@ enum bcachefs_metadata_version { bcachefs_metadata_version_max }; -static const unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor; +static const __maybe_unused +unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) @@ -1706,10 +1730,8 @@ struct bch_sb { struct bch_sb_layout layout; - union { - struct bch_sb_field start[0]; - __le64 _data[0]; - }; + struct bch_sb_field start[0]; + __le64 _data[]; } __packed __aligned(8); /* @@ -1954,7 +1976,7 @@ enum bch_csum_type { BCH_CSUM_NR }; -static const unsigned bch_crc_bytes[] = { +static const __maybe_unused unsigned bch_crc_bytes[] = { [BCH_CSUM_none] = 0, [BCH_CSUM_crc32c_nonzero] = 4, [BCH_CSUM_crc32c] = 4, @@ -2186,10 +2208,8 @@ struct jset { __le64 last_seq; - union { - struct jset_entry start[0]; - __u64 _data[0]; - }; + struct jset_entry start[0]; + __u64 _data[]; } __packed __aligned(8); LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); @@ -2259,7 +2279,10 @@ enum btree_id_flags { x(snapshot_trees, 15, 0, \ BIT_ULL(KEY_TYPE_snapshot_tree)) \ x(deleted_inodes, 16, BTREE_ID_SNAPSHOTS, \ - BIT_ULL(KEY_TYPE_set)) + BIT_ULL(KEY_TYPE_set)) \ + x(logged_ops, 17, 0, \ + BIT_ULL(KEY_TYPE_logged_op_truncate)| \ + BIT_ULL(KEY_TYPE_logged_op_finsert)) enum btree_id { #define x(name, nr, ...) 
BTREE_ID_##name = nr, @@ -2294,10 +2317,8 @@ struct bset { __le16 version; __le16 u64s; /* count of d[] in u64s */ - union { - struct bkey_packed start[0]; - __u64 _data[0]; - }; + struct bkey_packed start[0]; + __u64 _data[]; } __packed __aligned(8); LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c index 0a5bfe6e..abdb0550 100644 --- a/libbcachefs/bkey.c +++ b/libbcachefs/bkey.c @@ -127,7 +127,7 @@ static void pack_state_finish(struct pack_state *state, struct bkey_packed *k) { EBUG_ON(state->p < k->_data); - EBUG_ON(state->p >= k->_data + state->format->key_u64s); + EBUG_ON(state->p >= (u64 *) k->_data + state->format->key_u64s); *state->p = state->w; } @@ -308,9 +308,14 @@ struct bpos __bkey_unpack_pos(const struct bkey_format *format, /** * bch2_bkey_pack_key -- pack just the key, not the value + * @out: packed result + * @in: key to pack + * @format: format of packed result + * + * Returns: true on success, false on failure */ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, - const struct bkey_format *format) + const struct bkey_format *format) { struct pack_state state = pack_state_init(format, out); u64 *w = out->_data; @@ -336,9 +341,12 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, /** * bch2_bkey_unpack -- unpack the key and the value + * @b: btree node of @src key (for packed format) + * @dst: unpacked result + * @src: packed input */ void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, - const struct bkey_packed *src) + const struct bkey_packed *src) { __bkey_unpack_key(b, &dst->k, src); @@ -349,19 +357,24 @@ void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, /** * bch2_bkey_pack -- pack the key and the value + * @dst: packed result + * @src: unpacked input + * @format: format of packed result + * + * Returns: true on success, false on failure */ -bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in, - const struct bkey_format *format) +bool bch2_bkey_pack(struct bkey_packed *dst, const struct bkey_i *src, + const struct bkey_format *format) { struct bkey_packed tmp; - if (!bch2_bkey_pack_key(&tmp, &in->k, format)) + if (!bch2_bkey_pack_key(&tmp, &src->k, format)) return false; - memmove_u64s((u64 *) out + format->key_u64s, - &in->v, - bkey_val_u64s(&in->k)); - memcpy_u64s_small(out, &tmp, format->key_u64s); + memmove_u64s((u64 *) dst + format->key_u64s, + &src->v, + bkey_val_u64s(&src->k)); + memcpy_u64s_small(dst, &tmp, format->key_u64s); return true; } diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 51969a46..51845020 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -52,7 +52,7 @@ struct bkey_s { static inline struct bkey_i *bkey_next(struct bkey_i *k) { - return (struct bkey_i *) (k->_data + k->k.u64s); + return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s); } #define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) @@ -397,7 +397,7 @@ static inline void set_bkeyp_val_u64s(const struct bkey_format *format, } #define bkeyp_val(_format, _k) \ - ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k))) + ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k))) extern const struct bkey_format bch2_bkey_format_current; @@ -732,7 +732,7 @@ static inline unsigned high_word_offset(const struct bkey_format *f) #error edit for your odd byteorder. 
#endif -#define high_word(f, k) ((k)->_data + high_word_offset(f)) +#define high_word(f, k) ((u64 *) (k)->_data + high_word_offset(f)) #define next_word(p) nth_word(p, 1) #define prev_word(p) nth_word(p, -1) diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 6547142d..be9f012f 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -10,6 +10,7 @@ #include "error.h" #include "extents.h" #include "inode.h" +#include "io_misc.h" #include "lru.h" #include "quota.h" #include "reflink.h" @@ -25,7 +26,7 @@ const char * const bch2_bkey_types[] = { }; static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, struct printbuf *err) { return 0; } @@ -39,7 +40,7 @@ static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, }) static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, struct printbuf *err) { if (bkey_val_bytes(k.k)) { prt_printf(err, "incorrect value size (%zu != 0)", @@ -55,7 +56,7 @@ static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, }) static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, struct printbuf *err) { return 0; } @@ -70,7 +71,7 @@ static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, }) static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, struct printbuf *err) { return 0; } @@ -91,7 +92,7 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, }) static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, struct printbuf *err) { if (bkey_val_bytes(k.k)) { prt_printf(err, "incorrect value size (%zu != %zu)", @@ -368,7 +369,6 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, { const struct bkey_ops *ops; struct bkey uk; - struct bkey_s u; unsigned nr_compat = 5; int i; @@ -433,7 +433,9 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, } break; - case 4: + case 4: { + struct bkey_s u; + if (!bkey_packed(k)) { u = bkey_i_to_s(packed_to_bkey(k)); } else { @@ -450,6 +452,7 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, if (ops->compat) ops->compat(btree_id, version, big_endian, write, u); break; + } default: BUG(); } diff --git a/libbcachefs/bkey_sort.h b/libbcachefs/bkey_sort.h index 79cf11d1..7c0f0b16 100644 --- a/libbcachefs/bkey_sort.h +++ b/libbcachefs/bkey_sort.h @@ -9,14 +9,24 @@ struct sort_iter { struct sort_iter_set { struct bkey_packed *k, *end; - } data[MAX_BSETS + 1]; + } data[]; }; -static inline void sort_iter_init(struct sort_iter *iter, struct btree *b) +static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size) { iter->b = b; iter->used = 0; - iter->size = ARRAY_SIZE(iter->data); + iter->size = size; +} + +struct sort_iter_stack { + struct sort_iter iter; + struct sort_iter_set sets[MAX_BSETS + 1]; +}; + +static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b) +{ + sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets)); } static inline void sort_iter_add(struct sort_iter *iter, diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index bcdf28f3..bb73ba90 
100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -172,10 +172,10 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, printk(KERN_ERR "iter was:"); btree_node_iter_for_each(_iter, set) { - struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); - struct bset_tree *t = bch2_bkey_to_bset(b, k); + struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k); + struct bset_tree *t = bch2_bkey_to_bset(b, k2); printk(" [%zi %zi]", t - b->set, - k->_data - bset(b, t)->_data); + k2->_data - bset(b, t)->_data); } panic("\n"); } @@ -232,7 +232,7 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, { struct bset_tree *t = bch2_bkey_to_bset(b, where); struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); - struct bkey_packed *next = (void *) (where->_data + clobber_u64s); + struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s); struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; #if 0 @@ -300,7 +300,8 @@ static unsigned bkey_float_byte_offset(unsigned idx) } struct ro_aux_tree { - struct bkey_float f[0]; + u8 nothing[0]; + struct bkey_float f[]; }; struct rw_aux_tree { @@ -476,7 +477,7 @@ static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, { unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; - return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s); + return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s); } static struct rw_aux_tree *rw_aux_tree(const struct btree *b, @@ -1010,8 +1011,8 @@ void bch2_bset_insert(struct btree *b, btree_keys_account_key_add(&b->nr, t - b->set, src); if (src->u64s != clobber_u64s) { - u64 *src_p = where->_data + clobber_u64s; - u64 *dst_p = where->_data + src->u64s; + u64 *src_p = (u64 *) where->_data + clobber_u64s; + u64 *dst_p = (u64 *) where->_data + src->u64s; EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < (int) clobber_u64s - src->u64s); @@ -1037,7 +1038,7 @@ void bch2_bset_delete(struct btree *b, unsigned clobber_u64s) { struct bset_tree *t = bset_tree_last(b); - u64 *src_p = where->_data + clobber_u64s; + u64 *src_p = (u64 *) where->_data + clobber_u64s; u64 *dst_p = where->_data; bch2_bset_verify_rw_aux_tree(b, t); @@ -1188,7 +1189,7 @@ struct bkey_packed *__bch2_bset_search(struct btree *b, case BSET_RO_AUX_TREE: return bset_search_tree(b, t, search, lossy_packed_search); default: - unreachable(); + BUG(); } } @@ -1268,9 +1269,13 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, } /** - * bch_btree_node_iter_init - initialize a btree node iterator, starting from a + * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a * given position * + * @iter: iterator to initialize + * @b: btree node to search + * @search: search key + * * Main entry point to the lookup code for individual btree nodes: * * NOTE: diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index a8283fdc..7c6769cd 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -795,7 +795,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, six_unlock_intent(&b->c.lock); /* Unlock before doing IO: */ - if (trans && sync) + if (path && sync) bch2_trans_unlock_noassert(trans); bch2_btree_node_read(c, b, sync); @@ -934,7 +934,7 @@ retry: } if (unlikely(need_relock)) { - int ret = bch2_trans_relock(trans) ?: + ret = bch2_trans_relock(trans) ?: bch2_btree_path_relock_intent(trans, path); if (ret) { six_unlock_type(&b->c.lock, lock_type); @@ -965,11 +965,20 @@ 
retry: } /** - * bch_btree_node_get - find a btree node in the cache and lock it, reading it + * bch2_btree_node_get - find a btree node in the cache and lock it, reading it * in from disk if necessary. * + * @trans: btree transaction object + * @path: btree_path being traversed + * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2) + * @level: level of btree node being looked up (0 == leaf node) + * @lock_type: SIX_LOCK_read or SIX_LOCK_intent + * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek()) + * * The btree node will have either a read or a write lock held, depending on * the @write parameter. + * + * Returns: btree node or ERR_PTR() */ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, const struct bkey_i *k, unsigned level, @@ -1016,28 +1025,8 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * } if (unlikely(btree_node_read_in_flight(b))) { - u32 seq = six_lock_seq(&b->c.lock); - six_unlock_type(&b->c.lock, lock_type); - bch2_trans_unlock(trans); - - bch2_btree_node_wait_on_read(b); - - /* - * should_be_locked is not set on this path yet, so we need to - * relock it specifically: - */ - if (trans) { - int ret = bch2_trans_relock(trans) ?: - bch2_btree_path_relock_intent(trans, path); - if (ret) { - BUG_ON(!trans->restarted); - return ERR_PTR(ret); - } - } - - if (!six_relock_type(&b->c.lock, lock_type, seq)) - return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); + return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); } prefetch(b->aux_data); diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 83dcd9eb..97fbd833 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -529,13 +529,11 @@ fsck_err: int bch2_check_topology(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree *b; unsigned i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { struct btree_root *r = bch2_btree_id_root(c, i); @@ -546,8 +544,8 @@ int bch2_check_topology(struct bch_fs *c) if (btree_node_fake(b)) continue; - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); - ret = bch2_btree_repair_topology_recurse(&trans, b); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); + ret = bch2_btree_repair_topology_recurse(trans, b); six_unlock_read(&b->c.lock); if (ret == DROP_THIS_NODE) { @@ -556,7 +554,7 @@ int bch2_check_topology(struct bch_fs *c) } } - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -566,8 +564,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id struct bkey_s_c *k) { struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k); - const union bch_extent_entry *entry; + struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k); + const union bch_extent_entry *entry_c; struct extent_ptr_decoded p = { 0 }; bool do_update = false; struct printbuf buf = PRINTBUF; @@ -577,10 +575,10 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id * XXX * use check_bucket_ref here */ - bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { + bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, 
&entry_c->ptr); if (!g->gen_valid && (c->opts.reconstruct_alloc || @@ -1068,15 +1066,13 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); enum btree_id ids[BTREE_ID_NR]; unsigned i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - if (initial) - trans.is_initial_gc = true; + trans->is_initial_gc = true; for (i = 0; i < BTREE_ID_NR; i++) ids[i] = i; @@ -1084,22 +1080,22 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) for (i = 0; i < BTREE_ID_NR && !ret; i++) ret = initial - ? bch2_gc_btree_init(&trans, ids[i], metadata_only) - : bch2_gc_btree(&trans, ids[i], initial, metadata_only); + ? bch2_gc_btree_init(trans, ids[i], metadata_only) + : bch2_gc_btree(trans, ids[i], initial, metadata_only); for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) { if (!bch2_btree_id_root(c, i)->alive) continue; ret = initial - ? bch2_gc_btree_init(&trans, i, metadata_only) - : bch2_gc_btree(&trans, i, initial, metadata_only); + ? bch2_gc_btree_init(trans, i, metadata_only) + : bch2_gc_btree(trans, i, initial, metadata_only); } if (ret < 0) bch_err_fn(c, ret); - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -1220,14 +1216,6 @@ static int bch2_gc_done(struct bch_fs *c, fsck_err(c, _msg ": got %llu, should be %llu" \ , ##__VA_ARGS__, dst->_f, src->_f))) \ dst->_f = src->_f -#define copy_stripe_field(_f, _msg, ...) \ - if (dst->_f != src->_f && \ - (!verify || \ - fsck_err(c, "stripe %zu has wrong "_msg \ - ": got %u, should be %u", \ - iter.pos, ##__VA_ARGS__, \ - dst->_f, src->_f))) \ - dst->_f = src->_f #define copy_dev_field(_f, _msg, ...) \ copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) #define copy_fs_field(_f, _msg, ...) \ @@ -1249,7 +1237,7 @@ static int bch2_gc_done(struct bch_fs *c, copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); } - }; + } { unsigned nr = fs_usage_u64s(c); @@ -1469,37 +1457,35 @@ fsck_err: static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bch_dev *ca; unsigned i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - for_each_member_device(ca, c, i) { - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, ca->mi.first_bucket), BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_LAZY_RW, - bch2_alloc_write_key(&trans, &iter, k, metadata_only)); + bch2_alloc_write_key(trans, &iter, k, metadata_only)); if (ret < 0) { - bch_err(c, "error writing alloc info: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); percpu_ref_put(&ca->ref); break; } } - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret < 0 ? 
ret : 0; } static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) { struct bch_dev *ca; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bucket *g; @@ -1515,17 +1501,16 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) if (!buckets) { percpu_ref_put(&ca->ref); bch_err(c, "error allocating ca->buckets[gc]"); - return -BCH_ERR_ENOMEM_gc_alloc_start; + ret = -BCH_ERR_ENOMEM_gc_alloc_start; + goto err; } buckets->first_bucket = ca->mi.first_bucket; buckets->nbuckets = ca->mi.nbuckets; rcu_assign_pointer(ca->buckets_gc, buckets); - }; + } - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { ca = bch_dev_bkey_exists(c, k.k->p.inode); g = gc_bucket(ca, k.k->p.offset); @@ -1546,13 +1531,11 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) g->stripe_redundancy = a->stripe_redundancy; } } - bch2_trans_iter_exit(&trans, &iter); - - bch2_trans_exit(&trans); - + bch2_trans_iter_exit(trans, &iter); +err: + bch2_trans_put(trans); if (ret) - bch_err(c, "error reading alloc info at gc start: %s", bch2_err_str(ret)); - + bch_err_fn(c, ret); return ret; } @@ -1575,7 +1558,7 @@ static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) g->dirty_sectors = 0; g->cached_sectors = 0; } - }; + } } static int bch2_gc_write_reflink_key(struct btree_trans *trans, @@ -1627,7 +1610,7 @@ fsck_err: static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; size_t idx = 0; @@ -1636,23 +1619,23 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) if (metadata_only) return 0; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); - ret = for_each_btree_key_commit(&trans, iter, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_gc_write_reflink_key(&trans, &iter, k, &idx)); + bch2_gc_write_reflink_key(trans, &iter, k, &idx)); c->reflink_gc_nr = 0; - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } static int bch2_gc_reflink_start(struct bch_fs *c, bool metadata_only) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; struct reflink_gc *r; @@ -1661,10 +1644,10 @@ static int bch2_gc_reflink_start(struct bch_fs *c, if (metadata_only) return 0; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); c->reflink_gc_nr = 0; - for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { const __le64 *refcount = bkey_refcount_c(k); @@ -1682,9 +1665,9 @@ static int bch2_gc_reflink_start(struct bch_fs *c, r->size = k.k->size; r->refcount = 0; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -1751,7 +1734,7 @@ fsck_err: static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; int ret = 0; @@ -1759,15 +1742,15 @@ static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) if (metadata_only) return 0; - bch2_trans_init(&trans, c, 0, 0); + trans = 
bch2_trans_get(c); - ret = for_each_btree_key_commit(&trans, iter, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_gc_write_stripes_key(&trans, &iter, k)); + bch2_gc_write_stripes_key(trans, &iter, k)); - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -1779,6 +1762,12 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) /** * bch2_gc - walk _all_ references to buckets, and recompute them: * + * @c: filesystem object + * @initial: are we in recovery? + * @metadata_only: are we just checking metadata references, or everything? + * + * Returns: 0 on success, or standard errcode on failure + * * Order matters here: * - Concurrent GC relies on the fact that we have a total ordering for * everything that GC walks - see gc_will_visit_node(), @@ -1947,7 +1936,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i int bch2_gc_gens(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; struct bch_dev *ca; @@ -1965,7 +1954,7 @@ int bch2_gc_gens(struct bch_fs *c) trace_and_count(c, gc_gens_start, c); down_read(&c->gc_lock); - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); for_each_member_device(ca, c, i) { struct bucket_gens *gens; @@ -1988,33 +1977,31 @@ int bch2_gc_gens(struct bch_fs *c) for (i = 0; i < BTREE_ID_NR; i++) if (btree_type_has_ptrs(i)) { - struct btree_iter iter; - struct bkey_s_c k; - c->gc_gens_btree = i; c->gc_gens_pos = POS_MIN; - ret = for_each_btree_key_commit(&trans, iter, i, + + ret = for_each_btree_key_commit(trans, iter, i, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, NULL, NULL, BTREE_INSERT_NOFAIL, - gc_btree_gens_key(&trans, &iter, k)); + gc_btree_gens_key(trans, &iter, k)); if (ret && !bch2_err_matches(ret, EROFS)) - bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); if (ret) goto err; } - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_alloc_write_oldest_gen(&trans, &iter, k)); + bch2_alloc_write_oldest_gen(trans, &iter, k)); if (ret && !bch2_err_matches(ret, EROFS)) - bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); if (ret) goto err; @@ -2031,7 +2018,7 @@ err: ca->oldest_gen = NULL; } - bch2_trans_exit(&trans); + bch2_trans_put(trans); up_read(&c->gc_lock); mutex_unlock(&c->gc_gens_lock); return ret; @@ -2086,7 +2073,7 @@ static int bch2_gc_thread(void *arg) ret = bch2_gc_gens(c); #endif if (ret < 0) - bch_err(c, "btree gc failed: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); debug_check_no_locks_held(); } @@ -2116,7 +2103,7 @@ int bch2_gc_thread_start(struct bch_fs *c) p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); if (IS_ERR(p)) { - bch_err(c, "error creating gc thread: %s", bch2_err_str(PTR_ERR(p))); + bch_err_fn(c, PTR_ERR(p)); return PTR_ERR(p); } diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 3b654841..a869cf6a 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -14,7 +14,7 @@ #include "debug.h" #include "error.h" #include "extents.h" -#include "io.h" +#include "io_write.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "recovery.h" @@ -106,8 +106,8 @@ static void btree_bounce_free(struct bch_fs *c, size_t size, 
vpfree(p, size); } -static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size, - bool *used_mempool) +static void *btree_bounce_alloc(struct bch_fs *c, size_t size, + bool *used_mempool) { unsigned flags = memalloc_nofs_save(); void *p; @@ -115,7 +115,7 @@ static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size, BUG_ON(size > btree_bytes(c)); *used_mempool = false; - p = vpmalloc_noprof(size, __GFP_NOWARN|GFP_NOWAIT); + p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); if (!p) { *used_mempool = true; p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); @@ -123,8 +123,6 @@ static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size, memalloc_nofs_restore(flags); return p; } -#define btree_bounce_alloc(_c, _size, _used_mempool) \ - alloc_hooks(btree_bounce_alloc_noprof(_c, _size, _used_mempool)) static void sort_bkey_ptrs(const struct btree *bt, struct bkey_packed **ptrs, unsigned nr) @@ -294,7 +292,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, bool filter_whiteouts) { struct btree_node *out; - struct sort_iter sort_iter; + struct sort_iter_stack sort_iter; struct bset_tree *t; struct bset *start_bset = bset(b, &b->set[start_idx]); bool used_mempool = false; @@ -303,13 +301,13 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, bool sorting_entire_node = start_idx == 0 && end_idx == b->nsets; - sort_iter_init(&sort_iter, b); + sort_iter_stack_init(&sort_iter, b); for (t = b->set + start_idx; t < b->set + end_idx; t++) { u64s += le16_to_cpu(bset(b, t)->u64s); - sort_iter_add(&sort_iter, + sort_iter_add(&sort_iter.iter, btree_bkey_first(b, t), btree_bkey_last(b, t)); } @@ -322,7 +320,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, start_time = local_clock(); - u64s = bch2_sort_keys(out->keys.start, &sort_iter, filter_whiteouts); + u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts); out->keys.u64s = cpu_to_le16(u64s); @@ -338,7 +336,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, start_bset->journal_seq = cpu_to_le64(seq); if (sorting_entire_node) { - unsigned u64s = le16_to_cpu(out->keys.u64s); + u64s = le16_to_cpu(out->keys.u64s); BUG_ON(bytes != btree_bytes(c)); @@ -412,8 +410,6 @@ void bch2_btree_sort_into(struct bch_fs *c, bch2_verify_btree_nr_keys(dst); } -#define SORT_CRIT (4096 / sizeof(u64)) - /* * We're about to add another bset to the btree node, so if there's currently * too many bsets - sort some of them together: @@ -544,6 +540,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, prt_str(out, ": "); } +__printf(8, 9) static int __btree_err(int ret, struct bch_fs *c, struct bch_dev *ca, @@ -624,9 +621,6 @@ __cold void bch2_btree_node_drop_keys_outside_node(struct btree *b) { struct bset_tree *t; - struct bkey_s_c k; - struct bkey unpacked; - struct btree_node_iter iter; for_each_bset(b, t) { struct bset *i = bset(b, t); @@ -662,6 +656,9 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) bch2_bset_set_no_aux_tree(b, b->set); bch2_btree_build_aux_trees(b); + struct bkey_s_c k; + struct bkey unpacked; + struct btree_node_iter iter; for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { BUG_ON(bpos_lt(k.k->p, b->data->min_key)); BUG_ON(bpos_gt(k.k->p, b->data->max_key)); @@ -910,7 +907,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); unsigned u64s; - unsigned blacklisted_written, 
nonblacklisted_written = 0; unsigned ptr_written = btree_ptr_sectors_written(&b->key); struct printbuf buf = PRINTBUF; int ret = 0, retry_read = 0, write = READ; @@ -920,8 +916,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, b->written = 0; iter = mempool_alloc(&c->fill_iter, GFP_NOFS); - sort_iter_init(iter, b); - iter->size = (btree_blocks(c) + 1) * 2; + sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2); if (bch2_meta_read_fault("btree")) btree_err(-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, @@ -1045,8 +1040,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, sort_iter_add(iter, vstruct_idx(i, 0), vstruct_last(i)); - - nonblacklisted_written = b->written; } if (ptr_written) { @@ -1064,18 +1057,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, true), -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL, "found bset signature after last bset"); - - /* - * Blacklisted bsets are those that were written after the most recent - * (flush) journal write. Since there wasn't a flush, they may not have - * made it to all devices - which means we shouldn't write new bsets - * after them, as that could leave a gap and then reads from that device - * wouldn't find all the bsets in that btree node - which means it's - * important that we start writing new bsets after the most recent _non_ - * blacklisted bset: - */ - blacklisted_written = b->written; - b->written = nonblacklisted_written; } sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); @@ -1143,9 +1124,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_node_reset_sib_u64s(b); bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca2 = bch_dev_bkey_exists(c, ptr->dev); - if (ca->mi.state != BCH_MEMBER_STATE_rw) + if (ca2->mi.state != BCH_MEMBER_STATE_rw) set_btree_node_need_rewrite(b); } @@ -1227,19 +1208,17 @@ start: bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], rb->start_time); bio_put(&rb->bio); - printbuf_exit(&buf); if (saw_error && !btree_node_read_error(b)) { - struct printbuf buf = PRINTBUF; - + printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->key.k.p); bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", __func__, bch2_btree_ids[b->c.btree_id], b->c.level, buf.buf); - printbuf_exit(&buf); bch2_btree_node_rewrite_async(c, b); } + printbuf_exit(&buf); clear_btree_node_read_in_flight(b); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); } @@ -1649,8 +1628,7 @@ err: int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, const struct bkey_i *k, unsigned level) { - return bch2_trans_run(c, __bch2_btree_root_read(&trans, id, k, level)); - + return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level)); } void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, @@ -1712,15 +1690,13 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) static void btree_node_write_done(struct bch_fs *c, struct btree *b) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); - bch2_trans_init(&trans, c, 0, 0); - - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); __btree_node_write_done(c, b); six_unlock_read(&b->c.lock); - bch2_trans_exit(&trans); + bch2_trans_put(trans); } static void btree_node_write_work(struct work_struct *work) @@ -1749,7 +1725,7 @@ static void 
btree_node_write_work(struct work_struct *work) } } else { ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_btree_node_update_key_get_iter(&trans, b, &wbio->key, + bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, BCH_WATERMARK_reclaim| BTREE_INSERT_JOURNAL_RECLAIM| BTREE_INSERT_NOFAIL| @@ -1854,7 +1830,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) struct bset *i; struct btree_node *bn = NULL; struct btree_node_entry *bne = NULL; - struct sort_iter sort_iter; + struct sort_iter_stack sort_iter; struct nonce nonce; unsigned bytes_to_write, sectors_to_write, bytes, u64s; u64 seq = 0; @@ -1927,7 +1903,7 @@ do_write: bch2_sort_whiteouts(c, b); - sort_iter_init(&sort_iter, b); + sort_iter_stack_init(&sort_iter, b); bytes = !b->written ? sizeof(struct btree_node) @@ -1942,7 +1918,7 @@ do_write: continue; bytes += le16_to_cpu(i->u64s) * sizeof(u64); - sort_iter_add(&sort_iter, + sort_iter_add(&sort_iter.iter, btree_bkey_first(b, t), btree_bkey_last(b, t)); seq = max(seq, le64_to_cpu(i->journal_seq)); @@ -1971,14 +1947,14 @@ do_write: i->journal_seq = cpu_to_le64(seq); i->u64s = 0; - sort_iter_add(&sort_iter, + sort_iter_add(&sort_iter.iter, unwritten_whiteouts_start(c, b), unwritten_whiteouts_end(c, b)); SET_BSET_SEPARATE_WHITEOUTS(i, false); b->whiteout_u64s = 0; - u64s = bch2_sort_keys(i->start, &sort_iter, false); + u64s = bch2_sort_keys(i->start, &sort_iter.iter, false); le16_add_cpu(&i->u64s, u64s); BUG_ON(!b->written && i->u64s != b->data->keys.u64s); diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index cd99bbb0..7e03dd76 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -7,7 +7,7 @@ #include "btree_locking.h" #include "checksum.h" #include "extents.h" -#include "io_types.h" +#include "io_write_types.h" struct bch_fs; struct btree_write; diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 5216d339..4cee5e6c 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -488,7 +488,6 @@ fixup_done: if (!bch2_btree_node_iter_end(node_iter) && iter_current_key_modified && b->c.level) { - struct bset_tree *t; struct bkey_packed *k, *k2, *p; k = bch2_btree_node_iter_peek_all(node_iter, b); @@ -689,7 +688,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) if (t != BTREE_NODE_UNLOCKED) { btree_node_unlock(trans, path, b->c.level); six_lock_increment(&b->c.lock, (enum six_lock_type) t); - mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t); + mark_btree_node_locked(trans, path, b->c.level, t); } bch2_btree_path_level_init(trans, path, b); @@ -764,7 +763,8 @@ static inline int btree_path_lock_root(struct btree_trans *trans, for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) path->l[i].b = NULL; - mark_btree_node_locked(trans, path, path->level, lock_type); + mark_btree_node_locked(trans, path, path->level, + (enum btree_node_locked_type) lock_type); bch2_btree_path_level_init(trans, path, b); return 0; } @@ -936,7 +936,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans, if (btree_node_read_locked(path, level + 1)) btree_node_unlock(trans, path, level + 1); - mark_btree_node_locked(trans, path, level, lock_type); + mark_btree_node_locked(trans, path, level, + (enum btree_node_locked_type) lock_type); path->level = level; bch2_btree_path_level_init(trans, path, b); @@ -1341,14 +1342,14 @@ static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *p __bch2_path_free(trans, path); } -void 
bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) +void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) { panic("trans->restart_count %u, should be %u, last restarted by %pS\n", trans->restart_count, restart_count, (void *) trans->last_begin_ip); } -void bch2_trans_in_restart_error(struct btree_trans *trans) +void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) { panic("in transaction restart: %s, last restarted by %pS\n", bch2_err_str(trans->restarted), @@ -1493,7 +1494,7 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans) static noinline void btree_path_overflow(struct btree_trans *trans) { bch2_dump_trans_paths_updates(trans); - panic("trans path oveflow\n"); + panic("trans path overflow\n"); } static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, @@ -2046,8 +2047,12 @@ out: } /** - * bch2_btree_iter_peek: returns first key greater than or equal to iterator's - * current position + * bch2_btree_iter_peek_upto() - returns first key greater than or equal to + * iterator's current position + * @iter: iterator to peek from + * @end: search limit: returns keys less than or equal to @end + * + * Returns: key if found, or an error extractable with bkey_err(). */ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) { @@ -2184,10 +2189,13 @@ end: } /** - * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal - * to iterator's current position, returning keys from every level of the btree. - * For keys at different levels of the btree that compare equal, the key from - * the lower level (leaf) is returned first. + * bch2_btree_iter_peek_all_levels() - returns the first key greater than or + * equal to iterator's current position, returning keys from every level of the + * btree. For keys at different levels of the btree that compare equal, the key + * from the lower level (leaf) is returned first. + * @iter: iterator to peek from + * + * Returns: key if found, or an error extractable with bkey_err(). */ struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) { @@ -2278,8 +2286,11 @@ out_no_locked: } /** - * bch2_btree_iter_next: returns first key greater than iterator's current + * bch2_btree_iter_next() - returns first key greater than iterator's current * position + * @iter: iterator to peek from + * + * Returns: key if found, or an error extractable with bkey_err(). */ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) { @@ -2290,8 +2301,11 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) } /** - * bch2_btree_iter_peek_prev: returns first key less than or equal to + * bch2_btree_iter_peek_prev() - returns first key less than or equal to * iterator's current position + * @iter: iterator to peek from + * + * Returns: key if found, or an error extractable with bkey_err(). */ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { @@ -2414,8 +2428,11 @@ out_no_locked: } /** - * bch2_btree_iter_prev: returns first key less than iterator's current + * bch2_btree_iter_prev() - returns first key less than iterator's current * position + * @iter: iterator to peek from + * + * Returns: key if found, or an error extractable with bkey_err(). 
*/ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) { @@ -2722,7 +2739,7 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) void bch2_trans_iter_init_outlined(struct btree_trans *trans, struct btree_iter *iter, - unsigned btree_id, struct bpos pos, + enum btree_id btree_id, struct bpos pos, unsigned flags) { bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, @@ -2830,6 +2847,8 @@ static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans) * bch2_trans_begin() - reset a transaction after an interrupted attempt * @trans: transaction to reset * + * Returns: current restart counter, to be used with trans_was_restarted() + * * While iterating over nodes or updating nodes an attempt to lock a btree node * may return BCH_ERR_transaction_restart when the trylock fails. When this * occurs bch2_trans_begin() should be called and the transaction retried. @@ -2887,28 +2906,23 @@ u32 bch2_trans_begin(struct btree_trans *trans) return trans->restart_count; } -static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) +static struct btree_trans *bch2_trans_alloc(struct bch_fs *c) { - size_t paths_bytes = sizeof(struct btree_path) * BTREE_ITER_MAX; - size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX; - void *p = NULL; + struct btree_trans *trans; - BUG_ON(trans->used_mempool); - -#ifdef __KERNEL__ - p = this_cpu_xchg(c->btree_paths_bufs->path, NULL); -#endif - if (!p) { - p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS); - /* - * paths need to be zeroed, bch2_check_for_deadlock looks at - * paths in other threads - */ - memset(p, 0, paths_bytes); + if (IS_ENABLED(__KERNEL__)) { + trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL); + if (trans) + return trans; } - trans->paths = p; p += paths_bytes; - trans->updates = p; p += updates_bytes; + trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); + /* + * paths need to be zeroed, bch2_check_for_deadlock looks at + * paths in other threads + */ + memset(&trans->paths, 0, sizeof(trans->paths)); + return trans; } const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; @@ -2928,13 +2942,16 @@ unsigned bch2_trans_get_fn_idx(const char *fn) return i; } -void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx) +struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) __acquires(&c->btree_trans_barrier) { + struct btree_trans *trans; struct btree_transaction_stats *s; bch2_assert_btree_nodes_not_locked(); + trans = bch2_trans_alloc(c); + memset(trans, 0, sizeof(*trans)); trans->c = c; trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns) @@ -2946,8 +2963,6 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_ !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); closure_init_stack(&trans->ref); - bch2_trans_alloc_paths(trans, c); - s = btree_trans_stats(trans); if (s && s->max_mem) { unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); @@ -2993,6 +3008,8 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_ list_add_done: seqmutex_unlock(&c->btree_trans_lock); } + + return trans; } static void check_btree_paths_leaked(struct btree_trans *trans) @@ -3017,7 +3034,7 @@ leaked: #endif } -void bch2_trans_exit(struct btree_trans *trans) +void bch2_trans_put(struct btree_trans *trans) __releases(&c->btree_trans_barrier) { struct btree_insert_entry *i; @@ -3063,18 +3080,11 @@ void bch2_trans_exit(struct btree_trans *trans) else
kfree(trans->mem); -#ifdef __KERNEL__ - /* - * Userspace doesn't have a real percpu implementation: - */ - trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths); -#endif - - if (trans->paths) - mempool_free(trans->paths, &c->btree_paths_pool); - - trans->mem = (void *) 0x1; - trans->paths = (void *) 0x1; + /* Userspace doesn't have a real percpu implementation: */ + if (IS_ENABLED(__KERNEL__)) + trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); + if (trans) + mempool_free(trans, &c->btree_trans_pool); } static void __maybe_unused @@ -3152,6 +3162,17 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) void bch2_fs_btree_iter_exit(struct bch_fs *c) { struct btree_transaction_stats *s; + struct btree_trans *trans; + int cpu; + + trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list); + if (trans) + panic("%s leaked btree_trans\n", trans->fn); + + if (c->btree_trans_bufs) + for_each_possible_cpu(cpu) + kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans); + free_percpu(c->btree_trans_bufs); for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); @@ -3163,13 +3184,12 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) if (c->btree_trans_barrier_initialized) cleanup_srcu_struct(&c->btree_trans_barrier); mempool_exit(&c->btree_trans_mem_pool); - mempool_exit(&c->btree_paths_pool); + mempool_exit(&c->btree_trans_pool); } int bch2_fs_btree_iter_init(struct bch_fs *c) { struct btree_transaction_stats *s; - unsigned nr = BTREE_ITER_MAX; int ret; for (s = c->btree_transaction_stats; @@ -3182,9 +3202,12 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) INIT_LIST_HEAD(&c->btree_trans_list); seqmutex_init(&c->btree_trans_lock); - ret = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, - sizeof(struct btree_path) * nr + - sizeof(struct btree_insert_entry) * nr) ?: + c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf); + if (!c->btree_trans_bufs) + return -ENOMEM; + + ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1, + sizeof(struct btree_trans)) ?: mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, BTREE_TRANS_MEM_MAX) ?: init_srcu_struct(&c->btree_trans_barrier); diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 8876f2b8..fbe27345 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -276,12 +276,14 @@ int bch2_trans_relock_notrace(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); bool bch2_trans_locked(struct btree_trans *); -static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_count) +static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count) { - return restart_count != trans->restart_count; + return restart_count != trans->restart_count + ? 
-BCH_ERR_transaction_restart_nested + : 0; } -void bch2_trans_restart_error(struct btree_trans *, u32); +void __noreturn bch2_trans_restart_error(struct btree_trans *, u32); static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count) @@ -290,7 +292,7 @@ static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans, bch2_trans_restart_error(trans, restart_count); } -void bch2_trans_in_restart_error(struct btree_trans *); +void __noreturn bch2_trans_in_restart_error(struct btree_trans *); static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans) { @@ -463,7 +465,7 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans, } void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *, - unsigned, struct bpos, unsigned); + enum btree_id, struct bpos, unsigned); static inline void bch2_trans_iter_init(struct btree_trans *trans, struct btree_iter *iter, @@ -672,17 +674,17 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, #define lockrestart_do(_trans, _do) \ ({ \ u32 _restart_count; \ - int _ret; \ + int _ret2; \ \ do { \ _restart_count = bch2_trans_begin(_trans); \ - _ret = (_do); \ - } while (bch2_err_matches(_ret, BCH_ERR_transaction_restart)); \ + _ret2 = (_do); \ + } while (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)); \ \ - if (!_ret) \ + if (!_ret2) \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ \ - _ret; \ + _ret2; \ }) /* @@ -697,26 +699,23 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, #define nested_lockrestart_do(_trans, _do) \ ({ \ u32 _restart_count, _orig_restart_count; \ - int _ret; \ + int _ret2; \ \ _restart_count = _orig_restart_count = (_trans)->restart_count; \ \ - while (bch2_err_matches(_ret = (_do), BCH_ERR_transaction_restart))\ + while (bch2_err_matches(_ret2 = (_do), BCH_ERR_transaction_restart))\ _restart_count = bch2_trans_begin(_trans); \ \ - if (!_ret) \ + if (!_ret2) \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ \ - if (!_ret && trans_was_restarted(_trans, _orig_restart_count)) \ - _ret = -BCH_ERR_transaction_restart_nested; \ - \ - _ret; \ + _ret2 ?: trans_was_restarted(_trans, _restart_count); \ }) #define for_each_btree_key2(_trans, _iter, _btree_id, \ _start, _flags, _k, _do) \ ({ \ - int _ret = 0; \ + int _ret3 = 0; \ \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ @@ -724,15 +723,15 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, while (1) { \ u32 _restart_count = bch2_trans_begin(_trans); \ \ - _ret = 0; \ + _ret3 = 0; \ (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \ if (!(_k).k) \ break; \ \ - _ret = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + _ret3 = bkey_err(_k) ?: (_do); \ + if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ continue; \ - if (_ret) \ + if (_ret3) \ break; \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ if (!bch2_btree_iter_advance(&(_iter))) \ @@ -740,13 +739,13 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, } \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret; \ + _ret3; \ }) #define for_each_btree_key2_upto(_trans, _iter, _btree_id, \ _start, _end, _flags, _k, _do) \ ({ \ - int _ret = 0; \ + int _ret3 = 0; \ \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ @@ -754,15 +753,15 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, while (1) { \ u32 
_restart_count = bch2_trans_begin(_trans); \ \ - _ret = 0; \ + _ret3 = 0; \ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\ if (!(_k).k) \ break; \ \ - _ret = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + _ret3 = bkey_err(_k) ?: (_do); \ + if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ continue; \ - if (_ret) \ + if (_ret3) \ break; \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ if (!bch2_btree_iter_advance(&(_iter))) \ @@ -770,13 +769,13 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, } \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret; \ + _ret3; \ }) #define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ _start, _flags, _k, _do) \ ({ \ - int _ret = 0; \ + int _ret3 = 0; \ \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ @@ -785,14 +784,14 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, u32 _restart_count = bch2_trans_begin(_trans); \ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\ if (!(_k).k) { \ - _ret = 0; \ + _ret3 = 0; \ break; \ } \ \ - _ret = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + _ret3 = bkey_err(_k) ?: (_do); \ + if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ continue; \ - if (_ret) \ + if (_ret3) \ break; \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ if (!bch2_btree_iter_rewind(&(_iter))) \ @@ -800,7 +799,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, } \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret; \ + _ret3; \ }) #define for_each_btree_key_commit(_trans, _iter, _btree_id, \ @@ -916,21 +915,21 @@ void bch2_btree_path_to_text(struct printbuf *, struct btree_path *); void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); void bch2_dump_trans_updates(struct btree_trans *); void bch2_dump_trans_paths_updates(struct btree_trans *); -void __bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned); -void bch2_trans_exit(struct btree_trans *); + +struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned); +void bch2_trans_put(struct btree_trans *); extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; unsigned bch2_trans_get_fn_idx(const char *); -#define bch2_trans_init(_trans, _c, _nr_iters, _mem) \ -do { \ +#define bch2_trans_get(_c) \ +({ \ static unsigned trans_fn_idx; \ \ if (unlikely(!trans_fn_idx)) \ trans_fn_idx = bch2_trans_get_fn_idx(__func__); \ - \ - __bch2_trans_init(_trans, _c, trans_fn_idx); \ -} while (0) + __bch2_trans_get(_c, trans_fn_idx); \ +}) void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index f7c001d4..1407f691 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -243,8 +243,6 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, } if (ck) { - int ret; - ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_); if (unlikely(ret)) { bkey_cached_move_to_freelist(bc, ck); @@ -253,7 +251,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, path->l[0].b = (void *) ck; path->l[0].lock_seq = six_lock_seq(&ck->c.lock); - mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); ret = bch2_btree_node_lock_write(trans, path, &ck->c); if (unlikely(ret)) { @@ -331,7 +329,7 @@ 
btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create); } - mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); } ck->c.level = 0; @@ -479,7 +477,7 @@ retry: if (!ck) goto retry; - mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); path->locks_want = 1; } else { enum six_lock_type lock_want = __btree_lock_want(path, 0); @@ -497,7 +495,8 @@ retry: goto retry; } - mark_btree_node_locked(trans, path, 0, lock_want); + mark_btree_node_locked(trans, path, 0, + (enum btree_node_locked_type) lock_want); } path->l[0].lock_seq = six_lock_seq(&ck->c.lock); @@ -579,7 +578,8 @@ retry: goto retry; } - mark_btree_node_locked(trans, path, 0, lock_want); + mark_btree_node_locked(trans, path, 0, + (enum btree_node_locked_type) lock_want); } path->l[0].lock_seq = six_lock_seq(&ck->c.lock); @@ -705,13 +705,11 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, struct bkey_cached *ck = container_of(pin, struct bkey_cached, journal); struct bkey_cached_key key; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - btree_node_lock_nopath_nofail(&trans, &ck->c, SIX_LOCK_read); + btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read); key = ck->key; if (ck->journal.seq != seq || @@ -728,13 +726,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, } six_unlock_read(&ck->c.lock); - ret = commit_do(&trans, NULL, NULL, 0, - btree_key_cache_flush_pos(&trans, key, seq, + ret = commit_do(trans, NULL, NULL, 0, + btree_key_cache_flush_pos(trans, key, seq, BTREE_INSERT_JOURNAL_RECLAIM, false)); unlock: srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -1065,7 +1063,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) { - prt_printf(out, "nr_freed:\t%zu", atomic_long_read(&c->nr_freed)); + prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed)); prt_newline(out); prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys)); prt_newline(out); diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 22e2cd39..6231e9ff 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -91,7 +91,7 @@ static inline void mark_btree_node_unlocked(struct btree_path *path, static inline void mark_btree_node_locked(struct btree_trans *trans, struct btree_path *path, unsigned level, - enum six_lock_type type) + enum btree_node_locked_type type) { mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type); #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c index eafb0388..04c1f461 100644 --- a/libbcachefs/btree_trans_commit.c +++ b/libbcachefs/btree_trans_commit.c @@ -163,13 +163,11 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, struct bch_fs *c = container_of(j, struct bch_fs, journal); struct btree_write *w = container_of(pin, struct btree_write, journal); struct btree *b = container_of(w, struct btree, writes[i]); - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); unsigned long old, new, v; unsigned idx = 
w - b->writes; - bch2_trans_init(&trans, c, 0, 0); - - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); v = READ_ONCE(b->flags); do { @@ -188,7 +186,7 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); - bch2_trans_exit(&trans); + bch2_trans_put(trans); return 0; } @@ -214,7 +212,11 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c, } /** - * btree_insert_key - insert a key one key into a leaf node + * bch2_btree_insert_key_leaf() - insert one key into a leaf node + * @trans: btree transaction object + * @path: path pointing to @insert's pos + * @insert: key to insert + * @journal_seq: sequence number of journal reservation */ inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, struct btree_path *path, @@ -555,7 +557,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct btree_write_buffered_key *wb; struct btree_trans_commit_hook *h; unsigned u64s = 0; - bool marking = false; int ret; if (race_fault()) { @@ -584,9 +585,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, *stopped_at = i; return ret; } - - if (btree_node_type_needs_gc(i->bkey_type)) - marking = true; } if (trans->nr_wb_updates && @@ -778,7 +776,6 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); } -#ifdef CONFIG_BCACHEFS_DEBUG static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags, struct btree_insert_entry *i, struct printbuf *err) @@ -804,7 +801,6 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, un return -EINVAL; } -#endif /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ @@ -1029,7 +1025,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (ret) goto out_reset; -#ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) { struct printbuf buf = PRINTBUF; enum bkey_invalid_flags invalid_flags = 0; @@ -1046,7 +1041,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (ret) return ret; } -#endif if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { ret = do_bch2_trans_commit_to_journal_replay(trans); diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 71ad3893..67ecb5e4 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -194,34 +194,34 @@ struct btree_node_iter { /* * Iterate over all possible positions, synthesizing deleted keys for holes: */ -static const u16 BTREE_ITER_SLOTS = 1 << 0; -static const u16 BTREE_ITER_ALL_LEVELS = 1 << 1; +static const __maybe_unused u16 BTREE_ITER_SLOTS = 1 << 0; +static const __maybe_unused u16 BTREE_ITER_ALL_LEVELS = 1 << 1; /* * Indicates that intent locks should be taken on leaf nodes, because we expect * to be doing updates: */ -static const u16 BTREE_ITER_INTENT = 1 << 2; +static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 2; /* * Causes the btree iterator code to prefetch additional btree nodes from disk: */ -static const u16 BTREE_ITER_PREFETCH = 1 << 3; +static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 3; /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos */ -static const u16 BTREE_ITER_IS_EXTENTS = 1 << 4; -static const u16
BTREE_ITER_NOT_EXTENTS = 1 << 5; -static const u16 BTREE_ITER_CACHED = 1 << 6; -static const u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7; -static const u16 BTREE_ITER_WITH_UPDATES = 1 << 8; -static const u16 BTREE_ITER_WITH_JOURNAL = 1 << 9; -static const u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; -static const u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11; -static const u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12; -static const u16 BTREE_ITER_NOPRESERVE = 1 << 13; -static const u16 BTREE_ITER_CACHED_NOFILL = 1 << 14; -static const u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15; -#define __BTREE_ITER_FLAGS_END 16 +static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 4; +static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 5; +static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 6; +static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7; +static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 8; +static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 9; +static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; +static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11; +static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12; +static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 13; +static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 14; +static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15; +#define __BTREE_ITER_FLAGS_END 16 enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, @@ -459,8 +459,8 @@ struct btree_trans { void *mem; u8 sorted[BTREE_ITER_MAX + 8]; - struct btree_path *paths; - struct btree_insert_entry *updates; + struct btree_path paths[BTREE_ITER_MAX]; + struct btree_insert_entry updates[BTREE_ITER_MAX]; struct btree_write_buffered_key *wb_updates; /* update path: */ diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c index 880ce743..324767c0 100644 --- a/libbcachefs/btree_update.c +++ b/libbcachefs/btree_update.c @@ -124,7 +124,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, struct bkey_s_c old_k, new_k; snapshot_id_list s; struct bkey_i *update; - int ret; + int ret = 0; if (!bch2_snapshot_has_children(c, old_pos.snapshot)) return 0; @@ -466,11 +466,49 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, return 0; } +static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, + struct btree_iter *iter, + struct btree_path *path) +{ + if (!iter->key_cache_path || + !iter->key_cache_path->should_be_locked || + !bpos_eq(iter->key_cache_path->pos, iter->pos)) { + struct bkey_cached *ck; + int ret; + + if (!iter->key_cache_path) + iter->key_cache_path = + bch2_path_get(trans, path->btree_id, path->pos, 1, 0, + BTREE_ITER_INTENT| + BTREE_ITER_CACHED, _THIS_IP_); + + iter->key_cache_path = + bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, + iter->flags & BTREE_ITER_INTENT, + _THIS_IP_); + + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, + BTREE_ITER_CACHED); + if (unlikely(ret)) + return ret; + + ck = (void *) iter->key_cache_path->l[0].b; + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); + } + + btree_path_set_should_be_locked(iter->key_cache_path); + } + + return 0; +} + int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_update_flags flags) { struct 
btree_path *path = iter->update_path ?: iter->path; - struct bkey_cached *ck; int ret; if (iter->flags & BTREE_ITER_IS_EXTENTS) @@ -494,34 +532,9 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter !path->cached && !path->level && btree_id_cached(trans->c, path->btree_id)) { - if (!iter->key_cache_path || - !iter->key_cache_path->should_be_locked || - !bpos_eq(iter->key_cache_path->pos, k->k.p)) { - if (!iter->key_cache_path) - iter->key_cache_path = - bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_INTENT| - BTREE_ITER_CACHED, _THIS_IP_); - - iter->key_cache_path = - bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, - iter->flags & BTREE_ITER_INTENT, - _THIS_IP_); - - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, - BTREE_ITER_CACHED); - if (unlikely(ret)) - return ret; - - ck = (void *) iter->key_cache_path->l[0].b; - - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); - } - - btree_path_set_should_be_locked(iter->key_cache_path); - } + ret = bch2_trans_update_get_key_cache(trans, iter, path); + if (ret) + return ret; path = iter->key_cache_path; } @@ -640,6 +653,7 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, btree, k->k.p, + BTREE_ITER_CACHED| BTREE_ITER_NOT_EXTENTS| BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(&iter) ?: @@ -648,8 +662,8 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans, return ret; } -int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, - struct bkey_i *k, enum btree_update_flags flags) +int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, + struct bkey_i *k, enum btree_update_flags flags) { struct btree_iter iter; int ret; @@ -667,16 +681,18 @@ int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, * bch2_btree_insert - insert keys into the extent btree * @c: pointer to struct bch_fs * @id: btree to insert into - * @insert_keys: list of keys to insert - * @hook: insert callback + * @k: key to insert + * @disk_res: must be non-NULL whenever inserting or potentially + * splitting data extents + * @flags: transaction commit flags + * + * Returns: 0 on success, error code on failure */ -int bch2_btree_insert(struct bch_fs *c, enum btree_id id, - struct bkey_i *k, - struct disk_reservation *disk_res, - u64 *journal_seq, int flags) +int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, + struct disk_reservation *disk_res, int flags) { - return bch2_trans_do(c, disk_res, journal_seq, flags, - __bch2_btree_insert(&trans, id, k, 0)); + return bch2_trans_do(c, disk_res, NULL, flags, + bch2_btree_insert_trans(trans, id, k, 0)); } int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, @@ -714,6 +730,23 @@ int bch2_btree_delete_at_buffered(struct btree_trans *trans, return bch2_trans_update_buffered(trans, btree, k); } +int bch2_btree_delete(struct btree_trans *trans, + enum btree_id btree, struct bpos pos, + unsigned update_flags) +{ + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, btree, pos, + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(trans, &iter, update_flags); + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + int bch2_btree_delete_range_trans(struct btree_trans *trans, 
enum btree_id id, struct bpos start, struct bpos end, unsigned update_flags, @@ -777,9 +810,7 @@ err: } bch2_trans_iter_exit(trans, &iter); - if (!ret && trans_was_restarted(trans, restart_count)) - ret = -BCH_ERR_transaction_restart_nested; - return ret; + return ret ?: trans_was_restarted(trans, restart_count); } /* @@ -793,7 +824,7 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, u64 *journal_seq) { int ret = bch2_trans_run(c, - bch2_btree_delete_range_trans(&trans, id, start, end, + bch2_btree_delete_range_trans(trans, id, start, end, update_flags, journal_seq)); if (ret == -BCH_ERR_transaction_restart_nested) ret = 0; @@ -818,6 +849,7 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, return bch2_trans_update_buffered(trans, btree, k); } +__printf(2, 0) static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) { struct printbuf buf = PRINTBUF; @@ -854,6 +886,7 @@ err: return ret; } +__printf(3, 0) static int __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, va_list args) @@ -865,12 +898,13 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, } else { ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW|commit_flags, - __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args)); + __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args)); } return ret; } +__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) { va_list args; @@ -886,6 +920,7 @@ int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) * Use for logging messages during recovery to enable reserved space and avoid * blocking. */ +__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) { va_list args; diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 901c42b5..9816d228 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -4,7 +4,6 @@ #include "btree_iter.h" #include "journal.h" -#include "journal.h" struct bch_fs; struct btree; @@ -58,14 +57,15 @@ int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, unsigned, unsigned); int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos); +int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, struct bkey_i *, enum btree_update_flags); -int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *, +int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *, enum btree_update_flags); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, - struct disk_reservation *, u64 *, int flags); + struct disk_reservation *, int flags); int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); @@ -114,8 +114,8 @@ void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *, unsigned); -int bch2_fs_log_msg(struct bch_fs *, const char *, ...); -int bch2_journal_log_msg(struct bch_fs *, const char *, ...); +__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...); +__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...); /** * bch2_trans_commit - insert keys at given iterator positions @@ -145,29 +145,16 @@ static inline int bch2_trans_commit(struct 
btree_trans *trans, nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_flags))) -#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ +#define bch2_trans_run(_c, _do) \ ({ \ - struct btree_trans trans; \ - int _ret; \ - \ - bch2_trans_init(&trans, (_c), 0, 0); \ - _ret = commit_do(&trans, _disk_res, _journal_seq, _flags, _do); \ - bch2_trans_exit(&trans); \ - \ + struct btree_trans *trans = bch2_trans_get(_c); \ + int _ret = (_do); \ + bch2_trans_put(trans); \ _ret; \ }) -#define bch2_trans_run(_c, _do) \ -({ \ - struct btree_trans trans; \ - int _ret; \ - \ - bch2_trans_init(&trans, (_c), 0, 0); \ - _ret = (_do); \ - bch2_trans_exit(&trans); \ - \ - _ret; \ -}) +#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ + bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do)) #define trans_for_each_update(_trans, _i) \ for ((_i) = (_trans)->updates; \ diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index c741150e..7dbf6b6c 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -143,10 +143,15 @@ static size_t btree_node_u64s_with_format(struct btree *b, } /** - * btree_node_format_fits - check if we could rewrite node with a new format + * bch2_btree_node_format_fits - check if we could rewrite node with a new format * - * This assumes all keys can pack with the new format -- it just checks if - * the re-packed keys would fit inside the node itself. + * @c: filesystem handle + * @b: btree node to rewrite + * @new_f: bkey format to translate keys to + * + * Returns: true if all re-packed keys will be able to fit in a new node. + * + * Assumes all keys will successfully pack with the new format. 
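/*
 * Editor's note (not part of the patch): the bch2_trans_run()/bch2_trans_do()
 * rewrite above is the heart of this patch's mechanical churn: transactions
 * are no longer declared on the stack and set up with bch2_trans_init() /
 * bch2_trans_exit(), but obtained from bch2_trans_get() and released with
 * bch2_trans_put(), and the macros bind the identifier 'trans' (now a
 * pointer) for the expression they wrap. A minimal usage sketch, with
 * do_thing_trans() standing in for any helper taking a struct btree_trans *:
 *
 *	int do_thing(struct bch_fs *c)
 *	{
 *		return bch2_trans_run(c, do_thing_trans(trans));
 *	}
 *
 * which is why so many '&trans' arguments become plain 'trans' below.
 */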
*/ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, struct bkey_format *new_f) @@ -244,7 +249,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct write_point *wp; struct btree *b; BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; - struct open_buckets ob = { .nr = 0 }; + struct open_buckets obs = { .nr = 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim @@ -257,7 +262,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct btree_alloc *a = &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - ob = a->ob; + obs = a->ob; bkey_copy(&tmp.k, &a->k); mutex_unlock(&c->btree_reserve_cache_lock); goto mem_alloc; @@ -292,7 +297,7 @@ retry: bkey_btree_ptr_v2_init(&tmp.k); bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false); - bch2_open_bucket_get(c, wp, &ob); + bch2_open_bucket_get(c, wp, &obs); bch2_alloc_sectors_done(c, wp); mem_alloc: b = bch2_btree_node_mem_alloc(trans, interior_node); @@ -304,7 +309,7 @@ mem_alloc: BUG_ON(b->ob.nr); bkey_copy(&b->key, &tmp.k); - b->ob = ob; + b->ob = obs; return b; } @@ -592,12 +597,11 @@ static void btree_update_nodes_written(struct btree_update *as) { struct bch_fs *c = as->c; struct btree *b; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); u64 journal_seq = 0; unsigned i; int ret; - bch2_trans_init(&trans, c, 0, 512); /* * If we're already in an error state, it might be because a btree node * was never written, and we might be trying to free that same btree @@ -618,7 +622,7 @@ static void btree_update_nodes_written(struct btree_update *as) b = as->old_nodes[i]; - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); seq = b->data ? b->data->keys.seq : 0; six_unlock_read(&b->c.lock); @@ -640,13 +644,13 @@ static void btree_update_nodes_written(struct btree_update *as) * journal reclaim does btree updates when flushing bkey_cached entries, * which may require allocations as well. 
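 *
 * (Editor's note, not part of the patch: hence the BCH_WATERMARK_reclaim and
 * BTREE_INSERT_JOURNAL_RECLAIM flags on the commit_do() just below; this
 * commit can run in journal reclaim context, so it must not block on normal
 * allocation watermarks or recurse into reclaim.)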
*/ - ret = commit_do(&trans, &as->disk_res, &journal_seq, + ret = commit_do(trans, &as->disk_res, &journal_seq, BCH_WATERMARK_reclaim| BTREE_INSERT_NOFAIL| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_JOURNAL_RECLAIM, - btree_update_nodes_written_trans(&trans, as)); - bch2_trans_unlock(&trans); + btree_update_nodes_written_trans(trans, as)); + bch2_trans_unlock(trans); bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, "%s(): error %s", __func__, bch2_err_str(ret)); @@ -655,7 +659,7 @@ err: struct btree_path *path; b = as->b; - path = get_unlocked_mut_path(&trans, as->btree_id, b->c.level, b->key.k.p); + path = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p); /* * @b is the node we did the final insert into: * @@ -678,13 +682,13 @@ err: * we may rarely end up with a locked path besides the one we * have here: */ - bch2_trans_unlock(&trans); - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent); - mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent); + bch2_trans_unlock(trans); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED); path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); path->l[b->c.level].b = b; - bch2_btree_node_lock_write_nofail(&trans, path, &b->c); + bch2_btree_node_lock_write_nofail(trans, path, &b->c); mutex_lock(&c->btree_interior_update_lock); @@ -697,15 +701,15 @@ err: * btree_interior_update_lock: */ if (as->b == b) { - struct bset *i = btree_bset_last(b); - BUG_ON(!b->c.level); BUG_ON(!btree_node_dirty(b)); if (!ret) { - i->journal_seq = cpu_to_le64( + struct bset *last = btree_bset_last(b); + + last->journal_seq = cpu_to_le64( max(journal_seq, - le64_to_cpu(i->journal_seq))); + le64_to_cpu(last->journal_seq))); bch2_btree_add_journal_pin(c, b, journal_seq); } else { @@ -724,8 +728,8 @@ err: six_unlock_write(&b->c.lock); btree_node_write_if_need(c, b, SIX_LOCK_intent); - btree_node_unlock(&trans, path, b->c.level); - bch2_path_put(&trans, path, true); + btree_node_unlock(trans, path, b->c.level); + bch2_path_put(trans, path, true); } bch2_journal_pin_drop(&c->journal, &as->journal); @@ -745,7 +749,7 @@ err: for (i = 0; i < as->nr_new_nodes; i++) { b = as->new_nodes[i]; - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); } @@ -753,8 +757,8 @@ err: for (i = 0; i < as->nr_open_buckets; i++) bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); - bch2_btree_update_free(as, &trans); - bch2_trans_exit(&trans); + bch2_btree_update_free(as, trans); + bch2_trans_put(trans); } static void btree_interior_update_work(struct work_struct *work) @@ -1216,18 +1220,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) bch2_recalc_btree_reserve(c); } -/** - * bch_btree_set_root - update the root in memory and on disk - * - * To ensure forward progress, the current task must not be holding any - * btree node write locks. However, you must hold an intent lock on the - * old root. - * - * Note: This allocates a journal entry but doesn't add any keys to - * it. All the btree roots are part of every journal write, so there - * is nothing new to be done. This just guarantees that there is a - * journal write. 
- */ static void bch2_btree_set_root(struct btree_update *as, struct btree_trans *trans, struct btree_path *path, @@ -1341,12 +1333,12 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, ; while (!bch2_keylist_empty(keys)) { - struct bkey_i *k = bch2_keylist_front(keys); + insert = bch2_keylist_front(keys); - if (bpos_gt(k->k.p, b->key.k.p)) + if (bpos_gt(insert->k.p, b->key.k.p)) break; - bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, k); + bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); bch2_keylist_pop_front(keys); } } @@ -1513,12 +1505,12 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent); + mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, path1, n1); path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p); six_lock_increment(&n2->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path2, n2->c.level, SIX_LOCK_intent); + mark_btree_node_locked(trans, path2, n2->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, path2, n2); /* @@ -1539,7 +1531,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, path2->locks_want++; BUG_ON(btree_node_locked(path2, n3->c.level)); six_lock_increment(&n3->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path2, n3->c.level, SIX_LOCK_intent); + mark_btree_node_locked(trans, path2, n3->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, path2, n3); n3->sib_u64s[0] = U16_MAX; @@ -1563,7 +1555,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent); + mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, path1, n1); if (parent) @@ -1661,12 +1653,16 @@ bch2_btree_insert_keys_interior(struct btree_update *as, } /** - * bch_btree_insert_node - insert bkeys into a given btree node + * bch2_btree_insert_node - insert bkeys into a given btree node * - * @iter: btree iterator + * @as: btree_update object + * @trans: btree_trans object + * @path: path that points to current node + * @b: node to insert keys into * @keys: list of keys to insert - * @hook: insert callback - * @persistent: if not null, @persistent will wait on journal write + * @flags: transaction commit flags + * + * Returns: 0 on success, typically transaction restart error on failure * * Inserts as many keys as it can into a given btree node, splitting it if full. * If a split occurred, this function will return early. 
This can only happen @@ -1890,7 +1886,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent); + mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, new_path, n); bkey_init(&delete.k); @@ -1934,9 +1930,6 @@ err_free_update: goto out; } -/** - * bch_btree_node_rewrite - Rewrite/move a btree node - */ int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_iter *iter, struct btree *b, @@ -1967,7 +1960,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent); + mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, new_path, n); trace_and_count(c, btree_node_rewrite, c, b); @@ -2055,9 +2048,9 @@ static void async_btree_node_rewrite_work(struct work_struct *work) int ret; ret = bch2_trans_do(c, NULL, NULL, 0, - async_btree_node_rewrite_trans(&trans, a)); + async_btree_node_rewrite_trans(trans, a)); if (ret) - bch_err(c, "%s: error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); kfree(a); } @@ -2096,8 +2089,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) ret = bch2_fs_read_write_early(c); if (ret) { - bch_err(c, "%s: error going read-write: %s", - __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "going read-write"); kfree(a); return; } @@ -2372,7 +2364,7 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) { - bch2_trans_run(c, __bch2_btree_root_alloc(&trans, id)); + bch2_trans_run(c, __bch2_btree_root_alloc(trans, id)); } void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c index 6d2d43b6..4e6241db 100644 --- a/libbcachefs/btree_write_buffer.c +++ b/libbcachefs/btree_write_buffer.c @@ -296,7 +296,7 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j, mutex_lock(&wb->flush_lock); return bch2_trans_run(c, - __bch2_btree_write_buffer_flush(&trans, BTREE_INSERT_NOCHECK_RW, true)); + __bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true)); } static inline u64 btree_write_buffer_ref(int idx) diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index c02c8c91..e7f4506f 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -680,7 +680,7 @@ static int check_bucket_ref(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); - u16 bucket_sectors = !ptr->cached + u32 bucket_sectors = !ptr->cached ? 
dirty_sectors : cached_sectors; struct printbuf buf = PRINTBUF; @@ -752,9 +752,9 @@ static int check_bucket_ref(struct btree_trans *trans, goto err; } - if ((unsigned) (bucket_sectors + sectors) > U32_MAX) { + if ((u64) bucket_sectors + sectors > U32_MAX) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" "while marking %s", ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type ?: ptr_data_type], @@ -1201,7 +1201,7 @@ not_found: new->k.p = bkey_start_pos(p.k); new->k.p.offset += *idx - start; bch2_key_resize(&new->k, next_idx - *idx); - ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new->k_i, + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_NORUN); } @@ -1300,7 +1300,7 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, static int warned_disk_usage = 0; bool warn = false; unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; - struct replicas_delta *d = deltas->d, *d2; + struct replicas_delta *d, *d2; struct replicas_delta *top = (void *) deltas->d + deltas->used; struct bch_fs_usage *dst; s64 added = 0, should_not_have_added; @@ -1923,7 +1923,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) { - int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca)); + int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca)); if (ret) bch_err_fn(c, ret); diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index f192809f..ecbeb728 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -40,15 +40,42 @@ static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, secto for (_b = (_buckets)->b + (_buckets)->first_bucket; \ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) +/* + * Ugly hack alert: + * + * We need to cram a spinlock in a single byte, because that's what we have left + * in struct bucket, and we care about the size of these - during fsck, we need + * in memory state for every single bucket on every device. + * + * We used to do + * while (xchg(&b->lock, 1)) cpu_relax(); + * but, it turns out not all architectures support xchg on a single byte. + * + * So now we use bit_spin_lock(), with fun games since we can't burn a whole + * ulong for this - we just need to make sure the lock bit always ends up in the + * first byte.
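+ *
+ * (Editor's note, not part of the patch:) the BUILD_BUG_ON() in
+ * bucket_unlock() below encodes exactly this invariant: viewed through
+ * union ulong_byte_assert, 1UL << BUCKET_LOCK_BITNR must land in the first
+ * byte. On a little-endian host { .ulong = 1UL << 0 }.byte == 0x01; on a
+ * big-endian one { .ulong = 1UL << (BITS_PER_LONG - 1) }.byte == 0x80; in
+ * both cases the byte overlapping 'lock' is nonzero.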
+ */ + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define BUCKET_LOCK_BITNR 0 +#else +#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1) +#endif + +union ulong_byte_assert { + ulong ulong; + u8 byte; +}; + static inline void bucket_unlock(struct bucket *b) { - smp_store_release(&b->lock, 0); + BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); + bit_spin_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock); } static inline void bucket_lock(struct bucket *b) { - while (xchg(&b->lock, 1)) - cpu_relax(); + bit_spin_lock(BUCKET_LOCK_BITNR, (void *) &b->lock); } static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) @@ -180,7 +207,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma switch (watermark) { case BCH_WATERMARK_NR: - unreachable(); + BUG(); case BCH_WATERMARK_stripe: reserved += ca->mi.nbuckets >> 6; fallthrough; diff --git a/libbcachefs/buckets_waiting_for_journal.c b/libbcachefs/buckets_waiting_for_journal.c index 81ab685c..ec1b636e 100644 --- a/libbcachefs/buckets_waiting_for_journal.c +++ b/libbcachefs/buckets_waiting_for_journal.c @@ -133,7 +133,7 @@ retry_rehash: b->t = n; kvfree(t); - pr_debug("took %zu rehashes, table at %zu/%zu elements", + pr_debug("took %zu rehashes, table at %zu/%lu elements", nr_rehashes, nr_elements, 1UL << b->t->bits); out: mutex_unlock(&b->lock); diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index fb603df0..f69e15dc 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -86,10 +86,9 @@ static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) devs[i] = strndup_user((const char __user *)(unsigned long) user_devs[i], PATH_MAX); - if (!devs[i]) { - ret = -ENOMEM; + ret = PTR_ERR_OR_ZERO(devs[i]); + if (ret) goto err; - } } c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty()); @@ -117,8 +116,9 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg) return -EINVAL; path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - if (!path) - return -ENOMEM; + ret = PTR_ERR_OR_ZERO(path); + if (ret) + return ret; err = bch2_fs_open_incremental(path); kfree(path); @@ -149,9 +149,10 @@ static long bch2_global_ioctl(unsigned cmd, void __user *arg) static long bch2_ioctl_query_uuid(struct bch_fs *c, struct bch_ioctl_query_uuid __user *user_arg) { - return copy_to_user(&user_arg->uuid, - &c->sb.user_uuid, - sizeof(c->sb.user_uuid)); + if (copy_to_user(&user_arg->uuid, &c->sb.user_uuid, + sizeof(c->sb.user_uuid))) + return -EFAULT; + return 0; } #if 0 @@ -188,8 +189,9 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) return -EINVAL; path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - if (!path) - return -ENOMEM; + ret = PTR_ERR_OR_ZERO(path); + if (ret) + return ret; ret = bch2_dev_add(c, path); kfree(path); @@ -230,8 +232,9 @@ static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) return -EINVAL; path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); - if (!path) - return -ENOMEM; + ret = PTR_ERR_OR_ZERO(path); + if (ret) + return ret; ret = bch2_dev_online(c, path); kfree(path); @@ -338,7 +341,10 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, if (len < sizeof(e)) return -EINVAL; - return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e); + if (copy_to_user(buf, &e, sizeof(e))) + return -EFAULT; + + return sizeof(e); } static const struct file_operations bcachefs_data_ops = { @@
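/*
 * Editor's note (not part of the patch): two idioms recur in the chardev.c
 * hunks above. First, strndup_user() returns an ERR_PTR() on failure, never
 * NULL, so the old 'if (!devs[i]) ret = -ENOMEM' test could never fire;
 * PTR_ERR_OR_ZERO() is the correct check. Second, copy_to_user() returns
 * the number of bytes it could NOT copy rather than an errno, so returning
 * its result directly hands a positive count back to userspace; the
 * canonical pattern is:
 *
 *	if (copy_to_user(user_arg, &arg, sizeof(arg)))
 *		return -EFAULT;
 *	return 0;
 */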
-417,7 +423,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) return -EFAULT; - arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL); + arg = kzalloc(size_add(sizeof(*arg), replica_entries_bytes), GFP_KERNEL); if (!arg) return -ENOMEM; @@ -466,9 +472,11 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, percpu_up_read(&c->mark_lock); kfree(src); - if (!ret) - ret = copy_to_user(user_arg, arg, - sizeof(*arg) + arg->replica_entries_bytes); + if (ret) + goto err; + if (copy_to_user(user_arg, arg, + sizeof(*arg) + arg->replica_entries_bytes)) + ret = -EFAULT; err: kfree(arg); return ret; @@ -513,7 +521,10 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, percpu_ref_put(&ca->ref); - return copy_to_user(user_arg, &arg, sizeof(arg)); + if (copy_to_user(user_arg, &arg, sizeof(arg))) + return -EFAULT; + + return 0; } static long bch2_ioctl_read_super(struct bch_fs *c, @@ -550,8 +561,9 @@ static long bch2_ioctl_read_super(struct bch_fs *c, goto err; } - ret = copy_to_user((void __user *)(unsigned long)arg.sb, - sb, vstruct_bytes(sb)); + if (copy_to_user((void __user *)(unsigned long)arg.sb, sb, + vstruct_bytes(sb))) + ret = -EFAULT; err: if (!IS_ERR_OR_NULL(ca)) percpu_ref_put(&ca->ref); @@ -617,6 +629,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, arg.pad) return -EINVAL; + if (arg.nbuckets > U32_MAX) + return -EINVAL; + ca = bch2_device_lookup(c, arg.dev, arg.flags); if (IS_ERR(ca)) return PTR_ERR(ca); diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c index 4c87c596..1948119e 100644 --- a/libbcachefs/checksum.c +++ b/libbcachefs/checksum.c @@ -139,7 +139,7 @@ static inline int do_encrypt(struct crypto_sync_skcipher *tfm, for (i = 0; i < pages; i++) { unsigned offset = offset_in_page(buf); - unsigned pg_len = min(len, PAGE_SIZE - offset); + unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset); sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset); buf += pg_len; @@ -159,15 +159,16 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, crypto_alloc_sync_skcipher("chacha20", 0, 0); int ret; - if (!chacha20) { - pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20)); - return PTR_ERR(chacha20); + ret = PTR_ERR_OR_ZERO(chacha20); + if (ret) { + pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret)); + return ret; } ret = crypto_skcipher_setkey(&chacha20->base, (void *) key, sizeof(*key)); if (ret) { - pr_err("crypto_skcipher_setkey() error: %i", ret); + pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret)); goto err; } @@ -366,11 +367,11 @@ struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, BUG_ON(!bch2_checksum_mergeable(type)); while (b_len) { - unsigned b = min_t(unsigned, b_len, PAGE_SIZE); + unsigned page_len = min_t(unsigned, b_len, PAGE_SIZE); bch2_checksum_update(&state, - page_address(ZERO_PAGE(0)), b); - b_len -= b; + page_address(ZERO_PAGE(0)), page_len); + b_len -= page_len; } a.lo = (__le64 __force) bch2_checksum_final(&state); a.lo ^= b.lo; @@ -395,9 +396,9 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, unsigned csum_type; struct bch_csum csum; } splits[3] = { - { crc_a, len_a, new_csum_type }, - { crc_b, len_b, new_csum_type }, - { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type }, + { crc_a, len_a, new_csum_type, { 0 }}, + { crc_b, len_b, new_csum_type, { 0 } }, + { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } }, }, *i; bool mergeable = 
crc_old.csum_type == new_csum_type && bch2_checksum_mergeable(new_csum_type); @@ -558,6 +559,7 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key) return ret; } +#ifndef __KERNEL__ int bch2_revoke_key(struct bch_sb *sb) { key_serial_t key_id; @@ -575,6 +577,7 @@ int bch2_revoke_key(struct bch_sb *sb) return 0; } +#endif int bch2_decrypt_sb_key(struct bch_fs *c, struct bch_sb_field_crypt *crypt, @@ -596,7 +599,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c, /* decrypt real key: */ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), - &sb_key, sizeof(sb_key)); + &sb_key, sizeof(sb_key)); if (ret) goto err; diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h index 9a4898db..13998388 100644 --- a/libbcachefs/checksum.h +++ b/libbcachefs/checksum.h @@ -40,15 +40,16 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, */ #define csum_vstruct(_c, _type, _nonce, _i) \ ({ \ - const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \ - const void *end = vstruct_end(_i); \ + const void *_start = ((const void *) (_i)) + sizeof((_i)->csum);\ \ - bch2_checksum(_c, _type, _nonce, start, end - start); \ + bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\ }) int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); int bch2_request_key(struct bch_sb *, struct bch_key *); +#ifndef __KERNEL__ int bch2_revoke_key(struct bch_sb *); +#endif int bch2_encrypt(struct bch_fs *, unsigned, struct nonce, void *data, size_t); diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c index 6b17f7cc..1480b645 100644 --- a/libbcachefs/compress.c +++ b/libbcachefs/compress.c @@ -3,7 +3,6 @@ #include "checksum.h" #include "compress.h" #include "extents.h" -#include "io.h" #include "super-io.h" #include <linux/lz4.h> @@ -571,7 +570,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) { size_t decompress_workspace_size = 0; - bool decompress_workspace_needed; ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), c->opts.encoded_extent_max); struct { @@ -581,7 +579,8 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) size_t decompress_workspace; } compression_types[] = { { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, - max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) }, + max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS), + 0 }, { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), zlib_inflate_workspacesize(), }, @@ -620,9 +619,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) if (!(features & (1 << i->feature))) continue; - if (i->decompress_workspace) - decompress_workspace_needed = true; - if (mempool_initialized(&c->compress_workspace[i->type])) continue; diff --git a/libbcachefs/counters.c b/libbcachefs/counters.c index 442a9b80..26eb3d82 100644 --- a/libbcachefs/counters.c +++ b/libbcachefs/counters.c @@ -43,7 +43,7 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, prt_tab(out); prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i])); prt_newline(out); - }; + } }; int bch2_sb_counters_to_cpu(struct bch_fs *c) diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index 81518f20..899ff46d 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -9,7 +9,7 @@ #include "ec.h" #include "error.h" #include "extents.h" -#include "io.h" +#include "io_write.h" #include "keylist.h" #include "move.h" #include 
"nocow_locking.h" @@ -49,10 +49,6 @@ static void trace_move_extent_fail2(struct data_update *m, if (insert) { i = 0; bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { - struct bkey_s new_s; - new_s.k = (void *) new.k; - new_s.v = (void *) new.v; - if (((1U << i) & m->data_opts.rewrite_ptrs) && (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && !ptr->cached) @@ -307,7 +303,7 @@ out: int bch2_data_update_index_update(struct bch_write_op *op) { - return bch2_trans_run(op->c, __bch2_data_update_index_update(&trans, op)); + return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op)); } void bch2_data_update_read_done(struct data_update *m, diff --git a/libbcachefs/data_update.h b/libbcachefs/data_update.h index 49e9055c..7ca1f98d 100644 --- a/libbcachefs/data_update.h +++ b/libbcachefs/data_update.h @@ -4,7 +4,7 @@ #define _BCACHEFS_DATA_UPDATE_H #include "bkey_buf.h" -#include "io_types.h" +#include "io_write_types.h" struct moving_context; diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index ae47e185..75a3dc7c 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -19,7 +19,6 @@ #include "extents.h" #include "fsck.h" #include "inode.h" -#include "io.h" #include "super.h" #include <linux/console.h> @@ -154,10 +153,8 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) BUG_ON(b->nsets != 1); for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k)) - if (k->type == KEY_TYPE_btree_ptr_v2) { - struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k); - v->mem_ptr = 0; - } + if (k->type == KEY_TYPE_btree_ptr_v2) + ((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0; v = c->verify_data; bkey_copy(&v->key, &b->key); @@ -322,16 +319,16 @@ static ssize_t flush_buf(struct dump_iter *i) { if (i->buf.pos) { size_t bytes = min_t(size_t, i->buf.pos, i->size); - int err = copy_to_user(i->ubuf, i->buf.buf, bytes); + int copied = bytes - copy_to_user(i->ubuf, i->buf.buf, bytes); - if (err) - return err; + i->ret += copied; + i->ubuf += copied; + i->size -= copied; + i->buf.pos -= copied; + memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos); - i->ret += bytes; - i->ubuf += bytes; - i->size -= bytes; - i->buf.pos -= bytes; - memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos); + if (copied != bytes) + return -EFAULT; } return i->size ? 
0 : i->ret; @@ -369,7 +366,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; ssize_t ret; @@ -382,17 +379,17 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, if (ret) return ret; - bch2_trans_init(&trans, i->c, 0, 0); - ret = for_each_btree_key2(&trans, iter, i->id, i->from, + trans = bch2_trans_get(i->c); + ret = for_each_btree_key2(trans, iter, i->id, i->from, BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ({ bch2_bkey_val_to_text(&i->buf, i->c, k); prt_newline(&i->buf); - drop_locks_do(&trans, flush_buf(i)); + drop_locks_do(trans, flush_buf(i)); })); i->from = iter.pos; - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (!ret) ret = flush_buf(i); @@ -411,7 +408,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct btree *b; ssize_t ret; @@ -427,26 +424,26 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, if (bpos_eq(SPOS_MAX, i->from)) return i->ret; - bch2_trans_init(&trans, i->c, 0, 0); + trans = bch2_trans_get(i->c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - for_each_btree_node(&trans, iter, i->id, i->from, 0, b, ret) { + for_each_btree_node(trans, iter, i->id, i->from, 0, b, ret) { bch2_btree_node_to_text(&i->buf, i->c, b); i->from = !bpos_eq(SPOS_MAX, b->key.k.p) ? bpos_successor(b->key.k.p) : b->key.k.p; - ret = drop_locks_do(&trans, flush_buf(i)); + ret = drop_locks_do(trans, flush_buf(i)); if (ret) break; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (!ret) ret = flush_buf(i); @@ -465,7 +462,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; ssize_t ret; @@ -478,9 +475,9 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, if (ret) return ret; - bch2_trans_init(&trans, i->c, 0, 0); + trans = bch2_trans_get(i->c); - ret = for_each_btree_key2(&trans, iter, i->id, i->from, + ret = for_each_btree_key2(trans, iter, i->id, i->from, BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ({ struct btree_path_level *l = &iter.path->l[0]; @@ -493,11 +490,11 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, } bch2_bfloat_to_text(&i->buf, l->b, _k); - drop_locks_do(&trans, flush_buf(i)); + drop_locks_do(trans, flush_buf(i)); })); i->from = iter.pos; - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (!ret) ret = flush_buf(i); diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index a7559ab0..6c6c8d57 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -479,21 +479,19 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, const struct bch_hash_info *hash_info, const struct qstr *name, subvol_inum *inum) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; int ret; - - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - 
ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info, + ret = __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (!ret) - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } @@ -522,7 +520,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_dirent dirent; @@ -533,15 +531,14 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) int ret; bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) goto err; - for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents, + for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, SPOS(inum.inum, ctx->pos, snapshot), POS(inum.inum, U64_MAX), 0, k, ret) { if (k.k->type != KEY_TYPE_dirent) @@ -549,7 +546,7 @@ retry: dirent = bkey_s_c_to_dirent(k); - ret = bch2_dirent_read_target(&trans, inum, dirent, &target); + ret = bch2_dirent_read_target(trans, inum, dirent, &target); if (ret < 0) break; if (ret) @@ -558,7 +555,7 @@ retry: /* dir_emit() can fault and block: */ bch2_bkey_buf_reassemble(&sk, c, k); dirent = bkey_i_to_s_c_dirent(sk.k); - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); name = bch2_dirent_get_name(dirent); @@ -574,16 +571,16 @@ retry: * read_target looks up subvolumes, we can overflow paths if the * directory has many subvolumes in it */ - ret = btree_trans_too_many_iters(&trans); + ret = btree_trans_too_many_iters(trans); if (ret) break; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); return ret; diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c index f36472c4..b292dbef 100644 --- a/libbcachefs/disk_groups.c +++ b/libbcachefs/disk_groups.c @@ -32,21 +32,21 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, for (i = 0; i < sb->nr_devices; i++) { struct bch_member *m = mi->members + i; - unsigned g; + unsigned group_id; if (!BCH_MEMBER_GROUP(m)) continue; - g = BCH_MEMBER_GROUP(m) - 1; + group_id = BCH_MEMBER_GROUP(m) - 1; - if (g >= nr_groups) { + if (group_id >= nr_groups) { prt_printf(err, "disk %u has invalid label %u (have %u)", - i, g, nr_groups); + i, group_id, nr_groups); return -BCH_ERR_invalid_sb_disk_groups; } - if (BCH_GROUP_DELETED(&groups->entries[g])) { - prt_printf(err, "disk %u has deleted label %u", i, g); + if (BCH_GROUP_DELETED(&groups->entries[group_id])) { + prt_printf(err, "disk %u has deleted label %u", i, group_id); return -BCH_ERR_invalid_sb_disk_groups; } } @@ -183,8 +183,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { struct bch_member *m = mi->members + i; - struct bch_disk_group_cpu *dst = - &cpu_g->entries[BCH_MEMBER_GROUP(m)]; + struct bch_disk_group_cpu *dst; if (!bch2_member_exists(m)) continue; diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 
f58e84a2..8646856e 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -11,10 +11,11 @@ #include "btree_update.h" #include "btree_write_buffer.h" #include "buckets.h" +#include "checksum.h" #include "disk_groups.h" #include "ec.h" #include "error.h" -#include "io.h" +#include "io_read.h" #include "keylist.h" #include "recovery.h" #include "replicas.h" @@ -475,7 +476,7 @@ err: static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) { - return bch2_trans_run(c, get_stripe_key_trans(&trans, idx, stripe)); + return bch2_trans_run(c, get_stripe_key_trans(trans, idx, stripe)); } /* recovery read path: */ @@ -787,12 +788,10 @@ static void ec_stripe_delete_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, ec_stripe_delete_work); - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); int ret; u64 idx; - bch2_trans_init(&trans, c, 0, 0); - while (1) { mutex_lock(&c->ec_stripes_heap_lock); idx = stripe_idx_to_delete(c); @@ -801,15 +800,15 @@ static void ec_stripe_delete_work(struct work_struct *work) if (!idx) break; - ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, - ec_stripe_delete(&trans, idx)); + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ec_stripe_delete(trans, idx)); if (ret) { bch_err_fn(c, ret); break; } } - bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); } @@ -998,24 +997,22 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; unsigned i, nr_data = v->nr_blocks - v->nr_redundant; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - ret = bch2_btree_write_buffer_flush(&trans); + ret = bch2_btree_write_buffer_flush(trans); if (ret) goto err; for (i = 0; i < nr_data; i++) { - ret = ec_stripe_update_bucket(&trans, s, i); + ret = ec_stripe_update_bucket(trans, s, i); if (ret) break; } err: - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -1123,7 +1120,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL, - ec_stripe_key_update(&trans, + ec_stripe_key_update(trans, bkey_i_to_stripe(&s->new_stripe.key), !s->have_existing_stripe)); if (ret) { @@ -1133,8 +1130,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) ret = ec_stripe_update_extents(c, &s->new_stripe); if (ret) { - bch_err(c, "error creating stripe: error updating pointers: %s", - bch2_err_str(ret)); + bch_err_msg(c, ret, "creating stripe: error updating pointers"); goto err; } err: @@ -1822,7 +1818,7 @@ void bch2_fs_ec_flush(struct bch_fs *c) int bch2_stripes_read(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; const struct bch_stripe *s; @@ -1830,9 +1826,7 @@ int bch2_stripes_read(struct bch_fs *c) unsigned i; int ret; - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { if (k.k->type != KEY_TYPE_stripe) continue; @@ -1855,9 +1849,9 @@ int bch2_stripes_read(struct bch_fs *c) bch2_stripes_heap_insert(c, m, k.k->p.offset); } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, 
&iter); - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h index 885ae5d5..966d165a 100644 --- a/libbcachefs/ec.h +++ b/libbcachefs/ec.h @@ -240,7 +240,7 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, bch2_ec_do_stripe_creates(c); break; default: - unreachable(); + BUG(); } } diff --git a/libbcachefs/errcode.c b/libbcachefs/errcode.c index dc906fc9..d260ff9b 100644 --- a/libbcachefs/errcode.c +++ b/libbcachefs/errcode.c @@ -12,8 +12,6 @@ static const char * const bch2_errcode_strs[] = { NULL }; -#define BCH_ERR_0 0 - static unsigned bch2_errcode_parents[] = { #define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class, BCH_ERRCODES() @@ -61,3 +59,10 @@ int __bch2_err_class(int err) return -err; } + +const char *bch2_blk_status_to_str(blk_status_t status) +{ + if (status == BLK_STS_REMOVED) + return "device removed"; + return blk_status_to_str(status); +} diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index f7fa8744..64f7176c 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -99,6 +99,7 @@ x(ENOENT, ENOENT_str_hash_set_must_replace) \ x(ENOENT, ENOENT_inode) \ x(ENOENT, ENOENT_not_subvol) \ + x(ENOENT, ENOENT_not_directory) \ x(ENOENT, ENOENT_directory_dead) \ x(ENOENT, ENOENT_subvolume) \ x(ENOENT, ENOENT_snapshot_tree) \ @@ -218,7 +219,14 @@ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \ - x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) + x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) \ + x(0, nopromote) \ + x(BCH_ERR_nopromote, nopromote_may_not) \ + x(BCH_ERR_nopromote, nopromote_already_promoted) \ + x(BCH_ERR_nopromote, nopromote_unwritten) \ + x(BCH_ERR_nopromote, nopromote_congested) \ + x(BCH_ERR_nopromote, nopromote_in_flight) \ + x(BCH_ERR_nopromote, nopromote_enomem) enum bch_errcode { BCH_ERR_START = 2048, @@ -249,4 +257,8 @@ static inline long bch2_err_class(long err) return err < 0 ? 
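/*
 * Editor's note (not part of the patch): bcachefs private error codes live
 * above BCH_ERR_START and form a tree; each x(class, err) entry in
 * BCH_ERRCODES() records its parent in bch2_errcode_parents[], and
 * bch2_err_matches() walks that chain. So for the new nopromote family
 * added above, a sketch of the expected behaviour:
 *
 *	int ret = -BCH_ERR_nopromote_congested;
 *
 *	bch2_err_matches(ret, BCH_ERR_nopromote);	-> true
 *
 * letting callers test for the whole class without enumerating the leaves.
 */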
__bch2_err_class(err) : err; } +#define BLK_STS_REMOVED ((__force blk_status_t)128) + +const char *bch2_blk_status_to_str(blk_status_t); + #endif /* _BCACHFES_ERRCODE_H */ diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 39009cf0..2a5af887 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "error.h" -#include "io.h" #include "super.h" #define FSCK_ERR_RATELIMIT_NR 10 diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c index cbfb5b21..58ccc7b9 100644 --- a/libbcachefs/fs-io-buffered.c +++ b/libbcachefs/fs-io-buffered.c @@ -8,7 +8,8 @@ #include "fs-io-buffered.h" #include "fs-io-direct.h" #include "fs-io-pagecache.h" -#include "io.h" +#include "io_read.h" +#include "io_write.h" #include <linux/backing-dev.h> #include <linux/pagemap.h> @@ -269,7 +270,7 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct folio *folio; struct readpages_iter readpages_iter; int ret; @@ -279,8 +280,6 @@ void bch2_readahead(struct readahead_control *ractl) ret = readpages_iter_init(&readpages_iter, ractl); BUG_ON(ret); - bch2_trans_init(&trans, c, 0, 0); - bch2_pagecache_add_get(inode); while ((folio = readpage_iter_peek(&readpages_iter))) { @@ -299,31 +298,27 @@ void bch2_readahead(struct readahead_control *ractl) rbio->bio.bi_end_io = bch2_readpages_end_io; BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - bchfs_read(&trans, rbio, inode_inum(inode), + bchfs_read(trans, rbio, inode_inum(inode), &readpages_iter); - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); } bch2_pagecache_add_put(inode); - bch2_trans_exit(&trans); + bch2_trans_put(trans); darray_exit(&readpages_iter.folios); } static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, subvol_inum inum, struct folio *folio) { - struct btree_trans trans; - bch2_folio_create(folio, __GFP_NOFAIL); rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; rbio->bio.bi_iter.bi_sector = folio_sector(folio); BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - bch2_trans_init(&trans, c, 0, 0); - bchfs_read(&trans, rbio, inum, NULL); - bch2_trans_exit(&trans); + bch2_trans_run(c, (bchfs_read(trans, rbio, inum, NULL), 0)); } static void bch2_read_single_folio_end_io(struct bio *bio) @@ -694,12 +689,12 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, if (IS_ERR_OR_NULL(folio)) goto err_unlock; - if (folio_test_uptodate(folio)) - goto out; - offset = pos - folio_pos(folio); len = min_t(size_t, len, folio_end_pos(folio) - pos); + if (folio_test_uptodate(folio)) + goto out; + /* If we're writing entire folio, don't need to read it in first: */ if (!offset && len == folio_size(folio)) goto out; @@ -800,10 +795,10 @@ int bch2_write_end(struct file *file, struct address_space *mapping, return copied; } -static noinline void folios_trunc(folios *folios, struct folio **fi) +static noinline void folios_trunc(folios *fs, struct folio **fi) { - while (folios->data + folios->nr > fi) { - struct folio *f = darray_pop(folios); + while (fs->data + fs->nr > fi) { + struct folio *f = darray_pop(fs); folio_unlock(f); folio_put(f); @@ -817,35 +812,35 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct 
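/*
 * Editor's note (not part of the patch): bch2_trans_run() evaluates its
 * second argument as an int-valued expression with 'trans' in scope, so the
 * void-returning read in __bchfs_readfolio() above is wrapped with the
 * comma operator:
 *
 *	bch2_trans_run(c, (bchfs_read(trans, rbio, inum, NULL), 0));
 *
 * i.e. run bchfs_read() inside a transaction and yield 0 as the result.
 */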
bch2_folio_reservation res; - folios folios; + folios fs; struct folio **fi, *f; - unsigned copied = 0, f_offset; - u64 end = pos + len, f_pos; + unsigned copied = 0, f_offset, f_copied; + u64 end = pos + len, f_pos, f_len; loff_t last_folio_pos = inode->v.i_size; int ret = 0; BUG_ON(!len); bch2_folio_reservation_init(c, inode, &res); - darray_init(&folios); + darray_init(&fs); ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, mapping_gfp_mask(mapping), - &folios); + &fs); if (ret) goto out; - BUG_ON(!folios.nr); + BUG_ON(!fs.nr); - f = darray_first(folios); + f = darray_first(fs); if (pos != folio_pos(f) && !folio_test_uptodate(f)) { ret = bch2_read_single_folio(f, mapping); if (ret) goto out; } - f = darray_last(folios); + f = darray_last(fs); end = min(end, folio_end_pos(f)); last_folio_pos = folio_pos(f); if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { @@ -858,15 +853,15 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, } } - ret = bch2_folio_set(c, inode_inum(inode), folios.data, folios.nr); + ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr); if (ret) goto out; f_pos = pos; - f_offset = pos - folio_pos(darray_first(folios)); - darray_for_each(folios, fi) { - struct folio *f = *fi; - u64 f_len = min(end, folio_end_pos(f)) - f_pos; + f_offset = pos - folio_pos(darray_first(fs)); + darray_for_each(fs, fi) { + f = *fi; + f_len = min(end, folio_end_pos(f)) - f_pos; /* * XXX: per POSIX and fstests generic/275, on -ENOSPC we're @@ -878,11 +873,11 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, */ ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); if (unlikely(ret)) { - folios_trunc(&folios, fi); - if (!folios.nr) + folios_trunc(&fs, fi); + if (!fs.nr) goto out; - end = min(end, folio_end_pos(darray_last(folios))); + end = min(end, folio_end_pos(darray_last(fs))); break; } @@ -891,18 +886,17 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, } if (mapping_writably_mapped(mapping)) - darray_for_each(folios, fi) + darray_for_each(fs, fi) flush_dcache_folio(*fi); f_pos = pos; - f_offset = pos - folio_pos(darray_first(folios)); - darray_for_each(folios, fi) { - struct folio *f = *fi; - u64 f_len = min(end, folio_end_pos(f)) - f_pos; - unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); - + f_offset = pos - folio_pos(darray_first(fs)); + darray_for_each(fs, fi) { + f = *fi; + f_len = min(end, folio_end_pos(f)) - f_pos; + f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); if (!f_copied) { - folios_trunc(&folios, fi); + folios_trunc(&fs, fi); break; } @@ -911,7 +905,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, pos + copied + f_copied < inode->v.i_size) { iov_iter_revert(iter, f_copied); folio_zero_range(f, 0, folio_size(f)); - folios_trunc(&folios, fi); + folios_trunc(&fs, fi); break; } @@ -919,7 +913,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, copied += f_copied; if (f_copied != f_len) { - folios_trunc(&folios, fi + 1); + folios_trunc(&fs, fi + 1); break; } @@ -938,10 +932,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, spin_unlock(&inode->v.i_lock); f_pos = pos; - f_offset = pos - folio_pos(darray_first(folios)); - darray_for_each(folios, fi) { - struct folio *f = *fi; - u64 f_len = min(end, folio_end_pos(f)) - f_pos; + f_offset = pos - folio_pos(darray_first(fs)); + darray_for_each(fs, fi) { + f = *fi; + f_len = min(end, folio_end_pos(f)) - 
f_pos; if (!folio_test_uptodate(f)) folio_mark_uptodate(f); @@ -954,7 +948,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, inode->ei_last_dirtied = (unsigned long) current; out: - darray_for_each(folios, fi) { + darray_for_each(fs, fi) { folio_unlock(*fi); folio_put(*fi); } @@ -967,7 +961,7 @@ out: if (last_folio_pos >= inode->v.i_size) truncate_pagecache(&inode->v, inode->v.i_size); - darray_exit(&folios); + darray_exit(&fs); bch2_folio_reservation_put(c, inode, &res); return copied ?: ret; @@ -1055,8 +1049,6 @@ ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(&inode->v); inode_lock(&inode->v); ret = generic_write_checks(iocb, from); @@ -1076,7 +1068,6 @@ ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) iocb->ki_pos += ret; unlock: inode_unlock(&inode->v); - current->backing_dev_info = NULL; if (ret > 0) ret = generic_write_sync(iocb, ret); diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c index 2b29abd2..6a9557e7 100644 --- a/libbcachefs/fs-io-direct.c +++ b/libbcachefs/fs-io-direct.c @@ -7,10 +7,12 @@ #include "fs-io.h" #include "fs-io-direct.h" #include "fs-io-pagecache.h" -#include "io.h" +#include "io_read.h" +#include "io_write.h" #include <linux/kthread.h> #include <linux/pagemap.h> +#include <linux/prefetch.h> #include <linux/task_io_accounting_ops.h> /* O_DIRECT reads */ @@ -232,23 +234,21 @@ static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, u64 offset, u64 size, unsigned nr_replicas, bool compressed) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; u64 end = offset + size; u32 snapshot; bool ret = true; int err; - - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (err) goto err; - for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, SPOS(inum.inum, offset, snapshot), BTREE_ITER_SLOTS, k, err) { if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) @@ -263,11 +263,11 @@ retry: } offset = iter.pos.offset; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(err, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); return err ? 
false : ret; } diff --git a/libbcachefs/fs-io-pagecache.c b/libbcachefs/fs-io-pagecache.c index 1e60eead..8bd9bcdd 100644 --- a/libbcachefs/fs-io-pagecache.c +++ b/libbcachefs/fs-io-pagecache.c @@ -14,7 +14,7 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping, loff_t start, u64 end, int fgp_flags, gfp_t gfp, - folios *folios) + folios *fs) { struct folio *f; u64 pos = start; @@ -24,7 +24,7 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping, if ((u64) pos >= (u64) start + (1ULL << 20)) fgp_flags &= ~FGP_CREAT; - ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL); + ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL); if (ret) break; @@ -32,16 +32,16 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping, if (IS_ERR_OR_NULL(f)) break; - BUG_ON(folios->nr && folio_pos(f) != pos); + BUG_ON(fs->nr && folio_pos(f) != pos); pos = folio_end_pos(f); - darray_push(folios, f); + darray_push(fs, f); } - if (!folios->nr && !ret && (fgp_flags & FGP_CREAT)) + if (!fs->nr && !ret && (fgp_flags & FGP_CREAT)) ret = -ENOMEM; - return folios->nr ? 0 : ret; + return fs->nr ? 0 : ret; } /* pagecache_block must be held */ @@ -73,12 +73,15 @@ int bch2_write_invalidate_inode_pages_range(struct address_space *mapping, return ret; } +#if 0 +/* Useful for debug tracing: */ static const char * const bch2_folio_sector_states[] = { #define x(n) #n, BCH_FOLIO_SECTOR_STATE() #undef x NULL }; +#endif static inline enum bch_folio_sector_state folio_sector_dirty(enum bch_folio_sector_state state) @@ -177,20 +180,20 @@ static void __bch2_folio_set(struct folio *folio, * extents btree: */ int bch2_folio_set(struct bch_fs *c, subvol_inum inum, - struct folio **folios, unsigned nr_folios) + struct folio **fs, unsigned nr_folios) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; struct bch_folio *s; - u64 offset = folio_sector(folios[0]); + u64 offset = folio_sector(fs[0]); unsigned folio_idx; u32 snapshot; bool need_set = false; int ret; for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { - s = bch2_folio_create(folios[folio_idx], GFP_KERNEL); + s = bch2_folio_create(fs[folio_idx], GFP_KERNEL); if (!s) return -ENOMEM; @@ -201,22 +204,22 @@ int bch2_folio_set(struct bch_fs *c, subvol_inum inum, return 0; folio_idx = 0; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) goto err; - for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, SPOS(inum.inum, offset, snapshot), BTREE_ITER_SLOTS, k, ret) { unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); unsigned state = bkey_to_sector_state(k); while (folio_idx < nr_folios) { - struct folio *folio = folios[folio_idx]; + struct folio *folio = fs[folio_idx]; u64 folio_start = folio_sector(folio); u64 folio_end = folio_end_sector(folio); unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - @@ -240,11 +243,11 @@ retry: } offset = iter.pos.offset; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 4804e5a4..b0e8144e 100644 --- a/libbcachefs/fs-io.c +++ 
b/libbcachefs/fs-io.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" #include "clock.h" @@ -16,7 +17,7 @@ #include "fsck.h" #include "inode.h" #include "journal.h" -#include "io.h" +#include "io_misc.h" #include "keylist.h" #include "quota.h" #include "reflink.h" @@ -164,7 +165,6 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, #endif } - /* fsync: */ /* @@ -207,31 +207,29 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol, struct bpos start, struct bpos end) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; int ret = 0; - - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot); + ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot); if (ret) goto err; - for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, start, end, 0, k, ret) + for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret) if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) { ret = 1; break; } start = iter.pos; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -241,8 +239,8 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; struct bch_folio *s; - unsigned start_offset = start & (PAGE_SIZE - 1); - unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; + unsigned start_offset; + unsigned end_offset; unsigned i; struct folio *folio; s64 i_sectors_delta = 0; @@ -391,33 +389,12 @@ static int bch2_extend(struct mnt_idmap *idmap, return bch2_setattr_nonsize(idmap, inode, iattr); } -static int bch2_truncate_finish_fn(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; - return 0; -} - -static int bch2_truncate_start_fn(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, void *p) -{ - u64 *new_i_size = p; - - bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY; - bi->bi_size = *new_i_size; - return 0; -} - -int bch2_truncate(struct mnt_idmap *idmap, +int bchfs_truncate(struct mnt_idmap *idmap, struct bch_inode_info *inode, struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; struct bch_inode_unpacked inode_u; - u64 new_i_size = iattr->ia_size; s64 i_sectors_delta = 0; int ret = 0; @@ -466,6 +443,8 @@ int bch2_truncate(struct mnt_idmap *idmap, if (unlikely(ret < 0)) goto err; + truncate_setsize(&inode->v, iattr->ia_size); + /* * When extending, we're going to write the new i_size to disk * immediately so we need to flush anything above the current on disk @@ -487,32 +466,22 @@ int bch2_truncate(struct mnt_idmap *idmap, if (ret) goto err; - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode(c, inode, bch2_truncate_start_fn, - &new_i_size, 0); - mutex_unlock(&inode->ei_update_lock); - - if (unlikely(ret)) - goto err; - - truncate_setsize(&inode->v, iattr->ia_size); - - ret = bch2_fpunch(c, inode_inum(inode), - round_up(iattr->ia_size, block_bytes(c)) >> 9, - U64_MAX, &i_sectors_delta); 
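/*
 * Editor's note (not part of the patch): bchfs_truncate() now delegates the
 * on-disk work to bch2_truncate() (presumably provided by the new io_misc.h
 * included above), rather than driving bch2_fpunch() and the
 * BCH_INODE_I_SIZE_DIRTY inode updates from the VFS layer itself. Note the
 * error handling just below: truncate_setsize() has already shrunk the
 * pagecache by this point, so if the btree-side truncate fails, the VFS
 * view and the btree can disagree, which is why EI_INODE_ERROR is set
 * before bailing out.
 */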
+ ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta); bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); + if (unlikely(ret)) { + /* + * If we error here, VFS caches are now inconsistent with btree + */ + set_bit(EI_INODE_ERROR, &inode->ei_flags); + goto err; + } + bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && !bch2_journal_error(&c->journal), c, "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", inode->v.i_ino, (u64) inode->v.i_blocks, inode->ei_inode.bi_sectors); - if (unlikely(ret)) - goto err; - - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0); - mutex_unlock(&inode->ei_update_lock); ret = bch2_setattr_nonsize(idmap, inode, iattr); err: @@ -577,175 +546,33 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; - struct bkey_buf copy; - struct btree_trans trans; - struct btree_iter src, dst, del; - loff_t shift, new_size; - u64 src_start; + s64 i_sectors_delta = 0; int ret = 0; if ((offset | len) & (block_bytes(c) - 1)) return -EINVAL; if (insert) { - if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) - return -EFBIG; - if (offset >= inode->v.i_size) return -EINVAL; - - src_start = U64_MAX; - shift = len; } else { if (offset + len >= inode->v.i_size) return -EINVAL; - - src_start = offset + len; - shift = -len; } - new_size = inode->v.i_size + shift; - ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); if (ret) return ret; - if (insert) { - i_size_write(&inode->v, new_size); - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, new_size, - ATTR_MTIME|ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); - } else { - s64 i_sectors_delta = 0; + if (insert) + i_size_write(&inode->v, inode->v.i_size + len); - ret = bch2_fpunch(c, inode_inum(inode), - offset >> 9, (offset + len) >> 9, - &i_sectors_delta); - bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); + ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9, + insert, &i_sectors_delta); + if (!ret && !insert) + i_size_write(&inode->v, inode->v.i_size - len); + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); - if (ret) - return ret; - } - - bch2_bkey_buf_init(©); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - bch2_trans_iter_init(&trans, &src, BTREE_ID_extents, - POS(inode->v.i_ino, src_start >> 9), - BTREE_ITER_INTENT); - bch2_trans_copy_iter(&dst, &src); - bch2_trans_copy_iter(&del, &src); - - while (ret == 0 || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct bkey_i delete; - struct bkey_s_c k; - struct bpos next_pos; - struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); - struct bpos atomic_end; - unsigned trigger_flags = 0; - u32 snapshot; - - bch2_trans_begin(&trans); - - ret = bch2_subvolume_get_snapshot(&trans, - inode->ei_subvol, &snapshot); - if (ret) - continue; - - bch2_btree_iter_set_snapshot(&src, snapshot); - bch2_btree_iter_set_snapshot(&dst, snapshot); - bch2_btree_iter_set_snapshot(&del, snapshot); - - bch2_trans_begin(&trans); - - k = insert - ? 
bch2_btree_iter_peek_prev(&src) - : bch2_btree_iter_peek_upto(&src, POS(inode->v.i_ino, U64_MAX)); - if ((ret = bkey_err(k))) - continue; - - if (!k.k || k.k->p.inode != inode->v.i_ino) - break; - - if (insert && - bkey_le(k.k->p, POS(inode->v.i_ino, offset >> 9))) - break; -reassemble: - bch2_bkey_buf_reassemble(©, c, k); - - if (insert && - bkey_lt(bkey_start_pos(k.k), move_pos)) - bch2_cut_front(move_pos, copy.k); - - copy.k->k.p.offset += shift >> 9; - bch2_btree_iter_set_pos(&dst, bkey_start_pos(©.k->k)); - - ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end); - if (ret) - continue; - - if (!bkey_eq(atomic_end, copy.k->k.p)) { - if (insert) { - move_pos = atomic_end; - move_pos.offset -= shift >> 9; - goto reassemble; - } else { - bch2_cut_back(atomic_end, copy.k); - } - } - - bkey_init(&delete.k); - delete.k.p = copy.k->k.p; - delete.k.size = copy.k->k.size; - delete.k.p.offset -= shift >> 9; - bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k)); - - next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p; - - if (copy.k->k.size != k.k->size) { - /* We might end up splitting compressed extents: */ - unsigned nr_ptrs = - bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); - - ret = bch2_disk_reservation_get(c, &disk_res, - copy.k->k.size, nr_ptrs, - BCH_DISK_RESERVATION_NOFAIL); - BUG_ON(ret); - } - - ret = bch2_btree_iter_traverse(&del) ?: - bch2_trans_update(&trans, &del, &delete, trigger_flags) ?: - bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?: - bch2_trans_commit(&trans, &disk_res, NULL, - BTREE_INSERT_NOFAIL); - bch2_disk_reservation_put(c, &disk_res); - - if (!ret) - bch2_btree_iter_set_pos(&src, next_pos); - } - bch2_trans_iter_exit(&trans, &del); - bch2_trans_iter_exit(&trans, &dst); - bch2_trans_iter_exit(&trans, &src); - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(©, c); - - if (ret) - return ret; - - mutex_lock(&inode->ei_update_lock); - if (!insert) { - i_size_write(&inode->v, new_size); - ret = bch2_write_inode_size(c, inode, new_size, - ATTR_MTIME|ATTR_CTIME); - } else { - /* We need an inode update to update bi_journal_seq for fsync: */ - ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, - ATTR_MTIME|ATTR_CTIME); - } - mutex_unlock(&inode->ei_update_lock); return ret; } @@ -753,16 +580,15 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, u64 start_sector, u64 end_sector) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bpos end_pos = POS(inode->v.i_ino, end_sector); struct bch_io_opts opts; int ret = 0; bch2_inode_opts_get(&opts, c, &inode->ei_inode); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inode->v.i_ino, start_sector), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); @@ -775,9 +601,9 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, u64 hole_start, hole_end; u32 snapshot; - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, + ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); if (ret) goto bkey_err; @@ -814,7 +640,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, &hole_start, &hole_end, opts.data_replicas, true)) - ret = drop_locks_do(&trans, + ret = drop_locks_do(trans, (bch2_clamp_data_hole(&inode->v, &hole_start, &hole_end, @@ -837,7 +663,7 @@ static int 
__bchfs_fallocate(struct bch_inode_info *inode, int mode, goto bkey_err; } - ret = bch2_extent_fallocate(&trans, inode_inum(inode), &iter, + ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter, sectors, opts, &i_sectors_delta, writepoint_hashed((unsigned long) current)); if (ret) @@ -845,7 +671,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); - drop_locks_do(&trans, + drop_locks_do(trans, (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); bkey_err: bch2_quota_reservation_put(c, inode, "a_res); @@ -857,14 +683,14 @@ bkey_err: struct quota_res quota_res = { 0 }; s64 i_sectors_delta = 0; - bch2_fpunch_at(&trans, &iter, inode_inum(inode), + bch2_fpunch_at(trans, &iter, inode_inum(inode), end_sector, &i_sectors_delta); bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); bch2_quota_reservation_put(c, inode, "a_res); } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } @@ -970,26 +796,24 @@ static int quota_reserve_range(struct bch_inode_info *inode, u64 start, u64 end) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; u32 snapshot; u64 sectors = end - start; u64 pos = start; int ret; - - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); if (ret) goto err; - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inode->v.i_ino, pos, snapshot), 0); - while (!(ret = btree_trans_too_many_iters(&trans)) && + while (!(ret = btree_trans_too_many_iters(trans)) && (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k && !(ret = bkey_err(k))) { if (bkey_extent_is_allocation(k.k)) { @@ -1001,17 +825,14 @@ retry: bch2_btree_iter_advance(&iter); } pos = iter.pos.offset; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); - if (ret) - return ret; - - return bch2_quota_reservation_add(c, inode, res, sectors, true); + return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true); } loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, @@ -1104,7 +925,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; subvol_inum inum = inode_inum(inode); @@ -1116,15 +937,15 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) if (offset >= isize) return -ENXIO; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) goto err; - for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, + for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, SPOS(inode->v.i_ino, offset >> 9, snapshot), POS(inode->v.i_ino, U64_MAX), 0, k, ret) { 
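Every conversion in this file follows the same mechanical pattern, and the rest of the patch repeats it: bch2_trans_get()/bch2_trans_put() replace the on-stack bch2_trans_init()/bch2_trans_exit() pair, while the bch2_trans_begin() retry loop is untouched. A minimal self-contained sketch of that shape, using only helpers visible in this patch - example_count_extents() and its nr parameter are hypothetical:

static int example_count_extents(struct bch_fs *c, subvol_inum inum, u64 *nr)
{
	struct btree_trans *trans = bch2_trans_get(c);	/* was: bch2_trans_init(&trans, c, 0, 0) */
	struct btree_iter iter;
	struct bkey_s_c k;
	u32 snapshot;
	int ret = 0;
retry:
	bch2_trans_begin(trans);
	*nr = 0;	/* recount from scratch if the transaction restarts */

	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
				     SPOS(inum.inum, 0, snapshot), 0, k, ret)
		if (bkey_extent_is_data(k.k))
			(*nr)++;
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_trans_put(trans);	/* was: bch2_trans_exit(&trans) */
	return ret;
}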
@@ -1134,12 +955,12 @@ retry: } else if (k.k->p.offset >> 9 > isize) break; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) return ret; @@ -1157,7 +978,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; subvol_inum inum = inode_inum(inode); @@ -1169,15 +990,15 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) if (offset >= isize) return -ENXIO; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) goto err; - for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, SPOS(inode->v.i_ino, offset >> 9, snapshot), BTREE_ITER_SLOTS, k, ret) { if (k.k->p.inode != inode->v.i_ino) { @@ -1195,12 +1016,12 @@ retry: offset = max(offset, bkey_start_offset(k.k) << 9); } } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) return ret; diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h index bb5b709f..ca70346e 100644 --- a/libbcachefs/fs-io.h +++ b/libbcachefs/fs-io.h @@ -6,7 +6,7 @@ #include "buckets.h" #include "fs.h" -#include "io_types.h" +#include "io_write_types.h" #include "quota.h" #include <linux/uio.h> @@ -165,7 +165,7 @@ int __must_check bch2_write_inode_size(struct bch_fs *, int bch2_fsync(struct file *, loff_t, loff_t, int); -int bch2_truncate(struct mnt_idmap *, +int bchfs_truncate(struct mnt_idmap *, struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 141bcced..0679b2f7 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -122,7 +122,10 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; - return copy_to_user(arg, &fa, sizeof(fa)); + if (copy_to_user(arg, &fa, sizeof(fa))) + return -EFAULT; + + return 0; } static int fssetxattr_inode_update_fn(struct btree_trans *trans, diff --git a/libbcachefs/fs-ioctl.h b/libbcachefs/fs-ioctl.h index f201980e..54a9c21a 100644 --- a/libbcachefs/fs-ioctl.h +++ b/libbcachefs/fs-ioctl.h @@ -5,7 +5,7 @@ /* Inode flags: */ /* bcachefs inode flags -> vfs inode flags: */ -static const unsigned bch_flags_to_vfs[] = { +static const __maybe_unused unsigned bch_flags_to_vfs[] = { [__BCH_INODE_SYNC] = S_SYNC, [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, [__BCH_INODE_APPEND] = S_APPEND, @@ -13,7 +13,7 @@ static const unsigned bch_flags_to_vfs[] = { }; /* bcachefs inode flags -> FS_IOC_GETFLAGS: */ -static const unsigned bch_flags_to_uflags[] = { +static const __maybe_unused unsigned bch_flags_to_uflags[] = { [__BCH_INODE_SYNC] = FS_SYNC_FL, [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, [__BCH_INODE_APPEND] = FS_APPEND_FL, @@ -22,7 +22,7 @@ static const unsigned bch_flags_to_uflags[] = { }; /* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ -static const unsigned 
bch_flags_to_xflags[] = { +static const __maybe_unused unsigned bch_flags_to_xflags[] = { [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 80dcda43..1354af2c 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -19,7 +19,7 @@ #include "fs-io-pagecache.h" #include "fsck.h" #include "inode.h" -#include "io.h" +#include "io_read.h" #include "journal.h" #include "keylist.h" #include "quota.h" @@ -82,29 +82,27 @@ int __must_check bch2_write_inode(struct bch_fs *c, inode_set_fn set, void *p, unsigned fields) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bch_inode_unpacked inode_u; int ret; - - bch2_trans_init(&trans, c, 0, 512); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode), + ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_INTENT) ?: - (set ? set(&trans, inode, &inode_u, p) : 0) ?: - bch2_inode_write(&trans, &iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); + (set ? set(trans, inode, &inode_u, p) : 0) ?: + bch2_inode_write(trans, &iter, &inode_u) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); /* * the btree node lock protects inode->ei_inode, not ei_update_lock; * this is important for inode updates via bchfs_write_index_update */ if (!ret) - bch2_inode_update_after_write(&trans, inode, &inode_u, fields); + bch2_inode_update_after_write(trans, inode, &inode_u, fields); - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; @@ -114,7 +112,7 @@ retry: inode_inum(inode).subvol, inode_inum(inode).inum); - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret < 0 ? ret : 0; } @@ -182,7 +180,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) { struct bch_inode_unpacked inode_u; struct bch_inode_info *inode; - struct btree_trans trans; + struct btree_trans *trans; struct bch_subvolume subvol; int ret; @@ -196,14 +194,14 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) if (!(inode->v.i_state & I_NEW)) return &inode->v; - bch2_trans_init(&trans, c, 8, 0); - ret = lockrestart_do(&trans, - bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?: - bch2_inode_find_by_inum_trans(&trans, inum, &inode_u)); + trans = bch2_trans_get(c); + ret = lockrestart_do(trans, + bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); if (!ret) - bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); - bch2_trans_exit(&trans); + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); + bch2_trans_put(trans); if (ret) { iget_failed(&inode->v); @@ -226,7 +224,7 @@ __bch2_create(struct mnt_idmap *idmap, unsigned flags) { struct bch_fs *c = dir->v.i_sb->s_fs_info; - struct btree_trans trans; + struct btree_trans *trans; struct bch_inode_unpacked dir_u; struct bch_inode_info *inode, *old; struct bch_inode_unpacked inode_u; @@ -256,13 +254,11 @@ __bch2_create(struct mnt_idmap *idmap, if (!(flags & BCH_CREATE_TMPFILE)) mutex_lock(&dir->ei_update_lock); - bch2_trans_init(&trans, c, 8, - 2048 + (!(flags & BCH_CREATE_TMPFILE) - ? 
dentry->d_name.len : 0)); + trans = bch2_trans_get(c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_create_trans(&trans, + ret = bch2_create_trans(trans, inode_inum(dir), &dir_u, &inode_u, !(flags & BCH_CREATE_TMPFILE) ? &dentry->d_name : NULL, @@ -278,9 +274,9 @@ retry: inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; inum.inum = inode_u.bi_inum; - ret = bch2_subvolume_get(&trans, inum.subvol, true, + ret = bch2_subvolume_get(trans, inum.subvol, true, BTREE_ITER_WITH_UPDATES, &subvol) ?: - bch2_trans_commit(&trans, NULL, &journal_seq, 0); + bch2_trans_commit(trans, NULL, &journal_seq, 0); if (unlikely(ret)) { bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, KEY_TYPE_QUOTA_WARN); @@ -291,13 +287,13 @@ err_before_quota: } if (!(flags & BCH_CREATE_TMPFILE)) { - bch2_inode_update_after_write(&trans, dir, &dir_u, + bch2_inode_update_after_write(trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); mutex_unlock(&dir->ei_update_lock); } bch2_iget5_set(&inode->v, &inum); - bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); @@ -337,7 +333,7 @@ err_before_quota: unlock_new_inode(&inode->v); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); err: posix_acl_release(default_acl); posix_acl_release(acl); @@ -346,7 +342,7 @@ err_trans: if (!(flags & BCH_CREATE_TMPFILE)) mutex_unlock(&dir->ei_update_lock); - bch2_trans_exit(&trans); + bch2_trans_put(trans); make_bad_inode(&inode->v); iput(&inode->v); inode = ERR_PTR(ret); @@ -401,26 +397,25 @@ static int __bch2_link(struct bch_fs *c, struct bch_inode_info *dir, struct dentry *dentry) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct bch_inode_unpacked dir_u, inode_u; int ret; mutex_lock(&inode->ei_update_lock); - bch2_trans_init(&trans, c, 4, 1024); - ret = commit_do(&trans, NULL, NULL, 0, - bch2_link_trans(&trans, + ret = commit_do(trans, NULL, NULL, 0, + bch2_link_trans(trans, inode_inum(dir), &dir_u, inode_inum(inode), &inode_u, &dentry->d_name)); if (likely(!ret)) { - bch2_inode_update_after_write(&trans, dir, &dir_u, + bch2_inode_update_after_write(trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); + bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); mutex_unlock(&inode->ei_update_lock); return ret; } @@ -451,24 +446,23 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_inode_unpacked dir_u, inode_u; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); int ret; bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); - bch2_trans_init(&trans, c, 4, 1024); - ret = commit_do(&trans, NULL, NULL, + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_unlink_trans(&trans, + bch2_unlink_trans(trans, inode_inum(dir), &dir_u, &inode_u, &dentry->d_name, deleting_snapshot)); if (unlikely(ret)) goto err; - bch2_inode_update_after_write(&trans, dir, &dir_u, + bch2_inode_update_after_write(trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(&trans, inode, &inode_u, + bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_MTIME); if (inode_u.bi_subvol) { @@ -479,8 +473,8 @@ int __bch2_unlink(struct inode *vdir, struct 
dentry *dentry, set_nlink(&inode->v, 0); } err: - bch2_trans_exit(&trans); bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); + bch2_trans_put(trans); return ret; } @@ -543,7 +537,7 @@ static int bch2_rename2(struct mnt_idmap *idmap, struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); struct bch_inode_unpacked dst_dir_u, src_dir_u; struct bch_inode_unpacked src_inode_u, dst_inode_u; - struct btree_trans trans; + struct btree_trans *trans; enum bch_rename_mode mode = flags & RENAME_EXCHANGE ? BCH_RENAME_EXCHANGE : dst_dentry->d_inode @@ -560,7 +554,7 @@ static int bch2_rename2(struct mnt_idmap *idmap, return ret; } - bch2_trans_init(&trans, c, 8, 2048); + trans = bch2_trans_get(c); bch2_lock_inodes(INODE_UPDATE_LOCK, src_dir, @@ -587,8 +581,8 @@ static int bch2_rename2(struct mnt_idmap *idmap, goto err; } - ret = commit_do(&trans, NULL, NULL, 0, - bch2_rename_trans(&trans, + ret = commit_do(trans, NULL, NULL, 0, + bch2_rename_trans(trans, inode_inum(src_dir), &src_dir_u, inode_inum(dst_dir), &dst_dir_u, &src_inode_u, @@ -603,21 +597,21 @@ static int bch2_rename2(struct mnt_idmap *idmap, BUG_ON(dst_inode && dst_inode->v.i_ino != dst_inode_u.bi_inum); - bch2_inode_update_after_write(&trans, src_dir, &src_dir_u, + bch2_inode_update_after_write(trans, src_dir, &src_dir_u, ATTR_MTIME|ATTR_CTIME); if (src_dir != dst_dir) - bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u, + bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(&trans, src_inode, &src_inode_u, + bch2_inode_update_after_write(trans, src_inode, &src_inode_u, ATTR_CTIME); if (dst_inode) - bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u, + bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u, ATTR_CTIME); err: - bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_fs_quota_transfer(c, src_inode, bch_qid(&src_inode->ei_inode), @@ -680,7 +674,7 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_qid qid; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct posix_acl *acl = NULL; @@ -701,13 +695,13 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap, if (ret) goto err; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); kfree(acl); acl = NULL; - ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), + ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), BTREE_ITER_INTENT); if (ret) goto btree_err; @@ -715,29 +709,29 @@ retry: bch2_setattr_copy(idmap, inode, &inode_u, attr); if (attr->ia_valid & ATTR_MODE) { - ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u, + ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u, inode_u.bi_mode, &acl); if (ret) goto btree_err; } - ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, NULL, + ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); btree_err: - bch2_trans_iter_exit(&trans, &inode_iter); + bch2_trans_iter_exit(trans, &inode_iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (unlikely(ret)) goto err_trans; - bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid); + bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid); if (acl) set_cached_acl(&inode->v, 
ACL_TYPE_ACCESS, acl); err_trans: - bch2_trans_exit(&trans); + bch2_trans_put(trans); err: mutex_unlock(&inode->ei_update_lock); @@ -798,7 +792,7 @@ static int bch2_setattr(struct mnt_idmap *idmap, return ret; return iattr->ia_valid & ATTR_SIZE - ? bch2_truncate(idmap, inode, iattr) + ? bchfs_truncate(idmap, inode, iattr) : bch2_setattr_nonsize(idmap, inode, iattr); } @@ -879,7 +873,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, { struct bch_fs *c = vinode->i_sb->s_fs_info; struct bch_inode_info *ei = to_bch_ei(vinode); - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; struct bkey_buf cur, prev; @@ -900,18 +894,18 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_bkey_buf_init(&cur); bch2_bkey_buf_init(&prev); - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot); if (ret) goto err; - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(ei->v.i_ino, start, snapshot), 0); - while (!(ret = btree_trans_too_many_iters(&trans)) && + while (!(ret = btree_trans_too_many_iters(trans)) && (k = bch2_btree_iter_peek_upto(&iter, end)).k && !(ret = bkey_err(k))) { enum btree_id data_btree = BTREE_ID_extents; @@ -928,7 +922,7 @@ retry: bch2_bkey_buf_reassemble(&cur, c, k); - ret = bch2_read_indirect_extent(&trans, &data_btree, + ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, &cur); if (ret) break; @@ -947,7 +941,7 @@ retry: cur.k->k.p.offset += cur.k->k.size; if (have_extent) { - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), 0); if (ret) @@ -961,18 +955,18 @@ retry: POS(iter.pos.inode, iter.pos.offset + sectors)); } start = iter.pos.offset; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (!ret && have_extent) { - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), FIEMAP_EXTENT_LAST); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_bkey_buf_exit(&cur, c); bch2_bkey_buf_exit(&prev, c); return ret < 0 ? 
ret : 0; @@ -1230,7 +1224,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child struct bch_inode_info *inode = to_bch_ei(child->d_inode); struct bch_inode_info *dir = to_bch_ei(parent->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter1; struct btree_iter iter2; struct bkey_s_c k; @@ -1245,23 +1239,23 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child if (!S_ISDIR(dir->v.i_mode)) return -EINVAL; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); - bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents, + bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents, POS(dir->ei_inode.bi_inum, 0), 0); - bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents, + bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents, POS(dir->ei_inode.bi_inum, 0), 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot); if (ret) goto err; bch2_btree_iter_set_snapshot(&iter1, snapshot); bch2_btree_iter_set_snapshot(&iter2, snapshot); - ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u); + ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u); if (ret) goto err; @@ -1279,7 +1273,7 @@ retry: } d = bkey_s_c_to_dirent(k); - ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); + ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); if (ret > 0) ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; if (ret) @@ -1301,7 +1295,7 @@ retry: continue; d = bkey_s_c_to_dirent(k); - ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); + ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); if (ret < 0) break; if (ret) @@ -1325,9 +1319,9 @@ err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_iter_exit(&trans, &iter1); - bch2_trans_iter_exit(&trans, &iter2); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter1); + bch2_trans_iter_exit(trans, &iter2); + bch2_trans_put(trans); return ret; } @@ -1661,7 +1655,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) up_write(&c->state_lock); } - if (opts.errors >= 0) + if (opt_defined(opts, errors)) c->opts.errors = opts.errors; err: return bch2_err_class(ret); @@ -1722,6 +1716,35 @@ static void bch2_put_super(struct super_block *sb) __bch2_fs_stop(c); } +/* + * bcachefs doesn't currently integrate intwrite freeze protection but the + * internal write references serve the same purpose. Therefore reuse the + * read-only transition code to perform the quiesce. The caveat is that we don't + * currently have the ability to block tasks that want a write reference while + * the superblock is frozen. This is fine for now, but we should either add + * blocking support or find a way to integrate sb_start_intwrite() and friends. 
+ */ +static int bch2_freeze(struct super_block *sb) +{ + struct bch_fs *c = sb->s_fs_info; + + down_write(&c->state_lock); + bch2_fs_read_only(c); + up_write(&c->state_lock); + return 0; +} + +static int bch2_unfreeze(struct super_block *sb) +{ + struct bch_fs *c = sb->s_fs_info; + int ret; + + down_write(&c->state_lock); + ret = bch2_fs_read_write(c); + up_write(&c->state_lock); + return ret; +} + static const struct super_operations bch_super_operations = { .alloc_inode = bch2_alloc_inode, .destroy_inode = bch2_destroy_inode, @@ -1733,10 +1756,8 @@ static const struct super_operations bch_super_operations = { .show_options = bch2_show_options, .remount_fs = bch2_remount, .put_super = bch2_put_super, -#if 0 .freeze_fs = bch2_freeze, .unfreeze_fs = bch2_unfreeze, -#endif }; static int bch2_set_super(struct super_block *s, void *data) @@ -1890,7 +1911,7 @@ got_sb: vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); ret = PTR_ERR_OR_ZERO(vinode); if (ret) { - bch_err(c, "error mounting: error getting root inode: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "mounting: error getting root inode"); goto err_put_super; } diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index 10e11119..5edf1d4b 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -197,7 +197,7 @@ int bch2_vfs_init(void); #else -#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) do {} while (0) +#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); }) static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) {} diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 57b3dfab..206302b0 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -80,7 +80,7 @@ static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, if (!ret) *subvol = le32_to_cpu(s.subvol); else if (bch2_err_matches(ret, ENOENT)) - bch_err(trans->c, "snapshot %u not fonud", snapshot); + bch_err(trans->c, "snapshot %u not found", snapshot); return ret; } @@ -127,8 +127,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, ret = bch2_inode_unpack(k, inode); err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(trans->c, "error fetching inode %llu: %s", - inode_nr, bch2_err_str(ret)); + bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -154,8 +153,7 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, *snapshot = iter.pos.snapshot; err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(trans->c, "error fetching inode %llu:%u: %s", - inode_nr, *snapshot, bch2_err_str(ret)); + bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -206,17 +204,16 @@ static int __write_inode(struct btree_trans *trans, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); } -static int write_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) +static int fsck_write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) { int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, __write_inode(trans, inode, snapshot)); if (ret) - bch_err(trans->c, "error in fsck: error updating inode: %s", - bch2_err_str(ret)); + bch_err_fn(trans->c, ret); return ret; } @@ -278,13 +275,13 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol, } if (ret && !bch2_err_matches(ret, 
BCH_ERR_transaction_restart)) - bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); if (ret) return ret; if (d_type != DT_DIR) { bch_err(c, "error looking up lost+found: not a directory"); - return ret; + return -BCH_ERR_ENOENT_not_directory; } /* @@ -301,7 +298,7 @@ create_lostfound: 0, 0, S_IFDIR|0700, 0, NULL, NULL, (subvol_inum) { }, 0); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "error creating lost+found: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "creating lost+found"); return ret; } @@ -365,8 +362,7 @@ static int reattach_inode(struct btree_trans *trans, BTREE_INSERT_NOFAIL, __reattach_inode(trans, inode, inode_snapshot)); if (ret) { - bch_err(trans->c, "error reattaching inode %llu: %s", - inode->bi_inum, bch2_err_str(ret)); + bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum); return ret; } @@ -475,7 +471,12 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor, * and @ancestor hasn't been overwritten in @seen * - * That is, returns whether key in @ancestor snapshot is visible in @id snapshot + * @c: filesystem handle + * @seen: list of snapshot ids already seen at current position + * @id: descendent snapshot id + * @ancestor: ancestor snapshot id + * + * Returns: whether key in @ancestor snapshot is visible in @id snapshot */ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, u32 id, u32 ancestor) @@ -520,14 +521,16 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see * snapshot id @dst, test whether there is some snapshot in which @dst is * visible. * - * This assumes we're visiting @src keys in natural key order. + * @c: filesystem handle + * @s: list of snapshot IDs already seen at @src + * @src: snapshot ID of src key + * @dst: snapshot ID of dst key + * Returns: true if there is some snapshot in which @dst is visible * - * @s - list of snapshot IDs already seen at @src - * @src - snapshot ID of src key - * @dst - snapshot ID of dst key + * Assumes we're visiting @src keys in natural key order */ -static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, - u32 src, u32 dst) +static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s, + u32 src, u32 dst) { return dst <= src ? 
key_visible_in_snapshot(c, s, dst, src) @@ -618,10 +621,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, w->first_this_inode = true; - if (trans_was_restarted(trans, restart_count)) - return -BCH_ERR_transaction_restart_nested; - - return 0; + return trans_was_restarted(trans, restart_count); } static struct inode_walker_entry * @@ -822,7 +822,7 @@ bad_hash: bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "hash_redo_key err %s", bch2_err_str(ret)); + bch_err_fn(c, ret); if (ret) return ret; ret = -BCH_ERR_transaction_restart_nested; @@ -886,7 +886,8 @@ static int check_inode(struct btree_trans *trans, ret = __write_inode(trans, &u, iter->pos.snapshot); if (ret) { - bch_err_msg(c, ret, "in fsck: error updating inode"); + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err_msg(c, ret, "in fsck updating inode"); return ret; } @@ -904,8 +905,7 @@ static int check_inode(struct btree_trans *trans, ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "error in fsck: error while deleting inode: %s", - bch2_err_str(ret)); + bch_err_msg(c, ret, "in fsck deleting inode"); return ret; } @@ -928,8 +928,7 @@ static int check_inode(struct btree_trans *trans, POS(u.bi_inum, U64_MAX), 0, NULL); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "error in fsck: error truncating inode: %s", - bch2_err_str(ret)); + bch_err_msg(c, ret, "in fsck truncating inode"); if (ret) return ret; @@ -954,8 +953,7 @@ static int check_inode(struct btree_trans *trans, sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); if (sectors < 0) { - bch_err(c, "error in fsck: error recounting inode sectors: %s", - bch2_err_str(sectors)); + bch_err_msg(c, sectors, "fsck recounting inode sectors"); return sectors; } @@ -974,13 +972,13 @@ static int check_inode(struct btree_trans *trans, if (do_update) { ret = __write_inode(trans, &u, iter->pos.snapshot); if (ret) { - bch_err_msg(c, ret, "in fsck: error updating inode"); + bch_err_msg(c, ret, "in fsck updating inode"); return ret; } } err: fsck_err: - if (ret) + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err_fn(c, ret); return ret; } @@ -989,7 +987,7 @@ noinline_for_stack int bch2_check_inodes(struct bch_fs *c) { bool full = c->opts.fsck; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bch_inode_unpacked prev = { 0 }; struct snapshots_seen s; @@ -997,16 +995,15 @@ int bch2_check_inodes(struct bch_fs *c) int ret; snapshots_seen_init(&s); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_inode(&trans, &iter, k, &prev, &s, full)); + check_inode(trans, &iter, k, &prev, &s, full)); - bch2_trans_exit(&trans); snapshots_seen_exit(&s); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); return ret; @@ -1081,7 +1078,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) w->last_pos.inode, i->snapshot, i->inode.bi_sectors, i->count)) { i->inode.bi_sectors = i->count; - ret = write_inode(trans, &i->inode, i->snapshot); + ret = 
fsck_write_inode(trans, &i->inode, i->snapshot); if (ret) break; } @@ -1089,9 +1086,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) fsck_err: if (ret) bch_err_fn(c, ret); - if (!ret && trans_was_restarted(trans, restart_count)) - ret = -BCH_ERR_transaction_restart_nested; - return ret; + return ret ?: trans_was_restarted(trans, restart_count); } struct extent_end { @@ -1441,7 +1436,7 @@ int bch2_check_extents(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); struct snapshots_seen s; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct extent_ends extent_ends; @@ -1450,23 +1445,22 @@ int bch2_check_extents(struct bch_fs *c) snapshots_seen_init(&s); extent_ends_init(&extent_ends); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_extents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, &res, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ bch2_disk_reservation_put(c, &res); - check_extent(&trans, &iter, k, &w, &s, &extent_ends); + check_extent(trans, &iter, k, &w, &s, &extent_ends); })) ?: - check_i_sectors(&trans, &w); + check_i_sectors(trans, &w); bch2_disk_reservation_put(c, &res); extent_ends_exit(&extent_ends); inode_walker_exit(&w); - bch2_trans_exit(&trans); snapshots_seen_exit(&s); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); @@ -1501,7 +1495,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) "directory %llu:%u with wrong i_nlink: got %u, should be %llu", w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { i->inode.bi_nlink = i->count; - ret = write_inode(trans, &i->inode, i->snapshot); + ret = fsck_write_inode(trans, &i->inode, i->snapshot); if (ret) break; } @@ -1509,9 +1503,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) fsck_err: if (ret) bch_err_fn(c, ret); - if (!ret && trans_was_restarted(trans, restart_count)) - ret = -BCH_ERR_transaction_restart_nested; - return ret; + return ret ?: trans_was_restarted(trans, restart_count); } static int check_dirent_target(struct btree_trans *trans, @@ -1809,23 +1801,22 @@ int bch2_check_dirents(struct bch_fs *c) struct inode_walker target = inode_walker_init(); struct snapshots_seen s; struct bch_hash_info hash_info; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; int ret = 0; snapshots_seen_init(&s); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_dirents, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_dirent(&trans, &iter, k, &hash_info, &dir, &target, &s)); + check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)); - bch2_trans_exit(&trans); + bch2_trans_put(trans); snapshots_seen_exit(&s); inode_walker_exit(&dir); inode_walker_exit(&target); @@ -1879,23 +1870,18 @@ int bch2_check_xattrs(struct bch_fs *c) { struct inode_walker inode = inode_walker_init(); struct bch_hash_info hash_info; - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; int ret = 0; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + 
ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_xattr(&trans, &iter, k, &hash_info, &inode)); - - bch2_trans_exit(&trans); - + check_xattr(trans, &iter, k, &hash_info, &inode))); if (ret) bch_err_fn(c, ret); return ret; @@ -1927,10 +1913,10 @@ static int check_root_trans(struct btree_trans *trans) ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - __bch2_btree_insert(trans, BTREE_ID_subvolumes, + bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0)); if (ret) { - bch_err(c, "error writing root subvol: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "writing root subvol"); goto err; } @@ -1949,7 +1935,7 @@ static int check_root_trans(struct btree_trans *trans) ret = __write_inode(trans, &root_inode, snapshot); if (ret) - bch_err(c, "error writing root inode: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "writing root inode"); } err: fsck_err: @@ -1964,7 +1950,7 @@ int bch2_check_root(struct bch_fs *c) ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - check_root_trans(&trans)); + check_root_trans(trans)); if (ret) bch_err_fn(c, ret); @@ -2116,16 +2102,14 @@ fsck_err: */ int bch2_check_directory_structure(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bch_inode_unpacked u; pathbuf path = { 0, }; int ret; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_INTENT| BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ret) { @@ -2142,12 +2126,12 @@ int bch2_check_directory_structure(struct bch_fs *c) if (u.bi_flags & BCH_INODE_UNLINKED) continue; - ret = check_path(&trans, &path, &u, iter.pos.snapshot); + ret = check_path(trans, &path, &u, iter.pos.snapshot); if (ret) break; } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); darray_exit(&path); if (ret) @@ -2155,8 +2139,6 @@ int bch2_check_directory_structure(struct bch_fs *c) return ret; } -/* check_nlink pass: */ - struct nlink_table { size_t nr; size_t size; @@ -2238,15 +2220,13 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, struct nlink_table *t, u64 start, u64 *end) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bch_inode_unpacked u; int ret = 0; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_inodes, + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, start), BTREE_ITER_INTENT| BTREE_ITER_PREFETCH| @@ -2275,8 +2255,8 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, } } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); if (ret) bch_err(c, "error in fsck: btree error %i while walking inodes", ret); @@ -2288,7 +2268,7 @@ noinline_for_stack static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, u64 range_start, u64 range_end) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct snapshots_seen s; struct btree_iter iter; struct bkey_s_c k; @@ -2297,9 +2277,7 @@ static int check_nlinks_walk_dirents(struct 
bch_fs *c, struct nlink_table *links snapshots_seen_init(&s); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, + for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, BTREE_ITER_INTENT| BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ret) { @@ -2319,12 +2297,12 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links break; } } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (ret) bch_err(c, "error in fsck: btree error %i while walking dirents", ret); - bch2_trans_exit(&trans); + bch2_trans_put(trans); snapshots_seen_exit(&s); return ret; } @@ -2375,22 +2353,17 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, struct nlink_table *links, u64 range_start, u64 range_end) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; size_t idx = 0; int ret = 0; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, - POS(0, range_start), - BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_nlinks_update_inode(&trans, &iter, k, links, &idx, range_end)); - - bch2_trans_exit(&trans); - + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, + POS(0, range_start), + BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); if (ret < 0) { bch_err(c, "error in fsck: btree error %i while walking inodes", ret); return ret; @@ -2472,13 +2445,12 @@ int bch2_fix_reflink_p(struct bch_fs *c) return 0; ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, + for_each_btree_key_commit(trans, iter, BTREE_ID_extents, POS_MIN, BTREE_ITER_INTENT|BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - fix_reflink_p_key(&trans, &iter, k))); - + fix_reflink_p_key(trans, &iter, k))); if (ret) bch_err_fn(c, ret); return ret; diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 8114b6e4..8bfd99cb 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -120,8 +120,7 @@ static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed, if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { struct bch_inode_unpacked unpacked; - int ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), - &unpacked); + ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked); BUG_ON(ret); BUG_ON(unpacked.bi_inum != inode->bi_inum); BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); @@ -318,7 +317,7 @@ int bch2_inode_unpack(struct bkey_s_c k, return bch2_inode_unpack_slowpath(k, unpacked); } -int bch2_inode_peek(struct btree_trans *trans, +static int bch2_inode_peek_nowarn(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, subvol_inum inum, unsigned flags) @@ -349,7 +348,17 @@ int bch2_inode_peek(struct btree_trans *trans, return 0; err: bch2_trans_iter_exit(trans, iter); - if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; +} + +int bch2_inode_peek(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, + subvol_inum inum, unsigned flags) +{ + int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags); + + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err_msg(trans->c, ret, "looking up 
inum %u:%llu:", inum.subvol, inum.inum); return ret; } @@ -817,7 +826,7 @@ err: int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bkey_i_inode_generation delete; struct bch_inode_unpacked inode_u; @@ -825,8 +834,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) u32 snapshot; int ret; - bch2_trans_init(&trans, c, 0, 1024); - /* * If this was a directory, there shouldn't be any real dirents left - * but there could be whiteouts (from hash collisions) that we should @@ -835,19 +842,19 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) * XXX: the dirent could ideally would delete whiteouts when they're no * longer needed */ - ret = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?: - bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?: - bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents); + ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?: + bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?: + bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents); if (ret) goto err; retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) goto err; - k = bch2_bkey_get_iter(&trans, &iter, BTREE_ID_inodes, + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, SPOS(0, inum.inum, snapshot), BTREE_ITER_INTENT|BTREE_ITER_CACHED); ret = bkey_err(k); @@ -855,7 +862,7 @@ retry: goto err; if (!bkey_is_inode(k.k)) { - bch2_fs_inconsistent(trans.c, + bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum.inum, snapshot); ret = -EIO; @@ -868,15 +875,28 @@ retry: delete.k.p = iter.pos; delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); - ret = bch2_trans_update(&trans, &iter, &delete.k_i, 0) ?: - bch2_trans_commit(&trans, NULL, NULL, + ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); err: - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); + return ret; +} + +int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + int ret; + + ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0); + if (!ret) + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -897,7 +917,7 @@ int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, struct bch_inode_unpacked *inode) { return bch2_trans_do(c, NULL, NULL, 0, - bch2_inode_find_by_inum_trans(&trans, inum, inode)); + bch2_inode_find_by_inum_trans(trans, inum, inode)); } int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) @@ -1069,14 +1089,12 @@ delete: int bch2_delete_dead_inodes(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c, 0, 0); - - ret = bch2_btree_write_buffer_flush_sync(&trans); + ret = bch2_btree_write_buffer_flush_sync(trans); if (ret) goto err; @@ -1086,26 +1104,26 @@ int bch2_delete_dead_inodes(struct bch_fs *c) * but we can't retry because the btree write buffer won't have been * flushed and we'd spin: */ - for_each_btree_key(&trans, iter, BTREE_ID_deleted_inodes, POS_MIN, + 
for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - ret = lockrestart_do(&trans, may_delete_deleted_inode(&trans, k.k->p)); + ret = lockrestart_do(trans, may_delete_deleted_inode(trans, k.k->p)); if (ret < 0) break; if (ret) { if (!test_bit(BCH_FS_RW, &c->flags)) { - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); bch2_fs_lazy_rw(c); } - ret = bch2_inode_rm_snapshot(&trans, k.k->p.offset, k.k->p.snapshot); + ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; } } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index 22b24405..a7464e1b 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -118,6 +118,9 @@ int bch2_inode_create(struct btree_trans *, struct btree_iter *, int bch2_inode_rm(struct bch_fs *, subvol_inum); +int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *, + subvol_inum, + struct bch_inode_unpacked *); int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *); int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, diff --git a/libbcachefs/io.h b/libbcachefs/io.h deleted file mode 100644 index 831e3f1b..00000000 --- a/libbcachefs/io.h +++ /dev/null @@ -1,202 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IO_H -#define _BCACHEFS_IO_H - -#include "checksum.h" -#include "bkey_buf.h" -#include "io_types.h" - -#define to_wbio(_bio) \ - container_of((_bio), struct bch_write_bio, bio) - -#define to_rbio(_bio) \ - container_of((_bio), struct bch_read_bio, bio) - -void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); -void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -void bch2_latency_acct(struct bch_dev *, u64, int); -#else -static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} -#endif - -void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, - enum bch_data_type, const struct bkey_i *, bool); - -#define BLK_STS_REMOVED ((__force blk_status_t)128) - -const char *bch2_blk_status_to_str(blk_status_t); - -#define BCH_WRITE_FLAGS() \ - x(ALLOC_NOWAIT) \ - x(CACHED) \ - x(DATA_ENCODED) \ - x(PAGES_STABLE) \ - x(PAGES_OWNED) \ - x(ONLY_SPECIFIED_DEVS) \ - x(WROTE_DATA_INLINE) \ - x(FROM_INTERNAL) \ - x(CHECK_ENOSPC) \ - x(SYNC) \ - x(MOVE) \ - x(IN_WORKER) \ - x(DONE) \ - x(IO_ERROR) \ - x(CONVERT_UNWRITTEN) - -enum __bch_write_flags { -#define x(f) __BCH_WRITE_##f, - BCH_WRITE_FLAGS() -#undef x -}; - -enum bch_write_flags { -#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), - BCH_WRITE_FLAGS() -#undef x -}; - -static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) -{ - return op->watermark == BCH_WATERMARK_copygc - ? 
op->c->copygc_wq - : op->c->btree_update_wq; -} - -int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, - struct bkey_i *, bool *, s64 *, s64 *); -int bch2_extent_update(struct btree_trans *, subvol_inum, - struct btree_iter *, struct bkey_i *, - struct disk_reservation *, u64, s64 *, bool); -int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, - unsigned, struct bch_io_opts, s64 *, - struct write_point_specifier); - -int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, - subvol_inum, u64, s64 *); -int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); - -static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, - struct bch_io_opts opts) -{ - op->c = c; - op->end_io = NULL; - op->flags = 0; - op->written = 0; - op->error = 0; - op->csum_type = bch2_data_checksum_type(c, opts); - op->compression_opt = opts.compression; - op->nr_replicas = 0; - op->nr_replicas_required = c->opts.data_replicas_required; - op->watermark = BCH_WATERMARK_normal; - op->incompressible = 0; - op->open_buckets.nr = 0; - op->devs_have.nr = 0; - op->target = 0; - op->opts = opts; - op->subvol = 0; - op->pos = POS_MAX; - op->version = ZERO_VERSION; - op->write_point = (struct write_point_specifier) { 0 }; - op->res = (struct disk_reservation) { 0 }; - op->new_i_size = U64_MAX; - op->i_sectors_delta = 0; - op->devs_need_flush = NULL; -} - -void bch2_write(struct closure *); - -void bch2_write_point_do_index_updates(struct work_struct *); - -static inline struct bch_write_bio *wbio_init(struct bio *bio) -{ - struct bch_write_bio *wbio = to_wbio(bio); - - memset(&wbio->wbio, 0, sizeof(wbio->wbio)); - return wbio; -} - -void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *); - -struct bch_devs_mask; -struct cache_promote_op; -struct extent_ptr_decoded; - -int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, - struct bkey_buf *); - -static inline int bch2_read_indirect_extent(struct btree_trans *trans, - enum btree_id *data_btree, - unsigned *offset_into_extent, - struct bkey_buf *k) -{ - if (k->k->k.type != KEY_TYPE_reflink_p) - return 0; - - *data_btree = BTREE_ID_reflink; - return __bch2_read_indirect_extent(trans, offset_into_extent, k); -} - -enum bch_read_flags { - BCH_READ_RETRY_IF_STALE = 1 << 0, - BCH_READ_MAY_PROMOTE = 1 << 1, - BCH_READ_USER_MAPPED = 1 << 2, - BCH_READ_NODECODE = 1 << 3, - BCH_READ_LAST_FRAGMENT = 1 << 4, - - /* internal: */ - BCH_READ_MUST_BOUNCE = 1 << 5, - BCH_READ_MUST_CLONE = 1 << 6, - BCH_READ_IN_RETRY = 1 << 7, -}; - -int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, - struct bvec_iter, struct bpos, enum btree_id, - struct bkey_s_c, unsigned, - struct bch_io_failures *, unsigned); - -static inline void bch2_read_extent(struct btree_trans *trans, - struct bch_read_bio *rbio, struct bpos read_pos, - enum btree_id data_btree, struct bkey_s_c k, - unsigned offset_into_extent, unsigned flags) -{ - __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, - data_btree, k, offset_into_extent, NULL, flags); -} - -void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - subvol_inum, struct bch_io_failures *, unsigned flags); - -static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - subvol_inum inum) -{ - struct bch_io_failures failed = { .nr = 0 }; - - BUG_ON(rbio->_state); - - rbio->c = c; - rbio->start_time = local_clock(); - rbio->subvol = inum.subvol; - - __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, - 
BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE| - BCH_READ_USER_MAPPED); -} - -static inline struct bch_read_bio *rbio_init(struct bio *bio, - struct bch_io_opts opts) -{ - struct bch_read_bio *rbio = to_rbio(bio); - - rbio->_state = 0; - rbio->promote = NULL; - rbio->opts = opts; - return rbio; -} - -void bch2_fs_io_exit(struct bch_fs *); -int bch2_fs_io_init(struct bch_fs *); - -#endif /* _BCACHEFS_IO_H */ diff --git a/libbcachefs/io_misc.c b/libbcachefs/io_misc.c new file mode 100644 index 00000000..32432bdd --- /dev/null +++ b/libbcachefs/io_misc.c @@ -0,0 +1,497 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * io_misc.c - fallocate, fpunch, truncate: + */ + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "buckets.h" +#include "clock.h" +#include "error.h" +#include "extents.h" +#include "extent_update.h" +#include "inode.h" +#include "io_misc.h" +#include "io_write.h" +#include "logged_ops.h" +#include "subvolume.h" + +/* Overwrites whatever was present with zeroes: */ +int bch2_extent_fallocate(struct btree_trans *trans, + subvol_inum inum, + struct btree_iter *iter, + unsigned sectors, + struct bch_io_opts opts, + s64 *i_sectors_delta, + struct write_point_specifier write_point) +{ + struct bch_fs *c = trans->c; + struct disk_reservation disk_res = { 0 }; + struct closure cl; + struct open_buckets open_buckets = { 0 }; + struct bkey_s_c k; + struct bkey_buf old, new; + unsigned sectors_allocated = 0; + bool have_reservation = false; + bool unwritten = opts.nocow && + c->sb.version >= bcachefs_metadata_version_unwritten_extents; + int ret; + + bch2_bkey_buf_init(&old); + bch2_bkey_buf_init(&new); + closure_init_stack(&cl); + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; + + sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); + + if (!have_reservation) { + unsigned new_replicas = + max(0, (int) opts.data_replicas - + (int) bch2_bkey_nr_ptrs_fully_allocated(k)); + /* + * Get a disk reservation before (in the nocow case) calling + * into the allocator: + */ + ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); + if (unlikely(ret)) + goto err; + + bch2_bkey_buf_reassemble(&old, c, k); + } + + if (have_reservation) { + if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) + goto err; + + bch2_key_resize(&new.k->k, sectors); + } else if (!unwritten) { + struct bkey_i_reservation *reservation; + + bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64)); + reservation = bkey_reservation_init(new.k); + reservation->k.p = iter->pos; + bch2_key_resize(&reservation->k, sectors); + reservation->v.nr_replicas = opts.data_replicas; + } else { + struct bkey_i_extent *e; + struct bch_devs_list devs_have; + struct write_point *wp; + struct bch_extent_ptr *ptr; + + devs_have.nr = 0; + + bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX); + + e = bkey_extent_init(new.k); + e->k.p = iter->pos; + + ret = bch2_alloc_sectors_start_trans(trans, + opts.foreground_target, + false, + write_point, + &devs_have, + opts.data_replicas, + opts.data_replicas, + BCH_WATERMARK_normal, 0, &cl, &wp); + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) + ret = -BCH_ERR_transaction_restart_nested; + if (ret) + goto err; + + sectors = min(sectors, wp->sectors_free); + sectors_allocated = sectors; + + bch2_key_resize(&e->k, sectors); + + bch2_open_bucket_get(c, wp, &open_buckets); + bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); + bch2_alloc_sectors_done(c, 
wp); + + extent_for_each_ptr(extent_i_to_s(e), ptr) + ptr->unwritten = true; + } + + have_reservation = true; + + ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, + 0, i_sectors_delta, true); +err: + if (!ret && sectors_allocated) + bch2_increment_clock(c, sectors_allocated, WRITE); + + bch2_open_buckets_put(c, &open_buckets); + bch2_disk_reservation_put(c, &disk_res); + bch2_bkey_buf_exit(&new, c); + bch2_bkey_buf_exit(&old, c); + + if (closure_nr_remaining(&cl) != 1) { + bch2_trans_unlock(trans); + closure_sync(&cl); + } + + return ret; +} + +/* + * Returns -BCH_ERR_transacton_restart if we had to drop locks: + */ +int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + subvol_inum inum, u64 end, + s64 *i_sectors_delta) +{ + struct bch_fs *c = trans->c; + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct bpos end_pos = POS(inum.inum, end); + struct bkey_s_c k; + int ret = 0, ret2 = 0; + u32 snapshot; + + while (!ret || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; + + if (ret) + ret2 = ret; + + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(iter, snapshot); + + /* + * peek_upto() doesn't have ideal semantics for extents: + */ + k = bch2_btree_iter_peek_upto(iter, end_pos); + if (!k.k) + break; + + ret = bkey_err(k); + if (ret) + continue; + + bkey_init(&delete.k); + delete.k.p = iter->pos; + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end_pos, &delete); + + ret = bch2_extent_update(trans, inum, iter, &delete, + &disk_res, 0, i_sectors_delta, false); + bch2_disk_reservation_put(c, &disk_res); + } + + return ret ?: ret2; +} + +int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, + s64 *i_sectors_delta) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + POS(inum.inum, start), + BTREE_ITER_INTENT); + + ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta); + + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + + return ret; +} + +/* truncate: */ + +void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k); + + prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol)); + prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum)); + prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size)); +} + +static int truncate_set_isize(struct btree_trans *trans, + subvol_inum inum, + u64 new_i_size) +{ + struct btree_iter iter = { NULL }; + struct bch_inode_unpacked inode_u; + int ret; + + ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT) ?: + (inode_u.bi_size = new_i_size, 0) ?: + bch2_inode_write(trans, &iter, &inode_u); + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, + struct bkey_i *op_k, + u64 *i_sectors_delta) +{ + struct bch_fs *c = trans->c; + struct btree_iter fpunch_iter; + struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k); + subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; + u64 new_i_size = 
le64_to_cpu(op->v.new_i_size); + int ret; + + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + truncate_set_isize(trans, inum, new_i_size)); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents, + POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9), + BTREE_ITER_INTENT); + ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta); + bch2_trans_iter_exit(trans, &fpunch_iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; +err: + bch2_logged_op_finish(trans, op_k); + return ret; +} + +int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k) +{ + return __bch2_resume_logged_op_truncate(trans, op_k, NULL); +} + +int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta) +{ + struct bkey_i_logged_op_truncate op; + + bkey_logged_op_truncate_init(&op.k_i); + op.v.subvol = cpu_to_le32(inum.subvol); + op.v.inum = cpu_to_le64(inum.inum); + op.v.new_i_size = cpu_to_le64(new_i_size); + + return bch2_trans_run(c, + bch2_logged_op_start(trans, &op.k_i) ?: + __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta)); +} + +/* finsert/fcollapse: */ + +void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k); + + prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol)); + prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum)); + prt_printf(out, " dst_offset=%lli", le64_to_cpu(op.v->dst_offset)); + prt_printf(out, " src_offset=%llu", le64_to_cpu(op.v->src_offset)); +} + +static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len) +{ + struct btree_iter iter; + struct bch_inode_unpacked inode_u; + int ret; + + offset <<= 9; + len <<= 9; + + ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT); + if (ret) + return ret; + + if (len > 0) { + if (MAX_LFS_FILESIZE - inode_u.bi_size < len) { + ret = -EFBIG; + goto err; + } + + if (offset >= inode_u.bi_size) { + ret = -EINVAL; + goto err; + } + } + + inode_u.bi_size += len; + inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c); + + ret = bch2_inode_write(trans, &iter, &inode_u); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, + struct bkey_i *op_k, + u64 *i_sectors_delta) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k); + subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; + u64 dst_offset = le64_to_cpu(op->v.dst_offset); + u64 src_offset = le64_to_cpu(op->v.src_offset); + s64 shift = dst_offset - src_offset; + u64 len = abs(shift); + u64 pos = le64_to_cpu(op->v.pos); + bool insert = shift > 0; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + POS(inum.inum, 0), + BTREE_ITER_INTENT); + + switch (op->v.state) { +case LOGGED_OP_FINSERT_start: + op->v.state = LOGGED_OP_FINSERT_shift_extents; + + if (insert) { + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + adjust_i_size(trans, inum, src_offset, len) ?: + bch2_logged_op_update(trans, &op->k_i)); + if (ret) + goto err; + } else { + bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset)); + + ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto err; + + ret = 
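+			/*
+			 * commit_do() wraps the expression and
+			 * bch2_trans_commit() in lockrestart_do(), so the
+			 * logged-op update below is retried as a unit on
+			 * transaction restart:
+			 */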
commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_logged_op_update(trans, &op->k_i)); + } + + fallthrough; +case LOGGED_OP_FINSERT_shift_extents: + while (1) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete, *copy; + struct bkey_s_c k; + struct bpos src_pos = POS(inum.inum, src_offset); + u32 snapshot; + + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto btree_err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); + bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot)); + + k = insert + ? bch2_btree_iter_peek_prev(&iter) + : bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); + if ((ret = bkey_err(k))) + goto btree_err; + + if (!k.k || + k.k->p.inode != inum.inum || + bkey_le(k.k->p, POS(inum.inum, src_offset))) + break; + + copy = bch2_bkey_make_mut_noupdate(trans, k); + if ((ret = PTR_ERR_OR_ZERO(copy))) + goto btree_err; + + if (insert && + bkey_lt(bkey_start_pos(k.k), src_pos)) { + bch2_cut_front(src_pos, copy); + + /* Splitting compressed extent? */ + bch2_disk_reservation_add(c, &disk_res, + copy->k.size * + bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)), + BCH_DISK_RESERVATION_NOFAIL); + } + + bkey_init(&delete.k); + delete.k.p = copy->k.p; + delete.k.p.snapshot = snapshot; + delete.k.size = copy->k.size; + + copy->k.p.offset += shift; + copy->k.p.snapshot = snapshot; + + op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset); + + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: + bch2_logged_op_update(trans, &op->k_i) ?: + bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL); +btree_err: + bch2_disk_reservation_put(c, &disk_res); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + + pos = le64_to_cpu(op->v.pos); + } + + op->v.state = LOGGED_OP_FINSERT_finish; + + if (!insert) { + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + adjust_i_size(trans, inum, src_offset, shift) ?: + bch2_logged_op_update(trans, &op->k_i)); + } else { + /* We need an inode update to update bi_journal_seq for fsync: */ + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + adjust_i_size(trans, inum, 0, 0) ?: + bch2_logged_op_update(trans, &op->k_i)); + } + + break; +case LOGGED_OP_FINSERT_finish: + break; + } +err: + bch2_logged_op_finish(trans, op_k); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k) +{ + return __bch2_resume_logged_op_finsert(trans, op_k, NULL); +} + +int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum, + u64 offset, u64 len, bool insert, + s64 *i_sectors_delta) +{ + struct bkey_i_logged_op_finsert op; + s64 shift = insert ? len : -len; + + bkey_logged_op_finsert_init(&op.k_i); + op.v.subvol = cpu_to_le32(inum.subvol); + op.v.inum = cpu_to_le64(inum.inum); + op.v.dst_offset = cpu_to_le64(offset + shift); + op.v.src_offset = cpu_to_le64(offset); + op.v.pos = cpu_to_le64(insert ? 
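+	/*
+	 * v.pos records how far the shift has progressed, so the logged op
+	 * can be resumed after a crash: inserts walk extents backwards from
+	 * the end (hence starting at U64_MAX, via peek_prev), collapses walk
+	 * forwards from offset:
+	 */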
U64_MAX : offset); + + return bch2_trans_run(c, + bch2_logged_op_start(trans, &op.k_i) ?: + __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta)); +} diff --git a/libbcachefs/io_misc.h b/libbcachefs/io_misc.h new file mode 100644 index 00000000..c9e6ed40 --- /dev/null +++ b/libbcachefs/io_misc.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_MISC_H +#define _BCACHEFS_IO_MISC_H + +int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, + unsigned, struct bch_io_opts, s64 *, + struct write_point_specifier); +int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, + subvol_inum, u64, s64 *); +int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); + +void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_logged_op_truncate ((struct bkey_ops) { \ + .val_to_text = bch2_logged_op_truncate_to_text, \ + .min_val_size = 24, \ +}) + +int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *); + +int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *); + +void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) { \ + .val_to_text = bch2_logged_op_finsert_to_text, \ + .min_val_size = 24, \ +}) + +int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *); + +int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *); + +#endif /* _BCACHEFS_IO_MISC_H */ diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c new file mode 100644 index 00000000..443c3ea6 --- /dev/null +++ b/libbcachefs/io_read.c @@ -0,0 +1,1210 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Some low level IO code, and hacks for various block layer limitations + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. 
+ */ + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "btree_update.h" +#include "buckets.h" +#include "checksum.h" +#include "clock.h" +#include "compress.h" +#include "data_update.h" +#include "disk_groups.h" +#include "ec.h" +#include "error.h" +#include "io_read.h" +#include "io_misc.h" +#include "io_write.h" +#include "subvolume.h" +#include "trace.h" + +#include <linux/sched/mm.h> + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT + +static bool bch2_target_congested(struct bch_fs *c, u16 target) +{ + const struct bch_devs_mask *devs; + unsigned d, nr = 0, total = 0; + u64 now = local_clock(), last; + s64 congested; + struct bch_dev *ca; + + if (!target) + return false; + + rcu_read_lock(); + devs = bch2_target_to_mask(c, target) ?: + &c->rw_devs[BCH_DATA_user]; + + for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { + ca = rcu_dereference(c->devs[d]); + if (!ca) + continue; + + congested = atomic_read(&ca->congested); + last = READ_ONCE(ca->congested_last); + if (time_after64(now, last)) + congested -= (now - last) >> 12; + + total += max(congested, 0LL); + nr++; + } + rcu_read_unlock(); + + return bch2_rand_range(nr * CONGESTED_MAX) < total; +} + +#else + +static bool bch2_target_congested(struct bch_fs *c, u16 target) +{ + return false; +} + +#endif + +/* Cache promotion on read */ + +struct promote_op { + struct rcu_head rcu; + u64 start_time; + + struct rhash_head hash; + struct bpos pos; + + struct data_update write; + struct bio_vec bi_inline_vecs[0]; /* must be last */ +}; + +static const struct rhashtable_params bch_promote_params = { + .head_offset = offsetof(struct promote_op, hash), + .key_offset = offsetof(struct promote_op, pos), + .key_len = sizeof(struct bpos), +}; + +static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, + struct bpos pos, + struct bch_io_opts opts, + unsigned flags) +{ + BUG_ON(!opts.promote_target); + + if (!(flags & BCH_READ_MAY_PROMOTE)) + return -BCH_ERR_nopromote_may_not; + + if (bch2_bkey_has_target(c, k, opts.promote_target)) + return -BCH_ERR_nopromote_already_promoted; + + if (bkey_extent_is_unwritten(k)) + return -BCH_ERR_nopromote_unwritten; + + if (bch2_target_congested(c, opts.promote_target)) + return -BCH_ERR_nopromote_congested; + + if (rhashtable_lookup_fast(&c->promote_table, &pos, + bch_promote_params)) + return -BCH_ERR_nopromote_in_flight; + + return 0; +} + +static void promote_free(struct bch_fs *c, struct promote_op *op) +{ + int ret; + + bch2_data_update_exit(&op->write); + + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); + kfree_rcu(op, rcu); +} + +static void promote_done(struct bch_write_op *wop) +{ + struct promote_op *op = + container_of(wop, struct promote_op, write.op); + struct bch_fs *c = op->write.op.c; + + bch2_time_stats_update(&c->times[BCH_TIME_data_promote], + op->start_time); + promote_free(c, op); +} + +static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) +{ + struct bio *bio = &op->write.op.wbio.bio; + + trace_and_count(op->write.op.c, read_promote, &rbio->bio); + + /* we now own pages: */ + BUG_ON(!rbio->bounce); + BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); + + memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, + sizeof(struct bio_vec) * rbio->bio.bi_vcnt); + swap(bio->bi_vcnt, rbio->bio.bi_vcnt); + + bch2_data_update_read_done(&op->write, rbio->pick.crc); +} + +static struct promote_op *__promote_alloc(struct btree_trans *trans, + enum 
btree_id btree_id, + struct bkey_s_c k, + struct bpos pos, + struct extent_ptr_decoded *pick, + struct bch_io_opts opts, + unsigned sectors, + struct bch_read_bio **rbio) +{ + struct bch_fs *c = trans->c; + struct promote_op *op = NULL; + struct bio *bio; + unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); + int ret; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) + return NULL; + + op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS); + if (!op) + goto err; + + op->start_time = local_clock(); + op->pos = pos; + + /* + * We don't use the mempool here because extents that aren't + * checksummed or compressed can be too big for the mempool: + */ + *rbio = kzalloc(sizeof(struct bch_read_bio) + + sizeof(struct bio_vec) * pages, + GFP_NOFS); + if (!*rbio) + goto err; + + rbio_init(&(*rbio)->bio, opts); + bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); + + if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, + GFP_NOFS)) + goto err; + + (*rbio)->bounce = true; + (*rbio)->split = true; + (*rbio)->kmalloc = true; + + if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, + bch_promote_params)) + goto err; + + bio = &op->write.op.wbio.bio; + bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); + + ret = bch2_data_update_init(trans, NULL, &op->write, + writepoint_hashed((unsigned long) current), + opts, + (struct data_update_opts) { + .target = opts.promote_target, + .extra_replicas = 1, + .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, + }, + btree_id, k); + /* + * possible errors: -BCH_ERR_nocow_lock_blocked, + * -BCH_ERR_ENOSPC_disk_reservation: + */ + if (ret) { + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + goto err; + } + + op->write.op.end_io = promote_done; + + return op; +err: + if (*rbio) + bio_free_pages(&(*rbio)->bio); + kfree(*rbio); + *rbio = NULL; + kfree(op); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); + return NULL; +} + +noinline +static struct promote_op *promote_alloc(struct btree_trans *trans, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_ptr_decoded *pick, + struct bch_io_opts opts, + unsigned flags, + struct bch_read_bio **rbio, + bool *bounce, + bool *read_full) +{ + struct bch_fs *c = trans->c; + bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); + /* data might have to be decompressed in the write path: */ + unsigned sectors = promote_full + ? max(pick->crc.compressed_size, pick->crc.live_size) + : bvec_iter_sectors(iter); + struct bpos pos = promote_full + ? bkey_start_pos(k.k) + : POS(k.k->p.inode, iter.bi_sector); + struct promote_op *promote; + int ret; + + ret = should_promote(c, k, pos, opts, flags); + if (ret) + goto nopromote; + + promote = __promote_alloc(trans, + k.k->type == KEY_TYPE_reflink_v + ? BTREE_ID_reflink + : BTREE_ID_extents, + k, pos, pick, opts, sectors, rbio); + if (!promote) { + ret = -BCH_ERR_nopromote_enomem; + goto nopromote; + } + + *bounce = true; + *read_full = promote_full; + return promote; +nopromote: + trace_read_nopromote(c, ret); + return NULL; +} + +/* Read */ + +#define READ_RETRY_AVOID 1 +#define READ_RETRY 2 +#define READ_ERR 3 + +enum rbio_context { + RBIO_CONTEXT_NULL, + RBIO_CONTEXT_HIGHPRI, + RBIO_CONTEXT_UNBOUND, +}; + +static inline struct bch_read_bio * +bch2_rbio_parent(struct bch_read_bio *rbio) +{ + return rbio->split ? 
rbio->parent : rbio; +} + +__always_inline +static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, + enum rbio_context context, + struct workqueue_struct *wq) +{ + if (context <= rbio->context) { + fn(&rbio->work); + } else { + rbio->work.func = fn; + rbio->context = context; + queue_work(wq, &rbio->work); + } +} + +static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) +{ + BUG_ON(rbio->bounce && !rbio->split); + + if (rbio->promote) + promote_free(rbio->c, rbio->promote); + rbio->promote = NULL; + + if (rbio->bounce) + bch2_bio_free_pages_pool(rbio->c, &rbio->bio); + + if (rbio->split) { + struct bch_read_bio *parent = rbio->parent; + + if (rbio->kmalloc) + kfree(rbio); + else + bio_put(&rbio->bio); + + rbio = parent; + } + + return rbio; +} + +/* + * Only called on a top level bch_read_bio to complete an entire read request, + * not a split: + */ +static void bch2_rbio_done(struct bch_read_bio *rbio) +{ + if (rbio->start_time) + bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], + rbio->start_time); + bio_endio(&rbio->bio); +} + +static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, + struct bch_io_failures *failed, + unsigned flags) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_buf sk; + struct bkey_s_c k; + int ret; + + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + + bch2_bkey_buf_init(&sk); + + bch2_trans_iter_init(trans, &iter, rbio->data_btree, + rbio->read_pos, BTREE_ITER_SLOTS); +retry: + rbio->bio.bi_status = 0; + + k = bch2_btree_iter_peek_slot(&iter); + if (bkey_err(k)) + goto err; + + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + bch2_trans_unlock(trans); + + if (!bch2_bkey_matches_ptr(c, k, + rbio->pick.ptr, + rbio->data_pos.offset - + rbio->pick.crc.offset)) { + /* extent we wanted to read no longer exists: */ + rbio->hole = true; + goto out; + } + + ret = __bch2_read_extent(trans, rbio, bvec_iter, + rbio->read_pos, + rbio->data_btree, + k, 0, failed, flags); + if (ret == READ_RETRY) + goto retry; + if (ret) + goto err; +out: + bch2_rbio_done(rbio); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + bch2_bkey_buf_exit(&sk, c); + return; +err: + rbio->bio.bi_status = BLK_STS_IOERR; + goto out; +} + +static void bch2_rbio_retry(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct bvec_iter iter = rbio->bvec_iter; + unsigned flags = rbio->flags; + subvol_inum inum = { + .subvol = rbio->subvol, + .inum = rbio->read_pos.inode, + }; + struct bch_io_failures failed = { .nr = 0 }; + + trace_and_count(c, read_retry, &rbio->bio); + + if (rbio->retry == READ_RETRY_AVOID) + bch2_mark_io_failure(&failed, &rbio->pick); + + rbio->bio.bi_status = 0; + + rbio = bch2_rbio_free(rbio); + + flags |= BCH_READ_IN_RETRY; + flags &= ~BCH_READ_MAY_PROMOTE; + + if (flags & BCH_READ_NODECODE) { + bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); + } else { + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + + __bch2_read(c, rbio, iter, inum, &failed, flags); + } +} + +static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, + blk_status_t error) +{ + rbio->retry = retry; + + if (rbio->flags & BCH_READ_IN_RETRY) + return; + + if (retry == READ_ERR) { + rbio = bch2_rbio_free(rbio); + + rbio->bio.bi_status = error; + bch2_rbio_done(rbio); + } else { + bch2_rbio_punt(rbio, 
bch2_rbio_retry, + RBIO_CONTEXT_UNBOUND, system_unbound_wq); + } +} + +static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + struct bch_read_bio *rbio) +{ + struct bch_fs *c = rbio->c; + u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; + struct bch_extent_crc_unpacked new_crc; + struct btree_iter iter; + struct bkey_i *new; + struct bkey_s_c k; + int ret = 0; + + if (crc_is_compressed(rbio->pick.crc)) + return 0; + + k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if ((ret = bkey_err(k))) + goto out; + + if (bversion_cmp(k.k->version, rbio->version) || + !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) + goto out; + + /* Extent was merged? */ + if (bkey_start_offset(k.k) < data_offset || + k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) + goto out; + + if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, + rbio->pick.crc, NULL, &new_crc, + bkey_start_offset(k.k) - data_offset, k.k->size, + rbio->pick.crc.csum_type)) { + bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); + ret = 0; + goto out; + } + + /* + * going to be temporarily appending another checksum entry: + */ + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + + sizeof(struct bch_extent_crc128)); + if ((ret = PTR_ERR_OR_ZERO(new))) + goto out; + + bkey_reassemble(new, k); + + if (!bch2_bkey_narrow_crcs(new, new_crc)) + goto out; + + ret = bch2_trans_update(trans, &iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) +{ + bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, + __bch2_rbio_narrow_crcs(trans, rbio)); +} + +/* Inner part that may run in process context */ +static void __bch2_read_endio(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct bio *src = &rbio->bio; + struct bio *dst = &bch2_rbio_parent(rbio)->bio; + struct bvec_iter dst_iter = rbio->bvec_iter; + struct bch_extent_crc_unpacked crc = rbio->pick.crc; + struct nonce nonce = extent_nonce(rbio->version, crc); + unsigned nofs_flags; + struct bch_csum csum; + int ret; + + nofs_flags = memalloc_nofs_save(); + + /* Reset iterator for checksumming and copying bounced data: */ + if (rbio->bounce) { + src->bi_iter.bi_size = crc.compressed_size << 9; + src->bi_iter.bi_idx = 0; + src->bi_iter.bi_bvec_done = 0; + } else { + src->bi_iter = rbio->bvec_iter; + } + + csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); + if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) + goto csum_err; + + /* + * XXX + * We need to rework the narrow_crcs path to deliver the read completion + * first, and then punt to a different workqueue, otherwise we're + * holding up reads while doing btree updates which is bad for memory + * reclaim. 
+ */ + if (unlikely(rbio->narrow_crcs)) + bch2_rbio_narrow_crcs(rbio); + + if (rbio->flags & BCH_READ_NODECODE) + goto nodecode; + + /* Adjust crc to point to subset of data we want: */ + crc.offset += rbio->offset_into_extent; + crc.live_size = bvec_iter_sectors(rbio->bvec_iter); + + if (crc_is_compressed(crc)) { + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && + !c->opts.no_data_io) + goto decompression_err; + } else { + /* don't need to decrypt the entire bio: */ + nonce = nonce_add(nonce, crc.offset << 9); + bio_advance(src, crc.offset << 9); + + BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); + src->bi_iter.bi_size = dst_iter.bi_size; + + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; + + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); + } + } + + if (rbio->promote) { + /* + * Re encrypt data we decrypted, so it's consistent with + * rbio->crc: + */ + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + + promote_start(rbio->promote, rbio); + rbio->promote = NULL; + } +nodecode: + if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { + rbio = bch2_rbio_free(rbio); + bch2_rbio_done(rbio); + } +out: + memalloc_nofs_restore(nofs_flags); + return; +csum_err: + /* + * Checksum error: if the bio wasn't bounced, we may have been + * reading into buffers owned by userspace (that userspace can + * scribble over) - retry the read, bouncing it this time: + */ + if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { + rbio->flags |= BCH_READ_MUST_BOUNCE; + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); + goto out; + } + + bch_err_inum_offset_ratelimited(ca, + rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", + rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, + csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); + bch2_io_error(ca); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; +decompression_err: + bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "decompression error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + goto out; +decrypt_err: + bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "decrypt error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + goto out; +} + +static void bch2_read_endio(struct bio *bio) +{ + struct bch_read_bio *rbio = + container_of(bio, struct bch_read_bio, bio); + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct workqueue_struct *wq = NULL; + enum rbio_context context = RBIO_CONTEXT_NULL; + + if (rbio->have_ioref) { + bch2_latency_acct(ca, rbio->submit_time, READ); + percpu_ref_put(&ca->io_ref); + } + + if (!rbio->split) + rbio->bio.bi_end_io = rbio->end_io; + + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, + rbio->read_pos.inode, + rbio->read_pos.offset, + "data read error: %s", + bch2_blk_status_to_str(bio->bi_status))) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); + return; + } + + if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || + ptr_stale(ca, &rbio->pick.ptr)) { + trace_and_count(c, read_reuse_race, &rbio->bio); + + if (rbio->flags & BCH_READ_RETRY_IF_STALE) + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); + else + bch2_rbio_error(rbio, READ_ERR, 
BLK_STS_AGAIN); + return; + } + + if (rbio->narrow_crcs || + rbio->promote || + crc_is_compressed(rbio->pick.crc) || + bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) + context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; + else if (rbio->pick.crc.csum_type) + context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; + + bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); +} + +int __bch2_read_indirect_extent(struct btree_trans *trans, + unsigned *offset_into_extent, + struct bkey_buf *orig_k) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 reflink_offset; + int ret; + + reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + + *offset_into_extent; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, + POS(0, reflink_offset), 0); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_reflink_v && + k.k->type != KEY_TYPE_indirect_inline_data) { + bch_err_inum_offset_ratelimited(trans->c, + orig_k->k->k.p.inode, + orig_k->k->k.p.offset << 9, + "%llu len %u points to nonexistent indirect extent %llu", + orig_k->k->k.p.offset, + orig_k->k->k.size, + reflink_offset); + bch2_inconsistent_error(trans->c); + ret = -EIO; + goto err; + } + + *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); + bch2_bkey_buf_reassemble(orig_k, trans->c, k); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, + struct bkey_s_c k, + struct bch_extent_ptr ptr) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); + struct btree_iter iter; + struct printbuf buf = PRINTBUF; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + PTR_BUCKET_POS(c, &ptr), + BTREE_ITER_CACHED); + + prt_printf(&buf, "Attempting to read from stale dirty pointer:"); + printbuf_indent_add(&buf, 2); + prt_newline(&buf); + + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + + prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); + + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (!ret) { + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, k); + } + + bch2_fs_inconsistent(c, "%s", buf.buf); + + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); +} + +int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + struct bvec_iter iter, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, + unsigned offset_into_extent, + struct bch_io_failures *failed, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct extent_ptr_decoded pick; + struct bch_read_bio *rbio = NULL; + struct bch_dev *ca = NULL; + struct promote_op *promote = NULL; + bool bounce = false, read_full = false, narrow_crcs = false; + struct bpos data_pos = bkey_start_pos(k.k); + int pick_ret; + + if (bkey_extent_is_inline_data(k.k)) { + unsigned bytes = min_t(unsigned, iter.bi_size, + bkey_inline_data_bytes(k.k)); + + swap(iter.bi_size, bytes); + memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); + swap(iter.bi_size, bytes); + bio_advance_iter(&orig->bio, &iter, bytes); + zero_fill_bio_iter(&orig->bio, iter); + goto out_read_done; + } +retry_pick: + pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); + + /* hole or reservation - just zero fill: */ + if (!pick_ret) + goto hole; + + if (pick_ret < 0) { + bch_err_inum_offset_ratelimited(c, + read_pos.inode, read_pos.offset << 9, + "no device to read from"); + goto err; + } + + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + 
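The retry discipline threaded through this file is easiest to see in isolation: each failed attempt is recorded per device in a bch_io_failures list, the next pick skips those devices, and the flag word is rewritten so a retry is never promoted and is recognizable as a retry (BCH_READ_IN_RETRY). Below is a minimal standalone sketch of that pattern, with hypothetical names (io_failures, pick_device) and a fixed-size failure list; it illustrates the technique, it is not the bcachefs API:

	#include <stdbool.h>
	#include <stdio.h>

	#define READ_RETRY_IF_STALE	(1 << 0)
	#define READ_MAY_PROMOTE	(1 << 1)
	#define READ_IN_RETRY		(1 << 7)	/* internal */

	struct io_failures {
		unsigned	nr;
		int		devs[8];
	};

	static void mark_io_failure(struct io_failures *f, int dev)
	{
		if (f->nr < 8)
			f->devs[f->nr++] = dev;
	}

	static bool dev_failed(const struct io_failures *f, int dev)
	{
		for (unsigned i = 0; i < f->nr; i++)
			if (f->devs[i] == dev)
				return true;
		return false;
	}

	/* pick the first replica whose device hasn't already failed; -1 if none */
	static int pick_device(const int *replicas, unsigned nr,
			       const struct io_failures *f)
	{
		for (unsigned i = 0; i < nr; i++)
			if (!dev_failed(f, replicas[i]))
				return replicas[i];
		return -1;
	}

	int main(void)
	{
		int replicas[] = { 2, 5 };	/* two copies of the extent */
		struct io_failures failed = { .nr = 0 };
		unsigned flags = READ_RETRY_IF_STALE|READ_MAY_PROMOTE;

		int dev = pick_device(replicas, 2, &failed);	/* -> 2 */

		/* attempt failed: record it, rewrite flags as the retry path does */
		mark_io_failure(&failed, dev);
		flags |= READ_IN_RETRY;
		flags &= ~READ_MAY_PROMOTE;

		dev = pick_device(replicas, 2, &failed);	/* -> 5 */
		printf("retrying from dev %d, flags %#x\n", dev, flags);
		return 0;
	}

READ_RETRY_AVOID in the real code is the stronger variant of this: bch2_rbio_retry() marks the failing device in the failure list before retrying, so the re-pick lands on a different replica, while plain READ_RETRY retries without marking.
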
+ /* + * Stale dirty pointers are treated as IO errors, but @failed isn't + * allocated unless we're in the retry path - so if we're not in the + * retry path, don't check here, it'll be caught in bch2_read_endio() + * and we'll end up in the retry path: + */ + if ((flags & BCH_READ_IN_RETRY) && + !pick.ptr.cached && + unlikely(ptr_stale(ca, &pick.ptr))) { + read_from_stale_dirty_pointer(trans, k, pick.ptr); + bch2_mark_io_failure(failed, &pick); + goto retry_pick; + } + + /* + * Unlock the iterator while the btree node's lock is still in + * cache, before doing the IO: + */ + bch2_trans_unlock(trans); + + if (flags & BCH_READ_NODECODE) { + /* + * can happen if we retry, and the extent we were going to read + * has been merged in the meantime: + */ + if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) + goto hole; + + iter.bi_size = pick.crc.compressed_size << 9; + goto get_bio; + } + + if (!(flags & BCH_READ_LAST_FRAGMENT) || + bio_flagged(&orig->bio, BIO_CHAIN)) + flags |= BCH_READ_MUST_CLONE; + + narrow_crcs = !(flags & BCH_READ_IN_RETRY) && + bch2_can_narrow_extent_crcs(k, pick.crc); + + if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) + flags |= BCH_READ_MUST_BOUNCE; + + EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); + + if (crc_is_compressed(pick.crc) || + (pick.crc.csum_type != BCH_CSUM_none && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + (bch2_csum_type_is_encryption(pick.crc.csum_type) && + (flags & BCH_READ_USER_MAPPED)) || + (flags & BCH_READ_MUST_BOUNCE)))) { + read_full = true; + bounce = true; + } + + if (orig->opts.promote_target) + promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, + &rbio, &bounce, &read_full); + + if (!read_full) { + EBUG_ON(crc_is_compressed(pick.crc)); + EBUG_ON(pick.crc.csum_type && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + bvec_iter_sectors(iter) != pick.crc.live_size || + pick.crc.offset || + offset_into_extent)); + + data_pos.offset += offset_into_extent; + pick.ptr.offset += pick.crc.offset + + offset_into_extent; + offset_into_extent = 0; + pick.crc.compressed_size = bvec_iter_sectors(iter); + pick.crc.uncompressed_size = bvec_iter_sectors(iter); + pick.crc.offset = 0; + pick.crc.live_size = bvec_iter_sectors(iter); + } +get_bio: + if (rbio) { + /* + * promote already allocated bounce rbio: + * promote needs to allocate a bio big enough for uncompressing + * data in the write path, but we're not going to use it all + * here: + */ + EBUG_ON(rbio->bio.bi_iter.bi_size < + pick.crc.compressed_size << 9); + rbio->bio.bi_iter.bi_size = + pick.crc.compressed_size << 9; + } else if (bounce) { + unsigned sectors = pick.crc.compressed_size; + + rbio = rbio_init(bio_alloc_bioset(NULL, + DIV_ROUND_UP(sectors, PAGE_SECTORS), + 0, + GFP_NOFS, + &c->bio_read_split), + orig->opts); + + bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); + rbio->bounce = true; + rbio->split = true; + } else if (flags & BCH_READ_MUST_CLONE) { + /* + * Have to clone if there were any splits, due to error + * reporting issues (if a split errored, and retrying didn't + * work, when it reports the error to its parent (us) we don't + * know if the error was from our bio, and we should retry, or + * from the whole bio, in which case we don't want to retry and + * lose the error) + */ + rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, + &c->bio_read_split), + orig->opts); + rbio->bio.bi_iter = iter; + rbio->split = true; + } else { + rbio = orig; + rbio->bio.bi_iter = iter; + 
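+		/*
+		 * Not bounced or cloned: reading directly into the original
+		 * bio. Chained bios were forced down the BCH_READ_MUST_CLONE
+		 * path above, so this one must not be chained - its bi_end_io
+		 * is saved and restored around the read:
+		 */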
EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); + } + + EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); + + rbio->c = c; + rbio->submit_time = local_clock(); + if (rbio->split) + rbio->parent = orig; + else + rbio->end_io = orig->bio.bi_end_io; + rbio->bvec_iter = iter; + rbio->offset_into_extent= offset_into_extent; + rbio->flags = flags; + rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); + rbio->narrow_crcs = narrow_crcs; + rbio->hole = 0; + rbio->retry = 0; + rbio->context = 0; + /* XXX: only initialize this if needed */ + rbio->devs_have = bch2_bkey_devs(k); + rbio->pick = pick; + rbio->subvol = orig->subvol; + rbio->read_pos = read_pos; + rbio->data_btree = data_btree; + rbio->data_pos = data_pos; + rbio->version = k.k->version; + rbio->promote = promote; + INIT_WORK(&rbio->work, NULL); + + rbio->bio.bi_opf = orig->bio.bi_opf; + rbio->bio.bi_iter.bi_sector = pick.ptr.offset; + rbio->bio.bi_end_io = bch2_read_endio; + + if (rbio->bounce) + trace_and_count(c, read_bounce, &rbio->bio); + + this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); + bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); + + /* + * If it's being moved internally, we don't want to flag it as a cache + * hit: + */ + if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) + bch2_bucket_io_time_reset(trans, pick.ptr.dev, + PTR_BUCKET_NR(ca, &pick.ptr), READ); + + if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { + bio_inc_remaining(&orig->bio); + trace_and_count(c, read_split, &orig->bio); + } + + if (!rbio->pick.idx) { + if (!rbio->have_ioref) { + bch_err_inum_offset_ratelimited(c, + read_pos.inode, + read_pos.offset << 9, + "no device to read from"); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; + } + + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], + bio_sectors(&rbio->bio)); + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + + if (unlikely(c->opts.no_data_io)) { + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } else { + if (likely(!(flags & BCH_READ_IN_RETRY))) + submit_bio(&rbio->bio); + else + submit_bio_wait(&rbio->bio); + } + + /* + * We just submitted IO which may block, we expect relock fail + * events and shouldn't count them: + */ + trans->notrace_relock_fail = true; + } else { + /* Attempting reconstruct read: */ + if (bch2_ec_read_extent(c, rbio)) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; + } + + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } +out: + if (likely(!(flags & BCH_READ_IN_RETRY))) { + return 0; + } else { + int ret; + + rbio->context = RBIO_CONTEXT_UNBOUND; + bch2_read_endio(&rbio->bio); + + ret = rbio->retry; + rbio = bch2_rbio_free(rbio); + + if (ret == READ_RETRY_AVOID) { + bch2_mark_io_failure(failed, &pick); + ret = READ_RETRY; + } + + if (!ret) + goto out_read_done; + + return ret; + } + +err: + if (flags & BCH_READ_IN_RETRY) + return READ_ERR; + + orig->bio.bi_status = BLK_STS_IOERR; + goto out_read_done; + +hole: + /* + * won't normally happen in the BCH_READ_NODECODE + * (bch2_move_extent()) path, but if we retry and the extent we wanted + * to read no longer exists we have to signal that: + */ + if (flags & BCH_READ_NODECODE) + orig->hole = true; + + zero_fill_bio_iter(&orig->bio, iter); +out_read_done: + if (flags & BCH_READ_LAST_FRAGMENT) + bch2_rbio_done(orig); + return 0; +} + +void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, subvol_inum inum, + struct bch_io_failures 
*failed, unsigned flags) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_buf sk; + struct bkey_s_c k; + u32 snapshot; + int ret; + + BUG_ON(flags & BCH_READ_NODECODE); + + bch2_bkey_buf_init(&sk); +retry: + bch2_trans_begin(trans); + iter = (struct btree_iter) { NULL }; + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, bvec_iter.bi_sector, snapshot), + BTREE_ITER_SLOTS); + while (1) { + unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; + + /* + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ + ret = bch2_trans_relock(trans); + if (ret) + break; + + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, bvec_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; + + offset_into_extent = iter.pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + bch2_bkey_buf_reassemble(&sk, c, k); + + ret = bch2_read_indirect_extent(trans, &data_btree, + &offset_into_extent, &sk); + if (ret) + break; + + k = bkey_i_to_s_c(sk.k); + + /* + * With indirect extents, the amount of data to read is the min + * of the original extent and the indirect extent: + */ + sectors = min(sectors, k.k->size - offset_into_extent); + + bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; + swap(bvec_iter.bi_size, bytes); + + if (bvec_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; + + ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, + data_btree, k, + offset_into_extent, failed, flags); + if (ret) + break; + + if (flags & BCH_READ_LAST_FRAGMENT) + break; + + swap(bvec_iter.bi_size, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, bytes); + + ret = btree_trans_too_many_iters(trans); + if (ret) + break; + } +err: + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + ret == READ_RETRY || + ret == READ_RETRY_AVOID) + goto retry; + + bch2_trans_put(trans); + bch2_bkey_buf_exit(&sk, c); + + if (ret) { + bch_err_inum_offset_ratelimited(c, inum.inum, + bvec_iter.bi_sector << 9, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; + bch2_rbio_done(rbio); + } +} + +void bch2_fs_io_read_exit(struct bch_fs *c) +{ + if (c->promote_table.tbl) + rhashtable_destroy(&c->promote_table); + bioset_exit(&c->bio_read_split); + bioset_exit(&c->bio_read); +} + +int bch2_fs_io_read_init(struct bch_fs *c) +{ + if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_read_init; + + if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_read_split_init; + + if (rhashtable_init(&c->promote_table, &bch_promote_params)) + return -BCH_ERR_ENOMEM_promote_table_init; + + return 0; +} diff --git a/libbcachefs/io_read.h b/libbcachefs/io_read.h new file mode 100644 index 00000000..d9c18bb7 --- /dev/null +++ b/libbcachefs/io_read.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_READ_H +#define _BCACHEFS_IO_READ_H + +#include "bkey_buf.h" + +struct bch_read_bio { + struct bch_fs *c; + u64 start_time; + u64 submit_time; + + /* + * Reads will often have to be split, and if the extent being read from + * was checksummed or compressed we'll also have 
to allocate bounce + * buffers and copy the data back into the original bio. + * + * If we didn't have to split, we have to save and restore the original + * bi_end_io - @split below indicates which: + */ + union { + struct bch_read_bio *parent; + bio_end_io_t *end_io; + }; + + /* + * Saved copy of bio->bi_iter, from submission time - allows us to + * resubmit on IO error, and also to copy data back to the original bio + * when we're bouncing: + */ + struct bvec_iter bvec_iter; + + unsigned offset_into_extent; + + u16 flags; + union { + struct { + u16 bounce:1, + split:1, + kmalloc:1, + have_ioref:1, + narrow_crcs:1, + hole:1, + retry:2, + context:2; + }; + u16 _state; + }; + + struct bch_devs_list devs_have; + + struct extent_ptr_decoded pick; + + /* + * pos we read from - different from data_pos for indirect extents: + */ + u32 subvol; + struct bpos read_pos; + + /* + * start pos of data we read (may not be pos of data we want) - for + * promote, narrow extents paths: + */ + enum btree_id data_btree; + struct bpos data_pos; + struct bversion version; + + struct promote_op *promote; + + struct bch_io_opts opts; + + struct work_struct work; + + struct bio bio; +}; + +#define to_rbio(_bio) container_of((_bio), struct bch_read_bio, bio) + +struct bch_devs_mask; +struct cache_promote_op; +struct extent_ptr_decoded; + +int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, + struct bkey_buf *); + +static inline int bch2_read_indirect_extent(struct btree_trans *trans, + enum btree_id *data_btree, + unsigned *offset_into_extent, + struct bkey_buf *k) +{ + if (k->k->k.type != KEY_TYPE_reflink_p) + return 0; + + *data_btree = BTREE_ID_reflink; + return __bch2_read_indirect_extent(trans, offset_into_extent, k); +} + +enum bch_read_flags { + BCH_READ_RETRY_IF_STALE = 1 << 0, + BCH_READ_MAY_PROMOTE = 1 << 1, + BCH_READ_USER_MAPPED = 1 << 2, + BCH_READ_NODECODE = 1 << 3, + BCH_READ_LAST_FRAGMENT = 1 << 4, + + /* internal: */ + BCH_READ_MUST_BOUNCE = 1 << 5, + BCH_READ_MUST_CLONE = 1 << 6, + BCH_READ_IN_RETRY = 1 << 7, +}; + +int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, + struct bvec_iter, struct bpos, enum btree_id, + struct bkey_s_c, unsigned, + struct bch_io_failures *, unsigned); + +static inline void bch2_read_extent(struct btree_trans *trans, + struct bch_read_bio *rbio, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, + unsigned offset_into_extent, unsigned flags) +{ + __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, + data_btree, k, offset_into_extent, NULL, flags); +} + +void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, + subvol_inum, struct bch_io_failures *, unsigned flags); + +static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + subvol_inum inum) +{ + struct bch_io_failures failed = { .nr = 0 }; + + BUG_ON(rbio->_state); + + rbio->c = c; + rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; + + __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, + BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE| + BCH_READ_USER_MAPPED); +} + +static inline struct bch_read_bio *rbio_init(struct bio *bio, + struct bch_io_opts opts) +{ + struct bch_read_bio *rbio = to_rbio(bio); + + rbio->_state = 0; + rbio->promote = NULL; + rbio->opts = opts; + return rbio; +} + +void bch2_fs_io_read_exit(struct bch_fs *); +int bch2_fs_io_read_init(struct bch_fs *); + +#endif /* _BCACHEFS_IO_READ_H */ diff --git a/libbcachefs/io.c b/libbcachefs/io_write.c similarity index 53% rename from 
libbcachefs/io.c rename to libbcachefs/io_write.c index 3c614c86..d2a0de88 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io_write.c @@ -1,29 +1,24 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Some low level IO code, and hacks for various block layer limitations - * * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> * Copyright 2012 Google, Inc. */ #include "bcachefs.h" -#include "alloc_background.h" #include "alloc_foreground.h" #include "bkey_buf.h" #include "bset.h" #include "btree_update.h" #include "buckets.h" #include "checksum.h" -#include "compress.h" #include "clock.h" -#include "data_update.h" +#include "compress.h" #include "debug.h" -#include "disk_groups.h" #include "ec.h" #include "error.h" #include "extent_update.h" #include "inode.h" -#include "io.h" +#include "io_write.h" #include "journal.h" #include "keylist.h" #include "move.h" @@ -39,48 +34,8 @@ #include <linux/random.h> #include <linux/sched/mm.h> -const char *bch2_blk_status_to_str(blk_status_t status) -{ - if (status == BLK_STS_REMOVED) - return "device removed"; - return blk_status_to_str(status); -} - #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -static bool bch2_target_congested(struct bch_fs *c, u16 target) -{ - const struct bch_devs_mask *devs; - unsigned d, nr = 0, total = 0; - u64 now = local_clock(), last; - s64 congested; - struct bch_dev *ca; - - if (!target) - return false; - - rcu_read_lock(); - devs = bch2_target_to_mask(c, target) ?: - &c->rw_devs[BCH_DATA_user]; - - for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { - ca = rcu_dereference(c->devs[d]); - if (!ca) - continue; - - congested = atomic_read(&ca->congested); - last = READ_ONCE(ca->congested_last); - if (time_after64(now, last)) - congested -= (now - last) >> 12; - - total += max(congested, 0LL); - nr++; - } - rcu_read_unlock(); - - return bch2_rand_range(nr * CONGESTED_MAX) < total; -} - static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, u64 now, int rw) { @@ -136,13 +91,6 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); } -#else - -static bool bch2_target_congested(struct bch_fs *c, u16 target) -{ - return false; -} - #endif /* Allocate, free from mempool: */ @@ -368,213 +316,13 @@ int bch2_extent_update(struct btree_trans *trans, return 0; } -/* Overwrites whatever was present with zeroes: */ -int bch2_extent_fallocate(struct btree_trans *trans, - subvol_inum inum, - struct btree_iter *iter, - unsigned sectors, - struct bch_io_opts opts, - s64 *i_sectors_delta, - struct write_point_specifier write_point) -{ - struct bch_fs *c = trans->c; - struct disk_reservation disk_res = { 0 }; - struct closure cl; - struct open_buckets open_buckets = { 0 }; - struct bkey_s_c k; - struct bkey_buf old, new; - unsigned sectors_allocated = 0; - bool have_reservation = false; - bool unwritten = opts.nocow && - c->sb.version >= bcachefs_metadata_version_unwritten_extents; - int ret; - - bch2_bkey_buf_init(&old); - bch2_bkey_buf_init(&new); - closure_init_stack(&cl); - - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - return ret; - - sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); - - if (!have_reservation) { - unsigned new_replicas = - max(0, (int) opts.data_replicas - - (int) bch2_bkey_nr_ptrs_fully_allocated(k)); - /* - * Get a disk reservation before (in the nocow case) calling - * into the allocator: - */ - ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); - if 
(unlikely(ret)) - goto err; - - bch2_bkey_buf_reassemble(&old, c, k); - } - - if (have_reservation) { - if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) - goto err; - - bch2_key_resize(&new.k->k, sectors); - } else if (!unwritten) { - struct bkey_i_reservation *reservation; - - bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64)); - reservation = bkey_reservation_init(new.k); - reservation->k.p = iter->pos; - bch2_key_resize(&reservation->k, sectors); - reservation->v.nr_replicas = opts.data_replicas; - } else { - struct bkey_i_extent *e; - struct bch_devs_list devs_have; - struct write_point *wp; - struct bch_extent_ptr *ptr; - - devs_have.nr = 0; - - bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX); - - e = bkey_extent_init(new.k); - e->k.p = iter->pos; - - ret = bch2_alloc_sectors_start_trans(trans, - opts.foreground_target, - false, - write_point, - &devs_have, - opts.data_replicas, - opts.data_replicas, - BCH_WATERMARK_normal, 0, &cl, &wp); - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) - ret = -BCH_ERR_transaction_restart_nested; - if (ret) - goto err; - - sectors = min(sectors, wp->sectors_free); - sectors_allocated = sectors; - - bch2_key_resize(&e->k, sectors); - - bch2_open_bucket_get(c, wp, &open_buckets); - bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); - bch2_alloc_sectors_done(c, wp); - - extent_for_each_ptr(extent_i_to_s(e), ptr) - ptr->unwritten = true; - } - - have_reservation = true; - - ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, - 0, i_sectors_delta, true); -err: - if (!ret && sectors_allocated) - bch2_increment_clock(c, sectors_allocated, WRITE); - - bch2_open_buckets_put(c, &open_buckets); - bch2_disk_reservation_put(c, &disk_res); - bch2_bkey_buf_exit(&new, c); - bch2_bkey_buf_exit(&old, c); - - if (closure_nr_remaining(&cl) != 1) { - bch2_trans_unlock(trans); - closure_sync(&cl); - } - - return ret; -} - -/* - * Returns -BCH_ERR_transacton_restart if we had to drop locks: - */ -int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, - subvol_inum inum, u64 end, - s64 *i_sectors_delta) -{ - struct bch_fs *c = trans->c; - unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - struct bpos end_pos = POS(inum.inum, end); - struct bkey_s_c k; - int ret = 0, ret2 = 0; - u32 snapshot; - - while (!ret || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct bkey_i delete; - - if (ret) - ret2 = ret; - - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - continue; - - bch2_btree_iter_set_snapshot(iter, snapshot); - - /* - * peek_upto() doesn't have ideal semantics for extents: - */ - k = bch2_btree_iter_peek_upto(iter, end_pos); - if (!k.k) - break; - - ret = bkey_err(k); - if (ret) - continue; - - bkey_init(&delete.k); - delete.k.p = iter->pos; - - /* create the biggest key we can */ - bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end_pos, &delete); - - ret = bch2_extent_update(trans, inum, iter, &delete, - &disk_res, 0, i_sectors_delta, false); - bch2_disk_reservation_put(c, &disk_res); - } - - return ret ?: ret2; -} - -int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, - s64 *i_sectors_delta) -{ - struct btree_trans trans; - struct btree_iter iter; - int ret; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - POS(inum.inum, start), - BTREE_ITER_INTENT); - - 
ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta); - - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - - return ret; -} - static int bch2_write_index_default(struct bch_write_op *op) { struct bch_fs *c = op->c; struct bkey_buf sk; struct keylist *keys = &op->insert_keys; struct bkey_i *k = bch2_keylist_front(keys); - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; subvol_inum inum = { .subvol = op->subvol, @@ -585,30 +333,29 @@ static int bch2_write_index_default(struct bch_write_op *op) BUG_ON(!inum.subvol); bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); do { - bch2_trans_begin(&trans); + bch2_trans_begin(trans); k = bch2_keylist_front(keys); bch2_bkey_buf_copy(&sk, c, k); - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &sk.k->k.p.snapshot); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, bkey_start_pos(&sk.k->k), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - ret = bch2_extent_update(&trans, inum, &iter, sk.k, + ret = bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, op->flags & BCH_WRITE_CHECK_ENOSPC); - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -621,7 +368,7 @@ static int bch2_write_index_default(struct bch_write_op *op) bch2_cut_front(iter.pos, k); } while (!bch2_keylist_empty(keys)); - bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); return ret; @@ -741,7 +488,8 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) } /** - * bch_write_index - after a write, update index to point to new data + * __bch2_write_index - after a write, update index to point to new data + * @op: bch_write_op to process */ static void __bch2_write_index(struct bch_write_op *op) { @@ -778,10 +526,10 @@ static void __bch2_write_index(struct bch_write_op *op) op->written += sectors_start - keylist_sectors(keys); if (ret && !bch2_err_matches(ret, EROFS)) { - struct bkey_i *k = bch2_keylist_front(&op->insert_keys); + struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); bch_err_inum_offset_ratelimited(c, - k->k.p.inode, k->k.p.offset << 9, + insert->k.p.inode, insert->k.p.offset << 9, "write error while doing btree update: %s", bch2_err_str(ret)); } @@ -1182,7 +930,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, do { struct bch_extent_crc_unpacked crc = { 0 }; struct bversion version = op->version; - size_t dst_len, src_len; + size_t dst_len = 0, src_len = 0; if (page_alloc_failed && dst->bi_iter.bi_size < (wp->sectors_free << 9) && @@ -1414,27 +1162,25 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) { struct bch_fs *c = op->c; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_i *orig; struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c, 0, 0); - for_each_keylist_key(&op->insert_keys, orig) { - ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents, + ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents, 
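The bch2_write_index_default() hunk above shows the conversion this patch applies throughout: an on-stack struct btree_trans set up with bch2_trans_init() and torn down with bch2_trans_exit() becomes a pointer obtained from bch2_trans_get() and released with bch2_trans_put(). A minimal sketch of the new lifecycle, using only functions visible in this patch; do_one_update() is a hypothetical stand-in for the per-iteration work:

static int example_update(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	int ret;

	do {
		bch2_trans_begin(trans);
		ret = do_one_update(trans);	/* hypothetical helper */
	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));

	bch2_trans_put(trans);
	return ret;
}

As in the loop above, a transaction restart is not treated as an error: it is retried after calling bch2_trans_begin() again.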
bkey_start_pos(&orig->k), orig->k.p, BTREE_ITER_INTENT, k, NULL, NULL, BTREE_INSERT_NOFAIL, ({ - bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size); + bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); })); if (ret && !bch2_err_matches(ret, EROFS)) { - struct bkey_i *k = bch2_keylist_front(&op->insert_keys); + struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); bch_err_inum_offset_ratelimited(c, - k->k.p.inode, k->k.p.offset << 9, + insert->k.p.inode, insert->k.p.offset << 9, "write error while doing btree update: %s", bch2_err_str(ret)); } @@ -1445,7 +1191,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) } } - bch2_trans_exit(&trans); + bch2_trans_put(trans); } static void __bch2_nocow_write_done(struct bch_write_op *op) @@ -1469,7 +1215,7 @@ static void bch2_nocow_write_done(struct closure *cl) static void bch2_nocow_write(struct bch_write_op *op) { struct bch_fs *c = op->c; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; struct bkey_ptrs_c ptrs; @@ -1486,15 +1232,15 @@ static void bch2_nocow_write(struct bch_write_op *op) if (op->flags & BCH_WRITE_MOVE) return; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(&trans, op->subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot); if (unlikely(ret)) goto err; - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(op->pos.inode, op->pos.offset, snapshot), BTREE_ITER_SLOTS); while (1) { @@ -1540,7 +1286,7 @@ retry: /* Unlock before taking nocow locks, doing IO: */ bkey_reassemble(op->insert_keys.top, k); - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); bch2_cut_front(op->pos, op->insert_keys.top); if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) @@ -1589,7 +1335,7 @@ retry: bch2_btree_iter_advance(&iter); } out: - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; @@ -1604,7 +1350,7 @@ err: op->flags |= BCH_WRITE_DONE; } - bch2_trans_exit(&trans); + bch2_trans_put(trans); /* fallback to cow write path? 
*/ if (!(op->flags & BCH_WRITE_DONE)) { @@ -1682,7 +1428,7 @@ again: * allocations for specific disks may hang arbitrarily long: */ ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_alloc_sectors_start_trans(&trans, + bch2_alloc_sectors_start_trans(trans, op->target, op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), op->write_point, @@ -1798,7 +1544,8 @@ err: } /** - * bch_write - handle a write to a cache device or flash only volume + * bch2_write() - handle a write to a cache device or flash only volume + * @cl: &bch_write_op->cl * * This is the starting point for any data to end up in a cache device; it could * be from a normal write, or a writeback write, or a write to a flash only @@ -1899,1140 +1646,14 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) printbuf_indent_sub(out, 2); } -/* Cache promotion on read */ - -struct promote_op { - struct rcu_head rcu; - u64 start_time; - - struct rhash_head hash; - struct bpos pos; - - struct data_update write; - struct bio_vec bi_inline_vecs[0]; /* must be last */ -}; - -static const struct rhashtable_params bch_promote_params = { - .head_offset = offsetof(struct promote_op, hash), - .key_offset = offsetof(struct promote_op, pos), - .key_len = sizeof(struct bpos), -}; - -static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, - struct bpos pos, - struct bch_io_opts opts, - unsigned flags) +void bch2_fs_io_write_exit(struct bch_fs *c) { - if (!(flags & BCH_READ_MAY_PROMOTE)) - return false; - - if (!opts.promote_target) - return false; - - if (bch2_bkey_has_target(c, k, opts.promote_target)) - return false; - - if (bkey_extent_is_unwritten(k)) - return false; - - if (bch2_target_congested(c, opts.promote_target)) { - /* XXX trace this */ - return false; - } - - if (rhashtable_lookup_fast(&c->promote_table, &pos, - bch_promote_params)) - return false; - - return true; -} - -static void promote_free(struct bch_fs *c, struct promote_op *op) -{ - int ret; - - bch2_data_update_exit(&op->write); - - ret = rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params); - BUG_ON(ret); - bch2_write_ref_put(c, BCH_WRITE_REF_promote); - kfree_rcu(op, rcu); -} - -static void promote_done(struct bch_write_op *wop) -{ - struct promote_op *op = - container_of(wop, struct promote_op, write.op); - struct bch_fs *c = op->write.op.c; - - bch2_time_stats_update(&c->times[BCH_TIME_data_promote], - op->start_time); - promote_free(c, op); -} - -static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) -{ - struct bio *bio = &op->write.op.wbio.bio; - - trace_and_count(op->write.op.c, read_promote, &rbio->bio); - - /* we now own pages: */ - BUG_ON(!rbio->bounce); - BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); - - memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, - sizeof(struct bio_vec) * rbio->bio.bi_vcnt); - swap(bio->bi_vcnt, rbio->bio.bi_vcnt); - - bch2_data_update_read_done(&op->write, rbio->pick.crc); -} - -static struct promote_op *__promote_alloc(struct btree_trans *trans, - enum btree_id btree_id, - struct bkey_s_c k, - struct bpos pos, - struct extent_ptr_decoded *pick, - struct bch_io_opts opts, - unsigned sectors, - struct bch_read_bio **rbio) -{ - struct bch_fs *c = trans->c; - struct promote_op *op = NULL; - struct bio *bio; - unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); - int ret; - - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) - return NULL; - - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS); - if (!op) - goto err; - - op->start_time = 
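As the bch2_write() kernel-doc above notes, the write path is driven through the closure embedded in struct bch_write_op. A rough sketch of a synchronous caller, assuming the op's bio has already been loaded with data; the field values are illustrative, and bch2_write_op_init() (see the new io_write.h below) supplies the remaining defaults:

static int example_submit_write(struct bch_fs *c, struct bch_write_op *op,
				struct bch_io_opts opts)
{
	struct closure cl;

	closure_init_stack(&cl);

	bch2_write_op_init(op, c, opts);
	op->subvol	= 1;			/* illustrative */
	op->pos		= POS(42, 0);		/* illustrative */
	op->nr_replicas	= opts.data_replicas;
	op->write_point	= writepoint_hashed((unsigned long) current);

	/* op->wbio.bio must already point at the data to be written */
	closure_call(&op->cl, bch2_write, NULL, &cl);
	closure_sync(&cl);

	return op->error;
}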
local_clock(); - op->pos = pos; - - /* - * We don't use the mempool here because extents that aren't - * checksummed or compressed can be too big for the mempool: - */ - *rbio = kzalloc(sizeof(struct bch_read_bio) + - sizeof(struct bio_vec) * pages, - GFP_NOFS); - if (!*rbio) - goto err; - - rbio_init(&(*rbio)->bio, opts); - bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); - - if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, - GFP_NOFS)) - goto err; - - (*rbio)->bounce = true; - (*rbio)->split = true; - (*rbio)->kmalloc = true; - - if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, - bch_promote_params)) - goto err; - - bio = &op->write.op.wbio.bio; - bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); - - ret = bch2_data_update_init(trans, NULL, &op->write, - writepoint_hashed((unsigned long) current), - opts, - (struct data_update_opts) { - .target = opts.promote_target, - .extra_replicas = 1, - .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, - }, - btree_id, k); - /* - * possible errors: -BCH_ERR_nocow_lock_blocked, - * -BCH_ERR_ENOSPC_disk_reservation: - */ - if (ret) { - ret = rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params); - BUG_ON(ret); - goto err; - } - - op->write.op.end_io = promote_done; - - return op; -err: - if (*rbio) - bio_free_pages(&(*rbio)->bio); - kfree(*rbio); - *rbio = NULL; - kfree(op); - bch2_write_ref_put(c, BCH_WRITE_REF_promote); - return NULL; -} - -noinline -static struct promote_op *promote_alloc(struct btree_trans *trans, - struct bvec_iter iter, - struct bkey_s_c k, - struct extent_ptr_decoded *pick, - struct bch_io_opts opts, - unsigned flags, - struct bch_read_bio **rbio, - bool *bounce, - bool *read_full) -{ - struct bch_fs *c = trans->c; - bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); - /* data might have to be decompressed in the write path: */ - unsigned sectors = promote_full - ? max(pick->crc.compressed_size, pick->crc.live_size) - : bvec_iter_sectors(iter); - struct bpos pos = promote_full - ? bkey_start_pos(k.k) - : POS(k.k->p.inode, iter.bi_sector); - struct promote_op *promote; - - if (!should_promote(c, k, pos, opts, flags)) - return NULL; - - promote = __promote_alloc(trans, - k.k->type == KEY_TYPE_reflink_v - ? BTREE_ID_reflink - : BTREE_ID_extents, - k, pos, pick, opts, sectors, rbio); - if (!promote) - return NULL; - - *bounce = true; - *read_full = promote_full; - return promote; -} - -/* Read */ - -#define READ_RETRY_AVOID 1 -#define READ_RETRY 2 -#define READ_ERR 3 - -enum rbio_context { - RBIO_CONTEXT_NULL, - RBIO_CONTEXT_HIGHPRI, - RBIO_CONTEXT_UNBOUND, -}; - -static inline struct bch_read_bio * -bch2_rbio_parent(struct bch_read_bio *rbio) -{ - return rbio->split ? 
rbio->parent : rbio; -} - -__always_inline -static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, - enum rbio_context context, - struct workqueue_struct *wq) -{ - if (context <= rbio->context) { - fn(&rbio->work); - } else { - rbio->work.func = fn; - rbio->context = context; - queue_work(wq, &rbio->work); - } -} - -static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) -{ - BUG_ON(rbio->bounce && !rbio->split); - - if (rbio->promote) - promote_free(rbio->c, rbio->promote); - rbio->promote = NULL; - - if (rbio->bounce) - bch2_bio_free_pages_pool(rbio->c, &rbio->bio); - - if (rbio->split) { - struct bch_read_bio *parent = rbio->parent; - - if (rbio->kmalloc) - kfree(rbio); - else - bio_put(&rbio->bio); - - rbio = parent; - } - - return rbio; -} - -/* - * Only called on a top level bch_read_bio to complete an entire read request, - * not a split: - */ -static void bch2_rbio_done(struct bch_read_bio *rbio) -{ - if (rbio->start_time) - bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], - rbio->start_time); - bio_endio(&rbio->bio); -} - -static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, - struct bch_io_failures *failed, - unsigned flags) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_buf sk; - struct bkey_s_c k; - int ret; - - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; - - bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); - - bch2_trans_iter_init(&trans, &iter, rbio->data_btree, - rbio->read_pos, BTREE_ITER_SLOTS); -retry: - rbio->bio.bi_status = 0; - - k = bch2_btree_iter_peek_slot(&iter); - if (bkey_err(k)) - goto err; - - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - bch2_trans_unlock(&trans); - - if (!bch2_bkey_matches_ptr(c, k, - rbio->pick.ptr, - rbio->data_pos.offset - - rbio->pick.crc.offset)) { - /* extent we wanted to read no longer exists: */ - rbio->hole = true; - goto out; - } - - ret = __bch2_read_extent(&trans, rbio, bvec_iter, - rbio->read_pos, - rbio->data_btree, - k, 0, failed, flags); - if (ret == READ_RETRY) - goto retry; - if (ret) - goto err; -out: - bch2_rbio_done(rbio); - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&sk, c); - return; -err: - rbio->bio.bi_status = BLK_STS_IOERR; - goto out; -} - -static void bch2_rbio_retry(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bvec_iter iter = rbio->bvec_iter; - unsigned flags = rbio->flags; - subvol_inum inum = { - .subvol = rbio->subvol, - .inum = rbio->read_pos.inode, - }; - struct bch_io_failures failed = { .nr = 0 }; - - trace_and_count(c, read_retry, &rbio->bio); - - if (rbio->retry == READ_RETRY_AVOID) - bch2_mark_io_failure(&failed, &rbio->pick); - - rbio->bio.bi_status = 0; - - rbio = bch2_rbio_free(rbio); - - flags |= BCH_READ_IN_RETRY; - flags &= ~BCH_READ_MAY_PROMOTE; - - if (flags & BCH_READ_NODECODE) { - bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); - } else { - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; - - __bch2_read(c, rbio, iter, inum, &failed, flags); - } -} - -static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, - blk_status_t error) -{ - rbio->retry = retry; - - if (rbio->flags & BCH_READ_IN_RETRY) - return; - - if (retry == READ_ERR) { - rbio = bch2_rbio_free(rbio); - - rbio->bio.bi_status = error; - bch2_rbio_done(rbio); - } else { - 
bch2_rbio_punt(rbio, bch2_rbio_retry, - RBIO_CONTEXT_UNBOUND, system_unbound_wq); - } -} - -static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, - struct bch_read_bio *rbio) -{ - struct bch_fs *c = rbio->c; - u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; - struct bch_extent_crc_unpacked new_crc; - struct btree_iter iter; - struct bkey_i *new; - struct bkey_s_c k; - int ret = 0; - - if (crc_is_compressed(rbio->pick.crc)) - return 0; - - k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if ((ret = bkey_err(k))) - goto out; - - if (bversion_cmp(k.k->version, rbio->version) || - !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) - goto out; - - /* Extent was merged? */ - if (bkey_start_offset(k.k) < data_offset || - k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) - goto out; - - if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, - rbio->pick.crc, NULL, &new_crc, - bkey_start_offset(k.k) - data_offset, k.k->size, - rbio->pick.crc.csum_type)) { - bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); - ret = 0; - goto out; - } - - /* - * going to be temporarily appending another checksum entry: - */ - new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + - sizeof(struct bch_extent_crc128)); - if ((ret = PTR_ERR_OR_ZERO(new))) - goto out; - - bkey_reassemble(new, k); - - if (!bch2_bkey_narrow_crcs(new, new_crc)) - goto out; - - ret = bch2_trans_update(trans, &iter, new, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) -{ - bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, - __bch2_rbio_narrow_crcs(&trans, rbio)); -} - -/* Inner part that may run in process context */ -static void __bch2_read_endio(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); - struct bio *src = &rbio->bio; - struct bio *dst = &bch2_rbio_parent(rbio)->bio; - struct bvec_iter dst_iter = rbio->bvec_iter; - struct bch_extent_crc_unpacked crc = rbio->pick.crc; - struct nonce nonce = extent_nonce(rbio->version, crc); - unsigned nofs_flags; - struct bch_csum csum; - int ret; - - nofs_flags = memalloc_nofs_save(); - - /* Reset iterator for checksumming and copying bounced data: */ - if (rbio->bounce) { - src->bi_iter.bi_size = crc.compressed_size << 9; - src->bi_iter.bi_idx = 0; - src->bi_iter.bi_bvec_done = 0; - } else { - src->bi_iter = rbio->bvec_iter; - } - - csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); - if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) - goto csum_err; - - /* - * XXX - * We need to rework the narrow_crcs path to deliver the read completion - * first, and then punt to a different workqueue, otherwise we're - * holding up reads while doing btree updates which is bad for memory - * reclaim. 
- */ - if (unlikely(rbio->narrow_crcs)) - bch2_rbio_narrow_crcs(rbio); - - if (rbio->flags & BCH_READ_NODECODE) - goto nodecode; - - /* Adjust crc to point to subset of data we want: */ - crc.offset += rbio->offset_into_extent; - crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - - if (crc_is_compressed(crc)) { - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; - - if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && - !c->opts.no_data_io) - goto decompression_err; - } else { - /* don't need to decrypt the entire bio: */ - nonce = nonce_add(nonce, crc.offset << 9); - bio_advance(src, crc.offset << 9); - - BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); - src->bi_iter.bi_size = dst_iter.bi_size; - - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; - - if (rbio->bounce) { - struct bvec_iter src_iter = src->bi_iter; - - bio_copy_data_iter(dst, &dst_iter, src, &src_iter); - } - } - - if (rbio->promote) { - /* - * Re encrypt data we decrypted, so it's consistent with - * rbio->crc: - */ - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; - - promote_start(rbio->promote, rbio); - rbio->promote = NULL; - } -nodecode: - if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { - rbio = bch2_rbio_free(rbio); - bch2_rbio_done(rbio); - } -out: - memalloc_nofs_restore(nofs_flags); - return; -csum_err: - /* - * Checksum error: if the bio wasn't bounced, we may have been - * reading into buffers owned by userspace (that userspace can - * scribble over) - retry the read, bouncing it this time: - */ - if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { - rbio->flags |= BCH_READ_MUST_BOUNCE; - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); - goto out; - } - - bch_err_inum_offset_ratelimited(ca, - rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", - rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, - csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); - bch2_io_error(ca); - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); - goto out; -decompression_err: - bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "decompression error"); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); - goto out; -decrypt_err: - bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "decrypt error"); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); - goto out; -} - -static void bch2_read_endio(struct bio *bio) -{ - struct bch_read_bio *rbio = - container_of(bio, struct bch_read_bio, bio); - struct bch_fs *c = rbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); - struct workqueue_struct *wq = NULL; - enum rbio_context context = RBIO_CONTEXT_NULL; - - if (rbio->have_ioref) { - bch2_latency_acct(ca, rbio->submit_time, READ); - percpu_ref_put(&ca->io_ref); - } - - if (!rbio->split) - rbio->bio.bi_end_io = rbio->end_io; - - if (bch2_dev_inum_io_err_on(bio->bi_status, ca, - rbio->read_pos.inode, - rbio->read_pos.offset, - "data read error: %s", - bch2_blk_status_to_str(bio->bi_status))) { - bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); - return; - } - - if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || - ptr_stale(ca, &rbio->pick.ptr)) { - trace_and_count(c, read_reuse_race, &rbio->bio); - - if (rbio->flags & BCH_READ_RETRY_IF_STALE) - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); - else - bch2_rbio_error(rbio, READ_ERR, 
BLK_STS_AGAIN); - return; - } - - if (rbio->narrow_crcs || - rbio->promote || - crc_is_compressed(rbio->pick.crc) || - bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) - context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; - else if (rbio->pick.crc.csum_type) - context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; - - bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); -} - -int __bch2_read_indirect_extent(struct btree_trans *trans, - unsigned *offset_into_extent, - struct bkey_buf *orig_k) -{ - struct btree_iter iter; - struct bkey_s_c k; - u64 reflink_offset; - int ret; - - reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + - *offset_into_extent; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, - POS(0, reflink_offset), 0); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_reflink_v && - k.k->type != KEY_TYPE_indirect_inline_data) { - bch_err_inum_offset_ratelimited(trans->c, - orig_k->k->k.p.inode, - orig_k->k->k.p.offset << 9, - "%llu len %u points to nonexistent indirect extent %llu", - orig_k->k->k.p.offset, - orig_k->k->k.size, - reflink_offset); - bch2_inconsistent_error(trans->c); - ret = -EIO; - goto err; - } - - *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); - bch2_bkey_buf_reassemble(orig_k, trans->c, k); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, - struct bkey_s_c k, - struct bch_extent_ptr ptr) -{ - struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); - struct btree_iter iter; - struct printbuf buf = PRINTBUF; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - PTR_BUCKET_POS(c, &ptr), - BTREE_ITER_CACHED); - - prt_printf(&buf, "Attempting to read from stale dirty pointer:"); - printbuf_indent_add(&buf, 2); - prt_newline(&buf); - - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - - prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); - - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - if (!ret) { - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k); - } - - bch2_fs_inconsistent(c, "%s", buf.buf); - - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); -} - -int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - struct bvec_iter iter, struct bpos read_pos, - enum btree_id data_btree, struct bkey_s_c k, - unsigned offset_into_extent, - struct bch_io_failures *failed, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct extent_ptr_decoded pick; - struct bch_read_bio *rbio = NULL; - struct bch_dev *ca = NULL; - struct promote_op *promote = NULL; - bool bounce = false, read_full = false, narrow_crcs = false; - struct bpos data_pos = bkey_start_pos(k.k); - int pick_ret; - - if (bkey_extent_is_inline_data(k.k)) { - unsigned bytes = min_t(unsigned, iter.bi_size, - bkey_inline_data_bytes(k.k)); - - swap(iter.bi_size, bytes); - memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); - swap(iter.bi_size, bytes); - bio_advance_iter(&orig->bio, &iter, bytes); - zero_fill_bio_iter(&orig->bio, iter); - goto out_read_done; - } -retry_pick: - pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); - - /* hole or reservation - just zero fill: */ - if (!pick_ret) - goto hole; - - if (pick_ret < 0) { - bch_err_inum_offset_ratelimited(c, - read_pos.inode, read_pos.offset << 9, - "no device to read from"); - goto err; - } - - ca = bch_dev_bkey_exists(c, pick.ptr.dev); - 
- /* - * Stale dirty pointers are treated as IO errors, but @failed isn't - * allocated unless we're in the retry path - so if we're not in the - * retry path, don't check here, it'll be caught in bch2_read_endio() - * and we'll end up in the retry path: - */ - if ((flags & BCH_READ_IN_RETRY) && - !pick.ptr.cached && - unlikely(ptr_stale(ca, &pick.ptr))) { - read_from_stale_dirty_pointer(trans, k, pick.ptr); - bch2_mark_io_failure(failed, &pick); - goto retry_pick; - } - - /* - * Unlock the iterator while the btree node's lock is still in - * cache, before doing the IO: - */ - bch2_trans_unlock(trans); - - if (flags & BCH_READ_NODECODE) { - /* - * can happen if we retry, and the extent we were going to read - * has been merged in the meantime: - */ - if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) - goto hole; - - iter.bi_size = pick.crc.compressed_size << 9; - goto get_bio; - } - - if (!(flags & BCH_READ_LAST_FRAGMENT) || - bio_flagged(&orig->bio, BIO_CHAIN)) - flags |= BCH_READ_MUST_CLONE; - - narrow_crcs = !(flags & BCH_READ_IN_RETRY) && - bch2_can_narrow_extent_crcs(k, pick.crc); - - if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) - flags |= BCH_READ_MUST_BOUNCE; - - EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); - - if (crc_is_compressed(pick.crc) || - (pick.crc.csum_type != BCH_CSUM_none && - (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || - (bch2_csum_type_is_encryption(pick.crc.csum_type) && - (flags & BCH_READ_USER_MAPPED)) || - (flags & BCH_READ_MUST_BOUNCE)))) { - read_full = true; - bounce = true; - } - - if (orig->opts.promote_target) - promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, - &rbio, &bounce, &read_full); - - if (!read_full) { - EBUG_ON(crc_is_compressed(pick.crc)); - EBUG_ON(pick.crc.csum_type && - (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || - bvec_iter_sectors(iter) != pick.crc.live_size || - pick.crc.offset || - offset_into_extent)); - - data_pos.offset += offset_into_extent; - pick.ptr.offset += pick.crc.offset + - offset_into_extent; - offset_into_extent = 0; - pick.crc.compressed_size = bvec_iter_sectors(iter); - pick.crc.uncompressed_size = bvec_iter_sectors(iter); - pick.crc.offset = 0; - pick.crc.live_size = bvec_iter_sectors(iter); - offset_into_extent = 0; - } -get_bio: - if (rbio) { - /* - * promote already allocated bounce rbio: - * promote needs to allocate a bio big enough for uncompressing - * data in the write path, but we're not going to use it all - * here: - */ - EBUG_ON(rbio->bio.bi_iter.bi_size < - pick.crc.compressed_size << 9); - rbio->bio.bi_iter.bi_size = - pick.crc.compressed_size << 9; - } else if (bounce) { - unsigned sectors = pick.crc.compressed_size; - - rbio = rbio_init(bio_alloc_bioset(NULL, - DIV_ROUND_UP(sectors, PAGE_SECTORS), - 0, - GFP_NOFS, - &c->bio_read_split), - orig->opts); - - bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); - rbio->bounce = true; - rbio->split = true; - } else if (flags & BCH_READ_MUST_CLONE) { - /* - * Have to clone if there were any splits, due to error - * reporting issues (if a split errored, and retrying didn't - * work, when it reports the error to its parent (us) we don't - * know if the error was from our bio, and we should retry, or - * from the whole bio, in which case we don't want to retry and - * lose the error) - */ - rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, - &c->bio_read_split), - orig->opts); - rbio->bio.bi_iter = iter; - rbio->split = true; - } else { - rbio = orig; - 
rbio->bio.bi_iter = iter; - EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); - } - - EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); - - rbio->c = c; - rbio->submit_time = local_clock(); - if (rbio->split) - rbio->parent = orig; - else - rbio->end_io = orig->bio.bi_end_io; - rbio->bvec_iter = iter; - rbio->offset_into_extent= offset_into_extent; - rbio->flags = flags; - rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); - rbio->narrow_crcs = narrow_crcs; - rbio->hole = 0; - rbio->retry = 0; - rbio->context = 0; - /* XXX: only initialize this if needed */ - rbio->devs_have = bch2_bkey_devs(k); - rbio->pick = pick; - rbio->subvol = orig->subvol; - rbio->read_pos = read_pos; - rbio->data_btree = data_btree; - rbio->data_pos = data_pos; - rbio->version = k.k->version; - rbio->promote = promote; - INIT_WORK(&rbio->work, NULL); - - rbio->bio.bi_opf = orig->bio.bi_opf; - rbio->bio.bi_iter.bi_sector = pick.ptr.offset; - rbio->bio.bi_end_io = bch2_read_endio; - - if (rbio->bounce) - trace_and_count(c, read_bounce, &rbio->bio); - - this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); - bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - - /* - * If it's being moved internally, we don't want to flag it as a cache - * hit: - */ - if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) - bch2_bucket_io_time_reset(trans, pick.ptr.dev, - PTR_BUCKET_NR(ca, &pick.ptr), READ); - - if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { - bio_inc_remaining(&orig->bio); - trace_and_count(c, read_split, &orig->bio); - } - - if (!rbio->pick.idx) { - if (!rbio->have_ioref) { - bch_err_inum_offset_ratelimited(c, - read_pos.inode, - read_pos.offset << 9, - "no device to read from"); - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); - goto out; - } - - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], - bio_sectors(&rbio->bio)); - bio_set_dev(&rbio->bio, ca->disk_sb.bdev); - - if (unlikely(c->opts.no_data_io)) { - if (likely(!(flags & BCH_READ_IN_RETRY))) - bio_endio(&rbio->bio); - } else { - if (likely(!(flags & BCH_READ_IN_RETRY))) - submit_bio(&rbio->bio); - else - submit_bio_wait(&rbio->bio); - } - - /* - * We just submitted IO which may block, we expect relock fail - * events and shouldn't count them: - */ - trans->notrace_relock_fail = true; - } else { - /* Attempting reconstruct read: */ - if (bch2_ec_read_extent(c, rbio)) { - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); - goto out; - } - - if (likely(!(flags & BCH_READ_IN_RETRY))) - bio_endio(&rbio->bio); - } -out: - if (likely(!(flags & BCH_READ_IN_RETRY))) { - return 0; - } else { - int ret; - - rbio->context = RBIO_CONTEXT_UNBOUND; - bch2_read_endio(&rbio->bio); - - ret = rbio->retry; - rbio = bch2_rbio_free(rbio); - - if (ret == READ_RETRY_AVOID) { - bch2_mark_io_failure(failed, &pick); - ret = READ_RETRY; - } - - if (!ret) - goto out_read_done; - - return ret; - } - -err: - if (flags & BCH_READ_IN_RETRY) - return READ_ERR; - - orig->bio.bi_status = BLK_STS_IOERR; - goto out_read_done; - -hole: - /* - * won't normally happen in the BCH_READ_NODECODE - * (bch2_move_extent()) path, but if we retry and the extent we wanted - * to read no longer exists we have to signal that: - */ - if (flags & BCH_READ_NODECODE) - orig->hole = true; - - zero_fill_bio_iter(&orig->bio, iter); -out_read_done: - if (flags & BCH_READ_LAST_FRAGMENT) - bch2_rbio_done(orig); - return 0; -} - -void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, subvol_inum inum, - 
struct bch_io_failures *failed, unsigned flags) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_buf sk; - struct bkey_s_c k; - u32 snapshot; - int ret; - - BUG_ON(flags & BCH_READ_NODECODE); - - bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); - iter = (struct btree_iter) { NULL }; - - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); - if (ret) - goto err; - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - SPOS(inum.inum, bvec_iter.bi_sector, snapshot), - BTREE_ITER_SLOTS); - while (1) { - unsigned bytes, sectors, offset_into_extent; - enum btree_id data_btree = BTREE_ID_extents; - - /* - * read_extent -> io_time_reset may cause a transaction restart - * without returning an error, we need to check for that here: - */ - ret = bch2_trans_relock(&trans); - if (ret) - break; - - bch2_btree_iter_set_pos(&iter, - POS(inum.inum, bvec_iter.bi_sector)); - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - break; - - offset_into_extent = iter.pos.offset - - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; - - bch2_bkey_buf_reassemble(&sk, c, k); - - ret = bch2_read_indirect_extent(&trans, &data_btree, - &offset_into_extent, &sk); - if (ret) - break; - - k = bkey_i_to_s_c(sk.k); - - /* - * With indirect extents, the amount of data to read is the min - * of the original extent and the indirect extent: - */ - sectors = min(sectors, k.k->size - offset_into_extent); - - bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; - swap(bvec_iter.bi_size, bytes); - - if (bvec_iter.bi_size == bytes) - flags |= BCH_READ_LAST_FRAGMENT; - - ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos, - data_btree, k, - offset_into_extent, failed, flags); - if (ret) - break; - - if (flags & BCH_READ_LAST_FRAGMENT) - break; - - swap(bvec_iter.bi_size, bytes); - bio_advance_iter(&rbio->bio, &bvec_iter, bytes); - - ret = btree_trans_too_many_iters(&trans); - if (ret) - break; - } -err: - bch2_trans_iter_exit(&trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - ret == READ_RETRY || - ret == READ_RETRY_AVOID) - goto retry; - - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&sk, c); - - if (ret) { - bch_err_inum_offset_ratelimited(c, inum.inum, - bvec_iter.bi_sector << 9, - "read error %i from btree lookup", ret); - rbio->bio.bi_status = BLK_STS_IOERR; - bch2_rbio_done(rbio); - } -} - -void bch2_fs_io_exit(struct bch_fs *c) -{ - if (c->promote_table.tbl) - rhashtable_destroy(&c->promote_table); mempool_exit(&c->bio_bounce_pages); bioset_exit(&c->bio_write); - bioset_exit(&c->bio_read_split); - bioset_exit(&c->bio_read); } -int bch2_fs_io_init(struct bch_fs *c) +int bch2_fs_io_write_init(struct bch_fs *c) { - if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_bio_read_init; - - if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_bio_read_split_init; - if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS)) return -BCH_ERR_ENOMEM_bio_write_init; @@ -3044,8 +1665,5 @@ int bch2_fs_io_init(struct bch_fs *c) PAGE_SIZE, 0)) return -BCH_ERR_ENOMEM_bio_bounce_pages_init; - if (rhashtable_init(&c->promote_table, &bch_promote_params)) - return -BCH_ERR_ENOMEM_promote_table_init; - return 0; } diff --git a/libbcachefs/io_write.h b/libbcachefs/io_write.h new file mode 100644 index 00000000..93231672 
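With the read side gone from this file, bch2_fs_io_write_init() above keeps only the write bioset and the bounce-page mempool; the bio_read/bio_read_split biosets and the promote table presumably move to the new io_read.c (per the diffstat). Note the error convention: each failed allocation returns its own private errcode rather than a bare -ENOMEM, so the failing pool is identifiable from the return value alone. A sketch of extending the pattern; bio_example and its errcode are hypothetical:

	if (bioset_init(&c->bio_example, 1,
			offsetof(struct bch_write_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_example_init;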
--- /dev/null +++ b/libbcachefs/io_write.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_WRITE_H +#define _BCACHEFS_IO_WRITE_H + +#include "checksum.h" +#include "io_write_types.h" + +#define to_wbio(_bio) \ + container_of((_bio), struct bch_write_bio, bio) + +void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); +void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +void bch2_latency_acct(struct bch_dev *, u64, int); +#else +static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} +#endif + +void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, + enum bch_data_type, const struct bkey_i *, bool); + +#define BCH_WRITE_FLAGS() \ + x(ALLOC_NOWAIT) \ + x(CACHED) \ + x(DATA_ENCODED) \ + x(PAGES_STABLE) \ + x(PAGES_OWNED) \ + x(ONLY_SPECIFIED_DEVS) \ + x(WROTE_DATA_INLINE) \ + x(FROM_INTERNAL) \ + x(CHECK_ENOSPC) \ + x(SYNC) \ + x(MOVE) \ + x(IN_WORKER) \ + x(DONE) \ + x(IO_ERROR) \ + x(CONVERT_UNWRITTEN) + +enum __bch_write_flags { +#define x(f) __BCH_WRITE_##f, + BCH_WRITE_FLAGS() +#undef x +}; + +enum bch_write_flags { +#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), + BCH_WRITE_FLAGS() +#undef x +}; + +static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) +{ + return op->watermark == BCH_WATERMARK_copygc + ? op->c->copygc_wq + : op->c->btree_update_wq; +} + +int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, + struct bkey_i *, bool *, s64 *, s64 *); +int bch2_extent_update(struct btree_trans *, subvol_inum, + struct btree_iter *, struct bkey_i *, + struct disk_reservation *, u64, s64 *, bool); + +static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, + struct bch_io_opts opts) +{ + op->c = c; + op->end_io = NULL; + op->flags = 0; + op->written = 0; + op->error = 0; + op->csum_type = bch2_data_checksum_type(c, opts); + op->compression_opt = opts.compression; + op->nr_replicas = 0; + op->nr_replicas_required = c->opts.data_replicas_required; + op->watermark = BCH_WATERMARK_normal; + op->incompressible = 0; + op->open_buckets.nr = 0; + op->devs_have.nr = 0; + op->target = 0; + op->opts = opts; + op->subvol = 0; + op->pos = POS_MAX; + op->version = ZERO_VERSION; + op->write_point = (struct write_point_specifier) { 0 }; + op->res = (struct disk_reservation) { 0 }; + op->new_i_size = U64_MAX; + op->i_sectors_delta = 0; + op->devs_need_flush = NULL; +} + +void bch2_write(struct closure *); + +void bch2_write_point_do_index_updates(struct work_struct *); + +static inline struct bch_write_bio *wbio_init(struct bio *bio) +{ + struct bch_write_bio *wbio = to_wbio(bio); + + memset(&wbio->wbio, 0, sizeof(wbio->wbio)); + return wbio; +} + +void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *); + +void bch2_fs_io_write_exit(struct bch_fs *); +int bch2_fs_io_write_init(struct bch_fs *); + +#endif /* _BCACHEFS_IO_WRITE_H */ diff --git a/libbcachefs/io_types.h b/libbcachefs/io_write_types.h similarity index 54% rename from libbcachefs/io_types.h rename to libbcachefs/io_write_types.h index 737f16d7..c7f97c2c 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_write_types.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IO_TYPES_H -#define _BCACHEFS_IO_TYPES_H +#ifndef _BCACHEFS_IO_WRITE_TYPES_H +#define _BCACHEFS_IO_WRITE_TYPES_H #include "alloc_types.h" #include "btree_types.h" @@ -13,75 +13,6 @@ #include <linux/llist.h> #include 
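In io_write.h above, BCH_WRITE_FLAGS() is an x-macro: a single list of flag names generates both the bit-index enum and the mask enum, so the two cannot drift apart. Expanded by hand for the first two entries, the declarations are equivalent to:

enum __bch_write_flags {
	__BCH_WRITE_ALLOC_NOWAIT,		/* == 0 */
	__BCH_WRITE_CACHED,			/* == 1 */
	/* ... one entry per x() line ... */
};

enum bch_write_flags {
	BCH_WRITE_ALLOC_NOWAIT	= BIT(__BCH_WRITE_ALLOC_NOWAIT),
	BCH_WRITE_CACHED	= BIT(__BCH_WRITE_CACHED),
	/* ... */
};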
<linux/workqueue.h> -struct bch_read_bio { - struct bch_fs *c; - u64 start_time; - u64 submit_time; - - /* - * Reads will often have to be split, and if the extent being read from - * was checksummed or compressed we'll also have to allocate bounce - * buffers and copy the data back into the original bio. - * - * If we didn't have to split, we have to save and restore the original - * bi_end_io - @split below indicates which: - */ - union { - struct bch_read_bio *parent; - bio_end_io_t *end_io; - }; - - /* - * Saved copy of bio->bi_iter, from submission time - allows us to - * resubmit on IO error, and also to copy data back to the original bio - * when we're bouncing: - */ - struct bvec_iter bvec_iter; - - unsigned offset_into_extent; - - u16 flags; - union { - struct { - u16 bounce:1, - split:1, - kmalloc:1, - have_ioref:1, - narrow_crcs:1, - hole:1, - retry:2, - context:2; - }; - u16 _state; - }; - - struct bch_devs_list devs_have; - - struct extent_ptr_decoded pick; - - /* - * pos we read from - different from data_pos for indirect extents: - */ - u32 subvol; - struct bpos read_pos; - - /* - * start pos of data we read (may not be pos of data we want) - for - * promote, narrow extents paths: - */ - enum btree_id data_btree; - struct bpos data_pos; - struct bversion version; - - struct promote_op *promote; - - struct bch_io_opts opts; - - struct work_struct work; - - struct bio bio; -}; - struct bch_write_bio { struct_group(wbio, struct bch_fs *c; @@ -162,4 +93,4 @@ struct bch_write_op { struct bch_write_bio wbio; }; -#endif /* _BCACHEFS_IO_TYPES_H */ +#endif /* _BCACHEFS_IO_WRITE_TYPES_H */ diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 055920c2..fc3dd5be 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -132,13 +132,21 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) return stuck; } -/* journal entry close/open: */ - -void __bch2_journal_buf_put(struct journal *j) +/* + * Final processing when the last reference of a journal buffer has been + * dropped. Drop the pin list reference acquired at journal entry open and write + * the buffer, if requested. 
+ */ +void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + lockdep_assert_held(&j->lock); + + if (__bch2_journal_pin_put(j, seq)) + bch2_journal_reclaim_fast(j); + if (write) + closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); } /* @@ -204,13 +212,11 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val) buf->data->last_seq = cpu_to_le64(buf->last_seq); BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq)); - __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); - cancel_delayed_work(&j->write_work); bch2_journal_space_available(j); - bch2_journal_buf_put(j, old.idx); + __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq)); } void bch2_journal_halt(struct journal *j) @@ -588,8 +594,13 @@ out: /** * bch2_journal_flush_seq_async - wait for a journal entry to be written + * @j: journal object + * @seq: seq to flush + * @parent: closure object to wait with + * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed, + * -EIO if @seq will never be flushed * - * like bch2_journal_wait_on_seq, except that it triggers a write immediately if + * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if * necessary */ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, @@ -829,12 +840,12 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, break; ret = bch2_trans_run(c, - bch2_trans_mark_metadata_bucket(&trans, ca, + bch2_trans_mark_metadata_bucket(trans, ca, ob[nr_got]->bucket, BCH_DATA_journal, ca->mi.bucket_size)); if (ret) { bch2_open_bucket_put(c, ob[nr_got]); - bch_err(c, "error marking new journal buckets: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "marking new journal buckets"); break; } @@ -910,7 +921,7 @@ err_unblock: if (ret && !new_fs) for (i = 0; i < nr_got; i++) bch2_trans_run(c, - bch2_trans_mark_metadata_bucket(&trans, ca, + bch2_trans_mark_metadata_bucket(trans, ca, bu[i], BCH_DATA_free, 0)); err_free: if (!new_fs) @@ -944,7 +955,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, goto unlock; while (ja->nr < nr) { - struct disk_reservation disk_res = { 0, 0 }; + struct disk_reservation disk_res = { 0, 0, 0 }; /* * note: journal buckets aren't really counted as _sectors_ used yet, so diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 008a2e25..491133cc 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -252,9 +252,10 @@ static inline bool journal_entry_empty(struct jset *j) return true; } -void __bch2_journal_buf_put(struct journal *); - -static inline void bch2_journal_buf_put(struct journal *j, unsigned idx) +/* + * Drop reference on a buffer index and return true if the count has hit zero. 
+ */ +static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx) { union journal_res_state s; @@ -264,9 +265,30 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx) .buf2_count = idx == 2, .buf3_count = idx == 3, }).v, &j->reservations.counter); + return s; +} - if (!journal_state_count(s, idx) && idx == s.unwritten_idx) - __bch2_journal_buf_put(j); +void bch2_journal_buf_put_final(struct journal *, u64, bool); + +static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) +{ + union journal_res_state s; + + s = journal_state_buf_put(j, idx); + if (!journal_state_count(s, idx)) + bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); +} + +static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) +{ + union journal_res_state s; + + s = journal_state_buf_put(j, idx); + if (!journal_state_count(s, idx)) { + spin_lock(&j->lock); + bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); + spin_unlock(&j->lock); + } } /* @@ -286,7 +308,7 @@ static inline void bch2_journal_res_put(struct journal *j, BCH_JSET_ENTRY_btree_keys, 0, 0, 0); - bch2_journal_buf_put(j, res->idx); + bch2_journal_buf_put(j, res->idx, res->seq); res->ref = 0; } diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 34740dca..6a3d6a37 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -8,7 +8,6 @@ #include "checksum.h" #include "disk_groups.h" #include "error.h" -#include "io.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" @@ -238,17 +237,17 @@ static void journal_entry_err_msg(struct printbuf *out, #define journal_entry_err(c, version, jset, entry, msg, ...) \ ({ \ - struct printbuf buf = PRINTBUF; \ + struct printbuf _buf = PRINTBUF; \ \ - journal_entry_err_msg(&buf, version, jset, entry); \ - prt_printf(&buf, msg, ##__VA_ARGS__); \ + journal_entry_err_msg(&_buf, version, jset, entry); \ + prt_printf(&_buf, msg, ##__VA_ARGS__); \ \ switch (flags & BKEY_INVALID_WRITE) { \ case READ: \ - mustfix_fsck_err(c, "%s", buf.buf); \ + mustfix_fsck_err(c, "%s", _buf.buf); \ break; \ case WRITE: \ - bch_err(c, "corrupt metadata before write: %s\n", buf.buf);\ + bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\ if (bch2_fs_inconsistent(c)) { \ ret = -BCH_ERR_fsck_errors_not_fixed; \ goto fsck_err; \ @@ -256,7 +255,7 @@ static void journal_entry_err_msg(struct printbuf *out, break; \ } \ \ - printbuf_exit(&buf); \ + printbuf_exit(&_buf); \ true; \ }) @@ -1282,7 +1281,7 @@ int bch2_journal_read(struct bch_fs *c, continue; for (ptr = 0; ptr < i->nr_ptrs; ptr++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); + ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); if (!i->ptrs[ptr].csum_good) bch_err_dev_offset(ca, i->ptrs[ptr].sector, @@ -1380,16 +1379,21 @@ static void __journal_write_alloc(struct journal *j, } /** - * journal_next_bucket - move on to the next journal bucket if possible + * journal_write_alloc - decide where to write next journal entry + * + * @j: journal object + * @w: journal buf (entry to be written) + * + * Returns: 0 on success, or -EROFS on failure */ -static int journal_write_alloc(struct journal *j, struct journal_buf *w, - unsigned sectors) +static int journal_write_alloc(struct journal *j, struct journal_buf *w) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_devs_mask devs; struct journal_device *ja; struct bch_dev *ca; struct dev_alloc_list devs_sorted; + unsigned sectors = 
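The journal_entry_err() hunk above renames the macro-local printbuf from buf to _buf. This is macro hygiene: journal_entry_err() is a statement expression, so a local named buf shadows any buf at the call site, and a caller passing something like buf.buf as an argument would silently read the macro's own (empty) printbuf instead. A minimal illustration of the failure mode the underscore prevents; EXAMPLE_ERR is a hypothetical macro of the same shape:

#define EXAMPLE_ERR(c, msg)					\
({								\
	struct printbuf buf = PRINTBUF;	/* shadows caller's "buf" */ \
	prt_printf(&buf, "error: %s", (msg));			\
	bch_err(c, "%s", buf.buf);				\
	printbuf_exit(&buf);					\
})

	/* at the call site: */
	struct printbuf buf = PRINTBUF;
	prt_printf(&buf, "useful context");
	EXAMPLE_ERR(c, buf.buf);	/* (msg) now names the macro's
					   empty printbuf - context lost */
	printbuf_exit(&buf);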
vstruct_sectors(w->data, c->block_bits); unsigned target = c->opts.metadata_target ?: c->opts.foreground_target; unsigned i, replicas = 0, replicas_want = @@ -1550,6 +1554,7 @@ static void journal_write_done(struct closure *cl) if (!journal_state_count(new, new.unwritten_idx) && journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { + spin_unlock(&j->lock); closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { @@ -1562,10 +1567,11 @@ static void journal_write_done(struct closure *cl) * might want to be written now: */ + spin_unlock(&j->lock); mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); + } else { + spin_unlock(&j->lock); } - - spin_unlock(&j->lock); } static void journal_write_endio(struct bio *bio) @@ -1813,7 +1819,7 @@ void bch2_journal_write(struct closure *cl) retry_alloc: spin_lock(&j->lock); - ret = journal_write_alloc(j, w, sectors); + ret = journal_write_alloc(j, w); if (ret && j->can_discard) { spin_unlock(&j->lock); diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 10e1860d..9a584aaa 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -290,9 +290,8 @@ void bch2_journal_do_discards(struct journal *j) * entry, holding it open to ensure it gets replayed during recovery: */ -static void bch2_journal_reclaim_fast(struct journal *j) +void bch2_journal_reclaim_fast(struct journal *j) { - struct journal_entry_pin_list temp; bool popped = false; lockdep_assert_held(&j->lock); @@ -303,7 +302,7 @@ static void bch2_journal_reclaim_fast(struct journal *j) */ while (!fifo_empty(&j->pin) && !atomic_read(&fifo_peek_front(&j->pin).count)) { - fifo_pop(&j->pin, temp); + j->pin.front++; popped = true; } @@ -311,19 +310,16 @@ static void bch2_journal_reclaim_fast(struct journal *j) bch2_journal_space_available(j); } -void __bch2_journal_pin_put(struct journal *j, u64 seq) +bool __bch2_journal_pin_put(struct journal *j, u64 seq) { struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); - if (atomic_dec_and_test(&pin_list->count)) - bch2_journal_reclaim_fast(j); + return atomic_dec_and_test(&pin_list->count); } void bch2_journal_pin_put(struct journal *j, u64 seq) { - struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); - - if (atomic_dec_and_test(&pin_list->count)) { + if (__bch2_journal_pin_put(j, seq)) { spin_lock(&j->lock); bch2_journal_reclaim_fast(j); spin_unlock(&j->lock); @@ -419,6 +415,8 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, /** * bch2_journal_pin_flush: ensure journal pin callback is no longer running + * @j: journal object + * @pin: pin to flush */ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) { @@ -579,7 +577,11 @@ static u64 journal_seq_to_flush(struct journal *j) } /** - * bch2_journal_reclaim - free up journal buckets + * __bch2_journal_reclaim - free up journal buckets + * @j: journal object + * @direct: direct or background reclaim? + * @kicked: requested to run since we last ran? + * Returns: 0 on success, or -EIO if the journal has been shutdown * * Background journal reclaim writes out btree nodes. It should be run * early enough so that we never completely run out of journal buckets. 
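In the journal_reclaim.c hunks above, __bch2_journal_pin_put() no longer runs reclaim itself: it just drops the count and returns true on the final put, leaving the caller to decide where bch2_journal_reclaim_fast() (which needs j->lock) runs. bch2_journal_pin_put() locks per call, but the split would also allow a caller to batch, locking once for many drops. A hypothetical sketch, not code from this patch:

static void example_put_pins(struct journal *j, u64 first, u64 last)
{
	bool reclaim = false;

	for (u64 seq = first; seq <= last; seq++)
		reclaim |= __bch2_journal_pin_put(j, seq);

	if (reclaim) {
		spin_lock(&j->lock);
		bch2_journal_reclaim_fast(j);
		spin_unlock(&j->lock);
	}
}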
@@ -758,7 +760,7 @@ int bch2_journal_reclaim_start(struct journal *j) "bch-reclaim/%s", c->name); ret = PTR_ERR_OR_ZERO(p); if (ret) { - bch_err(c, "error creating journal reclaim thread: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "creating journal reclaim thread"); return ret; } diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h index 0fd1af12..494d1a6e 100644 --- a/libbcachefs/journal_reclaim.h +++ b/libbcachefs/journal_reclaim.h @@ -31,7 +31,8 @@ journal_seq_pin(struct journal *j, u64 seq) return &j->pin.data[seq & j->pin.mask]; } -void __bch2_journal_pin_put(struct journal *, u64); +void bch2_journal_reclaim_fast(struct journal *); +bool __bch2_journal_pin_put(struct journal *, u64); void bch2_journal_pin_put(struct journal *, u64); void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c index d6b9f2cd..1e1a7940 100644 --- a/libbcachefs/journal_seq_blacklist.c +++ b/libbcachefs/journal_seq_blacklist.c @@ -250,20 +250,18 @@ void bch2_blacklist_entries_gc(struct work_struct *work) struct journal_seq_blacklist_table *t; struct bch_sb_field_journal_seq_blacklist *bl; struct journal_seq_blacklist_entry *src, *dst; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); unsigned i, nr, new_nr; int ret; - bch2_trans_init(&trans, c, 0, 0); - for (i = 0; i < BTREE_ID_NR; i++) { struct btree_iter iter; struct btree *b; - bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN, + bch2_trans_node_iter_init(trans, &iter, i, POS_MIN, 0, 0, BTREE_ITER_PREFETCH); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); b = bch2_btree_iter_peek_node(&iter); @@ -275,10 +273,10 @@ retry: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) return; diff --git a/libbcachefs/logged_ops.c b/libbcachefs/logged_ops.c new file mode 100644 index 00000000..1bf19aaa --- /dev/null +++ b/libbcachefs/logged_ops.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "error.h" +#include "io_misc.h" +#include "logged_ops.h" + +struct bch_logged_op_fn { + u8 type; + int (*resume)(struct btree_trans *, struct bkey_i *); +}; + +static const struct bch_logged_op_fn logged_op_fns[] = { +#define x(n) { \ + .type = KEY_TYPE_logged_op_##n, \ + .resume = bch2_resume_logged_op_##n, \ +}, + BCH_LOGGED_OPS() +#undef x +}; + +static const struct bch_logged_op_fn *logged_op_fn(enum bch_bkey_type type) +{ + for (unsigned i = 0; i < ARRAY_SIZE(logged_op_fns); i++) + if (logged_op_fns[i].type == type) + return logged_op_fns + i; + return NULL; +} + +static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type); + struct bkey_buf sk; + u32 restart_count = trans->restart_count; + int ret; + + if (!fn) + return 0; + + bch2_bkey_buf_init(&sk); + bch2_bkey_buf_reassemble(&sk, c, k); + + ret = fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count); + + bch2_bkey_buf_exit(&sk, c); + return ret; +} + +int bch2_resume_logged_ops(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + ret = bch2_trans_run(c, + for_each_btree_key2(trans, iter, + BTREE_ID_logged_ops, POS_MIN, 
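logged_op_fns[] above uses the same x-macro technique as BCH_WRITE_FLAGS(): BCH_LOGGED_OPS(), defined in the new logged_ops.h below, currently lists x(truncate) and x(finsert), so the table expands to:

static const struct bch_logged_op_fn logged_op_fns[] = {
	{
		.type	= KEY_TYPE_logged_op_truncate,
		.resume	= bch2_resume_logged_op_truncate,
	},
	{
		.type	= KEY_TYPE_logged_op_finsert,
		.resume	= bch2_resume_logged_op_finsert,
	},
};

Adding a new logged operation is then a matter of adding an x() entry and providing the matching key type and resume hook.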
BTREE_ITER_PREFETCH, k, + resume_logged_op(trans, &iter, k))); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) +{ + struct btree_iter iter; + int ret; + + ret = bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_logged_ops, POS_MAX); + if (ret) + return ret; + + k->k.p = iter.pos; + + ret = bch2_trans_update(trans, &iter, k, 0); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) +{ + return commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + __bch2_logged_op_start(trans, k)); +} + +void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k) +{ + int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0)); + /* + * This needs to be a fatal error because we've left an unfinished + * operation in the logged ops btree. + * + * We should only ever see an error here if the filesystem has already + * been shut down, but make sure of that here: + */ + if (ret) { + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + bch2_fs_fatal_error(c, "%s: error deleting logged operation %s: %s", + __func__, buf.buf, bch2_err_str(ret)); + printbuf_exit(&buf); + } +} diff --git a/libbcachefs/logged_ops.h b/libbcachefs/logged_ops.h new file mode 100644 index 00000000..4d1e786a --- /dev/null +++ b/libbcachefs/logged_ops.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_LOGGED_OPS_H +#define _BCACHEFS_LOGGED_OPS_H + +#include "bkey.h" + +#define BCH_LOGGED_OPS() \ + x(truncate) \ + x(finsert) + +static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op) +{ + return bch2_btree_insert_nonextent(trans, BTREE_ID_logged_ops, op, 0); +} + +int bch2_resume_logged_ops(struct bch_fs *); +int bch2_logged_op_start(struct btree_trans *, struct bkey_i *); +void bch2_logged_op_finish(struct btree_trans *, struct bkey_i *); + +#endif /* _BCACHEFS_LOGGED_OPS_H */ diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c index 3e8b8f2f..215a6533 100644 --- a/libbcachefs/lru.c +++ b/libbcachefs/lru.c @@ -151,10 +151,10 @@ int bch2_check_lrus(struct bch_fs *c) int ret = 0; ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, + for_each_btree_key_commit(trans, iter, BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_lru_key(&trans, &iter, k, &last_flushed_pos))); + bch2_check_lru_key(trans, &iter, k, &last_flushed_pos))); if (ret) bch_err_fn(c, ret); return ret; diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index 81c8cdba..e3a51f6d 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -10,7 +10,7 @@ #include "buckets.h" #include "errcode.h" #include "extents.h" -#include "io.h" +#include "io_write.h" #include "journal.h" #include "keylist.h" #include "migrate.h" @@ -78,34 +78,32 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; enum btree_id id; int ret = 0; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - for (id = 0; id < BTREE_ID_NR; id++) { if (!btree_type_has_ptrs(id)) continue; - ret = for_each_btree_key_commit(&trans, iter, id, POS_MIN, + ret = 
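Taken together, the logged_ops API above gives bcachefs crash-recoverable compound operations: the operation's key is persisted in BTREE_ID_logged_ops before any work happens and deleted by bch2_logged_op_finish() when it completes, so any key still present at mount describes unfinished work and is replayed by bch2_resume_logged_ops() through its ->resume hook. A sketch of the expected calling pattern (illustrative; the real users are the truncate and finsert implementations in the new io_misc.c):

static int example_logged_op(struct btree_trans *trans, struct bkey_i *op)
{
	/* persist the operation before doing any work: */
	int ret = bch2_logged_op_start(trans, op);
	if (ret)
		return ret;

	/*
	 * ... do the work, checkpointing progress with
	 * bch2_logged_op_update() so that a resume after a crash can
	 * pick up where we left off ...
	 */

	/* done - delete the key so it won't be replayed: */
	bch2_logged_op_finish(trans, op);
	return 0;
}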
for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_dev_usrdata_drop_key(&trans, &iter, k, dev_idx, flags)); + bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags)); if (ret) break; } - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct closure cl; struct btree *b; @@ -117,16 +115,16 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) if (flags & BCH_FORCE_IF_METADATA_LOST) return -EINVAL; + trans = bch2_trans_get(c); bch2_bkey_buf_init(&k); - bch2_trans_init(&trans, c, 0, 0); closure_init_stack(&cl); for (id = 0; id < BTREE_ID_NR; id++) { - bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, + bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0, BTREE_ITER_PREFETCH); retry: ret = 0; - while (bch2_trans_begin(&trans), + while (bch2_trans_begin(trans), (b = bch2_btree_iter_peek_node(&iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) @@ -141,15 +139,14 @@ retry: break; } - ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, 0, false); + ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; } if (ret) { - bch_err(c, "Error updating btree node key: %s", - bch2_err_str(ret)); + bch_err_msg(c, ret, "updating btree node key"); break; } next: @@ -158,7 +155,7 @@ next: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (ret) goto err; @@ -167,8 +164,8 @@ next: bch2_btree_interior_updates_flush(c); ret = 0; err: - bch2_trans_exit(&trans); bch2_bkey_buf_exit(&k, c); + bch2_trans_put(trans); BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); diff --git a/libbcachefs/move.c b/libbcachefs/move.c index fb76a1da..39a14e32 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -14,7 +14,8 @@ #include "errcode.h" #include "error.h" #include "inode.h" -#include "io.h" +#include "io_read.h" +#include "io_write.h" #include "journal_reclaim.h" #include "keylist.h" #include "move.h" @@ -524,7 +525,7 @@ static int __bch2_move_data(struct moving_context *ctxt, struct bch_fs *c = ctxt->c; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct bkey_buf sk; - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct data_update_opts data_opts; @@ -532,7 +533,6 @@ static int __bch2_move_data(struct moving_context *ctxt, int ret = 0, ret2; bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); if (ctxt->stats) { ctxt->stats->data_type = BCH_DATA_user; @@ -540,15 +540,15 @@ static int __bch2_move_data(struct moving_context *ctxt, ctxt->stats->pos = start; } - bch2_trans_iter_init(&trans, &iter, btree_id, start, + bch2_trans_iter_init(trans, &iter, btree_id, start, BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS); if (ctxt->rate) bch2_ratelimit_reset(ctxt->rate); - while (!move_ratelimit(&trans, ctxt)) { - bch2_trans_begin(&trans); + while (!move_ratelimit(trans, ctxt)) { + bch2_trans_begin(trans); k = bch2_btree_iter_peek(&iter); if (!k.k) @@ -569,7 +569,7 @@ static int __bch2_move_data(struct moving_context *ctxt, if (!bkey_extent_is_direct_data(k.k)) goto 
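/*
 * The transaction-lifetime conversion in this file recurs throughout the
 * rest of the patch. Schematically (a sketch of the pattern, not code
 * copied from the tree):
 *
 *	// before:
 *	struct btree_trans trans;
 *	bch2_trans_init(&trans, c, 0, 0);
 *	// ... use &trans ...
 *	bch2_trans_exit(&trans);
 *
 *	// after:
 *	struct btree_trans *trans = bch2_trans_get(c);
 *	// ... use trans ...
 *	bch2_trans_put(trans);
 *
 * bch2_trans_get()/bch2_trans_put() hand out and release the transaction
 * object, so helpers that used to take &trans now take the pointer directly.
 */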
next_nondata; - ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); + ret = move_get_io_opts(trans, &io_opts, k, &cur_inum); if (ret) continue; @@ -584,7 +584,7 @@ static int __bch2_move_data(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret2 = bch2_move_extent(&trans, &iter, ctxt, NULL, + ret2 = bch2_move_extent(trans, &iter, ctxt, NULL, io_opts, btree_id, k, data_opts); if (ret2) { if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) @@ -592,7 +592,7 @@ static int __bch2_move_data(struct moving_context *ctxt, if (ret2 == -ENOMEM) { /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt, &trans); + bch2_move_ctxt_wait_for_io(ctxt, trans); continue; } @@ -609,8 +609,8 @@ next_nondata: bch2_btree_iter_advance(&iter); } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); return ret; @@ -627,7 +627,7 @@ int bch2_move_data(struct bch_fs *c, { struct moving_context ctxt; enum btree_id id; - int ret; + int ret = 0; bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); @@ -723,7 +723,6 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, if (!bp.level) { const struct bch_extent_ptr *ptr; - struct bkey_s_c k; unsigned i = 0; k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); @@ -826,15 +825,14 @@ int bch2_evacuate_bucket(struct bch_fs *c, struct write_point_specifier wp, bool wait_on_copygc) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct moving_context ctxt; int ret; - bch2_trans_init(&trans, c, 0, 0); bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - ret = __bch2_evacuate_bucket(&trans, &ctxt, NULL, bucket, gen, data_opts); + ret = __bch2_evacuate_bucket(trans, &ctxt, NULL, bucket, gen, data_opts); bch2_moving_ctxt_exit(&ctxt); - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -851,14 +849,13 @@ static int bch2_move_btree(struct bch_fs *c, { bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct btree *b; enum btree_id id; struct data_update_opts data_opts; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); progress_list_add(c, stats); stats->data_type = BCH_DATA_btree; @@ -871,11 +868,11 @@ static int bch2_move_btree(struct bch_fs *c, if (!bch2_btree_id_root(c, id)->b) continue; - bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, + bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0, BTREE_ITER_PREFETCH); retry: ret = 0; - while (bch2_trans_begin(&trans), + while (bch2_trans_begin(trans), (b = bch2_btree_iter_peek_node(&iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { if (kthread && kthread_should_stop()) @@ -890,7 +887,7 @@ retry: if (!pred(c, arg, b, &io_opts, &data_opts)) goto next; - ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret; + ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret; if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) @@ -901,13 +898,13 @@ next: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (kthread && kthread_should_stop()) break; } - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); diff --git a/libbcachefs/move.h b/libbcachefs/move.h index 
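/*
 * A note on the include churn in move.c above and in movinggc.c,
 * rebalance.c, reflink.c and super.c below: the old monolithic "io.h" is
 * split into "io_read.h" and "io_write.h" (with io_misc.h for the rest),
 * so each file now pulls in only the half of the I/O path it uses, and
 * movinggc.c, which needed neither, drops the include entirely.
 */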
c3136abe..cbdd58db 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_MOVE_H #define _BCACHEFS_MOVE_H +#include "bcachefs_ioctl.h" #include "btree_iter.h" #include "buckets.h" #include "data_update.h" diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 256431a6..4017120b 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -13,25 +13,17 @@ #include "btree_write_buffer.h" #include "buckets.h" #include "clock.h" -#include "disk_groups.h" #include "errcode.h" #include "error.h" -#include "extents.h" -#include "eytzinger.h" -#include "io.h" -#include "keylist.h" #include "lru.h" #include "move.h" #include "movinggc.h" -#include "super-io.h" #include "trace.h" -#include <linux/bsearch.h> #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/math64.h> #include <linux/sched/task.h> -#include <linux/sort.h> #include <linux/wait.h> struct buckets_in_flight { @@ -156,7 +148,7 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - size_t nr_to_get = max(16UL, buckets_in_flight->nr / 4); + size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4); size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0; int ret; @@ -172,7 +164,7 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans, lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), 0, k, ({ struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; - int ret = 0; + int ret2 = 0; saw++; @@ -181,11 +173,11 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans, else if (bucket_in_flight(buckets_in_flight, b.k)) in_flight++; else { - ret = darray_push(buckets, b) ?: buckets->nr >= nr_to_get; - if (ret >= 0) + ret2 = darray_push(buckets, b) ?: buckets->nr >= nr_to_get; + if (ret2 >= 0) sectors += b.sectors; } - ret; + ret2; })); pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i", @@ -242,7 +234,7 @@ err: ret = 0; if (ret < 0 && !bch2_err_matches(ret, EROFS)) - bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "from bch2_move_data()"); moved = atomic64_read(&ctxt->stats->sectors_moved) - moved; trace_and_count(c, copygc, c, moved, 0, 0, 0); @@ -308,25 +300,24 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) static int bch2_copygc_thread(void *arg) { struct bch_fs *c = arg; - struct btree_trans trans; + struct btree_trans *trans; struct moving_context ctxt; struct bch_move_stats move_stats; struct io_clock *clock = &c->io_clock[WRITE]; - struct buckets_in_flight move_buckets; + struct buckets_in_flight buckets; u64 last, wait; int ret = 0; - memset(&move_buckets, 0, sizeof(move_buckets)); + memset(&buckets, 0, sizeof(buckets)); - ret = rhashtable_init(&move_buckets.table, &bch_move_bucket_params); + ret = rhashtable_init(&buckets.table, &bch_move_bucket_params); if (ret) { - bch_err(c, "error allocating copygc buckets in flight: %s", - bch2_err_str(ret)); + bch_err_msg(c, ret, "allocating copygc buckets in flight"); return ret; } set_freezable(); - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); bch2_move_stats_init(&move_stats, "copygc"); bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, @@ -334,16 +325,16 @@ static int bch2_copygc_thread(void *arg) false); while (!ret && !kthread_should_stop()) { - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); cond_resched(); if (!c->copy_gc_enabled) { - 
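/*
 * Besides the bch2_trans_get()/bch2_trans_put() conversion, several small
 * cleanups run through the movinggc hunks here:
 *
 *  - the inner "int ret" in the bch2_copygc_get_buckets() loop body becomes
 *    "ret2", so it no longer shadows the outer ret used by the iteration
 *    macro;
 *
 *  - the local "move_buckets" is renamed to plain "buckets";
 *
 *  - max(16UL, ...) becomes max_t(size_t, 16U, ...), forcing a common type
 *    for the comparison (presumably because the operand types differ on
 *    some configurations);
 *
 *  - bch_err(c, "error X: %s", bch2_err_str(ret)) collapses to
 *    bch_err_msg(c, ret, "X"), the macro formatting the error string
 *    itself (behaviour inferred from these call sites, not from the
 *    macro's definition).
 */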
move_buckets_wait(&trans, &ctxt, &move_buckets, true); + move_buckets_wait(trans, &ctxt, &buckets, true); kthread_wait_freezable(c->copy_gc_enabled); } if (unlikely(freezing(current))) { - move_buckets_wait(&trans, &ctxt, &move_buckets, true); + move_buckets_wait(trans, &ctxt, &buckets, true); __refrigerator(false); continue; } @@ -354,7 +345,7 @@ static int bch2_copygc_thread(void *arg) if (wait > clock->max_slop) { c->copygc_wait_at = last; c->copygc_wait = last + wait; - move_buckets_wait(&trans, &ctxt, &move_buckets, true); + move_buckets_wait(trans, &ctxt, &buckets, true); trace_and_count(c, copygc_wait, c, wait, last + wait); bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); @@ -364,15 +355,15 @@ static int bch2_copygc_thread(void *arg) c->copygc_wait = 0; c->copygc_running = true; - ret = bch2_copygc(&trans, &ctxt, &move_buckets); + ret = bch2_copygc(trans, &ctxt, &buckets); c->copygc_running = false; wake_up(&c->copygc_running_wq); } - move_buckets_wait(&trans, &ctxt, &move_buckets, true); - rhashtable_destroy(&move_buckets.table); - bch2_trans_exit(&trans); + move_buckets_wait(trans, &ctxt, &buckets, true); + rhashtable_destroy(&buckets.table); + bch2_trans_put(trans); bch2_moving_ctxt_exit(&ctxt); return 0; @@ -404,7 +395,7 @@ int bch2_copygc_start(struct bch_fs *c) t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); ret = PTR_ERR_OR_ZERO(t); if (ret) { - bch_err(c, "error creating copygc thread: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "creating copygc thread"); return ret; } diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index 960bb247..739a2ef8 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -471,8 +471,9 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, val = "0"; } + /* Unknown options are ignored: */ if (id < 0) - goto bad_opt; + continue; if (!(bch2_opt_table[id].flags & OPT_MOUNT)) goto bad_opt; diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 8a9db110..c21c258e 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -469,7 +469,7 @@ struct bch_opts { #undef x }; -static const struct bch_opts bch2_opts_default = { +static const __maybe_unused struct bch_opts bch2_opts_default = { #define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ ._name##_defined = true, \ ._name = _default, \ diff --git a/libbcachefs/printbuf.c b/libbcachefs/printbuf.c index c41daa18..de41f9a1 100644 --- a/libbcachefs/printbuf.c +++ b/libbcachefs/printbuf.c @@ -81,8 +81,10 @@ void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) } /** - * printbuf_str - returns printbuf's buf as a C string, guaranteed to be null - * terminated + * bch2_printbuf_str() - returns printbuf's buf as a C string, guaranteed to be + * null terminated + * @buf: printbuf to terminate + * Returns: Printbuf contents, as a nul terminated C string */ const char *bch2_printbuf_str(const struct printbuf *buf) { @@ -97,8 +99,9 @@ const char *bch2_printbuf_str(const struct printbuf *buf) } /** - * printbuf_exit - exit a printbuf, freeing memory it owns and poisoning it + * bch2_printbuf_exit() - exit a printbuf, freeing memory it owns and poisoning it * against accidental use. 
+ * @buf: printbuf to exit */ void bch2_printbuf_exit(struct printbuf *buf) { @@ -120,7 +123,7 @@ void bch2_printbuf_tabstop_pop(struct printbuf *buf) } /* - * printbuf_tabstop_set - add a tabstop, n spaces from the previous tabstop + * bch2_printbuf_tabstop_set() - add a tabstop, n spaces from the previous tabstop * * @buf: printbuf to control * @spaces: number of spaces from previous tabstop @@ -144,7 +147,7 @@ int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces) } /** - * printbuf_indent_add - add to the current indent level + * bch2_printbuf_indent_add() - add to the current indent level * * @buf: printbuf to control * @spaces: number of spaces to add to the current indent level @@ -164,7 +167,7 @@ void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces) } /** - * printbuf_indent_sub - subtract from the current indent level + * bch2_printbuf_indent_sub() - subtract from the current indent level * * @buf: printbuf to control * @spaces: number of spaces to subtract from the current indent level @@ -227,9 +230,8 @@ static void __prt_tab(struct printbuf *out) } /** - * prt_tab - Advance printbuf to the next tabstop - * - * @buf: printbuf to control + * bch2_prt_tab() - Advance printbuf to the next tabstop + * @out: printbuf to control * * Advance output to the next tabstop by printing spaces. */ @@ -267,7 +269,7 @@ static void __prt_tab_rjust(struct printbuf *buf) } /** - * prt_tab_rjust - Advance printbuf to the next tabstop, right justifying + * bch2_prt_tab_rjust - Advance printbuf to the next tabstop, right justifying * previous output * * @buf: printbuf to control @@ -284,11 +286,11 @@ void bch2_prt_tab_rjust(struct printbuf *buf) } /** - * prt_bytes_indented - Print an array of chars, handling embedded control characters + * bch2_prt_bytes_indented() - Print an array of chars, handling embedded control characters * - * @out: printbuf to output to - * @str: string to print - * @count: number of bytes to print + * @out: output printbuf + * @str: string to print + * @count: number of bytes to print * * The following control characters are handled as follows: * \n: prt_newline newline that obeys current indent level @@ -335,32 +337,38 @@ void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned cou } /** - * prt_human_readable_u64 - Print out a u64 in human readable units + * bch2_prt_human_readable_u64() - Print out a u64 in human readable units + * @out: output printbuf + * @v: integer to print * - * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units + * Units of 2^10 (default) or 10^3 are controlled via @out->si_units */ -void bch2_prt_human_readable_u64(struct printbuf *buf, u64 v) +void bch2_prt_human_readable_u64(struct printbuf *out, u64 v) { - bch2_printbuf_make_room(buf, 10); - buf->pos += string_get_size(v, 1, !buf->si_units, - buf->buf + buf->pos, - printbuf_remaining_size(buf)); + bch2_printbuf_make_room(out, 10); + out->pos += string_get_size(v, 1, !out->si_units, + out->buf + out->pos, + printbuf_remaining_size(out)); } /** - * prt_human_readable_s64 - Print out a s64 in human readable units + * bch2_prt_human_readable_s64() - Print out a s64 in human readable units + * @out: output printbuf + * @v: integer to print * - * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units + * Units of 2^10 (default) or 10^3 are controlled via @out->si_units */ -void bch2_prt_human_readable_s64(struct printbuf *buf, s64 v) { if (v < 0) - prt_char(buf, '-'); 
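/*
 * Usage sketch for the human-readable helpers documented above; values and
 * exact output are illustrative, since string_get_size() does the actual
 * formatting:
 *
 *	struct printbuf buf = PRINTBUF;
 *	bch2_prt_human_readable_u64(&buf, 1 << 20);
 *	// default (si_units == false) selects base-2 units, printing
 *	// something like "1.00 MiB"; si_units == true selects powers of 10^3
 *	printbuf_exit(&buf);
 */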
- bch2_prt_human_readable_u64(buf, abs(v)); + prt_char(out, '-'); + bch2_prt_human_readable_u64(out, abs(v)); } /** - * prt_units_u64 - Print out a u64 according to printbuf unit options + * bch2_prt_units_u64() - Print out a u64 according to printbuf unit options + * @out: output printbuf + * @v: integer to print * * Units are either raw (default), or human readable units (controlled via * @buf->human_readable_units) @@ -374,7 +382,9 @@ void bch2_prt_units_u64(struct printbuf *out, u64 v) { } /** - * prt_units_s64 - Print out a s64 according to printbuf unit options + * bch2_prt_units_s64() - Print out a s64 according to printbuf unit options + * @out: output printbuf + * @v: integer to print * * Units are either raw (default), or human readable units (controlled via * @buf->human_readable_units) diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c index ca99772a..36de2f07 100644 --- a/libbcachefs/quota.c +++ b/libbcachefs/quota.c @@ -572,7 +572,7 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans, if (!s_t.master_subvol) goto advance; - ret = bch2_inode_find_by_inum_trans(trans, + ret = bch2_inode_find_by_inum_nowarn_trans(trans, (subvol_inum) { le32_to_cpu(s_t.master_subvol), k.k->p.offset, @@ -599,7 +599,7 @@ advance: int bch2_fs_quota_read(struct bch_fs *c) { struct bch_sb_field_quota *sb_quota; - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; int ret; @@ -614,16 +614,16 @@ int bch2_fs_quota_read(struct bch_fs *c) bch2_sb_quota_read(c); mutex_unlock(&c->sb_lock); - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); - ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas, + ret = for_each_btree_key2(trans, iter, BTREE_ID_quotas, POS_MIN, BTREE_ITER_PREFETCH, k, __bch2_quota_set(c, k, NULL)) ?: - for_each_btree_key2(&trans, iter, BTREE_ID_inodes, + for_each_btree_key2(trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - bch2_fs_quota_read_inode(&trans, &iter, k)); + bch2_fs_quota_read_inode(trans, &iter, k)); - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); @@ -786,7 +786,6 @@ static int bch2_quota_set_info(struct super_block *sb, int type, { struct bch_fs *c = sb->s_fs_info; struct bch_sb_field_quota *sb_quota; - struct bch_memquota_type *q; int ret = 0; if (0) { @@ -810,8 +809,6 @@ static int bch2_quota_set_info(struct super_block *sb, int type, ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) return -EINVAL; - q = &c->quotas[type]; - mutex_lock(&c->sb_lock); sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); if (!sb_quota) { @@ -959,7 +956,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: + bch2_set_quota_trans(trans, &new_quota, qdq)) ?: __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq); return bch2_err_class(ret); diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index 15ce3ecb..568f1e8e 100644 --- a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -8,8 +8,6 @@ #include "compress.h" #include "disk_groups.h" #include "errcode.h" -#include "extents.h" -#include "io.h" #include "move.h" #include "rebalance.h" #include "super-io.h" @@ -350,7 +348,7 @@ int bch2_rebalance_start(struct bch_fs *c) p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); ret = PTR_ERR_OR_ZERO(p); if (ret) { - bch_err(c, "error creating 
rebalance thread: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "creating rebalance thread"); return ret; } diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 30efb3c9..1dceb7ee 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -20,6 +20,7 @@ #include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "lru.h" +#include "logged_ops.h" #include "move.h" #include "quota.h" #include "recovery.h" @@ -164,7 +165,7 @@ static int bch2_journal_replay(struct bch_fs *c) (!k->allocated ? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim : 0), - bch2_journal_replay_key(&trans, k)); + bch2_journal_replay_key(trans, k)); if (ret) { bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s", bch2_btree_ids[k->btree_id], k->level, bch2_err_str(ret)); @@ -422,15 +423,9 @@ static int bch2_initialize_subvolumes(struct bch_fs *c) root_volume.v.snapshot = cpu_to_le32(U32_MAX); root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); - ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, - &root_tree.k_i, - NULL, NULL, 0) ?: - bch2_btree_insert(c, BTREE_ID_snapshots, - &root_snapshot.k_i, - NULL, NULL, 0) ?: - bch2_btree_insert(c, BTREE_ID_subvolumes, - &root_volume.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?: + bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?: + bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0); if (ret) bch_err_fn(c, ret); return ret; @@ -471,7 +466,7 @@ noinline_for_stack static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) { int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, - __bch2_fs_upgrade_for_subvolumes(&trans)); + __bch2_fs_upgrade_for_subvolumes(trans)); if (ret) bch_err_fn(c, ret); return ret; @@ -561,7 +556,7 @@ static void check_version_upgrade(struct bch_fs *c) if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK) prt_str(&buf, "fsck required"); else { - prt_str(&buf, "running recovery passses: "); + prt_str(&buf, "running recovery passes: "); prt_bitflags(&buf, bch2_recovery_passes, recovery_passes); } @@ -1009,9 +1004,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_pack(&packed_inode, &root_inode); packed_inode.inode.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_inodes, - &packed_inode.inode.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0); if (ret) { bch_err_msg(c, ret, "creating root directory"); goto err; @@ -1020,7 +1013,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_init_early(c, &lostfound_inode); ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_create_trans(&trans, + bch2_create_trans(trans, BCACHEFS_ROOT_SUBVOL_INUM, &root_inode, &lostfound_inode, &lostfound, diff --git a/libbcachefs/recovery_types.h b/libbcachefs/recovery_types.h index abf1f834..f3c9ea77 100644 --- a/libbcachefs/recovery_types.h +++ b/libbcachefs/recovery_types.h @@ -24,6 +24,7 @@ x(check_alloc_to_lru_refs, PASS_FSCK) \ x(fs_freespace_init, PASS_ALWAYS|PASS_SILENT) \ x(bucket_gens_init, 0) \ + x(resume_logged_ops, PASS_ALWAYS) \ x(check_snapshot_trees, PASS_FSCK) \ x(check_snapshots, PASS_FSCK) \ x(check_subvols, PASS_FSCK) \ diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index 39f711d5..d77d0ea9 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -5,9 +5,11 @@ #include "buckets.h" #include "extents.h" #include "inode.h" -#include "io.h" +#include "io_misc.h" +#include "io_write.h" #include 
"reflink.h" #include "subvolume.h" +#include "super-io.h" #include <linux/sched/signal.h> @@ -89,6 +91,9 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } +#if 0 +Currently disabled, needs to be debugged: + bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) { struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); @@ -96,6 +101,7 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); } +#endif int bch2_trans_mark_reflink_v(struct btree_trans *trans, enum btree_id btree_id, unsigned level, @@ -247,7 +253,7 @@ s64 bch2_remap_range(struct bch_fs *c, u64 remap_sectors, u64 new_i_size, s64 *i_sectors_delta) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter dst_iter, src_iter; struct bkey_s_c src_k; struct bkey_buf new_dst, new_src; @@ -269,11 +275,11 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_bkey_buf_init(&new_dst); bch2_bkey_buf_init(&new_src); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); + trans = bch2_trans_get(c); - bch2_trans_iter_init(&trans, &src_iter, BTREE_ID_extents, src_start, + bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start, BTREE_ITER_INTENT); - bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start, + bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start, BTREE_ITER_INTENT); while ((ret == 0 || @@ -281,21 +287,21 @@ s64 bch2_remap_range(struct bch_fs *c, bkey_lt(dst_iter.pos, dst_end)) { struct disk_reservation disk_res = { 0 }; - bch2_trans_begin(&trans); + bch2_trans_begin(trans); if (fatal_signal_pending(current)) { ret = -EINTR; break; } - ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol, + ret = bch2_subvolume_get_snapshot(trans, src_inum.subvol, &src_snapshot); if (ret) continue; bch2_btree_iter_set_snapshot(&src_iter, src_snapshot); - ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol, + ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol, &dst_snapshot); if (ret) continue; @@ -312,7 +318,7 @@ s64 bch2_remap_range(struct bch_fs *c, continue; if (bkey_lt(src_want, src_iter.pos)) { - ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum, + ret = bch2_fpunch_at(trans, &dst_iter, dst_inum, min(dst_end.offset, dst_iter.pos.offset + src_iter.pos.offset - src_want.offset), @@ -326,7 +332,7 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_bkey_buf_reassemble(&new_src, c, src_k); src_k = bkey_i_to_s_c(new_src.k); - ret = bch2_make_extent_indirect(&trans, &src_iter, + ret = bch2_make_extent_indirect(trans, &src_iter, new_src.k); if (ret) continue; @@ -354,14 +360,14 @@ s64 bch2_remap_range(struct bch_fs *c, min(src_k.k->p.offset - src_want.offset, dst_end.offset - dst_iter.pos.offset)); - ret = bch2_extent_update(&trans, dst_inum, &dst_iter, + ret = bch2_extent_update(trans, dst_inum, &dst_iter, new_dst.k, &disk_res, new_i_size, i_sectors_delta, true); bch2_disk_reservation_put(c, &disk_res); } - bch2_trans_iter_exit(&trans, &dst_iter); - bch2_trans_iter_exit(&trans, &src_iter); + bch2_trans_iter_exit(trans, &dst_iter); + bch2_trans_iter_exit(trans, &src_iter); BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end)); BUG_ON(bkey_gt(dst_iter.pos, dst_end)); @@ -373,23 +379,23 @@ s64 bch2_remap_range(struct bch_fs *c, struct bch_inode_unpacked inode_u; struct btree_iter inode_iter = { NULL }; - bch2_trans_begin(&trans); + bch2_trans_begin(trans); - ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u, + ret2 = 
bch2_inode_peek(trans, &inode_iter, &inode_u, dst_inum, BTREE_ITER_INTENT); if (!ret2 && inode_u.bi_size < new_i_size) { inode_u.bi_size = new_i_size; - ret2 = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, NULL, + ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); } - bch2_trans_iter_exit(&trans, &inode_iter); + bch2_trans_iter_exit(trans, &inode_iter); } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); - bch2_trans_exit(&trans); + bch2_trans_put(trans); bch2_bkey_buf_exit(&new_src, c); bch2_bkey_buf_exit(&new_dst, c); diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index 5b591c59..dbef41cd 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -429,7 +429,7 @@ out: return ret; err: - bch_err(c, "error adding replicas entry: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "adding replicas entry"); goto out; } diff --git a/libbcachefs/six.c b/libbcachefs/six.c index 14cffa68..458a1de0 100644 --- a/libbcachefs/six.c +++ b/libbcachefs/six.c @@ -31,7 +31,6 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); #define SIX_LOCK_HELD_intent (1U << 26) #define SIX_LOCK_HELD_write (1U << 27) #define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read)) -#define SIX_LOCK_WAITING_intent (1U << (28 + SIX_LOCK_intent)) #define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write)) #define SIX_LOCK_NOSPIN (1U << 31) diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c index 03ae280a..cdf9eda2 100644 --- a/libbcachefs/snapshot.c +++ b/libbcachefs/snapshot.c @@ -163,8 +163,7 @@ static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) rcu_assign_pointer(c->snapshots, new); c->snapshot_table_size = new_size; - if (old) - kvfree_rcu(old); + kvfree_rcu_mightsleep(old); return &rcu_dereference_protected(c->snapshots, true)->s[idx]; } @@ -344,7 +343,7 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, BTREE_ITER_WITH_UPDATES, snapshot, s); } -int bch2_snapshot_live(struct btree_trans *trans, u32 id) +static int bch2_snapshot_live(struct btree_trans *trans, u32 id) { struct bch_snapshot v; int ret; @@ -371,7 +370,7 @@ int bch2_snapshot_live(struct btree_trans *trans, u32 id) * it's part of such a linear chain: this correctly sets equivalence classes on * startup if we run leaf to root (i.e. in natural key order). 
*/ -int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) +static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; unsigned i, nr_live = 0, live_idx = 0; @@ -488,18 +487,18 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, bch2_trans_iter_exit(trans, &iter); if (!ret && !found) { - struct bkey_i_subvolume *s; + struct bkey_i_subvolume *u; *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); - s = bch2_bkey_get_mut_typed(trans, &iter, + u = bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_subvolumes, POS(0, *subvol_id), 0, subvolume); - ret = PTR_ERR_OR_ZERO(s); + ret = PTR_ERR_OR_ZERO(u); if (ret) return ret; - SET_BCH_SUBVOLUME_SNAP(&s->v, false); + SET_BCH_SUBVOLUME_SNAP(&u->v, false); } return ret; @@ -591,11 +590,11 @@ int bch2_check_snapshot_trees(struct bch_fs *c) int ret; ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, + for_each_btree_key_commit(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_snapshot_tree(&trans, &iter, k))); + check_snapshot_tree(trans, &iter, k))); if (ret) bch_err(c, "error %i checking snapshot trees", ret); @@ -864,11 +863,11 @@ int bch2_check_snapshots(struct bch_fs *c) * the parent's depth already be correct: */ ret = bch2_trans_run(c, - for_each_btree_key_reverse_commit(&trans, iter, + for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_snapshots, POS_MAX, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_snapshot(&trans, &iter, k))); + check_snapshot(trans, &iter, k))); if (ret) bch_err_fn(c, ret); return ret; @@ -911,7 +910,7 @@ static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) swap(s->children[0], s->children[1]); } -int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) +static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) { struct bch_fs *c = trans->c; struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; @@ -1072,6 +1071,10 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, goto err; new_snapids[i] = iter.pos.offset; + + mutex_lock(&c->snapshot_table_lock); + snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i]; + mutex_unlock(&c->snapshot_table_lock); } err: bch2_trans_iter_exit(trans, &iter); @@ -1354,7 +1357,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, int bch2_delete_dead_snapshots(struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_snapshot snap; @@ -1366,35 +1369,35 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) if (!test_bit(BCH_FS_STARTED, &c->flags)) { ret = bch2_fs_read_write_early(c); if (ret) { - bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "deleting dead snapshots: error going rw"); return ret; } } - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); /* * For every snapshot node: If we have no live children and it's not * pointed to by a subvolume, delete it: */ - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, NULL, NULL, 0, - bch2_delete_redundant_snapshot(&trans, &iter, k)); + bch2_delete_redundant_snapshot(trans, &iter, k)); if (ret) { - bch_err(c, "error deleting redundant snapshots: %s", 
bch2_err_str(ret)); + bch_err_msg(c, ret, "deleting redundant snapshots"); goto err; } - for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - bch2_snapshot_set_equiv(&trans, k)); + ret = for_each_btree_key2(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + bch2_snapshot_set_equiv(trans, k)); if (ret) { - bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "in bch2_snapshots_set_equiv"); goto err; } - for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { if (k.k->type != KEY_TYPE_snapshot) continue; @@ -1406,7 +1409,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) break; } } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); if (ret) { bch_err_msg(c, ret, "walking snapshots"); @@ -1421,16 +1424,16 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) if (!btree_type_has_snapshots(id)) continue; - ret = for_each_btree_key_commit(&trans, iter, + ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, &res, NULL, BTREE_INSERT_NOFAIL, - snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: - for_each_btree_key_commit(&trans, iter, + snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: + for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, &res, NULL, BTREE_INSERT_NOFAIL, - move_key_to_correct_snapshot(&trans, &iter, k)); + move_key_to_correct_snapshot(trans, &iter, k)); bch2_disk_reservation_put(c, &res); darray_exit(&equiv_seen); @@ -1441,7 +1444,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) } } - for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { u32 snapshot = k.k->p.offset; u32 equiv = bch2_snapshot_equiv(c, snapshot); @@ -1449,23 +1452,23 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) if (equiv != snapshot) snapshot_list_add(c, &deleted_interior, snapshot); } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); /* * Fixing children of deleted snapshots can't be done completely * atomically, if we crash between here and when we delete the interior * nodes some depth fields will be off: */ - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, POS_MIN, + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_INTENT, k, NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_fix_child_of_deleted_snapshot(&trans, &iter, k, &deleted_interior)); + bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior)); if (ret) goto err; darray_for_each(deleted, i) { - ret = commit_do(&trans, NULL, NULL, 0, - bch2_snapshot_node_delete(&trans, *i)); + ret = commit_do(trans, NULL, NULL, 0, + bch2_snapshot_node_delete(trans, *i)); if (ret) { bch_err_msg(c, ret, "deleting snapshot %u", *i); goto err; @@ -1473,8 +1476,8 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) } darray_for_each(deleted_interior, i) { - ret = commit_do(&trans, NULL, NULL, 0, - bch2_snapshot_node_delete(&trans, *i)); + ret = commit_do(trans, NULL, NULL, 0, + bch2_snapshot_node_delete(trans, *i)); if (ret) { bch_err_msg(c, ret, "deleting snapshot %u", *i); goto err; @@ -1485,7 +1488,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) err: darray_exit(&deleted_interior); darray_exit(&deleted); - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) bch_err_fn(c, ret); return 
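/*
 * Note the fix buried in the hunk above: the second for_each_btree_key2()
 * loop in bch2_delete_dead_snapshots() never assigned its result, so the
 * if (ret) that follows tested a stale value. It now reads:
 *
 *	ret = for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
 *				  POS_MIN, 0, k,
 *				  bch2_snapshot_set_equiv(trans, k));
 */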
ret; @@ -1618,7 +1621,8 @@ int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_buf sk; - int ret; + u32 restart_count = trans->restart_count; + int ret = 0; bch2_bkey_buf_init(&sk); bch2_bkey_buf_reassemble(&sk, c, k); @@ -1640,7 +1644,8 @@ int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans, } bch2_bkey_buf_exit(&sk, c); - return ret; + + return ret ?: trans_was_restarted(trans, restart_count); } int bch2_snapshots_read(struct bch_fs *c) @@ -1650,11 +1655,11 @@ int bch2_snapshots_read(struct bch_fs *c) int ret = 0; ret = bch2_trans_run(c, - for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, + for_each_btree_key2(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, - bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: - bch2_snapshot_set_equiv(&trans, k)) ?: - for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, + bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: + bch2_snapshot_set_equiv(trans, k)) ?: + for_each_btree_key2(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); if (ret) diff --git a/libbcachefs/snapshot.h b/libbcachefs/snapshot.h index dabc9b9d..de215d9d 100644 --- a/libbcachefs/snapshot.h +++ b/libbcachefs/snapshot.h @@ -235,8 +235,6 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot *s); int bch2_snapshot_get_subvol(struct btree_trans *, u32, struct bch_subvolume *); -int bch2_snapshot_live(struct btree_trans *trans, u32 id); -int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k); /* only exported for tests: */ int bch2_snapshot_node_create(struct btree_trans *, u32, diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c index 0214a98d..caf2dd7d 100644 --- a/libbcachefs/subvolume.c +++ b/libbcachefs/subvolume.c @@ -41,8 +41,7 @@ static int check_subvol(struct btree_trans *trans, ret = bch2_subvolume_delete(trans, iter->pos.offset); if (ret) - bch_err(c, "error deleting subvolume %llu: %s", - iter->pos.offset, bch2_err_str(ret)); + bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); return ret ?: -BCH_ERR_transaction_restart_nested; } @@ -87,10 +86,10 @@ int bch2_check_subvols(struct bch_fs *c) int ret; ret = bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, + for_each_btree_key_commit(trans, iter, BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_subvol(&trans, &iter, k))); + check_subvol(trans, &iter, k))); if (ret) bch_err_fn(c, ret); return ret; @@ -99,7 +98,7 @@ int bch2_check_subvols(struct bch_fs *c) /* Subvolumes: */ int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, - unsigned flags, struct printbuf *err) + enum bkey_invalid_flags flags, struct printbuf *err) { if (bkey_lt(k.k->p, SUBVOL_POS_MIN) || bkey_gt(k.k->p, SUBVOL_POS_MAX)) { @@ -294,9 +293,9 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor bch2_evict_subvolume_inodes(c, &s); for (id = s.data; id < s.data + s.nr; id++) { - ret = bch2_trans_run(c, bch2_subvolume_delete(&trans, *id)); + ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id)); if (ret) { - bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret)); + bch_err_msg(c, ret, "deleting subvolume %u", *id); break; } } diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h index 8d4c50f4..bb14f92e 100644 --- a/libbcachefs/subvolume.h +++ b/libbcachefs/subvolume.h @@ 
-10,7 +10,7 @@ enum bkey_invalid_flags; int bch2_check_subvols(struct bch_fs *); int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, - unsigned, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index b6021b73..c9bf342d 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -6,7 +6,6 @@ #include "disk_groups.h" #include "ec.h" #include "error.h" -#include "io.h" #include "journal.h" #include "journal_sb.h" #include "journal_seq_blacklist.h" @@ -23,6 +22,9 @@ #include <linux/backing-dev.h> #include <linux/sort.h> +static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { +}; + struct bch2_metadata_version { u16 version; const char *name; @@ -161,7 +163,8 @@ void bch2_free_super(struct bch_sb_handle *sb) { kfree(sb->bio); if (!IS_ERR_OR_NULL(sb->bdev)) - blkdev_put(sb->bdev, sb->mode); + blkdev_put(sb->bdev, sb->holder); + kfree(sb->holder); kfree(sb->sb); memset(sb, 0, sizeof(*sb)); @@ -182,7 +185,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) if (sb->sb && sb->buffer_size >= new_buffer_size) return 0; - if (sb->have_layout) { + if (sb->sb && sb->have_layout) { u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; if (new_bytes > max_bytes) { @@ -243,9 +246,9 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, /* XXX: we're not checking that offline device have enough space */ for_each_online_member(ca, c, i) { - struct bch_sb_handle *sb = &ca->disk_sb; + struct bch_sb_handle *dev_sb = &ca->disk_sb; - if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { + if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) { percpu_ref_put(&ca->ref); return NULL; } @@ -381,7 +384,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, } if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) { - prt_printf(out, "Bad intenal UUID (got zeroes)"); + prt_printf(out, "Bad internal UUID (got zeroes)"); return -BCH_ERR_invalid_sb_uuid; } @@ -664,27 +667,30 @@ int bch2_read_super(const char *path, struct bch_opts *opts, retry: #endif memset(sb, 0, sizeof(*sb)); - sb->mode = FMODE_READ; + sb->mode = BLK_OPEN_READ; sb->have_bio = true; + sb->holder = kmalloc(1, GFP_KERNEL); + if (!sb->holder) + return -ENOMEM; #ifndef __KERNEL__ if (opt_get(*opts, direct_io) == false) - sb->mode |= FMODE_BUFFERED; + sb->mode |= BLK_OPEN_BUFFERED; #endif if (!opt_get(*opts, noexcl)) - sb->mode |= FMODE_EXCL; + sb->mode |= BLK_OPEN_EXCL; if (!opt_get(*opts, nochanges)) - sb->mode |= FMODE_WRITE; + sb->mode |= BLK_OPEN_WRITE; - sb->bdev = blkdev_get_by_path(path, sb->mode, sb); + sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); if (IS_ERR(sb->bdev) && PTR_ERR(sb->bdev) == -EACCES && opt_get(*opts, read_only)) { - sb->mode &= ~FMODE_WRITE; + sb->mode &= ~BLK_OPEN_WRITE; - sb->bdev = blkdev_get_by_path(path, sb->mode, sb); + sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); if (!IS_ERR(sb->bdev)) opt_set(*opts, nochanges, true); } diff --git a/libbcachefs/super.c b/libbcachefs/super.c index e7dbc31b..e94a63a2 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -35,7 +35,8 @@ #include "fs-io-direct.h" #include "fsck.h" #include "inode.h" -#include "io.h" +#include "io_read.h" +#include "io_write.h" #include "journal.h" #include 
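/*
 * The super-io.c changes above adapt to the newer kernel block-open API
 * (the blk_holder_ops interface): FMODE_* open flags become BLK_OPEN_*,
 * blkdev_get_by_path() takes an explicit holder cookie plus holder ops,
 * and blkdev_put() takes the holder instead of the mode:
 *
 *	sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder,
 *				      &bch2_sb_handle_bdev_ops);
 *	// ...
 *	blkdev_put(sb->bdev, sb->holder);
 *
 * The holder itself is now a small kmalloc'd cookie owned by the sb handle
 * and freed in bch2_free_super().
 */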
"journal_reclaim.h" #include "journal_seq_blacklist.h" @@ -68,6 +69,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); +MODULE_DESCRIPTION("bcachefs filesystem"); #define KTYPE(type) \ static const struct attribute_group type ## _group = { \ @@ -421,6 +423,10 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) return ret; } + ret = bch2_journal_reclaim_start(&c->journal); + if (ret) + goto err; + if (!early) { ret = bch2_fs_read_write_late(c); if (ret) @@ -430,7 +436,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) #ifndef BCH_WRITE_REF_DEBUG percpu_ref_reinit(&c->writes); #else - for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) { + for (i = 0; i < BCH_WRITE_REF_NR; i++) { BUG_ON(atomic_long_read(&c->writes[i])); atomic_long_inc(&c->writes[i]); } @@ -465,7 +471,6 @@ int bch2_fs_read_write_early(struct bch_fs *c) static void __bch2_fs_free(struct bch_fs *c) { unsigned i; - int cpu; for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); @@ -479,7 +484,8 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_fs_fsio_exit(c); bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); - bch2_fs_io_exit(c); + bch2_fs_io_write_exit(c); + bch2_fs_io_read_exit(c); bch2_fs_buckets_waiting_for_journal_exit(c); bch2_fs_btree_interior_update_exit(c); bch2_fs_btree_iter_exit(c); @@ -496,12 +502,7 @@ static void __bch2_fs_free(struct bch_fs *c) percpu_free_rwsem(&c->mark_lock); free_percpu(c->online_reserved); - if (c->btree_paths_bufs) - for_each_possible_cpu(cpu) - kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path); - darray_exit(&c->btree_roots_extra); - free_percpu(c->btree_paths_bufs); free_percpu(c->pcpu); mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); @@ -581,8 +582,6 @@ void bch2_fs_free(struct bch_fs *c) { unsigned i; - BUG_ON(!test_bit(BCH_FS_STOPPING, &c->flags)); - mutex_lock(&bch_fs_list_lock); list_del(&c->list); mutex_unlock(&bch_fs_list_lock); @@ -787,6 +786,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; if (c->opts.inodes_use_key_cache) c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; + c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops; c->block_bits = ilog2(block_sectors(c)); c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); @@ -824,7 +824,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || !(c->online_reserved = alloc_percpu(u64)) || - !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || @@ -846,13 +845,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_buckets_waiting_for_journal_init(c) ?: bch2_fs_btree_write_buffer_init(c) ?: bch2_fs_subvolumes_init(c) ?: - bch2_fs_io_init(c) ?: + bch2_fs_io_read_init(c) ?: + bch2_fs_io_write_init(c) ?: bch2_fs_nocow_locking_init(c) ?: bch2_fs_encryption_init(c) ?: bch2_fs_compress_init(c) ?: bch2_fs_ec_init(c) ?: bch2_fs_fsio_init(c) ?: - bch2_fs_fs_io_buffered_init(c); + bch2_fs_fs_io_buffered_init(c) ?: bch2_fs_fs_io_direct_init(c); if (ret) goto err; @@ -990,7 +990,7 @@ out: up_write(&c->state_lock); return ret; err: - bch_err(c, "error starting filesystem: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "starting filesystem"); goto out; } @@ -1237,8 
+1237,6 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) /* Commit: */ ca->disk_sb = *sb; - if (sb->mode & FMODE_EXCL) - ca->disk_sb.bdev->bd_holder = ca; memset(sb, 0, sizeof(*sb)); ca->dev = ca->disk_sb.bdev->bd_dev; @@ -1457,7 +1455,7 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, BTREE_TRIGGER_NORUN, NULL); if (ret) - bch_err(c, "error removing dev alloc info: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "removing dev alloc info"); return ret; } @@ -1486,31 +1484,31 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) ret = bch2_dev_data_drop(c, ca->dev_idx, flags); if (ret) { - bch_err(ca, "Remove failed: error dropping data: %s", bch2_err_str(ret)); + bch_err_msg(ca, ret, "dropping data"); goto err; } ret = bch2_dev_remove_alloc(c, ca); if (ret) { - bch_err(ca, "Remove failed, error deleting alloc info"); + bch_err_msg(ca, ret, "deleting alloc info"); goto err; } ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); if (ret) { - bch_err(ca, "Remove failed: error flushing journal: %s", bch2_err_str(ret)); + bch_err_msg(ca, ret, "flushing journal"); goto err; } ret = bch2_journal_flush(&c->journal); if (ret) { - bch_err(ca, "Remove failed, journal error"); + bch_err(ca, "journal error"); goto err; } ret = bch2_replicas_gc2(c); if (ret) { - bch_err(ca, "Remove failed: error from replicas gc: %s", bch2_err_str(ret)); + bch_err_msg(ca, ret, "in replicas_gc2()"); goto err; } @@ -1585,7 +1583,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) ret = bch2_read_super(path, &opts, &sb); if (ret) { - bch_err(c, "device add error: error reading super: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "reading super"); goto err; } @@ -1601,13 +1599,12 @@ int bch2_dev_add(struct bch_fs *c, const char *path) ret = bch2_dev_may_add(sb.sb, c); if (ret) { - bch_err(c, "device add error: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); goto err; } ca = __bch2_dev_alloc(c, &dev_mi); if (!ca) { - bch2_free_super(&sb); ret = -ENOMEM; goto err; } @@ -1615,14 +1612,12 @@ int bch2_dev_add(struct bch_fs *c, const char *path) bch2_dev_usage_init(ca); ret = __bch2_dev_attach_bdev(ca, &sb); - if (ret) { - bch2_dev_free(ca); + if (ret) goto err; - } ret = bch2_dev_journal_alloc(ca); if (ret) { - bch_err(c, "device add error: journal alloc failed"); + bch_err_msg(c, ret, "allocating journal"); goto err; } @@ -1631,7 +1626,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) ret = bch2_sb_from_fs(c, ca); if (ret) { - bch_err(c, "device add error: new device superblock too small"); + bch_err_msg(c, ret, "setting up new superblock"); goto err_unlock; } @@ -1640,8 +1635,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (!bch2_sb_resize_members(&ca->disk_sb, le32_to_cpu(mi->field.u64s) + sizeof(dev_mi) / sizeof(u64))) { - bch_err(c, "device add error: new device superblock too small"); ret = -BCH_ERR_ENOSPC_sb_members; + bch_err_msg(c, ret, "setting up new superblock"); goto err_unlock; } @@ -1653,8 +1648,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) goto have_slot; no_slot: - bch_err(c, "device add error: already have maximum number of devices"); ret = -BCH_ERR_ENOSPC_sb_members; + bch_err_msg(c, ret, "setting up new superblock"); goto err_unlock; have_slot: @@ -1664,8 +1659,8 @@ have_slot: mi = bch2_sb_resize_members(&c->disk_sb, u64s); if (!mi) { - bch_err(c, "device add error: no room in 
superblock for member info"); ret = -BCH_ERR_ENOSPC_sb_members; + bch_err_msg(c, ret, "setting up new superblock"); goto err_unlock; } @@ -1681,7 +1676,7 @@ have_slot: if (BCH_MEMBER_GROUP(&dev_mi)) { ret = __bch2_dev_group_set(c, ca, label.buf); if (ret) { - bch_err(c, "device add error: error setting label"); + bch_err_msg(c, ret, "creating new label"); goto err_unlock; } } @@ -1693,13 +1688,13 @@ have_slot: ret = bch2_trans_mark_dev_sb(c, ca); if (ret) { - bch_err(c, "device add error: error marking new superblock: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "marking new superblock"); goto err_late; } ret = bch2_fs_freespace_init(c); if (ret) { - bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "initializing free space"); goto err_late; } @@ -1749,7 +1744,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); if (ret) { - bch_err(c, "error bringing %s online: %s", path, bch2_err_str(ret)); + bch_err_msg(c, ret, "bringing %s online", path); goto err; } @@ -1761,8 +1756,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) ret = bch2_trans_mark_dev_sb(c, ca); if (ret) { - bch_err(c, "error bringing %s online: error from bch2_trans_mark_dev_sb: %s", - path, bch2_err_str(ret)); + bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path); goto err; } @@ -1780,7 +1774,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) ret = bch2_fs_freespace_init(c); if (ret) - bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret)); + bch_err_msg(c, ret, "initializing free space"); up_write(&c->state_lock); return 0; @@ -1835,7 +1829,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ret = bch2_dev_buckets_resize(c, ca, nbuckets); if (ret) { - bch_err(ca, "Resize error: %s", bch2_err_str(ret)); + bch_err_msg(ca, ret, "resizing buckets"); goto err; } diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h index 89419fc7..597a8db7 100644 --- a/libbcachefs/super_types.h +++ b/libbcachefs/super_types.h @@ -6,8 +6,9 @@ struct bch_sb_handle { struct bch_sb *sb; struct block_device *bdev; struct bio *bio; + void *holder; size_t buffer_size; - fmode_t mode; + blk_mode_t mode; unsigned have_layout:1; unsigned have_bio:1; unsigned fs_sb:1; diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 941f4bcb..1abc61cb 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -113,10 +113,6 @@ do { \ prt_human_readable_s64(out, val); \ } while (0) -#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) -#define var_print(_var) sysfs_print(_var, var(_var)) -#define var_hprint(_var) sysfs_hprint(_var, var(_var)) - #define sysfs_strtoul(file, var) \ do { \ if (attr == &sysfs_ ## file) \ @@ -139,30 +135,6 @@ do { \ _v; \ }) -#define strtoul_restrict_or_return(cp, min, max) \ -({ \ - unsigned long __v = 0; \ - int _r = strtoul_safe_restrict(cp, __v, min, max); \ - if (_r) \ - return _r; \ - __v; \ -}) - -#define strtoi_h_or_return(cp) \ -({ \ - u64 _v; \ - int _r = strtoi_h(cp, &_v); \ - if (_r) \ - return _r; \ - _v; \ -}) - -#define sysfs_hatoi(file, var) \ -do { \ - if (attr == &sysfs_ ## file) \ - return strtoi_h(buf, &var) ?: (ssize_t) size; \ -} while (0) - write_attribute(trigger_gc); write_attribute(trigger_discards); write_attribute(trigger_invalidates); @@ -280,7 +252,7 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) static int bch2_compression_stats_to_text(struct printbuf 
*out, struct bch_fs *c) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; enum btree_id id; @@ -291,18 +263,18 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c incompressible_sectors = 0, compressed_sectors_compressed = 0, compressed_sectors_uncompressed = 0; - int ret; + int ret = 0; if (!test_bit(BCH_FS_STARTED, &c->flags)) return -EPERM; - bch2_trans_init(&trans, c, 0, 0); + trans = bch2_trans_get(c); for (id = 0; id < BTREE_ID_NR; id++) { if (!btree_type_has_ptrs(id)) continue; - for_each_btree_key(&trans, iter, id, POS_MIN, + for_each_btree_key(trans, iter, id, POS_MIN, BTREE_ITER_ALL_SNAPSHOTS, k, ret) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -336,10 +308,10 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c else if (compressed) nr_compressed_extents++; } - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); } - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) return ret; @@ -1005,7 +977,7 @@ STORE(bch2_dev) mutex_lock(&c->sb_lock); mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; - if (v != BCH_MEMBER_DURABILITY(mi)) { + if (v + 1 != BCH_MEMBER_DURABILITY(mi)) { SET_BCH_MEMBER_DURABILITY(mi, v + 1); bch2_write_super(c); } diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c index 72389c73..c907b3e0 100644 --- a/libbcachefs/tests.c +++ b/libbcachefs/tests.c @@ -31,7 +31,7 @@ static void delete_test_keys(struct bch_fs *c) static int test_delete(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_i_cookie k; int ret; @@ -39,44 +39,43 @@ static int test_delete(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.snapshot = U32_MAX; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, BTREE_ITER_INTENT); - ret = commit_do(&trans, NULL, NULL, 0, + ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(&trans, &iter, &k.k_i, 0)); + bch2_trans_update(trans, &iter, &k.k_i, 0)); if (ret) { bch_err_msg(c, ret, "update error"); goto err; } pr_info("deleting once"); - ret = commit_do(&trans, NULL, NULL, 0, + ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: - bch2_btree_delete_at(&trans, &iter, 0)); + bch2_btree_delete_at(trans, &iter, 0)); if (ret) { bch_err_msg(c, ret, "delete error (first)"); goto err; } pr_info("deleting twice"); - ret = commit_do(&trans, NULL, NULL, 0, + ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: - bch2_btree_delete_at(&trans, &iter, 0)); + bch2_btree_delete_at(trans, &iter, 0)); if (ret) { bch_err_msg(c, ret, "delete error (second)"); goto err; } err: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } static int test_delete_written(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_i_cookie k; int ret; @@ -84,58 +83,53 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.snapshot = U32_MAX; - bch2_trans_init(&trans, c, 0, 0); - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, 
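/*
 * On the sysfs durability fix above: the superblock member field stores the
 * durability value biased by one, as SET_BCH_MEMBER_DURABILITY(mi, v + 1)
 * shows, presumably so that 0 can mean "unset". The old test compared the
 * unbiased v against the biased field and therefore rewrote the superblock
 * on every store; comparing biased values avoids the spurious write:
 *
 *	if (v + 1 != BCH_MEMBER_DURABILITY(mi)) {
 *		SET_BCH_MEMBER_DURABILITY(mi, v + 1);
 *		bch2_write_super(c);
 *	}
 */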
BTREE_ITER_INTENT); - ret = commit_do(&trans, NULL, NULL, 0, + ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(&trans, &iter, &k.k_i, 0)); + bch2_trans_update(trans, &iter, &k.k_i, 0)); if (ret) { bch_err_msg(c, ret, "update error"); goto err; } - bch2_trans_unlock(&trans); + bch2_trans_unlock(trans); bch2_journal_flush_all_pins(&c->journal); - ret = commit_do(&trans, NULL, NULL, 0, + ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: - bch2_btree_delete_at(&trans, &iter, 0)); + bch2_btree_delete_at(trans, &iter, 0)); if (ret) { bch_err_msg(c, ret, "delete error"); goto err; } err: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } static int test_iterate(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - delete_test_keys(c); pr_info("inserting test keys"); for (i = 0; i < nr; i++) { - struct bkey_i_cookie k; + struct bkey_i_cookie ck; - bkey_cookie_init(&k.k_i); - k.k.p.offset = i; - k.k.p.snapshot = U32_MAX; + bkey_cookie_init(&ck.k_i); + ck.k.p.offset = i; + ck.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); if (ret) { bch_err_msg(c, ret, "insert error"); goto err; @@ -146,7 +140,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i++); @@ -161,7 +155,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) pr_info("iterating backwards"); - ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_xattrs, + ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs, SPOS(0, U64_MAX, U32_MAX), 0, k, ({ BUG_ON(k.k->p.offset != --i); @@ -174,35 +168,32 @@ static int test_iterate(struct bch_fs *c, u64 nr) BUG_ON(i); err: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } static int test_iterate_extents(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - delete_test_keys(c); pr_info("inserting test extents"); for (i = 0; i < nr; i += 8) { - struct bkey_i_cookie k; + struct bkey_i_cookie ck; - bkey_cookie_init(&k.k_i); - k.k.p.offset = i + 8; - k.k.p.snapshot = U32_MAX; - k.k.size = 8; + bkey_cookie_init(&ck.k_i); + ck.k.p.offset = i + 8; + ck.k.p.snapshot = U32_MAX; + ck.k.size = 8; - ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); if (ret) { bch_err_msg(c, ret, "insert error"); goto err; @@ -213,7 +204,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i); @@ -229,7 +220,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) pr_info("iterating backwards"); - ret = 
for_each_btree_key_reverse(&trans, iter, BTREE_ID_extents, + ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_extents, SPOS(0, U64_MAX, U32_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i); @@ -243,34 +234,31 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) BUG_ON(i); err: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } static int test_iterate_slots(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - delete_test_keys(c); pr_info("inserting test keys"); for (i = 0; i < nr; i++) { - struct bkey_i_cookie k; + struct bkey_i_cookie ck; - bkey_cookie_init(&k.k_i); - k.k.p.offset = i * 2; - k.k.p.snapshot = U32_MAX; + bkey_cookie_init(&ck.k_i); + ck.k.p.offset = i * 2; + ck.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); if (ret) { bch_err_msg(c, ret, "insert error"); goto err; @@ -281,7 +269,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i); @@ -299,7 +287,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), BTREE_ITER_SLOTS, k, ({ if (i >= nr * 2) @@ -317,34 +305,31 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) } ret = 0; err: - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - delete_test_keys(c); pr_info("inserting test keys"); for (i = 0; i < nr; i += 16) { - struct bkey_i_cookie k; + struct bkey_i_cookie ck; - bkey_cookie_init(&k.k_i); - k.k.p.offset = i + 16; - k.k.p.snapshot = U32_MAX; - k.k.size = 8; + bkey_cookie_init(&ck.k_i); + ck.k.p.offset = i + 16; + ck.k.p.snapshot = U32_MAX; + ck.k.size = 8; - ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); if (ret) { bch_err_msg(c, ret, "insert error"); goto err; @@ -355,7 +340,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i + 8); @@ -374,7 +359,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), BTREE_ITER_SLOTS, k, ({ if (i == nr) @@ -392,7 +377,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) } ret = 0; err: - bch2_trans_exit(&trans); + bch2_trans_put(trans); return 0; } @@ -402,43 +387,41 @@ err: */ static int test_peek_end(struct bch_fs *c, u64 nr) { - struct 
btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return 0; } static int test_peek_end_extents(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return 0; } @@ -458,8 +441,7 @@ static int insert_test_extent(struct bch_fs *c, k.k_i.k.size = end - start; k.k_i.k.version.lo = test_version++; - ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0); if (ret) bch_err_fn(c, ret); return ret; @@ -515,7 +497,7 @@ static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, k.k_i.k.size = len; ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_btree_insert_nonextent(&trans, BTREE_ID_extents, &k.k_i, + bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); if (ret) bch_err_fn(c, ret); @@ -538,7 +520,7 @@ static int test_extent_create_overlapping(struct bch_fs *c, u64 inum) /* Test skipping over keys in unrelated snapshots: */ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) { - struct btree_trans trans; + struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; struct bkey_i_cookie cookie; @@ -546,20 +528,19 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) bkey_cookie_init(&cookie.k_i); cookie.k.p.snapshot = snapid_hi; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0); if (ret) return ret; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + trans = bch2_trans_get(c); + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, snapid_lo), 0); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k->p.snapshot != U32_MAX); - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + 
bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } @@ -572,13 +553,12 @@ static int test_snapshots(struct bch_fs *c, u64 nr) bkey_cookie_init(&cookie.k_i); cookie.k.p.snapshot = U32_MAX; - ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0); if (ret) return ret; ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_snapshot_node_create(&trans, U32_MAX, + bch2_snapshot_node_create(trans, U32_MAX, snapids, snapid_subvols, 2)); @@ -609,38 +589,34 @@ static u64 test_rand(void) static int rand_insert(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct bkey_i_cookie k; int ret = 0; u64 i; - bch2_trans_init(&trans, c, 0, 0); - for (i = 0; i < nr; i++) { bkey_cookie_init(&k.k_i); k.k.p.offset = test_rand(); k.k.p.snapshot = U32_MAX; - ret = commit_do(&trans, NULL, NULL, 0, - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i, 0)); + ret = commit_do(trans, NULL, NULL, 0, + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0)); if (ret) break; } - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } static int rand_insert_multi(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct bkey_i_cookie k[8]; int ret = 0; unsigned j; u64 i; - bch2_trans_init(&trans, c, 0, 0); - for (i = 0; i < nr; i += ARRAY_SIZE(k)) { for (j = 0; j < ARRAY_SIZE(k); j++) { bkey_cookie_init(&k[j].k_i); @@ -648,46 +624,45 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr) k[j].k.p.snapshot = U32_MAX; } - ret = commit_do(&trans, NULL, NULL, 0, - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i, 0)); + ret = commit_do(trans, NULL, NULL, 0, + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[7].k_i, 0)); if (ret) break; } - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } static int rand_lookup(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; int ret = 0; u64 i; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); for (i = 0; i < nr; i++) { bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ret = bkey_err(k); if (ret) break; } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + 
bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } @@ -719,26 +694,25 @@ static int rand_mixed_trans(struct btree_trans *trans, static int rand_mixed(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_i_cookie cookie; int ret = 0; u64 i, rand; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); for (i = 0; i < nr; i++) { rand = test_rand(); - ret = commit_do(&trans, NULL, NULL, 0, - rand_mixed_trans(&trans, &iter, &cookie, i, rand)); + ret = commit_do(trans, NULL, NULL, 0, + rand_mixed_trans(trans, &iter, &cookie, i, rand)); if (ret) break; } - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); return ret; } @@ -766,22 +740,20 @@ err: static int rand_delete(struct bch_fs *c, u64 nr) { - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); int ret = 0; u64 i; - bch2_trans_init(&trans, c, 0, 0); - for (i = 0; i < nr; i++) { struct bpos pos = SPOS(0, test_rand(), U32_MAX); - ret = commit_do(&trans, NULL, NULL, 0, - __do_delete(&trans, pos)); + ret = commit_do(trans, NULL, NULL, 0, + __do_delete(trans, pos)); if (ret) break; } - bch2_trans_exit(&trans); + bch2_trans_put(trans); return ret; } @@ -794,14 +766,14 @@ static int seq_insert(struct bch_fs *c, u64 nr) bkey_cookie_init(&insert.k_i); return bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, NULL, NULL, 0, ({ if (iter.pos.offset >= nr) break; insert.k.p = iter.pos; - bch2_trans_update(&trans, &iter, &insert.k_i, 0); + bch2_trans_update(trans, &iter, &insert.k_i, 0); }))); } @@ -811,7 +783,7 @@ static int seq_lookup(struct bch_fs *c, u64 nr) struct bkey_s_c k; return bch2_trans_run(c, - for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, 0)); @@ -823,14 +795,14 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) struct bkey_s_c k; return bch2_trans_run(c, - for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), BTREE_ITER_INTENT, k, NULL, NULL, 0, ({ struct bkey_i_cookie u; bkey_reassemble(&u.k_i, k); - bch2_trans_update(&trans, &iter, &u.k_i, 0); + bch2_trans_update(trans, &iter, &u.k_i, 0); }))); } diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h index 97fe7742..19264492 100644 --- a/libbcachefs/trace.h +++ b/libbcachefs/trace.h @@ -137,6 +137,25 @@ DEFINE_EVENT(bio, read_promote, TP_ARGS(bio) ); +TRACE_EVENT(read_nopromote, + TP_PROTO(struct bch_fs *c, int ret), + TP_ARGS(c, ret), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, ret, 32 ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret)); + ), + + TP_printk("%d,%d ret %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ret) +); + DEFINE_EVENT(bio, read_bounce, TP_PROTO(struct bio *bio), TP_ARGS(bio) diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 80a6c566..adeec805 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -112,10 +112,10 @@ got_unit: #define parse_or_ret(cp, _f) \ do { \ - int ret = _f; \ - if (ret < 0) \ - return 
ret; \ - cp += ret; \ + int _ret = _f; \ + if (_ret < 0) \ + return _ret; \ + cp += _ret; \ } while (0) static int __bch2_strtou64_h(const char *cp, u64 *res) @@ -605,11 +605,9 @@ void bch2_time_stats_init(struct bch2_time_stats *stats) /** * bch2_ratelimit_delay() - return how long to delay until the next time to do - * some work - * - * @d - the struct bch_ratelimit to update - * - * Returns the amount of time to delay by, in jiffies + * some work + * @d: the struct bch_ratelimit to update + * Returns: the amount of time to delay by, in jiffies */ u64 bch2_ratelimit_delay(struct bch_ratelimit *d) { @@ -622,9 +620,8 @@ u64 bch2_ratelimit_delay(struct bch_ratelimit *d) /** * bch2_ratelimit_increment() - increment @d by the amount of work done - * - * @d - the struct bch_ratelimit to update - * @done - the amount of work done, in arbitrary units + * @d: the struct bch_ratelimit to update + * @done: the amount of work done, in arbitrary units */ void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) { @@ -761,10 +758,10 @@ void bch2_bio_map(struct bio *bio, void *base, size_t size) } } -int bch2_bio_alloc_pages_noprof(struct bio *bio, size_t size, gfp_t gfp_mask) +int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) { while (size) { - struct page *page = alloc_pages_noprof(gfp_mask, 0); + struct page *page = alloc_pages(gfp_mask, 0); unsigned len = min_t(size_t, PAGE_SIZE, size); if (!page) diff --git a/libbcachefs/util.h b/libbcachefs/util.h index d06671a0..67f1a1d2 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -60,13 +60,12 @@ static inline void vpfree(void *p, size_t size) free_pages((unsigned long) p, get_order(size)); } -static inline void *vpmalloc_noprof(size_t size, gfp_t gfp_mask) +static inline void *vpmalloc(size_t size, gfp_t gfp_mask) { - return (void *) get_free_pages_noprof(gfp_mask|__GFP_NOWARN, - get_order(size)) ?: - __vmalloc_noprof(size, gfp_mask); + return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, + get_order(size)) ?: + __vmalloc(size, gfp_mask); } -#define vpmalloc(_size, _gfp) alloc_hooks(vpmalloc_noprof(_size, _gfp)) static inline void kvpfree(void *p, size_t size) { @@ -76,13 +75,12 @@ static inline void kvpfree(void *p, size_t size) vpfree(p, size); } -static inline void *kvpmalloc_noprof(size_t size, gfp_t gfp_mask) +static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) { return size < PAGE_SIZE - ? kmalloc_noprof(size, gfp_mask) - : vpmalloc_noprof(size, gfp_mask); + ? 
kmalloc(size, gfp_mask)
+		: vpmalloc(size, gfp_mask);
 }
-#define kvpmalloc(_size, _gfp)	alloc_hooks(kvpmalloc_noprof(_size, _gfp))
 
 int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
 
@@ -534,9 +532,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
 }
 
 void bch2_bio_map(struct bio *bio, void *base, size_t);
-int bch2_bio_alloc_pages_noprof(struct bio *, size_t, gfp_t);
-#define bch2_bio_alloc_pages(_bio, _size, _gfp)				\
-	alloc_hooks(bch2_bio_alloc_pages_noprof(_bio, _size, _gfp))
+int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
 
 static inline sector_t bdev_sectors(struct block_device *bdev)
 {
@@ -779,12 +775,12 @@ static inline void __move_gap(void *array, size_t element_size,
 
 #define bubble_sort(_base, _nr, _cmp)					\
 do {									\
-	ssize_t _i, _end;						\
+	ssize_t _i, _last;						\
 	bool _swapped = true;						\
 									\
-	for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\
+	for (_last = (ssize_t) (_nr) - 1; _last > 0 && _swapped; --_last) {\
 		_swapped = false;					\
-		for (_i = 0; _i < _end; _i++)				\
+		for (_i = 0; _i < _last; _i++)				\
 			if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) {	\
 				swap((_base)[_i], (_base)[_i + 1]);	\
 				_swapped = true;			\
diff --git a/libbcachefs/varint.c b/libbcachefs/varint.c
index 2a2ab86e..cb4f33ed 100644
--- a/libbcachefs/varint.c
+++ b/libbcachefs/varint.c
@@ -13,10 +13,9 @@
 
 /**
  * bch2_varint_encode - encode a variable length integer
- * @out - destination to encode to
- * @v - unsigned integer to encode
- *
- * Returns the size in bytes of the encoded integer - at most 9 bytes
+ * @out: destination to encode to
+ * @v: unsigned integer to encode
+ * Returns: size in bytes of the encoded integer - at most 9 bytes
  */
 int bch2_varint_encode(u8 *out, u64 v)
 {
@@ -40,11 +39,10 @@ int bch2_varint_encode(u8 *out, u64 v)
 
 /**
  * bch2_varint_decode - decode a variable length integer
- * @in - varint to decode
- * @end - end of buffer to decode from
- * @out - on success, decoded integer
- *
- * Returns the size in bytes of the decoded integer - or -1 on failure (would
+ * @in: varint to decode
+ * @end: end of buffer to decode from
+ * @out: on success, decoded integer
+ * Returns: size in bytes of the decoded integer - or -1 on failure (would
  * have read past the end of the buffer)
  */
 int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
 {
@@ -73,6 +71,9 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
 
 /**
  * bch2_varint_encode_fast - fast version of bch2_varint_encode
+ * @out: destination to encode to
+ * @v: unsigned integer to encode
+ * Returns: size in bytes of the encoded integer - at most 9 bytes
  *
  * This version assumes it's always safe to write 8 bytes to @out, even if the
  * encoded integer would be smaller.
@@ -96,6 +97,11 @@ int bch2_varint_encode_fast(u8 *out, u64 v)
 
 /**
  * bch2_varint_decode_fast - fast version of bch2_varint_decode
+ * @in: varint to decode
+ * @end: end of buffer to decode from
+ * @out: on success, decoded integer
+ * Returns: size in bytes of the decoded integer - or -1 on failure (would
+ * have read past the end of the buffer)
  *
  * This version assumes that it is safe to read at most 8 bytes past the end of
 * @end (we still return an error if the varint extends past @end).
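
The kernel-doc updates above make the varint contract explicit: encode returns the encoded size in bytes (at most 9), decode returns the size consumed, or -1 if the value would read past @end. A round-trip check built only on that contract might look like the following sketch; the helper name is illustrative and not part of this patch:

	#include <linux/types.h>
	#include "varint.h"

	/* Round-trip a value through the varint code; returns 0 on success.
	 * Relies only on the documented contract: encode returns the encoded
	 * size (at most 9 bytes), decode returns the size consumed or -1. */
	static int varint_check(u64 v)
	{
		u8 buf[9];		/* at most 9 bytes, per the kernel-doc */
		u64 decoded;
		int enc = bch2_varint_encode(buf, v);
		int dec = bch2_varint_decode(buf, buf + enc, &decoded);

		/* decode must consume exactly what encode produced */
		return enc >= 1 && dec == enc && decoded == v ? 0 : -1;
	}
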
diff --git a/libbcachefs/vstructs.h b/libbcachefs/vstructs.h index 53a694d7..a6561b4b 100644 --- a/libbcachefs/vstructs.h +++ b/libbcachefs/vstructs.h @@ -41,11 +41,11 @@ (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) #define vstruct_next(_s) \ - ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s))) + ((typeof(_s)) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) #define vstruct_last(_s) \ - ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s))) + ((typeof(&(_s)->start[0])) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) #define vstruct_end(_s) \ - ((void *) ((_s)->_data + __vstruct_u64s(_s))) + ((void *) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) #define vstruct_for_each(_s, _i) \ for (_i = (_s)->start; \ diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index 6f6b3caf..b069b1a6 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "acl.h" #include "bkey_methods.h" #include "btree_update.h" #include "extents.h" @@ -130,6 +131,13 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, xattr.v->x_name, le16_to_cpu(xattr.v->x_val_len), (char *) xattr_val(xattr.v)); + + if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS || + xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) { + prt_char(out, ' '); + bch2_acl_to_text(out, xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); + } } static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode, @@ -299,24 +307,22 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct bch_fs *c = dentry->d_sb->s_fs_info; struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct btree_trans trans; + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; u64 offset = 0, inum = inode->ei_inode.bi_inum; u32 snapshot; int ret; - - bch2_trans_init(&trans, c, 0, 0); retry: - bch2_trans_begin(&trans); + bch2_trans_begin(trans); iter = (struct btree_iter) { NULL }; - ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); if (ret) goto err; - for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs, + for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs, SPOS(inum, offset, snapshot), POS(inum, U64_MAX), 0, k, ret) { if (k.k->type != KEY_TYPE_xattr) @@ -328,12 +334,12 @@ retry: } offset = iter.pos.offset; - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - bch2_trans_exit(&trans); + bch2_trans_put(trans); if (ret) goto out; @@ -358,7 +364,7 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler, struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; int ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_xattr_get_trans(&trans, inode, name, buffer, size, handler->flags)); + bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags)); return bch2_err_class(ret); } @@ -373,18 +379,14 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct bch_inode_unpacked inode_u; - struct btree_trans trans; int ret; - bch2_trans_init(&trans, c, 0, 0); - - ret = commit_do(&trans, 
NULL, NULL, 0, - bch2_xattr_set(&trans, inode_inum(inode), &inode_u, + ret = bch2_trans_run(c, + commit_do(trans, NULL, NULL, 0, + bch2_xattr_set(trans, inode_inum(inode), &inode_u, &hash, name, value, size, - handler->flags, flags)); - if (!ret) - bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); - bch2_trans_exit(&trans); + handler->flags, flags)) ?: + (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0)); return bch2_err_class(ret); } diff --git a/linux/blkdev.c b/linux/blkdev.c index ea901a46..54af9f87 100644 --- a/linux/blkdev.c +++ b/linux/blkdev.c @@ -162,7 +162,7 @@ sector_t get_capacity(struct gendisk *disk) return bytes >> 9; } -void blkdev_put(struct block_device *bdev, fmode_t mode) +void blkdev_put(struct block_device *bdev, void *holder) { fdatasync(bdev->bd_fd); close(bdev->bd_sync_fd); @@ -170,25 +170,25 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) free(bdev); } -struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, - void *holder) +struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode, + void *holder, const struct blk_holder_ops *hop) { struct block_device *bdev; int fd, sync_fd, buffered_fd, flags = 0; - if ((mode & (FMODE_READ|FMODE_WRITE)) == (FMODE_READ|FMODE_WRITE)) + if ((mode & (BLK_OPEN_READ|BLK_OPEN_WRITE)) == (BLK_OPEN_READ|BLK_OPEN_WRITE)) flags = O_RDWR; - else if (mode & FMODE_READ) + else if (mode & BLK_OPEN_READ) flags = O_RDONLY; - else if (mode & FMODE_WRITE) + else if (mode & BLK_OPEN_WRITE) flags = O_WRONLY; - if (!(mode & FMODE_BUFFERED)) + if (!(mode & BLK_OPEN_BUFFERED)) flags |= O_DIRECT; #if 0 /* using O_EXCL doesn't work with opening twice for an O_SYNC fd: */ - if (mode & FMODE_EXCL) + if (mode & BLK_OPEN_EXCL) flags |= O_EXCL; #endif buffered_fd = open(path, flags & ~O_DIRECT); diff --git a/rust-src/bch_bindgen/src/bkey.rs b/rust-src/bch_bindgen/src/bkey.rs index 64697ea6..d4830839 100644 --- a/rust-src/bch_bindgen/src/bkey.rs +++ b/rust-src/bch_bindgen/src/bkey.rs @@ -47,6 +47,8 @@ pub enum BkeyValC<'a> { inode_v3(&'a c::bch_inode_v3), bucket_gens(&'a c::bch_bucket_gens), snapshot_tree(&'a c::bch_snapshot_tree), + logged_op_truncate(&'a c::bch_logged_op_truncate), + logged_op_finsert(&'a c::bch_logged_op_finsert), } impl<'a, 'b> BkeySC<'a> { @@ -96,6 +98,8 @@ impl<'a, 'b> BkeySC<'a> { KEY_TYPE_inode_v3 => inode_v3(unsafe { transmute(self.v) }), KEY_TYPE_bucket_gens => bucket_gens(unsafe { transmute(self.v) }), KEY_TYPE_snapshot_tree => snapshot_tree(unsafe { transmute(self.v) }), + KEY_TYPE_logged_op_truncate => logged_op_truncate(unsafe { transmute(self.v) }), + KEY_TYPE_logged_op_finsert => logged_op_finsert(unsafe { transmute(self.v) }), KEY_TYPE_MAX => unreachable!(), } } diff --git a/rust-src/bch_bindgen/src/btree.rs b/rust-src/bch_bindgen/src/btree.rs index 32b4e743..f738a466 100644 --- a/rust-src/bch_bindgen/src/btree.rs +++ b/rust-src/bch_bindgen/src/btree.rs @@ -11,24 +11,21 @@ use std::ptr; use bitflags::bitflags; pub struct BtreeTrans<'f> { - raw: c::btree_trans, + raw: *mut c::btree_trans, fs: PhantomData<&'f Fs> } impl<'f> BtreeTrans<'f> { pub fn new(fs: &'f Fs) -> BtreeTrans { unsafe { - let mut trans: MaybeUninit<c::btree_trans> = MaybeUninit::uninit(); - - c::__bch2_trans_init(&mut (*trans.as_mut_ptr()), fs.raw, 0); - BtreeTrans { raw: trans.assume_init(), fs: PhantomData } + BtreeTrans { raw: &mut *c::__bch2_trans_get(fs.raw, 0), fs: PhantomData } } } } impl<'f> Drop for BtreeTrans<'f> { fn drop(&mut self) { - unsafe { 
c::bch2_trans_exit(&mut self.raw) } + unsafe { c::bch2_trans_put(&mut *self.raw) } } } @@ -64,9 +61,9 @@ impl<'t> BtreeIter<'t> { let mut iter: MaybeUninit<c::btree_iter> = MaybeUninit::uninit(); c::bch2_trans_iter_init_outlined( - ptr::addr_of!(trans.raw).cast_mut(), + trans.raw, iter.as_mut_ptr(), - btree as u32, + btree, pos, flags.bits as u32); @@ -123,7 +120,7 @@ impl<'t> BtreeNodeIter<'t> { unsafe { let mut iter: MaybeUninit<c::btree_iter> = MaybeUninit::uninit(); c::bch2_trans_node_iter_init( - ptr::addr_of!(trans.raw).cast_mut(), + trans.raw, iter.as_mut_ptr(), btree, pos, diff --git a/rust-src/bch_bindgen/src/libbcachefs_wrapper.h b/rust-src/bch_bindgen/src/libbcachefs_wrapper.h index e7bcfcfb..e68de664 100644 --- a/rust-src/bch_bindgen/src/libbcachefs_wrapper.h +++ b/rust-src/bch_bindgen/src/libbcachefs_wrapper.h @@ -13,8 +13,8 @@ #include "../include/linux/blkdev.h" -#define MARK_FIX_753(req_name) const fmode_t Fix753_##req_name = req_name; +#define MARK_FIX_753(req_name) const blk_mode_t Fix753_##req_name = req_name; -MARK_FIX_753(FMODE_READ); -MARK_FIX_753(FMODE_WRITE); -MARK_FIX_753(FMODE_EXCL); \ No newline at end of file +MARK_FIX_753(BLK_OPEN_READ); +MARK_FIX_753(BLK_OPEN_WRITE); +MARK_FIX_753(BLK_OPEN_EXCL);
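
For context on the conversion that runs through this whole patch, and that the Rust BtreeTrans wrapper above now mirrors with __bch2_trans_get()/bch2_trans_put() in new()/drop(): btree transactions are now heap-allocated handles rather than on-stack structs. A minimal C sketch of the new lifecycle, following the same shape as bch2_compression_stats_to_text() above; the function name and the xattrs-btree walk are illustrative only:

	#include "bcachefs.h"
	#include "btree_iter.h"

	/* bch2_trans_get() replaces the old on-stack bch2_trans_init(), and
	 * bch2_trans_put() replaces bch2_trans_exit(); iterators still take
	 * the trans pointer and are torn down before the trans is put. */
	static int count_xattr_keys(struct bch_fs *c, u64 *nr)
	{
		struct btree_trans *trans = bch2_trans_get(c);
		struct btree_iter iter;
		struct bkey_s_c k;
		int ret = 0;

		*nr = 0;
		for_each_btree_key(trans, iter, BTREE_ID_xattrs, POS_MIN,
				   BTREE_ITER_ALL_SNAPSHOTS, k, ret)
			(*nr)++;
		bch2_trans_iter_exit(trans, &iter);
		bch2_trans_put(trans);
		return ret;
	}
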