From e61b61c03bf1f1eedc5e2dbd6887f77e45144a31 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 Sep 2021 18:19:46 -0400 Subject: [PATCH] Update bcachefs sources to 386f00b639 bcachefs: Snapshot creation, deletion --- .bcachefs_revision | 2 +- cmd_debug.c | 1 + cmd_migrate.c | 17 +- libbcachefs/acl.c | 25 +- libbcachefs/acl.h | 11 +- libbcachefs/bcachefs.h | 23 + libbcachefs/bcachefs_format.h | 59 +- libbcachefs/bcachefs_ioctl.h | 15 + libbcachefs/bkey.h | 2 +- libbcachefs/bkey_methods.c | 33 +- libbcachefs/btree_iter.c | 180 +++- libbcachefs/btree_iter.h | 9 + libbcachefs/btree_key_cache.c | 12 +- libbcachefs/btree_locking.h | 17 +- libbcachefs/btree_types.h | 18 +- libbcachefs/btree_update.h | 2 +- libbcachefs/btree_update_leaf.c | 204 ++++- libbcachefs/buckets.c | 3 + libbcachefs/dirent.c | 203 +++-- libbcachefs/dirent.h | 35 +- libbcachefs/extents.c | 32 - libbcachefs/extents.h | 1 - libbcachefs/fs-common.c | 288 +++++-- libbcachefs/fs-common.h | 26 +- libbcachefs/fs-io.c | 181 +++- libbcachefs/fs-ioctl.c | 176 +++- libbcachefs/fs.c | 161 ++-- libbcachefs/fs.h | 17 +- libbcachefs/fsck.c | 1376 ++++++++++++++++++++++++------- libbcachefs/inode.c | 128 ++- libbcachefs/inode.h | 7 +- libbcachefs/io.c | 128 +-- libbcachefs/io.h | 19 +- libbcachefs/io_types.h | 2 + libbcachefs/migrate.c | 6 +- libbcachefs/move.c | 84 +- libbcachefs/opts.c | 3 +- libbcachefs/opts.h | 12 +- libbcachefs/recovery.c | 126 ++- libbcachefs/reflink.c | 38 +- libbcachefs/reflink.h | 4 +- libbcachefs/str_hash.h | 48 +- libbcachefs/subvolume.c | 981 ++++++++++++++++++++++ libbcachefs/subvolume.h | 115 +++ libbcachefs/super.c | 4 + libbcachefs/xattr.c | 23 +- libbcachefs/xattr.h | 3 +- 47 files changed, 4025 insertions(+), 835 deletions(-) create mode 100644 libbcachefs/subvolume.c create mode 100644 libbcachefs/subvolume.h diff --git a/.bcachefs_revision b/.bcachefs_revision index d53addfb..76bc7256 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -bd6ed9fb42c0aa36d1f4a21eeab45fe12e1fb792 +386f00b6399a1eb38053c236aae87678f3535df7 diff --git a/cmd_debug.c b/cmd_debug.c index b3a6ea0c..aee19fbf 100644 --- a/cmd_debug.c +++ b/cmd_debug.c @@ -191,6 +191,7 @@ static void list_keys(struct bch_fs *c, enum btree_id btree_id, bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, btree_id, start, + BTREE_ITER_ALL_SNAPSHOTS| BTREE_ITER_PREFETCH, k, ret) { if (bkey_cmp(k.k->p, end) > 0) break; diff --git a/cmd_migrate.c b/cmd_migrate.c index 51260906..41cfe5d9 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -138,8 +138,9 @@ static void create_link(struct bch_fs *c, struct bch_inode_unpacked inode; int ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_link_trans(&trans, parent->bi_inum, inum, - &parent_u, &inode, &qstr)); + bch2_link_trans(&trans, + (subvol_inum) { 1, parent->bi_inum }, &parent_u, + (subvol_inum) { 1, inum }, &inode, &qstr)); if (ret) die("error creating hardlink: %s", strerror(-ret)); } @@ -155,9 +156,10 @@ static struct bch_inode_unpacked create_file(struct bch_fs *c, int ret = bch2_trans_do(c, NULL, NULL, 0, bch2_create_trans(&trans, - parent->bi_inum, parent, + (subvol_inum) { 1, parent->bi_inum }, parent, &new_inode, &qstr, - uid, gid, mode, rdev, NULL, NULL)); + uid, gid, mode, rdev, NULL, NULL, + (subvol_inum) {}, 0)); if (ret) die("error creating file: %s", strerror(-ret)); @@ -225,7 +227,9 @@ static void copy_xattrs(struct bch_fs *c, struct bch_inode_unpacked *dst, const struct xattr_handler *h = xattr_resolve_name(&attr); int ret = bch2_trans_do(c, NULL, NULL, 0, - 
bch2_xattr_set(&trans, dst->bi_inum, &hash_info, attr, + bch2_xattr_set(&trans, + (subvol_inum) { 1, dst->bi_inum }, + &hash_info, attr, val, val_size, h->flags, 0)); if (ret < 0) die("error creating xattr: %s", strerror(-ret)); @@ -569,7 +573,8 @@ static void copy_fs(struct bch_fs *c, int src_fd, const char *src_path, syncfs(src_fd); struct bch_inode_unpacked root_inode; - int ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, &root_inode); + int ret = bch2_inode_find_by_inum(c, (subvol_inum) { 1, BCACHEFS_ROOT_INO }, + &root_inode); if (ret) die("error looking up root directory: %s", strerror(-ret)); diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c index 2146a63d..f92b52e4 100644 --- a/libbcachefs/acl.c +++ b/libbcachefs/acl.c @@ -229,7 +229,7 @@ retry: bch2_trans_begin(&trans); ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc, - &hash, inode->v.i_ino, + &hash, inode_inum(inode), &X_SEARCH(acl_to_xattr_type(type), "", 0), 0); if (ret) { @@ -259,11 +259,11 @@ out: return acl; } -int bch2_set_acl_trans(struct btree_trans *trans, +int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode_u, - const struct bch_hash_info *hash_info, struct posix_acl *acl, int type) { + struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u); int ret; if (type == ACL_TYPE_DEFAULT && @@ -276,14 +276,14 @@ int bch2_set_acl_trans(struct btree_trans *trans, if (IS_ERR(xattr)) return PTR_ERR(xattr); - ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, - inode_u->bi_inum, &xattr->k_i, 0); + ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info, + inum, &xattr->k_i, 0); } else { struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); - ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info, - inode_u->bi_inum, &search); + ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info, + inum, &search); } return ret == -ENOENT ? 
0 : ret; @@ -297,7 +297,6 @@ int bch2_set_acl(struct user_namespace *mnt_userns, struct btree_trans trans; struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; - struct bch_hash_info hash_info; struct posix_acl *acl; umode_t mode; int ret; @@ -308,7 +307,7 @@ retry: bch2_trans_begin(&trans); acl = _acl; - ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode->v.i_ino, + ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), BTREE_ITER_INTENT); if (ret) goto btree_err; @@ -321,9 +320,7 @@ retry: goto btree_err; } - hash_info = bch2_hash_info_init(c, &inode_u); - - ret = bch2_set_acl_trans(&trans, &inode_u, &hash_info, acl, type); + ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type); if (ret) goto btree_err; @@ -352,7 +349,7 @@ err: return ret; } -int bch2_acl_chmod(struct btree_trans *trans, +int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode, umode_t mode, struct posix_acl **new_acl) @@ -366,7 +363,7 @@ int bch2_acl_chmod(struct btree_trans *trans, int ret; ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, - &hash_info, inode->bi_inum, + &hash_info, inum, &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), BTREE_ITER_INTENT); if (ret) diff --git a/libbcachefs/acl.h b/libbcachefs/acl.h index 25fc54dd..2ad214bd 100644 --- a/libbcachefs/acl.h +++ b/libbcachefs/acl.h @@ -28,25 +28,24 @@ typedef struct { struct posix_acl *bch2_get_acl(struct inode *, int); -int bch2_set_acl_trans(struct btree_trans *, +int bch2_set_acl_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, - const struct bch_hash_info *, struct posix_acl *, int); int bch2_set_acl(struct user_namespace *, struct inode *, struct posix_acl *, int); -int bch2_acl_chmod(struct btree_trans *, struct bch_inode_unpacked *, +int bch2_acl_chmod(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, umode_t, struct posix_acl **); #else -static inline int bch2_set_acl_trans(struct btree_trans *trans, +static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode_u, - const struct bch_hash_info *hash_info, struct posix_acl *acl, int type) { return 0; } -static inline int bch2_acl_chmod(struct btree_trans *trans, +static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode, umode_t mode, struct posix_acl **new_acl) diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 9975fc17..0efb1aaa 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -380,6 +380,8 @@ enum gc_phase { GC_PHASE_BTREE_alloc, GC_PHASE_BTREE_quotas, GC_PHASE_BTREE_reflink, + GC_PHASE_BTREE_subvolumes, + GC_PHASE_BTREE_snapshots, GC_PHASE_PENDING_DELETE, }; @@ -563,6 +565,21 @@ struct btree_path_buf { #define REPLICAS_DELTA_LIST_MAX (1U << 16) +struct snapshot_t { + u32 parent; + u32 children[2]; + u32 subvol; /* Nonzero only if a subvolume points to this node: */ + u32 equiv; +}; + +typedef struct { + u32 subvol; + u64 inum; +} subvol_inum; + +#define BCACHEFS_ROOT_SUBVOL_INUM \ + ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) + struct bch_fs { struct closure cl; @@ -634,6 +651,12 @@ struct bch_fs { struct closure sb_write; struct mutex sb_lock; + /* snapshot.c: */ + GENRADIX(struct snapshot_t) snapshots; + struct bch_snapshot_table __rcu *snapshot_table; + struct mutex snapshot_table_lock; + struct work_struct snapshot_delete_work; + /* BTREE CACHE */ struct bio_set btree_bio; 
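
The snapshot_t nodes filled into the new GENRADIX above are the in-memory mirror of the KEY_TYPE_snapshot keys: each records its parent, up to two children, and the subvolume (if any) that points at it, and they are what the snapshot-visibility checks later in this patch walk. The accessor itself lives in the newly added subvolume.h, whose contents are not shown in this excerpt; a minimal sketch of its likely shape, under the assumption that snapshot IDs are handed out downward from U32_MAX so the radix tree stays dense:

static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
{
	/* Index by distance from U32_MAX (assumption: IDs grow downward): */
	return genradix_ptr(&c->snapshots, U32_MAX - id);
}

This is the helper used as snapshot_t(c, pos.snapshot)->children[0] in the btree_update_leaf.c hunks further down.
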
struct workqueue_struct *io_complete_wq; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 98779e46..c082d5fc 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -323,7 +323,7 @@ static inline void bkey_init(struct bkey *k) */ #define BCH_BKEY_TYPES() \ x(deleted, 0) \ - x(discard, 1) \ + x(whiteout, 1) \ x(error, 2) \ x(cookie, 3) \ x(hash_whiteout, 4) \ @@ -342,7 +342,9 @@ static inline void bkey_init(struct bkey *k) x(inline_data, 17) \ x(btree_ptr_v2, 18) \ x(indirect_inline_data, 19) \ - x(alloc_v2, 20) + x(alloc_v2, 20) \ + x(subvolume, 21) \ + x(snapshot, 22) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -355,7 +357,7 @@ struct bch_deleted { struct bch_val v; }; -struct bch_discard { +struct bch_whiteout { struct bch_val v; }; @@ -686,6 +688,10 @@ struct bch_inode_generation { __le32 pad; } __attribute__((packed, aligned(8))); +/* + * bi_subvol and bi_parent_subvol are only set for subvolume roots: + */ + #define BCH_INODE_FIELDS() \ x(bi_atime, 96) \ x(bi_ctime, 96) \ @@ -709,7 +715,9 @@ struct bch_inode_generation { x(bi_erasure_code, 16) \ x(bi_fields_set, 16) \ x(bi_dir, 64) \ - x(bi_dir_offset, 64) + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ @@ -792,6 +800,9 @@ struct bch_dirent { __u8 d_name[]; } __attribute__((packed, aligned(8))); +#define DT_SUBVOL 16 +#define BCH_DT_MAX 17 + #define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \ sizeof(struct bkey) - \ offsetof(struct bch_dirent, d_name)) @@ -928,6 +939,42 @@ struct bch_inline_data { u8 data[0]; }; +/* Subvolumes: */ + +#define SUBVOL_POS_MIN POS(0, 1) +#define SUBVOL_POS_MAX POS(0, S32_MAX) +#define BCACHEFS_ROOT_SUBVOL 1 + +struct bch_subvolume { + struct bch_val v; + __le32 flags; + __le32 snapshot; + __le64 inode; +}; + +LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) +/* + * We need to know whether a subvolume is a snapshot so we can know whether we + * can delete it (or whether it should just be rm -rf'd) + */ +LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) + +/* Snapshots */ + +struct bch_snapshot { + struct bch_val v; + __le32 flags; + __le32 parent; + __le32 children[2]; + __le32 subvol; + __le32 pad; +}; + +LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) + +/* True if a subvolume points to this snapshot node: */ +LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1695,7 +1742,9 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); x(alloc, 4) \ x(quotas, 5) \ x(stripes, 6) \ - x(reflink, 7) + x(reflink, 7) \ + x(subvolumes, 8) \ + x(snapshots, 9) enum btree_id { #define x(kwd, val) BTREE_ID_##kwd = val, diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h index f679fc21..930981ad 100644 --- a/libbcachefs/bcachefs_ioctl.h +++ b/libbcachefs/bcachefs_ioctl.h @@ -78,6 +78,9 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) #define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal) +#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume) +#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume) + /* ioctl below act on a particular file, not the filesystem as a whole: */ #define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) @@ -349,4 
+352,16 @@ struct bch_ioctl_disk_resize_journal { __u64 nbuckets; }; +struct bch_ioctl_subvolume { + __u32 flags; + __u32 dirfd; + __u16 mode; + __u16 pad[3]; + __u64 dst_ptr; + __u64 src_ptr; +}; + +#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0) +#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1) + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index c4a66f28..7dee3d8e 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -55,7 +55,7 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) #define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) #define bkey_whiteout(_k) \ - ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard) + ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) enum bkey_lr_packed { BKEY_PACKED_BOTH, diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index a03b5514..874defd8 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -11,6 +11,7 @@ #include "inode.h" #include "quota.h" #include "reflink.h" +#include "subvolume.h" #include "xattr.h" const char * const bch2_bkey_types[] = { @@ -30,7 +31,7 @@ static const char *deleted_key_invalid(const struct bch_fs *c, .key_invalid = deleted_key_invalid, \ } -#define bch2_bkey_ops_discard (struct bkey_ops) { \ +#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ .key_invalid = deleted_key_invalid, \ } @@ -100,6 +101,8 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) static unsigned bch2_key_types_allowed[] = { [BKEY_TYPE_extents] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| (1U << KEY_TYPE_error)| (1U << KEY_TYPE_cookie)| (1U << KEY_TYPE_extent)| @@ -107,26 +110,43 @@ static unsigned bch2_key_types_allowed[] = { (1U << KEY_TYPE_reflink_p)| (1U << KEY_TYPE_inline_data), [BKEY_TYPE_inodes] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| (1U << KEY_TYPE_inode)| (1U << KEY_TYPE_inode_generation), [BKEY_TYPE_dirents] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| (1U << KEY_TYPE_hash_whiteout)| (1U << KEY_TYPE_dirent), [BKEY_TYPE_xattrs] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| (1U << KEY_TYPE_cookie)| (1U << KEY_TYPE_hash_whiteout)| (1U << KEY_TYPE_xattr), [BKEY_TYPE_alloc] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_alloc)| (1U << KEY_TYPE_alloc_v2), [BKEY_TYPE_quotas] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_quota), [BKEY_TYPE_stripes] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_stripe), [BKEY_TYPE_reflink] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_reflink_v)| (1U << KEY_TYPE_indirect_inline_data), + [BKEY_TYPE_subvolumes] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_subvolume), + [BKEY_TYPE_snapshots] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_snapshot), [BKEY_TYPE_btree] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_btree_ptr)| (1U << KEY_TYPE_btree_ptr_v2), }; @@ -134,21 +154,18 @@ static unsigned bch2_key_types_allowed[] = { const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type) { - unsigned key_types_allowed = (1U << KEY_TYPE_deleted)| - bch2_key_types_allowed[type] ; - if (k.k->u64s < BKEY_U64s) return "u64s too small"; - if (!(key_types_allowed & (1U << k.k->type))) + if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) return "invalid key type for this btree"; if (type == BKEY_TYPE_btree && bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) return "value too big"; - if (btree_node_type_is_extents(type)) { - if ((k.k->size == 0) != bkey_deleted(k.k)) + if 
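
Stepping back to the BCH_IOCTL_SUBVOLUME_CREATE / _DESTROY ioctls and struct bch_ioctl_subvolume introduced a few hunks up: the kernel-side handlers are in the fs-ioctl.c portion of this patch (not shown in this excerpt). A rough userspace sketch of driving the create path, assuming dst_ptr/src_ptr carry user pointers to NUL-terminated paths and dirfd is an openat()-style directory fd; treat the field semantics here as illustrative, the fs-ioctl.c hunks are authoritative:

/* plus the struct bch_ioctl_subvolume / BCH_IOCTL_* definitions above */
#include <fcntl.h>
#include <sys/ioctl.h>

/* Create a read-only snapshot of the subvolume at @src, at path @dst.
 * @fs_fd: an fd on the mounted filesystem. */
static int bch_snapshot_create(int fs_fd, const char *src, const char *dst)
{
	struct bch_ioctl_subvolume i = {
		.flags	 = BCH_SUBVOL_SNAPSHOT_CREATE|BCH_SUBVOL_SNAPSHOT_RO,
		.dirfd	 = AT_FDCWD,
		.mode	 = 0777,
		.src_ptr = (unsigned long) src,	/* 0 would mean: create an empty subvolume */
		.dst_ptr = (unsigned long) dst,
	};

	return ioctl(fs_fd, BCH_IOCTL_SUBVOLUME_CREATE, &i);
}
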
(btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { + if (k.k->size == 0) return "bad size field"; if (k.k->size > k.k->p.offset) @@ -165,7 +182,7 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, if (type != BKEY_TYPE_btree && btree_type_has_snapshots(type) && - k.k->p.snapshot != U32_MAX) + !k.k->p.snapshot) return "invalid snapshot field"; if (type != BKEY_TYPE_btree && diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index ce4d7c7e..b5484d77 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -13,6 +13,7 @@ #include "extents.h" #include "journal.h" #include "replicas.h" +#include "subvolume.h" #include #include @@ -152,7 +153,7 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || (btree_node_lock_seq_matches(path, b, level) && btree_node_lock_increment(trans, b, level, want))) { - mark_btree_node_locked(trans, path, level, want); + mark_btree_node_locked(path, level, want); return true; } else { return false; @@ -188,7 +189,7 @@ static bool bch2_btree_node_upgrade(struct btree_trans *trans, return false; success: - mark_btree_node_intent_locked(trans, path, level); + mark_btree_node_intent_locked(path, level); return true; } @@ -674,6 +675,9 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) { + BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + !iter->pos.snapshot); + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && iter->pos.snapshot != iter->snapshot); @@ -681,6 +685,55 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) bkey_cmp(iter->pos, iter->k.p) > 0); } +static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) +{ + struct btree_trans *trans = iter->trans; + struct btree_iter copy; + struct bkey_s_c prev; + int ret = 0; + + if (!bch2_debug_check_iterators) + return 0; + + if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) + return 0; + + if (bkey_err(k) || !k.k) + return 0; + + BUG_ON(!bch2_snapshot_is_ancestor(trans->c, + iter->snapshot, + k.k->p.snapshot)); + + bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, + BTREE_ITER_ALL_SNAPSHOTS); + prev = bch2_btree_iter_prev(©); + if (!prev.k) + goto out; + + ret = bkey_err(prev); + if (ret) + goto out; + + if (!bkey_cmp(prev.k->p, k.k->p) && + bch2_snapshot_is_ancestor(trans->c, iter->snapshot, + prev.k->p.snapshot) > 0) { + char buf1[100], buf2[200]; + + bch2_bkey_to_text(&PBUF(buf1), k.k); + bch2_bkey_to_text(&PBUF(buf2), prev.k); + + panic("iter snap %u\n" + "k %s\n" + "prev %s\n", + iter->snapshot, + buf1, buf2); + } +out: + bch2_trans_iter_exit(trans, ©); + return ret; +} + #else static inline void bch2_btree_path_verify_level(struct btree_trans *trans, @@ -689,6 +742,7 @@ static inline void bch2_btree_path_verify(struct btree_trans *trans, struct btree_path *path) {} static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} +static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; } #endif @@ -896,12 +950,12 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, bch2_btree_node_iter_peek_all(&l->iter, l->b)); } -static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, +static inline struct bkey_s_c btree_path_level_peek(struct bch_fs *c, struct btree_path *path, struct 
btree_path_level *l, struct bkey *u) { - struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, + struct bkey_s_c k = __btree_iter_unpack(c, l, u, bch2_btree_node_iter_peek(&l->iter, l->b)); path->pos = k.k ? k.k->p : l->b->key.k.p; @@ -1041,7 +1095,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) t != BTREE_NODE_UNLOCKED) { btree_node_unlock(path, b->c.level); six_lock_increment(&b->c.lock, t); - mark_btree_node_locked(trans, path, b->c.level, t); + mark_btree_node_locked(path, b->c.level, t); } btree_path_level_init(trans, path, b); @@ -1118,7 +1172,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) path->l[i].b = NULL; - mark_btree_node_locked(trans, path, path->level, lock_type); + mark_btree_node_locked(path, path->level, lock_type); btree_path_level_init(trans, path, b); return 0; } @@ -1210,7 +1264,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans, if (unlikely(ret)) goto err; - mark_btree_node_locked(trans, path, level, lock_type); + mark_btree_node_locked(path, level, lock_type); btree_path_level_init(trans, path, b); if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && @@ -1252,10 +1306,6 @@ retry_all: btree_trans_verify_sorted(trans); -#ifdef CONFIG_BCACHEFS_DEBUG - trans->traverse_all_idx = U8_MAX; -#endif - for (i = trans->nr_sorted - 2; i >= 0; --i) { struct btree_path *path1 = trans->paths + trans->sorted[i]; struct btree_path *path2 = trans->paths + trans->sorted[i + 1]; @@ -1294,9 +1344,6 @@ retry_all: path = trans->paths + trans->sorted[i]; EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); -#ifdef CONFIG_BCACHEFS_DEBUG - trans->traverse_all_idx = path->idx; -#endif ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); if (ret) @@ -1985,11 +2032,25 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) } if (likely(k.k)) { - if (likely(!bkey_deleted(k.k))) - break; + /* + * We can never have a key in a leaf node at POS_MAX, so + * we don't have to check these successor() calls: + */ + if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + !bch2_snapshot_is_ancestor(trans->c, + iter->snapshot, + k.k->p.snapshot)) { + search_key = bpos_successor(k.k->p); + continue; + } - /* Advance to next key: */ - search_key = bkey_successor(iter, k.k->p); + if (bkey_whiteout(k.k) && + !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { + search_key = bkey_successor(iter, k.k->p); + continue; + } + + break; } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { /* Advance to next leaf node: */ search_key = bpos_successor(iter->path->l[0].b->key.k.p); @@ -2010,6 +2071,9 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + iter->pos.snapshot = iter->snapshot; + cmp = bpos_cmp(k.k->p, iter->path->pos); if (cmp) { iter->path = bch2_btree_path_make_mut(trans, iter->path, @@ -2022,6 +2086,10 @@ out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); + ret = bch2_btree_iter_verify_ret(iter, k); + if (unlikely(ret)) + return bkey_s_c_err(ret); + return k; } @@ -2045,7 +2113,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; struct bpos search_key = iter->pos; + struct btree_path *saved_path = NULL; struct bkey_s_c k; + struct bkey saved_k; + const struct bch_val *saved_v; int ret; EBUG_ON(iter->path->cached || 
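
The snapshot filter just added to bch2_btree_iter_peek() - and the bch2_btree_iter_verify_ret() check earlier - both hinge on bch2_snapshot_is_ancestor(), which is defined in the new subvolume.h and so is not visible in this excerpt. Conceptually it answers "is this key's snapshot the iterator's snapshot or one of its ancestors?"; a sketch, assuming parents always have larger IDs than their children so the upward walk can terminate early:

static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
	/*
	 * Walk from @id toward the root via the parent pointers in
	 * struct snapshot_t; stop once we reach or pass @ancestor:
	 */
	while (id && id < ancestor)
		id = snapshot_t(c, id)->parent;

	return id == ancestor;
}

With that, the loop above reads: skip any key that is not visible from iter->snapshot, i.e. whose snapshot is neither iter->snapshot itself nor an ancestor of it.
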
iter->path->level); @@ -2053,6 +2124,9 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + search_key.snapshot = U32_MAX; + while (1) { iter->path = btree_path_set_pos(trans, iter->path, search_key, iter->flags & BTREE_ITER_INTENT); @@ -2065,18 +2139,61 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) goto out; } - k = btree_path_level_peek(trans, iter->path, + k = btree_path_level_peek(trans->c, iter->path, &iter->path->l[0], &iter->k); if (!k.k || ((iter->flags & BTREE_ITER_IS_EXTENTS) - ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0 - : bkey_cmp(k.k->p, iter->pos) > 0)) + ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0 + : bpos_cmp(k.k->p, search_key) > 0)) k = btree_path_level_prev(trans->c, iter->path, &iter->path->l[0], &iter->k); btree_path_check_sort(trans, iter->path, 0); if (likely(k.k)) { + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { + if (k.k->p.snapshot == iter->snapshot) + goto got_key; + + /* + * If we have a saved candidate, and we're no + * longer at the same _key_ (not pos), return + * that candidate + */ + if (saved_path && bkey_cmp(k.k->p, saved_k.p)) { + bch2_path_put(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + iter->path = saved_path; + saved_path = NULL; + iter->k = saved_k; + k.v = saved_v; + goto got_key; + } + + if (bch2_snapshot_is_ancestor(iter->trans->c, + iter->snapshot, + k.k->p.snapshot)) { + if (saved_path) + bch2_path_put(trans, saved_path, + iter->flags & BTREE_ITER_INTENT); + saved_path = btree_path_clone(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + saved_k = *k.k; + saved_v = k.v; + } + + search_key = bpos_predecessor(k.k->p); + continue; + } +got_key: + if (bkey_whiteout(k.k) && + !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { + search_key = bkey_predecessor(iter, k.k->p); + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + search_key.snapshot = U32_MAX; + continue; + } + break; } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) { /* Advance to previous leaf node: */ @@ -2094,7 +2211,12 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) /* Extents can straddle iter->pos: */ if (bkey_cmp(k.k->p, iter->pos) < 0) iter->pos = k.k->p; + + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + iter->pos.snapshot = iter->snapshot; out: + if (saved_path) + bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT); iter->path->should_be_locked = true; bch2_btree_iter_verify_entry_exit(iter); @@ -2143,7 +2265,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (unlikely(ret)) return bkey_s_c_err(ret); - if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) { + if ((iter->flags & BTREE_ITER_CACHED) || + !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { struct bkey_i *next_update; next_update = iter->flags & BTREE_ITER_WITH_UPDATES @@ -2202,6 +2325,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); + ret = bch2_btree_iter_verify_ret(iter, k); + if (unlikely(ret)) + return bkey_s_c_err(ret); return k; } @@ -2352,13 +2478,13 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, btree_node_type_is_extents(btree_id)) flags |= BTREE_ITER_IS_EXTENTS; - if (!btree_type_has_snapshots(btree_id) && - !(flags & __BTREE_ITER_ALL_SNAPSHOTS)) + if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && + 
!btree_type_has_snapshots(btree_id)) flags &= ~BTREE_ITER_ALL_SNAPSHOTS; - if (!(flags & BTREE_ITER_ALL_SNAPSHOTS)) - pos.snapshot = btree_type_has_snapshots(btree_id) - ? U32_MAX : 0; + if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && + btree_type_has_snapshots(btree_id)) + flags |= BTREE_ITER_FILTER_SNAPSHOTS; iter->trans = trans; iter->path = NULL; diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index be1bb489..19ca73f5 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -234,6 +234,15 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it iter->pos = bkey_start_pos(&iter->k); } +static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot) +{ + struct bpos pos = iter->pos; + + iter->snapshot = snapshot; + pos.snapshot = snapshot; + bch2_btree_iter_set_pos(iter, pos); +} + /* * Unlocks before scheduling * Note: does not revalidate iterator diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 938ced36..4f1bc1d1 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -163,6 +163,11 @@ btree_key_cache_create(struct btree_key_cache *c, was_new = false; } + if (btree_id == BTREE_ID_subvolumes) + six_lock_pcpu_alloc(&ck->c.lock); + else + six_lock_pcpu_free(&ck->c.lock); + ck->c.level = 0; ck->c.btree_id = btree_id; ck->key.btree_id = btree_id; @@ -296,7 +301,7 @@ retry: if (!ck) goto retry; - mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + mark_btree_node_locked(path, 0, SIX_LOCK_intent); path->locks_want = 1; } else { enum six_lock_type lock_want = __btree_lock_want(path, 0); @@ -318,7 +323,7 @@ retry: goto retry; } - mark_btree_node_locked(trans, path, 0, lock_want); + mark_btree_node_locked(path, 0, lock_want); } path->l[0].lock_seq = ck->c.lock.state.seq; @@ -366,7 +371,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, BTREE_ITER_SLOTS| - BTREE_ITER_INTENT); + BTREE_ITER_INTENT| + BTREE_ITER_ALL_SNAPSHOTS); bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 5c6b7580..d599008c 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -58,8 +58,7 @@ static inline void mark_btree_node_unlocked(struct btree_path *path, path->nodes_intent_locked &= ~(1 << level); } -static inline void mark_btree_node_locked(struct btree_trans *trans, - struct btree_path *path, +static inline void mark_btree_node_locked(struct btree_path *path, unsigned level, enum six_lock_type type) { @@ -69,19 +68,12 @@ static inline void mark_btree_node_locked(struct btree_trans *trans, path->nodes_locked |= 1 << level; path->nodes_intent_locked |= type << level; -#ifdef CONFIG_BCACHEFS_DEBUG - path->ip_locked = _RET_IP_; - BUG_ON(trans->in_traverse_all && - trans->traverse_all_idx != U8_MAX && - path->sorted_idx > trans->paths[trans->traverse_all_idx].sorted_idx); -#endif } -static inline void mark_btree_node_intent_locked(struct btree_trans *trans, - struct btree_path *path, +static inline void mark_btree_node_intent_locked(struct btree_path *path, unsigned level) { - mark_btree_node_locked(trans, path, level, SIX_LOCK_intent); + mark_btree_node_locked(path, level, SIX_LOCK_intent); } static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) @@ -120,9 +112,6 @@ static inline void __bch2_btree_path_unlock(struct 
btree_path *path) while (path->nodes_locked) btree_node_unlock(path, __ffs(path->nodes_locked)); -#ifdef CONFIG_BCACHEFS_DEBUG - path->ip_locked = 0; -#endif } static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index ccf91ebd..7fcd2ceb 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -209,6 +209,7 @@ struct btree_node_iter { #define BTREE_ITER_WITH_UPDATES (1 << 10) #define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11) #define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) +#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 13) enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, @@ -255,7 +256,6 @@ struct btree_path { } l[BTREE_MAX_DEPTH]; #ifdef CONFIG_BCACHEFS_DEBUG unsigned long ip_allocated; - unsigned long ip_locked; #endif }; @@ -369,7 +369,6 @@ struct btree_trans { struct bpos locking_pos; u8 locking_btree_id; u8 locking_level; - u8 traverse_all_idx; pid_t pid; #endif unsigned long ip; @@ -607,7 +606,8 @@ static inline bool btree_node_is_extents(struct btree *b) #define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ ((1U << BKEY_TYPE_alloc)| \ - (1U << BKEY_TYPE_stripes)) + (1U << BKEY_TYPE_stripes)| \ + (1U << BKEY_TYPE_snapshots)) #define BTREE_NODE_TYPE_HAS_TRIGGERS \ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ @@ -654,7 +654,8 @@ enum btree_update_flags { #define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ ((1U << KEY_TYPE_stripe)| \ - (1U << KEY_TYPE_inode)) + (1U << KEY_TYPE_inode)| \ + (1U << KEY_TYPE_snapshot)) static inline bool btree_node_type_needs_gc(enum btree_node_type type) { @@ -671,11 +672,6 @@ struct btree_root { s8 error; }; -/* - * Optional hook that will be called just prior to a btree node update, when - * we're holding the write lock and we know what key is about to be overwritten: - */ - enum btree_insert_ret { BTREE_INSERT_OK, /* leaf node needs to be split */ @@ -696,8 +692,4 @@ enum btree_node_sibling { btree_next_sib, }; -typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *, - struct btree *, - struct btree_node_iter *); - #endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 23b73d3a..4d0ece34 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -61,7 +61,7 @@ int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, u64 *, int flags); int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, - struct bpos, struct bpos, u64 *); + struct bpos, struct bpos, unsigned, u64 *); int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, u64 *); diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index a0da9673..f69f919d 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -15,6 +15,7 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" +#include "subvolume.h" #include "replicas.h" #include @@ -245,6 +246,11 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, BUG_ON(i->cached != i->path->cached); BUG_ON(i->level != i->path->level); BUG_ON(i->btree_id != i->path->btree_id); + EBUG_ON(!i->level && + !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && + test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && + i->k->k.p.snapshot && + bch2_snapshot_internal_node(trans->c, i->k->k.p.snapshot)); } static noinline int @@ -934,6 +940,43 @@ err: goto retry; } +static int check_pos_snapshot_overwritten(struct btree_trans *trans, + 
enum btree_id id, + struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + if (!snapshot_t(c, pos.snapshot)->children[0]) + return 0; + + bch2_trans_iter_init(trans, &iter, id, pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while (1) { + k = bch2_btree_iter_prev(&iter); + ret = bkey_err(k); + if (ret) + break; + + if (!k.k) + break; + + if (bkey_cmp(pos, k.k->p)) + break; + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { + ret = 1; + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + static int bch2_trans_update_extent(struct btree_trans *trans, struct btree_iter *orig_iter, struct bkey_i *insert, @@ -958,6 +1001,28 @@ static int bch2_trans_update_extent(struct btree_trans *trans, goto out; if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { + /* + * We can't merge extents if they belong to interior snapshot + * tree nodes, and there's a snapshot in which one extent is + * visible and the other is not - i.e. if visibility is + * different. + * + * Instead of checking if visibilitiy of the two extents is + * different, for now we just check if either has been + * overwritten: + */ + ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p); + if (ret < 0) + goto err; + if (ret) + goto nomerge1; + + ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p); + if (ret < 0) + goto err; + if (ret) + goto nomerge1; + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) goto err; @@ -973,22 +1038,26 @@ static int bch2_trans_update_extent(struct btree_trans *trans, goto next; } } - - if (!bkey_cmp(k.k->p, bkey_start_pos(&insert->k))) +nomerge1: + ret = 0; + if (!bkey_cmp(k.k->p, start)) goto next; while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) { + bool front_split = bkey_cmp(bkey_start_pos(k.k), start) < 0; + bool back_split = bkey_cmp(k.k->p, insert->k.p) > 0; + /* * If we're going to be splitting a compressed extent, note it * so that __bch2_trans_commit() can increase our disk * reservation: */ - if (bkey_cmp(bkey_start_pos(k.k), start) < 0 && - bkey_cmp(k.k->p, insert->k.p) > 0 && + if (((front_split && back_split) || + ((front_split || back_split) && k.k->p.snapshot != insert->k.p.snapshot)) && (compressed_sectors = bch2_bkey_sectors_compressed(k))) trans->extra_journal_res += compressed_sectors; - if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { + if (front_split) { update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) goto err; @@ -999,6 +1068,32 @@ static int bch2_trans_update_extent(struct btree_trans *trans, bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&update_iter) ?: + bch2_trans_update(trans, &update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags); + bch2_trans_iter_exit(trans, &update_iter); + + if (ret) + goto err; + } + + if (k.k->p.snapshot != insert->k.p.snapshot && + (front_split || back_split)) { + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_reassemble(update, k); + + bch2_cut_front(start, update); + bch2_cut_back(insert->k.p, update); + + bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS| BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(&update_iter) ?: bch2_trans_update(trans, &update_iter, update, 
@@ -1010,12 +1105,32 @@ static int bch2_trans_update_extent(struct btree_trans *trans, } if (bkey_cmp(k.k->p, insert->k.p) <= 0) { - ret = bch2_btree_delete_at(trans, &iter, flags); + update = bch2_trans_kmalloc(trans, sizeof(*update)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_init(&update->k); + update->k.p = k.k->p; + + if (insert->k.p.snapshot != k.k->p.snapshot) { + update->k.p.snapshot = insert->k.p.snapshot; + update->k.type = KEY_TYPE_whiteout; + } + + bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&update_iter) ?: + bch2_trans_update(trans, &update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags); + bch2_trans_iter_exit(trans, &update_iter); + if (ret) goto err; } - if (bkey_cmp(k.k->p, insert->k.p) > 0) { + if (back_split) { update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) goto err; @@ -1023,10 +1138,15 @@ static int bch2_trans_update_extent(struct btree_trans *trans, bkey_reassemble(update, k); bch2_cut_front(insert->k.p, update); - ret = bch2_trans_update(trans, &iter, update, flags); + bch2_trans_copy_iter(&update_iter, &iter); + update_iter.pos = update->k.p; + ret = bch2_trans_update(trans, &update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags); + bch2_trans_iter_exit(trans, &update_iter); + if (ret) goto err; - goto out; } next: @@ -1037,7 +1157,23 @@ next: goto out; } - bch2_bkey_merge(c, bkey_i_to_s(insert), k); + if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { + ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p); + if (ret < 0) + goto out; + if (ret) + goto nomerge2; + + ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p); + if (ret < 0) + goto out; + if (ret) + goto nomerge2; + + bch2_bkey_merge(c, bkey_i_to_s(insert), k); + } +nomerge2: + ret = 0; out: if (!bkey_deleted(&insert->k)) { /* @@ -1057,6 +1193,39 @@ err: return ret; } +/* + * When deleting, check if we need to emit a whiteout (because we're overwriting + * something in an ancestor snapshot) + */ +static int need_whiteout_for_snapshot(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + u32 snapshot = pos.snapshot; + int ret; + + if (!bch2_snapshot_parent(trans->c, pos.snapshot)) + return 0; + + pos.snapshot++; + + for_each_btree_key(trans, iter, btree_id, pos, + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (bkey_cmp(k.k->p, pos)) + break; + + if (bch2_snapshot_is_ancestor(trans->c, snapshot, + k.k->p.snapshot)) { + ret = !bkey_whiteout(k.k); + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_update_flags flags) { @@ -1089,6 +1258,16 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, btree_insert_entry_cmp(i - 1, i) >= 0); #endif + if (bkey_deleted(&n.k->k) && + (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { + int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p); + if (unlikely(ret < 0)) + return ret; + + if (ret) + n.k->k.type = KEY_TYPE_whiteout; + } + /* * Pending updates are kept sorted: first, find position of new update, * then delete/trim any updates the new update overwrites: @@ -1175,13 +1354,14 @@ int bch2_btree_delete_at(struct btree_trans *trans, int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, struct bpos start, struct bpos end, + 
unsigned iter_flags, u64 *journal_seq) { struct btree_iter iter; struct bkey_s_c k; int ret = 0; - bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags); retry: while ((bch2_trans_begin(trans), (k = bch2_btree_iter_peek(&iter)).k) && @@ -1248,5 +1428,5 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, u64 *journal_seq) { return bch2_trans_do(c, NULL, journal_seq, 0, - bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq)); + bch2_btree_delete_range_trans(&trans, id, start, end, 0, journal_seq)); } diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index df12416e..5fd3aabb 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -16,6 +16,7 @@ #include "movinggc.h" #include "reflink.h" #include "replicas.h" +#include "subvolume.h" #include #include @@ -1200,6 +1201,8 @@ static int bch2_mark_key_locked(struct bch_fs *c, return bch2_mark_reservation(c, old, new, journal_seq, flags); case KEY_TYPE_reflink_p: return bch2_mark_reflink_p(c, old, new, journal_seq, flags); + case KEY_TYPE_snapshot: + return bch2_mark_snapshot(c, old, new, journal_seq, flags); default: return 0; } diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index 1d510f77..8653a106 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -8,6 +8,7 @@ #include "fs.h" #include "keylist.h" #include "str_hash.h" +#include "subvolume.h" #include @@ -99,7 +100,8 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) if (memchr(d.v->d_name, '/', len)) return "invalid name"; - if (le64_to_cpu(d.v->d_inum) == d.k->p.inode) + if (d.v->d_type != DT_SUBVOL && + le64_to_cpu(d.v->d_inum) == d.k->p.inode) return "dirent points to own directory"; return NULL; @@ -113,7 +115,7 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, bch_scnmemcpy(out, d.v->d_name, bch2_dirent_name_bytes(d)); pr_buf(out, " -> %llu type %s", d.v->d_inum, - d.v->d_type < DT_MAX + d.v->d_type < BCH_DT_MAX ? 
bch2_d_types[d.v->d_type] : "(bad d_type)"); } @@ -149,8 +151,8 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, return dirent; } -int bch2_dirent_create(struct btree_trans *trans, - u64 dir_inum, const struct bch_hash_info *hash_info, +int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, + const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, int flags) { @@ -163,7 +165,7 @@ int bch2_dirent_create(struct btree_trans *trans, return ret; ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, - dir_inum, &dirent->k_i, flags); + dir, &dirent->k_i, flags); *dir_offset = dirent->k.p.offset; return ret; @@ -176,22 +178,86 @@ static void dirent_copy_target(struct bkey_i_dirent *dst, dst->v.d_type = src.v->d_type; } +int __bch2_dirent_read_target(struct btree_trans *trans, + struct bkey_s_c_dirent d, + u32 *subvol, u32 *snapshot, u64 *inum, + bool is_fsck) +{ + int ret = 0; + + *subvol = 0; + *snapshot = d.k->p.snapshot; + + if (likely(d.v->d_type != DT_SUBVOL)) { + *inum = le64_to_cpu(d.v->d_inum); + } else { + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_subvolume s; + int ret; + + *subvol = le64_to_cpu(d.v->d_inum); + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, + POS(0, *subvol), + BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_subvolume) { + ret = -ENOENT; + goto err; + } + + s = bkey_s_c_to_subvolume(k); + *snapshot = le32_to_cpu(s.v->snapshot); + *inum = le64_to_cpu(s.v->inode); +err: + if (ret == -ENOENT && !is_fsck) + bch2_fs_inconsistent(trans->c, "pointer to missing subvolume %u", + *subvol); + + bch2_trans_iter_exit(trans, &iter); + } + + return ret; +} + +static int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, + struct bkey_s_c_dirent d, subvol_inum *target) +{ + u32 snapshot; + int ret = 0; + + ret = __bch2_dirent_read_target(trans, d, &target->subvol, &snapshot, + &target->inum, false); + if (!target->subvol) + target->subvol = dir.subvol; + + return ret; +} + int bch2_dirent_rename(struct btree_trans *trans, - u64 src_dir, struct bch_hash_info *src_hash, - u64 dst_dir, struct bch_hash_info *dst_hash, - const struct qstr *src_name, u64 *src_inum, u64 *src_offset, - const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset, - enum bch_rename_mode mode) + subvol_inum src_dir, struct bch_hash_info *src_hash, + subvol_inum dst_dir, struct bch_hash_info *dst_hash, + const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, + const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, + enum bch_rename_mode mode) { struct btree_iter src_iter = { NULL }; struct btree_iter dst_iter = { NULL }; struct bkey_s_c old_src, old_dst; struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; struct bpos dst_pos = - POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name)); + POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); int ret = 0; - *src_inum = *dst_inum = 0; + if (src_dir.subvol != dst_dir.subvol) + return -EXDEV; + + memset(src_inum, 0, sizeof(*src_inum)); + memset(dst_inum, 0, sizeof(*dst_inum)); /* * Lookup dst: @@ -214,8 +280,12 @@ int bch2_dirent_rename(struct btree_trans *trans, if (ret) goto out; - if (mode != BCH_RENAME) - *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); + if (mode != BCH_RENAME) { + ret = bch2_dirent_read_target(trans, dst_dir, + bkey_s_c_to_dirent(old_dst), dst_inum); + if (ret) + goto out; + } if 
(mode != BCH_RENAME_EXCHANGE) *src_offset = dst_iter.pos.offset; @@ -231,7 +301,10 @@ int bch2_dirent_rename(struct btree_trans *trans, if (ret) goto out; - *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); + ret = bch2_dirent_read_target(trans, src_dir, + bkey_s_c_to_dirent(old_src), src_inum); + if (ret) + goto out; /* Create new dst key: */ new_dst = dirent_create_key(trans, 0, dst_name, 0); @@ -310,63 +383,79 @@ out: return ret; } -int bch2_dirent_delete_at(struct btree_trans *trans, - const struct bch_hash_info *hash_info, - struct btree_iter *iter) -{ - return bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - hash_info, iter); -} - int __bch2_dirent_lookup_trans(struct btree_trans *trans, struct btree_iter *iter, - u64 dir_inum, + subvol_inum dir, const struct bch_hash_info *hash_info, - const struct qstr *name, unsigned flags) + const struct qstr *name, subvol_inum *inum, + unsigned flags) { - return bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, - hash_info, dir_inum, name, flags); + struct bkey_s_c k; + struct bkey_s_c_dirent d; + u32 snapshot; + int ret; + + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); + if (ret) + return ret; + + ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, + hash_info, dir, name, flags); + if (ret) + return ret; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) { + bch2_trans_iter_exit(trans, iter); + return ret; + } + + d = bkey_s_c_to_dirent(k); + + ret = bch2_dirent_read_target(trans, dir, d, inum); + if (ret) + bch2_trans_iter_exit(trans, iter); + + return ret; } -u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, +u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, const struct bch_hash_info *hash_info, - const struct qstr *name) + const struct qstr *name, subvol_inum *inum) { struct btree_trans trans; struct btree_iter iter; - struct bkey_s_c k; - u64 inum = 0; - int ret = 0; + int ret; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); - ret = __bch2_dirent_lookup_trans(&trans, &iter, dir_inum, - hash_info, name, 0); - if (ret) - goto out; + ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info, + name, inum, 0); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto out; - - inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); bch2_trans_iter_exit(&trans, &iter); -out: - BUG_ON(ret == -EINTR); + if (ret == -EINTR) + goto retry; bch2_trans_exit(&trans); - return inum; + return ret; } -int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) +int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) { struct btree_iter iter; struct bkey_s_c k; + u32 snapshot; int ret; + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); + if (ret) + return ret; + for_each_btree_key(trans, iter, BTREE_ID_dirents, - POS(dir_inum, 0), 0, k, ret) { - if (k.k->p.inode > dir_inum) + SPOS(dir.inum, 0, snapshot), 0, k, ret) { + if (k.k->p.inode > dir.inum) break; if (k.k->type == KEY_TYPE_dirent) { @@ -379,19 +468,26 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) return ret; } -int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) +int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_dirent dirent; + u32 snapshot; int ret; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); 
+ if (ret) + goto err; for_each_btree_key(&trans, iter, BTREE_ID_dirents, - POS(inum, ctx->pos), 0, k, ret) { - if (k.k->p.inode > inum) + SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) { + if (k.k->p.inode > inum.inum) break; if (k.k->type != KEY_TYPE_dirent) @@ -407,11 +503,14 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) if (!dir_emit(ctx, dirent.v->d_name, bch2_dirent_name_bytes(dirent), le64_to_cpu(dirent.v->d_inum), - dirent.v->d_type)) + vfs_d_type(dirent.v->d_type))) break; ctx->pos = dirent.k->p.offset + 1; } bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; ret = bch2_trans_exit(&trans) ?: ret; diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h index c14f6029..e7f65fbd 100644 --- a/libbcachefs/dirent.h +++ b/libbcachefs/dirent.h @@ -29,13 +29,17 @@ static inline unsigned dirent_val_u64s(unsigned len) sizeof(u64)); } -int bch2_dirent_create(struct btree_trans *, u64, +int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, int); -int bch2_dirent_delete_at(struct btree_trans *, - const struct bch_hash_info *, - struct btree_iter *); +int __bch2_dirent_read_target(struct btree_trans *, struct bkey_s_c_dirent, + u32 *, u32 *, u64 *, bool); + +static inline unsigned vfs_d_type(unsigned type) +{ + return type == DT_SUBVOL ? DT_DIR : type; +} enum bch_rename_mode { BCH_RENAME, @@ -44,19 +48,20 @@ enum bch_rename_mode { }; int bch2_dirent_rename(struct btree_trans *, - u64, struct bch_hash_info *, - u64, struct bch_hash_info *, - const struct qstr *, u64 *, u64 *, - const struct qstr *, u64 *, u64 *, + subvol_inum, struct bch_hash_info *, + subvol_inum, struct bch_hash_info *, + const struct qstr *, subvol_inum *, u64 *, + const struct qstr *, subvol_inum *, u64 *, enum bch_rename_mode); -int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, u64, - const struct bch_hash_info *, - const struct qstr *, unsigned); -u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, - const struct qstr *); +int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, + subvol_inum, const struct bch_hash_info *, + const struct qstr *, subvol_inum *, unsigned); +u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, + const struct bch_hash_info *, + const struct qstr *, subvol_inum *); -int bch2_empty_dir_trans(struct btree_trans *, u64); -int bch2_readdir(struct bch_fs *, u64, struct dir_context *); +int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); +int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); #endif /* _BCACHEFS_DIRENT_H */ diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index f66640c2..6c2eed77 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -612,38 +612,6 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k) return false; } -bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, - unsigned nr_replicas, bool compressed) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bpos end = pos; - struct bkey_s_c k; - bool ret = true; - int err; - - end.offset += size; - - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_extents, pos, - BTREE_ITER_SLOTS, k, err) { - if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) - break; - - if (nr_replicas > bch2_bkey_replicas(c, k) || - (!compressed && bch2_bkey_sectors_compressed(k))) { - ret = false; - break; - } - } - bch2_trans_iter_exit(&trans, &iter); - - 
bch2_trans_exit(&trans); - - return ret; -} - unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 43cef0a3..afd3067b 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -567,7 +567,6 @@ unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); bool bch2_bkey_is_incompressible(struct bkey_s_c); unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); -bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool); unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c index 6bc82559..3e8e3c5b 100644 --- a/libbcachefs/fs-common.c +++ b/libbcachefs/fs-common.c @@ -6,82 +6,186 @@ #include "dirent.h" #include "fs-common.h" #include "inode.h" +#include "subvolume.h" #include "xattr.h" #include -int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, +static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode) +{ + return S_ISDIR(inode->bi_mode) && !inode->bi_subvol; +} + +int bch2_create_trans(struct btree_trans *trans, + subvol_inum dir, struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *new_inode, const struct qstr *name, uid_t uid, gid_t gid, umode_t mode, dev_t rdev, struct posix_acl *default_acl, - struct posix_acl *acl) + struct posix_acl *acl, + subvol_inum snapshot_src, + unsigned flags) { struct bch_fs *c = trans->c; struct btree_iter dir_iter = { NULL }; struct btree_iter inode_iter = { NULL }; - struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); + subvol_inum new_inum = dir; u64 now = bch2_current_time(c); u64 cpu = raw_smp_processor_id(); - u64 dir_offset = 0; + u64 dir_target; + u32 snapshot; + unsigned dir_type = mode_to_type(mode); int ret; - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT); + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); if (ret) goto err; - bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); - - if (!name) - new_inode->bi_flags |= BCH_INODE_UNLINKED; - - ret = bch2_inode_create(trans, &inode_iter, new_inode, U32_MAX, cpu); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); if (ret) goto err; - if (default_acl) { - ret = bch2_set_acl_trans(trans, new_inode, &hash, - default_acl, ACL_TYPE_DEFAULT); + if (!(flags & BCH_CREATE_SNAPSHOT)) { + /* Normal create path - allocate a new inode: */ + bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); + + if (flags & BCH_CREATE_TMPFILE) + new_inode->bi_flags |= BCH_INODE_UNLINKED; + + ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); + if (ret) + goto err; + + snapshot_src = (subvol_inum) { 0 }; + } else { + /* + * Creating a snapshot - we're not allocating a new inode, but + * we do have to lookup the root inode of the subvolume we're + * snapshotting and update it (in the new snapshot): + */ + + if (!snapshot_src.inum) { + /* Inode wasn't specified, just snapshot: */ + struct btree_iter subvol_iter; + struct bkey_s_c k; + + bch2_trans_iter_init(trans, &subvol_iter, BTREE_ID_subvolumes, + POS(0, snapshot_src.subvol), 0); + k = bch2_btree_iter_peek_slot(&subvol_iter); + + ret = bkey_err(k); + if (!ret && k.k->type != KEY_TYPE_subvolume) { + bch_err(c, "subvolume %u not found", + snapshot_src.subvol); + ret = -ENOENT; + 
} + + if (!ret) + snapshot_src.inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode); + bch2_trans_iter_exit(trans, &subvol_iter); + + if (ret) + goto err; + } + + ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src, + BTREE_ITER_INTENT); + if (ret) + goto err; + + if (new_inode->bi_subvol != snapshot_src.subvol) { + /* Not a subvolume root: */ + ret = -EINVAL; + goto err; + } + + /* + * If we're not root, we have to own the subvolume being + * snapshotted: + */ + if (uid && new_inode->bi_uid != uid) { + ret = -EPERM; + goto err; + } + + flags |= BCH_CREATE_SUBVOL; + } + + new_inum.inum = new_inode->bi_inum; + dir_target = new_inode->bi_inum; + + if (flags & BCH_CREATE_SUBVOL) { + u32 new_subvol, dir_snapshot; + + ret = bch2_subvolume_create(trans, new_inode->bi_inum, + snapshot_src.subvol, + &new_subvol, &snapshot, + (flags & BCH_CREATE_SNAPSHOT_RO) != 0); + if (ret) + goto err; + + new_inode->bi_parent_subvol = dir.subvol; + new_inode->bi_subvol = new_subvol; + new_inum.subvol = new_subvol; + dir_target = new_subvol; + dir_type = DT_SUBVOL; + + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot); + if (ret) + goto err; + + bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot); + ret = bch2_btree_iter_traverse(&dir_iter); if (ret) goto err; } - if (acl) { - ret = bch2_set_acl_trans(trans, new_inode, &hash, - acl, ACL_TYPE_ACCESS); - if (ret) - goto err; + if (!(flags & BCH_CREATE_SNAPSHOT)) { + if (default_acl) { + ret = bch2_set_acl_trans(trans, new_inum, new_inode, + default_acl, ACL_TYPE_DEFAULT); + if (ret) + goto err; + } + + if (acl) { + ret = bch2_set_acl_trans(trans, new_inum, new_inode, + acl, ACL_TYPE_ACCESS); + if (ret) + goto err; + } } - if (name) { + if (!(flags & BCH_CREATE_TMPFILE)) { struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); - dir_u->bi_mtime = dir_u->bi_ctime = now; + u64 dir_offset; - if (S_ISDIR(new_inode->bi_mode)) + if (is_subdir_for_nlink(new_inode)) dir_u->bi_nlink++; + dir_u->bi_mtime = dir_u->bi_ctime = now; ret = bch2_inode_write(trans, &dir_iter, dir_u); if (ret) goto err; - ret = bch2_dirent_create(trans, dir_inum, &dir_hash, - mode_to_type(new_inode->bi_mode), - name, new_inode->bi_inum, + ret = bch2_dirent_create(trans, dir, &dir_hash, + dir_type, + name, + dir_target, &dir_offset, BCH_HASH_SET_MUST_CREATE); if (ret) goto err; + + if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { + new_inode->bi_dir = dir_u->bi_inum; + new_inode->bi_dir_offset = dir_offset; + } } - if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { - new_inode->bi_dir = dir_u->bi_inum; - new_inode->bi_dir_offset = dir_offset; - } - - /* XXX use bch2_btree_iter_set_snapshot() */ - inode_iter.snapshot = U32_MAX; - bch2_btree_iter_set_pos(&inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX)); + inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + bch2_btree_iter_set_snapshot(&inode_iter, snapshot); ret = bch2_btree_iter_traverse(&inode_iter) ?: bch2_inode_write(trans, &inode_iter, new_inode); @@ -91,9 +195,10 @@ err: return ret; } -int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, - u64 inum, struct bch_inode_unpacked *dir_u, - struct bch_inode_unpacked *inode_u, const struct qstr *name) +int bch2_link_trans(struct btree_trans *trans, + subvol_inum dir, struct bch_inode_unpacked *dir_u, + subvol_inum inum, struct bch_inode_unpacked *inode_u, + const struct qstr *name) { struct bch_fs *c = trans->c; struct btree_iter dir_iter = { NULL }; @@ -103,6 +208,9 @@ int bch2_link_trans(struct btree_trans *trans, 
u64 dir_inum, u64 dir_offset = 0; int ret; + if (dir.subvol != inum.subvol) + return -EXDEV; + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); if (ret) goto err; @@ -110,7 +218,7 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, inode_u->bi_ctime = now; bch2_inode_nlink_inc(inode_u); - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); if (ret) goto err; @@ -118,15 +226,15 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, dir_hash = bch2_hash_info_init(c, dir_u); - ret = bch2_dirent_create(trans, dir_inum, &dir_hash, + ret = bch2_dirent_create(trans, dir, &dir_hash, mode_to_type(inode_u->bi_mode), - name, inum, &dir_offset, + name, inum.inum, &dir_offset, BCH_HASH_SET_MUST_CREATE); if (ret) goto err; if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { - inode_u->bi_dir = dir_inum; + inode_u->bi_dir = dir.inum; inode_u->bi_dir_offset = dir_offset; } @@ -139,55 +247,83 @@ err: } int bch2_unlink_trans(struct btree_trans *trans, - u64 dir_inum, struct bch_inode_unpacked *dir_u, + subvol_inum dir, + struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *inode_u, - const struct qstr *name) + const struct qstr *name, + int deleting_snapshot) { struct bch_fs *c = trans->c; struct btree_iter dir_iter = { NULL }; struct btree_iter dirent_iter = { NULL }; struct btree_iter inode_iter = { NULL }; struct bch_hash_info dir_hash; - u64 inum, now = bch2_current_time(c); + subvol_inum inum; + u64 now = bch2_current_time(c); struct bkey_s_c k; int ret; - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); if (ret) goto err; dir_hash = bch2_hash_info_init(c, dir_u); - ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir_inum, &dir_hash, - name, BTREE_ITER_INTENT); + ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, + name, &inum, BTREE_ITER_INTENT); if (ret) goto err; - k = bch2_btree_iter_peek_slot(&dirent_iter); - ret = bkey_err(k); + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, + BTREE_ITER_INTENT); if (ret) goto err; - inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); - - ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); - if (ret) + if (deleting_snapshot == 1 && !inode_u->bi_subvol) { + ret = -ENOENT; goto err; + } - if (inode_u->bi_dir == k.k->p.inode && - inode_u->bi_dir_offset == k.k->p.offset) { + if (deleting_snapshot <= 0 && S_ISDIR(inode_u->bi_mode)) { + ret = bch2_empty_dir_trans(trans, inum); + if (ret) + goto err; + } + + if (inode_u->bi_subvol) { + ret = bch2_subvolume_delete(trans, inode_u->bi_subvol, + deleting_snapshot); + if (ret) + goto err; + + k = bch2_btree_iter_peek_slot(&dirent_iter); + ret = bkey_err(k); + if (ret) + goto err; + + /* + * If we're deleting a subvolume, we need to really delete the + * dirent, not just emit a whiteout in the current snapshot: + */ + bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot); + ret = bch2_btree_iter_traverse(&dirent_iter); + if (ret) + goto err; + } + + if (inode_u->bi_dir == dirent_iter.pos.inode && + inode_u->bi_dir_offset == dirent_iter.pos.offset) { inode_u->bi_dir = 0; inode_u->bi_dir_offset = 0; } dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; - dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); + dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); bch2_inode_nlink_dec(inode_u); - 
ret = (S_ISDIR(inode_u->bi_mode) - ? bch2_empty_dir_trans(trans, inum) - : 0) ?: - bch2_dirent_delete_at(trans, &dir_hash, &dirent_iter) ?: + ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash, &dirent_iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_inode_write(trans, &dir_iter, dir_u) ?: bch2_inode_write(trans, &inode_iter, inode_u); err: @@ -222,8 +358,8 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, } int bch2_rename_trans(struct btree_trans *trans, - u64 src_dir, struct bch_inode_unpacked *src_dir_u, - u64 dst_dir, struct bch_inode_unpacked *dst_dir_u, + subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u, + subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u, struct bch_inode_unpacked *src_inode_u, struct bch_inode_unpacked *dst_inode_u, const struct qstr *src_name, @@ -236,7 +372,8 @@ int bch2_rename_trans(struct btree_trans *trans, struct btree_iter src_inode_iter = { NULL }; struct btree_iter dst_inode_iter = { NULL }; struct bch_hash_info src_hash, dst_hash; - u64 src_inode, src_offset, dst_inode, dst_offset; + subvol_inum src_inum, dst_inum; + u64 src_offset, dst_offset; u64 now = bch2_current_time(c); int ret; @@ -247,7 +384,8 @@ int bch2_rename_trans(struct btree_trans *trans, src_hash = bch2_hash_info_init(c, src_dir_u); - if (dst_dir != src_dir) { + if (dst_dir.inum != src_dir.inum || + dst_dir.subvol != src_dir.subvol) { ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, BTREE_ITER_INTENT); if (ret) @@ -262,19 +400,19 @@ int bch2_rename_trans(struct btree_trans *trans, ret = bch2_dirent_rename(trans, src_dir, &src_hash, dst_dir, &dst_hash, - src_name, &src_inode, &src_offset, - dst_name, &dst_inode, &dst_offset, + src_name, &src_inum, &src_offset, + dst_name, &dst_inum, &dst_offset, mode); if (ret) goto err; - ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inode, + ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum, BTREE_ITER_INTENT); if (ret) goto err; - if (dst_inode) { - ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inode, + if (dst_inum.inum) { + ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum, BTREE_ITER_INTENT); if (ret) goto err; @@ -305,7 +443,7 @@ int bch2_rename_trans(struct btree_trans *trans, } if (S_ISDIR(dst_inode_u->bi_mode) && - bch2_empty_dir_trans(trans, dst_inode)) { + bch2_empty_dir_trans(trans, dst_inum)) { ret = -ENOTEMPTY; goto err; } @@ -324,12 +462,12 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } - if (S_ISDIR(src_inode_u->bi_mode)) { + if (is_subdir_for_nlink(src_inode_u)) { src_dir_u->bi_nlink--; dst_dir_u->bi_nlink++; } - if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) { + if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { dst_dir_u->bi_nlink--; src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; } @@ -340,22 +478,22 @@ int bch2_rename_trans(struct btree_trans *trans, src_dir_u->bi_mtime = now; src_dir_u->bi_ctime = now; - if (src_dir != dst_dir) { + if (src_dir.inum != dst_dir.inum) { dst_dir_u->bi_mtime = now; dst_dir_u->bi_ctime = now; } src_inode_u->bi_ctime = now; - if (dst_inode) + if (dst_inum.inum) dst_inode_u->bi_ctime = now; ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?: - (src_dir != dst_dir + (src_dir.inum != dst_dir.inum ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u) : 0 ) ?: bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?: - (dst_inode + (dst_inum.inum ? 
bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) : 0 ); err: diff --git a/libbcachefs/fs-common.h b/libbcachefs/fs-common.h index 2273b796..9bb0a967 100644 --- a/libbcachefs/fs-common.h +++ b/libbcachefs/fs-common.h @@ -4,27 +4,33 @@ struct posix_acl; -int bch2_create_trans(struct btree_trans *, u64, +#define BCH_CREATE_TMPFILE (1U << 0) +#define BCH_CREATE_SUBVOL (1U << 1) +#define BCH_CREATE_SNAPSHOT (1U << 2) +#define BCH_CREATE_SNAPSHOT_RO (1U << 3) + +int bch2_create_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, struct bch_inode_unpacked *, const struct qstr *, uid_t, gid_t, umode_t, dev_t, struct posix_acl *, - struct posix_acl *); + struct posix_acl *, + subvol_inum, unsigned); -int bch2_link_trans(struct btree_trans *, u64, - u64, struct bch_inode_unpacked *, - struct bch_inode_unpacked *, +int bch2_link_trans(struct btree_trans *, + subvol_inum, struct bch_inode_unpacked *, + subvol_inum, struct bch_inode_unpacked *, const struct qstr *); -int bch2_unlink_trans(struct btree_trans *, - u64, struct bch_inode_unpacked *, +int bch2_unlink_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, - const struct qstr *); + struct bch_inode_unpacked *, + const struct qstr *, int); int bch2_rename_trans(struct btree_trans *, - u64, struct bch_inode_unpacked *, - u64, struct bch_inode_unpacked *, + subvol_inum, struct bch_inode_unpacked *, + subvol_inum, struct bch_inode_unpacked *, struct bch_inode_unpacked *, struct bch_inode_unpacked *, const struct qstr *, diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 29210377..c07755c6 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -786,23 +786,35 @@ static void readpage_bio_extend(struct readpages_iter *iter, } } -static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, - struct bch_read_bio *rbio, u64 inum, +static void bchfs_read(struct btree_trans *trans, + struct bch_read_bio *rbio, + subvol_inum inum, struct readpages_iter *readpages_iter) { struct bch_fs *c = trans->c; + struct btree_iter iter; struct bkey_buf sk; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; + u32 snapshot; int ret = 0; rbio->c = c; rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; bch2_bkey_buf_init(&sk); retry: bch2_trans_begin(trans); + iter = (struct btree_iter) { NULL }; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), + BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS); while (1) { struct bkey_s_c k; unsigned bytes, sectors, offset_into_extent; @@ -817,15 +829,15 @@ retry: break; } - bch2_btree_iter_set_pos(iter, - POS(inum, rbio->bio.bi_iter.bi_sector)); + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, rbio->bio.bi_iter.bi_sector)); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) break; - offset_into_extent = iter->pos.offset - + offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; @@ -855,7 +867,7 @@ retry: if (bkey_extent_is_allocation(k.k)) bch2_add_page_sectors(&rbio->bio, k); - bch2_read_extent(trans, rbio, iter->pos, + bch2_read_extent(trans, rbio, iter.pos, data_btree, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) @@ -864,12 +876,14 @@ retry: swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); } +err: + bch2_trans_iter_exit(trans, &iter); if (ret == 
-EINTR) goto retry; if (ret) { - bch_err_inum_ratelimited(c, inum, + bch_err_inum_ratelimited(c, inum.inum, "read error %i from btree lookup", ret); rbio->bio.bi_status = BLK_STS_IOERR; bio_endio(&rbio->bio); @@ -884,7 +898,6 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct btree_trans trans; - struct btree_iter iter; struct page *page; struct readpages_iter readpages_iter; int ret; @@ -893,8 +906,6 @@ void bch2_readahead(struct readahead_control *ractl) BUG_ON(ret); bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, - BTREE_ITER_SLOTS); bch2_pagecache_add_get(&inode->ei_pagecache_lock); @@ -915,22 +926,20 @@ void bch2_readahead(struct readahead_control *ractl) rbio->bio.bi_end_io = bch2_readpages_end_io; BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); - bchfs_read(&trans, &iter, rbio, inode->v.i_ino, + bchfs_read(&trans, rbio, inode_inum(inode), &readpages_iter); } bch2_pagecache_add_put(&inode->ei_pagecache_lock); - bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); kfree(readpages_iter.pages); } static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, - u64 inum, struct page *page) + subvol_inum inum, struct page *page) { struct btree_trans trans; - struct btree_iter iter; bch2_page_state_create(page, __GFP_NOFAIL); @@ -940,12 +949,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, - BTREE_ITER_SLOTS); - - bchfs_read(&trans, &iter, rbio, inum, NULL); - - bch2_trans_iter_exit(&trans, &iter); + bchfs_read(&trans, rbio, inum, NULL); bch2_trans_exit(&trans); } @@ -959,7 +963,7 @@ int bch2_readpage(struct file *file, struct page *page) rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts); rbio->bio.bi_end_io = bch2_readpages_end_io; - __bchfs_readpage(c, rbio, inode->v.i_ino, page); + __bchfs_readpage(c, rbio, inode_inum(inode), page); return 0; } @@ -982,7 +986,7 @@ static int bch2_read_single_page(struct page *page, rbio->bio.bi_private = &done; rbio->bio.bi_end_io = bch2_read_single_page_end_io; - __bchfs_readpage(c, rbio, inode->v.i_ino, page); + __bchfs_readpage(c, rbio, inode_inum(inode), page); wait_for_completion(&done); ret = blk_status_to_errno(rbio->bio.bi_status); @@ -1126,6 +1130,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->nr_replicas = nr_replicas; op->res.nr_replicas = nr_replicas; op->write_point = writepoint_hashed(inode->ei_last_dirtied); + op->subvol = inode->ei_subvol; op->pos = POS(inode->v.i_ino, sector); op->wbio.bio.bi_iter.bi_sector = sector; op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); @@ -1758,7 +1763,7 @@ start: if (iter->count) closure_get(&dio->cl); - bch2_read(c, rbio_init(bio, opts), inode->v.i_ino); + bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); } iter->count += shorten; @@ -1813,6 +1818,50 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) /* O_DIRECT writes */ +static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, + u64 offset, u64 size, + unsigned nr_replicas, bool compressed) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u64 end = offset + size; + u32 snapshot; + bool ret = true; + int err; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + 
+ err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (err) + goto err; + + for_each_btree_key(&trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_SLOTS, k, err) { + if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0) + break; + + if (k.k->p.snapshot != snapshot || + nr_replicas > bch2_bkey_replicas(c, k) || + (!compressed && bch2_bkey_sectors_compressed(k))) { + ret = false; + break; + } + } + + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (err == -EINTR) + goto retry; + bch2_trans_exit(&trans); + + return err ? false : ret; +} + static void bch2_dio_write_loop_async(struct bch_write_op *); static long bch2_dio_write_loop(struct dio_write *dio) @@ -1891,6 +1940,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) op_journal_seq_set(&dio->op, &inode->ei_journal_seq); dio->op.write_point = writepoint_hashed((unsigned long) current); dio->op.nr_replicas = dio->op.opts.data_replicas; + dio->op.subvol = inode->ei_subvol; dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); if ((req->ki_flags & IOCB_DSYNC) && @@ -1901,8 +1951,8 @@ static long bch2_dio_write_loop(struct dio_write *dio) ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), dio->op.opts.data_replicas, 0); if (unlikely(ret) && - !bch2_check_range_allocated(c, dio->op.pos, - bio_sectors(bio), + !bch2_check_range_allocated(c, inode_inum(inode), + dio->op.pos.offset, bio_sectors(bio), dio->op.opts.data_replicas, dio->op.opts.compression != 0)) goto err; @@ -2146,9 +2196,9 @@ out: /* truncate: */ -static inline int range_has_data(struct bch_fs *c, - struct bpos start, - struct bpos end) +static inline int range_has_data(struct bch_fs *c, u32 subvol, + struct bpos start, + struct bpos end) { struct btree_trans trans; struct btree_iter iter; @@ -2156,6 +2206,12 @@ static inline int range_has_data(struct bch_fs *c, int ret = 0; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot); + if (ret) + goto err; for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) @@ -2166,7 +2222,11 @@ static inline int range_has_data(struct bch_fs *c, break; } } + start = iter.pos; bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; return bch2_trans_exit(&trans) ?: ret; } @@ -2198,7 +2258,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, * XXX: we're doing two index lookups when we end up reading the * page */ - ret = range_has_data(c, + ret = range_has_data(c, inode->ei_subvol, POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT), POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT)); if (ret <= 0) @@ -2332,7 +2392,7 @@ int bch2_truncate(struct user_namespace *mnt_userns, inode_dio_wait(&inode->v); bch2_pagecache_block_get(&inode->ei_pagecache_lock); - ret = bch2_inode_find_by_inum(c, inode->v.i_ino, &inode_u); + ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); if (ret) goto err; @@ -2390,7 +2450,7 @@ int bch2_truncate(struct user_namespace *mnt_userns, truncate_setsize(&inode->v, iattr->ia_size); - ret = bch2_fpunch(c, inode->v.i_ino, + ret = bch2_fpunch(c, inode_inum(inode), round_up(iattr->ia_size, block_bytes(c)) >> 9, U64_MAX, &inode->ei_journal_seq, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); @@ -2450,7 +2510,7 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len if (discard_start < 
discard_end) { s64 i_sectors_delta = 0; - ret = bch2_fpunch(c, inode->v.i_ino, + ret = bch2_fpunch(c, inode_inum(inode), discard_start, discard_end, &inode->ei_journal_seq, &i_sectors_delta); @@ -2529,7 +2589,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, } else { s64 i_sectors_delta = 0; - ret = bch2_fpunch(c, inode->v.i_ino, + ret = bch2_fpunch(c, inode_inum(inode), offset >> 9, (offset + len) >> 9, &inode->ei_journal_seq, &i_sectors_delta); @@ -2556,6 +2616,18 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); struct bpos atomic_end; unsigned trigger_flags = 0; + u32 snapshot; + + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, + inode->ei_subvol, &snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(&src, snapshot); + bch2_btree_iter_set_snapshot(&dst, snapshot); + bch2_btree_iter_set_snapshot(&del, snapshot); bch2_trans_begin(&trans); @@ -2676,9 +2748,17 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, struct bkey_i_reservation reservation; struct bkey_s_c k; unsigned sectors; + u32 snapshot; bch2_trans_begin(&trans); + ret = bch2_subvolume_get_snapshot(&trans, + inode->ei_subvol, &snapshot); + if (ret) + goto bkey_err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); + k = bch2_btree_iter_peek_slot(&iter); if ((ret = bkey_err(k))) goto bkey_err; @@ -2725,7 +2805,8 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, reservation.v.nr_replicas = disk_res.nr_replicas; } - ret = bch2_extent_update(&trans, &iter, &reservation.k_i, + ret = bch2_extent_update(&trans, inode_inum(inode), &iter, + &reservation.k_i, &disk_res, &inode->ei_journal_seq, 0, &i_sectors_delta, true); i_sectors_acct(c, inode, "a_res, i_sectors_delta); @@ -2927,8 +3008,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, mark_range_unallocated(src, pos_src, pos_src + aligned_len); ret = bch2_remap_range(c, - POS(dst->v.i_ino, pos_dst >> 9), - POS(src->v.i_ino, pos_src >> 9), + inode_inum(dst), pos_dst >> 9, + inode_inum(src), pos_src >> 9, aligned_len >> 9, &dst->ei_journal_seq, pos_dst + len, &i_sectors_delta); @@ -3019,7 +3100,9 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; + subvol_inum inum = inode_inum(inode); u64 isize, next_data = MAX_LFS_FILESIZE; + u32 snapshot; int ret; isize = i_size_read(&inode->v); @@ -3027,9 +3110,15 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) return -ENXIO; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; for_each_btree_key(&trans, iter, BTREE_ID_extents, - POS(inode->v.i_ino, offset >> 9), 0, k, ret) { + SPOS(inode->v.i_ino, offset >> 9, snapshot), 0, k, ret) { if (k.k->p.inode != inode->v.i_ino) { break; } else if (bkey_extent_is_data(k.k)) { @@ -3039,6 +3128,9 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) break; } bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; ret = bch2_trans_exit(&trans) ?: ret; if (ret) @@ -3115,7 +3207,9 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; + subvol_inum inum = inode_inum(inode); u64 isize, next_hole = MAX_LFS_FILESIZE; + u32 snapshot; int ret; isize = i_size_read(&inode->v); @@ -3123,9 +3217,15 @@ static 
loff_t bch2_seek_hole(struct file *file, u64 offset) return -ENXIO; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; for_each_btree_key(&trans, iter, BTREE_ID_extents, - POS(inode->v.i_ino, offset >> 9), + SPOS(inode->v.i_ino, offset >> 9, snapshot), BTREE_ITER_SLOTS, k, ret) { if (k.k->p.inode != inode->v.i_ino) { next_hole = bch2_seek_pagecache_hole(&inode->v, @@ -3143,6 +3243,9 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) } } bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; ret = bch2_trans_exit(&trans) ?: ret; if (ret) diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 91a0e761..3ed53f42 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -10,7 +10,11 @@ #include "quota.h" #include +#include #include +#include +#include +#include #define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) #define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ @@ -192,7 +196,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, char *kname = NULL; struct qstr qstr; int ret = 0; - u64 inum; + subvol_inum inum; kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); if (!kname) @@ -205,10 +209,8 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, qstr.len = ret; qstr.name = kname; - ret = -ENOENT; - inum = bch2_dirent_lookup(c, src->v.i_ino, &hash, - &qstr); - if (!inum) + ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum); + if (ret) goto err1; vinode = bch2_vfs_inode_get(c, inum); @@ -294,6 +296,154 @@ err: return ret; } +static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) +{ + struct inode *dir; + struct bch_inode_info *inode; + struct user_namespace *s_user_ns; + struct dentry *dst_dentry; + struct path src_path, dst_path; + int how = LOOKUP_FOLLOW; + int error; + subvol_inum snapshot_src = { 0 }; + unsigned lookup_flags = 0; + unsigned create_flags = BCH_CREATE_SUBVOL; + + if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE| + BCH_SUBVOL_SNAPSHOT_RO)) + return -EINVAL; + + if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && + (arg.src_ptr || + (arg.flags & BCH_SUBVOL_SNAPSHOT_RO))) + return -EINVAL; + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) + create_flags |= BCH_CREATE_SNAPSHOT; + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO) + create_flags |= BCH_CREATE_SNAPSHOT_RO; + + /* why do we need this lock? 
*/ + down_read(&c->vfs_sb->s_umount); + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) + sync_inodes_sb(c->vfs_sb); +retry: + if (arg.src_ptr) { + error = user_path_at(arg.dirfd, + (const char __user *)(unsigned long)arg.src_ptr, + how, &src_path); + if (error) + goto err1; + + if (src_path.dentry->d_sb->s_fs_info != c) { + path_put(&src_path); + error = -EXDEV; + goto err1; + } + + snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode)); + } + + dst_dentry = user_path_create(arg.dirfd, + (const char __user *)(unsigned long)arg.dst_ptr, + &dst_path, lookup_flags); + error = PTR_ERR_OR_ZERO(dst_dentry); + if (error) + goto err2; + + if (dst_dentry->d_sb->s_fs_info != c) { + error = -EXDEV; + goto err3; + } + + if (dst_dentry->d_inode) { + error = -EEXIST; + goto err3; + } + + dir = dst_path.dentry->d_inode; + if (IS_DEADDIR(dir)) { + error = -ENOENT; + goto err3; + } + + s_user_ns = dir->i_sb->s_user_ns; + if (!kuid_has_mapping(s_user_ns, current_fsuid()) || + !kgid_has_mapping(s_user_ns, current_fsgid())) { + error = -EOVERFLOW; + goto err3; + } + + error = inode_permission(file_mnt_user_ns(filp), + dir, MAY_WRITE | MAY_EXEC); + if (error) + goto err3; + + if (!IS_POSIXACL(dir)) + arg.mode &= ~current_umask(); + + error = security_path_mkdir(&dst_path, dst_dentry, arg.mode); + if (error) + goto err3; + + if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && + !arg.src_ptr) + snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol; + + inode = __bch2_create(file_mnt_user_ns(filp), to_bch_ei(dir), + dst_dentry, arg.mode|S_IFDIR, + 0, snapshot_src, create_flags); + error = PTR_ERR_OR_ZERO(inode); + if (error) + goto err3; + + d_instantiate(dst_dentry, &inode->v); + fsnotify_mkdir(dir, dst_dentry); +err3: + done_path_create(&dst_path, dst_dentry); +err2: + if (arg.src_ptr) + path_put(&src_path); + + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } +err1: + up_read(&c->vfs_sb->s_umount); + + return error; +} + +static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) +{ + struct path path; + int ret = 0; + + if (arg.flags) + return -EINVAL; + + ret = user_path_at(arg.dirfd, + (const char __user *)(unsigned long)arg.dst_ptr, + LOOKUP_FOLLOW, &path); + if (ret) + return ret; + + if (path.dentry->d_sb->s_fs_info != c) { + path_put(&path); + return -EXDEV; + } + + ret = __bch2_unlink(path.dentry->d_parent->d_inode, path.dentry, 1); + path_put(&path); + + return ret; +} + long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct bch_inode_info *inode = file_bch_inode(file); @@ -324,6 +474,22 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) case FS_IOC_GOINGDOWN: return bch2_ioc_goingdown(c, (u32 __user *) arg); + case BCH_IOCTL_SUBVOLUME_CREATE: { + struct bch_ioctl_subvolume i; + + if (copy_from_user(&i, (void __user *) arg, sizeof(i))) + return -EFAULT; + return bch2_ioctl_subvolume_create(c, file, i); + } + + case BCH_IOCTL_SUBVOLUME_DESTROY: { + struct bch_ioctl_subvolume i; + + if (copy_from_user(&i, (void __user *) arg, sizeof(i))) + return -EFAULT; + return bch2_ioctl_subvolume_destroy(c, file, i); + } + default: return bch2_fs_ioctl(c, cmd, (void __user *) arg); } diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 6cc56871..2094c18c 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -36,7 +36,7 @@ static struct kmem_cache *bch2_inode_cache; -static void bch2_vfs_inode_init(struct bch_fs *, +static void bch2_vfs_inode_init(struct 
bch_fs *, subvol_inum, struct bch_inode_info *, struct bch_inode_unpacked *); @@ -149,7 +149,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, retry: bch2_trans_begin(&trans); - ret = bch2_inode_peek(&trans, &iter, &inode_u, inode->v.i_ino, + ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_INTENT) ?: (set ? set(inode, &inode_u, p) : 0) ?: bch2_inode_write(&trans, &iter, &inode_u) ?: @@ -208,13 +208,42 @@ int bch2_fs_quota_transfer(struct bch_fs *c, return ret; } -struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) +static int bch2_iget5_test(struct inode *vinode, void *p) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + subvol_inum *inum = p; + + return inode->ei_subvol == inum->subvol && + inode->ei_inode.bi_inum == inum->inum; +} + +static int bch2_iget5_set(struct inode *vinode, void *p) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + subvol_inum *inum = p; + + inode->v.i_ino = inum->inum; + inode->ei_subvol = inum->subvol; + inode->ei_inode.bi_inum = inum->inum; + return 0; +} + +static unsigned bch2_inode_hash(subvol_inum inum) +{ + return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); +} + +struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) { struct bch_inode_unpacked inode_u; struct bch_inode_info *inode; int ret; - inode = to_bch_ei(iget_locked(c->vfs_sb, inum)); + inode = to_bch_ei(iget5_locked(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + bch2_iget5_set, + &inum)); if (unlikely(!inode)) return ERR_PTR(-ENOMEM); if (!(inode->v.i_state & I_NEW)) @@ -226,26 +255,20 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) return ERR_PTR(ret); } - bch2_vfs_inode_init(c, inode, &inode_u); + bch2_vfs_inode_init(c, inum, inode, &inode_u); - inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum); + inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum.inum); unlock_new_inode(&inode->v); return &inode->v; } -static int inum_test(struct inode *inode, void *p) -{ - unsigned long *ino = p; - - return *ino == inode->i_ino; -} - -static struct bch_inode_info * +struct bch_inode_info * __bch2_create(struct user_namespace *mnt_userns, struct bch_inode_info *dir, struct dentry *dentry, - umode_t mode, dev_t rdev, bool tmpfile) + umode_t mode, dev_t rdev, subvol_inum snapshot_src, + unsigned flags) { struct bch_fs *c = dir->v.i_sb->s_fs_info; struct btree_trans trans; @@ -253,6 +276,7 @@ __bch2_create(struct user_namespace *mnt_userns, struct bch_inode_info *inode, *old; struct bch_inode_unpacked inode_u; struct posix_acl *default_acl = NULL, *acl = NULL; + subvol_inum inum; u64 journal_seq = 0; int ret; @@ -273,20 +297,23 @@ __bch2_create(struct user_namespace *mnt_userns, bch2_inode_init_early(c, &inode_u); - if (!tmpfile) + if (!(flags & BCH_CREATE_TMPFILE)) mutex_lock(&dir->ei_update_lock); bch2_trans_init(&trans, c, 8, - 2048 + (!tmpfile ? dentry->d_name.len : 0)); + 2048 + (!(flags & BCH_CREATE_TMPFILE) + ? dentry->d_name.len : 0)); retry: bch2_trans_begin(&trans); - ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u, - !tmpfile ? &dentry->d_name : NULL, + ret = bch2_create_trans(&trans, + inode_inum(dir), &dir_u, &inode_u, + !(flags & BCH_CREATE_TMPFILE) + ? 
&dentry->d_name : NULL, from_kuid(mnt_userns, current_fsuid()), from_kgid(mnt_userns, current_fsgid()), mode, rdev, - default_acl, acl) ?: + default_acl, acl, snapshot_src, flags) ?: bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, KEY_TYPE_QUOTA_PREALLOC); if (unlikely(ret)) @@ -302,14 +329,17 @@ err_before_quota: goto err_trans; } - if (!tmpfile) { + if (!(flags & BCH_CREATE_TMPFILE)) { bch2_inode_update_after_write(c, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); journal_seq_copy(c, dir, journal_seq); mutex_unlock(&dir->ei_update_lock); } - bch2_vfs_inode_init(c, inode, &inode_u); + inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; + inum.inum = inode_u.bi_inum; + + bch2_vfs_inode_init(c, inum, inode, &inode_u); journal_seq_copy(c, inode, journal_seq); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); @@ -322,8 +352,12 @@ err_before_quota: */ inode->v.i_state |= I_CREATING; - old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino, - inum_test, NULL, &inode->v.i_ino)); + + old = to_bch_ei(inode_insert5(&inode->v, + bch2_inode_hash(inum), + bch2_iget5_test, + bch2_iget5_set, + &inum)); BUG_ON(!old); if (unlikely(old != inode)) { @@ -350,7 +384,7 @@ err: posix_acl_release(acl); return inode; err_trans: - if (!tmpfile) + if (!(flags & BCH_CREATE_TMPFILE)) mutex_unlock(&dir->ei_update_lock); bch2_trans_exit(&trans); @@ -369,12 +403,13 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); struct inode *vinode = NULL; - u64 inum; + subvol_inum inum = { .subvol = 1 }; + int ret; - inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash, - &dentry->d_name); + ret = bch2_dirent_lookup(c, inode_inum(dir), &hash, + &dentry->d_name, &inum); - if (inum) + if (!ret) vinode = bch2_vfs_inode_get(c, inum); return d_splice_alias(vinode, dentry); @@ -385,7 +420,8 @@ static int bch2_mknod(struct user_namespace *mnt_userns, umode_t mode, dev_t rdev) { struct bch_inode_info *inode = - __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev, false); + __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev, + (subvol_inum) { 0 }, 0); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -415,8 +451,8 @@ static int __bch2_link(struct bch_fs *c, ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, 0, bch2_link_trans(&trans, - dir->v.i_ino, - inode->v.i_ino, &dir_u, &inode_u, + inode_inum(dir), &dir_u, + inode_inum(inode), &inode_u, &dentry->d_name)); if (likely(!ret)) { @@ -452,7 +488,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, return 0; } -static int bch2_unlink(struct inode *vdir, struct dentry *dentry) +int __bch2_unlink(struct inode *vdir, struct dentry *dentry, + int deleting_snapshot) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); @@ -467,8 +504,9 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq, BTREE_INSERT_NOFAIL, bch2_unlink_trans(&trans, - dir->v.i_ino, &dir_u, - &inode_u, &dentry->d_name)); + inode_inum(dir), &dir_u, + &inode_u, &dentry->d_name, + deleting_snapshot)); if (likely(!ret)) { BUG_ON(inode_u.bi_inum != inode->v.i_ino); @@ -486,6 +524,11 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) return ret; } +static int bch2_unlink(struct inode *vdir, struct dentry *dentry) +{ + return __bch2_unlink(vdir, dentry, -1); +} + static int bch2_symlink(struct user_namespace *mnt_userns, struct inode *vdir, 
struct dentry *dentry, const char *symname) @@ -494,7 +537,8 @@ static int bch2_symlink(struct user_namespace *mnt_userns, struct bch_inode_info *dir = to_bch_ei(vdir), *inode; int ret; - inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, true); + inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, + (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (unlikely(IS_ERR(inode))) return PTR_ERR(inode); @@ -587,8 +631,8 @@ static int bch2_rename2(struct user_namespace *mnt_userns, ret = __bch2_trans_do(&trans, NULL, &journal_seq, 0, bch2_rename_trans(&trans, - src_dir->v.i_ino, &src_dir_u, - dst_dir->v.i_ino, &dst_dir_u, + inode_inum(src_dir), &src_dir_u, + inode_inum(dst_dir), &dst_dir_u, &src_inode_u, &dst_inode_u, &src_dentry->d_name, @@ -711,7 +755,7 @@ retry: kfree(acl); acl = NULL; - ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode->v.i_ino, + ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), BTREE_ITER_INTENT); if (ret) goto btree_err; @@ -719,7 +763,8 @@ retry: bch2_setattr_copy(mnt_userns, inode, &inode_u, attr); if (attr->ia_valid & ATTR_MODE) { - ret = bch2_acl_chmod(&trans, &inode_u, inode_u.bi_mode, &acl); + ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u, + inode_u.bi_mode, &acl); if (ret) goto btree_err; } @@ -810,7 +855,8 @@ static int bch2_tmpfile(struct user_namespace *mnt_userns, struct inode *vdir, struct dentry *dentry, umode_t mode) { struct bch_inode_info *inode = - __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0, true); + __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0, + (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -885,6 +931,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); unsigned offset_into_extent, sectors; bool have_extent = false; + u32 snapshot; int ret = 0; ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); @@ -894,15 +941,21 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (start + len < start) return -EINVAL; + start >>= 9; + bch2_bkey_buf_init(&cur); bch2_bkey_buf_init(&prev); bch2_trans_init(&trans, c, 0, 0); - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - POS(ei->v.i_ino, start >> 9), 0); retry: bch2_trans_begin(&trans); + ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(ei->v.i_ino, start, snapshot), 0); + while ((k = bch2_btree_iter_peek(&iter)).k && !(ret = bkey_err(k)) && bkey_cmp(iter.pos, end) < 0) { @@ -951,7 +1004,9 @@ retry: bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, iter.pos.offset + sectors)); } - + start = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: if (ret == -EINTR) goto retry; @@ -959,7 +1014,6 @@ retry: ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), FIEMAP_EXTENT_LAST); - bch2_trans_iter_exit(&trans, &iter); ret = bch2_trans_exit(&trans) ?: ret; bch2_bkey_buf_exit(&cur, c); bch2_bkey_buf_exit(&prev, c); @@ -996,7 +1050,7 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit_dots(file, ctx)) return 0; - return bch2_readdir(c, inode->v.i_ino, ctx); + return bch2_readdir(c, inode_inum(inode), ctx); } static const struct file_operations bch_file_operations = { @@ -1096,6 +1150,7 @@ static const struct address_space_operations bch_address_space_operations = { .error_remove_page = 
generic_error_remove_page, }; +#if 0 static struct inode *bch2_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { @@ -1129,14 +1184,15 @@ static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid, return generic_fh_to_parent(sb, fid, fh_len, fh_type, bch2_nfs_get_inode); } +#endif static const struct export_operations bch_export_ops = { - .fh_to_dentry = bch2_fh_to_dentry, - .fh_to_parent = bch2_fh_to_parent, + //.fh_to_dentry = bch2_fh_to_dentry, + //.fh_to_parent = bch2_fh_to_parent, //.get_parent = bch2_get_parent, }; -static void bch2_vfs_inode_init(struct bch_fs *c, +static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum, struct bch_inode_info *inode, struct bch_inode_unpacked *bi) { @@ -1152,6 +1208,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c, inode->ei_journal_seq = 0; inode->ei_quota_reserved = 0; inode->ei_qid = bch_qid(bi); + inode->ei_subvol = inum.subvol; inode->v.i_mapping->a_ops = &bch_address_space_operations; @@ -1249,7 +1306,7 @@ static void bch2_evict_inode(struct inode *vinode) KEY_TYPE_QUOTA_WARN); bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, KEY_TYPE_QUOTA_WARN); - bch2_inode_rm(c, inode->v.i_ino, true); + bch2_inode_rm(c, inode_inum(inode), true); } } @@ -1593,7 +1650,7 @@ got_sb: sb->s_flags |= SB_POSIXACL; #endif - vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO); + vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); if (IS_ERR(vinode)) { bch_err(c, "error mounting: error getting root inode %i", (int) PTR_ERR(vinode)); diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index 36cc6ba2..48fc504e 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -45,10 +45,20 @@ struct bch_inode_info { struct mutex ei_quota_lock; struct bch_qid ei_qid; + u32 ei_subvol; + /* copy of inode in btree: */ struct bch_inode_unpacked ei_inode; }; +static inline subvol_inum inode_inum(struct bch_inode_info *inode) +{ + return (subvol_inum) { + .subvol = inode->ei_subvol, + .inum = inode->ei_inode.bi_inum, + }; +} + /* * Set if we've gotten a btree error for this inode, and thus the vfs inode and * btree inode may be inconsistent: @@ -135,6 +145,10 @@ struct bch_inode_unpacked; #ifndef NO_BCACHEFS_FS +struct bch_inode_info * +__bch2_create(struct user_namespace *, struct bch_inode_info *, + struct dentry *, umode_t, dev_t, subvol_inum, unsigned); + int bch2_fs_quota_transfer(struct bch_fs *, struct bch_inode_info *, struct bch_qid, @@ -154,7 +168,7 @@ static inline int bch2_set_projid(struct bch_fs *c, KEY_TYPE_QUOTA_PREALLOC); } -struct inode *bch2_vfs_inode_get(struct bch_fs *, u64); +struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); /* returns 0 if we want to do the update, or error is passed up */ typedef int (*inode_set_fn)(struct bch_inode_info *, @@ -170,6 +184,7 @@ int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, int bch2_setattr_nonsize(struct user_namespace *, struct bch_inode_info *, struct iattr *); +int __bch2_unlink(struct inode *, struct dentry *, int); void bch2_vfs_exit(void); int bch2_vfs_init(void); diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index eb979e79..16a1eae9 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -9,6 +9,7 @@ #include "fsck.h" #include "inode.h" #include "keylist.h" +#include "subvolume.h" #include "super.h" #include "xattr.h" @@ -17,7 +18,8 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } -static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) +static s64 bch2_count_inode_sectors(struct btree_trans 
*trans, u64 inum, + u32 snapshot) { struct btree_iter iter; struct bkey_s_c k; @@ -25,7 +27,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) int ret; for_each_btree_key(trans, iter, BTREE_ID_extents, - POS(inum, 0), 0, k, ret) { + SPOS(inum, 0, snapshot), 0, k, ret) { if (k.k->p.inode != inum) break; @@ -38,6 +40,100 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) return ret ?: sectors; } +static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, + u32 snapshot) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + u64 subdirs = 0; + int ret; + + for_each_btree_key(trans, iter, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), 0, k, ret) { + if (k.k->p.inode != inum) + break; + + if (k.k->type != KEY_TYPE_dirent) + continue; + + d = bkey_s_c_to_dirent(k); + if (d.v->d_type == DT_DIR) + subdirs++; + } + + bch2_trans_iter_exit(trans, &iter); + + return ret ?: subdirs; +} + +static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, + u32 *subvol) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, + POS(0, snapshot), 0); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_snapshot) { + bch_err(trans->c, "snapshot %u not found", snapshot); + ret = -ENOENT; + goto err; + } + + *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; + +} + +static int snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, + u32 *subvol) +{ + return lockrestart_do(trans, __snapshot_lookup_subvol(trans, snapshot, subvol)); +} + +static int __subvol_lookup(struct btree_trans *trans, u32 subvol, + u32 *snapshot, u64 *inum) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, + POS(0, subvol), 0); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_subvolume) { + bch_err(trans->c, "subvolume %u not found", subvol); + ret = -ENOENT; + goto err; + } + + *snapshot = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); + *inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; + +} + +static int subvol_lookup(struct btree_trans *trans, u32 subvol, + u32 *snapshot, u64 *inum) +{ + return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum)); +} + static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, struct bch_inode_unpacked *inode, u32 *snapshot) @@ -47,14 +143,13 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, - POS(0, inode_nr), 0); + SPOS(0, inode_nr, *snapshot), 0); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; - if (snapshot) - *snapshot = iter.pos.snapshot; + *snapshot = iter.pos.snapshot; ret = k.k->type == KEY_TYPE_inode ? 
bch2_inode_unpack(bkey_s_c_to_inode(k), inode) : -ENOENT; @@ -70,6 +165,36 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr, return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot)); } +static int __lookup_dirent(struct btree_trans *trans, + struct bch_hash_info hash_info, + subvol_inum dir, struct qstr *name, + u64 *target, unsigned *type) +{ + struct btree_iter iter; + struct bkey_s_c_dirent d; + int ret; + + ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc, + &hash_info, dir, name, 0); + if (ret) + return ret; + + d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); + *target = le64_to_cpu(d.v->d_inum); + *type = d.v->d_type; + bch2_trans_iter_exit(trans, &iter); + return 0; +} + +static int lookup_dirent(struct btree_trans *trans, + struct bch_hash_info hash_info, + subvol_inum dir, struct qstr *name, + u64 *target, unsigned *type) +{ + return lockrestart_do(trans, + __lookup_dirent(trans, hash_info, dir, name, target, type)); +} + static int __write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 snapshot) @@ -100,6 +225,71 @@ static int write_inode(struct btree_trans *trans, return ret; } +static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot) +{ + struct btree_iter iter = { NULL }; + struct bkey_i_inode_generation delete; + struct bch_inode_unpacked inode_u; + struct bkey_s_c k; + int ret; + + ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL) ?: + bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL) ?: + bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL); + if (ret) + goto err; +retry: + bch2_trans_begin(trans); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + SPOS(0, inum, snapshot), BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_inode) { + bch2_fs_inconsistent(trans->c, + "inode %llu:%u not found when deleting", + inum, snapshot); + ret = -EIO; + goto err; + } + + bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); + + /* Subvolume root? 
*/ + if (inode_u.bi_subvol) { + ret = bch2_subvolume_delete(trans, inode_u.bi_subvol, -1); + if (ret) + goto err; + } + + bkey_inode_generation_init(&delete.k_i); + delete.k.p = iter.pos; + delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); + + ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: + bch2_trans_iter_exit(trans, &iter); + if (ret == -EINTR) + goto retry; + + return ret; +} + static int __remove_dirent(struct btree_trans *trans, struct bpos pos) { struct bch_fs *c = trans->c; @@ -117,7 +307,7 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, &iter); + &dir_hash_info, &iter, 0); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -134,29 +324,49 @@ static int remove_dirent(struct btree_trans *trans, struct bpos pos) } /* Get lost+found, create if it doesn't exist: */ -static int lookup_lostfound(struct btree_trans *trans, +static int lookup_lostfound(struct btree_trans *trans, u32 subvol, struct bch_inode_unpacked *lostfound) { struct bch_fs *c = trans->c; struct bch_inode_unpacked root; struct bch_hash_info root_hash_info; struct qstr lostfound_str = QSTR("lost+found"); - u64 inum; + subvol_inum root_inum = { .subvol = subvol }; + u64 inum = 0; + unsigned d_type = 0; u32 snapshot; int ret; - ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root, &snapshot); - if (ret && ret != -ENOENT) + ret = subvol_lookup(trans, subvol, &snapshot, &root_inum.inum); + if (ret) return ret; + ret = lookup_inode(trans, root_inum.inum, &root, &snapshot); + if (ret) { + bch_err(c, "error fetching subvol root: %i", ret); + return ret; + } + root_hash_info = bch2_hash_info_init(c, &root); - inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, - &lostfound_str); - if (!inum) { + + ret = lookup_dirent(trans, root_hash_info, root_inum, + &lostfound_str, &inum, &d_type); + if (ret == -ENOENT) { bch_notice(c, "creating lost+found"); goto create_lostfound; } + if (ret) { + bch_err(c, "error looking up lost+found: %i", ret); + return ret; + } + + if (d_type != DT_DIR) { + bch_err(c, "error looking up lost+found: not a directory"); + return ret; + + } + ret = lookup_inode(trans, inum, lostfound, &snapshot); if (ret && ret != -ENOENT) { /* @@ -174,11 +384,10 @@ create_lostfound: ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - bch2_create_trans(trans, - BCACHEFS_ROOT_INO, &root, - lostfound, - &lostfound_str, - 0, 0, S_IFDIR|0700, 0, NULL, NULL)); + bch2_create_trans(trans, root_inum, &root, + lostfound, &lostfound_str, + 0, 0, S_IFDIR|0700, 0, NULL, NULL, + (subvol_inum) { }, 0)); if (ret) bch_err(c, "error creating lost+found: %i", ret); } @@ -187,16 +396,22 @@ create_lostfound: } static int reattach_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode) + struct bch_inode_unpacked *inode, + u32 inode_snapshot) { struct bch_hash_info dir_hash; struct bch_inode_unpacked lostfound; char name_buf[20]; struct qstr name; u64 dir_offset = 0; + u32 subvol; int ret; - ret = lookup_lostfound(trans, &lostfound); + ret = snapshot_lookup_subvol(trans, inode_snapshot, &subvol); + if (ret) + return ret; + + ret = lookup_lostfound(trans, subvol, &lostfound); if (ret) return ret; @@ -214,10 +429,15 @@ static int reattach_inode(struct btree_trans *trans, name = (struct qstr) QSTR(name_buf); ret = 
__bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, - bch2_dirent_create(trans, lostfound.bi_inum, &dir_hash, - mode_to_type(inode->bi_mode), - &name, inode->bi_inum, &dir_offset, - BCH_HASH_SET_MUST_CREATE)); + bch2_dirent_create(trans, + (subvol_inum) { + .subvol = subvol, + .inum = lostfound.bi_inum, + }, + &dir_hash, + mode_to_type(inode->bi_mode), + &name, inode->bi_inum, &dir_offset, + BCH_HASH_SET_MUST_CREATE)); if (ret) { bch_err(trans->c, "error %i reattaching inode %llu", ret, inode->bi_inum); @@ -227,7 +447,7 @@ static int reattach_inode(struct btree_trans *trans, inode->bi_dir = lostfound.bi_inum; inode->bi_dir_offset = dir_offset; - return write_inode(trans, inode, U32_MAX); + return write_inode(trans, inode, inode_snapshot); } static int remove_backpointer(struct btree_trans *trans, @@ -254,45 +474,254 @@ out: return ret; } +static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos) +{ + pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; + + if (bkey_cmp(s->pos, pos)) + s->nr = 0; + s->pos = pos; + + /* Might get called multiple times due to lock restarts */ + if (s->nr && s->d[s->nr - 1] == pos.snapshot) + return 0; + + return snapshots_seen_add(c, s, pos.snapshot); +} + +/** + * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor, + * and @ancestor hasn't been overwritten in @seen + * + * That is, returns whether key in @ancestor snapshot is visible in @id snapshot + */ +static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, + u32 id, u32 ancestor) +{ + ssize_t i; + + BUG_ON(id > ancestor); + + id = snapshot_t(c, id)->equiv; + ancestor = snapshot_t(c, ancestor)->equiv; + + /* @ancestor should be the snapshot most recently added to @seen */ + BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor); + BUG_ON(seen->pos.snapshot != ancestor); + + if (id == ancestor) + return true; + + if (!bch2_snapshot_is_ancestor(c, id, ancestor)) + return false; + + for (i = seen->nr - 2; + i >= 0 && seen->d[i] >= id; + --i) + if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) && + bch2_snapshot_is_ancestor(c, seen->d[i], ancestor)) + return false; + + return true; +} + +/** + * ref_visible - given a key with snapshot id @src that points to a key with + * snapshot id @dst, test whether there is some snapshot in which @dst is + * visible. + * + * This assumes we're visiting @src keys in natural key order. + * + * @s - list of snapshot IDs already seen at @src + * @src - snapshot ID of src key + * @dst - snapshot ID of dst key + */ +static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, + u32 src, u32 dst) +{ + return dst <= src + ? 
key_visible_in_snapshot(c, s, dst, src) + : bch2_snapshot_is_ancestor(c, src, dst); +} + +#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ + for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\ + if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) + struct inode_walker { - bool first_this_inode; - bool have_inode; - u64 cur_inum; - u32 snapshot; - struct bch_inode_unpacked inode; + bool first_this_inode; + u64 cur_inum; + + size_t nr; + size_t size; + struct inode_walker_entry { + struct bch_inode_unpacked inode; + u32 snapshot; + u64 count; + } *d; }; +static void inode_walker_exit(struct inode_walker *w) +{ + kfree(w->d); + w->d = NULL; +} + static struct inode_walker inode_walker_init(void) { - return (struct inode_walker) { - .cur_inum = -1, - .have_inode = false, - }; + return (struct inode_walker) { 0, }; } -static int __walk_inode(struct btree_trans *trans, - struct inode_walker *w, u64 inum) +static int inode_walker_realloc(struct inode_walker *w) { - if (inum != w->cur_inum) { - int ret = __lookup_inode(trans, inum, &w->inode, &w->snapshot); + if (w->nr == w->size) { + size_t new_size = max_t(size_t, 8UL, w->size * 2); + void *d = krealloc(w->d, new_size * sizeof(w->d[0]), + GFP_KERNEL); + if (!d) + return -ENOMEM; - if (ret && ret != -ENOENT) - return ret; - - w->have_inode = !ret; - w->cur_inum = inum; - w->first_this_inode = true; - } else { - w->first_this_inode = false; + w->d = d; + w->size = new_size; } return 0; } -static int walk_inode(struct btree_trans *trans, - struct inode_walker *w, u64 inum) +static int add_inode(struct bch_fs *c, struct inode_walker *w, + struct bkey_s_c_inode inode) { - return lockrestart_do(trans, __walk_inode(trans, w, inum)); + struct bch_inode_unpacked u; + int ret; + + ret = inode_walker_realloc(w); + if (ret) + return ret; + + BUG_ON(bch2_inode_unpack(inode, &u)); + + w->d[w->nr++] = (struct inode_walker_entry) { + .inode = u, + .snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv, + }; + + return 0; +} + +static int __walk_inode(struct btree_trans *trans, + struct inode_walker *w, struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + unsigned i, ancestor_pos; + int ret; + + pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; + + if (pos.inode == w->cur_inum) { + w->first_this_inode = false; + goto lookup_snapshot; + } + + w->nr = 0; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (k.k->p.offset != pos.inode) + break; + + if (k.k->type == KEY_TYPE_inode) + add_inode(c, w, bkey_s_c_to_inode(k)); + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + return ret; + + w->cur_inum = pos.inode; + w->first_this_inode = true; +lookup_snapshot: + for (i = 0; i < w->nr; i++) + if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot)) + goto found; + return INT_MAX; +found: + BUG_ON(pos.snapshot > w->d[i].snapshot); + + if (pos.snapshot != w->d[i].snapshot) { + ancestor_pos = i; + + while (i && w->d[i - 1].snapshot > pos.snapshot) + --i; + + ret = inode_walker_realloc(w); + if (ret) + return ret; + + array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]); + w->d[i].snapshot = pos.snapshot; + w->d[i].count = 0; + } + + return i; +} + +static int walk_inode(struct btree_trans *trans, + struct inode_walker *w, struct bpos pos) +{ + return lockrestart_do(trans, __walk_inode(trans, w, pos)); +} + +static int __get_visible_inodes(struct btree_trans *trans, + struct inode_walker *w, + 
struct snapshots_seen *s, + u64 inum) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + w->nr = 0; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (k.k->p.offset != inum) + break; + + if (k.k->type != KEY_TYPE_inode) + continue; + + if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) { + add_inode(c, w, bkey_s_c_to_inode(k)); + if (k.k->p.snapshot >= s->pos.snapshot) + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static int check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + char buf[200]; + int ret = 0; + + if (fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c, + "key in missing snapshot: %s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { + ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, + bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + return ret ?: -EINTR; + } +fsck_err: + return ret; } static int hash_redo_key(struct btree_trans *trans, @@ -300,6 +729,9 @@ static int hash_redo_key(struct btree_trans *trans, struct bch_hash_info *hash_info, struct btree_iter *k_iter, struct bkey_s_c k) { + bch_err(trans->c, "hash_redo_key() not implemented yet"); + return -EINVAL; +#if 0 struct bkey_i *delete; struct bkey_i *tmp; @@ -318,6 +750,7 @@ static int hash_redo_key(struct btree_trans *trans, return bch2_btree_iter_traverse(k_iter) ?: bch2_trans_update(trans, k_iter, delete, 0) ?: bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0); +#endif } static int fsck_hash_delete_at(struct btree_trans *trans, @@ -327,7 +760,7 @@ static int fsck_hash_delete_at(struct btree_trans *trans, { int ret; retry: - ret = bch2_hash_delete_at(trans, desc, info, iter) ?: + ret = bch2_hash_delete_at(trans, desc, info, iter, 0) ?: bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW); @@ -409,30 +842,29 @@ fsck_err: static int check_inode(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c_inode inode) + struct bch_inode_unpacked *prev, + struct bch_inode_unpacked u) { struct bch_fs *c = trans->c; - struct bch_inode_unpacked u; bool do_update = false; int ret = 0; - ret = bch2_inode_unpack(inode, &u); - - if (bch2_fs_inconsistent_on(ret, c, - "error unpacking inode %llu in fsck", - inode.k->p.inode)) - return ret; + if (fsck_err_on(prev && + (prev->bi_hash_seed != u.bi_hash_seed || + mode_to_type(prev->bi_mode) != mode_to_type(u.bi_mode)), c, + "inodes in different snapshots don't match")) { + bch_err(c, "repair not implemented yet"); + return -EINVAL; + } if (u.bi_flags & BCH_INODE_UNLINKED && (!c->sb.clean || fsck_err(c, "filesystem marked clean, but inode %llu unlinked", u.bi_inum))) { - bch_verbose(c, "deleting inode %llu", u.bi_inum); - bch2_trans_unlock(trans); bch2_fs_lazy_rw(c); - ret = bch2_inode_rm(c, u.bi_inum, false); + ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot); if (ret) bch_err(c, "error in fsck: error %i while deleting inode", ret); return ret; @@ -452,9 +884,10 @@ static int check_inode(struct btree_trans *trans, * just switch units to bytes and that issue goes away */ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, - POS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9), + SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9, + iter->pos.snapshot), POS(u.bi_inum, U64_MAX), - NULL); + 0, NULL); if (ret) { bch_err(c, "error in fsck: error 
%i truncating inode", ret); return ret; @@ -479,7 +912,7 @@ static int check_inode(struct btree_trans *trans, bch_verbose(c, "recounting sectors for inode %llu", u.bi_inum); - sectors = bch2_count_inode_sectors(trans, u.bi_inum); + sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); if (sectors < 0) { bch_err(c, "error in fsck: error %i recounting inode sectors", (int) sectors); @@ -499,11 +932,7 @@ static int check_inode(struct btree_trans *trans, } if (do_update) { - ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_btree_iter_traverse(iter) ?: - bch2_inode_write(trans, iter, &u)); + ret = write_inode(trans, &u, iter->pos.snapshot); if (ret) bch_err(c, "error in fsck: error %i " "updating inode", ret); @@ -519,26 +948,49 @@ static int check_inodes(struct bch_fs *c, bool full) struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_inode inode; + struct bch_inode_unpacked prev, u; int ret; + memset(&prev, 0, sizeof(prev)); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH, k, ret) { + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + ret = check_key_has_snapshot(&trans, &iter, k); + if (ret) + break; + + /* + * if snapshot id isn't a leaf node, skip it - deletion in + * particular is not atomic, so on the internal snapshot nodes + * we can see inodes marked for deletion after a clean shutdown + */ + if (bch2_snapshot_internal_node(c, k.k->p.snapshot)) + continue; + if (k.k->type != KEY_TYPE_inode) continue; inode = bkey_s_c_to_inode(k); - if (full || - (inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY| - BCH_INODE_I_SECTORS_DIRTY| - BCH_INODE_UNLINKED))) { - ret = check_inode(&trans, &iter, inode); - if (ret) - break; - } + if (!full && + !(inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY| + BCH_INODE_I_SECTORS_DIRTY| + BCH_INODE_UNLINKED))) + continue; + + BUG_ON(bch2_inode_unpack(inode, &u)); + + ret = check_inode(&trans, &iter, + full && prev.bi_inum == u.bi_inum + ? 
&prev : NULL, u); + if (ret) + break; + + prev = u; } bch2_trans_iter_exit(&trans, &iter); @@ -547,6 +999,29 @@ static int check_inodes(struct bch_fs *c, bool full) return bch2_trans_exit(&trans) ?: ret; } +noinline_for_stack +static int check_subvols(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, POS_MIN, + 0, k, ret) { + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret; +} + +/* + * Checking for overlapping extents needs to be reimplemented + */ +#if 0 static int fix_overlapping_extent(struct btree_trans *trans, struct bkey_s_c k, struct bpos cut_at) { @@ -582,16 +1057,18 @@ static int fix_overlapping_extent(struct btree_trans *trans, bch2_trans_iter_exit(trans, &iter); return ret; } +#endif static int inode_backpointer_exists(struct btree_trans *trans, - struct bch_inode_unpacked *inode) + struct bch_inode_unpacked *inode, + u32 snapshot) { struct btree_iter iter; struct bkey_s_c k; int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, - POS(inode->bi_dir, inode->bi_dir_offset), 0); + SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot), 0); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) @@ -612,6 +1089,144 @@ static bool inode_backpointer_matches(struct bkey_s_c_dirent d, d.k->p.offset == inode->bi_dir_offset; } +static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) +{ + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; + int ret = 0, ret2 = 0; + s64 count2; + + for (i = w->d; i < w->d + w->nr; i++) { + if (i->inode.bi_sectors == i->count) + continue; + + count2 = lockrestart_do(trans, + bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot)); + + if (i->count != count2) { + bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu", + i->count, count2); + i->count = count2; + if (i->inode.bi_sectors == i->count) + continue; + } + + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c, + "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", + w->cur_inum, i->snapshot, + i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE) + continue; + + i->inode.bi_sectors = i->count; + ret = write_inode(trans, &i->inode, i->snapshot); + if (ret) + break; + ret2 = -EINTR; + } +fsck_err: + return ret ?: ret2; +} + +static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + struct inode_walker *inode, + struct snapshots_seen *s) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct inode_walker_entry *i; + char buf[200]; + int ret = 0; + + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 0; + + ret = bkey_err(k); + if (ret) + return ret; + + ret = check_key_has_snapshot(trans, iter, k); + if (ret) + return ret; + + ret = snapshots_seen_update(c, s, k.k->p); + if (ret) + return ret; + + if (k.k->type == KEY_TYPE_whiteout) + return 0; + + if (inode->cur_inum != k.k->p.inode) { + ret = check_i_sectors(trans, inode); + if (ret) + return ret; + } +#if 0 + if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { + char buf1[200]; + char buf2[200]; + + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); + bch2_bkey_val_to_text(&PBUF(buf2), c, k); + + if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) + return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR; + } +#endif + ret = __walk_inode(trans, inode, k.k->p); + if (ret < 0) + return ret; + + if 
(fsck_err_on(ret == INT_MAX, c, + "extent in missing inode:\n %s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) + return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, + bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + + if (ret == INT_MAX) + return 0; + + i = inode->d + ret; + ret = 0; + + if (fsck_err_on(!S_ISREG(i->inode.bi_mode) && + !S_ISLNK(i->inode.bi_mode), c, + "extent in non regular inode mode %o:\n %s", + i->inode.bi_mode, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) + return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, + bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + + if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) { + for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) { + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + k.k->type != KEY_TYPE_reservation && + k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c, + "extent type %u offset %llu past end of inode %llu, i_size %llu", + k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) { + bch2_fs_lazy_rw(c); + return bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9, + k.k->p.snapshot), + POS(k.k->p.inode, U64_MAX), + 0, NULL) ?: -EINTR; + } + } + } + + if (bkey_extent_is_allocation(k.k)) + for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) + i->count += k.k->size; +#if 0 + bch2_bkey_buf_reassemble(&prev, c, k); +#endif + +fsck_err: + return ret; +} + /* * Walk extents: verify that extents have a corresponding S_ISREG inode, and * that i_size an i_sectors are consistent @@ -620,15 +1235,17 @@ noinline_for_stack static int check_extents(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); + struct snapshots_seen s; struct btree_trans trans; struct btree_iter iter; - struct bkey_s_c k; - struct bkey_buf prev; - u64 i_sectors = 0; int ret = 0; +#if 0 + struct bkey_buf prev; bch2_bkey_buf_init(&prev); prev.k->k = KEY(0, 0, 0); +#endif + snapshots_seen_init(&s); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); bch_verbose(c, "checking extents"); @@ -636,95 +1253,172 @@ static int check_extents(struct bch_fs *c) bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH); -retry: - while ((k = bch2_btree_iter_peek(&iter)).k && - !(ret = bkey_err(k))) { - if (w.have_inode && - w.cur_inum != k.k->p.inode && - !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && - fsck_err_on(w.inode.bi_sectors != i_sectors, c, - "inode %llu has incorrect i_sectors: got %llu, should be %llu", - w.inode.bi_inum, - w.inode.bi_sectors, i_sectors)) { - w.inode.bi_sectors = i_sectors; + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); - ret = write_inode(&trans, &w.inode, w.snapshot); - if (ret) - break; - } - - if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { - char buf1[200]; - char buf2[200]; - - bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); - bch2_bkey_val_to_text(&PBUF(buf2), c, k); - - if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) - return fix_overlapping_extent(&trans, k, prev.k->k.p) ?: -EINTR; - } - - ret = walk_inode(&trans, &w, k.k->p.inode); + do { + ret = lockrestart_do(&trans, + check_extent(&trans, &iter, &w, &s)); if (ret) break; + } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(&trans, &iter); +#if 0 + bch2_bkey_buf_exit(&prev, c); +#endif + inode_walker_exit(&w); + bch2_trans_exit(&trans); + 
snapshots_seen_exit(&s); - if (w.first_this_inode) - i_sectors = 0; + return ret; +} - if (fsck_err_on(!w.have_inode, c, - "extent type %u for missing inode %llu", - k.k->type, k.k->p.inode) || - fsck_err_on(w.have_inode && - !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, - "extent type %u for non regular file, inode %llu mode %o", - k.k->type, k.k->p.inode, w.inode.bi_mode)) { - bch2_fs_lazy_rw(c); - return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, - POS(k.k->p.inode, 0), - POS(k.k->p.inode, U64_MAX), - NULL) ?: -EINTR; +static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) +{ + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; + int ret = 0, ret2 = 0; + s64 count2; + + for (i = w->d; i < w->d + w->nr; i++) { + if (i->inode.bi_nlink == i->count) + continue; + + count2 = lockrestart_do(trans, + bch2_count_subdirs(trans, w->cur_inum, i->snapshot)); + + if (i->count != count2) { + bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", + i->count, count2); + i->count = count2; + if (i->inode.bi_nlink == i->count) + continue; } - if (fsck_err_on(w.have_inode && - !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - k.k->type != KEY_TYPE_reservation && - k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c, - "extent type %u offset %llu past end of inode %llu, i_size %llu", - k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { - bch2_fs_lazy_rw(c); - return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, - POS(k.k->p.inode, round_up(w.inode.bi_size, block_bytes(c)) >> 9), - POS(k.k->p.inode, U64_MAX), - NULL) ?: -EINTR; + if (fsck_err_on(i->inode.bi_nlink != i->count, c, + "directory %llu:%u with wrong i_nlink: got %u, should be %llu", + w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) { + i->inode.bi_nlink = i->count; + ret = write_inode(trans, &i->inode, i->snapshot); + if (ret) + break; + ret2 = -EINTR; } - - if (bkey_extent_is_allocation(k.k)) - i_sectors += k.k->size; - bch2_bkey_buf_reassemble(&prev, c, k); - - bch2_btree_iter_advance(&iter); } fsck_err: - if (ret == -EINTR) - goto retry; - bch2_trans_iter_exit(&trans, &iter); - bch2_bkey_buf_exit(&prev, c); - return bch2_trans_exit(&trans) ?: ret; + return ret ?: ret2; +} + +static int check_dirent_target(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + u32 target_snapshot) +{ + struct bch_fs *c = trans->c; + bool backpointer_exists = true; + char buf[200]; + int ret = 0; + + if (!target->bi_dir && + !target->bi_dir_offset) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + + ret = write_inode(trans, target, target_snapshot); + if (ret) + goto err; + } + + if (!inode_backpointer_matches(d, target)) { + ret = inode_backpointer_exists(trans, target, d.k->p.snapshot); + if (ret < 0) + goto err; + + backpointer_exists = ret; + ret = 0; + + if (fsck_err_on(S_ISDIR(target->bi_mode) && + backpointer_exists, c, + "directory %llu with multiple links", + target->bi_inum)) { + ret = remove_dirent(trans, d.k->p); + if (ret) + goto err; + return 0; + } + + if (fsck_err_on(backpointer_exists && + !target->bi_nlink, c, + "inode %llu has multiple links but i_nlink 0", + target->bi_inum)) { + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_UNLINKED; + + ret = write_inode(trans, target, target_snapshot); + if (ret) + goto err; + } + + if (fsck_err_on(!backpointer_exists, c, + "inode %llu has wrong backpointer:\n" + "got %llu:%llu\n" + 
"should be %llu:%llu", + target->bi_inum, + target->bi_dir, + target->bi_dir_offset, + d.k->p.inode, + d.k->p.offset)) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + + ret = write_inode(trans, target, target_snapshot); + if (ret) + goto err; + } + } + + if (fsck_err_on(vfs_d_type(d.v->d_type) != mode_to_type(target->bi_mode), c, + "incorrect d_type: should be %u:\n%s", + mode_to_type(target->bi_mode), + (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) { + struct bkey_i_dirent *n; + + n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); + if (!n) { + ret = -ENOMEM; + goto err; + } + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_type = mode_to_type(target->bi_mode); + + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_trans_update(trans, iter, &n->k_i, 0)); + kfree(n); + if (ret) + goto err; + } +err: +fsck_err: + return ret; } static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bch_hash_info *hash_info, - struct inode_walker *w, unsigned *nr_subdirs) + struct inode_walker *dir, + struct inode_walker *target, + struct snapshots_seen *s) { struct bch_fs *c = trans->c; struct bkey_s_c k; struct bkey_s_c_dirent d; - struct bch_inode_unpacked target; + struct inode_walker_entry *i; u32 target_snapshot; - bool have_target; - bool backpointer_exists = true; - u64 d_inum; + u32 target_subvol; + u64 target_inum; char buf[200]; int ret; @@ -736,38 +1430,49 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) return ret; - if (w->have_inode && - w->cur_inum != k.k->p.inode && - fsck_err_on(w->inode.bi_nlink != *nr_subdirs, c, - "directory %llu with wrong i_nlink: got %u, should be %u", - w->inode.bi_inum, w->inode.bi_nlink, *nr_subdirs)) { - w->inode.bi_nlink = *nr_subdirs; - ret = write_inode(trans, &w->inode, w->snapshot); - return ret ?: -EINTR; - } - - ret = __walk_inode(trans, w, k.k->p.inode); + ret = check_key_has_snapshot(trans, iter, k); if (ret) return ret; - if (w->first_this_inode) - *nr_subdirs = 0; + ret = snapshots_seen_update(c, s, k.k->p); + if (ret) + return ret; - if (fsck_err_on(!w->have_inode, c, + if (k.k->type == KEY_TYPE_whiteout) + return 0; + + if (dir->cur_inum != k.k->p.inode) { + ret = check_subdir_count(trans, dir); + if (ret) + return ret; + } + + ret = __walk_inode(trans, dir, k.k->p); + if (ret < 0) + return ret; + + if (fsck_err_on(ret == INT_MAX, c, "dirent in nonexisting directory:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)) || - fsck_err_on(!S_ISDIR(w->inode.bi_mode), c, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) + return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, + bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + + if (ret == INT_MAX) + return 0; + + i = dir->d + ret; + ret = 0; + + if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, "dirent in non directory inode type %u:\n%s", - mode_to_type(w->inode.bi_mode), + mode_to_type(i->inode.bi_mode), (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) return __bch2_trans_do(trans, NULL, NULL, 0, bch2_btree_delete_at(trans, iter, 0)); - if (!w->have_inode) - return 0; - - if (w->first_this_inode) - *hash_info = bch2_hash_info_init(c, &w->inode); + if (dir->first_this_inode) + *hash_info = bch2_hash_info_init(c, &dir->d[0].inode); ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); @@ -780,105 +1485,76 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, return 0; d = bkey_s_c_to_dirent(k); 
- d_inum = le64_to_cpu(d.v->d_inum); - ret = __lookup_inode(trans, d_inum, &target, &target_snapshot); + ret = __bch2_dirent_read_target(trans, d, + &target_subvol, + &target_snapshot, + &target_inum, + true); if (ret && ret != -ENOENT) return ret; - have_target = !ret; - ret = 0; - - if (fsck_err_on(!have_target, c, - "dirent points to missing inode:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) + if (fsck_err_on(ret, c, + "dirent points to missing subvolume %llu", + le64_to_cpu(d.v->d_inum))) return remove_dirent(trans, d.k->p); - if (!have_target) - return 0; + if (target_subvol) { + struct bch_inode_unpacked subvol_root; - if (!target.bi_dir && - !target.bi_dir_offset) { - target.bi_dir = k.k->p.inode; - target.bi_dir_offset = k.k->p.offset; + ret = __lookup_inode(trans, target_inum, + &subvol_root, &target_snapshot); + if (ret && ret != -ENOENT) + return ret; - ret = __write_inode(trans, &target, target_snapshot) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + if (fsck_err_on(ret, c, + "subvolume %u points to missing subvolume root %llu", + target_subvol, + target_inum)) { + bch_err(c, "repair not implemented yet"); + return -EINVAL; + } + + if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c, + "subvol root %llu has wrong bi_subvol field: got %u, should be %u", + target_inum, + subvol_root.bi_subvol, target_subvol)) { + subvol_root.bi_subvol = target_subvol; + ret = write_inode(trans, &subvol_root, target_snapshot); + if (ret) + return ret; + } + + ret = check_dirent_target(trans, iter, d, &subvol_root, + target_snapshot); if (ret) return ret; - return -EINTR; - } - - if (!inode_backpointer_matches(d, &target)) { - ret = inode_backpointer_exists(trans, &target); - if (ret < 0) + } else { + ret = __get_visible_inodes(trans, target, s, target_inum); + if (ret) return ret; - backpointer_exists = ret; - ret = 0; - - if (fsck_err_on(S_ISDIR(target.bi_mode) && - backpointer_exists, c, - "directory %llu with multiple links", - target.bi_inum)) - return remove_dirent(trans, d.k->p); - - if (fsck_err_on(backpointer_exists && - !target.bi_nlink, c, - "inode %llu has multiple links but i_nlink 0", - d_inum)) { - target.bi_nlink++; - target.bi_flags &= ~BCH_INODE_UNLINKED; - - ret = write_inode(trans, &target, target_snapshot); - return ret ?: -EINTR; + if (fsck_err_on(!target->nr, c, + "dirent points to missing inode:\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, + k), buf))) { + ret = remove_dirent(trans, d.k->p); + if (ret) + return ret; } - if (fsck_err_on(!backpointer_exists, c, - "inode %llu has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - d_inum, - target.bi_dir, - target.bi_dir_offset, - k.k->p.inode, - k.k->p.offset)) { - target.bi_dir = k.k->p.inode; - target.bi_dir_offset = k.k->p.offset; - - ret = write_inode(trans, &target, target_snapshot); - return ret ?: -EINTR; + for (i = target->d; i < target->d + target->nr; i++) { + ret = check_dirent_target(trans, iter, d, + &i->inode, i->snapshot); + if (ret) + return ret; } } - if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c, - "incorrect d_type: should be %u:\n%s", - mode_to_type(target.bi_mode), - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) { - struct bkey_i_dirent *n; + if (d.v->d_type == DT_DIR) + for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) + i->count++; - n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); - if (!n) - return -ENOMEM; - - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_type = mode_to_type(target.bi_mode); - - ret = 
__bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(trans, iter, &n->k_i, 0)); - kfree(n); - return ret ?: -EINTR; - } - - *nr_subdirs += d.v->d_type == DT_DIR; - return 0; fsck_err: return ret; } @@ -890,31 +1566,39 @@ fsck_err: noinline_for_stack static int check_dirents(struct bch_fs *c) { - struct inode_walker w = inode_walker_init(); + struct inode_walker dir = inode_walker_init(); + struct inode_walker target = inode_walker_init(); + struct snapshots_seen s; struct bch_hash_info hash_info; struct btree_trans trans; struct btree_iter iter; - unsigned nr_subdirs = 0; int ret = 0; bch_verbose(c, "checking dirents"); + snapshots_seen_init(&s); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH); + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); do { ret = lockrestart_do(&trans, - check_dirent(&trans, &iter, &hash_info, &w, &nr_subdirs)); + check_dirent(&trans, &iter, &hash_info, + &dir, &target, &s)); if (ret) break; } while (bch2_btree_iter_advance(&iter)); bch2_trans_iter_exit(&trans, &iter); - return bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); + inode_walker_exit(&dir); + inode_walker_exit(&target); + return ret; } /* @@ -937,15 +1621,22 @@ static int check_xattrs(struct bch_fs *c) bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH); + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); retry: + bch2_trans_begin(&trans); + while ((k = bch2_btree_iter_peek(&iter)).k && !(ret = bkey_err(k))) { - ret = walk_inode(&trans, &w, k.k->p.inode); + ret = check_key_has_snapshot(&trans, &iter, k); if (ret) break; - if (fsck_err_on(!w.have_inode, c, + ret = walk_inode(&trans, &w, k.k->p); + if (ret < 0) + break; + + if (fsck_err_on(ret == INT_MAX, c, "xattr for missing inode %llu", k.k->p.inode)) { ret = bch2_btree_delete_at(&trans, &iter, 0); @@ -954,14 +1645,18 @@ retry: continue; } - if (w.first_this_inode && w.have_inode) - hash_info = bch2_hash_info_init(c, &w.inode); + if (ret == INT_MAX) + goto next; + ret = 0; + + if (w.first_this_inode) + hash_info = bch2_hash_info_init(c, &w.d[0].inode); ret = hash_check_key(&trans, bch2_xattr_hash_desc, &hash_info, &iter, k); if (ret) break; - +next: bch2_btree_iter_advance(&iter); } fsck_err: @@ -973,40 +1668,63 @@ fsck_err: } /* Get root directory, create if it doesn't exist: */ -static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) +static int check_root(struct bch_fs *c) { - struct bkey_inode_buf packed; + struct btree_trans trans; + struct bch_inode_unpacked root_inode; u32 snapshot; + u64 inum; int ret; + bch2_trans_init(&trans, c, 0, 0); + bch_verbose(c, "checking root directory"); - ret = bch2_trans_do(c, NULL, NULL, 0, - lookup_inode(&trans, BCACHEFS_ROOT_INO, root_inode, &snapshot)); + ret = subvol_lookup(&trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); if (ret && ret != -ENOENT) return ret; - if (fsck_err_on(ret, c, "root directory missing")) - goto create_root; + if (mustfix_fsck_err_on(ret, c, "root subvol missing")) { + struct bkey_i_subvolume root_subvol; - if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c, - "root inode not a directory")) - goto create_root; + snapshot = U32_MAX; + inum = BCACHEFS_ROOT_INO; - return 0; + bkey_subvolume_init(&root_subvol.k_i); + root_subvol.k.p.offset = 
BCACHEFS_ROOT_SUBVOL; + root_subvol.v.flags = 0; + root_subvol.v.snapshot = cpu_to_le32(snapshot); + root_subvol.v.inode = cpu_to_le64(inum); + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + __bch2_btree_insert(&trans, BTREE_ID_subvolumes, &root_subvol.k_i)); + if (ret) { + bch_err(c, "error writing root subvol: %i", ret); + goto err; + } + + } + + ret = lookup_inode(&trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); + if (ret && ret != -ENOENT) + return ret; + + if (mustfix_fsck_err_on(ret, c, "root directory missing") || + mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c, + "root inode not a directory")) { + bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, + 0, NULL); + root_inode.bi_inum = inum; + + ret = write_inode(&trans, &root_inode, snapshot); + if (ret) + bch_err(c, "error writing root inode: %i", ret); + } +err: fsck_err: + bch2_trans_exit(&trans); return ret; -create_root: - bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755, - 0, NULL); - root_inode->bi_inum = BCACHEFS_ROOT_INO; - - bch2_inode_pack(c, &packed, root_inode); - - return bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i, - NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); } struct pathbuf { @@ -1041,29 +1759,30 @@ static int path_down(struct pathbuf *p, u64 inum) static int check_path(struct btree_trans *trans, struct pathbuf *p, - struct bch_inode_unpacked *inode) + struct bch_inode_unpacked *inode, + u32 snapshot) { struct bch_fs *c = trans->c; - u32 snapshot; size_t i; int ret = 0; + snapshot = snapshot_t(c, snapshot)->equiv; p->nr = 0; while (inode->bi_inum != BCACHEFS_ROOT_INO) { ret = lockrestart_do(trans, - inode_backpointer_exists(trans, inode)); + inode_backpointer_exists(trans, inode, snapshot)); if (ret < 0) break; if (!ret) { - if (fsck_err(c, "unreachable inode %llu, type %u nlink %u backptr %llu:%llu", - inode->bi_inum, + if (fsck_err(c, "unreachable inode %llu:%u, type %u nlink %u backptr %llu:%llu", + inode->bi_inum, snapshot, mode_to_type(inode->bi_mode), inode->bi_nlink, inode->bi_dir, inode->bi_dir_offset)) - ret = reattach_inode(trans, inode); + ret = reattach_inode(trans, inode, snapshot); break; } ret = 0; @@ -1086,13 +1805,13 @@ static int check_path(struct btree_trans *trans, return 0; ret = lockrestart_do(trans, - remove_backpointer(trans, inode)); + remove_backpointer(trans, inode)); if (ret) { bch_err(c, "error removing dirent: %i", ret); break; } - ret = reattach_inode(trans, inode); + ret = reattach_inode(trans, inode, snapshot); break; } @@ -1127,7 +1846,8 @@ static int check_directory_structure(struct bch_fs *c) for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH, k, ret) { + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { if (k.k->type != KEY_TYPE_inode) continue; @@ -1138,7 +1858,10 @@ static int check_directory_structure(struct bch_fs *c) break; } - ret = check_path(&trans, &path, &u); + if (u.bi_flags & BCH_INODE_UNLINKED) + continue; + + ret = check_path(&trans, &path, &u, iter.pos.snapshot); if (ret) break; } @@ -1196,8 +1919,9 @@ static int nlink_cmp(const void *_l, const void *_r) return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot); } -static void inc_link(struct bch_fs *c, struct nlink_table *links, - u64 range_start, u64 range_end, u64 inum) +static void inc_link(struct bch_fs *c, struct snapshots_seen *s, + struct nlink_table *links, + u64 range_start, u64 range_end, u64 inum, u32 snapshot) { struct nlink *link, key = { .inum = inum, 
.snapshot = U32_MAX, @@ -1208,8 +1932,18 @@ static void inc_link(struct bch_fs *c, struct nlink_table *links, link = __inline_bsearch(&key, links->d, links->nr, sizeof(links->d[0]), nlink_cmp); - if (link) - link->count++; + if (!link) + return; + + while (link > links->d && link[0].inum == link[-1].inum) + --link; + + for (; link < links->d + links->nr && link->inum == inum; link++) + if (ref_visible(c, s, snapshot, link->snapshot)) { + link->count++; + if (link->snapshot >= snapshot) + break; + } } noinline_for_stack @@ -1229,7 +1963,8 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS(0, start), BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH, k, ret) { + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { if (k.k->type != KEY_TYPE_inode) continue; @@ -1270,23 +2005,33 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links u64 range_start, u64 range_end) { struct btree_trans trans; + struct snapshots_seen s; struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_dirent d; int ret; + snapshots_seen_init(&s); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH, k, ret) { + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + ret = snapshots_seen_update(c, &s, k.k->p); + if (ret) + break; + switch (k.k->type) { case KEY_TYPE_dirent: d = bkey_s_c_to_dirent(k); - if (d.v->d_type != DT_DIR) - inc_link(c, links, range_start, range_end, - le64_to_cpu(d.v->d_inum)); + if (d.v->d_type != DT_DIR && + d.v->d_type != DT_SUBVOL) + inc_link(c, &s, links, range_start, range_end, + le64_to_cpu(d.v->d_inum), + d.k->p.snapshot); break; } @@ -1294,10 +2039,11 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links } bch2_trans_iter_exit(&trans, &iter); - ret = bch2_trans_exit(&trans) ?: ret; if (ret) bch_err(c, "error in fsck: btree error %i while walking dirents", ret); + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); return ret; } @@ -1319,7 +2065,8 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS(0, range_start), BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH, k, ret) { + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { if (k.k->p.offset >= range_end) break; @@ -1335,7 +2082,8 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, if (!u.bi_nlink) continue; - while (link->inum < k.k->p.offset) { + while ((cmp_int(link->inum, k.k->p.offset) ?: + cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { link++; BUG_ON(link >= links->d + links->nr); } @@ -1408,13 +2156,13 @@ static int check_nlinks(struct bch_fs *c) */ int bch2_fsck_full(struct bch_fs *c) { - struct bch_inode_unpacked root_inode; - - return check_inodes(c, true) ?: + return bch2_fs_snapshots_check(c) ?: + check_inodes(c, true) ?: + check_subvols(c) ?: check_extents(c) ?: check_dirents(c) ?: check_xattrs(c) ?: - check_root(c, &root_inode) ?: + check_root(c) ?: check_directory_structure(c) ?: check_nlinks(c); } diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 14b0e8c0..9130d571 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -6,8 +6,10 @@ #include "btree_update.h" #include "error.h" #include "extents.h" +#include "extent_update.h" #include "inode.h" #include "str_hash.h" +#include "subvolume.h" #include "varint.h" #include @@ -295,15 +297,21 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, int 
bch2_inode_peek(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, - u64 inum, unsigned flags) + subvol_inum inum, unsigned flags) { struct bkey_s_c k; + u32 snapshot; int ret; - if (trans->c->opts.inodes_use_key_cache) + if (0 && trans->c->opts.inodes_use_key_cache) flags |= BTREE_ITER_CACHED; - bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, inum), flags); + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, + SPOS(0, inum.inum, snapshot), flags); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) @@ -340,8 +348,8 @@ int bch2_inode_write(struct btree_trans *trans, const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - struct bch_inode_unpacked unpacked; + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + struct bch_inode_unpacked unpacked; if (k.k->p.inode) return "nonzero k.p.inode"; @@ -368,6 +376,9 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) unpacked.bi_nlink != 0) return "flagged as unlinked but bi_nlink != 0"; + if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) + return "subvolume root but not a directory"; + return NULL; } @@ -482,6 +493,9 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } +/* + * This just finds an empty slot: + */ int bch2_inode_create(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode_u, @@ -581,19 +595,77 @@ found_slot: return 0; } -int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) +static int bch2_inode_delete_keys(struct btree_trans *trans, + subvol_inum inum, enum btree_id id) +{ + u64 offset = 0; + int ret = 0; + + while (!ret || ret == -EINTR) { + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i delete; + u32 snapshot; + + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + continue; + + bch2_trans_iter_init(trans, &iter, id, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(&iter); + + if (!k.k || iter.pos.inode != inum.inum) { + bch2_trans_iter_exit(trans, &iter); + break; + } + + ret = bkey_err(k); + if (ret) + goto err; + + bkey_init(&delete.k); + delete.k.p = iter.pos; + + if (btree_node_type_is_extents(iter.btree_id)) { + unsigned max_sectors = + min_t(u64, U64_MAX - iter.pos.offset, + KEY_SIZE_MAX & (~0 << trans->c->block_bits)); + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + + ret = bch2_extent_trim_atomic(trans, &iter, &delete); + if (ret) + goto err; + } + + ret = bch2_trans_update(trans, &iter, &delete, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: + offset = iter.pos.offset; + bch2_trans_iter_exit(trans, &iter); + } + + return ret; +} + +int bch2_inode_rm(struct bch_fs *c, subvol_inum inum, bool cached) { struct btree_trans trans; struct btree_iter iter = { NULL }; struct bkey_i_inode_generation delete; - struct bpos start = POS(inode_nr, 0); - struct bpos end = POS(inode_nr + 1, 0); struct bch_inode_unpacked inode_u; struct bkey_s_c k; unsigned iter_flags = BTREE_ITER_INTENT; + u32 snapshot; int ret; - if (cached && c->opts.inodes_use_key_cache) + if (0 && cached && c->opts.inodes_use_key_cache) iter_flags |= BTREE_ITER_CACHED; bch2_trans_init(&trans, c, 0, 1024); @@ -606,19 +678,20 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) * XXX: 
the dirent could ideally would delete whiteouts when they're no * longer needed */ - ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, - start, end, NULL) ?: - bch2_btree_delete_range_trans(&trans, BTREE_ID_xattrs, - start, end, NULL) ?: - bch2_btree_delete_range_trans(&trans, BTREE_ID_dirents, - start, end, NULL); + ret = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?: + bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?: + bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents); if (ret) goto err; retry: bch2_trans_begin(&trans); + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, - POS(0, inode_nr), iter_flags); + SPOS(0, inum.inum, snapshot), iter_flags); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); @@ -628,13 +701,20 @@ retry: if (k.k->type != KEY_TYPE_inode) { bch2_fs_inconsistent(trans.c, "inode %llu not found when deleting", - inode_nr); + inum.inum); ret = -EIO; goto err; } bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); + /* Subvolume root? */ + if (inode_u.bi_subvol) { + ret = bch2_subvolume_delete(&trans, inode_u.bi_subvol, -1); + if (ret) + goto err; + } + bkey_inode_generation_init(&delete.k_i); delete.k.p = iter.pos; delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); @@ -651,20 +731,22 @@ err: return ret; } -static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, +static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, + subvol_inum inum, struct bch_inode_unpacked *inode) { - struct btree_iter iter = { NULL }; + struct btree_iter iter; int ret; - ret = bch2_inode_peek(trans, &iter, inode, inode_nr, 0); - bch2_trans_iter_exit(trans, &iter); + ret = bch2_inode_peek(trans, &iter, inode, inum, 0); + if (!ret) + bch2_trans_iter_exit(trans, &iter); return ret; } -int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, +int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, struct bch_inode_unpacked *inode) { return bch2_trans_do(c, NULL, NULL, 0, - bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); + bch2_inode_find_by_inum_trans(&trans, inum, inode)); } diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index 25bef104..9e84cddc 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -58,7 +58,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); int bch2_inode_peek(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *, u64, unsigned); + struct bch_inode_unpacked *, subvol_inum, unsigned); int bch2_inode_write(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *); @@ -74,9 +74,10 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, int bch2_inode_create(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *, u32, u64); -int bch2_inode_rm(struct bch_fs *, u64, bool); +int bch2_inode_rm(struct bch_fs *, subvol_inum, bool); -int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); +int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, + struct bch_inode_unpacked *); static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) { diff --git a/libbcachefs/io.c b/libbcachefs/io.c index ccde9001..0bc72d2a 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -27,6 +27,7 @@ #include "keylist.h" #include "move.h" #include "rebalance.h" +#include 
"subvolume.h" #include "super.h" #include "super-io.h" @@ -220,7 +221,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, : 0; if (!*usage_increasing && - (new_replicas > bch2_bkey_replicas(c, old) || + (new->k.p.snapshot != old.k->p.snapshot || + new_replicas > bch2_bkey_replicas(c, old) || (!new_compressed && bch2_bkey_sectors_compressed(old)))) *usage_increasing = true; @@ -256,6 +258,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, } int bch2_extent_update(struct btree_trans *trans, + subvol_inum inum, struct btree_iter *iter, struct bkey_i *k, struct disk_reservation *disk_res, @@ -314,8 +317,8 @@ int bch2_extent_update(struct btree_trans *trans, struct btree_iter inode_iter; struct bch_inode_unpacked inode_u; - ret = bch2_inode_peek(trans, &inode_iter, &inode_u, - k->k.p.inode, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, + BTREE_ITER_INTENT); if (ret) return ret; @@ -371,22 +374,37 @@ int bch2_extent_update(struct btree_trans *trans, return 0; } +/* + * Returns -EINTR if we had to drop locks: + */ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, - struct bpos end, u64 *journal_seq, - s64 *i_sectors_delta) + subvol_inum inum, u64 end, + u64 *journal_seq, s64 *i_sectors_delta) { struct bch_fs *c = trans->c; unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct bpos end_pos = POS(inum.inum, end); struct bkey_s_c k; int ret = 0, ret2 = 0; + u32 snapshot; - while ((bch2_trans_begin(trans), - (k = bch2_btree_iter_peek(iter)).k) && - bkey_cmp(iter->pos, end) < 0) { + while (1) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i delete; + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto btree_err; + + bch2_btree_iter_set_snapshot(iter, snapshot); + + k = bch2_btree_iter_peek(iter); + if (bkey_cmp(iter->pos, end_pos) >= 0) + break; + ret = bkey_err(k); if (ret) goto btree_err; @@ -396,9 +414,9 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, /* create the biggest key we can */ bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end, &delete); + bch2_cut_back(end_pos, &delete); - ret = bch2_extent_update(trans, iter, &delete, + ret = bch2_extent_update(trans, inum, iter, &delete, &disk_res, journal_seq, 0, i_sectors_delta, false); bch2_disk_reservation_put(c, &disk_res); @@ -411,36 +429,31 @@ btree_err: break; } - if (bkey_cmp(iter->pos, end) > 0) { - bch2_btree_iter_set_pos(iter, end); - ret = bch2_btree_iter_traverse(iter); - } + if (bkey_cmp(iter->pos, end_pos) > 0) + bch2_btree_iter_set_pos(iter, end_pos); return ret ?: ret2; } -int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, +int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, u64 *journal_seq, s64 *i_sectors_delta) { struct btree_trans trans; struct btree_iter iter; - int ret = 0; + int ret; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - POS(inum, start), - BTREE_ITER_INTENT); + POS(inum.inum, start), + BTREE_ITER_INTENT); - ret = bch2_fpunch_at(&trans, &iter, POS(inum, end), + ret = bch2_fpunch_at(&trans, &iter, inum, end, journal_seq, i_sectors_delta); bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); - if (ret == -EINTR) - ret = 0; - - return ret; + return ret == -EINTR ? 
0 : ret; } int bch2_write_index_default(struct bch_write_op *op) @@ -451,40 +464,51 @@ int bch2_write_index_default(struct bch_write_op *op) struct bkey_i *k = bch2_keylist_front(keys); struct btree_trans trans; struct btree_iter iter; + subvol_inum inum = { + .subvol = op->subvol, + .inum = k->k.p.inode, + }; int ret; + BUG_ON(!inum.subvol); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - bkey_start_pos(&k->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - do { bch2_trans_begin(&trans); k = bch2_keylist_front(keys); + bch2_bkey_buf_copy(&sk, c, k); - k->k.p.snapshot = iter.snapshot; + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, + &sk.k->k.p.snapshot); + if (ret == -EINTR) + continue; + if (ret) + break; - bch2_bkey_buf_realloc(&sk, c, k->k.u64s); - bkey_copy(sk.k, k); - bch2_cut_front(iter.pos, sk.k); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bkey_start_pos(&sk.k->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - ret = bch2_extent_update(&trans, &iter, sk.k, + ret = bch2_extent_update(&trans, inum, &iter, sk.k, &op->res, op_journal_seq(op), op->new_i_size, &op->i_sectors_delta, op->flags & BCH_WRITE_CHECK_ENOSPC); + bch2_trans_iter_exit(&trans, &iter); + if (ret == -EINTR) continue; if (ret) break; if (bkey_cmp(iter.pos, k->k.p) >= 0) - bch2_keylist_pop_front(keys); + bch2_keylist_pop_front(&op->insert_keys); + else + bch2_cut_front(iter.pos, k); } while (!bch2_keylist_empty(keys)); - bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); @@ -1645,7 +1669,7 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) } static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, + struct bvec_iter bvec_iter, struct bch_io_failures *failed, unsigned flags) { @@ -1709,7 +1733,10 @@ static void bch2_rbio_retry(struct work_struct *work) struct bch_fs *c = rbio->c; struct bvec_iter iter = rbio->bvec_iter; unsigned flags = rbio->flags; - u64 inode = rbio->read_pos.inode; + subvol_inum inum = { + .subvol = rbio->subvol, + .inum = rbio->read_pos.inode, + }; struct bch_io_failures failed = { .nr = 0 }; trace_read_retry(&rbio->bio); @@ -1725,12 +1752,12 @@ static void bch2_rbio_retry(struct work_struct *work) flags &= ~BCH_READ_MAY_PROMOTE; if (flags & BCH_READ_NODECODE) { - bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); + bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); } else { flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; - __bch2_read(c, rbio, iter, inode, &failed, flags); + __bch2_read(c, rbio, iter, inum, &failed, flags); } } @@ -1804,7 +1831,8 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, if (!bch2_bkey_narrow_crcs(new, new_crc)) goto out; - ret = bch2_trans_update(trans, &iter, new, 0); + ret = bch2_trans_update(trans, &iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); out: bch2_trans_iter_exit(trans, &iter); return ret; @@ -2172,6 +2200,7 @@ get_bio: /* XXX: only initialize this if needed */ rbio->devs_have = bch2_bkey_devs(k); rbio->pick = pick; + rbio->subvol = orig->subvol; rbio->read_pos = read_pos; rbio->data_btree = data_btree; rbio->data_pos = data_pos; @@ -2274,25 +2303,31 @@ out_read_done: } void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, + struct bvec_iter bvec_iter, subvol_inum inum, struct bch_io_failures *failed, unsigned flags) { struct btree_trans trans; struct 
btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; + u32 snapshot; int ret; BUG_ON(flags & BCH_READ_NODECODE); bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - POS(inode, bvec_iter.bi_sector), - BTREE_ITER_SLOTS); retry: bch2_trans_begin(&trans); + iter = (struct btree_iter) { NULL }; + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, bvec_iter.bi_sector, snapshot), + BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS); while (1) { unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; @@ -2307,7 +2342,7 @@ retry: } bch2_btree_iter_set_pos(&iter, - POS(inode, bvec_iter.bi_sector)); + POS(inum.inum, bvec_iter.bi_sector)); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); @@ -2357,16 +2392,17 @@ retry: swap(bvec_iter.bi_size, bytes); bio_advance_iter(&rbio->bio, &bvec_iter, bytes); } +err: + bch2_trans_iter_exit(&trans, &iter); if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID) goto retry; - bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); if (ret) { - bch_err_inum_ratelimited(c, inode, + bch_err_inum_ratelimited(c, inum.inum, "read error %i from btree lookup", ret); rbio->bio.bi_status = BLK_STS_IOERR; bch2_rbio_done(rbio); diff --git a/libbcachefs/io.h b/libbcachefs/io.h index bc0a0bd6..38efd39c 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -63,12 +63,13 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, struct bkey_i *, bool *, bool *, s64 *, s64 *); -int bch2_extent_update(struct btree_trans *, struct btree_iter *, - struct bkey_i *, struct disk_reservation *, - u64 *, u64, s64 *, bool); +int bch2_extent_update(struct btree_trans *, subvol_inum, + struct btree_iter *, struct bkey_i *, + struct disk_reservation *, u64 *, u64, s64 *, bool); + int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, - struct bpos, u64 *, s64 *); -int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *); + subvol_inum, u64, u64 *, s64 *); +int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, u64 *, s64 *); int bch2_write_index_default(struct bch_write_op *); @@ -90,6 +91,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->devs_have.nr = 0; op->target = 0; op->opts = opts; + op->subvol = 0; op->pos = POS_MAX; op->version = ZERO_VERSION; op->write_point = (struct write_point_specifier) { 0 }; @@ -157,10 +159,10 @@ static inline void bch2_read_extent(struct btree_trans *trans, } void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - u64, struct bch_io_failures *, unsigned flags); + subvol_inum, struct bch_io_failures *, unsigned flags); static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - u64 inode) + subvol_inum inum) { struct bch_io_failures failed = { .nr = 0 }; @@ -168,8 +170,9 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, rbio->c = c; rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; - __bch2_read(c, rbio, rbio->bio.bi_iter, inode, &failed, + __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE| BCH_READ_USER_MAPPED); diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index 0aab7795..78bff13d 100644 --- 
a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -62,6 +62,7 @@ struct bch_read_bio { /* * pos we read from - different from data_pos for indirect extents: */ + u32 subvol; struct bpos read_pos; /* @@ -122,6 +123,7 @@ struct bch_write_op { u16 nonce; struct bch_io_opts opts; + u32 subvol; struct bpos pos; struct bversion version; diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index 1899326d..7c764ee4 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -48,7 +48,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); while ((k = bch2_btree_iter_peek(&iter)).k && !(ret = bkey_err(k))) { @@ -74,7 +75,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k)); ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(&trans, &iter, sk.k, 0) ?: + bch2_trans_update(&trans, &iter, sk.k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 7001e3cd..44a61818 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -13,6 +13,7 @@ #include "journal_reclaim.h" #include "move.h" #include "replicas.h" +#include "subvolume.h" #include "super-io.h" #include "keylist.h" @@ -53,6 +54,81 @@ struct moving_context { wait_queue_head_t wait; }; +static int insert_snapshot_whiteouts(struct btree_trans *trans, + enum btree_id id, + struct bpos old_pos, + struct bpos new_pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter, update_iter; + struct bkey_s_c k; + struct snapshots_seen s; + int ret; + + if (!btree_type_has_snapshots(id)) + return 0; + + snapshots_seen_init(&s); + + if (!bkey_cmp(old_pos, new_pos)) + return 0; + + if (!snapshot_t(c, old_pos.snapshot)->children[0]) + return 0; + + bch2_trans_iter_init(trans, &iter, id, old_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while (1) { +next: + k = bch2_btree_iter_prev(&iter); + ret = bkey_err(k); + if (ret) + break; + + if (bkey_cmp(old_pos, k.k->p)) + break; + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { + struct bkey_i *update; + size_t i; + + for (i = 0; i < s.nr; i++) + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i])) + goto next; + + update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + break; + + bkey_init(&update->k); + update->k.p = new_pos; + update->k.p.snapshot = k.k->p.snapshot; + + bch2_trans_iter_init(trans, &update_iter, id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&update_iter) ?: + bch2_trans_update(trans, &update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &update_iter); + if (ret) + break; + + ret = snapshots_seen_add(c, &s, k.k->p.snapshot); + if (ret) + break; + } + } + bch2_trans_iter_exit(trans, &iter); + kfree(s.d); + + return ret; +} + static int bch2_migrate_index_update(struct bch_write_op *op) { struct bch_fs *c = op->c; @@ -166,7 +242,10 @@ static int bch2_migrate_index_update(struct bch_write_op *op) next_pos = insert->k.p; - ret = bch2_trans_update(&trans, &iter, insert, 0) ?: + ret = insert_snapshot_whiteouts(&trans, m->btree_id, + k.k->p, insert->k.p) ?: + 
bch2_trans_update(&trans, &iter, insert, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(&trans, &op->res, op_journal_seq(op), BTREE_INSERT_NOFAIL| @@ -581,7 +660,8 @@ static int __bch2_move_data(struct bch_fs *c, stats->pos = start; bch2_trans_iter_init(&trans, &iter, btree_id, start, - BTREE_ITER_PREFETCH); + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); if (rate) bch2_ratelimit_reset(rate); diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index 5de29607..ff99c6d2 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -63,7 +63,7 @@ const char * const bch2_member_states[] = { #undef x -const char * const bch2_d_types[DT_MAX] = { +const char * const bch2_d_types[BCH_DT_MAX] = { [DT_UNKNOWN] = "unknown", [DT_FIFO] = "fifo", [DT_CHR] = "chr", @@ -73,6 +73,7 @@ const char * const bch2_d_types[DT_MAX] = { [DT_LNK] = "lnk", [DT_SOCK] = "sock", [DT_WHT] = "whiteout", + [DT_SUBVOL] = "subvol", }; void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 147b4021..d39d6a54 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -215,19 +215,19 @@ enum opt_type { BCH_SB_POSIX_ACL, true, \ NULL, "Enable POSIX acls") \ x(usrquota, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + 0, \ OPT_BOOL(), \ - BCH_SB_USRQUOTA, false, \ + NO_SB_OPT, false, \ NULL, "Enable user quotas") \ x(grpquota, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + 0, \ OPT_BOOL(), \ - BCH_SB_GRPQUOTA, false, \ + NO_SB_OPT, false, \ NULL, "Enable group quotas") \ x(prjquota, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + 0, \ OPT_BOOL(), \ - BCH_SB_PRJQUOTA, false, \ + NO_SB_OPT, false, \ NULL, "Enable project quotas") \ x(degraded, u8, \ OPT_MOUNT, \ diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 11208e83..64e0b542 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -20,6 +20,7 @@ #include "quota.h" #include "recovery.h" #include "replicas.h" +#include "subvolume.h" #include "super-io.h" #include @@ -961,6 +962,81 @@ fsck_err: return ret; } +static int bch2_fs_initialize_subvolumes(struct bch_fs *c) +{ + struct bkey_i_snapshot root_snapshot; + struct bkey_i_subvolume root_volume; + int ret; + + bkey_snapshot_init(&root_snapshot.k_i); + root_snapshot.k.p.offset = U32_MAX; + root_snapshot.v.flags = 0; + root_snapshot.v.parent = 0; + root_snapshot.v.subvol = BCACHEFS_ROOT_SUBVOL; + root_snapshot.v.pad = 0; + SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); + + ret = bch2_btree_insert(c, BTREE_ID_snapshots, + &root_snapshot.k_i, + NULL, NULL, 0); + if (ret) + return ret; + + + bkey_subvolume_init(&root_volume.k_i); + root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_volume.v.flags = 0; + root_volume.v.snapshot = cpu_to_le32(U32_MAX); + root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); + + ret = bch2_btree_insert(c, BTREE_ID_subvolumes, + &root_volume.k_i, + NULL, NULL, 0); + if (ret) + return ret; + + return 0; +} + +static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked inode; + struct bkey_inode_buf *packed; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + POS(0, BCACHEFS_ROOT_INO), 0); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_inode) { + bch_err(c, "root inode not found"); + ret = -ENOENT; + goto err; + } + + ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &inode); + BUG_ON(ret); + + inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; + + 
packed = bch2_trans_kmalloc(trans, sizeof(*packed)); + ret = PTR_ERR_OR_ZERO(packed); + if (ret) + goto err; + + bch2_inode_pack(c, packed, &inode); + ret = bch2_trans_update(trans, &iter, &packed->inode.k_i, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + int bch2_fs_recovery(struct bch_fs *c) { const char *err = "cannot allocate memory"; @@ -1017,11 +1093,12 @@ int bch2_fs_recovery(struct bch_fs *c) c->opts.version_upgrade = true; c->opts.fsck = true; c->opts.fix_errors = FSCK_OPT_YES; - } - - if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) { + } else if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) { bch_info(c, "version prior to btree_ptr_sectors_written, upgrade required"); c->opts.version_upgrade = true; + } else if (c->sb.version < bcachefs_metadata_version_snapshot) { + bch_info(c, "filesystem version is prior to snapshot field - upgrading"); + c->opts.version_upgrade = true; } ret = bch2_blacklist_table_initialize(c); @@ -1190,6 +1267,29 @@ use_clean: bch_verbose(c, "alloc write done"); } + if (c->sb.version < bcachefs_metadata_version_snapshot) { + err = "error creating root snapshot node"; + ret = bch2_fs_initialize_subvolumes(c); + if (ret) + goto err; + } + + bch_verbose(c, "reading snapshots table"); + err = "error reading snapshots table"; + ret = bch2_fs_snapshots_start(c); + if (ret) + goto err; + bch_verbose(c, "reading snapshots done"); + + if (c->sb.version < bcachefs_metadata_version_snapshot) { + /* set bi_subvol on root inode */ + err = "error upgrade root inode for subvolumes"; + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, + bch2_fs_upgrade_for_subvolumes(&trans)); + if (ret) + goto err; + } + if (c->opts.fsck) { bch_info(c, "starting fsck"); err = "error in fsck"; @@ -1350,9 +1450,22 @@ int bch2_fs_initialize(struct bch_fs *c) } } + err = "error creating root snapshot node"; + ret = bch2_fs_initialize_subvolumes(c); + if (ret) + goto err; + + bch_verbose(c, "reading snapshots table"); + err = "error reading snapshots table"; + ret = bch2_fs_snapshots_start(c); + if (ret) + goto err; + bch_verbose(c, "reading snapshots done"); + bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); - root_inode.bi_inum = BCACHEFS_ROOT_INO; + root_inode.bi_inum = BCACHEFS_ROOT_INO; + root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; bch2_inode_pack(c, &packed_inode, &root_inode); packed_inode.inode.k.p.snapshot = U32_MAX; @@ -1367,11 +1480,12 @@ int bch2_fs_initialize(struct bch_fs *c) err = "error creating lost+found"; ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_create_trans(&trans, BCACHEFS_ROOT_INO, + bch2_create_trans(&trans, + BCACHEFS_ROOT_SUBVOL_INUM, &root_inode, &lostfound_inode, &lostfound, 0, 0, S_IFDIR|0700, 0, - NULL, NULL)); + NULL, NULL, (subvol_inum) { 0 }, 0)); if (ret) { bch_err(c, "error creating lost+found"); goto err; diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index 576cfbcc..92ff6094 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -7,6 +7,7 @@ #include "inode.h" #include "io.h" #include "reflink.h" +#include "subvolume.h" #include @@ -197,7 +198,8 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) } s64 bch2_remap_range(struct bch_fs *c, - struct bpos dst_start, struct bpos src_start, + subvol_inum dst_inum, u64 dst_offset, + subvol_inum src_inum, u64 src_offset, u64 remap_sectors, u64 *journal_seq, u64 new_i_size, s64 *i_sectors_delta) { @@ -205,9 +207,12 @@ s64 bch2_remap_range(struct bch_fs *c, 
struct btree_iter dst_iter, src_iter; struct bkey_s_c src_k; struct bkey_buf new_dst, new_src; + struct bpos dst_start = POS(dst_inum.inum, dst_offset); + struct bpos src_start = POS(src_inum.inum, src_offset); struct bpos dst_end = dst_start, src_end = src_start; struct bpos src_want; u64 dst_done; + u32 dst_snapshot, src_snapshot; int ret = 0, ret2 = 0; if (!percpu_ref_tryget(&c->writes)) @@ -238,6 +243,20 @@ s64 bch2_remap_range(struct bch_fs *c, break; } + ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol, + &src_snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(&src_iter, src_snapshot); + + ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol, + &dst_snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot); + dst_done = dst_iter.pos.offset - dst_start.offset; src_want = POS(src_start.inode, src_start.offset + dst_done); bch2_btree_iter_set_pos(&src_iter, src_want); @@ -248,11 +267,11 @@ s64 bch2_remap_range(struct bch_fs *c, continue; if (bkey_cmp(src_want, src_iter.pos) < 0) { - ret = bch2_fpunch_at(&trans, &dst_iter, - bpos_min(dst_end, - POS(dst_iter.pos.inode, dst_iter.pos.offset + - src_iter.pos.offset - src_want.offset)), - journal_seq, i_sectors_delta); + ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum, + min(dst_end.offset, + dst_iter.pos.offset + + src_iter.pos.offset - src_want.offset), + journal_seq, i_sectors_delta); continue; } @@ -289,8 +308,9 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_key_resize(&new_dst.k->k, min(src_k.k->p.offset - src_want.offset, dst_end.offset - dst_iter.pos.offset)); - ret = bch2_extent_update(&trans, &dst_iter, new_dst.k, - &disk_res, journal_seq, + + ret = bch2_extent_update(&trans, dst_inum, &dst_iter, + new_dst.k, &disk_res, journal_seq, new_i_size, i_sectors_delta, true); bch2_disk_reservation_put(c, &disk_res); @@ -311,7 +331,7 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_trans_begin(&trans); ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u, - dst_start.inode, BTREE_ITER_INTENT); + dst_inum, BTREE_ITER_INTENT); if (!ret2 && inode_u.bi_size < new_i_size) { diff --git a/libbcachefs/reflink.h b/libbcachefs/reflink.h index 68c5cb5a..4c1b8286 100644 --- a/libbcachefs/reflink.h +++ b/libbcachefs/reflink.h @@ -57,7 +57,7 @@ static inline __le64 *bkey_refcount(struct bkey_i *k) } } -s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, - u64, u64 *, u64, s64 *); +s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, + subvol_inum, u64, u64, u64 *, u64, s64 *); #endif /* _BCACHEFS_REFLINK_H */ diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index c6a132b3..6486e709 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -8,6 +8,7 @@ #include "error.h" #include "inode.h" #include "siphash.h" +#include "subvolume.h" #include "super.h" #include @@ -144,16 +145,21 @@ bch2_hash_lookup(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, - u64 inode, const void *key, + subvol_inum inum, const void *key, unsigned flags) { struct bkey_s_c k; + u32 snapshot; int ret; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + for_each_btree_key(trans, *iter, desc.btree_id, - POS(inode, desc.hash_key(info, key)), + SPOS(inum.inum, desc.hash_key(info, key), snapshot), BTREE_ITER_SLOTS|flags, k, ret) { - if (iter->pos.inode != inode) + if (iter->pos.inode != inum.inum) break; if (k.k->type == desc.key_type) { @@ -176,15 +182,20 @@ bch2_hash_hole(struct 
btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, - u64 inode, const void *key) + subvol_inum inum, const void *key) { struct bkey_s_c k; + u32 snapshot; int ret; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + for_each_btree_key(trans, *iter, desc.btree_id, - POS(inode, desc.hash_key(info, key)), + SPOS(inum.inum, desc.hash_key(info, key), snapshot), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (iter->pos.inode != inode) + if (iter->pos.inode != inum.inum) break; if (k.k->type != desc.key_type) @@ -229,17 +240,25 @@ static __always_inline int bch2_hash_set(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, - u64 inode, struct bkey_i *insert, int flags) + subvol_inum inum, + struct bkey_i *insert, int flags) { struct btree_iter iter, slot = { NULL }; struct bkey_s_c k; bool found = false; + u32 snapshot; int ret; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + for_each_btree_key(trans, iter, desc.btree_id, - POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), + SPOS(inum.inum, + desc.hash_bkey(info, bkey_i_to_s_c(insert)), + snapshot), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (iter.pos.inode != inode) + if (iter.pos.inode != inum.inum) break; if (k.k->type == desc.key_type) { @@ -288,7 +307,8 @@ static __always_inline int bch2_hash_delete_at(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, - struct btree_iter *iter) + struct btree_iter *iter, + unsigned update_flags) { struct bkey_i *delete; int ret; @@ -306,24 +326,24 @@ int bch2_hash_delete_at(struct btree_trans *trans, delete->k.p = iter->pos; delete->k.type = ret ? 
KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; - return bch2_trans_update(trans, iter, delete, 0); + return bch2_trans_update(trans, iter, delete, update_flags); } static __always_inline int bch2_hash_delete(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, - u64 inode, const void *key) + subvol_inum inum, const void *key) { struct btree_iter iter; int ret; - ret = bch2_hash_lookup(trans, &iter, desc, info, inode, key, + ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key, BTREE_ITER_INTENT); if (ret) return ret; - ret = bch2_hash_delete_at(trans, desc, info, &iter); + ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c new file mode 100644 index 00000000..ff3b4d2d --- /dev/null +++ b/libbcachefs/subvolume.c @@ -0,0 +1,981 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" +#include "btree_update.h" +#include "error.h" +#include "subvolume.h" + +/* Snapshot tree: */ + +static void bch2_delete_dead_snapshots_work(struct work_struct *); +static void bch2_delete_dead_snapshots(struct bch_fs *); + +void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); + + pr_buf(out, "is_subvol %llu deleted %llu parent %u children %u %u subvol %u", + BCH_SNAPSHOT_SUBVOL(s.v), + BCH_SNAPSHOT_DELETED(s.v), + le32_to_cpu(s.v->parent), + le32_to_cpu(s.v->children[0]), + le32_to_cpu(s.v->children[1]), + le32_to_cpu(s.v->subvol)); +} + +const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_snapshot s; + u32 i, id; + + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 || + bkey_cmp(k.k->p, POS(0, 1)) < 0) + return "bad pos"; + + if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) + return "bad val size"; + + s = bkey_s_c_to_snapshot(k); + + id = le32_to_cpu(s.v->parent); + if (id && id <= k.k->p.offset) + return "bad parent node"; + + if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) + return "children not normalized"; + + if (s.v->children[0] && + s.v->children[0] == s.v->children[1]) + return "duplicate child nodes"; + + for (i = 0; i < 2; i++) { + id = le32_to_cpu(s.v->children[i]); + + if (id >= k.k->p.offset) + return "bad child node"; + } + + return NULL; +} + +int bch2_mark_snapshot(struct bch_fs *c, + struct bkey_s_c old, struct bkey_s_c new, + u64 journal_seq, unsigned flags) +{ + struct snapshot_t *t; + + t = genradix_ptr_alloc(&c->snapshots, + U32_MAX - new.k->p.offset, + GFP_KERNEL); + if (!t) + return -ENOMEM; + + if (new.k->type == KEY_TYPE_snapshot) { + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); + + t->parent = le32_to_cpu(s.v->parent); + t->children[0] = le32_to_cpu(s.v->children[0]); + t->children[1] = le32_to_cpu(s.v->children[1]); + t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0; + } else { + t->parent = 0; + t->children[0] = 0; + t->children[1] = 0; + t->subvol = 0; + } + + return 0; +} + +static int subvol_lookup(struct btree_trans *trans, unsigned id, struct bch_subvolume *s) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, id), 0); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 
0 : -ENOENT; + + if (!ret) + *s = *bkey_s_c_to_subvolume(k).v; + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int snapshot_lookup(struct btree_trans *trans, u32 id, + struct bch_snapshot *s) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k) ?: k.k->type == KEY_TYPE_snapshot ? 0 : -ENOENT; + + if (!ret) + *s = *bkey_s_c_to_snapshot(k).v; + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int snapshot_live(struct btree_trans *trans, u32 id) +{ + struct bch_snapshot v; + int ret; + + if (!id) + return 0; + + ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); + if (ret == -ENOENT) + bch_err(trans->c, "snapshot node %u not found", id); + if (ret) + return ret; + + return !BCH_SNAPSHOT_DELETED(&v); +} + +static int bch2_snapshots_set_equiv(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_snapshot snap; + unsigned i; + int ret; + + for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + u32 id = k.k->p.offset, child[2]; + unsigned nr_live = 0, live_idx; + + if (k.k->type != KEY_TYPE_snapshot) + continue; + + snap = bkey_s_c_to_snapshot(k); + child[0] = le32_to_cpu(snap.v->children[0]); + child[1] = le32_to_cpu(snap.v->children[1]); + + for (i = 0; i < 2; i++) { + ret = snapshot_live(trans, child[i]); + if (ret < 0) + break; + + if (ret) + live_idx = i; + nr_live += ret; + } + + snapshot_t(c, id)->equiv = nr_live == 1 + ? snapshot_t(c, child[live_idx])->equiv + : id; + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + bch_err(c, "error walking snapshots: %i", ret); + + return ret; +} + +/* fsck: */ +static int bch2_snapshot_check(struct btree_trans *trans, + struct bkey_s_c_snapshot s) +{ + struct bch_subvolume subvol; + struct bch_snapshot v; + u32 i, id; + int ret; + + id = le32_to_cpu(s.v->subvol); + ret = lockrestart_do(trans, subvol_lookup(trans, id, &subvol)); + if (ret == -ENOENT) + bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u", + s.k->p.offset, id); + if (ret) + return ret; + + if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { + bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", + s.k->p.offset); + return -EINVAL; + } + + id = le32_to_cpu(s.v->parent); + if (id) { + ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); + if (ret == -ENOENT) + bch_err(trans->c, "snapshot node %llu has nonexistent parent %u", + s.k->p.offset, id); + if (ret) + return ret; + + if (le32_to_cpu(v.children[0]) != s.k->p.offset && + le32_to_cpu(v.children[1]) != s.k->p.offset) { + bch_err(trans->c, "snapshot parent %u missing pointer to child %llu", + id, s.k->p.offset); + return -EINVAL; + } + } + + for (i = 0; i < 2 && s.v->children[i]; i++) { + id = le32_to_cpu(s.v->children[i]); + + ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); + if (ret == -ENOENT) + bch_err(trans->c, "snapshot node %llu has nonexistent child %u", + s.k->p.offset, id); + if (ret) + return ret; + + if (le32_to_cpu(v.parent) != s.k->p.offset) { + bch_err(trans->c, "snapshot child %u has wrong parent (got %u should be %llu)", + id, le32_to_cpu(v.parent), s.k->p.offset); + return -EINVAL; + } + } + + return 0; +} + +int bch2_fs_snapshots_check(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; 
+ struct bch_snapshot s; + unsigned id; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_snapshot) + continue; + + ret = bch2_snapshot_check(&trans, bkey_s_c_to_snapshot(k)); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { + bch_err(c, "error %i checking snapshots", ret); + goto err; + } + + for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_subvolume) + continue; +again_2: + id = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); + ret = snapshot_lookup(&trans, id, &s); + + if (ret == -EINTR) { + k = bch2_btree_iter_peek(&iter); + goto again_2; + } else if (ret == -ENOENT) + bch_err(c, "subvolume %llu points to nonexistent snapshot %u", + k.k->p.offset, id); + else if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); +err: + bch2_trans_exit(&trans); + return ret; +} + +void bch2_fs_snapshots_exit(struct bch_fs *c) +{ + genradix_free(&c->snapshots); +} + +int bch2_fs_snapshots_start(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + bool have_deleted = false; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) + break; + + if (k.k->type != KEY_TYPE_snapshot) { + bch_err(c, "found wrong key type %u in snapshot node table", + k.k->type); + continue; + } + + if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) + have_deleted = true; + + ret = bch2_mark_snapshot(c, bkey_s_c_null, k, 0, 0); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) + goto err; + + ret = bch2_snapshots_set_equiv(&trans); + if (ret) + goto err; +err: + bch2_trans_exit(&trans); + + if (!ret && have_deleted) { + bch_info(c, "restarting deletion of dead snapshots"); + if (c->opts.fsck) { + bch2_delete_dead_snapshots_work(&c->snapshot_delete_work); + } else { + bch2_delete_dead_snapshots(c); + } + } + + return ret; +} + +/* + * Mark a snapshot as deleted, for future cleanup: + */ +static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_snapshot *s; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_snapshot) { + bch2_fs_inconsistent(trans->c, "missing snapshot %u", id); + ret = -ENOENT; + goto err; + } + + /* already deleted? 
*/ + if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) + goto err; + + s = bch2_trans_kmalloc(trans, sizeof(*s)); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + goto err; + + bkey_reassemble(&s->k_i, k); + + SET_BCH_SNAPSHOT_DELETED(&s->v, true); + ret = bch2_trans_update(trans, &iter, &s->k_i, 0); + if (ret) + goto err; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) +{ + struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; + struct bkey_s_c k; + struct bkey_s_c_snapshot s; + struct bkey_i_snapshot *parent; + u32 parent_id; + unsigned i; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_snapshot) { + bch2_fs_inconsistent(trans->c, "missing snapshot %u", id); + ret = -ENOENT; + goto err; + } + + s = bkey_s_c_to_snapshot(k); + + BUG_ON(!BCH_SNAPSHOT_DELETED(s.v)); + parent_id = le32_to_cpu(s.v->parent); + + if (parent_id) { + bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots, + POS(0, parent_id), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&p_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_snapshot) { + bch2_fs_inconsistent(trans->c, "missing snapshot %u", parent_id); + ret = -ENOENT; + goto err; + } + + parent = bch2_trans_kmalloc(trans, sizeof(*parent)); + ret = PTR_ERR_OR_ZERO(parent); + if (ret) + goto err; + + bkey_reassemble(&parent->k_i, k); + + for (i = 0; i < 2; i++) + if (le32_to_cpu(parent->v.children[i]) == id) + break; + + if (i == 2) + bch_err(trans->c, "snapshot %u missing child pointer to %u", + parent_id, id); + else + parent->v.children[i] = 0; + + if (le32_to_cpu(parent->v.children[0]) < + le32_to_cpu(parent->v.children[1])) + swap(parent->v.children[0], + parent->v.children[1]); + + ret = bch2_trans_update(trans, &p_iter, &parent->k_i, 0); + if (ret) + goto err; + } + + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &p_iter); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct btree_iter iter; + struct bkey_i_snapshot *n; + struct bkey_s_c k; + unsigned i; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, + POS_MIN, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + for (i = 0; i < nr_snapids; i++) { + k = bch2_btree_iter_prev_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k || !k.k->p.offset) { + ret = -ENOSPC; + goto err; + } + + n = bch2_trans_kmalloc(trans, sizeof(*n)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_snapshot_init(&n->k_i); + n->k.p = iter.pos; + n->v.flags = 0; + n->v.parent = cpu_to_le32(parent); + n->v.subvol = cpu_to_le32(snapshot_subvols[i]); + n->v.pad = 0; + SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); + + bch2_trans_update(trans, &iter, &n->k_i, 0); + + ret = bch2_mark_snapshot(trans->c, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0, 0); + if (ret) + break; + + new_snapids[i] = iter.pos.offset; + } + + if (parent) { + bch2_btree_iter_set_pos(&iter, POS(0, parent)); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_snapshot) { + bch_err(trans->c, "snapshot %u not found", parent); + ret 
= -ENOENT; + goto err; + } + + n = bch2_trans_kmalloc(trans, sizeof(*n)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_reassemble(&n->k_i, k); + + if (n->v.children[0] || n->v.children[1]) { + bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); + ret = -EINVAL; + goto err; + } + + n->v.children[0] = cpu_to_le32(new_snapids[0]); + n->v.children[1] = cpu_to_le32(new_snapids[1]); + SET_BCH_SNAPSHOT_SUBVOL(&n->v, false); + bch2_trans_update(trans, &iter, &n->k_i, 0); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* List of snapshot IDs that are being deleted: */ +struct snapshot_id_list { + u32 nr; + u32 size; + u32 *d; +}; + +static bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id) +{ + unsigned i; + + for (i = 0; i < s->nr; i++) + if (id == s->d[i]) + return true; + return false; +} + +static int snapshot_id_add(struct snapshot_id_list *s, u32 id) +{ + BUG_ON(snapshot_list_has_id(s, id)); + + if (s->nr == s->size) { + size_t new_size = max(8U, s->size * 2); + void *n = krealloc(s->d, + new_size * sizeof(s->d[0]), + GFP_KERNEL); + if (!n) { + pr_err("error allocating snapshot ID list"); + return -ENOMEM; + } + + s->d = n; + s->size = new_size; + }; + + s->d[s->nr++] = id; + return 0; +} + +static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, + struct snapshot_id_list *deleted, + enum btree_id btree_id) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct snapshot_id_list equiv_seen = { 0 }; + struct bpos last_pos = POS_MIN; + int ret = 0; + + /* + * XXX: We should also delete whiteouts that no longer overwrite + * anything + */ + + bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + + while ((bch2_trans_begin(trans), + (k = bch2_btree_iter_peek(&iter)).k) && + !(ret = bkey_err(k))) { + u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv; + + if (bkey_cmp(k.k->p, last_pos)) + equiv_seen.nr = 0; + last_pos = k.k->p; + + if (snapshot_list_has_id(deleted, k.k->p.snapshot) || + snapshot_list_has_id(&equiv_seen, equiv)) { + if (btree_id == BTREE_ID_inodes && + bch2_btree_key_cache_flush(trans, btree_id, iter.pos)) + continue; + + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL, + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(trans, &iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + if (ret) + break; + } else { + ret = snapshot_id_add(&equiv_seen, equiv); + if (ret) + break; + } + + bch2_btree_iter_advance(&iter); + } + bch2_trans_iter_exit(trans, &iter); + + kfree(equiv_seen.d); + + return ret; +} + +static void bch2_delete_dead_snapshots_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_snapshot snap; + struct snapshot_id_list deleted = { 0 }; + u32 i, id, children[2]; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + /* + * For every snapshot node: If we have no live children and it's not + * pointed to by a subvolume, delete it: + */ + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_snapshot) + continue; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v) || + BCH_SNAPSHOT_SUBVOL(snap.v)) + continue; + + children[0] = le32_to_cpu(snap.v->children[0]); + children[1] = 
le32_to_cpu(snap.v->children[1]); + + ret = snapshot_live(&trans, children[0]) ?: + snapshot_live(&trans, children[1]); + if (ret < 0) + break; + if (ret) + continue; + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_snapshot_node_set_deleted(&trans, iter.pos.offset)); + if (ret) { + bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret); + break; + } + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { + bch_err(c, "error walking snapshots: %i", ret); + goto err; + } + + ret = bch2_snapshots_set_equiv(&trans); + if (ret) + goto err; + + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_snapshot) + continue; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v)) { + ret = snapshot_id_add(&deleted, k.k->p.offset); + if (ret) + break; + } + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { + bch_err(c, "error walking snapshots: %i", ret); + goto err; + } + + for (id = 0; id < BTREE_ID_NR; id++) { + if (!btree_type_has_snapshots(id)) + continue; + + ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id); + if (ret) { + bch_err(c, "error deleting snapshot keys: %i", ret); + goto err; + } + } + + for (i = 0; i < deleted.nr; i++) { + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_snapshot_node_delete(&trans, deleted.d[i])); + if (ret) { + bch_err(c, "error deleting snapshot %u: %i", + deleted.d[i], ret); + goto err; + } + } +err: + kfree(deleted.d); + bch2_trans_exit(&trans); + percpu_ref_put(&c->writes); +} + +static void bch2_delete_dead_snapshots(struct bch_fs *c) +{ + if (unlikely(!percpu_ref_tryget(&c->writes))) + return; + + if (!queue_work(system_long_wq, &c->snapshot_delete_work)) + percpu_ref_put(&c->writes); +} + +static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *h) +{ + bch2_delete_dead_snapshots(trans->c); + return 0; +} + +/* Subvolumes: */ + +const char *bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0) + return "invalid pos"; + + if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) + return "invalid pos"; + + if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) + return "bad val size"; + + return NULL; +} + +void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + + pr_buf(out, "root %llu snapshot id %u", + le64_to_cpu(s.v->inode), + le32_to_cpu(s.v->snapshot)); +} + +int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, + u32 *snapid) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, + POS(0, subvol), + BTREE_ITER_CACHED| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_subvolume) { + bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol); + ret = -EIO; + goto err; + } + + *snapid = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* XXX: mark snapshot id for deletion, walk btree and delete: */ +int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid, + int deleting_snapshot) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_subvolume subvol; + struct btree_trans_commit_hook *h; + struct bkey_i *delete; + u32 snapid; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, 
+ POS(0, subvolid), + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_subvolume) { + bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid); + ret = -EIO; + goto err; + } + + subvol = bkey_s_c_to_subvolume(k); + snapid = le32_to_cpu(subvol.v->snapshot); + + if (deleting_snapshot >= 0 && + deleting_snapshot != BCH_SUBVOLUME_SNAP(subvol.v)) { + ret = -ENOENT; + goto err; + } + + delete = bch2_trans_kmalloc(trans, sizeof(*delete)); + ret = PTR_ERR_OR_ZERO(delete); + if (ret) + goto err; + + bkey_init(&delete->k); + delete->k.p = iter.pos; + ret = bch2_trans_update(trans, &iter, delete, 0); + if (ret) + goto err; + + ret = bch2_snapshot_node_set_deleted(trans, snapid); + + h = bch2_trans_kmalloc(trans, sizeof(*h)); + ret = PTR_ERR_OR_ZERO(h); + if (ret) + goto err; + + h->fn = bch2_delete_dead_snapshots_hook; + bch2_trans_commit_hook(trans, h); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_subvolume_create(struct btree_trans *trans, u64 inode, + u32 src_subvolid, + u32 *new_subvolid, + u32 *new_snapshotid, + bool ro) +{ + struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL }; + struct bkey_i_subvolume *new_subvol = NULL; + struct bkey_i_subvolume *src_subvol = NULL; + struct bkey_s_c k; + u32 parent = 0, new_nodes[2], snapshot_subvols[2]; + int ret = 0; + + for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) + break; + if (bkey_deleted(k.k)) + goto found_slot; + } + + if (!ret) + ret = -ENOSPC; + goto err; +found_slot: + snapshot_subvols[0] = dst_iter.pos.offset; + snapshot_subvols[1] = src_subvolid; + + if (src_subvolid) { + /* Creating a snapshot: */ + src_subvol = bch2_trans_kmalloc(trans, sizeof(*src_subvol)); + ret = PTR_ERR_OR_ZERO(src_subvol); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes, + POS(0, src_subvolid), + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&src_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_subvolume) { + bch_err(trans->c, "subvolume %u not found", src_subvolid); + ret = -ENOENT; + goto err; + } + + bkey_reassemble(&src_subvol->k_i, k); + parent = le32_to_cpu(src_subvol->v.snapshot); + } + + ret = bch2_snapshot_node_create(trans, parent, new_nodes, + snapshot_subvols, + src_subvolid ? 
2 : 1); + if (ret) + goto err; + + if (src_subvolid) { + src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]); + bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); + } + + new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); + ret = PTR_ERR_OR_ZERO(new_subvol); + if (ret) + goto err; + + bkey_subvolume_init(&new_subvol->k_i); + new_subvol->v.flags = 0; + new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); + new_subvol->v.inode = cpu_to_le64(inode); + SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); + SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); + new_subvol->k.p = dst_iter.pos; + bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0); + + *new_subvolid = new_subvol->k.p.offset; + *new_snapshotid = new_nodes[0]; +err: + bch2_trans_iter_exit(trans, &src_iter); + bch2_trans_iter_exit(trans, &dst_iter); + return ret; +} + +int bch2_fs_subvolumes_init(struct bch_fs *c) +{ + INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); + return 0; +} diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h new file mode 100644 index 00000000..0740c7b7 --- /dev/null +++ b/libbcachefs/subvolume.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_H +#define _BCACHEFS_SUBVOLUME_H + +void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_snapshot (struct bkey_ops) { \ + .key_invalid = bch2_snapshot_invalid, \ + .val_to_text = bch2_snapshot_to_text, \ +} + +int bch2_mark_snapshot(struct bch_fs *, struct bkey_s_c, + struct bkey_s_c, u64, unsigned); + +static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) +{ + return genradix_ptr(&c->snapshots, U32_MAX - id); +} + +static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) +{ + return snapshot_t(c, id)->parent; +} + +static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id) +{ + struct snapshot_t *s = snapshot_t(c, id); + + return s->children[0] || s->children[1]; +} + +static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) +{ + struct snapshot_t *s; + u32 parent = bch2_snapshot_parent(c, id); + + if (!parent) + return 0; + + s = snapshot_t(c, bch2_snapshot_parent(c, id)); + if (id == s->children[0]) + return s->children[1]; + if (id == s->children[1]) + return s->children[0]; + return 0; +} + +static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) +{ + while (id && id < ancestor) + id = bch2_snapshot_parent(c, id); + + return id == ancestor; +} + +struct snapshots_seen { + struct bpos pos; + size_t nr; + size_t size; + u32 *d; +}; + +static inline void snapshots_seen_exit(struct snapshots_seen *s) +{ + kfree(s->d); + s->d = NULL; +} + +static inline void snapshots_seen_init(struct snapshots_seen *s) +{ + memset(s, 0, sizeof(*s)); +} + +static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) +{ + if (s->nr == s->size) { + size_t new_size = max(s->size, 128UL) * 2; + u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL); + + if (!d) { + bch_err(c, "error reallocating snapshots_seen table (new size %zu)", + new_size); + return -ENOMEM; + } + + s->size = new_size; + s->d = d; + } + + s->d[s->nr++] = id; + return 0; +} + +int bch2_fs_snapshots_check(struct bch_fs *); +void bch2_fs_snapshots_exit(struct bch_fs *); +int bch2_fs_snapshots_start(struct bch_fs *); + +const char *bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c); +void 
bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_subvolume (struct bkey_ops) { \ + .key_invalid = bch2_subvolume_invalid, \ + .val_to_text = bch2_subvolume_to_text, \ +} + +int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); + +int bch2_subvolume_delete(struct btree_trans *, u32, int); +int bch2_subvolume_create(struct btree_trans *, u64, u32, + u32 *, u32 *, bool); + +int bch2_fs_subvolumes_init(struct bch_fs *); + +#endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 8f847661..1feb7dee 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -39,6 +39,7 @@ #include "rebalance.h" #include "recovery.h" #include "replicas.h" +#include "subvolume.h" #include "super.h" #include "super-io.h" #include "sysfs.h" @@ -468,6 +469,7 @@ static void __bch2_fs_free(struct bch_fs *c) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); + bch2_fs_snapshots_exit(c); bch2_fs_quota_exit(c); bch2_fs_fsio_exit(c); bch2_fs_ec_exit(c); @@ -686,6 +688,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_init(&c->usage_scratch_lock); mutex_init(&c->bio_bounce_pages_lock); + mutex_init(&c->snapshot_table_lock); spin_lock_init(&c->btree_write_error_lock); @@ -789,6 +792,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_key_cache_init(&c->btree_key_cache) || bch2_fs_btree_iter_init(c) || bch2_fs_btree_interior_update_init(c) || + bch2_fs_subvolumes_init(c) || bch2_fs_io_init(c) || bch2_fs_encryption_init(c) || bch2_fs_compress_init(c) || diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index ef6ae97e..a182e242 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -128,7 +128,7 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info int ret; ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, - inode->v.i_ino, + inode_inum(inode), &X_SEARCH(type, name, strlen(name)), 0); if (ret) @@ -160,7 +160,7 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, bch2_xattr_get_trans(&trans, inode, name, buffer, size, type)); } -int bch2_xattr_set(struct btree_trans *trans, u64 inum, +int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, const struct bch_hash_info *hash_info, const char *name, const void *value, size_t size, int type, int flags) @@ -282,13 +282,21 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct btree_iter iter; struct bkey_s_c k; struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; - u64 inum = dentry->d_inode->i_ino; + u64 offset = 0, inum = inode->ei_inode.bi_inum; + u32 snapshot; int ret; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + iter = (struct btree_iter) { NULL }; + + ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); + if (ret) + goto err; for_each_btree_key(&trans, iter, BTREE_ID_xattrs, - POS(inum, 0), 0, k, ret) { + SPOS(inum, offset, snapshot), 0, k, ret) { BUG_ON(k.k->p.inode < inum); if (k.k->p.inode > inum) @@ -301,7 +309,12 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (ret) break; } + + offset = iter.pos.offset; bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; ret = bch2_trans_exit(&trans) ?: ret; @@ -340,7 +353,7 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_hash_info hash = 
bch2_hash_info_init(c, &inode->ei_inode); return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, - bch2_xattr_set(&trans, inode->v.i_ino, &hash, + bch2_xattr_set(&trans, inode_inum(inode), &hash, name, value, size, handler->flags, flags)); } diff --git a/libbcachefs/xattr.h b/libbcachefs/xattr.h index 4151065a..f4f89654 100644 --- a/libbcachefs/xattr.h +++ b/libbcachefs/xattr.h @@ -39,7 +39,8 @@ struct bch_inode_info; int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, const char *, void *, size_t, int); -int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *, +int bch2_xattr_set(struct btree_trans *, subvol_inum, + const struct bch_hash_info *, const char *, const void *, size_t, int, int); ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
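
Editor's note (not part of the patch): the recurring pattern this series introduces — resolve a subvol_inum's subvolume to its backing snapshot ID with bch2_subvolume_get_snapshot(), then position the iterator with SPOS() so lookups happen in that snapshot — can be summarized with the following sketch. It is illustrative only; the helper name walk_subvol_xattrs() is hypothetical, while bch2_subvolume_get_snapshot(), SPOS(), subvol_inum and for_each_btree_key() are the interfaces added or used in the hunks above.

/*
 * Illustrative sketch, not part of this patch: how a caller is expected to
 * combine the new subvol_inum plumbing with snapshot-aware iteration.
 * walk_subvol_xattrs() is a hypothetical name chosen for this example.
 */
static int walk_subvol_xattrs(struct btree_trans *trans, subvol_inum inum)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	u32 snapshot;
	int ret;

	/* Translate the subvolume ID into the snapshot ID it points at: */
	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		return ret;

	/* Walk this inode's keys at that snapshot: */
	for_each_btree_key(trans, iter, BTREE_ID_xattrs,
			   SPOS(inum.inum, 0, snapshot), 0, k, ret) {
		if (k.k->p.inode != inum.inum)
			break;
		/* ... process k ... */
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

The same resolve-then-iterate shape appears above in bch2_hash_lookup()/bch2_hash_set() in str_hash.h, in bch2_remap_range() in reflink.c, and in bch2_xattr_list() in xattr.c.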