Update bcachefs sources to 386f00b639 bcachefs: Snapshot creation, deletion

This commit is contained in:
Kent Overstreet 2021-09-26 18:19:46 -04:00
parent 9942fc82d4
commit e61b61c03b
47 changed files with 4025 additions and 835 deletions

View File

@ -1 +1 @@
bd6ed9fb42c0aa36d1f4a21eeab45fe12e1fb792
386f00b6399a1eb38053c236aae87678f3535df7

View File

@ -191,6 +191,7 @@ static void list_keys(struct bch_fs *c, enum btree_id btree_id,
bch2_trans_init(&trans, c, 0, 0);
for_each_btree_key(&trans, iter, btree_id, start,
BTREE_ITER_ALL_SNAPSHOTS|
BTREE_ITER_PREFETCH, k, ret) {
if (bkey_cmp(k.k->p, end) > 0)
break;

View File

@ -138,8 +138,9 @@ static void create_link(struct bch_fs *c,
struct bch_inode_unpacked inode;
int ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_link_trans(&trans, parent->bi_inum, inum,
&parent_u, &inode, &qstr));
bch2_link_trans(&trans,
(subvol_inum) { 1, parent->bi_inum }, &parent_u,
(subvol_inum) { 1, inum }, &inode, &qstr));
if (ret)
die("error creating hardlink: %s", strerror(-ret));
}
@ -155,9 +156,10 @@ static struct bch_inode_unpacked create_file(struct bch_fs *c,
int ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_create_trans(&trans,
parent->bi_inum, parent,
(subvol_inum) { 1, parent->bi_inum }, parent,
&new_inode, &qstr,
uid, gid, mode, rdev, NULL, NULL));
uid, gid, mode, rdev, NULL, NULL,
(subvol_inum) {}, 0));
if (ret)
die("error creating file: %s", strerror(-ret));
@ -225,7 +227,9 @@ static void copy_xattrs(struct bch_fs *c, struct bch_inode_unpacked *dst,
const struct xattr_handler *h = xattr_resolve_name(&attr);
int ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_xattr_set(&trans, dst->bi_inum, &hash_info, attr,
bch2_xattr_set(&trans,
(subvol_inum) { 1, dst->bi_inum },
&hash_info, attr,
val, val_size, h->flags, 0));
if (ret < 0)
die("error creating xattr: %s", strerror(-ret));
@ -569,7 +573,8 @@ static void copy_fs(struct bch_fs *c, int src_fd, const char *src_path,
syncfs(src_fd);
struct bch_inode_unpacked root_inode;
int ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, &root_inode);
int ret = bch2_inode_find_by_inum(c, (subvol_inum) { 1, BCACHEFS_ROOT_INO },
&root_inode);
if (ret)
die("error looking up root directory: %s", strerror(-ret));

View File

@ -229,7 +229,7 @@ retry:
bch2_trans_begin(&trans);
ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc,
&hash, inode->v.i_ino,
&hash, inode_inum(inode),
&X_SEARCH(acl_to_xattr_type(type), "", 0),
0);
if (ret) {
@ -259,11 +259,11 @@ out:
return acl;
}
int bch2_set_acl_trans(struct btree_trans *trans,
int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *inode_u,
const struct bch_hash_info *hash_info,
struct posix_acl *acl, int type)
{
struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u);
int ret;
if (type == ACL_TYPE_DEFAULT &&
@ -276,14 +276,14 @@ int bch2_set_acl_trans(struct btree_trans *trans,
if (IS_ERR(xattr))
return PTR_ERR(xattr);
ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
inode_u->bi_inum, &xattr->k_i, 0);
ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info,
inum, &xattr->k_i, 0);
} else {
struct xattr_search_key search =
X_SEARCH(acl_to_xattr_type(type), "", 0);
ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info,
inode_u->bi_inum, &search);
ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info,
inum, &search);
}
return ret == -ENOENT ? 0 : ret;
@ -297,7 +297,6 @@ int bch2_set_acl(struct user_namespace *mnt_userns,
struct btree_trans trans;
struct btree_iter inode_iter = { NULL };
struct bch_inode_unpacked inode_u;
struct bch_hash_info hash_info;
struct posix_acl *acl;
umode_t mode;
int ret;
@ -308,7 +307,7 @@ retry:
bch2_trans_begin(&trans);
acl = _acl;
ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode->v.i_ino,
ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
BTREE_ITER_INTENT);
if (ret)
goto btree_err;
@ -321,9 +320,7 @@ retry:
goto btree_err;
}
hash_info = bch2_hash_info_init(c, &inode_u);
ret = bch2_set_acl_trans(&trans, &inode_u, &hash_info, acl, type);
ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type);
if (ret)
goto btree_err;
@ -352,7 +349,7 @@ err:
return ret;
}
int bch2_acl_chmod(struct btree_trans *trans,
int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *inode,
umode_t mode,
struct posix_acl **new_acl)
@ -366,7 +363,7 @@ int bch2_acl_chmod(struct btree_trans *trans,
int ret;
ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
&hash_info, inode->bi_inum,
&hash_info, inum,
&X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
BTREE_ITER_INTENT);
if (ret)

View File

@ -28,25 +28,24 @@ typedef struct {
struct posix_acl *bch2_get_acl(struct inode *, int);
int bch2_set_acl_trans(struct btree_trans *,
int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
const struct bch_hash_info *,
struct posix_acl *, int);
int bch2_set_acl(struct user_namespace *, struct inode *, struct posix_acl *, int);
int bch2_acl_chmod(struct btree_trans *, struct bch_inode_unpacked *,
int bch2_acl_chmod(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
umode_t, struct posix_acl **);
#else
static inline int bch2_set_acl_trans(struct btree_trans *trans,
static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *inode_u,
const struct bch_hash_info *hash_info,
struct posix_acl *acl, int type)
{
return 0;
}
static inline int bch2_acl_chmod(struct btree_trans *trans,
static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *inode,
umode_t mode,
struct posix_acl **new_acl)

View File

@ -380,6 +380,8 @@ enum gc_phase {
GC_PHASE_BTREE_alloc,
GC_PHASE_BTREE_quotas,
GC_PHASE_BTREE_reflink,
GC_PHASE_BTREE_subvolumes,
GC_PHASE_BTREE_snapshots,
GC_PHASE_PENDING_DELETE,
};
@ -563,6 +565,21 @@ struct btree_path_buf {
#define REPLICAS_DELTA_LIST_MAX (1U << 16)
struct snapshot_t {
u32 parent;
u32 children[2];
u32 subvol; /* Nonzero only if a subvolume points to this node: */
u32 equiv;
};
typedef struct {
u32 subvol;
u64 inum;
} subvol_inum;
#define BCACHEFS_ROOT_SUBVOL_INUM \
((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
struct bch_fs {
struct closure cl;
@ -634,6 +651,12 @@ struct bch_fs {
struct closure sb_write;
struct mutex sb_lock;
/* snapshot.c: */
GENRADIX(struct snapshot_t) snapshots;
struct bch_snapshot_table __rcu *snapshot_table;
struct mutex snapshot_table_lock;
struct work_struct snapshot_delete_work;
/* BTREE CACHE */
struct bio_set btree_bio;
struct workqueue_struct *io_complete_wq;

View File

@ -323,7 +323,7 @@ static inline void bkey_init(struct bkey *k)
*/
#define BCH_BKEY_TYPES() \
x(deleted, 0) \
x(discard, 1) \
x(whiteout, 1) \
x(error, 2) \
x(cookie, 3) \
x(hash_whiteout, 4) \
@ -342,7 +342,9 @@ static inline void bkey_init(struct bkey *k)
x(inline_data, 17) \
x(btree_ptr_v2, 18) \
x(indirect_inline_data, 19) \
x(alloc_v2, 20)
x(alloc_v2, 20) \
x(subvolume, 21) \
x(snapshot, 22)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@ -355,7 +357,7 @@ struct bch_deleted {
struct bch_val v;
};
struct bch_discard {
struct bch_whiteout {
struct bch_val v;
};
@ -686,6 +688,10 @@ struct bch_inode_generation {
__le32 pad;
} __attribute__((packed, aligned(8)));
/*
* bi_subvol and bi_parent_subvol are only set for subvolume roots:
*/
#define BCH_INODE_FIELDS() \
x(bi_atime, 96) \
x(bi_ctime, 96) \
@ -709,7 +715,9 @@ struct bch_inode_generation {
x(bi_erasure_code, 16) \
x(bi_fields_set, 16) \
x(bi_dir, 64) \
x(bi_dir_offset, 64)
x(bi_dir_offset, 64) \
x(bi_subvol, 32) \
x(bi_parent_subvol, 32)
/* subset of BCH_INODE_FIELDS */
#define BCH_INODE_OPTS() \
@ -792,6 +800,9 @@ struct bch_dirent {
__u8 d_name[];
} __attribute__((packed, aligned(8)));
#define DT_SUBVOL 16
#define BCH_DT_MAX 17
#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \
sizeof(struct bkey) - \
offsetof(struct bch_dirent, d_name))
@ -928,6 +939,42 @@ struct bch_inline_data {
u8 data[0];
};
/* Subvolumes: */
#define SUBVOL_POS_MIN POS(0, 1)
#define SUBVOL_POS_MAX POS(0, S32_MAX)
#define BCACHEFS_ROOT_SUBVOL 1
struct bch_subvolume {
struct bch_val v;
__le32 flags;
__le32 snapshot;
__le64 inode;
};
LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
/*
* We need to know whether a subvolume is a snapshot so we can know whether we
* can delete it (or whether it should just be rm -rf'd)
*/
LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
/* Snapshots */
struct bch_snapshot {
struct bch_val v;
__le32 flags;
__le32 parent;
__le32 children[2];
__le32 subvol;
__le32 pad;
};
LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
/* True if a subvolume points to this snapshot node: */
LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@ -1695,7 +1742,9 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
x(alloc, 4) \
x(quotas, 5) \
x(stripes, 6) \
x(reflink, 7)
x(reflink, 7) \
x(subvolumes, 8) \
x(snapshots, 9)
enum btree_id {
#define x(kwd, val) BTREE_ID_##kwd = val,

View File

@ -78,6 +78,9 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal)
#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)
#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume)
/* ioctl below act on a particular file, not the filesystem as a whole: */
#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *)
@ -349,4 +352,16 @@ struct bch_ioctl_disk_resize_journal {
__u64 nbuckets;
};
struct bch_ioctl_subvolume {
__u32 flags;
__u32 dirfd;
__u16 mode;
__u16 pad[3];
__u64 dst_ptr;
__u64 src_ptr;
};
#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0)
#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1)
#endif /* _BCACHEFS_IOCTL_H */

View File

@ -55,7 +55,7 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
#define bkey_whiteout(_k) \
((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard)
((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
enum bkey_lr_packed {
BKEY_PACKED_BOTH,

View File

@ -11,6 +11,7 @@
#include "inode.h"
#include "quota.h"
#include "reflink.h"
#include "subvolume.h"
#include "xattr.h"
const char * const bch2_bkey_types[] = {
@ -30,7 +31,7 @@ static const char *deleted_key_invalid(const struct bch_fs *c,
.key_invalid = deleted_key_invalid, \
}
#define bch2_bkey_ops_discard (struct bkey_ops) { \
#define bch2_bkey_ops_whiteout (struct bkey_ops) { \
.key_invalid = deleted_key_invalid, \
}
@ -100,6 +101,8 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k)
static unsigned bch2_key_types_allowed[] = {
[BKEY_TYPE_extents] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_whiteout)|
(1U << KEY_TYPE_error)|
(1U << KEY_TYPE_cookie)|
(1U << KEY_TYPE_extent)|
@ -107,26 +110,43 @@ static unsigned bch2_key_types_allowed[] = {
(1U << KEY_TYPE_reflink_p)|
(1U << KEY_TYPE_inline_data),
[BKEY_TYPE_inodes] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_whiteout)|
(1U << KEY_TYPE_inode)|
(1U << KEY_TYPE_inode_generation),
[BKEY_TYPE_dirents] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_whiteout)|
(1U << KEY_TYPE_hash_whiteout)|
(1U << KEY_TYPE_dirent),
[BKEY_TYPE_xattrs] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_whiteout)|
(1U << KEY_TYPE_cookie)|
(1U << KEY_TYPE_hash_whiteout)|
(1U << KEY_TYPE_xattr),
[BKEY_TYPE_alloc] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_alloc)|
(1U << KEY_TYPE_alloc_v2),
[BKEY_TYPE_quotas] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_quota),
[BKEY_TYPE_stripes] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_stripe),
[BKEY_TYPE_reflink] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_reflink_v)|
(1U << KEY_TYPE_indirect_inline_data),
[BKEY_TYPE_subvolumes] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_subvolume),
[BKEY_TYPE_snapshots] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_snapshot),
[BKEY_TYPE_btree] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_btree_ptr)|
(1U << KEY_TYPE_btree_ptr_v2),
};
@ -134,21 +154,18 @@ static unsigned bch2_key_types_allowed[] = {
const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
enum btree_node_type type)
{
unsigned key_types_allowed = (1U << KEY_TYPE_deleted)|
bch2_key_types_allowed[type] ;
if (k.k->u64s < BKEY_U64s)
return "u64s too small";
if (!(key_types_allowed & (1U << k.k->type)))
if (!(bch2_key_types_allowed[type] & (1U << k.k->type)))
return "invalid key type for this btree";
if (type == BKEY_TYPE_btree &&
bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
return "value too big";
if (btree_node_type_is_extents(type)) {
if ((k.k->size == 0) != bkey_deleted(k.k))
if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
if (k.k->size == 0)
return "bad size field";
if (k.k->size > k.k->p.offset)
@ -165,7 +182,7 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
if (type != BKEY_TYPE_btree &&
btree_type_has_snapshots(type) &&
k.k->p.snapshot != U32_MAX)
!k.k->p.snapshot)
return "invalid snapshot field";
if (type != BKEY_TYPE_btree &&

View File

@ -13,6 +13,7 @@
#include "extents.h"
#include "journal.h"
#include "replicas.h"
#include "subvolume.h"
#include <linux/prefetch.h>
#include <trace/events/bcachefs.h>
@ -152,7 +153,7 @@ bool __bch2_btree_node_relock(struct btree_trans *trans,
if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
(btree_node_lock_seq_matches(path, b, level) &&
btree_node_lock_increment(trans, b, level, want))) {
mark_btree_node_locked(trans, path, level, want);
mark_btree_node_locked(path, level, want);
return true;
} else {
return false;
@ -188,7 +189,7 @@ static bool bch2_btree_node_upgrade(struct btree_trans *trans,
return false;
success:
mark_btree_node_intent_locked(trans, path, level);
mark_btree_node_intent_locked(path, level);
return true;
}
@ -674,6 +675,9 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
{
BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
!iter->pos.snapshot);
BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
iter->pos.snapshot != iter->snapshot);
@ -681,6 +685,55 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
bkey_cmp(iter->pos, iter->k.p) > 0);
}
static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
{
struct btree_trans *trans = iter->trans;
struct btree_iter copy;
struct bkey_s_c prev;
int ret = 0;
if (!bch2_debug_check_iterators)
return 0;
if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
return 0;
if (bkey_err(k) || !k.k)
return 0;
BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
iter->snapshot,
k.k->p.snapshot));
bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
BTREE_ITER_ALL_SNAPSHOTS);
prev = bch2_btree_iter_prev(&copy);
if (!prev.k)
goto out;
ret = bkey_err(prev);
if (ret)
goto out;
if (!bkey_cmp(prev.k->p, k.k->p) &&
bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
prev.k->p.snapshot) > 0) {
char buf1[100], buf2[200];
bch2_bkey_to_text(&PBUF(buf1), k.k);
bch2_bkey_to_text(&PBUF(buf2), prev.k);
panic("iter snap %u\n"
"k %s\n"
"prev %s\n",
iter->snapshot,
buf1, buf2);
}
out:
bch2_trans_iter_exit(trans, &copy);
return ret;
}
#else
static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
@ -689,6 +742,7 @@ static inline void bch2_btree_path_verify(struct btree_trans *trans,
struct btree_path *path) {}
static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
#endif
@ -896,12 +950,12 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
bch2_btree_node_iter_peek_all(&l->iter, l->b));
}
static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
static inline struct bkey_s_c btree_path_level_peek(struct bch_fs *c,
struct btree_path *path,
struct btree_path_level *l,
struct bkey *u)
{
struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
struct bkey_s_c k = __btree_iter_unpack(c, l, u,
bch2_btree_node_iter_peek(&l->iter, l->b));
path->pos = k.k ? k.k->p : l->b->key.k.p;
@ -1041,7 +1095,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
t != BTREE_NODE_UNLOCKED) {
btree_node_unlock(path, b->c.level);
six_lock_increment(&b->c.lock, t);
mark_btree_node_locked(trans, path, b->c.level, t);
mark_btree_node_locked(path, b->c.level, t);
}
btree_path_level_init(trans, path, b);
@ -1118,7 +1172,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
path->l[i].b = NULL;
mark_btree_node_locked(trans, path, path->level, lock_type);
mark_btree_node_locked(path, path->level, lock_type);
btree_path_level_init(trans, path, b);
return 0;
}
@ -1210,7 +1264,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
if (unlikely(ret))
goto err;
mark_btree_node_locked(trans, path, level, lock_type);
mark_btree_node_locked(path, level, lock_type);
btree_path_level_init(trans, path, b);
if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
@ -1252,10 +1306,6 @@ retry_all:
btree_trans_verify_sorted(trans);
#ifdef CONFIG_BCACHEFS_DEBUG
trans->traverse_all_idx = U8_MAX;
#endif
for (i = trans->nr_sorted - 2; i >= 0; --i) {
struct btree_path *path1 = trans->paths + trans->sorted[i];
struct btree_path *path2 = trans->paths + trans->sorted[i + 1];
@ -1294,9 +1344,6 @@ retry_all:
path = trans->paths + trans->sorted[i];
EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
#ifdef CONFIG_BCACHEFS_DEBUG
trans->traverse_all_idx = path->idx;
#endif
ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
if (ret)
@ -1985,11 +2032,25 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
}
if (likely(k.k)) {
if (likely(!bkey_deleted(k.k)))
break;
/*
* We can never have a key in a leaf node at POS_MAX, so
* we don't have to check these successor() calls:
*/
if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
!bch2_snapshot_is_ancestor(trans->c,
iter->snapshot,
k.k->p.snapshot)) {
search_key = bpos_successor(k.k->p);
continue;
}
/* Advance to next key: */
search_key = bkey_successor(iter, k.k->p);
if (bkey_whiteout(k.k) &&
!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
search_key = bkey_successor(iter, k.k->p);
continue;
}
break;
} else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) {
/* Advance to next leaf node: */
search_key = bpos_successor(iter->path->l[0].b->key.k.p);
@ -2010,6 +2071,9 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
iter->pos = bkey_start_pos(k.k);
if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
iter->pos.snapshot = iter->snapshot;
cmp = bpos_cmp(k.k->p, iter->path->pos);
if (cmp) {
iter->path = bch2_btree_path_make_mut(trans, iter->path,
@ -2022,6 +2086,10 @@ out:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
ret = bch2_btree_iter_verify_ret(iter, k);
if (unlikely(ret))
return bkey_s_c_err(ret);
return k;
}
@ -2045,7 +2113,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
struct bpos search_key = iter->pos;
struct btree_path *saved_path = NULL;
struct bkey_s_c k;
struct bkey saved_k;
const struct bch_val *saved_v;
int ret;
EBUG_ON(iter->path->cached || iter->path->level);
@ -2053,6 +2124,9 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
search_key.snapshot = U32_MAX;
while (1) {
iter->path = btree_path_set_pos(trans, iter->path, search_key,
iter->flags & BTREE_ITER_INTENT);
@ -2065,18 +2139,61 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
goto out;
}
k = btree_path_level_peek(trans, iter->path,
k = btree_path_level_peek(trans->c, iter->path,
&iter->path->l[0], &iter->k);
if (!k.k ||
((iter->flags & BTREE_ITER_IS_EXTENTS)
? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0
: bkey_cmp(k.k->p, iter->pos) > 0))
? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0
: bpos_cmp(k.k->p, search_key) > 0))
k = btree_path_level_prev(trans->c, iter->path,
&iter->path->l[0], &iter->k);
btree_path_check_sort(trans, iter->path, 0);
if (likely(k.k)) {
if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
if (k.k->p.snapshot == iter->snapshot)
goto got_key;
/*
* If we have a saved candidate, and we're no
* longer at the same _key_ (not pos), return
* that candidate
*/
if (saved_path && bkey_cmp(k.k->p, saved_k.p)) {
bch2_path_put(trans, iter->path,
iter->flags & BTREE_ITER_INTENT);
iter->path = saved_path;
saved_path = NULL;
iter->k = saved_k;
k.v = saved_v;
goto got_key;
}
if (bch2_snapshot_is_ancestor(iter->trans->c,
iter->snapshot,
k.k->p.snapshot)) {
if (saved_path)
bch2_path_put(trans, saved_path,
iter->flags & BTREE_ITER_INTENT);
saved_path = btree_path_clone(trans, iter->path,
iter->flags & BTREE_ITER_INTENT);
saved_k = *k.k;
saved_v = k.v;
}
search_key = bpos_predecessor(k.k->p);
continue;
}
got_key:
if (bkey_whiteout(k.k) &&
!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
search_key = bkey_predecessor(iter, k.k->p);
if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
search_key.snapshot = U32_MAX;
continue;
}
break;
} else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) {
/* Advance to previous leaf node: */
@ -2094,7 +2211,12 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
/* Extents can straddle iter->pos: */
if (bkey_cmp(k.k->p, iter->pos) < 0)
iter->pos = k.k->p;
if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
iter->pos.snapshot = iter->snapshot;
out:
if (saved_path)
bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
iter->path->should_be_locked = true;
bch2_btree_iter_verify_entry_exit(iter);
@ -2143,7 +2265,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
if (unlikely(ret))
return bkey_s_c_err(ret);
if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) {
if ((iter->flags & BTREE_ITER_CACHED) ||
!(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
struct bkey_i *next_update;
next_update = iter->flags & BTREE_ITER_WITH_UPDATES
@ -2202,6 +2325,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
ret = bch2_btree_iter_verify_ret(iter, k);
if (unlikely(ret))
return bkey_s_c_err(ret);
return k;
}
@ -2352,13 +2478,13 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
btree_node_type_is_extents(btree_id))
flags |= BTREE_ITER_IS_EXTENTS;
if (!btree_type_has_snapshots(btree_id) &&
!(flags & __BTREE_ITER_ALL_SNAPSHOTS))
if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
!btree_type_has_snapshots(btree_id))
flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
if (!(flags & BTREE_ITER_ALL_SNAPSHOTS))
pos.snapshot = btree_type_has_snapshots(btree_id)
? U32_MAX : 0;
if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
btree_type_has_snapshots(btree_id))
flags |= BTREE_ITER_FILTER_SNAPSHOTS;
iter->trans = trans;
iter->path = NULL;

View File

@ -234,6 +234,15 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it
iter->pos = bkey_start_pos(&iter->k);
}
static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot)
{
struct bpos pos = iter->pos;
iter->snapshot = snapshot;
pos.snapshot = snapshot;
bch2_btree_iter_set_pos(iter, pos);
}
/*
* Unlocks before scheduling
* Note: does not revalidate iterator

View File

@ -163,6 +163,11 @@ btree_key_cache_create(struct btree_key_cache *c,
was_new = false;
}
if (btree_id == BTREE_ID_subvolumes)
six_lock_pcpu_alloc(&ck->c.lock);
else
six_lock_pcpu_free(&ck->c.lock);
ck->c.level = 0;
ck->c.btree_id = btree_id;
ck->key.btree_id = btree_id;
@ -296,7 +301,7 @@ retry:
if (!ck)
goto retry;
mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
mark_btree_node_locked(path, 0, SIX_LOCK_intent);
path->locks_want = 1;
} else {
enum six_lock_type lock_want = __btree_lock_want(path, 0);
@ -318,7 +323,7 @@ retry:
goto retry;
}
mark_btree_node_locked(trans, path, 0, lock_want);
mark_btree_node_locked(path, 0, lock_want);
}
path->l[0].lock_seq = ck->c.lock.state.seq;
@ -366,7 +371,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
BTREE_ITER_SLOTS|
BTREE_ITER_INTENT);
BTREE_ITER_INTENT|
BTREE_ITER_ALL_SNAPSHOTS);
bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
BTREE_ITER_CACHED|
BTREE_ITER_CACHED_NOFILL|

View File

@ -58,8 +58,7 @@ static inline void mark_btree_node_unlocked(struct btree_path *path,
path->nodes_intent_locked &= ~(1 << level);
}
static inline void mark_btree_node_locked(struct btree_trans *trans,
struct btree_path *path,
static inline void mark_btree_node_locked(struct btree_path *path,
unsigned level,
enum six_lock_type type)
{
@ -69,19 +68,12 @@ static inline void mark_btree_node_locked(struct btree_trans *trans,
path->nodes_locked |= 1 << level;
path->nodes_intent_locked |= type << level;
#ifdef CONFIG_BCACHEFS_DEBUG
path->ip_locked = _RET_IP_;
BUG_ON(trans->in_traverse_all &&
trans->traverse_all_idx != U8_MAX &&
path->sorted_idx > trans->paths[trans->traverse_all_idx].sorted_idx);
#endif
}
static inline void mark_btree_node_intent_locked(struct btree_trans *trans,
struct btree_path *path,
static inline void mark_btree_node_intent_locked(struct btree_path *path,
unsigned level)
{
mark_btree_node_locked(trans, path, level, SIX_LOCK_intent);
mark_btree_node_locked(path, level, SIX_LOCK_intent);
}
static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
@ -120,9 +112,6 @@ static inline void __bch2_btree_path_unlock(struct btree_path *path)
while (path->nodes_locked)
btree_node_unlock(path, __ffs(path->nodes_locked));
#ifdef CONFIG_BCACHEFS_DEBUG
path->ip_locked = 0;
#endif
}
static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)

View File

@ -209,6 +209,7 @@ struct btree_node_iter {
#define BTREE_ITER_WITH_UPDATES (1 << 10)
#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11)
#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12)
#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 13)
enum btree_path_uptodate {
BTREE_ITER_UPTODATE = 0,
@ -255,7 +256,6 @@ struct btree_path {
} l[BTREE_MAX_DEPTH];
#ifdef CONFIG_BCACHEFS_DEBUG
unsigned long ip_allocated;
unsigned long ip_locked;
#endif
};
@ -369,7 +369,6 @@ struct btree_trans {
struct bpos locking_pos;
u8 locking_btree_id;
u8 locking_level;
u8 traverse_all_idx;
pid_t pid;
#endif
unsigned long ip;
@ -607,7 +606,8 @@ static inline bool btree_node_is_extents(struct btree *b)
#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \
((1U << BKEY_TYPE_alloc)| \
(1U << BKEY_TYPE_stripes))
(1U << BKEY_TYPE_stripes)| \
(1U << BKEY_TYPE_snapshots))
#define BTREE_NODE_TYPE_HAS_TRIGGERS \
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
@ -654,7 +654,8 @@ enum btree_update_flags {
#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \
((1U << KEY_TYPE_stripe)| \
(1U << KEY_TYPE_inode))
(1U << KEY_TYPE_inode)| \
(1U << KEY_TYPE_snapshot))
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
@ -671,11 +672,6 @@ struct btree_root {
s8 error;
};
/*
* Optional hook that will be called just prior to a btree node update, when
* we're holding the write lock and we know what key is about to be overwritten:
*/
enum btree_insert_ret {
BTREE_INSERT_OK,
/* leaf node needs to be split */
@ -696,8 +692,4 @@ enum btree_node_sibling {
btree_next_sib,
};
typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
struct btree *,
struct btree_node_iter *);
#endif /* _BCACHEFS_BTREE_TYPES_H */

View File

@ -61,7 +61,7 @@ int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
struct disk_reservation *, u64 *, int flags);
int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
struct bpos, struct bpos, u64 *);
struct bpos, struct bpos, unsigned, u64 *);
int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
struct bpos, struct bpos, u64 *);

View File

@ -15,6 +15,7 @@
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "subvolume.h"
#include "replicas.h"
#include <linux/prefetch.h>
@ -245,6 +246,11 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
BUG_ON(i->cached != i->path->cached);
BUG_ON(i->level != i->path->level);
BUG_ON(i->btree_id != i->path->btree_id);
EBUG_ON(!i->level &&
!(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
i->k->k.p.snapshot &&
bch2_snapshot_internal_node(trans->c, i->k->k.p.snapshot));
}
static noinline int
@ -934,6 +940,43 @@ err:
goto retry;
}
static int check_pos_snapshot_overwritten(struct btree_trans *trans,
enum btree_id id,
struct bpos pos)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
int ret;
if (!snapshot_t(c, pos.snapshot)->children[0])
return 0;
bch2_trans_iter_init(trans, &iter, id, pos,
BTREE_ITER_NOT_EXTENTS|
BTREE_ITER_ALL_SNAPSHOTS);
while (1) {
k = bch2_btree_iter_prev(&iter);
ret = bkey_err(k);
if (ret)
break;
if (!k.k)
break;
if (bkey_cmp(pos, k.k->p))
break;
if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
ret = 1;
break;
}
}
bch2_trans_iter_exit(trans, &iter);
return ret;
}
static int bch2_trans_update_extent(struct btree_trans *trans,
struct btree_iter *orig_iter,
struct bkey_i *insert,
@ -958,6 +1001,28 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
goto out;
if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
/*
* We can't merge extents if they belong to interior snapshot
* tree nodes, and there's a snapshot in which one extent is
* visible and the other is not - i.e. if visibility is
* different.
*
* Instead of checking if visibilitiy of the two extents is
* different, for now we just check if either has been
* overwritten:
*/
ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p);
if (ret < 0)
goto err;
if (ret)
goto nomerge1;
ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p);
if (ret < 0)
goto err;
if (ret)
goto nomerge1;
update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
if ((ret = PTR_ERR_OR_ZERO(update)))
goto err;
@ -973,22 +1038,26 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
goto next;
}
}
if (!bkey_cmp(k.k->p, bkey_start_pos(&insert->k)))
nomerge1:
ret = 0;
if (!bkey_cmp(k.k->p, start))
goto next;
while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) {
bool front_split = bkey_cmp(bkey_start_pos(k.k), start) < 0;
bool back_split = bkey_cmp(k.k->p, insert->k.p) > 0;
/*
* If we're going to be splitting a compressed extent, note it
* so that __bch2_trans_commit() can increase our disk
* reservation:
*/
if (bkey_cmp(bkey_start_pos(k.k), start) < 0 &&
bkey_cmp(k.k->p, insert->k.p) > 0 &&
if (((front_split && back_split) ||
((front_split || back_split) && k.k->p.snapshot != insert->k.p.snapshot)) &&
(compressed_sectors = bch2_bkey_sectors_compressed(k)))
trans->extra_journal_res += compressed_sectors;
if (bkey_cmp(bkey_start_pos(k.k), start) < 0) {
if (front_split) {
update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
if ((ret = PTR_ERR_OR_ZERO(update)))
goto err;
@ -999,6 +1068,32 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
BTREE_ITER_NOT_EXTENTS|
BTREE_ITER_ALL_SNAPSHOTS|
BTREE_ITER_INTENT);
ret = bch2_btree_iter_traverse(&update_iter) ?:
bch2_trans_update(trans, &update_iter, update,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
flags);
bch2_trans_iter_exit(trans, &update_iter);
if (ret)
goto err;
}
if (k.k->p.snapshot != insert->k.p.snapshot &&
(front_split || back_split)) {
update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
if ((ret = PTR_ERR_OR_ZERO(update)))
goto err;
bkey_reassemble(update, k);
bch2_cut_front(start, update);
bch2_cut_back(insert->k.p, update);
bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
BTREE_ITER_NOT_EXTENTS|
BTREE_ITER_ALL_SNAPSHOTS|
BTREE_ITER_INTENT);
ret = bch2_btree_iter_traverse(&update_iter) ?:
bch2_trans_update(trans, &update_iter, update,
@ -1010,12 +1105,32 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
}
if (bkey_cmp(k.k->p, insert->k.p) <= 0) {
ret = bch2_btree_delete_at(trans, &iter, flags);
update = bch2_trans_kmalloc(trans, sizeof(*update));
if ((ret = PTR_ERR_OR_ZERO(update)))
goto err;
bkey_init(&update->k);
update->k.p = k.k->p;
if (insert->k.p.snapshot != k.k->p.snapshot) {
update->k.p.snapshot = insert->k.p.snapshot;
update->k.type = KEY_TYPE_whiteout;
}
bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
BTREE_ITER_NOT_EXTENTS|
BTREE_ITER_INTENT);
ret = bch2_btree_iter_traverse(&update_iter) ?:
bch2_trans_update(trans, &update_iter, update,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
flags);
bch2_trans_iter_exit(trans, &update_iter);
if (ret)
goto err;
}
if (bkey_cmp(k.k->p, insert->k.p) > 0) {
if (back_split) {
update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
if ((ret = PTR_ERR_OR_ZERO(update)))
goto err;
@ -1023,10 +1138,15 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
bkey_reassemble(update, k);
bch2_cut_front(insert->k.p, update);
ret = bch2_trans_update(trans, &iter, update, flags);
bch2_trans_copy_iter(&update_iter, &iter);
update_iter.pos = update->k.p;
ret = bch2_trans_update(trans, &update_iter, update,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
flags);
bch2_trans_iter_exit(trans, &update_iter);
if (ret)
goto err;
goto out;
}
next:
@ -1037,7 +1157,23 @@ next:
goto out;
}
bch2_bkey_merge(c, bkey_i_to_s(insert), k);
if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p);
if (ret < 0)
goto out;
if (ret)
goto nomerge2;
ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p);
if (ret < 0)
goto out;
if (ret)
goto nomerge2;
bch2_bkey_merge(c, bkey_i_to_s(insert), k);
}
nomerge2:
ret = 0;
out:
if (!bkey_deleted(&insert->k)) {
/*
@ -1057,6 +1193,39 @@ err:
return ret;
}
/*
* When deleting, check if we need to emit a whiteout (because we're overwriting
* something in an ancestor snapshot)
*/
static int need_whiteout_for_snapshot(struct btree_trans *trans,
enum btree_id btree_id, struct bpos pos)
{
struct btree_iter iter;
struct bkey_s_c k;
u32 snapshot = pos.snapshot;
int ret;
if (!bch2_snapshot_parent(trans->c, pos.snapshot))
return 0;
pos.snapshot++;
for_each_btree_key(trans, iter, btree_id, pos,
BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
if (bkey_cmp(k.k->p, pos))
break;
if (bch2_snapshot_is_ancestor(trans->c, snapshot,
k.k->p.snapshot)) {
ret = !bkey_whiteout(k.k);
break;
}
}
bch2_trans_iter_exit(trans, &iter);
return ret;
}
int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_i *k, enum btree_update_flags flags)
{
@ -1089,6 +1258,16 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
btree_insert_entry_cmp(i - 1, i) >= 0);
#endif
if (bkey_deleted(&n.k->k) &&
(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p);
if (unlikely(ret < 0))
return ret;
if (ret)
n.k->k.type = KEY_TYPE_whiteout;
}
/*
* Pending updates are kept sorted: first, find position of new update,
* then delete/trim any updates the new update overwrites:
@ -1175,13 +1354,14 @@ int bch2_btree_delete_at(struct btree_trans *trans,
int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
struct bpos start, struct bpos end,
unsigned iter_flags,
u64 *journal_seq)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags);
retry:
while ((bch2_trans_begin(trans),
(k = bch2_btree_iter_peek(&iter)).k) &&
@ -1248,5 +1428,5 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
u64 *journal_seq)
{
return bch2_trans_do(c, NULL, journal_seq, 0,
bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq));
bch2_btree_delete_range_trans(&trans, id, start, end, 0, journal_seq));
}

View File

@ -16,6 +16,7 @@
#include "movinggc.h"
#include "reflink.h"
#include "replicas.h"
#include "subvolume.h"
#include <linux/preempt.h>
#include <trace/events/bcachefs.h>
@ -1200,6 +1201,8 @@ static int bch2_mark_key_locked(struct bch_fs *c,
return bch2_mark_reservation(c, old, new, journal_seq, flags);
case KEY_TYPE_reflink_p:
return bch2_mark_reflink_p(c, old, new, journal_seq, flags);
case KEY_TYPE_snapshot:
return bch2_mark_snapshot(c, old, new, journal_seq, flags);
default:
return 0;
}

View File

@ -8,6 +8,7 @@
#include "fs.h"
#include "keylist.h"
#include "str_hash.h"
#include "subvolume.h"
#include <linux/dcache.h>
@ -99,7 +100,8 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
if (memchr(d.v->d_name, '/', len))
return "invalid name";
if (le64_to_cpu(d.v->d_inum) == d.k->p.inode)
if (d.v->d_type != DT_SUBVOL &&
le64_to_cpu(d.v->d_inum) == d.k->p.inode)
return "dirent points to own directory";
return NULL;
@ -113,7 +115,7 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
bch_scnmemcpy(out, d.v->d_name,
bch2_dirent_name_bytes(d));
pr_buf(out, " -> %llu type %s", d.v->d_inum,
d.v->d_type < DT_MAX
d.v->d_type < BCH_DT_MAX
? bch2_d_types[d.v->d_type]
: "(bad d_type)");
}
@ -149,8 +151,8 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
return dirent;
}
int bch2_dirent_create(struct btree_trans *trans,
u64 dir_inum, const struct bch_hash_info *hash_info,
int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
u64 *dir_offset, int flags)
{
@ -163,7 +165,7 @@ int bch2_dirent_create(struct btree_trans *trans,
return ret;
ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
dir_inum, &dirent->k_i, flags);
dir, &dirent->k_i, flags);
*dir_offset = dirent->k.p.offset;
return ret;
@ -176,22 +178,86 @@ static void dirent_copy_target(struct bkey_i_dirent *dst,
dst->v.d_type = src.v->d_type;
}
int __bch2_dirent_read_target(struct btree_trans *trans,
struct bkey_s_c_dirent d,
u32 *subvol, u32 *snapshot, u64 *inum,
bool is_fsck)
{
int ret = 0;
*subvol = 0;
*snapshot = d.k->p.snapshot;
if (likely(d.v->d_type != DT_SUBVOL)) {
*inum = le64_to_cpu(d.v->d_inum);
} else {
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_s_c_subvolume s;
int ret;
*subvol = le64_to_cpu(d.v->d_inum);
bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
POS(0, *subvol),
BTREE_ITER_CACHED);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
if (k.k->type != KEY_TYPE_subvolume) {
ret = -ENOENT;
goto err;
}
s = bkey_s_c_to_subvolume(k);
*snapshot = le32_to_cpu(s.v->snapshot);
*inum = le64_to_cpu(s.v->inode);
err:
if (ret == -ENOENT && !is_fsck)
bch2_fs_inconsistent(trans->c, "pointer to missing subvolume %u",
*subvol);
bch2_trans_iter_exit(trans, &iter);
}
return ret;
}
static int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
struct bkey_s_c_dirent d, subvol_inum *target)
{
u32 snapshot;
int ret = 0;
ret = __bch2_dirent_read_target(trans, d, &target->subvol, &snapshot,
&target->inum, false);
if (!target->subvol)
target->subvol = dir.subvol;
return ret;
}
int bch2_dirent_rename(struct btree_trans *trans,
u64 src_dir, struct bch_hash_info *src_hash,
u64 dst_dir, struct bch_hash_info *dst_hash,
const struct qstr *src_name, u64 *src_inum, u64 *src_offset,
const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset,
enum bch_rename_mode mode)
subvol_inum src_dir, struct bch_hash_info *src_hash,
subvol_inum dst_dir, struct bch_hash_info *dst_hash,
const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
enum bch_rename_mode mode)
{
struct btree_iter src_iter = { NULL };
struct btree_iter dst_iter = { NULL };
struct bkey_s_c old_src, old_dst;
struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
struct bpos dst_pos =
POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name));
POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
int ret = 0;
*src_inum = *dst_inum = 0;
if (src_dir.subvol != dst_dir.subvol)
return -EXDEV;
memset(src_inum, 0, sizeof(*src_inum));
memset(dst_inum, 0, sizeof(*dst_inum));
/*
* Lookup dst:
@ -214,8 +280,12 @@ int bch2_dirent_rename(struct btree_trans *trans,
if (ret)
goto out;
if (mode != BCH_RENAME)
*dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum);
if (mode != BCH_RENAME) {
ret = bch2_dirent_read_target(trans, dst_dir,
bkey_s_c_to_dirent(old_dst), dst_inum);
if (ret)
goto out;
}
if (mode != BCH_RENAME_EXCHANGE)
*src_offset = dst_iter.pos.offset;
@ -231,7 +301,10 @@ int bch2_dirent_rename(struct btree_trans *trans,
if (ret)
goto out;
*src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum);
ret = bch2_dirent_read_target(trans, src_dir,
bkey_s_c_to_dirent(old_src), src_inum);
if (ret)
goto out;
/* Create new dst key: */
new_dst = dirent_create_key(trans, 0, dst_name, 0);
@ -310,63 +383,79 @@ out:
return ret;
}
int bch2_dirent_delete_at(struct btree_trans *trans,
const struct bch_hash_info *hash_info,
struct btree_iter *iter)
{
return bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
hash_info, iter);
}
int __bch2_dirent_lookup_trans(struct btree_trans *trans,
struct btree_iter *iter,
u64 dir_inum,
subvol_inum dir,
const struct bch_hash_info *hash_info,
const struct qstr *name, unsigned flags)
const struct qstr *name, subvol_inum *inum,
unsigned flags)
{
return bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
hash_info, dir_inum, name, flags);
struct bkey_s_c k;
struct bkey_s_c_dirent d;
u32 snapshot;
int ret;
ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
if (ret)
return ret;
ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
hash_info, dir, name, flags);
if (ret)
return ret;
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret) {
bch2_trans_iter_exit(trans, iter);
return ret;
}
d = bkey_s_c_to_dirent(k);
ret = bch2_dirent_read_target(trans, dir, d, inum);
if (ret)
bch2_trans_iter_exit(trans, iter);
return ret;
}
u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
const struct bch_hash_info *hash_info,
const struct qstr *name)
const struct qstr *name, subvol_inum *inum)
{
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
u64 inum = 0;
int ret = 0;
int ret;
bch2_trans_init(&trans, c, 0, 0);
retry:
bch2_trans_begin(&trans);
ret = __bch2_dirent_lookup_trans(&trans, &iter, dir_inum,
hash_info, name, 0);
if (ret)
goto out;
ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info,
name, inum, 0);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto out;
inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
bch2_trans_iter_exit(&trans, &iter);
out:
BUG_ON(ret == -EINTR);
if (ret == -EINTR)
goto retry;
bch2_trans_exit(&trans);
return inum;
return ret;
}
int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
{
struct btree_iter iter;
struct bkey_s_c k;
u32 snapshot;
int ret;
ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
if (ret)
return ret;
for_each_btree_key(trans, iter, BTREE_ID_dirents,
POS(dir_inum, 0), 0, k, ret) {
if (k.k->p.inode > dir_inum)
SPOS(dir.inum, 0, snapshot), 0, k, ret) {
if (k.k->p.inode > dir.inum)
break;
if (k.k->type == KEY_TYPE_dirent) {
@ -379,19 +468,26 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
return ret;
}
int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx)
int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
{
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_s_c_dirent dirent;
u32 snapshot;
int ret;
bch2_trans_init(&trans, c, 0, 0);
retry:
bch2_trans_begin(&trans);
ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
if (ret)
goto err;
for_each_btree_key(&trans, iter, BTREE_ID_dirents,
POS(inum, ctx->pos), 0, k, ret) {
if (k.k->p.inode > inum)
SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) {
if (k.k->p.inode > inum.inum)
break;
if (k.k->type != KEY_TYPE_dirent)
@ -407,11 +503,14 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx)
if (!dir_emit(ctx, dirent.v->d_name,
bch2_dirent_name_bytes(dirent),
le64_to_cpu(dirent.v->d_inum),
dirent.v->d_type))
vfs_d_type(dirent.v->d_type)))
break;
ctx->pos = dirent.k->p.offset + 1;
}
bch2_trans_iter_exit(&trans, &iter);
err:
if (ret == -EINTR)
goto retry;
ret = bch2_trans_exit(&trans) ?: ret;

View File

@ -29,13 +29,17 @@ static inline unsigned dirent_val_u64s(unsigned len)
sizeof(u64));
}
int bch2_dirent_create(struct btree_trans *, u64,
int bch2_dirent_create(struct btree_trans *, subvol_inum,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *, int);
int bch2_dirent_delete_at(struct btree_trans *,
const struct bch_hash_info *,
struct btree_iter *);
int __bch2_dirent_read_target(struct btree_trans *, struct bkey_s_c_dirent,
u32 *, u32 *, u64 *, bool);
static inline unsigned vfs_d_type(unsigned type)
{
return type == DT_SUBVOL ? DT_DIR : type;
}
enum bch_rename_mode {
BCH_RENAME,
@ -44,19 +48,20 @@ enum bch_rename_mode {
};
int bch2_dirent_rename(struct btree_trans *,
u64, struct bch_hash_info *,
u64, struct bch_hash_info *,
const struct qstr *, u64 *, u64 *,
const struct qstr *, u64 *, u64 *,
subvol_inum, struct bch_hash_info *,
subvol_inum, struct bch_hash_info *,
const struct qstr *, subvol_inum *, u64 *,
const struct qstr *, subvol_inum *, u64 *,
enum bch_rename_mode);
int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, u64,
const struct bch_hash_info *,
const struct qstr *, unsigned);
u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *,
const struct qstr *);
int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
subvol_inum, const struct bch_hash_info *,
const struct qstr *, subvol_inum *, unsigned);
u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
const struct bch_hash_info *,
const struct qstr *, subvol_inum *);
int bch2_empty_dir_trans(struct btree_trans *, u64);
int bch2_readdir(struct bch_fs *, u64, struct dir_context *);
int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
#endif /* _BCACHEFS_DIRENT_H */

View File

@ -612,38 +612,6 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k)
return false;
}
bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
unsigned nr_replicas, bool compressed)
{
struct btree_trans trans;
struct btree_iter iter;
struct bpos end = pos;
struct bkey_s_c k;
bool ret = true;
int err;
end.offset += size;
bch2_trans_init(&trans, c, 0, 0);
for_each_btree_key(&trans, iter, BTREE_ID_extents, pos,
BTREE_ITER_SLOTS, k, err) {
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
if (nr_replicas > bch2_bkey_replicas(c, k) ||
(!compressed && bch2_bkey_sectors_compressed(k))) {
ret = false;
break;
}
}
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

View File

@ -567,7 +567,6 @@ unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
bool bch2_bkey_is_incompressible(struct bkey_s_c);
unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool);
unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);

View File

@ -6,82 +6,186 @@
#include "dirent.h"
#include "fs-common.h"
#include "inode.h"
#include "subvolume.h"
#include "xattr.h"
#include <linux/posix_acl.h>
int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode)
{
return S_ISDIR(inode->bi_mode) && !inode->bi_subvol;
}
int bch2_create_trans(struct btree_trans *trans,
subvol_inum dir,
struct bch_inode_unpacked *dir_u,
struct bch_inode_unpacked *new_inode,
const struct qstr *name,
uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
struct posix_acl *default_acl,
struct posix_acl *acl)
struct posix_acl *acl,
subvol_inum snapshot_src,
unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_iter dir_iter = { NULL };
struct btree_iter inode_iter = { NULL };
struct bch_hash_info hash = bch2_hash_info_init(c, new_inode);
subvol_inum new_inum = dir;
u64 now = bch2_current_time(c);
u64 cpu = raw_smp_processor_id();
u64 dir_offset = 0;
u64 dir_target;
u32 snapshot;
unsigned dir_type = mode_to_type(mode);
int ret;
ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT);
ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
if (ret)
goto err;
bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
if (!name)
new_inode->bi_flags |= BCH_INODE_UNLINKED;
ret = bch2_inode_create(trans, &inode_iter, new_inode, U32_MAX, cpu);
ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
if (ret)
goto err;
if (default_acl) {
ret = bch2_set_acl_trans(trans, new_inode, &hash,
default_acl, ACL_TYPE_DEFAULT);
if (!(flags & BCH_CREATE_SNAPSHOT)) {
/* Normal create path - allocate a new inode: */
bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
if (flags & BCH_CREATE_TMPFILE)
new_inode->bi_flags |= BCH_INODE_UNLINKED;
ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
if (ret)
goto err;
snapshot_src = (subvol_inum) { 0 };
} else {
/*
* Creating a snapshot - we're not allocating a new inode, but
* we do have to lookup the root inode of the subvolume we're
* snapshotting and update it (in the new snapshot):
*/
if (!snapshot_src.inum) {
/* Inode wasn't specified, just snapshot: */
struct btree_iter subvol_iter;
struct bkey_s_c k;
bch2_trans_iter_init(trans, &subvol_iter, BTREE_ID_subvolumes,
POS(0, snapshot_src.subvol), 0);
k = bch2_btree_iter_peek_slot(&subvol_iter);
ret = bkey_err(k);
if (!ret && k.k->type != KEY_TYPE_subvolume) {
bch_err(c, "subvolume %u not found",
snapshot_src.subvol);
ret = -ENOENT;
}
if (!ret)
snapshot_src.inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode);
bch2_trans_iter_exit(trans, &subvol_iter);
if (ret)
goto err;
}
ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src,
BTREE_ITER_INTENT);
if (ret)
goto err;
if (new_inode->bi_subvol != snapshot_src.subvol) {
/* Not a subvolume root: */
ret = -EINVAL;
goto err;
}
/*
* If we're not root, we have to own the subvolume being
* snapshotted:
*/
if (uid && new_inode->bi_uid != uid) {
ret = -EPERM;
goto err;
}
flags |= BCH_CREATE_SUBVOL;
}
new_inum.inum = new_inode->bi_inum;
dir_target = new_inode->bi_inum;
if (flags & BCH_CREATE_SUBVOL) {
u32 new_subvol, dir_snapshot;
ret = bch2_subvolume_create(trans, new_inode->bi_inum,
snapshot_src.subvol,
&new_subvol, &snapshot,
(flags & BCH_CREATE_SNAPSHOT_RO) != 0);
if (ret)
goto err;
new_inode->bi_parent_subvol = dir.subvol;
new_inode->bi_subvol = new_subvol;
new_inum.subvol = new_subvol;
dir_target = new_subvol;
dir_type = DT_SUBVOL;
ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot);
if (ret)
goto err;
bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot);
ret = bch2_btree_iter_traverse(&dir_iter);
if (ret)
goto err;
}
if (acl) {
ret = bch2_set_acl_trans(trans, new_inode, &hash,
acl, ACL_TYPE_ACCESS);
if (ret)
goto err;
if (!(flags & BCH_CREATE_SNAPSHOT)) {
if (default_acl) {
ret = bch2_set_acl_trans(trans, new_inum, new_inode,
default_acl, ACL_TYPE_DEFAULT);
if (ret)
goto err;
}
if (acl) {
ret = bch2_set_acl_trans(trans, new_inum, new_inode,
acl, ACL_TYPE_ACCESS);
if (ret)
goto err;
}
}
if (name) {
if (!(flags & BCH_CREATE_TMPFILE)) {
struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u);
dir_u->bi_mtime = dir_u->bi_ctime = now;
u64 dir_offset;
if (S_ISDIR(new_inode->bi_mode))
if (is_subdir_for_nlink(new_inode))
dir_u->bi_nlink++;
dir_u->bi_mtime = dir_u->bi_ctime = now;
ret = bch2_inode_write(trans, &dir_iter, dir_u);
if (ret)
goto err;
ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
mode_to_type(new_inode->bi_mode),
name, new_inode->bi_inum,
ret = bch2_dirent_create(trans, dir, &dir_hash,
dir_type,
name,
dir_target,
&dir_offset,
BCH_HASH_SET_MUST_CREATE);
if (ret)
goto err;
if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
new_inode->bi_dir = dir_u->bi_inum;
new_inode->bi_dir_offset = dir_offset;
}
}
if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
new_inode->bi_dir = dir_u->bi_inum;
new_inode->bi_dir_offset = dir_offset;
}
/* XXX use bch2_btree_iter_set_snapshot() */
inode_iter.snapshot = U32_MAX;
bch2_btree_iter_set_pos(&inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX));
inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
ret = bch2_btree_iter_traverse(&inode_iter) ?:
bch2_inode_write(trans, &inode_iter, new_inode);
@ -91,9 +195,10 @@ err:
return ret;
}
int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
u64 inum, struct bch_inode_unpacked *dir_u,
struct bch_inode_unpacked *inode_u, const struct qstr *name)
int bch2_link_trans(struct btree_trans *trans,
subvol_inum dir, struct bch_inode_unpacked *dir_u,
subvol_inum inum, struct bch_inode_unpacked *inode_u,
const struct qstr *name)
{
struct bch_fs *c = trans->c;
struct btree_iter dir_iter = { NULL };
@ -103,6 +208,9 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
u64 dir_offset = 0;
int ret;
if (dir.subvol != inum.subvol)
return -EXDEV;
ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
if (ret)
goto err;
@ -110,7 +218,7 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
inode_u->bi_ctime = now;
bch2_inode_nlink_inc(inode_u);
ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT);
ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
if (ret)
goto err;
@ -118,15 +226,15 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
dir_hash = bch2_hash_info_init(c, dir_u);
ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
ret = bch2_dirent_create(trans, dir, &dir_hash,
mode_to_type(inode_u->bi_mode),
name, inum, &dir_offset,
name, inum.inum, &dir_offset,
BCH_HASH_SET_MUST_CREATE);
if (ret)
goto err;
if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
inode_u->bi_dir = dir_inum;
inode_u->bi_dir = dir.inum;
inode_u->bi_dir_offset = dir_offset;
}
@ -139,55 +247,83 @@ err:
}
int bch2_unlink_trans(struct btree_trans *trans,
u64 dir_inum, struct bch_inode_unpacked *dir_u,
subvol_inum dir,
struct bch_inode_unpacked *dir_u,
struct bch_inode_unpacked *inode_u,
const struct qstr *name)
const struct qstr *name,
int deleting_snapshot)
{
struct bch_fs *c = trans->c;
struct btree_iter dir_iter = { NULL };
struct btree_iter dirent_iter = { NULL };
struct btree_iter inode_iter = { NULL };
struct bch_hash_info dir_hash;
u64 inum, now = bch2_current_time(c);
subvol_inum inum;
u64 now = bch2_current_time(c);
struct bkey_s_c k;
int ret;
ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT);
ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
if (ret)
goto err;
dir_hash = bch2_hash_info_init(c, dir_u);
ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir_inum, &dir_hash,
name, BTREE_ITER_INTENT);
ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
name, &inum, BTREE_ITER_INTENT);
if (ret)
goto err;
k = bch2_btree_iter_peek_slot(&dirent_iter);
ret = bkey_err(k);
ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum,
BTREE_ITER_INTENT);
if (ret)
goto err;
inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
if (ret)
if (deleting_snapshot == 1 && !inode_u->bi_subvol) {
ret = -ENOENT;
goto err;
}
if (inode_u->bi_dir == k.k->p.inode &&
inode_u->bi_dir_offset == k.k->p.offset) {
if (deleting_snapshot <= 0 && S_ISDIR(inode_u->bi_mode)) {
ret = bch2_empty_dir_trans(trans, inum);
if (ret)
goto err;
}
if (inode_u->bi_subvol) {
ret = bch2_subvolume_delete(trans, inode_u->bi_subvol,
deleting_snapshot);
if (ret)
goto err;
k = bch2_btree_iter_peek_slot(&dirent_iter);
ret = bkey_err(k);
if (ret)
goto err;
/*
* If we're deleting a subvolume, we need to really delete the
* dirent, not just emit a whiteout in the current snapshot:
*/
bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot);
ret = bch2_btree_iter_traverse(&dirent_iter);
if (ret)
goto err;
}
if (inode_u->bi_dir == dirent_iter.pos.inode &&
inode_u->bi_dir_offset == dirent_iter.pos.offset) {
inode_u->bi_dir = 0;
inode_u->bi_dir_offset = 0;
}
dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode);
dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
bch2_inode_nlink_dec(inode_u);
ret = (S_ISDIR(inode_u->bi_mode)
? bch2_empty_dir_trans(trans, inum)
: 0) ?:
bch2_dirent_delete_at(trans, &dir_hash, &dirent_iter) ?:
ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
&dir_hash, &dirent_iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_inode_write(trans, &dir_iter, dir_u) ?:
bch2_inode_write(trans, &inode_iter, inode_u);
err:
@ -222,8 +358,8 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
}
int bch2_rename_trans(struct btree_trans *trans,
u64 src_dir, struct bch_inode_unpacked *src_dir_u,
u64 dst_dir, struct bch_inode_unpacked *dst_dir_u,
subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
struct bch_inode_unpacked *src_inode_u,
struct bch_inode_unpacked *dst_inode_u,
const struct qstr *src_name,
@ -236,7 +372,8 @@ int bch2_rename_trans(struct btree_trans *trans,
struct btree_iter src_inode_iter = { NULL };
struct btree_iter dst_inode_iter = { NULL };
struct bch_hash_info src_hash, dst_hash;
u64 src_inode, src_offset, dst_inode, dst_offset;
subvol_inum src_inum, dst_inum;
u64 src_offset, dst_offset;
u64 now = bch2_current_time(c);
int ret;
@ -247,7 +384,8 @@ int bch2_rename_trans(struct btree_trans *trans,
src_hash = bch2_hash_info_init(c, src_dir_u);
if (dst_dir != src_dir) {
if (dst_dir.inum != src_dir.inum ||
dst_dir.subvol != src_dir.subvol) {
ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
BTREE_ITER_INTENT);
if (ret)
@ -262,19 +400,19 @@ int bch2_rename_trans(struct btree_trans *trans,
ret = bch2_dirent_rename(trans,
src_dir, &src_hash,
dst_dir, &dst_hash,
src_name, &src_inode, &src_offset,
dst_name, &dst_inode, &dst_offset,
src_name, &src_inum, &src_offset,
dst_name, &dst_inum, &dst_offset,
mode);
if (ret)
goto err;
ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inode,
ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum,
BTREE_ITER_INTENT);
if (ret)
goto err;
if (dst_inode) {
ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inode,
if (dst_inum.inum) {
ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum,
BTREE_ITER_INTENT);
if (ret)
goto err;
@ -305,7 +443,7 @@ int bch2_rename_trans(struct btree_trans *trans,
}
if (S_ISDIR(dst_inode_u->bi_mode) &&
bch2_empty_dir_trans(trans, dst_inode)) {
bch2_empty_dir_trans(trans, dst_inum)) {
ret = -ENOTEMPTY;
goto err;
}
@ -324,12 +462,12 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
}
if (S_ISDIR(src_inode_u->bi_mode)) {
if (is_subdir_for_nlink(src_inode_u)) {
src_dir_u->bi_nlink--;
dst_dir_u->bi_nlink++;
}
if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) {
if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
dst_dir_u->bi_nlink--;
src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
}
@ -340,22 +478,22 @@ int bch2_rename_trans(struct btree_trans *trans,
src_dir_u->bi_mtime = now;
src_dir_u->bi_ctime = now;
if (src_dir != dst_dir) {
if (src_dir.inum != dst_dir.inum) {
dst_dir_u->bi_mtime = now;
dst_dir_u->bi_ctime = now;
}
src_inode_u->bi_ctime = now;
if (dst_inode)
if (dst_inum.inum)
dst_inode_u->bi_ctime = now;
ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?:
(src_dir != dst_dir
(src_dir.inum != dst_dir.inum
? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u)
: 0 ) ?:
bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?:
(dst_inode
(dst_inum.inum
? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u)
: 0 );
err:

View File

@ -4,27 +4,33 @@
struct posix_acl;
int bch2_create_trans(struct btree_trans *, u64,
#define BCH_CREATE_TMPFILE (1U << 0)
#define BCH_CREATE_SUBVOL (1U << 1)
#define BCH_CREATE_SNAPSHOT (1U << 2)
#define BCH_CREATE_SNAPSHOT_RO (1U << 3)
int bch2_create_trans(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
struct bch_inode_unpacked *,
const struct qstr *,
uid_t, gid_t, umode_t, dev_t,
struct posix_acl *,
struct posix_acl *);
struct posix_acl *,
subvol_inum, unsigned);
int bch2_link_trans(struct btree_trans *, u64,
u64, struct bch_inode_unpacked *,
struct bch_inode_unpacked *,
int bch2_link_trans(struct btree_trans *,
subvol_inum, struct bch_inode_unpacked *,
subvol_inum, struct bch_inode_unpacked *,
const struct qstr *);
int bch2_unlink_trans(struct btree_trans *,
u64, struct bch_inode_unpacked *,
int bch2_unlink_trans(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
const struct qstr *);
struct bch_inode_unpacked *,
const struct qstr *, int);
int bch2_rename_trans(struct btree_trans *,
u64, struct bch_inode_unpacked *,
u64, struct bch_inode_unpacked *,
subvol_inum, struct bch_inode_unpacked *,
subvol_inum, struct bch_inode_unpacked *,
struct bch_inode_unpacked *,
struct bch_inode_unpacked *,
const struct qstr *,

View File

@ -786,23 +786,35 @@ static void readpage_bio_extend(struct readpages_iter *iter,
}
}
static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
struct bch_read_bio *rbio, u64 inum,
static void bchfs_read(struct btree_trans *trans,
struct bch_read_bio *rbio,
subvol_inum inum,
struct readpages_iter *readpages_iter)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_buf sk;
int flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE;
u32 snapshot;
int ret = 0;
rbio->c = c;
rbio->start_time = local_clock();
rbio->subvol = inum.subvol;
bch2_bkey_buf_init(&sk);
retry:
bch2_trans_begin(trans);
iter = (struct btree_iter) { NULL };
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
goto err;
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS);
while (1) {
struct bkey_s_c k;
unsigned bytes, sectors, offset_into_extent;
@ -817,15 +829,15 @@ retry:
break;
}
bch2_btree_iter_set_pos(iter,
POS(inum, rbio->bio.bi_iter.bi_sector));
bch2_btree_iter_set_pos(&iter,
POS(inum.inum, rbio->bio.bi_iter.bi_sector));
k = bch2_btree_iter_peek_slot(iter);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
break;
offset_into_extent = iter->pos.offset -
offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
@ -855,7 +867,7 @@ retry:
if (bkey_extent_is_allocation(k.k))
bch2_add_page_sectors(&rbio->bio, k);
bch2_read_extent(trans, rbio, iter->pos,
bch2_read_extent(trans, rbio, iter.pos,
data_btree, k, offset_into_extent, flags);
if (flags & BCH_READ_LAST_FRAGMENT)
@ -864,12 +876,14 @@ retry:
swap(rbio->bio.bi_iter.bi_size, bytes);
bio_advance(&rbio->bio, bytes);
}
err:
bch2_trans_iter_exit(trans, &iter);
if (ret == -EINTR)
goto retry;
if (ret) {
bch_err_inum_ratelimited(c, inum,
bch_err_inum_ratelimited(c, inum.inum,
"read error %i from btree lookup", ret);
rbio->bio.bi_status = BLK_STS_IOERR;
bio_endio(&rbio->bio);
@ -884,7 +898,6 @@ void bch2_readahead(struct readahead_control *ractl)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
struct btree_trans trans;
struct btree_iter iter;
struct page *page;
struct readpages_iter readpages_iter;
int ret;
@ -893,8 +906,6 @@ void bch2_readahead(struct readahead_control *ractl)
BUG_ON(ret);
bch2_trans_init(&trans, c, 0, 0);
bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN,
BTREE_ITER_SLOTS);
bch2_pagecache_add_get(&inode->ei_pagecache_lock);
@ -915,22 +926,20 @@ void bch2_readahead(struct readahead_control *ractl)
rbio->bio.bi_end_io = bch2_readpages_end_io;
BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
bchfs_read(&trans, &iter, rbio, inode->v.i_ino,
bchfs_read(&trans, rbio, inode_inum(inode),
&readpages_iter);
}
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
kfree(readpages_iter.pages);
}
static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
u64 inum, struct page *page)
subvol_inum inum, struct page *page)
{
struct btree_trans trans;
struct btree_iter iter;
bch2_page_state_create(page, __GFP_NOFAIL);
@ -940,12 +949,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
bch2_trans_init(&trans, c, 0, 0);
bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN,
BTREE_ITER_SLOTS);
bchfs_read(&trans, &iter, rbio, inum, NULL);
bch2_trans_iter_exit(&trans, &iter);
bchfs_read(&trans, rbio, inum, NULL);
bch2_trans_exit(&trans);
}
@ -959,7 +963,7 @@ int bch2_readpage(struct file *file, struct page *page)
rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts);
rbio->bio.bi_end_io = bch2_readpages_end_io;
__bchfs_readpage(c, rbio, inode->v.i_ino, page);
__bchfs_readpage(c, rbio, inode_inum(inode), page);
return 0;
}
@ -982,7 +986,7 @@ static int bch2_read_single_page(struct page *page,
rbio->bio.bi_private = &done;
rbio->bio.bi_end_io = bch2_read_single_page_end_io;
__bchfs_readpage(c, rbio, inode->v.i_ino, page);
__bchfs_readpage(c, rbio, inode_inum(inode), page);
wait_for_completion(&done);
ret = blk_status_to_errno(rbio->bio.bi_status);
@ -1126,6 +1130,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
op->nr_replicas = nr_replicas;
op->res.nr_replicas = nr_replicas;
op->write_point = writepoint_hashed(inode->ei_last_dirtied);
op->subvol = inode->ei_subvol;
op->pos = POS(inode->v.i_ino, sector);
op->wbio.bio.bi_iter.bi_sector = sector;
op->wbio.bio.bi_opf = wbc_to_write_flags(wbc);
@ -1758,7 +1763,7 @@ start:
if (iter->count)
closure_get(&dio->cl);
bch2_read(c, rbio_init(bio, opts), inode->v.i_ino);
bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
}
iter->count += shorten;
@ -1813,6 +1818,50 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
/* O_DIRECT writes */
/*
 * Check whether the extent range [offset, offset + size) of @inum is already
 * fully allocated on disk with at least @nr_replicas replicas, and (when
 * @compressed is false) contains no compressed sectors.
 *
 * Used by the O_DIRECT write path: when a disk reservation can't be taken, the
 * write may still proceed if it will only overwrite existing allocations.
 *
 * Returns true only if the whole range qualifies; any lookup error also
 * yields false (the caller treats "can't tell" as "not allocated").
 */
static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
				       u64 offset, u64 size,
				       unsigned nr_replicas, bool compressed)
{
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 end = offset + size;
	u32 snapshot;
	bool ret = true;
	int err;

	bch2_trans_init(&trans, c, 0, 0);
retry:
	bch2_trans_begin(&trans);

	/* Resolve the subvolume to the snapshot ID extents are keyed under: */
	err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
	if (err)
		goto err;

	for_each_btree_key(&trans, iter, BTREE_ID_extents,
			   SPOS(inum.inum, offset, snapshot),
			   BTREE_ITER_SLOTS, k, err) {
		if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0)
			break;

		/*
		 * Key from another snapshot, under-replicated, or compressed
		 * when the write wouldn't be: range doesn't qualify.
		 */
		if (k.k->p.snapshot != snapshot ||
		    nr_replicas > bch2_bkey_replicas(c, k) ||
		    (!compressed && bch2_bkey_sectors_compressed(k))) {
			ret = false;
			break;
		}
	}

	/* Remember how far we got so an -EINTR retry resumes, not restarts: */
	offset = iter.pos.offset;
	bch2_trans_iter_exit(&trans, &iter);
err:
	if (err == -EINTR)
		goto retry;
	bch2_trans_exit(&trans);

	return err ? false : ret;
}
static void bch2_dio_write_loop_async(struct bch_write_op *);
static long bch2_dio_write_loop(struct dio_write *dio)
@ -1891,6 +1940,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
op_journal_seq_set(&dio->op, &inode->ei_journal_seq);
dio->op.write_point = writepoint_hashed((unsigned long) current);
dio->op.nr_replicas = dio->op.opts.data_replicas;
dio->op.subvol = inode->ei_subvol;
dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
if ((req->ki_flags & IOCB_DSYNC) &&
@ -1901,8 +1951,8 @@ static long bch2_dio_write_loop(struct dio_write *dio)
ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
dio->op.opts.data_replicas, 0);
if (unlikely(ret) &&
!bch2_check_range_allocated(c, dio->op.pos,
bio_sectors(bio),
!bch2_check_range_allocated(c, inode_inum(inode),
dio->op.pos.offset, bio_sectors(bio),
dio->op.opts.data_replicas,
dio->op.opts.compression != 0))
goto err;
@ -2146,9 +2196,9 @@ out:
/* truncate: */
static inline int range_has_data(struct bch_fs *c,
struct bpos start,
struct bpos end)
static inline int range_has_data(struct bch_fs *c, u32 subvol,
struct bpos start,
struct bpos end)
{
struct btree_trans trans;
struct btree_iter iter;
@ -2156,6 +2206,12 @@ static inline int range_has_data(struct bch_fs *c,
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
retry:
bch2_trans_begin(&trans);
ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot);
if (ret)
goto err;
for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) {
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
@ -2166,7 +2222,11 @@ static inline int range_has_data(struct bch_fs *c,
break;
}
}
start = iter.pos;
bch2_trans_iter_exit(&trans, &iter);
err:
if (ret == -EINTR)
goto retry;
return bch2_trans_exit(&trans) ?: ret;
}
@ -2198,7 +2258,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
* XXX: we're doing two index lookups when we end up reading the
* page
*/
ret = range_has_data(c,
ret = range_has_data(c, inode->ei_subvol,
POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT),
POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT));
if (ret <= 0)
@ -2332,7 +2392,7 @@ int bch2_truncate(struct user_namespace *mnt_userns,
inode_dio_wait(&inode->v);
bch2_pagecache_block_get(&inode->ei_pagecache_lock);
ret = bch2_inode_find_by_inum(c, inode->v.i_ino, &inode_u);
ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
if (ret)
goto err;
@ -2390,7 +2450,7 @@ int bch2_truncate(struct user_namespace *mnt_userns,
truncate_setsize(&inode->v, iattr->ia_size);
ret = bch2_fpunch(c, inode->v.i_ino,
ret = bch2_fpunch(c, inode_inum(inode),
round_up(iattr->ia_size, block_bytes(c)) >> 9,
U64_MAX, &inode->ei_journal_seq, &i_sectors_delta);
i_sectors_acct(c, inode, NULL, i_sectors_delta);
@ -2450,7 +2510,7 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len
if (discard_start < discard_end) {
s64 i_sectors_delta = 0;
ret = bch2_fpunch(c, inode->v.i_ino,
ret = bch2_fpunch(c, inode_inum(inode),
discard_start, discard_end,
&inode->ei_journal_seq,
&i_sectors_delta);
@ -2529,7 +2589,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
} else {
s64 i_sectors_delta = 0;
ret = bch2_fpunch(c, inode->v.i_ino,
ret = bch2_fpunch(c, inode_inum(inode),
offset >> 9, (offset + len) >> 9,
&inode->ei_journal_seq,
&i_sectors_delta);
@ -2556,6 +2616,18 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
struct bpos move_pos = POS(inode->v.i_ino, offset >> 9);
struct bpos atomic_end;
unsigned trigger_flags = 0;
u32 snapshot;
bch2_trans_begin(&trans);
ret = bch2_subvolume_get_snapshot(&trans,
inode->ei_subvol, &snapshot);
if (ret)
continue;
bch2_btree_iter_set_snapshot(&src, snapshot);
bch2_btree_iter_set_snapshot(&dst, snapshot);
bch2_btree_iter_set_snapshot(&del, snapshot);
bch2_trans_begin(&trans);
@ -2676,9 +2748,17 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
struct bkey_i_reservation reservation;
struct bkey_s_c k;
unsigned sectors;
u32 snapshot;
bch2_trans_begin(&trans);
ret = bch2_subvolume_get_snapshot(&trans,
inode->ei_subvol, &snapshot);
if (ret)
goto bkey_err;
bch2_btree_iter_set_snapshot(&iter, snapshot);
k = bch2_btree_iter_peek_slot(&iter);
if ((ret = bkey_err(k)))
goto bkey_err;
@ -2725,7 +2805,8 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
reservation.v.nr_replicas = disk_res.nr_replicas;
}
ret = bch2_extent_update(&trans, &iter, &reservation.k_i,
ret = bch2_extent_update(&trans, inode_inum(inode), &iter,
&reservation.k_i,
&disk_res, &inode->ei_journal_seq,
0, &i_sectors_delta, true);
i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
@ -2927,8 +3008,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
mark_range_unallocated(src, pos_src, pos_src + aligned_len);
ret = bch2_remap_range(c,
POS(dst->v.i_ino, pos_dst >> 9),
POS(src->v.i_ino, pos_src >> 9),
inode_inum(dst), pos_dst >> 9,
inode_inum(src), pos_src >> 9,
aligned_len >> 9,
&dst->ei_journal_seq,
pos_dst + len, &i_sectors_delta);
@ -3019,7 +3100,9 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
subvol_inum inum = inode_inum(inode);
u64 isize, next_data = MAX_LFS_FILESIZE;
u32 snapshot;
int ret;
isize = i_size_read(&inode->v);
@ -3027,9 +3110,15 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
return -ENXIO;
bch2_trans_init(&trans, c, 0, 0);
retry:
bch2_trans_begin(&trans);
ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
if (ret)
goto err;
for_each_btree_key(&trans, iter, BTREE_ID_extents,
POS(inode->v.i_ino, offset >> 9), 0, k, ret) {
SPOS(inode->v.i_ino, offset >> 9, snapshot), 0, k, ret) {
if (k.k->p.inode != inode->v.i_ino) {
break;
} else if (bkey_extent_is_data(k.k)) {
@ -3039,6 +3128,9 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
break;
}
bch2_trans_iter_exit(&trans, &iter);
err:
if (ret == -EINTR)
goto retry;
ret = bch2_trans_exit(&trans) ?: ret;
if (ret)
@ -3115,7 +3207,9 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
subvol_inum inum = inode_inum(inode);
u64 isize, next_hole = MAX_LFS_FILESIZE;
u32 snapshot;
int ret;
isize = i_size_read(&inode->v);
@ -3123,9 +3217,15 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
return -ENXIO;
bch2_trans_init(&trans, c, 0, 0);
retry:
bch2_trans_begin(&trans);
ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
if (ret)
goto err;
for_each_btree_key(&trans, iter, BTREE_ID_extents,
POS(inode->v.i_ino, offset >> 9),
SPOS(inode->v.i_ino, offset >> 9, snapshot),
BTREE_ITER_SLOTS, k, ret) {
if (k.k->p.inode != inode->v.i_ino) {
next_hole = bch2_seek_pagecache_hole(&inode->v,
@ -3143,6 +3243,9 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
}
}
bch2_trans_iter_exit(&trans, &iter);
err:
if (ret == -EINTR)
goto retry;
ret = bch2_trans_exit(&trans) ?: ret;
if (ret)

View File

@ -10,7 +10,11 @@
#include "quota.h"
#include <linux/compat.h>
#include <linux/fsnotify.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/writeback.h>
#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32)
#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
@ -192,7 +196,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
char *kname = NULL;
struct qstr qstr;
int ret = 0;
u64 inum;
subvol_inum inum;
kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
if (!kname)
@ -205,10 +209,8 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
qstr.len = ret;
qstr.name = kname;
ret = -ENOENT;
inum = bch2_dirent_lookup(c, src->v.i_ino, &hash,
&qstr);
if (!inum)
ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum);
if (ret)
goto err1;
vinode = bch2_vfs_inode_get(c, inum);
@ -294,6 +296,154 @@ err:
return ret;
}
/*
 * BCH_IOCTL_SUBVOLUME_CREATE: create a new subvolume, or (with
 * BCH_SUBVOL_SNAPSHOT_CREATE) a snapshot of an existing one.
 *
 * arg.dst_ptr names the path to create; arg.src_ptr, when set, names the
 * subvolume to snapshot.  Follows the usual VFS create protocol:
 * user_path_create() -> permission/security checks -> __bch2_create() ->
 * d_instantiate(), with goto-based unwinding and an ESTALE retry loop.
 */
static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
				struct bch_ioctl_subvolume arg)
{
	struct inode *dir;
	struct bch_inode_info *inode;
	struct user_namespace *s_user_ns;
	struct dentry *dst_dentry;
	struct path src_path, dst_path;
	int how = LOOKUP_FOLLOW;
	int error;
	subvol_inum snapshot_src = { 0 };
	unsigned lookup_flags = 0;
	unsigned create_flags = BCH_CREATE_SUBVOL;

	/* Reject unknown flags: */
	if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE|
			  BCH_SUBVOL_SNAPSHOT_RO))
		return -EINVAL;

	/* src_ptr and RO only make sense when creating a snapshot: */
	if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
	    (arg.src_ptr ||
	     (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)))
		return -EINVAL;

	if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
		create_flags |= BCH_CREATE_SNAPSHOT;

	if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
		create_flags |= BCH_CREATE_SNAPSHOT_RO;

	/* why do we need this lock? */
	down_read(&c->vfs_sb->s_umount);

	/* Flush dirty pagecache so the snapshot captures current contents: */
	if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
		sync_inodes_sb(c->vfs_sb);
retry:
	if (arg.src_ptr) {
		/* Resolve the subvolume being snapshotted: */
		error = user_path_at(arg.dirfd,
				(const char __user *)(unsigned long)arg.src_ptr,
				how, &src_path);
		if (error)
			goto err1;

		/* Source must live on this filesystem: */
		if (src_path.dentry->d_sb->s_fs_info != c) {
			path_put(&src_path);
			error = -EXDEV;
			goto err1;
		}

		snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode));
	}

	/* Look up the parent and take a negative dentry for the new name: */
	dst_dentry = user_path_create(arg.dirfd,
			(const char __user *)(unsigned long)arg.dst_ptr,
			&dst_path, lookup_flags);
	error = PTR_ERR_OR_ZERO(dst_dentry);
	if (error)
		goto err2;

	/* Destination must also be on this filesystem: */
	if (dst_dentry->d_sb->s_fs_info != c) {
		error = -EXDEV;
		goto err3;
	}

	if (dst_dentry->d_inode) {
		error = -EEXIST;
		goto err3;
	}

	dir = dst_path.dentry->d_inode;
	if (IS_DEADDIR(dir)) {
		error = -ENOENT;
		goto err3;
	}

	/* Caller's fsuid/fsgid must be representable in the sb's user ns: */
	s_user_ns = dir->i_sb->s_user_ns;
	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
	    !kgid_has_mapping(s_user_ns, current_fsgid())) {
		error = -EOVERFLOW;
		goto err3;
	}

	error = inode_permission(file_mnt_user_ns(filp),
				 dir, MAY_WRITE | MAY_EXEC);
	if (error)
		goto err3;

	/* Without POSIX ACLs, apply the umask here as mkdir would: */
	if (!IS_POSIXACL(dir))
		arg.mode &= ~current_umask();

	error = security_path_mkdir(&dst_path, dst_dentry, arg.mode);
	if (error)
		goto err3;

	/* Snapshot with no explicit source: snapshot the parent's subvolume: */
	if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
	    !arg.src_ptr)
		snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol;

	inode = __bch2_create(file_mnt_user_ns(filp), to_bch_ei(dir),
			      dst_dentry, arg.mode|S_IFDIR,
			      0, snapshot_src, create_flags);
	error = PTR_ERR_OR_ZERO(inode);
	if (error)
		goto err3;

	d_instantiate(dst_dentry, &inode->v);
	fsnotify_mkdir(dir, dst_dentry);
err3:
	done_path_create(&dst_path, dst_dentry);
err2:
	if (arg.src_ptr)
		path_put(&src_path);

	/* Stale NFS handle during lookup: retry once with LOOKUP_REVAL: */
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
err1:
	up_read(&c->vfs_sb->s_umount);
	return error;
}
static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
struct bch_ioctl_subvolume arg)
{
struct path path;
int ret = 0;
if (arg.flags)
return -EINVAL;
ret = user_path_at(arg.dirfd,
(const char __user *)(unsigned long)arg.dst_ptr,
LOOKUP_FOLLOW, &path);
if (ret)
return ret;
if (path.dentry->d_sb->s_fs_info != c) {
path_put(&path);
return -EXDEV;
}
ret = __bch2_unlink(path.dentry->d_parent->d_inode, path.dentry, 1);
path_put(&path);
return ret;
}
long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
struct bch_inode_info *inode = file_bch_inode(file);
@ -324,6 +474,22 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case FS_IOC_GOINGDOWN:
return bch2_ioc_goingdown(c, (u32 __user *) arg);
case BCH_IOCTL_SUBVOLUME_CREATE: {
struct bch_ioctl_subvolume i;
if (copy_from_user(&i, (void __user *) arg, sizeof(i)))
return -EFAULT;
return bch2_ioctl_subvolume_create(c, file, i);
}
case BCH_IOCTL_SUBVOLUME_DESTROY: {
struct bch_ioctl_subvolume i;
if (copy_from_user(&i, (void __user *) arg, sizeof(i)))
return -EFAULT;
return bch2_ioctl_subvolume_destroy(c, file, i);
}
default:
return bch2_fs_ioctl(c, cmd, (void __user *) arg);
}

View File

@ -36,7 +36,7 @@
static struct kmem_cache *bch2_inode_cache;
static void bch2_vfs_inode_init(struct bch_fs *,
static void bch2_vfs_inode_init(struct bch_fs *, subvol_inum,
struct bch_inode_info *,
struct bch_inode_unpacked *);
@ -149,7 +149,7 @@ int __must_check bch2_write_inode(struct bch_fs *c,
retry:
bch2_trans_begin(&trans);
ret = bch2_inode_peek(&trans, &iter, &inode_u, inode->v.i_ino,
ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode),
BTREE_ITER_INTENT) ?:
(set ? set(inode, &inode_u, p) : 0) ?:
bch2_inode_write(&trans, &iter, &inode_u) ?:
@ -208,13 +208,42 @@ int bch2_fs_quota_transfer(struct bch_fs *c,
return ret;
}
struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
/*
 * iget5_locked() match callback: does @vinode correspond to the
 * (subvolume, inode number) pair pointed to by @p?
 */
static int bch2_iget5_test(struct inode *vinode, void *p)
{
	subvol_inum *search = p;
	struct bch_inode_info *ei = to_bch_ei(vinode);

	if (ei->ei_subvol != search->subvol)
		return 0;

	return ei->ei_inode.bi_inum == search->inum;
}
/*
 * iget5_locked() init callback: stamp a freshly allocated inode with the
 * (subvolume, inode number) pair it was looked up by, so bch2_iget5_test()
 * will match it from now on.  Always succeeds.
 */
static int bch2_iget5_set(struct inode *vinode, void *p)
{
	subvol_inum *search = p;
	struct bch_inode_info *ei = to_bch_ei(vinode);

	ei->ei_subvol		= search->subvol;
	ei->ei_inode.bi_inum	= search->inum;
	ei->v.i_ino		= search->inum;

	return 0;
}
/*
 * Hash a (subvolume, inode number) pair for the VFS inode hash table:
 * jhash over the subvolume ID and both halves of the 64-bit inode number.
 */
static unsigned bch2_inode_hash(subvol_inum inum)
{
	u32 hi = inum.inum >> 32;
	u32 lo = inum.inum;

	return jhash_3words(inum.subvol, hi, lo, JHASH_INITVAL);
}
struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
{
struct bch_inode_unpacked inode_u;
struct bch_inode_info *inode;
int ret;
inode = to_bch_ei(iget_locked(c->vfs_sb, inum));
inode = to_bch_ei(iget5_locked(c->vfs_sb,
bch2_inode_hash(inum),
bch2_iget5_test,
bch2_iget5_set,
&inum));
if (unlikely(!inode))
return ERR_PTR(-ENOMEM);
if (!(inode->v.i_state & I_NEW))
@ -226,26 +255,20 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
return ERR_PTR(ret);
}
bch2_vfs_inode_init(c, inode, &inode_u);
bch2_vfs_inode_init(c, inum, inode, &inode_u);
inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum);
inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum.inum);
unlock_new_inode(&inode->v);
return &inode->v;
}
static int inum_test(struct inode *inode, void *p)
{
unsigned long *ino = p;
return *ino == inode->i_ino;
}
static struct bch_inode_info *
struct bch_inode_info *
__bch2_create(struct user_namespace *mnt_userns,
struct bch_inode_info *dir, struct dentry *dentry,
umode_t mode, dev_t rdev, bool tmpfile)
umode_t mode, dev_t rdev, subvol_inum snapshot_src,
unsigned flags)
{
struct bch_fs *c = dir->v.i_sb->s_fs_info;
struct btree_trans trans;
@ -253,6 +276,7 @@ __bch2_create(struct user_namespace *mnt_userns,
struct bch_inode_info *inode, *old;
struct bch_inode_unpacked inode_u;
struct posix_acl *default_acl = NULL, *acl = NULL;
subvol_inum inum;
u64 journal_seq = 0;
int ret;
@ -273,20 +297,23 @@ __bch2_create(struct user_namespace *mnt_userns,
bch2_inode_init_early(c, &inode_u);
if (!tmpfile)
if (!(flags & BCH_CREATE_TMPFILE))
mutex_lock(&dir->ei_update_lock);
bch2_trans_init(&trans, c, 8,
2048 + (!tmpfile ? dentry->d_name.len : 0));
2048 + (!(flags & BCH_CREATE_TMPFILE)
? dentry->d_name.len : 0));
retry:
bch2_trans_begin(&trans);
ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u,
!tmpfile ? &dentry->d_name : NULL,
ret = bch2_create_trans(&trans,
inode_inum(dir), &dir_u, &inode_u,
!(flags & BCH_CREATE_TMPFILE)
? &dentry->d_name : NULL,
from_kuid(mnt_userns, current_fsuid()),
from_kgid(mnt_userns, current_fsgid()),
mode, rdev,
default_acl, acl) ?:
default_acl, acl, snapshot_src, flags) ?:
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
KEY_TYPE_QUOTA_PREALLOC);
if (unlikely(ret))
@ -302,14 +329,17 @@ err_before_quota:
goto err_trans;
}
if (!tmpfile) {
if (!(flags & BCH_CREATE_TMPFILE)) {
bch2_inode_update_after_write(c, dir, &dir_u,
ATTR_MTIME|ATTR_CTIME);
journal_seq_copy(c, dir, journal_seq);
mutex_unlock(&dir->ei_update_lock);
}
bch2_vfs_inode_init(c, inode, &inode_u);
inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
inum.inum = inode_u.bi_inum;
bch2_vfs_inode_init(c, inum, inode, &inode_u);
journal_seq_copy(c, inode, journal_seq);
set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
@ -322,8 +352,12 @@ err_before_quota:
*/
inode->v.i_state |= I_CREATING;
old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino,
inum_test, NULL, &inode->v.i_ino));
old = to_bch_ei(inode_insert5(&inode->v,
bch2_inode_hash(inum),
bch2_iget5_test,
bch2_iget5_set,
&inum));
BUG_ON(!old);
if (unlikely(old != inode)) {
@ -350,7 +384,7 @@ err:
posix_acl_release(acl);
return inode;
err_trans:
if (!tmpfile)
if (!(flags & BCH_CREATE_TMPFILE))
mutex_unlock(&dir->ei_update_lock);
bch2_trans_exit(&trans);
@ -369,12 +403,13 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
struct bch_inode_info *dir = to_bch_ei(vdir);
struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
struct inode *vinode = NULL;
u64 inum;
subvol_inum inum = { .subvol = 1 };
int ret;
inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash,
&dentry->d_name);
ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
&dentry->d_name, &inum);
if (inum)
if (!ret)
vinode = bch2_vfs_inode_get(c, inum);
return d_splice_alias(vinode, dentry);
@ -385,7 +420,8 @@ static int bch2_mknod(struct user_namespace *mnt_userns,
umode_t mode, dev_t rdev)
{
struct bch_inode_info *inode =
__bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev, false);
__bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev,
(subvol_inum) { 0 }, 0);
if (IS_ERR(inode))
return PTR_ERR(inode);
@ -415,8 +451,8 @@ static int __bch2_link(struct bch_fs *c,
ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, 0,
bch2_link_trans(&trans,
dir->v.i_ino,
inode->v.i_ino, &dir_u, &inode_u,
inode_inum(dir), &dir_u,
inode_inum(inode), &inode_u,
&dentry->d_name));
if (likely(!ret)) {
@ -452,7 +488,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
return 0;
}
static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
int deleting_snapshot)
{
struct bch_fs *c = vdir->i_sb->s_fs_info;
struct bch_inode_info *dir = to_bch_ei(vdir);
@ -467,8 +504,9 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq,
BTREE_INSERT_NOFAIL,
bch2_unlink_trans(&trans,
dir->v.i_ino, &dir_u,
&inode_u, &dentry->d_name));
inode_inum(dir), &dir_u,
&inode_u, &dentry->d_name,
deleting_snapshot));
if (likely(!ret)) {
BUG_ON(inode_u.bi_inum != inode->v.i_ino);
@ -486,6 +524,11 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
return ret;
}
/*
 * VFS ->unlink: delegates to __bch2_unlink() with deleting_snapshot == -1
 * (presumably "unspecified/let the transaction decide" — the other caller,
 * subvolume destroy, passes 1; confirm against bch2_unlink_trans()).
 */
static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
{
	return __bch2_unlink(vdir, dentry, -1);
}
static int bch2_symlink(struct user_namespace *mnt_userns,
struct inode *vdir, struct dentry *dentry,
const char *symname)
@ -494,7 +537,8 @@ static int bch2_symlink(struct user_namespace *mnt_userns,
struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
int ret;
inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, true);
inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
(subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
if (unlikely(IS_ERR(inode)))
return PTR_ERR(inode);
@ -587,8 +631,8 @@ static int bch2_rename2(struct user_namespace *mnt_userns,
ret = __bch2_trans_do(&trans, NULL, &journal_seq, 0,
bch2_rename_trans(&trans,
src_dir->v.i_ino, &src_dir_u,
dst_dir->v.i_ino, &dst_dir_u,
inode_inum(src_dir), &src_dir_u,
inode_inum(dst_dir), &dst_dir_u,
&src_inode_u,
&dst_inode_u,
&src_dentry->d_name,
@ -711,7 +755,7 @@ retry:
kfree(acl);
acl = NULL;
ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode->v.i_ino,
ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
BTREE_ITER_INTENT);
if (ret)
goto btree_err;
@ -719,7 +763,8 @@ retry:
bch2_setattr_copy(mnt_userns, inode, &inode_u, attr);
if (attr->ia_valid & ATTR_MODE) {
ret = bch2_acl_chmod(&trans, &inode_u, inode_u.bi_mode, &acl);
ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u,
inode_u.bi_mode, &acl);
if (ret)
goto btree_err;
}
@ -810,7 +855,8 @@ static int bch2_tmpfile(struct user_namespace *mnt_userns,
struct inode *vdir, struct dentry *dentry, umode_t mode)
{
struct bch_inode_info *inode =
__bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0, true);
__bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0,
(subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
if (IS_ERR(inode))
return PTR_ERR(inode);
@ -885,6 +931,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
unsigned offset_into_extent, sectors;
bool have_extent = false;
u32 snapshot;
int ret = 0;
ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
@ -894,15 +941,21 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
if (start + len < start)
return -EINVAL;
start >>= 9;
bch2_bkey_buf_init(&cur);
bch2_bkey_buf_init(&prev);
bch2_trans_init(&trans, c, 0, 0);
bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
POS(ei->v.i_ino, start >> 9), 0);
retry:
bch2_trans_begin(&trans);
ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot);
if (ret)
goto err;
bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
SPOS(ei->v.i_ino, start, snapshot), 0);
while ((k = bch2_btree_iter_peek(&iter)).k &&
!(ret = bkey_err(k)) &&
bkey_cmp(iter.pos, end) < 0) {
@ -951,7 +1004,9 @@ retry:
bch2_btree_iter_set_pos(&iter,
POS(iter.pos.inode, iter.pos.offset + sectors));
}
start = iter.pos.offset;
bch2_trans_iter_exit(&trans, &iter);
err:
if (ret == -EINTR)
goto retry;
@ -959,7 +1014,6 @@ retry:
ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
FIEMAP_EXTENT_LAST);
bch2_trans_iter_exit(&trans, &iter);
ret = bch2_trans_exit(&trans) ?: ret;
bch2_bkey_buf_exit(&cur, c);
bch2_bkey_buf_exit(&prev, c);
@ -996,7 +1050,7 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit_dots(file, ctx))
return 0;
return bch2_readdir(c, inode->v.i_ino, ctx);
return bch2_readdir(c, inode_inum(inode), ctx);
}
static const struct file_operations bch_file_operations = {
@ -1096,6 +1150,7 @@ static const struct address_space_operations bch_address_space_operations = {
.error_remove_page = generic_error_remove_page,
};
#if 0
static struct inode *bch2_nfs_get_inode(struct super_block *sb,
u64 ino, u32 generation)
{
@ -1129,14 +1184,15 @@ static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
return generic_fh_to_parent(sb, fid, fh_len, fh_type,
bch2_nfs_get_inode);
}
#endif
static const struct export_operations bch_export_ops = {
.fh_to_dentry = bch2_fh_to_dentry,
.fh_to_parent = bch2_fh_to_parent,
//.fh_to_dentry = bch2_fh_to_dentry,
//.fh_to_parent = bch2_fh_to_parent,
//.get_parent = bch2_get_parent,
};
static void bch2_vfs_inode_init(struct bch_fs *c,
static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi)
{
@ -1152,6 +1208,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
inode->ei_journal_seq = 0;
inode->ei_quota_reserved = 0;
inode->ei_qid = bch_qid(bi);
inode->ei_subvol = inum.subvol;
inode->v.i_mapping->a_ops = &bch_address_space_operations;
@ -1249,7 +1306,7 @@ static void bch2_evict_inode(struct inode *vinode)
KEY_TYPE_QUOTA_WARN);
bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
bch2_inode_rm(c, inode->v.i_ino, true);
bch2_inode_rm(c, inode_inum(inode), true);
}
}
@ -1593,7 +1650,7 @@ got_sb:
sb->s_flags |= SB_POSIXACL;
#endif
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO);
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
if (IS_ERR(vinode)) {
bch_err(c, "error mounting: error getting root inode %i",
(int) PTR_ERR(vinode));

View File

@ -45,10 +45,20 @@ struct bch_inode_info {
struct mutex ei_quota_lock;
struct bch_qid ei_qid;
u32 ei_subvol;
/* copy of inode in btree: */
struct bch_inode_unpacked ei_inode;
};
static inline subvol_inum inode_inum(struct bch_inode_info *inode)
{
return (subvol_inum) {
.subvol = inode->ei_subvol,
.inum = inode->ei_inode.bi_inum,
};
}
/*
* Set if we've gotten a btree error for this inode, and thus the vfs inode and
* btree inode may be inconsistent:
@ -135,6 +145,10 @@ struct bch_inode_unpacked;
#ifndef NO_BCACHEFS_FS
struct bch_inode_info *
__bch2_create(struct user_namespace *, struct bch_inode_info *,
struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
int bch2_fs_quota_transfer(struct bch_fs *,
struct bch_inode_info *,
struct bch_qid,
@ -154,7 +168,7 @@ static inline int bch2_set_projid(struct bch_fs *c,
KEY_TYPE_QUOTA_PREALLOC);
}
struct inode *bch2_vfs_inode_get(struct bch_fs *, u64);
struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum);
/* returns 0 if we want to do the update, or error is passed up */
typedef int (*inode_set_fn)(struct bch_inode_info *,
@ -170,6 +184,7 @@ int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
int bch2_setattr_nonsize(struct user_namespace *,
struct bch_inode_info *,
struct iattr *);
int __bch2_unlink(struct inode *, struct dentry *, int);
void bch2_vfs_exit(void);
int bch2_vfs_init(void);

File diff suppressed because it is too large Load Diff

View File

@ -6,8 +6,10 @@
#include "btree_update.h"
#include "error.h"
#include "extents.h"
#include "extent_update.h"
#include "inode.h"
#include "str_hash.h"
#include "subvolume.h"
#include "varint.h"
#include <linux/random.h>
@ -295,15 +297,21 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode,
int bch2_inode_peek(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode,
u64 inum, unsigned flags)
subvol_inum inum, unsigned flags)
{
struct bkey_s_c k;
u32 snapshot;
int ret;
if (trans->c->opts.inodes_use_key_cache)
if (0 && trans->c->opts.inodes_use_key_cache)
flags |= BTREE_ITER_CACHED;
bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, inum), flags);
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
return ret;
bch2_trans_iter_init(trans, iter, BTREE_ID_inodes,
SPOS(0, inum.inum, snapshot), flags);
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
@ -340,8 +348,8 @@ int bch2_inode_write(struct btree_trans *trans,
const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
struct bch_inode_unpacked unpacked;
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
struct bch_inode_unpacked unpacked;
if (k.k->p.inode)
return "nonzero k.p.inode";
@ -368,6 +376,9 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
unpacked.bi_nlink != 0)
return "flagged as unlinked but bi_nlink != 0";
if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode))
return "subvolume root but not a directory";
return NULL;
}
@ -482,6 +493,9 @@ static inline u32 bkey_generation(struct bkey_s_c k)
}
}
/*
* This just finds an empty slot:
*/
int bch2_inode_create(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode_u,
@ -581,19 +595,77 @@ found_slot:
return 0;
}
int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
/*
 * Delete every key belonging to inode @inum in btree @id, one commit at a
 * time.  Each pass through the loop starts a fresh transaction, re-resolves
 * the subvolume's snapshot ID (it may change across a restart), deletes the
 * next key at or after @offset, and commits.  -EINTR (transaction restart)
 * just retries; any other error terminates the loop and is returned.
 */
static int bch2_inode_delete_keys(struct btree_trans *trans,
				  subvol_inum inum, enum btree_id id)
{
	u64 offset = 0;
	int ret = 0;

	while (!ret || ret == -EINTR) {
		struct btree_iter iter;
		struct bkey_s_c k;
		struct bkey_i delete;
		u32 snapshot;

		bch2_trans_begin(trans);

		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			continue;

		bch2_trans_iter_init(trans, &iter, id,
				     SPOS(inum.inum, offset, snapshot),
				     BTREE_ITER_INTENT);
		k = bch2_btree_iter_peek(&iter);

		/* No key, or iterated past this inode: all done. */
		if (!k.k || iter.pos.inode != inum.inum) {
			bch2_trans_iter_exit(trans, &iter);
			break;
		}

		ret = bkey_err(k);
		if (ret)
			goto err;

		/* Whiteout at the iterator position: */
		bkey_init(&delete.k);
		delete.k.p = iter.pos;

		if (btree_node_type_is_extents(iter.btree_id)) {
			/*
			 * Extent deletes are sized: cap at the remaining
			 * offset space and the max key size for this fs.
			 */
			unsigned max_sectors =
				min_t(u64, U64_MAX - iter.pos.offset,
				      KEY_SIZE_MAX & (~0 << trans->c->block_bits));

			/* create the biggest key we can */
			bch2_key_resize(&delete.k, max_sectors);

			/* Trim to what fits in one atomic transaction: */
			ret = bch2_extent_trim_atomic(trans, &iter, &delete);
			if (ret)
				goto err;
		}

		ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
		      bch2_trans_commit(trans, NULL, NULL,
					BTREE_INSERT_NOFAIL);
err:
		/* Resume after the position we just processed: */
		offset = iter.pos.offset;
		bch2_trans_iter_exit(trans, &iter);
	}

	return ret;
}
int bch2_inode_rm(struct bch_fs *c, subvol_inum inum, bool cached)
{
struct btree_trans trans;
struct btree_iter iter = { NULL };
struct bkey_i_inode_generation delete;
struct bpos start = POS(inode_nr, 0);
struct bpos end = POS(inode_nr + 1, 0);
struct bch_inode_unpacked inode_u;
struct bkey_s_c k;
unsigned iter_flags = BTREE_ITER_INTENT;
u32 snapshot;
int ret;
if (cached && c->opts.inodes_use_key_cache)
if (0 && cached && c->opts.inodes_use_key_cache)
iter_flags |= BTREE_ITER_CACHED;
bch2_trans_init(&trans, c, 0, 1024);
@ -606,19 +678,20 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
* XXX: the dirent could ideally would delete whiteouts when they're no
* longer needed
*/
ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
start, end, NULL) ?:
bch2_btree_delete_range_trans(&trans, BTREE_ID_xattrs,
start, end, NULL) ?:
bch2_btree_delete_range_trans(&trans, BTREE_ID_dirents,
start, end, NULL);
ret = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?:
bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?:
bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents);
if (ret)
goto err;
retry:
bch2_trans_begin(&trans);
ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
if (ret)
goto err;
bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes,
POS(0, inode_nr), iter_flags);
SPOS(0, inum.inum, snapshot), iter_flags);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
@ -628,13 +701,20 @@ retry:
if (k.k->type != KEY_TYPE_inode) {
bch2_fs_inconsistent(trans.c,
"inode %llu not found when deleting",
inode_nr);
inum.inum);
ret = -EIO;
goto err;
}
bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
/* Subvolume root? */
if (inode_u.bi_subvol) {
ret = bch2_subvolume_delete(&trans, inode_u.bi_subvol, -1);
if (ret)
goto err;
}
bkey_inode_generation_init(&delete.k_i);
delete.k.p = iter.pos;
delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
@ -651,20 +731,22 @@ err:
return ret;
}
static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
static int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
subvol_inum inum,
struct bch_inode_unpacked *inode)
{
struct btree_iter iter = { NULL };
struct btree_iter iter;
int ret;
ret = bch2_inode_peek(trans, &iter, inode, inode_nr, 0);
bch2_trans_iter_exit(trans, &iter);
ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
if (!ret)
bch2_trans_iter_exit(trans, &iter);
return ret;
}
int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
struct bch_inode_unpacked *inode)
{
return bch2_trans_do(c, NULL, NULL, 0,
bch2_inode_find_by_inum_trans(&trans, inode_nr, inode));
bch2_inode_find_by_inum_trans(&trans, inum, inode));
}

View File

@ -58,7 +58,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, u64, unsigned);
struct bch_inode_unpacked *, subvol_inum, unsigned);
int bch2_inode_write(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *);
@ -74,9 +74,10 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
int bch2_inode_create(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, u32, u64);
int bch2_inode_rm(struct bch_fs *, u64, bool);
int bch2_inode_rm(struct bch_fs *, subvol_inum, bool);
int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *);
int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
struct bch_inode_unpacked *);
static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
{

View File

@ -27,6 +27,7 @@
#include "keylist.h"
#include "move.h"
#include "rebalance.h"
#include "subvolume.h"
#include "super.h"
#include "super-io.h"
@ -220,7 +221,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
: 0;
if (!*usage_increasing &&
(new_replicas > bch2_bkey_replicas(c, old) ||
(new->k.p.snapshot != old.k->p.snapshot ||
new_replicas > bch2_bkey_replicas(c, old) ||
(!new_compressed && bch2_bkey_sectors_compressed(old))))
*usage_increasing = true;
@ -256,6 +258,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
}
int bch2_extent_update(struct btree_trans *trans,
subvol_inum inum,
struct btree_iter *iter,
struct bkey_i *k,
struct disk_reservation *disk_res,
@ -314,8 +317,8 @@ int bch2_extent_update(struct btree_trans *trans,
struct btree_iter inode_iter;
struct bch_inode_unpacked inode_u;
ret = bch2_inode_peek(trans, &inode_iter, &inode_u,
k->k.p.inode, BTREE_ITER_INTENT);
ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum,
BTREE_ITER_INTENT);
if (ret)
return ret;
@ -371,22 +374,37 @@ int bch2_extent_update(struct btree_trans *trans,
return 0;
}
/*
* Returns -EINTR if we had to drop locks:
*/
int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
struct bpos end, u64 *journal_seq,
s64 *i_sectors_delta)
subvol_inum inum, u64 end,
u64 *journal_seq, s64 *i_sectors_delta)
{
struct bch_fs *c = trans->c;
unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
struct bpos end_pos = POS(inum.inum, end);
struct bkey_s_c k;
int ret = 0, ret2 = 0;
u32 snapshot;
while ((bch2_trans_begin(trans),
(k = bch2_btree_iter_peek(iter)).k) &&
bkey_cmp(iter->pos, end) < 0) {
while (1) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0);
struct bkey_i delete;
bch2_trans_begin(trans);
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
goto btree_err;
bch2_btree_iter_set_snapshot(iter, snapshot);
k = bch2_btree_iter_peek(iter);
if (bkey_cmp(iter->pos, end_pos) >= 0)
break;
ret = bkey_err(k);
if (ret)
goto btree_err;
@ -396,9 +414,9 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
/* create the biggest key we can */
bch2_key_resize(&delete.k, max_sectors);
bch2_cut_back(end, &delete);
bch2_cut_back(end_pos, &delete);
ret = bch2_extent_update(trans, iter, &delete,
ret = bch2_extent_update(trans, inum, iter, &delete,
&disk_res, journal_seq,
0, i_sectors_delta, false);
bch2_disk_reservation_put(c, &disk_res);
@ -411,36 +429,31 @@ btree_err:
break;
}
if (bkey_cmp(iter->pos, end) > 0) {
bch2_btree_iter_set_pos(iter, end);
ret = bch2_btree_iter_traverse(iter);
}
if (bkey_cmp(iter->pos, end_pos) > 0)
bch2_btree_iter_set_pos(iter, end_pos);
return ret ?: ret2;
}
int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
u64 *journal_seq, s64 *i_sectors_delta)
{
struct btree_trans trans;
struct btree_iter iter;
int ret = 0;
int ret;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
POS(inum, start),
BTREE_ITER_INTENT);
POS(inum.inum, start),
BTREE_ITER_INTENT);
ret = bch2_fpunch_at(&trans, &iter, POS(inum, end),
ret = bch2_fpunch_at(&trans, &iter, inum, end,
journal_seq, i_sectors_delta);
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
if (ret == -EINTR)
ret = 0;
return ret;
return ret == -EINTR ? 0 : ret;
}
int bch2_write_index_default(struct bch_write_op *op)
@ -451,40 +464,51 @@ int bch2_write_index_default(struct bch_write_op *op)
struct bkey_i *k = bch2_keylist_front(keys);
struct btree_trans trans;
struct btree_iter iter;
subvol_inum inum = {
.subvol = op->subvol,
.inum = k->k.p.inode,
};
int ret;
BUG_ON(!inum.subvol);
bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
bkey_start_pos(&k->k),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
do {
bch2_trans_begin(&trans);
k = bch2_keylist_front(keys);
bch2_bkey_buf_copy(&sk, c, k);
k->k.p.snapshot = iter.snapshot;
ret = bch2_subvolume_get_snapshot(&trans, inum.subvol,
&sk.k->k.p.snapshot);
if (ret == -EINTR)
continue;
if (ret)
break;
bch2_bkey_buf_realloc(&sk, c, k->k.u64s);
bkey_copy(sk.k, k);
bch2_cut_front(iter.pos, sk.k);
bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
bkey_start_pos(&sk.k->k),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
ret = bch2_extent_update(&trans, &iter, sk.k,
ret = bch2_extent_update(&trans, inum, &iter, sk.k,
&op->res, op_journal_seq(op),
op->new_i_size, &op->i_sectors_delta,
op->flags & BCH_WRITE_CHECK_ENOSPC);
bch2_trans_iter_exit(&trans, &iter);
if (ret == -EINTR)
continue;
if (ret)
break;
if (bkey_cmp(iter.pos, k->k.p) >= 0)
bch2_keylist_pop_front(keys);
bch2_keylist_pop_front(&op->insert_keys);
else
bch2_cut_front(iter.pos, k);
} while (!bch2_keylist_empty(keys));
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
@ -1645,7 +1669,7 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
}
static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
struct bvec_iter bvec_iter, u64 inode,
struct bvec_iter bvec_iter,
struct bch_io_failures *failed,
unsigned flags)
{
@ -1709,7 +1733,10 @@ static void bch2_rbio_retry(struct work_struct *work)
struct bch_fs *c = rbio->c;
struct bvec_iter iter = rbio->bvec_iter;
unsigned flags = rbio->flags;
u64 inode = rbio->read_pos.inode;
subvol_inum inum = {
.subvol = rbio->subvol,
.inum = rbio->read_pos.inode,
};
struct bch_io_failures failed = { .nr = 0 };
trace_read_retry(&rbio->bio);
@ -1725,12 +1752,12 @@ static void bch2_rbio_retry(struct work_struct *work)
flags &= ~BCH_READ_MAY_PROMOTE;
if (flags & BCH_READ_NODECODE) {
bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
} else {
flags &= ~BCH_READ_LAST_FRAGMENT;
flags |= BCH_READ_MUST_CLONE;
__bch2_read(c, rbio, iter, inode, &failed, flags);
__bch2_read(c, rbio, iter, inum, &failed, flags);
}
}
@ -1804,7 +1831,8 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
if (!bch2_bkey_narrow_crcs(new, new_crc))
goto out;
ret = bch2_trans_update(trans, &iter, new, 0);
ret = bch2_trans_update(trans, &iter, new,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
out:
bch2_trans_iter_exit(trans, &iter);
return ret;
@ -2172,6 +2200,7 @@ get_bio:
/* XXX: only initialize this if needed */
rbio->devs_have = bch2_bkey_devs(k);
rbio->pick = pick;
rbio->subvol = orig->subvol;
rbio->read_pos = read_pos;
rbio->data_btree = data_btree;
rbio->data_pos = data_pos;
@ -2274,25 +2303,31 @@ out_read_done:
}
void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
struct bvec_iter bvec_iter, u64 inode,
struct bvec_iter bvec_iter, subvol_inum inum,
struct bch_io_failures *failed, unsigned flags)
{
struct btree_trans trans;
struct btree_iter iter;
struct bkey_buf sk;
struct bkey_s_c k;
u32 snapshot;
int ret;
BUG_ON(flags & BCH_READ_NODECODE);
bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
POS(inode, bvec_iter.bi_sector),
BTREE_ITER_SLOTS);
retry:
bch2_trans_begin(&trans);
iter = (struct btree_iter) { NULL };
ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
if (ret)
goto err;
bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS);
while (1) {
unsigned bytes, sectors, offset_into_extent;
enum btree_id data_btree = BTREE_ID_extents;
@ -2307,7 +2342,7 @@ retry:
}
bch2_btree_iter_set_pos(&iter,
POS(inode, bvec_iter.bi_sector));
POS(inum.inum, bvec_iter.bi_sector));
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
@ -2357,16 +2392,17 @@ retry:
swap(bvec_iter.bi_size, bytes);
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
}
err:
bch2_trans_iter_exit(&trans, &iter);
if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID)
goto retry;
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
if (ret) {
bch_err_inum_ratelimited(c, inode,
bch_err_inum_ratelimited(c, inum.inum,
"read error %i from btree lookup", ret);
rbio->bio.bi_status = BLK_STS_IOERR;
bch2_rbio_done(rbio);

View File

@ -63,12 +63,13 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
struct bkey_i *, bool *, bool *, s64 *, s64 *);
int bch2_extent_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, struct disk_reservation *,
u64 *, u64, s64 *, bool);
int bch2_extent_update(struct btree_trans *, subvol_inum,
struct btree_iter *, struct bkey_i *,
struct disk_reservation *, u64 *, u64, s64 *, bool);
int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
struct bpos, u64 *, s64 *);
int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *);
subvol_inum, u64, u64 *, s64 *);
int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, u64 *, s64 *);
int bch2_write_index_default(struct bch_write_op *);
@ -90,6 +91,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
op->devs_have.nr = 0;
op->target = 0;
op->opts = opts;
op->subvol = 0;
op->pos = POS_MAX;
op->version = ZERO_VERSION;
op->write_point = (struct write_point_specifier) { 0 };
@ -157,10 +159,10 @@ static inline void bch2_read_extent(struct btree_trans *trans,
}
void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
u64, struct bch_io_failures *, unsigned flags);
subvol_inum, struct bch_io_failures *, unsigned flags);
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
u64 inode)
subvol_inum inum)
{
struct bch_io_failures failed = { .nr = 0 };
@ -168,8 +170,9 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
rbio->c = c;
rbio->start_time = local_clock();
rbio->subvol = inum.subvol;
__bch2_read(c, rbio, rbio->bio.bi_iter, inode, &failed,
__bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE|
BCH_READ_USER_MAPPED);

View File

@ -62,6 +62,7 @@ struct bch_read_bio {
/*
* pos we read from - different from data_pos for indirect extents:
*/
u32 subvol;
struct bpos read_pos;
/*
@ -122,6 +123,7 @@ struct bch_write_op {
u16 nonce;
struct bch_io_opts opts;
u32 subvol;
struct bpos pos;
struct bversion version;

View File

@ -48,7 +48,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN,
BTREE_ITER_PREFETCH);
BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS);
while ((k = bch2_btree_iter_peek(&iter)).k &&
!(ret = bkey_err(k))) {
@ -74,7 +75,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k));
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_trans_update(&trans, &iter, sk.k, 0) ?:
bch2_trans_update(&trans, &iter, sk.k,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL);

View File

@ -13,6 +13,7 @@
#include "journal_reclaim.h"
#include "move.h"
#include "replicas.h"
#include "subvolume.h"
#include "super-io.h"
#include "keylist.h"
@ -53,6 +54,81 @@ struct moving_context {
wait_queue_head_t wait;
};
/*
 * When an extent is moved from @old_pos to @new_pos, snapshots that saw
 * the data at @old_pos via ancestor snapshot IDs must not suddenly see
 * it appear at @new_pos: insert whiteouts at @new_pos for every such
 * snapshot.  No-op for btrees without snapshots or when the position is
 * unchanged.
 */
static int insert_snapshot_whiteouts(struct btree_trans *trans,
				     enum btree_id id,
				     struct bpos old_pos,
				     struct bpos new_pos)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter, update_iter;
	struct bkey_s_c k;
	struct snapshots_seen s;
	int ret;

	if (!btree_type_has_snapshots(id))
		return 0;

	snapshots_seen_init(&s);

	if (!bkey_cmp(old_pos, new_pos))
		return 0;

	/* No child snapshots - nothing else can see old_pos: */
	if (!snapshot_t(c, old_pos.snapshot)->children[0])
		return 0;

	bch2_trans_iter_init(trans, &iter, id, old_pos,
			     BTREE_ITER_NOT_EXTENTS|
			     BTREE_ITER_ALL_SNAPSHOTS);
	while (1) {
next:
		k = bch2_btree_iter_prev(&iter);
		ret = bkey_err(k);
		if (ret)
			break;

		/*
		 * Fix: iterating backwards past the start of the btree
		 * returns a NULL key - don't dereference k.k->p below:
		 */
		if (!k.k)
			break;

		/* Only interested in keys at old_pos (other snapshots): */
		if (bkey_cmp(old_pos, k.k->p))
			break;

		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) {
			struct bkey_i *update;
			size_t i;

			/* Already whited out via an ancestor snapshot? */
			for (i = 0; i < s.nr; i++)
				if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i]))
					goto next;

			update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));

			ret = PTR_ERR_OR_ZERO(update);
			if (ret)
				break;

			/* Whiteout at new_pos, in this snapshot: */
			bkey_init(&update->k);
			update->k.p = new_pos;
			update->k.p.snapshot = k.k->p.snapshot;

			bch2_trans_iter_init(trans, &update_iter, id, update->k.p,
					     BTREE_ITER_NOT_EXTENTS|
					     BTREE_ITER_ALL_SNAPSHOTS|
					     BTREE_ITER_INTENT);
			ret = bch2_btree_iter_traverse(&update_iter) ?:
				bch2_trans_update(trans, &update_iter, update,
					  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
			bch2_trans_iter_exit(trans, &update_iter);
			if (ret)
				break;

			ret = snapshots_seen_add(c, &s, k.k->p.snapshot);
			if (ret)
				break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);
	kfree(s.d);

	return ret;
}
static int bch2_migrate_index_update(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
@ -166,7 +242,10 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
next_pos = insert->k.p;
ret = bch2_trans_update(&trans, &iter, insert, 0) ?:
ret = insert_snapshot_whiteouts(&trans, m->btree_id,
k.k->p, insert->k.p) ?:
bch2_trans_update(&trans, &iter, insert,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(&trans, &op->res,
op_journal_seq(op),
BTREE_INSERT_NOFAIL|
@ -581,7 +660,8 @@ static int __bch2_move_data(struct bch_fs *c,
stats->pos = start;
bch2_trans_iter_init(&trans, &iter, btree_id, start,
BTREE_ITER_PREFETCH);
BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS);
if (rate)
bch2_ratelimit_reset(rate);

View File

@ -63,7 +63,7 @@ const char * const bch2_member_states[] = {
#undef x
const char * const bch2_d_types[DT_MAX] = {
const char * const bch2_d_types[BCH_DT_MAX] = {
[DT_UNKNOWN] = "unknown",
[DT_FIFO] = "fifo",
[DT_CHR] = "chr",
@ -73,6 +73,7 @@ const char * const bch2_d_types[DT_MAX] = {
[DT_LNK] = "lnk",
[DT_SOCK] = "sock",
[DT_WHT] = "whiteout",
[DT_SUBVOL] = "subvol",
};
void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)

View File

@ -215,19 +215,19 @@ enum opt_type {
BCH_SB_POSIX_ACL, true, \
NULL, "Enable POSIX acls") \
x(usrquota, u8, \
OPT_FORMAT|OPT_MOUNT, \
0, \
OPT_BOOL(), \
BCH_SB_USRQUOTA, false, \
NO_SB_OPT, false, \
NULL, "Enable user quotas") \
x(grpquota, u8, \
OPT_FORMAT|OPT_MOUNT, \
0, \
OPT_BOOL(), \
BCH_SB_GRPQUOTA, false, \
NO_SB_OPT, false, \
NULL, "Enable group quotas") \
x(prjquota, u8, \
OPT_FORMAT|OPT_MOUNT, \
0, \
OPT_BOOL(), \
BCH_SB_PRJQUOTA, false, \
NO_SB_OPT, false, \
NULL, "Enable project quotas") \
x(degraded, u8, \
OPT_MOUNT, \

View File

@ -20,6 +20,7 @@
#include "quota.h"
#include "recovery.h"
#include "replicas.h"
#include "subvolume.h"
#include "super-io.h"
#include <linux/sort.h>
@ -961,6 +962,81 @@ fsck_err:
return ret;
}
/*
 * Create the initial snapshot tree: a single root snapshot node at
 * U32_MAX, owned by the root subvolume, plus the root subvolume itself
 * pointing back at that snapshot and at the root inode.
 */
static int bch2_fs_initialize_subvolumes(struct bch_fs *c)
{
	struct bkey_i_snapshot root_snapshot;
	struct bkey_i_subvolume root_volume;
	int ret;

	bkey_snapshot_init(&root_snapshot.k_i);
	root_snapshot.k.p.offset = U32_MAX;
	root_snapshot.v.flags	= 0;
	root_snapshot.v.parent	= 0;
	/*
	 * Fix: bch_snapshot.subvol is an on-disk __le32 - it must be byte
	 * swapped like the fields of root_volume below, or the value is
	 * wrong on big-endian hosts:
	 */
	root_snapshot.v.subvol	= cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
	root_snapshot.v.pad	= 0;
	SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);

	ret = bch2_btree_insert(c, BTREE_ID_snapshots,
				&root_snapshot.k_i,
				NULL, NULL, 0);
	if (ret)
		return ret;

	bkey_subvolume_init(&root_volume.k_i);
	root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
	root_volume.v.flags	= 0;
	root_volume.v.snapshot	= cpu_to_le32(U32_MAX);
	root_volume.v.inode	= cpu_to_le64(BCACHEFS_ROOT_INO);

	return bch2_btree_insert(c, BTREE_ID_subvolumes,
				 &root_volume.k_i,
				 NULL, NULL, 0);
}
/*
 * Version upgrade helper: set bi_subvol on the root inode so it points
 * at the root subvolume.  Runs inside a transaction (via bch2_trans_do)
 * so it may be retried on lock restart.
 */
static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_inode_unpacked inode;
	struct bkey_inode_buf *packed;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
			     POS(0, BCACHEFS_ROOT_INO), 0);
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_inode) {
		/* A filesystem without a root inode is unrecoverable here: */
		bch_err(c, "root inode not found");
		ret = -ENOENT;
		goto err;
	}

	ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &inode);
	/* Root inode was just validated as KEY_TYPE_inode - can't fail: */
	BUG_ON(ret);

	inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;

	/* Transaction-lifetime allocation; freed on transaction exit: */
	packed = bch2_trans_kmalloc(trans, sizeof(*packed));
	ret = PTR_ERR_OR_ZERO(packed);
	if (ret)
		goto err;

	bch2_inode_pack(c, packed, &inode);
	ret = bch2_trans_update(trans, &iter, &packed->inode.k_i, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
int bch2_fs_recovery(struct bch_fs *c)
{
const char *err = "cannot allocate memory";
@ -1017,11 +1093,12 @@ int bch2_fs_recovery(struct bch_fs *c)
c->opts.version_upgrade = true;
c->opts.fsck = true;
c->opts.fix_errors = FSCK_OPT_YES;
}
if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) {
} else if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) {
bch_info(c, "version prior to btree_ptr_sectors_written, upgrade required");
c->opts.version_upgrade = true;
} else if (c->sb.version < bcachefs_metadata_version_snapshot) {
bch_info(c, "filesystem version is prior to snapshot field - upgrading");
c->opts.version_upgrade = true;
}
ret = bch2_blacklist_table_initialize(c);
@ -1190,6 +1267,29 @@ use_clean:
bch_verbose(c, "alloc write done");
}
if (c->sb.version < bcachefs_metadata_version_snapshot) {
err = "error creating root snapshot node";
ret = bch2_fs_initialize_subvolumes(c);
if (ret)
goto err;
}
bch_verbose(c, "reading snapshots table");
err = "error reading snapshots table";
ret = bch2_fs_snapshots_start(c);
if (ret)
goto err;
bch_verbose(c, "reading snapshots done");
if (c->sb.version < bcachefs_metadata_version_snapshot) {
/* set bi_subvol on root inode */
err = "error upgrade root inode for subvolumes";
ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
bch2_fs_upgrade_for_subvolumes(&trans));
if (ret)
goto err;
}
if (c->opts.fsck) {
bch_info(c, "starting fsck");
err = "error in fsck";
@ -1350,9 +1450,22 @@ int bch2_fs_initialize(struct bch_fs *c)
}
}
err = "error creating root snapshot node";
ret = bch2_fs_initialize_subvolumes(c);
if (ret)
goto err;
bch_verbose(c, "reading snapshots table");
err = "error reading snapshots table";
ret = bch2_fs_snapshots_start(c);
if (ret)
goto err;
bch_verbose(c, "reading snapshots done");
bch2_inode_init(c, &root_inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
root_inode.bi_inum = BCACHEFS_ROOT_INO;
root_inode.bi_inum = BCACHEFS_ROOT_INO;
root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
bch2_inode_pack(c, &packed_inode, &root_inode);
packed_inode.inode.k.p.snapshot = U32_MAX;
@ -1367,11 +1480,12 @@ int bch2_fs_initialize(struct bch_fs *c)
err = "error creating lost+found";
ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_create_trans(&trans, BCACHEFS_ROOT_INO,
bch2_create_trans(&trans,
BCACHEFS_ROOT_SUBVOL_INUM,
&root_inode, &lostfound_inode,
&lostfound,
0, 0, S_IFDIR|0700, 0,
NULL, NULL));
NULL, NULL, (subvol_inum) { 0 }, 0));
if (ret) {
bch_err(c, "error creating lost+found");
goto err;

View File

@ -7,6 +7,7 @@
#include "inode.h"
#include "io.h"
#include "reflink.h"
#include "subvolume.h"
#include <linux/sched/signal.h>
@ -197,7 +198,8 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
}
s64 bch2_remap_range(struct bch_fs *c,
struct bpos dst_start, struct bpos src_start,
subvol_inum dst_inum, u64 dst_offset,
subvol_inum src_inum, u64 src_offset,
u64 remap_sectors, u64 *journal_seq,
u64 new_i_size, s64 *i_sectors_delta)
{
@ -205,9 +207,12 @@ s64 bch2_remap_range(struct bch_fs *c,
struct btree_iter dst_iter, src_iter;
struct bkey_s_c src_k;
struct bkey_buf new_dst, new_src;
struct bpos dst_start = POS(dst_inum.inum, dst_offset);
struct bpos src_start = POS(src_inum.inum, src_offset);
struct bpos dst_end = dst_start, src_end = src_start;
struct bpos src_want;
u64 dst_done;
u32 dst_snapshot, src_snapshot;
int ret = 0, ret2 = 0;
if (!percpu_ref_tryget(&c->writes))
@ -238,6 +243,20 @@ s64 bch2_remap_range(struct bch_fs *c,
break;
}
ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol,
&src_snapshot);
if (ret)
continue;
bch2_btree_iter_set_snapshot(&src_iter, src_snapshot);
ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol,
&dst_snapshot);
if (ret)
continue;
bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
dst_done = dst_iter.pos.offset - dst_start.offset;
src_want = POS(src_start.inode, src_start.offset + dst_done);
bch2_btree_iter_set_pos(&src_iter, src_want);
@ -248,11 +267,11 @@ s64 bch2_remap_range(struct bch_fs *c,
continue;
if (bkey_cmp(src_want, src_iter.pos) < 0) {
ret = bch2_fpunch_at(&trans, &dst_iter,
bpos_min(dst_end,
POS(dst_iter.pos.inode, dst_iter.pos.offset +
src_iter.pos.offset - src_want.offset)),
journal_seq, i_sectors_delta);
ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum,
min(dst_end.offset,
dst_iter.pos.offset +
src_iter.pos.offset - src_want.offset),
journal_seq, i_sectors_delta);
continue;
}
@ -289,8 +308,9 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_key_resize(&new_dst.k->k,
min(src_k.k->p.offset - src_want.offset,
dst_end.offset - dst_iter.pos.offset));
ret = bch2_extent_update(&trans, &dst_iter, new_dst.k,
&disk_res, journal_seq,
ret = bch2_extent_update(&trans, dst_inum, &dst_iter,
new_dst.k, &disk_res, journal_seq,
new_i_size, i_sectors_delta,
true);
bch2_disk_reservation_put(c, &disk_res);
@ -311,7 +331,7 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_trans_begin(&trans);
ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u,
dst_start.inode, BTREE_ITER_INTENT);
dst_inum, BTREE_ITER_INTENT);
if (!ret2 &&
inode_u.bi_size < new_i_size) {

View File

@ -57,7 +57,7 @@ static inline __le64 *bkey_refcount(struct bkey_i *k)
}
}
s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos,
u64, u64 *, u64, s64 *);
s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64,
subvol_inum, u64, u64, u64 *, u64, s64 *);
#endif /* _BCACHEFS_REFLINK_H */

View File

@ -8,6 +8,7 @@
#include "error.h"
#include "inode.h"
#include "siphash.h"
#include "subvolume.h"
#include "super.h"
#include <linux/crc32c.h>
@ -144,16 +145,21 @@ bch2_hash_lookup(struct btree_trans *trans,
struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
u64 inode, const void *key,
subvol_inum inum, const void *key,
unsigned flags)
{
struct bkey_s_c k;
u32 snapshot;
int ret;
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
return ret;
for_each_btree_key(trans, *iter, desc.btree_id,
POS(inode, desc.hash_key(info, key)),
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
BTREE_ITER_SLOTS|flags, k, ret) {
if (iter->pos.inode != inode)
if (iter->pos.inode != inum.inum)
break;
if (k.k->type == desc.key_type) {
@ -176,15 +182,20 @@ bch2_hash_hole(struct btree_trans *trans,
struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
u64 inode, const void *key)
subvol_inum inum, const void *key)
{
struct bkey_s_c k;
u32 snapshot;
int ret;
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
return ret;
for_each_btree_key(trans, *iter, desc.btree_id,
POS(inode, desc.hash_key(info, key)),
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
if (iter->pos.inode != inode)
if (iter->pos.inode != inum.inum)
break;
if (k.k->type != desc.key_type)
@ -229,17 +240,25 @@ static __always_inline
int bch2_hash_set(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
u64 inode, struct bkey_i *insert, int flags)
subvol_inum inum,
struct bkey_i *insert, int flags)
{
struct btree_iter iter, slot = { NULL };
struct bkey_s_c k;
bool found = false;
u32 snapshot;
int ret;
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
return ret;
for_each_btree_key(trans, iter, desc.btree_id,
POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))),
SPOS(inum.inum,
desc.hash_bkey(info, bkey_i_to_s_c(insert)),
snapshot),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
if (iter.pos.inode != inode)
if (iter.pos.inode != inum.inum)
break;
if (k.k->type == desc.key_type) {
@ -288,7 +307,8 @@ static __always_inline
int bch2_hash_delete_at(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
struct btree_iter *iter)
struct btree_iter *iter,
unsigned update_flags)
{
struct bkey_i *delete;
int ret;
@ -306,24 +326,24 @@ int bch2_hash_delete_at(struct btree_trans *trans,
delete->k.p = iter->pos;
delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
return bch2_trans_update(trans, iter, delete, 0);
return bch2_trans_update(trans, iter, delete, update_flags);
}
static __always_inline
int bch2_hash_delete(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
u64 inode, const void *key)
subvol_inum inum, const void *key)
{
struct btree_iter iter;
int ret;
ret = bch2_hash_lookup(trans, &iter, desc, info, inode, key,
ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
BTREE_ITER_INTENT);
if (ret)
return ret;
ret = bch2_hash_delete_at(trans, desc, info, &iter);
ret = bch2_hash_delete_at(trans, desc, info, &iter, 0);
bch2_trans_iter_exit(trans, &iter);
return ret;
}

981
libbcachefs/subvolume.c Normal file
View File

@ -0,0 +1,981 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "error.h"
#include "subvolume.h"
/* Snapshot tree: */
static void bch2_delete_dead_snapshots_work(struct work_struct *);
static void bch2_delete_dead_snapshots(struct bch_fs *);
void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
pr_buf(out, "is_subvol %llu deleted %llu parent %u children %u %u subvol %u",
BCH_SNAPSHOT_SUBVOL(s.v),
BCH_SNAPSHOT_DELETED(s.v),
le32_to_cpu(s.v->parent),
le32_to_cpu(s.v->children[0]),
le32_to_cpu(s.v->children[1]),
le32_to_cpu(s.v->subvol));
}
const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_snapshot s;
u32 i, id;
if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 ||
bkey_cmp(k.k->p, POS(0, 1)) < 0)
return "bad pos";
if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot))
return "bad val size";
s = bkey_s_c_to_snapshot(k);
id = le32_to_cpu(s.v->parent);
if (id && id <= k.k->p.offset)
return "bad parent node";
if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]))
return "children not normalized";
if (s.v->children[0] &&
s.v->children[0] == s.v->children[1])
return "duplicate child nodes";
for (i = 0; i < 2; i++) {
id = le32_to_cpu(s.v->children[i]);
if (id >= k.k->p.offset)
return "bad child node";
}
return NULL;
}
/*
 * Trigger: keep the in-memory snapshots radix tree in sync with the
 * snapshots btree.  Entries are indexed by U32_MAX - id so newer
 * (higher) snapshot IDs land near the start of the tree.
 */
int bch2_mark_snapshot(struct bch_fs *c,
		       struct bkey_s_c old, struct bkey_s_c new,
		       u64 journal_seq, unsigned flags)
{
	struct snapshot_t *entry =
		genradix_ptr_alloc(&c->snapshots,
				   U32_MAX - new.k->p.offset,
				   GFP_KERNEL);

	if (!entry)
		return -ENOMEM;

	if (new.k->type == KEY_TYPE_snapshot) {
		const struct bch_snapshot *v = bkey_s_c_to_snapshot(new).v;

		entry->parent		= le32_to_cpu(v->parent);
		entry->children[0]	= le32_to_cpu(v->children[0]);
		entry->children[1]	= le32_to_cpu(v->children[1]);
		entry->subvol		= BCH_SNAPSHOT_SUBVOL(v)
			? le32_to_cpu(v->subvol) : 0;
	} else {
		/* Key deleted/overwritten - clear the in-memory entry: */
		entry->parent		= 0;
		entry->children[0]	= 0;
		entry->children[1]	= 0;
		entry->subvol		= 0;
	}

	return 0;
}
/*
 * Look up subvolume @id; copy its value into @s on success.
 * Returns -ENOENT if no subvolume key exists at that position.
 */
static int subvol_lookup(struct btree_trans *trans, unsigned id, struct bch_subvolume *s)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, id), 0);

	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (!ret) {
		if (k.k->type == KEY_TYPE_subvolume)
			*s = *bkey_s_c_to_subvolume(k).v;
		else
			ret = -ENOENT;
	}

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
/*
 * Look up snapshot node @id; copy its value into @s on success.
 * BTREE_ITER_WITH_UPDATES so uncommitted updates in this transaction
 * are visible.  Returns -ENOENT if no snapshot key exists at @id.
 */
static int snapshot_lookup(struct btree_trans *trans, u32 id,
			   struct bch_snapshot *s)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
			     BTREE_ITER_WITH_UPDATES);

	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (!ret) {
		if (k.k->type == KEY_TYPE_snapshot)
			*s = *bkey_s_c_to_snapshot(k).v;
		else
			ret = -ENOENT;
	}

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
/*
 * Returns 1 if snapshot node @id exists and is not deleted, 0 if @id is
 * 0 (no node) or the node is marked deleted, negative on error.
 */
static int snapshot_live(struct btree_trans *trans, u32 id)
{
	struct bch_snapshot snap;
	int ret;

	if (!id)
		return 0;

	ret = lockrestart_do(trans, snapshot_lookup(trans, id, &snap));
	if (ret) {
		if (ret == -ENOENT)
			bch_err(trans->c, "snapshot node %u not found", id);
		return ret;
	}

	return BCH_SNAPSHOT_DELETED(&snap) ? 0 : 1;
}
/*
 * Walk the snapshots btree and compute each node's equivalence class:
 * a node with exactly one live child is equivalent to that child (it
 * can be collapsed); otherwise it is its own class.
 */
static int bch2_snapshots_set_equiv(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_s_c_snapshot snap;
	unsigned i;
	int ret;

	for_each_btree_key(trans, iter, BTREE_ID_snapshots,
			   POS_MIN, 0, k, ret) {
		u32 id = k.k->p.offset, child[2];
		/* live_idx defensively initialized; only read when nr_live == 1 */
		unsigned nr_live = 0, live_idx = 0;

		if (k.k->type != KEY_TYPE_snapshot)
			continue;

		snap = bkey_s_c_to_snapshot(k);
		child[0] = le32_to_cpu(snap.v->children[0]);
		child[1] = le32_to_cpu(snap.v->children[1]);

		for (i = 0; i < 2; i++) {
			/* snapshot_live() returns 0/1, or negative on error */
			ret = snapshot_live(trans, child[i]);
			if (ret < 0)
				break;

			if (ret)
				live_idx = i;
			nr_live += ret;
		}

		/*
		 * Fix: propagate a snapshot_live() error out of the outer
		 * loop - previously it only broke the inner loop, computed
		 * equiv from bogus state, and the next iteration clobbered
		 * ret, silently swallowing the error:
		 */
		if (ret < 0)
			break;

		snapshot_t(c, id)->equiv = nr_live == 1
			? snapshot_t(c, child[live_idx])->equiv
			: id;
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		bch_err(c, "error walking snapshots: %i", ret);

	return ret;
}
/* fsck: */

/*
 * Validate one snapshot node's cross references:
 *  - its subvol field points at an existing subvolume, and
 *    BCH_SNAPSHOT_SUBVOL is set iff that subvolume points back at us
 *  - its parent (if any) exists and lists us as a child
 *  - each child exists and points back at us as its parent
 *
 * Returns 0 if consistent, -EINVAL on a bad cross reference, or a
 * lookup error (including -ENOENT for missing referents).
 */
static int bch2_snapshot_check(struct btree_trans *trans,
			       struct bkey_s_c_snapshot s)
{
	struct bch_subvolume subvol;
	struct bch_snapshot v;
	u32 i, id;
	int ret;

	/* id is reused: first the subvolume, then parent, then each child */
	id = le32_to_cpu(s.v->subvol);
	ret = lockrestart_do(trans, subvol_lookup(trans, id, &subvol));
	if (ret == -ENOENT)
		bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u",
			s.k->p.offset, id);
	if (ret)
		return ret;

	if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) {
		bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
			s.k->p.offset);
		return -EINVAL;
	}

	id = le32_to_cpu(s.v->parent);
	if (id) {
		ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
		if (ret == -ENOENT)
			bch_err(trans->c, "snapshot node %llu has nonexistent parent %u",
				s.k->p.offset, id);
		if (ret)
			return ret;

		if (le32_to_cpu(v.children[0]) != s.k->p.offset &&
		    le32_to_cpu(v.children[1]) != s.k->p.offset) {
			bch_err(trans->c, "snapshot parent %u missing pointer to child %llu",
				id, s.k->p.offset);
			return -EINVAL;
		}
	}

	for (i = 0; i < 2 && s.v->children[i]; i++) {
		id = le32_to_cpu(s.v->children[i]);

		ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
		if (ret == -ENOENT)
			bch_err(trans->c, "snapshot node %llu has nonexistent child %u",
				s.k->p.offset, id);
		if (ret)
			return ret;

		if (le32_to_cpu(v.parent) != s.k->p.offset) {
			bch_err(trans->c, "snapshot child %u has wrong parent (got %u should be %llu)",
				id, le32_to_cpu(v.parent), s.k->p.offset);
			return -EINVAL;
		}
	}

	return 0;
}
/*
 * fsck pass: check every snapshot node's cross references, then verify
 * that every subvolume's snapshot pointer resolves to an existing
 * snapshot node.
 */
int bch2_fs_snapshots_check(struct bch_fs *c)
{
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_snapshot s;
	unsigned id;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);

	for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
			   POS_MIN, 0, k, ret) {
		if (k.k->type != KEY_TYPE_snapshot)
			continue;

		ret = bch2_snapshot_check(&trans, bkey_s_c_to_snapshot(k));
		if (ret)
			break;
	}
	bch2_trans_iter_exit(&trans, &iter);

	if (ret) {
		bch_err(c, "error %i checking snapshots", ret);
		goto err;
	}

	for_each_btree_key(&trans, iter, BTREE_ID_subvolumes,
			   POS_MIN, 0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;
again_2:
		id = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
		ret = snapshot_lookup(&trans, id, &s);

		/* -EINTR: transaction restarted; re-peek the key and retry */
		if (ret == -EINTR) {
			k = bch2_btree_iter_peek(&iter);
			goto again_2;
		} else if (ret == -ENOENT)
			bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
				k.k->p.offset, id);
		else if (ret)
			break;
	}
	bch2_trans_iter_exit(&trans, &iter);
err:
	bch2_trans_exit(&trans);
	return ret;
}
/* Free the in-memory snapshot table at filesystem shutdown. */
void bch2_fs_snapshots_exit(struct bch_fs *c)
{
	genradix_free(&c->snapshots);
}
/*
 * At startup: load the snapshots btree into the in-memory snapshot
 * table and compute equivalence classes.  If any nodes were marked
 * deleted before the previous shutdown, restart the deletion work
 * (synchronously when running fsck, otherwise in the background).
 */
int bch2_fs_snapshots_start(struct bch_fs *c)
{
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	bool have_deleted = false;
	int ret = 0;

	bch2_trans_init(&trans, c, 0, 0);

	for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
			   POS_MIN, 0, k, ret) {
		/* snapshot IDs are 32 bit; ignore anything past U32_MAX */
		if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0)
			break;

		if (k.k->type != KEY_TYPE_snapshot) {
			bch_err(c, "found wrong key type %u in snapshot node table",
				k.k->type);
			continue;
		}

		if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
			have_deleted = true;

		/* populate the in-memory snapshot table: */
		ret = bch2_mark_snapshot(c, bkey_s_c_null, k, 0, 0);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(&trans, &iter);

	if (ret)
		goto err;

	ret = bch2_snapshots_set_equiv(&trans);
	if (ret)
		goto err;
err:
	bch2_trans_exit(&trans);

	if (!ret && have_deleted) {
		bch_info(c, "restarting deletion of dead snapshots");
		if (c->opts.fsck) {
			/* run the work item synchronously during fsck: */
			bch2_delete_dead_snapshots_work(&c->snapshot_delete_work);
		} else {
			bch2_delete_dead_snapshots(c);
		}
	}

	return ret;
}
/*
 * Mark a snapshot node as deleted, for future cleanup by the dead
 * snapshot deletion worker.  No-op if it is already marked deleted;
 * -ENOENT (and an inconsistency report) if the node doesn't exist.
 */
static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_i_snapshot *s;
	int ret = 0;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
			     BTREE_ITER_INTENT);
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_snapshot) {
		bch2_fs_inconsistent(trans->c, "missing snapshot %u", id);
		ret = -ENOENT;
		goto err;
	}

	/* already deleted? */
	if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
		goto err;

	s = bch2_trans_kmalloc(trans, sizeof(*s));
	ret = PTR_ERR_OR_ZERO(s);
	if (ret)
		goto err;

	bkey_reassemble(&s->k_i, k);

	SET_BCH_SNAPSHOT_DELETED(&s->v, true);
	ret = bch2_trans_update(trans, &iter, &s->k_i, 0);
	if (ret)
		goto err;
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
/*
 * Actually delete a snapshot node (which must already be marked
 * deleted): remove its key from the snapshots btree and clear the
 * parent's child pointer to it.
 */
static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
{
	struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
	struct bkey_s_c k;
	struct bkey_s_c_snapshot s;
	struct bkey_i_snapshot *parent;
	u32 parent_id;
	unsigned i;
	int ret = 0;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
			     BTREE_ITER_INTENT);
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_snapshot) {
		bch2_fs_inconsistent(trans->c, "missing snapshot %u", id);
		ret = -ENOENT;
		goto err;
	}

	s = bkey_s_c_to_snapshot(k);

	/* only nodes already marked deleted may be removed: */
	BUG_ON(!BCH_SNAPSHOT_DELETED(s.v));
	parent_id = le32_to_cpu(s.v->parent);

	if (parent_id) {
		/* drop the parent's child pointer to this node: */
		bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots,
				     POS(0, parent_id),
				     BTREE_ITER_INTENT);
		k = bch2_btree_iter_peek_slot(&p_iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (k.k->type != KEY_TYPE_snapshot) {
			bch2_fs_inconsistent(trans->c, "missing snapshot %u", parent_id);
			ret = -ENOENT;
			goto err;
		}

		parent = bch2_trans_kmalloc(trans, sizeof(*parent));
		ret = PTR_ERR_OR_ZERO(parent);
		if (ret)
			goto err;

		bkey_reassemble(&parent->k_i, k);

		for (i = 0; i < 2; i++)
			if (le32_to_cpu(parent->v.children[i]) == id)
				break;

		/* missing back pointer is logged but not treated as fatal: */
		if (i == 2)
			bch_err(trans->c, "snapshot %u missing child pointer to %u",
				parent_id, id);
		else
			parent->v.children[i] = 0;

		/* keep children[] sorted with any zero slot last: */
		if (le32_to_cpu(parent->v.children[0]) <
		    le32_to_cpu(parent->v.children[1]))
			swap(parent->v.children[0],
			     parent->v.children[1]);

		ret = bch2_trans_update(trans, &p_iter, &parent->k_i, 0);
		if (ret)
			goto err;
	}

	ret = bch2_btree_delete_at(trans, &iter, 0);
err:
	bch2_trans_iter_exit(trans, &p_iter);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
/*
 * Allocate @nr_snapids new snapshot nodes as children of @parent (or
 * as roots if @parent is 0).  IDs are allocated descending: we scan
 * backwards from the end of the snapshots btree for empty slots.
 *
 * On success the new IDs are returned in @new_snapids, each new node's
 * subvol field is set from @snapshot_subvols, and the parent (if any)
 * is updated to point at its new children.
 */
static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
				     u32 *new_snapids,
				     u32 *snapshot_subvols,
				     unsigned nr_snapids)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *n;
	struct bkey_s_c k;
	unsigned i;
	int ret = 0;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
			     POS_MIN, BTREE_ITER_INTENT);
	k = bch2_btree_iter_peek(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	for (i = 0; i < nr_snapids; i++) {
		k = bch2_btree_iter_prev_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		/* walked off the front of the btree: out of snapshot IDs */
		if (!k.k || !k.k->p.offset) {
			ret = -ENOSPC;
			goto err;
		}

		n = bch2_trans_kmalloc(trans, sizeof(*n));
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			/* NOTE(review): returns without the goto err iter exit */
			return ret;

		bkey_snapshot_init(&n->k_i);
		n->k.p = iter.pos;
		n->v.flags = 0;
		n->v.parent = cpu_to_le32(parent);
		n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
		n->v.pad = 0;
		SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);

		bch2_trans_update(trans, &iter, &n->k_i, 0);

		/* update the in-memory snapshot table immediately: */
		ret = bch2_mark_snapshot(trans->c, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0, 0);
		if (ret)
			break;

		new_snapids[i] = iter.pos.offset;
	}

	if (parent) {
		/* point the parent at its two new children: */
		bch2_btree_iter_set_pos(&iter, POS(0, parent));
		k = bch2_btree_iter_peek(&iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (k.k->type != KEY_TYPE_snapshot) {
			bch_err(trans->c, "snapshot %u not found", parent);
			ret = -ENOENT;
			goto err;
		}

		n = bch2_trans_kmalloc(trans, sizeof(*n));
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			/* NOTE(review): returns without the goto err iter exit */
			return ret;

		bkey_reassemble(&n->k_i, k);

		if (n->v.children[0] || n->v.children[1]) {
			bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
			ret = -EINVAL;
			goto err;
		}

		n->v.children[0] = cpu_to_le32(new_snapids[0]);
		n->v.children[1] = cpu_to_le32(new_snapids[1]);
		SET_BCH_SNAPSHOT_SUBVOL(&n->v, false);
		bch2_trans_update(trans, &iter, &n->k_i, 0);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
/* List of snapshot IDs that are being deleted: */
struct snapshot_id_list {
	u32 nr;
	u32 size;
	u32 *d;
};

/* Linear search - lists here are expected to stay small. */
static bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id)
{
	u32 *p;

	for (p = s->d; p < s->d + s->nr; p++)
		if (*p == id)
			return true;

	return false;
}
/*
 * Append @id to @s, growing the backing array geometrically as needed
 * (minimum 8 entries).  The caller must not add duplicates - that's a
 * bug, checked with BUG_ON().
 *
 * Returns 0 on success, -ENOMEM if the array couldn't be grown (the
 * existing array is left intact).
 */
static int snapshot_id_add(struct snapshot_id_list *s, u32 id)
{
	BUG_ON(snapshot_list_has_id(s, id));

	if (s->nr == s->size) {
		size_t new_size = max(8U, s->size * 2);
		void *n = krealloc(s->d,
				   new_size * sizeof(s->d[0]),
				   GFP_KERNEL);
		if (!n) {
			pr_err("error allocating snapshot ID list");
			return -ENOMEM;
		}

		s->d	= n;
		s->size	= new_size;
	}	/* fixed: stray ';' after this brace was an empty statement */

	s->d[s->nr++] = id;
	return 0;
}
/*
 * For one btree, delete all keys belonging to snapshot IDs in
 * @deleted, plus keys that are shadowed within their equivalence
 * class: at each position, only the first key seen per equivalence
 * class is kept.
 */
static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans,
					   struct snapshot_id_list *deleted,
					   enum btree_id btree_id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct snapshot_id_list equiv_seen = { 0 };
	struct bpos last_pos = POS_MIN;
	int ret = 0;

	/*
	 * XXX: We should also delete whiteouts that no longer overwrite
	 * anything
	 */
	bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN,
			     BTREE_ITER_INTENT|
			     BTREE_ITER_PREFETCH|
			     BTREE_ITER_NOT_EXTENTS|
			     BTREE_ITER_ALL_SNAPSHOTS);

	/* restartable scan: bch2_trans_begin() before every peek */
	while ((bch2_trans_begin(trans),
		(k = bch2_btree_iter_peek(&iter)).k) &&
	       !(ret = bkey_err(k))) {
		u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv;

		/* new position: reset the set of equivalence classes seen */
		if (bkey_cmp(k.k->p, last_pos))
			equiv_seen.nr = 0;
		last_pos = k.k->p;

		if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
		    snapshot_list_has_id(&equiv_seen, equiv)) {
			/*
			 * Flush any cached copy of the key first (the inodes
			 * btree uses the key cache); on nonzero return, retry
			 * this iteration without advancing.
			 * NOTE(review): confirm flush-failure semantics.
			 */
			if (btree_id == BTREE_ID_inodes &&
			    bch2_btree_key_cache_flush(trans, btree_id, iter.pos))
				continue;

			ret = __bch2_trans_do(trans, NULL, NULL,
					      BTREE_INSERT_NOFAIL,
				bch2_btree_iter_traverse(&iter) ?:
				bch2_btree_delete_at(trans, &iter,
					BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
			if (ret)
				break;
		} else {
			ret = snapshot_id_add(&equiv_seen, equiv);
			if (ret)
				break;
		}

		bch2_btree_iter_advance(&iter);
	}
	bch2_trans_iter_exit(trans, &iter);

	kfree(equiv_seen.d);

	return ret;
}
/*
 * Dead snapshot deletion worker:
 *
 *  1) mark as deleted every node with no live children that isn't
 *     referenced by a subvolume
 *  2) recompute equivalence classes
 *  3) collect the full list of deleted node ids
 *  4) delete their keys from every snapshot-aware btree
 *  5) delete the snapshot nodes themselves
 *
 * Drops the c->writes ref taken by bch2_delete_dead_snapshots().
 */
static void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_s_c_snapshot snap;
	struct snapshot_id_list deleted = { 0 };
	u32 i, id, children[2];
	int ret = 0;

	bch2_trans_init(&trans, c, 0, 0);

	/*
	 * For every snapshot node: If we have no live children and it's not
	 * pointed to by a subvolume, delete it:
	 */
	for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
			   POS_MIN, 0, k, ret) {
		if (k.k->type != KEY_TYPE_snapshot)
			continue;

		snap = bkey_s_c_to_snapshot(k);
		if (BCH_SNAPSHOT_DELETED(snap.v) ||
		    BCH_SNAPSHOT_SUBVOL(snap.v))
			continue;

		children[0] = le32_to_cpu(snap.v->children[0]);
		children[1] = le32_to_cpu(snap.v->children[1]);

		/* ?: short circuits: nonzero (live or error) stops at first */
		ret = snapshot_live(&trans, children[0]) ?:
		      snapshot_live(&trans, children[1]);
		if (ret < 0)
			break;
		/* at least one live child - keep this node: */
		if (ret)
			continue;

		ret = __bch2_trans_do(&trans, NULL, NULL, 0,
			bch2_snapshot_node_set_deleted(&trans, iter.pos.offset));
		if (ret) {
			bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret);
			break;
		}
	}
	bch2_trans_iter_exit(&trans, &iter);

	if (ret) {
		bch_err(c, "error walking snapshots: %i", ret);
		goto err;
	}

	ret = bch2_snapshots_set_equiv(&trans);
	if (ret)
		goto err;

	/* collect all nodes marked deleted, now or on a previous run: */
	for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
			   POS_MIN, 0, k, ret) {
		if (k.k->type != KEY_TYPE_snapshot)
			continue;

		snap = bkey_s_c_to_snapshot(k);
		if (BCH_SNAPSHOT_DELETED(snap.v)) {
			ret = snapshot_id_add(&deleted, k.k->p.offset);
			if (ret)
				break;
		}
	}
	bch2_trans_iter_exit(&trans, &iter);

	if (ret) {
		bch_err(c, "error walking snapshots: %i", ret);
		goto err;
	}

	/* delete their keys from every btree that supports snapshots: */
	for (id = 0; id < BTREE_ID_NR; id++) {
		if (!btree_type_has_snapshots(id))
			continue;

		ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id);
		if (ret) {
			bch_err(c, "error deleting snapshot keys: %i", ret);
			goto err;
		}
	}

	/* finally, remove the snapshot nodes themselves: */
	for (i = 0; i < deleted.nr; i++) {
		ret = __bch2_trans_do(&trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(&trans, deleted.d[i]));
		if (ret) {
			bch_err(c, "error deleting snapshot %u: %i",
				deleted.d[i], ret);
			goto err;
		}
	}
err:
	kfree(deleted.d);
	bch2_trans_exit(&trans);
	percpu_ref_put(&c->writes);
}
/*
 * Kick off asynchronous deletion of dead snapshots.  Takes a ref on
 * c->writes, which is dropped by the work item - or here, if the work
 * was already queued or the ref couldn't be taken.
 */
static void bch2_delete_dead_snapshots(struct bch_fs *c)
{
	if (likely(percpu_ref_tryget(&c->writes))) {
		if (!queue_work(system_long_wq, &c->snapshot_delete_work))
			percpu_ref_put(&c->writes);
	}
}
/*
 * Transaction commit hook: start snapshot deletion only after the
 * transaction that marked a snapshot deleted has committed.
 */
static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
					   struct btree_trans_commit_hook *h)
{
	bch2_delete_dead_snapshots(trans->c);
	return 0;
}
/* Subvolumes: */
/*
 * bkey_ops .key_invalid hook for KEY_TYPE_subvolume: returns an error
 * string if the key is malformed, NULL if it's valid.
 */
const char *bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
	if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0 ||
	    bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
		return "invalid pos";

	if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume))
		return "bad val size";

	return NULL;
}
/* Print a subvolume value: its root inode number and snapshot ID. */
void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
			    struct bkey_s_c k)
{
	struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);

	pr_buf(out, "root %llu snapshot id %u",
	       le64_to_cpu(s.v->inode),
	       le32_to_cpu(s.v->snapshot));
}
/*
 * Look up the snapshot ID that subvolume @subvol currently points to,
 * storing it in *@snapid.
 *
 * A missing subvolume is a filesystem inconsistency here (callers pass
 * ids that should exist): reports it and returns -EIO.
 */
int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol,
				u32 *snapid)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
			     POS(0, subvol),
			     BTREE_ITER_CACHED|
			     BTREE_ITER_WITH_UPDATES);
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_subvolume) {
		bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol);
		ret = -EIO;
		goto err;
	}

	*snapid = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
/* XXX: mark snapshot id for deletion, walk btree and delete: */
/*
 * Delete subvolume @subvolid: remove its key, mark its snapshot node
 * deleted, and install a commit hook that kicks off background cleanup
 * of dead snapshots once the transaction commits.
 *
 * @deleting_snapshot: if >= 0, must match the subvolume's
 * BCH_SUBVOLUME_SNAP flag, otherwise -ENOENT is returned (guards
 * against deleting a snapshot via the plain-subvolume path and vice
 * versa).
 */
int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid,
			  int deleting_snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_s_c_subvolume subvol;
	struct btree_trans_commit_hook *h;
	struct bkey_i *delete;
	u32 snapid;
	int ret = 0;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
			     POS(0, subvolid),
			     BTREE_ITER_CACHED|
			     BTREE_ITER_INTENT);
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_subvolume) {
		bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid);
		ret = -EIO;
		goto err;
	}

	subvol = bkey_s_c_to_subvolume(k);
	snapid = le32_to_cpu(subvol.v->snapshot);

	if (deleting_snapshot >= 0 &&
	    deleting_snapshot != BCH_SUBVOLUME_SNAP(subvol.v)) {
		ret = -ENOENT;
		goto err;
	}

	delete = bch2_trans_kmalloc(trans, sizeof(*delete));
	ret = PTR_ERR_OR_ZERO(delete);
	if (ret)
		goto err;

	bkey_init(&delete->k);
	delete->k.p = iter.pos;
	ret = bch2_trans_update(trans, &iter, delete, 0);
	if (ret)
		goto err;

	ret = bch2_snapshot_node_set_deleted(trans, snapid);
	/*
	 * Fix: this error was previously discarded - the next assignment to
	 * ret from PTR_ERR_OR_ZERO() overwrote it unconditionally.
	 */
	if (ret)
		goto err;

	h = bch2_trans_kmalloc(trans, sizeof(*h));
	ret = PTR_ERR_OR_ZERO(h);
	if (ret)
		goto err;

	h->fn = bch2_delete_dead_snapshots_hook;
	bch2_trans_commit_hook(trans, h);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
/*
 * Create a new subvolume rooted at @inode, either from scratch or as a
 * snapshot of @src_subvolid (if nonzero).  When snapshotting, both the
 * source and the new subvolume get fresh snapshot nodes that are
 * children of the source's current node.
 *
 * On success, *@new_subvolid and *@new_snapshotid are set for the new
 * subvolume.  @ro marks the new subvolume read-only.
 */
int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
			  u32 src_subvolid,
			  u32 *new_subvolid,
			  u32 *new_snapshotid,
			  bool ro)
{
	struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
	struct bkey_i_subvolume *new_subvol = NULL;
	struct bkey_i_subvolume *src_subvol = NULL;
	struct bkey_s_c k;
	u32 parent = 0, new_nodes[2], snapshot_subvols[2];
	int ret = 0;

	/* find the first empty slot in the subvolumes btree: */
	for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN,
			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
		if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
			break;
		if (bkey_deleted(k.k))
			goto found_slot;
	}

	if (!ret)
		ret = -ENOSPC;
	goto err;
found_slot:
	/* node 0 is for the new subvol, node 1 (if any) for the source: */
	snapshot_subvols[0] = dst_iter.pos.offset;
	snapshot_subvols[1] = src_subvolid;

	if (src_subvolid) {
		/* Creating a snapshot: */
		src_subvol = bch2_trans_kmalloc(trans, sizeof(*src_subvol));
		ret = PTR_ERR_OR_ZERO(src_subvol);
		if (ret)
			goto err;

		bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes,
				     POS(0, src_subvolid),
				     BTREE_ITER_CACHED|
				     BTREE_ITER_INTENT);
		k = bch2_btree_iter_peek_slot(&src_iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (k.k->type != KEY_TYPE_subvolume) {
			bch_err(trans->c, "subvolume %u not found", src_subvolid);
			ret = -ENOENT;
			goto err;
		}

		bkey_reassemble(&src_subvol->k_i, k);
		parent = le32_to_cpu(src_subvol->v.snapshot);
	}

	ret = bch2_snapshot_node_create(trans, parent, new_nodes,
					snapshot_subvols,
					src_subvolid ? 2 : 1);
	if (ret)
		goto err;

	if (src_subvolid) {
		/* the source moves onto its new snapshot node: */
		src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
		bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
	}

	new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
	ret = PTR_ERR_OR_ZERO(new_subvol);
	if (ret)
		goto err;

	bkey_subvolume_init(&new_subvol->k_i);
	new_subvol->v.flags = 0;
	new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
	new_subvol->v.inode = cpu_to_le64(inode);
	SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
	SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
	new_subvol->k.p = dst_iter.pos;
	bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0);

	*new_subvolid = new_subvol->k.p.offset;
	*new_snapshotid = new_nodes[0];
err:
	bch2_trans_iter_exit(trans, &src_iter);
	bch2_trans_iter_exit(trans, &dst_iter);
	return ret;
}
/* One-time init of subvolume state, called at filesystem allocation. */
int bch2_fs_subvolumes_init(struct bch_fs *c)
{
	INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
	return 0;
}

115
libbcachefs/subvolume.h Normal file
View File

@ -0,0 +1,115 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_SUBVOLUME_H
#define _BCACHEFS_SUBVOLUME_H
void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_snapshot (struct bkey_ops) { \
.key_invalid = bch2_snapshot_invalid, \
.val_to_text = bch2_snapshot_to_text, \
}
int bch2_mark_snapshot(struct bch_fs *, struct bkey_s_c,
struct bkey_s_c, u64, unsigned);
/*
 * Look up a snapshot node in the in-memory table.  Snapshot IDs are
 * allocated descending from U32_MAX, so index by U32_MAX - id to keep
 * the radix tree dense.
 */
static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
{
	return genradix_ptr(&c->snapshots, U32_MAX - id);
}
/* Parent snapshot ID of @id, or 0 if @id is a root node. */
static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
{
	return snapshot_t(c, id)->parent;
}
/*
 * True if @id has at least one child.  Returns 0 or 1 despite the u32
 * return type.
 */
static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id)
{
	struct snapshot_t *s = snapshot_t(c, id);

	return s->children[0] || s->children[1];
}
/*
 * Return the other child of @id's parent, or 0 if @id is a root node
 * or the parent doesn't list @id as a child.
 */
static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id)
{
	u32 parent = bch2_snapshot_parent(c, id);
	struct snapshot_t *p;

	if (!parent)
		return 0;

	p = snapshot_t(c, parent);

	if (p->children[0] == id)
		return p->children[1];
	if (p->children[1] == id)
		return p->children[0];

	return 0;
}
/*
 * Walk up the snapshot tree from @id to check whether @ancestor is an
 * ancestor of (or equal to) @id.  Because IDs are allocated descending,
 * children always have numerically smaller IDs than their parents, so
 * the walk can stop as soon as id >= ancestor.
 */
static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
	while (id && id < ancestor)
		id = bch2_snapshot_parent(c, id);

	return id == ancestor;
}
/* Growable set of snapshot IDs, with the position they were seen at. */
struct snapshots_seen {
	struct bpos pos;	/* presumably the key position the ids belong to - confirm against users */
	size_t nr;		/* number of ids currently in d */
	size_t size;		/* allocated capacity of d */
	u32 *d;			/* heap-allocated array of snapshot ids */
};
/*
 * Free the id array.  NOTE(review): nr/size are not reset here, so the
 * struct must go through snapshots_seen_init() before being reused.
 */
static inline void snapshots_seen_exit(struct snapshots_seen *s)
{
	kfree(s->d);
	s->d = NULL;
}
/* Zero-initialize; no allocation until the first snapshots_seen_add(). */
static inline void snapshots_seen_init(struct snapshots_seen *s)
{
	memset(s, 0, sizeof(*s));
}
/*
 * Append @id, growing the array geometrically (first growth jumps to
 * 256 entries: max(size, 128) * 2 - note this differs from
 * snapshot_id_add()'s minimum of 8).  Returns -ENOMEM on allocation
 * failure, leaving the existing array intact.
 */
static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id)
{
	if (s->nr == s->size) {
		size_t new_size = max(s->size, 128UL) * 2;
		u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL);

		if (!d) {
			bch_err(c, "error reallocating snapshots_seen table (new size %zu)",
				new_size);
			return -ENOMEM;
		}

		s->size = new_size;
		s->d	= d;
	}

	s->d[s->nr++] = id;
	return 0;
}
int bch2_fs_snapshots_check(struct bch_fs *);
void bch2_fs_snapshots_exit(struct bch_fs *);
int bch2_fs_snapshots_start(struct bch_fs *);
const char *bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_subvolume (struct bkey_ops) { \
.key_invalid = bch2_subvolume_invalid, \
.val_to_text = bch2_subvolume_to_text, \
}
int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
int bch2_subvolume_delete(struct btree_trans *, u32, int);
int bch2_subvolume_create(struct btree_trans *, u64, u32,
u32 *, u32 *, bool);
int bch2_fs_subvolumes_init(struct bch_fs *);
#endif /* _BCACHEFS_SUBVOLUME_H */

View File

@ -39,6 +39,7 @@
#include "rebalance.h"
#include "recovery.h"
#include "replicas.h"
#include "subvolume.h"
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
@ -468,6 +469,7 @@ static void __bch2_fs_free(struct bch_fs *c)
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
bch2_fs_snapshots_exit(c);
bch2_fs_quota_exit(c);
bch2_fs_fsio_exit(c);
bch2_fs_ec_exit(c);
@ -686,6 +688,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mutex_init(&c->usage_scratch_lock);
mutex_init(&c->bio_bounce_pages_lock);
mutex_init(&c->snapshot_table_lock);
spin_lock_init(&c->btree_write_error_lock);
@ -789,6 +792,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_btree_key_cache_init(&c->btree_key_cache) ||
bch2_fs_btree_iter_init(c) ||
bch2_fs_btree_interior_update_init(c) ||
bch2_fs_subvolumes_init(c) ||
bch2_fs_io_init(c) ||
bch2_fs_encryption_init(c) ||
bch2_fs_compress_init(c) ||

View File

@ -128,7 +128,7 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info
int ret;
ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
inode->v.i_ino,
inode_inum(inode),
&X_SEARCH(type, name, strlen(name)),
0);
if (ret)
@ -160,7 +160,7 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
bch2_xattr_get_trans(&trans, inode, name, buffer, size, type));
}
int bch2_xattr_set(struct btree_trans *trans, u64 inum,
int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
const struct bch_hash_info *hash_info,
const char *name, const void *value, size_t size,
int type, int flags)
@ -282,13 +282,21 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
struct btree_iter iter;
struct bkey_s_c k;
struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
u64 inum = dentry->d_inode->i_ino;
u64 offset = 0, inum = inode->ei_inode.bi_inum;
u32 snapshot;
int ret;
bch2_trans_init(&trans, c, 0, 0);
retry:
bch2_trans_begin(&trans);
iter = (struct btree_iter) { NULL };
ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot);
if (ret)
goto err;
for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
POS(inum, 0), 0, k, ret) {
SPOS(inum, offset, snapshot), 0, k, ret) {
BUG_ON(k.k->p.inode < inum);
if (k.k->p.inode > inum)
@ -301,7 +309,12 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
if (ret)
break;
}
offset = iter.pos.offset;
bch2_trans_iter_exit(&trans, &iter);
err:
if (ret == -EINTR)
goto retry;
ret = bch2_trans_exit(&trans) ?: ret;
@ -340,7 +353,7 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler,
struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0,
bch2_xattr_set(&trans, inode->v.i_ino, &hash,
bch2_xattr_set(&trans, inode_inum(inode), &hash,
name, value, size,
handler->flags, flags));
}

View File

@ -39,7 +39,8 @@ struct bch_inode_info;
int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *,
const char *, void *, size_t, int);
int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *,
int bch2_xattr_set(struct btree_trans *, subvol_inum,
const struct bch_hash_info *,
const char *, const void *, size_t, int, int);
ssize_t bch2_xattr_list(struct dentry *, char *, size_t);