From 75c7148e0aff2184c75a52e7c4c58e46e715757b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Jul 2018 00:43:23 -0400 Subject: [PATCH] Update bcachefs sources to 940d6ca657 bcachefs: acl code improvements --- .bcachefs_revision | 2 +- cmd_migrate.c | 4 +- include/linux/dcache.h | 19 -- include/linux/kernel.h | 19 ++ libbcachefs/acl.c | 303 +++++++++++-------- libbcachefs/acl.h | 29 -- libbcachefs/bkey.h | 25 +- libbcachefs/bset.c | 112 ++++--- libbcachefs/bset.h | 74 +++-- libbcachefs/btree_cache.c | 85 +++--- libbcachefs/btree_cache.h | 6 +- libbcachefs/btree_io.c | 2 +- libbcachefs/btree_iter.c | 436 +++++++++++++++++----------- libbcachefs/btree_iter.h | 25 +- libbcachefs/btree_locking.h | 8 +- libbcachefs/btree_types.h | 22 +- libbcachefs/btree_update_interior.c | 10 +- libbcachefs/btree_update_interior.h | 9 +- libbcachefs/btree_update_leaf.c | 20 +- libbcachefs/dirent.c | 6 +- libbcachefs/extents.c | 34 +-- libbcachefs/fs-io.c | 50 ++-- libbcachefs/fsck.c | 3 - libbcachefs/journal_seq_blacklist.c | 3 +- libbcachefs/tests.c | 243 +++++++++++++++- libbcachefs/xattr.c | 58 ++-- libbcachefs/xattr.h | 28 +- 27 files changed, 1021 insertions(+), 614 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index a8916efb..f1807172 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -2cb70a82bc0ca05d8c3cf666d221badd5724e339 +940d6ca657ea70758f3f43323bfd531019a40d3c diff --git a/cmd_migrate.c b/cmd_migrate.c index db20b71c..61866534 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -239,8 +239,8 @@ static void copy_xattrs(struct bch_fs *c, struct bch_inode_unpacked *dst, const struct xattr_handler *h = xattr_resolve_name(&attr); - int ret = __bch2_xattr_set(c, dst->bi_inum, &hash_info, attr, - val, val_size, 0, h->flags, NULL); + int ret = bch2_xattr_set(c, dst->bi_inum, &hash_info, attr, + val, val_size, 0, h->flags, NULL); if (ret < 0) die("error creating xattr: %s", strerror(-ret)); } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 15b803ea..7637854d 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -4,25 +4,6 @@ struct super_block; struct inode; -/* The hash is always the low bits of hash_len */ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - #define HASH_LEN_DECLARE u32 hash; u32 len -#else - #define HASH_LEN_DECLARE u32 len; u32 hash -#endif - -struct qstr { - union { - struct { - HASH_LEN_DECLARE; - }; - u64 hash_len; - }; - const unsigned char *name; -}; - -#define QSTR_INIT(n,l) { { { .len = l } }, .name = n } - struct dentry { struct super_block *d_sb; struct inode *d_inode; diff --git a/include/linux/kernel.h b/include/linux/kernel.h index b6afea43..a4c8149e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -222,4 +222,23 @@ static inline int __must_check kstrtos32(const char *s, unsigned int base, s32 * BUILD_BUG_ON_ZERO((perms) & 2) + \ (perms)) +/* The hash is always the low bits of hash_len */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + #define HASH_LEN_DECLARE u32 hash; u32 len +#else + #define HASH_LEN_DECLARE u32 len; u32 hash +#endif + +struct qstr { + union { + struct { + HASH_LEN_DECLARE; + }; + u64 hash_len; + }; + const unsigned char *name; +}; + +#define QSTR_INIT(n,l) { { { .len = l } }, .name = n } + #endif diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c index 29774e5d..a8735bc0 100644 --- a/libbcachefs/acl.c +++ b/libbcachefs/acl.c @@ -12,96 +12,175 @@ #include "fs.h" #include "xattr.h" +static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) +{ + 
return sizeof(bch_acl_header) + + sizeof(bch_acl_entry_short) * nr_short + + sizeof(bch_acl_entry) * nr_long; +} + +static inline int acl_to_xattr_type(int type) +{ + switch (type) { + case ACL_TYPE_ACCESS: + return BCH_XATTR_INDEX_POSIX_ACL_ACCESS; + case ACL_TYPE_DEFAULT: + return BCH_XATTR_INDEX_POSIX_ACL_DEFAULT; + default: + BUG(); + } +} + /* * Convert from filesystem to in-memory representation. */ static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size) { - const char *end = (char *)value + size; - int n, count; + const void *p, *end = value + size; struct posix_acl *acl; + struct posix_acl_entry *out; + unsigned count = 0; if (!value) return NULL; if (size < sizeof(bch_acl_header)) - return ERR_PTR(-EINVAL); + goto invalid; if (((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION)) - return ERR_PTR(-EINVAL); - value = (char *)value + sizeof(bch_acl_header); - count = bch2_acl_count(size); - if (count < 0) - return ERR_PTR(-EINVAL); - if (count == 0) - return NULL; - acl = posix_acl_alloc(count, GFP_KERNEL); - if (!acl) - return ERR_PTR(-ENOMEM); - for (n = 0; n < count; n++) { - bch_acl_entry *entry = - (bch_acl_entry *)value; - if ((char *)value + sizeof(bch_acl_entry_short) > end) - goto fail; - acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); - acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); - switch (acl->a_entries[n].e_tag) { + goto invalid; + + p = value + sizeof(bch_acl_header); + while (p < end) { + const bch_acl_entry *entry = p; + + if (p + sizeof(bch_acl_entry_short) > end) + goto invalid; + + switch (le16_to_cpu(entry->e_tag)) { case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: - value = (char *)value + - sizeof(bch_acl_entry_short); + p += sizeof(bch_acl_entry_short); break; - case ACL_USER: - value = (char *)value + sizeof(bch_acl_entry); - if ((char *)value > end) - goto fail; - acl->a_entries[n].e_uid = - make_kuid(&init_user_ns, - le32_to_cpu(entry->e_id)); + case ACL_GROUP: + p += sizeof(bch_acl_entry); + break; + default: + goto invalid; + } + + count++; + } + + if (p > end) + goto invalid; + + if (!count) + return NULL; + + acl = posix_acl_alloc(count, GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); + + out = acl->a_entries; + + p = value + sizeof(bch_acl_header); + while (p < end) { + const bch_acl_entry *in = p; + + out->e_tag = le16_to_cpu(in->e_tag); + out->e_perm = le16_to_cpu(in->e_perm); + + switch (out->e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + p += sizeof(bch_acl_entry_short); + break; + case ACL_USER: + out->e_uid = make_kuid(&init_user_ns, + le32_to_cpu(in->e_id)); + p += sizeof(bch_acl_entry); break; case ACL_GROUP: - value = (char *)value + sizeof(bch_acl_entry); - if ((char *)value > end) - goto fail; - acl->a_entries[n].e_gid = - make_kgid(&init_user_ns, - le32_to_cpu(entry->e_id)); + out->e_gid = make_kgid(&init_user_ns, + le32_to_cpu(in->e_id)); + p += sizeof(bch_acl_entry); break; - - default: - goto fail; } - } - if (value != end) - goto fail; - return acl; -fail: - posix_acl_release(acl); + out++; + } + + BUG_ON(out != acl->a_entries + acl->a_count); + + return acl; +invalid: + pr_err("invalid acl entry"); return ERR_PTR(-EINVAL); } +#define acl_for_each_entry(acl, acl_e) \ + for (acl_e = acl->a_entries; \ + acl_e < acl->a_entries + acl->a_count; \ + acl_e++) + /* * Convert from in-memory to filesystem representation. 
 */
-static void *bch2_acl_to_disk(const struct posix_acl *acl, size_t *size)
+static struct bkey_i_xattr *
+bch2_acl_to_xattr(const struct posix_acl *acl,
+		  int type)
 {
-	bch_acl_header *ext_acl;
-	char *e;
-	size_t n;
+	struct bkey_i_xattr *xattr;
+	bch_acl_header *acl_header;
+	const struct posix_acl_entry *acl_e;
+	void *outptr;
+	unsigned nr_short = 0, nr_long = 0, acl_len, u64s;
 
-	*size = bch2_acl_size(acl->a_count);
-	ext_acl = kmalloc(sizeof(bch_acl_header) + acl->a_count *
-			sizeof(bch_acl_entry), GFP_KERNEL);
-	if (!ext_acl)
-		return ERR_PTR(-ENOMEM);
-	ext_acl->a_version = cpu_to_le32(BCH_ACL_VERSION);
-	e = (char *)ext_acl + sizeof(bch_acl_header);
-	for (n = 0; n < acl->a_count; n++) {
-		const struct posix_acl_entry *acl_e = &acl->a_entries[n];
-		bch_acl_entry *entry = (bch_acl_entry *)e;
+	acl_for_each_entry(acl, acl_e) {
+		switch (acl_e->e_tag) {
+		case ACL_USER:
+		case ACL_GROUP:
+			nr_long++;
+			break;
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			nr_short++;
+			break;
+		default:
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	acl_len = bch2_acl_size(nr_short, nr_long);
+	u64s = BKEY_U64s + xattr_val_u64s(0, acl_len);
+
+	if (u64s > U8_MAX)
+		return ERR_PTR(-E2BIG);
+
+	xattr = kmalloc(u64s * sizeof(u64), GFP_KERNEL);
+	if (!xattr)
+		return ERR_PTR(-ENOMEM);
+
+	bkey_xattr_init(&xattr->k_i);
+	xattr->k.u64s		= u64s;
+	xattr->v.x_type		= acl_to_xattr_type(type);
+	xattr->v.x_name_len	= 0;
+	xattr->v.x_val_len	= cpu_to_le16(acl_len);
+
+	acl_header = xattr_val(&xattr->v);
+	acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION);
+
+	outptr = (void *) acl_header + sizeof(*acl_header);
+
+	acl_for_each_entry(acl, acl_e) {
+		bch_acl_entry *entry = outptr;
 
 		entry->e_tag = cpu_to_le16(acl_e->e_tag);
 		entry->e_perm = cpu_to_le16(acl_e->e_perm);
@@ -109,70 +188,54 @@ static void *bch2_acl_to_disk(const struct posix_acl *acl, size_t *size)
 		case ACL_USER:
 			entry->e_id = cpu_to_le32(
 				from_kuid(&init_user_ns, acl_e->e_uid));
-			e += sizeof(bch_acl_entry);
+			outptr += sizeof(bch_acl_entry);
 			break;
 		case ACL_GROUP:
 			entry->e_id = cpu_to_le32(
 				from_kgid(&init_user_ns, acl_e->e_gid));
-			e += sizeof(bch_acl_entry);
+			outptr += sizeof(bch_acl_entry);
 			break;
 
 		case ACL_USER_OBJ:
 		case ACL_GROUP_OBJ:
 		case ACL_MASK:
 		case ACL_OTHER:
-			e += sizeof(bch_acl_entry_short);
+			outptr += sizeof(bch_acl_entry_short);
 			break;
-
-		default:
-			goto fail;
 		}
 	}
-	return (char *)ext_acl;
-fail:
-	kfree(ext_acl);
-	return ERR_PTR(-EINVAL);
+	BUG_ON(outptr != xattr_val(&xattr->v) + acl_len);
+
+	return xattr;
 }
 
 struct posix_acl *bch2_get_acl(struct inode *vinode, int type)
 {
 	struct bch_inode_info *inode = to_bch_ei(vinode);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	int name_index;
-	char *value = NULL;
-	struct posix_acl *acl;
-	int ret;
+	struct btree_iter iter;
+	struct bkey_s_c_xattr xattr;
+	struct bkey_s_c k;
+	struct posix_acl *acl = NULL;
+	int name_index = acl_to_xattr_type(type);
 
-	switch (type) {
-	case ACL_TYPE_ACCESS:
-		name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
-		break;
-	case ACL_TYPE_DEFAULT:
-		name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
-		break;
-	default:
-		BUG();
+	k = bch2_xattr_get_iter(c, &iter, inode, "", name_index);
+	if (IS_ERR(k.k)) {
+		if (PTR_ERR(k.k) != -ENOENT)
			acl = ERR_CAST(k.k);
+		goto out;
 	}
-	ret = bch2_xattr_get(c, inode, "", NULL, 0, name_index);
-	if (ret > 0) {
-		value = kmalloc(ret, GFP_KERNEL);
-		if (!value)
-			return ERR_PTR(-ENOMEM);
-		ret = bch2_xattr_get(c, inode, "", value,
-				     ret, name_index);
-	}
-	if (ret > 0)
-		acl = bch2_acl_from_disk(value, ret);
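
[Aside, not part of the patch: a rough sketch of the sizing rule bch2_acl_to_xattr() now enforces. The ACL is packed into a single xattr key, so the only hard limit is the U8_MAX bound on the key's u64s. The on-disk sizes assumed here (4-byte bch_acl_header, 4-byte short entries, 8-byte long entries with e_id) and the simplified stand-ins for xattr_val_u64s() and BKEY_U64s are inferred from the surrounding code, not quoted from it:

	/* sketch: u64s needed for an ACL packed into one xattr key */
	#include <stdio.h>

	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
	#define BKEY_U64S		5	/* assumed: sizeof(struct bkey) / sizeof(u64) */
	#define XATTR_HDR		4	/* assumed: x_type + x_name_len + x_val_len */

	/* simplified model of xattr_val_u64s(name_len, val_len) */
	static unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
	{
		return DIV_ROUND_UP(XATTR_HDR + name_len + val_len, 8);
	}

	int main(void)
	{
		unsigned nr_short = 4, nr_long = 2;	/* owner/group/mask/other + two named entries */
		unsigned acl_len = 4 + 4 * nr_short + 8 * nr_long;	/* bch2_acl_size() = 36 */
		unsigned u64s = BKEY_U64S + xattr_val_u64s(0, acl_len);

		/* 36 value bytes round up to 5 value u64s; 10 u64s total, far below U8_MAX */
		printf("acl_len %u u64s %u\n", acl_len, u64s);
		return 0;
	}

The diff resumes below.]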
- else if (ret == -ENODATA || ret == -ENOSYS) - acl = NULL; - else - acl = ERR_PTR(ret); - kfree(value); + + xattr = bkey_s_c_to_xattr(k); + + acl = bch2_acl_from_disk(xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); if (!IS_ERR(acl)) set_cached_acl(&inode->v, type, acl); - +out: + bch2_btree_iter_unlock(&iter); return acl; } @@ -180,37 +243,31 @@ int __bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type) { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int name_index; - void *value = NULL; - size_t size = 0; int ret; - switch (type) { - case ACL_TYPE_ACCESS: - name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS; - break; - case ACL_TYPE_DEFAULT: - name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT; - if (!S_ISDIR(inode->v.i_mode)) - return acl ? -EACCES : 0; - break; - - default: - return -EINVAL; - } + if (type == ACL_TYPE_DEFAULT && + !S_ISDIR(inode->v.i_mode)) + return acl ? -EACCES : 0; if (acl) { - value = bch2_acl_to_disk(acl, &size); - if (IS_ERR(value)) - return (int)PTR_ERR(value); + struct bkey_i_xattr *xattr = + bch2_acl_to_xattr(acl, type); + if (IS_ERR(xattr)) + return PTR_ERR(xattr); + + ret = bch2_hash_set(bch2_xattr_hash_desc, &inode->ei_str_hash, + c, inode->v.i_ino, &inode->ei_journal_seq, + &xattr->k_i, 0); + kfree(xattr); + } else { + struct xattr_search_key search = + X_SEARCH(acl_to_xattr_type(type), "", 0); + + ret = bch2_hash_delete(bch2_xattr_hash_desc, &inode->ei_str_hash, + c, inode->v.i_ino, &inode->ei_journal_seq, + &search); } - ret = bch2_xattr_set(c, inode, "", value, size, 0, name_index); - kfree(value); - - if (ret == -ERANGE) - ret = -E2BIG; - if (!ret) set_cached_acl(&inode->v, type, acl); diff --git a/libbcachefs/acl.h b/libbcachefs/acl.h index a66338d4..0be31ee9 100644 --- a/libbcachefs/acl.h +++ b/libbcachefs/acl.h @@ -20,35 +20,6 @@ typedef struct { __le32 a_version; } bch_acl_header; -static inline size_t bch2_acl_size(int count) -{ - if (count <= 4) { - return sizeof(bch_acl_header) + - count * sizeof(bch_acl_entry_short); - } else { - return sizeof(bch_acl_header) + - 4 * sizeof(bch_acl_entry_short) + - (count - 4) * sizeof(bch_acl_entry); - } -} - -static inline int bch2_acl_count(size_t size) -{ - ssize_t s; - - size -= sizeof(bch_acl_header); - s = size - 4 * sizeof(bch_acl_entry_short); - if (s < 0) { - if (size % sizeof(bch_acl_entry_short)) - return -1; - return size / sizeof(bch_acl_entry_short); - } else { - if (s % sizeof(bch_acl_entry)) - return -1; - return s / sizeof(bch_acl_entry) + 4; - } -} - struct posix_acl; extern struct posix_acl *bch2_get_acl(struct inode *, int); diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 2d6c8a23..2f62bd8e 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -52,21 +52,6 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64)); } -/* - * Mark a key as deleted without changing the size of the value (i.e. 
modifying - * keys in the btree in place) - */ -static inline void __set_bkey_deleted(struct bkey *k) -{ - k->type = KEY_TYPE_DELETED; -} - -static inline void set_bkey_deleted(struct bkey *k) -{ - __set_bkey_deleted(k); - set_bkey_val_u64s(k, 0); -} - #define bkey_deleted(_k) ((_k)->type == KEY_TYPE_DELETED) #define bkey_whiteout(_k) \ @@ -284,6 +269,16 @@ static inline struct bpos bkey_successor(struct bpos p) return ret; } +static inline struct bpos bkey_predecessor(struct bpos p) +{ + struct bpos ret = p; + + if (!ret.offset--) + BUG_ON(!ret.inode--); + + return ret; +} + static inline u64 bkey_start_offset(const struct bkey *k) { return k->p.offset - k->size; diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index 9a274774..5c777872 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -987,6 +987,10 @@ void bch2_bset_init_next(struct bch_fs *c, struct btree *b, set_btree_bset(b, t, i); } +/* + * find _some_ key in the same bset as @k that precedes @k - not necessarily the + * immediate predecessor: + */ static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) { @@ -1025,40 +1029,31 @@ static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, return p; } -struct bkey_packed *bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) +struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, + struct bset_tree *t, + struct bkey_packed *k, + unsigned min_key_type) { - struct bkey_packed *p; - - p = __bkey_prev(b, t, k); - if (!p) - return NULL; - - while (bkey_next(p) != k) - p = bkey_next(p); - - return p; -} - -struct bkey_packed *bch2_bkey_prev(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) -{ - while (1) { - struct bkey_packed *p, *i, *ret = NULL; - - p = __bkey_prev(b, t, k); - if (!p) - return NULL; + struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; + while ((p = __bkey_prev(b, t, k)) && !ret) { for (i = p; i != k; i = bkey_next(i)) - if (!bkey_deleted(i)) + if (i->type >= min_key_type) ret = i; - if (ret) - return ret; - k = p; } + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + BUG_ON(ret >= orig_k); + + for (i = ret ? 
bkey_next(ret) : btree_bkey_first(b, t); + i != orig_k; + i = bkey_next(i)) + BUG_ON(i->type >= min_key_type); + } + + return ret; } /* Insert */ @@ -1677,7 +1672,7 @@ void bch2_btree_node_iter_advance(struct btree_node_iter *iter, #endif } -static inline bool __btree_node_iter_used(struct btree_node_iter *iter) +static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) { unsigned n = ARRAY_SIZE(iter->data); @@ -1690,69 +1685,68 @@ static inline bool __btree_node_iter_used(struct btree_node_iter *iter) /* * Expensive: */ -struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, - struct btree *b) +struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter, + struct btree *b, + unsigned min_key_type) { struct bkey_packed *k, *prev = NULL; + struct bkey_packed *orig_pos = bch2_btree_node_iter_peek_all(iter, b); struct btree_node_iter_set *set; struct bset_tree *t; - struct bset_tree *prev_t; - unsigned end, used; + unsigned end; bch2_btree_node_iter_verify(iter, b); for_each_bset(b, t) { - k = bch2_bkey_prev_all(b, t, - bch2_btree_node_iter_bset_pos(iter, b, t)); + k = bch2_bkey_prev_filter(b, t, + bch2_btree_node_iter_bset_pos(iter, b, t), + min_key_type); if (k && (!prev || __btree_node_iter_cmp(iter->is_extents, b, k, prev) > 0)) { prev = k; - prev_t = t; + end = t->end_offset; } } if (!prev) - return NULL; + goto out; /* * We're manually memmoving instead of just calling sort() to ensure the * prev we picked ends up in slot 0 - sort won't necessarily put it * there because of duplicate deleted keys: */ - end = __btree_node_key_to_offset(b, btree_bkey_last(b, prev_t)); btree_node_iter_for_each(iter, set) - if (set->end == end) { - memmove(&iter->data[1], - &iter->data[0], - (void *) set - (void *) &iter->data[0]); - goto out; - } + if (set->end == end) + goto found; - used = __btree_node_iter_used(iter); - BUG_ON(used >= ARRAY_SIZE(iter->data)); + BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); +found: + BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); memmove(&iter->data[1], &iter->data[0], - (void *) &iter->data[used] - (void *) &iter->data[0]); -out: + (void *) set - (void *) &iter->data[0]); + iter->data[0].k = __btree_node_key_to_offset(b, prev); iter->data[0].end = end; +out: + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + struct btree_node_iter iter2 = *iter; + + if (prev) + bch2_btree_node_iter_advance(&iter2, b); + + while ((k = bch2_btree_node_iter_peek_all(&iter2, b)) != orig_pos) { + BUG_ON(k->type >= min_key_type); + bch2_btree_node_iter_advance(&iter2, b); + } + } + return prev; } -struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter, - struct btree *b) -{ - struct bkey_packed *k; - - do { - k = bch2_btree_node_iter_prev_all(iter, b); - } while (k && bkey_deleted(k)); - - return k; -} - struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, struct btree *b, struct bkey *u) diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h index 153e2b3f..296c05b4 100644 --- a/libbcachefs/bset.h +++ b/libbcachefs/bset.h @@ -393,10 +393,21 @@ static inline bool btree_iter_pos_cmp_p_or_unp(const struct btree *b, } struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); -struct bkey_packed *bch2_bkey_prev_all(struct btree *, struct bset_tree *, - struct bkey_packed *); -struct bkey_packed *bch2_bkey_prev(struct btree *, struct bset_tree *, - struct bkey_packed *); + +struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, + struct 
bkey_packed *, unsigned); + +static inline struct bkey_packed * +bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) +{ + return bch2_bkey_prev_filter(b, t, k, 0); +} + +static inline struct bkey_packed * +bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) +{ + return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_DISCARD + 1); +} enum bch_extent_overlap { BCH_EXTENT_OVERLAP_ALL = 0, @@ -471,9 +482,11 @@ static inline int __btree_node_iter_cmp(bool is_extents, * For extents, bkey_deleted() is used as a proxy for k->size == 0, so * deleted keys have to sort last. */ - return bkey_cmp_packed(b, l, r) ?: is_extents - ? (int) bkey_deleted(l) - (int) bkey_deleted(r) - : (int) bkey_deleted(r) - (int) bkey_deleted(l); + return bkey_cmp_packed(b, l, r) + ?: (is_extents + ? (int) bkey_deleted(l) - (int) bkey_deleted(r) + : (int) bkey_deleted(r) - (int) bkey_deleted(l)) + ?: (l > r) - (l < r); } static inline int btree_node_iter_cmp(struct btree_node_iter *iter, @@ -512,25 +525,34 @@ __bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, return __btree_node_offset_to_key(b, iter->data->k); } +static inline struct bkey_packed * +bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter, + struct btree *b, + unsigned min_key_type) +{ + while (!bch2_btree_node_iter_end(iter)) { + struct bkey_packed *k = __bch2_btree_node_iter_peek_all(iter, b); + + if (k->type >= min_key_type) + return k; + + bch2_btree_node_iter_advance(iter, b); + } + + return NULL; +} + static inline struct bkey_packed * bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b) { - return bch2_btree_node_iter_end(iter) - ? NULL - : __bch2_btree_node_iter_peek_all(iter, b); + return bch2_btree_node_iter_peek_filter(iter, b, 0); } static inline struct bkey_packed * bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) { - struct bkey_packed *ret; - - while ((ret = bch2_btree_node_iter_peek_all(iter, b)) && - bkey_deleted(ret)) - bch2_btree_node_iter_advance(iter, b); - - return ret; + return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_DISCARD + 1); } static inline struct bkey_packed * @@ -544,10 +566,20 @@ bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) return ret; } -struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, - struct btree *); -struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *, - struct btree *); +struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *, + struct btree *, unsigned); + +static inline struct bkey_packed * +bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree *b) +{ + return bch2_btree_node_iter_prev_filter(iter, b, 0); +} + +static inline struct bkey_packed * +bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b) +{ + return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_DISCARD + 1); +} /* * Iterates over all _live_ keys - skipping deleted (and potentially diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index b0dc4c8a..f15a415e 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -577,10 +577,11 @@ err: /* Slowpath, don't want it inlined into btree_iter_traverse() */ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, - struct btree_iter *iter, - const struct bkey_i *k, - unsigned level, - enum six_lock_type lock_type) + struct btree_iter *iter, + const struct bkey_i *k, + unsigned level, + enum six_lock_type 
lock_type, + bool sync) { struct btree_cache *bc = &c->btree_cache; struct btree *b; @@ -590,6 +591,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, * been freed: */ BUG_ON(!btree_node_locked(iter, level + 1)); + BUG_ON(level >= BTREE_MAX_DEPTH); b = bch2_btree_node_mem_alloc(c); if (IS_ERR(b)) @@ -623,9 +625,15 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, if (btree_node_read_locked(iter, level + 1)) btree_node_unlock(iter, level + 1); - bch2_btree_node_read(c, b, true); + bch2_btree_node_read(c, b, sync); + six_unlock_write(&b->lock); + if (!sync) { + six_unlock_intent(&b->lock); + return NULL; + } + if (lock_type == SIX_LOCK_read) six_lock_downgrade(&b->lock); @@ -643,7 +651,8 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, */ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, const struct bkey_i *k, unsigned level, - enum six_lock_type lock_type) + enum six_lock_type lock_type, + bool may_drop_locks) { struct btree_cache *bc = &c->btree_cache; struct btree *b; @@ -670,7 +679,7 @@ retry: * else we could read in a btree node from disk that's been * freed: */ - b = bch2_btree_node_fill(c, iter, k, level, lock_type); + b = bch2_btree_node_fill(c, iter, k, level, lock_type, true); /* We raced and found the btree node in the cache */ if (!b) @@ -710,7 +719,8 @@ retry: if (btree_node_read_locked(iter, level + 1)) btree_node_unlock(iter, level + 1); - if (!btree_node_lock(b, k->k.p, level, iter, lock_type)) + if (!btree_node_lock(b, k->k.p, level, iter, + lock_type, may_drop_locks)) return ERR_PTR(-EINTR); if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) || @@ -778,18 +788,17 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, k = bch2_btree_node_iter_peek_all(&node_iter, parent); BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); - do { - k = sib == btree_prev_sib - ? bch2_btree_node_iter_prev_all(&node_iter, parent) - : (bch2_btree_node_iter_advance(&node_iter, parent), - bch2_btree_node_iter_peek_all(&node_iter, parent)); - if (!k) - goto out; - } while (bkey_deleted(k)); + k = sib == btree_prev_sib + ? 
bch2_btree_node_iter_prev(&node_iter, parent) + : (bch2_btree_node_iter_advance(&node_iter, parent), + bch2_btree_node_iter_peek(&node_iter, parent)); + if (!k) + goto out; bch2_bkey_unpack(parent, &tmp.k, k); - ret = bch2_btree_node_get(c, iter, &tmp.k, level, SIX_LOCK_intent); + ret = bch2_btree_node_get(c, iter, &tmp.k, level, + SIX_LOCK_intent, may_drop_locks); if (PTR_ERR_OR_ZERO(ret) == -EINTR && may_drop_locks) { struct btree_iter *linked; @@ -809,7 +818,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, btree_node_unlock(iter, level); ret = bch2_btree_node_get(c, iter, &tmp.k, level, - SIX_LOCK_intent); + SIX_LOCK_intent, may_drop_locks); /* * before btree_iter_relock() calls btree_iter_verify_locks(): @@ -838,20 +847,32 @@ out: (iter->uptodate >= BTREE_ITER_NEED_RELOCK || !btree_node_locked(iter, level))); + if (!IS_ERR_OR_NULL(ret)) { + struct btree *n1 = ret, *n2 = b; + + if (sib != btree_prev_sib) + swap(n1, n2); + + BUG_ON(bkey_cmp(btree_type_successor(n1->btree_id, + n1->key.k.p), + n2->data->min_key)); + } + return ret; out_upgrade: if (may_drop_locks) - bch2_btree_iter_upgrade(iter, level + 2); + bch2_btree_iter_upgrade(iter, level + 2, true); ret = ERR_PTR(-EINTR); goto out; } -void bch2_btree_node_prefetch(struct bch_fs *c, const struct bkey_i *k, - unsigned level, enum btree_id btree_id) +void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, + const struct bkey_i *k, unsigned level) { struct btree_cache *bc = &c->btree_cache; struct btree *b; + BUG_ON(!btree_node_locked(iter, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); rcu_read_lock(); @@ -861,27 +882,7 @@ void bch2_btree_node_prefetch(struct bch_fs *c, const struct bkey_i *k, if (b) return; - b = bch2_btree_node_mem_alloc(c); - if (IS_ERR(b)) - return; - - bkey_copy(&b->key, k); - if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { - /* raced with another fill: */ - - /* mark as unhashed... 
*/ - bkey_i_to_extent(&b->key)->v._data[0] = 0; - - mutex_lock(&bc->lock); - list_add(&b->list, &bc->freeable); - mutex_unlock(&bc->lock); - goto out; - } - - bch2_btree_node_read(c, b, false); -out: - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); + bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false); } int bch2_print_btree_node(struct bch_fs *c, struct btree *b, diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index 43109d08..96d134f4 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -23,14 +23,14 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, const struct bkey_i *, unsigned, - enum six_lock_type); + enum six_lock_type, bool); struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, struct btree *, bool, enum btree_node_sibling); -void bch2_btree_node_prefetch(struct bch_fs *, const struct bkey_i *, - unsigned, enum btree_id); +void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, + const struct bkey_i *, unsigned); void bch2_fs_btree_cache_exit(struct bch_fs *); int bch2_fs_btree_cache_init(struct bch_fs *); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 0c825bcb..847dfd68 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1547,7 +1547,7 @@ static void bch2_btree_node_write_error(struct bch_fs *c, __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p, BTREE_MAX_DEPTH, - b->level, 0); + b->level, BTREE_ITER_NODES); retry: ret = bch2_btree_iter_traverse(&iter); if (ret) diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 682a9143..097b68e0 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -18,7 +18,9 @@ static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *, static inline bool is_btree_node(struct btree_iter *iter, unsigned l) { - return iter->l[l].b && iter->l[l].b != BTREE_ITER_NOT_END; + return l < BTREE_MAX_DEPTH && + iter->l[l].b && + iter->l[l].b != BTREE_ITER_NOT_END; } /* Btree node locking: */ @@ -88,10 +90,10 @@ static inline bool btree_node_lock_increment(struct btree_iter *iter, bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) { - struct btree *b = iter->l[level].b; + struct btree *b = btree_iter_node(iter, level); int want = __btree_lock_want(iter, level); - if (!is_btree_node(iter, level)) + if (!b || b == BTREE_ITER_NOT_END) return false; if (race_fault()) @@ -115,12 +117,12 @@ static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) if (!is_btree_node(iter, level)) return false; - if (race_fault()) - return false; - if (btree_node_intent_locked(iter, level)) return true; + if (race_fault()) + return false; + if (btree_node_locked(iter, level) ? 
six_lock_tryupgrade(&b->lock) : six_relock_type(&b->lock, SIX_LOCK_intent, iter->lock_seq[level])) @@ -180,7 +182,8 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter, bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, unsigned level, struct btree_iter *iter, - enum six_lock_type type) + enum six_lock_type type, + bool may_drop_locks) { struct bch_fs *c = iter->c; struct btree_iter *linked; @@ -231,10 +234,12 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (type == SIX_LOCK_intent && linked->nodes_locked != linked->nodes_intent_locked) { - linked->locks_want = max_t(unsigned, - linked->locks_want, - __fls(linked->nodes_locked) + 1); - btree_iter_get_locks(linked, true); + if (may_drop_locks) { + linked->locks_want = max_t(unsigned, + linked->locks_want, + __fls(linked->nodes_locked) + 1); + btree_iter_get_locks(linked, true); + } ret = false; } @@ -245,10 +250,12 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (linked->btree_id == iter->btree_id && level > __fls(linked->nodes_locked)) { - linked->locks_want = max_t(unsigned, - linked->locks_want, - iter->locks_want); - btree_iter_get_locks(linked, true); + if (may_drop_locks) { + linked->locks_want = max_t(unsigned, + linked->locks_want, + iter->locks_want); + btree_iter_get_locks(linked, true); + } ret = false; } } @@ -265,11 +272,6 @@ void bch2_btree_iter_verify_locks(struct btree_iter *iter) { unsigned l; - if (iter->uptodate == BTREE_ITER_END) { - BUG_ON(iter->nodes_locked); - return; - } - for (l = 0; btree_iter_node(iter, l); l++) { if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && !btree_node_locked(iter, l)) @@ -284,13 +286,9 @@ void bch2_btree_iter_verify_locks(struct btree_iter *iter) __flatten static bool __bch2_btree_iter_relock(struct btree_iter *iter) { - if (iter->uptodate < BTREE_ITER_NEED_RELOCK) - return true; - - if (iter->uptodate > BTREE_ITER_NEED_TRAVERSE) - return false; - - return btree_iter_get_locks(iter, false); + return iter->uptodate >= BTREE_ITER_NEED_RELOCK + ? 
btree_iter_get_locks(iter, false) + : true; } bool bch2_btree_iter_relock(struct btree_iter *iter) @@ -332,6 +330,30 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, return false; } +bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter, + unsigned new_locks_want) +{ + unsigned l = iter->level; + + EBUG_ON(iter->locks_want >= new_locks_want); + + iter->locks_want = new_locks_want; + + do { + if (!btree_iter_node(iter, l)) + break; + + if (!bch2_btree_node_upgrade(iter, l)) { + iter->locks_want = l; + return false; + } + + l++; + } while (l < iter->locks_want); + + return true; +} + void __bch2_btree_iter_downgrade(struct btree_iter *iter, unsigned downgrade_to) { @@ -419,6 +441,12 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, panic("next key should be before iter pos:\n%llu:%llu\n%s\n", iter->pos.inode, iter->pos.offset, buf); } + + if (iter->uptodate == BTREE_ITER_UPTODATE && + (iter->flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES) { + BUG_ON(!bkey_whiteout(&iter->k) && + bch2_btree_node_iter_end(&l->iter)); + } } void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) @@ -453,6 +481,8 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, if (new_u64s && btree_iter_pos_cmp_packed(b, &iter->pos, where, iter->flags & BTREE_ITER_IS_EXTENTS)) { + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + bch2_btree_node_iter_push(node_iter, b, where, end); if (!b->level && @@ -482,6 +512,8 @@ found: goto iter_current_key_not_modified; } + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + bch2_btree_node_iter_sort(node_iter, b); if (!b->level && node_iter == &iter->l[0].iter) __btree_iter_peek_all(iter, &iter->l[0], &iter->k); @@ -666,7 +698,8 @@ static inline bool btree_iter_pos_cmp(struct btree_iter *iter, static inline bool btree_iter_pos_after_node(struct btree_iter *iter, struct btree *b) { - return !btree_iter_pos_cmp(iter, &b->key.k); + return !btree_iter_pos_cmp(iter, &b->key.k) && + bkey_cmp(b->key.k.p, POS_MAX); } static inline bool btree_iter_pos_in_node(struct btree_iter *iter, @@ -788,7 +821,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, lock_type = __btree_lock_want(iter, iter->level); if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, - iter, lock_type))) + iter, lock_type, true))) return -EINTR; if (likely(b == c->btree_roots[iter->btree_id].b && @@ -830,9 +863,8 @@ static void btree_iter_prefetch(struct btree_iter *iter) break; bch2_bkey_unpack(l->b, &tmp.k, k); - bch2_btree_node_prefetch(iter->c, &tmp.k, - iter->level - 1, - iter->btree_id); + bch2_btree_node_prefetch(iter->c, iter, &tmp.k, + iter->level - 1); } if (!was_locked) @@ -852,7 +884,7 @@ static inline int btree_iter_down(struct btree_iter *iter) bch2_bkey_unpack(l->b, &tmp.k, bch2_btree_node_iter_peek(&l->iter, l->b)); - b = bch2_btree_node_get(iter->c, iter, &tmp.k, level, lock_type); + b = bch2_btree_node_get(iter->c, iter, &tmp.k, level, lock_type, true); if (unlikely(IS_ERR(b))) return PTR_ERR(b); @@ -872,12 +904,6 @@ static void btree_iter_up(struct btree_iter *iter) btree_node_unlock(iter, iter->level++); } -static void btree_iter_set_end(struct btree_iter *iter) -{ - iter->uptodate = BTREE_ITER_END; - __bch2_btree_iter_unlock(iter); -} - int __must_check __bch2_btree_iter_traverse(struct btree_iter *); static int btree_iter_traverse_error(struct btree_iter *iter, int ret) @@ -954,6 +980,24 @@ io_error: goto out; } +static unsigned btree_iter_up_until_locked(struct btree_iter *iter, + bool check_pos) +{ + unsigned l = 
iter->level; + + while (btree_iter_node(iter, l) && + !(is_btree_node(iter, l) && + bch2_btree_node_relock(iter, l) && + (!check_pos || + btree_iter_pos_in_node(iter, iter->l[l].b)))) { + btree_node_unlock(iter, l); + iter->l[l].b = BTREE_ITER_NOT_END; + l++; + } + + return l; +} + /* * This is the main state machine for walking down the btree - walks down to a * specified depth @@ -967,45 +1011,19 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) { unsigned depth_want = iter->level; - if (unlikely(iter->uptodate == BTREE_ITER_END)) + if (unlikely(iter->level >= BTREE_MAX_DEPTH)) return 0; - BUG_ON(iter->level >= BTREE_MAX_DEPTH); - BUG_ON(!iter->l[iter->level].b); + if (__bch2_btree_iter_relock(iter)) + return 0; iter->flags &= ~BTREE_ITER_AT_END_OF_LEAF; - /* make sure we have all the intent locks we need - ugh */ - if (unlikely(iter->l[iter->level].b && - iter->level + 1 < iter->locks_want)) { - unsigned i; - - for (i = iter->level + 1; - i < iter->locks_want && iter->l[i].b; - i++) - if (!bch2_btree_node_relock(iter, i)) { - while (iter->level < BTREE_MAX_DEPTH && - iter->l[iter->level].b && - iter->level + 1 < iter->locks_want) - btree_iter_up(iter); - break; - } - } - /* - * If the current node isn't locked, go up until we have a locked node - * or run out of nodes: + * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos + * here unnecessary */ - while (btree_iter_node(iter, iter->level) && - !(is_btree_node(iter, iter->level) && - bch2_btree_node_relock(iter, iter->level) && - - /* - * XXX: correctly using BTREE_ITER_UPTODATE should make - * comparing iter->pos against node's key unnecessary - */ - btree_iter_pos_in_node(iter, iter->l[iter->level].b))) - btree_iter_up(iter); + iter->level = btree_iter_up_until_locked(iter, true); /* * If we've got a btree node locked (i.e. 
we aren't about to relock the @@ -1049,9 +1067,6 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) { int ret; - if (__bch2_btree_iter_relock(iter)) - return 0; - ret = __bch2_btree_iter_traverse(iter); if (unlikely(ret)) ret = btree_iter_traverse_error(iter, ret); @@ -1061,6 +1076,18 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) return ret; } +static inline void bch2_btree_iter_checks(struct btree_iter *iter, + enum btree_iter_type type) +{ + EBUG_ON(iter->btree_id >= BTREE_ID_NR); + EBUG_ON((iter->flags & BTREE_ITER_TYPE) != type); + EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != + (iter->btree_id == BTREE_ID_EXTENTS && + type != BTREE_ITER_NODES)); + + bch2_btree_iter_verify_locks(iter); +} + /* Iterate across nodes (leaf and interior nodes) */ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) @@ -1068,24 +1095,18 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) struct btree *b; int ret; - EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); - bch2_btree_iter_verify_locks(iter); + bch2_btree_iter_checks(iter, BTREE_ITER_NODES); if (iter->uptodate == BTREE_ITER_UPTODATE) return iter->l[iter->level].b; - if (unlikely(iter->uptodate == BTREE_ITER_END)) - return NULL; - ret = bch2_btree_iter_traverse(iter); if (ret) - return ERR_PTR(ret); - - b = iter->l[iter->level].b; - if (!b) { - btree_iter_set_end(iter); return NULL; - } + + b = btree_iter_node(iter, iter->level); + if (!b) + return NULL; BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); @@ -1100,25 +1121,25 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) struct btree *b; int ret; - EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); - bch2_btree_iter_verify_locks(iter); + bch2_btree_iter_checks(iter, BTREE_ITER_NODES); + + /* already got to end? */ + if (!btree_iter_node(iter, iter->level)) + return NULL; btree_iter_up(iter); - if (!btree_iter_node(iter, iter->level)) { - btree_iter_set_end(iter); + if (!bch2_btree_node_relock(iter, iter->level)) + btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); + + ret = bch2_btree_iter_traverse(iter); + if (ret) return NULL; - } - if (!bch2_btree_node_relock(iter, iter->level)) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - ret = bch2_btree_iter_traverse(iter); - if (ret) - return NULL; - } - - b = iter->l[iter->level].b; - BUG_ON(!b); + /* got to end? */ + b = btree_iter_node(iter, iter->level); + if (!b) + return NULL; if (bkey_cmp(iter->pos, b->key.k.p) < 0) { /* @@ -1150,6 +1171,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) } iter->pos = b->key.k.p; + iter->uptodate = BTREE_ITER_UPTODATE; return b; } @@ -1182,10 +1204,68 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_ void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { - EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); /* XXX handle this */ + int cmp = bkey_cmp(new_pos, iter->pos); + unsigned level; + + if (!cmp) + return; + iter->pos = new_pos; - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + level = btree_iter_up_until_locked(iter, true); + + if (btree_iter_node(iter, level)) { + unsigned nr_advanced = 0; + struct btree_iter_level *l = &iter->l[level]; + struct bkey_s_c k; + struct bkey u; + + /* + * We might have to skip over many keys, or just a few: try + * advancing the node iterator, and if we have to skip over too + * many keys just reinit it (or if we're rewinding, since that + * is expensive). 
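
[Aside, not part of the patch: the heuristic this comment describes, advance a few steps and fall back to a full re-search past a small cap or on any rewind, in isolation. A minimal sketch over a sorted array; the names (cursor, cursor_search) are invented, and the cap of 8 mirrors the nr_advanced check in the code below:

	/* sketch: reposition a cursor to the first element >= target */
	struct cursor { const int *elems; unsigned nr, pos; };

	/* full re-init: binary search, the analogue of the reinit_node path */
	static void cursor_search(struct cursor *c, int target)
	{
		unsigned l = 0, r = c->nr;

		while (l < r) {
			unsigned m = l + (r - l) / 2;

			if (c->elems[m] < target)
				l = m + 1;
			else
				r = m;
		}
		c->pos = l;
	}

	static void cursor_set_pos(struct cursor *c, int target)
	{
		unsigned nr_advanced = 0;

		/* at the end, or rewinding (which is expensive): re-search */
		if (c->pos >= c->nr || target < c->elems[c->pos]) {
			cursor_search(c, target);
			return;
		}

		/* moving forward: a few linear steps beat a re-search for
		 * nearby targets; past a small cap, give up and re-search */
		while (c->pos < c->nr && c->elems[c->pos] < target) {
			if (++nr_advanced > 8) {
				cursor_search(c, target);
				return;
			}
			c->pos++;
		}
	}

The diff resumes below.]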
+ */ + if (cmp > 0) { + while ((k = __btree_iter_peek_all(iter, l, &u)).k && + !btree_iter_pos_cmp(iter, k.k)) { + if (nr_advanced > 8) + goto reinit_node; + + __btree_iter_advance(l); + nr_advanced++; + } + } else { +reinit_node: + __btree_iter_init(iter, iter->l[level].b); + } + + /* Don't leave it locked if we're not supposed to: */ + if (btree_lock_want(iter, level) == BTREE_NODE_UNLOCKED) + btree_node_unlock(iter, level); + } + + if (level != iter->level) + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + else + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); +} + +static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_s_c ret = { .k = &iter->k }; + + if (!bkey_deleted(&iter->k)) { + EBUG_ON(bch2_btree_node_iter_end(&l->iter)); + ret.v = bkeyp_val(&l->b->format, + __bch2_btree_node_iter_peek_all(&l->iter, l->b)); + } + + if (debug_check_bkeys(iter->c) && + !bkey_deleted(ret.k)) + bch2_bkey_debugcheck(iter->c, l->b, ret); + return ret; } struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) @@ -1194,26 +1274,10 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) struct bkey_s_c k; int ret; - EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != - (iter->btree_id == BTREE_ID_EXTENTS)); - EBUG_ON(iter->flags & BTREE_ITER_SLOTS); - bch2_btree_iter_verify_locks(iter); + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - if (iter->uptodate == BTREE_ITER_UPTODATE) { - struct bkey_packed *k = - __bch2_btree_node_iter_peek_all(&l->iter, l->b); - struct bkey_s_c ret = { - .k = &iter->k, - .v = bkeyp_val(&l->b->format, k) - }; - - if (debug_check_bkeys(iter->c)) - bch2_bkey_debugcheck(iter->c, l->b, ret); - return ret; - } - - if (iter->uptodate == BTREE_ITER_END) - return bkey_s_c_null; + if (iter->uptodate == BTREE_ITER_UPTODATE) + return btree_iter_peek_uptodate(iter); while (1) { ret = bch2_btree_iter_traverse(iter); @@ -1225,14 +1289,13 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) break; /* got to the end of the leaf, iterator needs to be traversed: */ - iter->pos = l->b->key.k.p; - if (!bkey_cmp(iter->pos, POS_MAX)) { - btree_iter_set_end(iter); + iter->pos = l->b->key.k.p; + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + + if (!bkey_cmp(iter->pos, POS_MAX)) return bkey_s_c_null; - } iter->pos = btree_type_successor(iter->btree_id, iter->pos); - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; } /* @@ -1252,14 +1315,13 @@ struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; - iter->pos = l->b->key.k.p; - if (!bkey_cmp(iter->pos, POS_MAX)) { - btree_iter_set_end(iter); + iter->pos = l->b->key.k.p; + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + + if (!bkey_cmp(iter->pos, POS_MAX)) return bkey_s_c_null; - } iter->pos = btree_type_successor(iter->btree_id, iter->pos); - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; return bch2_btree_iter_peek(iter); } @@ -1270,10 +1332,7 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) struct bkey_packed *p; struct bkey_s_c k; - EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != - (iter->btree_id == BTREE_ID_EXTENTS)); - EBUG_ON(iter->flags & BTREE_ITER_SLOTS); - bch2_btree_iter_verify_locks(iter); + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { k = bch2_btree_iter_peek(iter); @@ -1286,7 +1345,7 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) p = bch2_btree_node_iter_peek_all(&l->iter, l->b); 
if (unlikely(!p)) return bch2_btree_iter_peek_next_leaf(iter); - } while (bkey_deleted(p)); + } while (bkey_whiteout(p)); k = __btree_iter_unpack(iter, l, &iter->k, p); @@ -1295,6 +1354,51 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) return k; } +struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_packed *p; + struct bkey_s_c k; + int ret; + + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + + if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { + k = bch2_btree_iter_peek(iter); + if (IS_ERR(k.k)) + return k; + } + + while (1) { + p = bch2_btree_node_iter_prev(&l->iter, l->b); + if (likely(p)) + break; + + iter->pos = l->b->data->min_key; + if (!bkey_cmp(iter->pos, POS_MIN)) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, + btree_type_predecessor(iter->btree_id, iter->pos)); + + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + p = bch2_btree_node_iter_peek(&l->iter, l->b); + if (p) + break; + } + + k = __btree_iter_unpack(iter, l, &iter->k, p); + + EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); + + iter->pos = bkey_start_pos(k.k); + iter->uptodate = BTREE_ITER_UPTODATE; + return k; +} + static inline struct bkey_s_c __bch2_btree_iter_peek_slot(struct btree_iter *iter) { @@ -1309,13 +1413,6 @@ recheck: bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0) __btree_iter_advance(l); - if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { - EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0); - EBUG_ON(bkey_deleted(k.k)); - iter->uptodate = BTREE_ITER_UPTODATE; - return k; - } - /* * If we got to the end of the node, check if we need to traverse to the * next node: @@ -1329,21 +1426,35 @@ recheck: goto recheck; } + if (k.k && + !bkey_whiteout(k.k) && + bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { + EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0); + EBUG_ON(bkey_deleted(k.k)); + iter->uptodate = BTREE_ITER_UPTODATE; + return k; + } + /* hole */ bkey_init(&n); n.p = iter->pos; if (iter->flags & BTREE_ITER_IS_EXTENTS) { if (n.p.offset == KEY_OFFSET_MAX) { - if (n.p.inode == KEY_INODE_MAX) { - btree_iter_set_end(iter); + if (n.p.inode == KEY_INODE_MAX) return bkey_s_c_null; - } iter->pos = bkey_successor(iter->pos); goto recheck; } + if (k.k && bkey_whiteout(k.k)) { + struct btree_node_iter node_iter = l->iter; + + k = __btree_iter_unpack(iter, l, &iter->k, + bch2_btree_node_iter_peek(&node_iter, l->b)); + } + if (!k.k) k.k = &l->b->key.k; @@ -1357,35 +1468,19 @@ recheck: EBUG_ON(!n.size); } - iter->k = n; + iter->k = n; iter->uptodate = BTREE_ITER_UPTODATE; return (struct bkey_s_c) { &iter->k, NULL }; } struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) { - struct btree_iter_level *l = &iter->l[0]; int ret; - EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != - (iter->btree_id == BTREE_ID_EXTENTS)); - EBUG_ON(!(iter->flags & BTREE_ITER_SLOTS)); - bch2_btree_iter_verify_locks(iter); + bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS); - if (iter->uptodate == BTREE_ITER_UPTODATE) { - struct bkey_s_c ret = { .k = &iter->k }; - - if (!bkey_deleted(&iter->k)) - ret.v = bkeyp_val(&l->b->format, - __bch2_btree_node_iter_peek_all(&l->iter, l->b)); - - if (debug_check_bkeys(iter->c)) - bch2_bkey_debugcheck(iter->c, l->b, ret); - return ret; - } - - if (iter->uptodate == BTREE_ITER_END) - return bkey_s_c_null; + if (iter->uptodate == BTREE_ITER_UPTODATE) + return btree_iter_peek_uptodate(iter); ret = bch2_btree_iter_traverse(iter); if (unlikely(ret)) @@ 
-1396,10 +1491,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) { - EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != - (iter->btree_id == BTREE_ID_EXTENTS)); - EBUG_ON(!(iter->flags & BTREE_ITER_SLOTS)); - bch2_btree_iter_verify_locks(iter); + bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS); iter->pos = btree_type_successor(iter->btree_id, iter->k.p); @@ -1417,6 +1509,8 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) if (!bkey_deleted(&iter->k)) __btree_iter_advance(&iter->l[0]); + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + return __bch2_btree_iter_peek_slot(iter); } @@ -1446,10 +1540,6 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c, iter->l[iter->level].b = BTREE_ITER_NOT_END; iter->next = iter; - if (unlikely((flags & BTREE_ITER_IS_EXTENTS) && - !bkey_cmp(pos, POS_MAX))) - iter->uptodate = BTREE_ITER_END; - prefetch(c->btree_roots[btree_id].b); } diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 99e51b27..5db1cc58 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -106,14 +106,18 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, int bch2_btree_iter_unlock(struct btree_iter *); bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); +bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, - unsigned new_locks_want) + unsigned new_locks_want, + bool may_drop_locks) { new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); return iter->locks_want < new_locks_want - ? __bch2_btree_iter_upgrade(iter, new_locks_want) + ? (may_drop_locks + ? __bch2_btree_iter_upgrade(iter, new_locks_want) + : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want)) : iter->uptodate <= BTREE_ITER_NEED_PEEK; } @@ -137,6 +141,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned); struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); @@ -175,6 +180,19 @@ static inline struct bpos btree_type_successor(enum btree_id id, return pos; } +static inline struct bpos btree_type_predecessor(enum btree_id id, + struct bpos pos) +{ + if (id == BTREE_ID_INODES) { + --pos.inode; + pos.offset = 0; + } else /* if (id != BTREE_ID_EXTENTS) */ { + pos = bkey_predecessor(pos); + } + + return pos; +} + static inline int __btree_iter_cmp(enum btree_id id, struct bpos pos, const struct btree_iter *r) @@ -207,7 +225,8 @@ static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter) #define __for_each_btree_node(_iter, _c, _btree_id, _start, \ _locks_want, _depth, _flags, _b) \ for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), _start, \ - _locks_want, _depth, _flags), \ + _locks_want, _depth, \ + _flags|BTREE_ITER_NODES), \ _b = bch2_btree_iter_peek_node(_iter); \ (_b); \ (_b) = bch2_btree_iter_next_node(_iter, _depth)) diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 1d975207..419d0e81 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -147,17 +147,19 @@ static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, } bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, - struct 
btree_iter *, enum six_lock_type); + struct btree_iter *, enum six_lock_type, bool); static inline bool btree_node_lock(struct btree *b, struct bpos pos, unsigned level, struct btree_iter *iter, - enum six_lock_type type) + enum six_lock_type type, + bool may_drop_locks) { EBUG_ON(level >= BTREE_MAX_DEPTH); return likely(six_trylock_type(&b->lock, type)) || - __bch2_btree_node_lock(b, pos, level, iter, type); + __bch2_btree_node_lock(b, pos, level, iter, + type, may_drop_locks); } bool __bch2_btree_node_relock(struct btree_iter *, unsigned); diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index aed8d693..daa648c6 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -182,26 +182,32 @@ struct btree_node_iter { } data[MAX_BSETS]; }; -#define BTREE_ITER_SLOTS (1 << 0) -#define BTREE_ITER_INTENT (1 << 1) -#define BTREE_ITER_PREFETCH (1 << 2) +enum btree_iter_type { + BTREE_ITER_KEYS, + BTREE_ITER_SLOTS, + BTREE_ITER_NODES, +}; + +#define BTREE_ITER_TYPE ((1 << 2) - 1) + +#define BTREE_ITER_INTENT (1 << 2) +#define BTREE_ITER_PREFETCH (1 << 3) /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos */ -#define BTREE_ITER_IS_EXTENTS (1 << 3) +#define BTREE_ITER_IS_EXTENTS (1 << 4) /* * indicates we need to call bch2_btree_iter_traverse() to revalidate iterator: */ -#define BTREE_ITER_AT_END_OF_LEAF (1 << 4) -#define BTREE_ITER_ERROR (1 << 5) +#define BTREE_ITER_AT_END_OF_LEAF (1 << 5) +#define BTREE_ITER_ERROR (1 << 6) enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, BTREE_ITER_NEED_PEEK = 1, BTREE_ITER_NEED_RELOCK = 2, BTREE_ITER_NEED_TRAVERSE = 3, - BTREE_ITER_END = 4, }; /* @@ -216,7 +222,7 @@ struct btree_iter { struct bpos pos; u8 flags; - unsigned uptodate:4; + enum btree_iter_uptodate uptodate:4; enum btree_id btree_id:4; unsigned level:4, locks_want:4, diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 3e13f784..392ee0a0 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -1586,7 +1586,8 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, * XXX: figure out how far we might need to split, * instead of locking/reserving all the way to the root: */ - if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { + if (!bch2_btree_iter_upgrade(iter, U8_MAX, + !(flags & BTREE_INSERT_NOUNLOCK))) { ret = -EINTR; goto out; } @@ -1694,7 +1695,8 @@ retry: if (!down_read_trylock(&c->gc_lock)) goto err_cycle_gc_lock; - if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { + if (!bch2_btree_iter_upgrade(iter, U8_MAX, + !(flags & BTREE_INSERT_NOUNLOCK))) { ret = -EINTR; goto err_unlock; } @@ -1857,7 +1859,7 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, closure_init_stack(&cl); - bch2_btree_iter_upgrade(iter, U8_MAX); + bch2_btree_iter_upgrade(iter, U8_MAX, true); if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) { if (!down_read_trylock(&c->gc_lock)) { @@ -2000,7 +2002,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, closure_init_stack(&cl); - if (!bch2_btree_iter_upgrade(iter, U8_MAX)) + if (!bch2_btree_iter_upgrade(iter, U8_MAX, true)) return -EINTR; if (!down_read_trylock(&c->gc_lock)) { diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index 3a17de5c..e6f05071 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -199,14 +199,17 @@ void bch2_btree_root_alloc(struct bch_fs *, enum 
btree_id); static inline unsigned btree_update_reserve_required(struct bch_fs *c, struct btree *b) { - unsigned depth = btree_node_root(c, b)->level - b->level + 1; + unsigned depth = btree_node_root(c, b)->level + 1; /* * Number of nodes we might have to allocate in a worst case btree * split operation - we split all the way up to the root, then allocate - * a new root. + * a new root, unless we're already at max depth: */ - return depth * 2 + 1; + if (depth < BTREE_MAX_DEPTH) + return (depth - b->level) * 2 + 1; + else + return (depth - b->level) * 2 - 1; } static inline void btree_node_reset_sib_u64s(struct btree *b) diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index a62d8307..588a1997 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -205,8 +205,6 @@ btree_insert_key_leaf(struct btree_insert *trans, int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - ret = !btree_node_is_extents(b) ? bch2_insert_fixup_key(trans, insert) : bch2_insert_fixup_extent(trans, insert); @@ -430,9 +428,9 @@ int __bch2_btree_insert_at(struct btree_insert *trans) BUG_ON(i->iter->level); BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); BUG_ON(debug_check_bkeys(c) && + !bkey_deleted(&i->k->k) && bch2_bkey_invalid(c, i->iter->btree_id, bkey_i_to_s_c(i->k))); - BUG_ON(i->iter->uptodate == BTREE_ITER_END); } bubble_sort(trans->entries, trans->nr, btree_trans_cmp); @@ -444,7 +442,7 @@ retry: cycle_gc_lock = false; trans_for_each_entry(trans, i) { - if (!bch2_btree_iter_upgrade(i->iter, 1)) { + if (!bch2_btree_iter_upgrade(i->iter, 1, true)) { ret = -EINTR; goto err; } @@ -647,11 +645,6 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, if (bkey_cmp(iter.pos, end) >= 0) break; - if (k.k->type == KEY_TYPE_DISCARD) { - bch2_btree_iter_next(&iter); - continue; - } - bkey_init(&delete.k); /* @@ -668,15 +661,6 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, delete.k.version = version; if (iter.flags & BTREE_ITER_IS_EXTENTS) { - /* - * The extents btree is special - KEY_TYPE_DISCARD is - * used for deletions, not KEY_TYPE_DELETED. 
diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c
index 36dca6b2..d3dd3eb7 100644
--- a/libbcachefs/dirent.c
+++ b/libbcachefs/dirent.c
@@ -97,7 +97,11 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
		if (!len)
			return "empty name";

-		if (bkey_val_u64s(k.k) > dirent_val_u64s(len))
+		/*
+		 * older versions of bcachefs were buggy and created dirent
+		 * keys that were bigger than necessary:
+		 */
+		if (bkey_val_u64s(k.k) > dirent_val_u64s(len + 7))
			return "value too big";

		if (len > BCH_NAME_MAX)
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index b85af711..fe4bb527 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -778,7 +778,7 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
	 * cause offset to point to the next bucket:
	 */
	if (!len)
-		__set_bkey_deleted(k.k);
+		k.k->type = KEY_TYPE_DELETED;
	else if (bkey_extent_is_data(k.k)) {
		struct bkey_s_extent e = bkey_s_to_extent(k);
		union bch_extent_entry *entry;
@@ -833,7 +833,7 @@ bool bch2_cut_back(struct bpos where, struct bkey *k)
	k->size = len;

	if (!len)
-		__set_bkey_deleted(k);
+		k->type = KEY_TYPE_DELETED;

	return true;
 }
@@ -1103,7 +1103,7 @@ static void bch2_drop_subtract(struct extent_insert_state *s, struct bkey_s k)
		bch2_subtract_sectors(s, k.s_c,
				      bkey_start_offset(k.k), k.k->size);
	k.k->size = 0;
-	__set_bkey_deleted(k.k);
+	k.k->type = KEY_TYPE_DELETED;
 }

 static bool bch2_extent_merge_inline(struct bch_fs *,
@@ -1143,10 +1143,13 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
	struct bset_tree *t = bset_tree_last(l->b);
	struct bkey_packed *where =
		bch2_btree_node_iter_bset_pos(&l->iter, l->b, t);
-	struct bkey_packed *prev = bch2_bkey_prev(l->b, t, where);
+	struct bkey_packed *prev = bch2_bkey_prev_filter(l->b, t, where,
+							 KEY_TYPE_DISCARD);
	struct bkey_packed *next_live_key = where;
	unsigned clobber_u64s;

+	EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+
	if (prev)
		where = bkey_next(prev);
@@ -1188,6 +1191,7 @@ static void extent_insert_committed(struct extent_insert_state *s)
		: &s->whiteout;
	BKEY_PADDED(k) split;

+	EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
	EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0);
	EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0);
@@ -1246,8 +1250,6 @@ __extent_insert_advance_pos(struct extent_insert_state *s,
	else
		ret = BTREE_INSERT_OK;

-	EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size);
-
	if (ret == BTREE_INSERT_OK)
		s->committed = next_pos;
@@ -1446,6 +1448,7 @@ __bch2_delete_fixup_extent(struct extent_insert_state *s)
	EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));

	s->whiteout = *insert;
+	s->whiteout.k.type = KEY_TYPE_DISCARD;

	while (bkey_cmp(s->committed, insert->k.p) < 0 &&
	       (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
@@ -1488,6 +1491,8 @@ __bch2_delete_fixup_extent(struct extent_insert_state *s)
		    bset_written(b, bset(b, t))) {
			struct bkey_i discard = *insert;

+			discard.k.type = KEY_TYPE_DISCARD;
+
			switch (overlap) {
			case BCH_EXTENT_OVERLAP_FRONT:
				bch2_cut_front(bkey_start_pos(k.k), &discard);
@@ -1634,7 +1639,7 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
	};

	EBUG_ON(iter->level);
-	EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size);
+	EBUG_ON(!insert->k->k.size);

	/*
	 * As we process overlapping extents, we advance @iter->pos both to
@@ -1979,11 +1984,11 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
		return false;

	case KEY_TYPE_DELETED:
-	case KEY_TYPE_COOKIE:
		return true;
-
	case KEY_TYPE_DISCARD:
		return bversion_zero(k.k->version);
+	case KEY_TYPE_COOKIE:
+		return false;

	case BCH_EXTENT:
	case BCH_EXTENT_CACHED:
@@ -2051,11 +2056,6 @@ int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
	int ret;

	switch (k.k->type) {
-	case KEY_TYPE_DELETED:
-	case KEY_TYPE_DISCARD:
-	case KEY_TYPE_COOKIE:
-		return 0;
-
	case KEY_TYPE_ERROR:
		return -EIO;
@@ -2069,11 +2069,8 @@ int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,

		return ret;

-	case BCH_RESERVATION:
-		return 0;
-
	default:
-		BUG();
+		return 0;
	}
 }
@@ -2099,7 +2096,6 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b,
		return BCH_MERGE_NOMERGE;

	switch (l->k.type) {
-	case KEY_TYPE_DELETED:
	case KEY_TYPE_DISCARD:
	case KEY_TYPE_ERROR:
		/* These types are mergeable, and no val to check */
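The cut_front/cut_back changes above mark an extent KEY_TYPE_DELETED directly once trimming leaves it with zero size; bcachefs keys store their end position, with the start derived as offset - size. A toy model of that trimming logic, with hypothetical types rather than the bcachefs bkey machinery:

/* Toy cut_front sketch; hypothetical types, not bcachefs code. */
#include <assert.h>
#include <stdint.h>

enum { KEY_DELETED, KEY_EXTENT };

struct extent {
	uint64_t offset;	/* end position, as in bcachefs keys */
	uint64_t size;
	int	 type;
};

static uint64_t extent_start(const struct extent *e)
{
	return e->offset - e->size;	/* start = end - size */
}

static void cut_front(struct extent *e, uint64_t where)
{
	if (where <= extent_start(e))
		return;
	e->size = where < e->offset ? e->offset - where : 0;
	if (!e->size)
		e->type = KEY_DELETED;	/* nothing left: mark deleted */
}

int main(void)
{
	struct extent e = { .offset = 16, .size = 8, .type = KEY_EXTENT };

	cut_front(&e, 12);		/* trim [8,12) off the front */
	assert(e.size == 4);

	cut_front(&e, 16);		/* trim everything */
	assert(e.type == KEY_DELETED);
	return 0;
}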
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 737b9be3..9e78798a 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -2078,6 +2078,29 @@ out:

 /* truncate: */

+static inline int range_has_data(struct bch_fs *c,
+				 struct bpos start,
+				 struct bpos end)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+			   start, 0, k) {
+		if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
+			break;
+
+		if (bkey_extent_is_data(k.k)) {
+			ret = 1;
+			break;
+		}
+	}
+
+	return bch2_btree_iter_unlock(&iter) ?: ret;
+}
+
 static int __bch2_truncate_page(struct bch_inode_info *inode,
				pgoff_t index, loff_t start, loff_t end)
 {
@@ -2099,30 +2122,16 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,

	page = find_lock_page(mapping, index);
	if (!page) {
-		struct btree_iter iter;
-		struct bkey_s_c k = bkey_s_c_null;
-
		/*
		 * XXX: we're doing two index lookups when we end up reading the
		 * page
		 */
-		for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
-				   POS(inode->v.i_ino,
-				       index << PAGE_SECTOR_SHIFT), 0, k) {
-			if (bkey_cmp(bkey_start_pos(k.k),
-				     POS(inode->v.i_ino,
-					 (index + 1) << PAGE_SECTOR_SHIFT)) >= 0)
-				break;
+		ret = range_has_data(c,
+			POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT),
+			POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT));
+		if (ret <= 0)
+			return ret;

-			if (k.k->type != KEY_TYPE_DISCARD &&
-			    k.k->type != BCH_RESERVATION) {
-				bch2_btree_iter_unlock(&iter);
-				goto create;
-			}
-		}
-		bch2_btree_iter_unlock(&iter);
-		return 0;
-create:
		page = find_or_create_page(mapping, index, GFP_KERNEL);
		if (unlikely(!page)) {
			ret = -ENOMEM;
@@ -2389,9 +2398,6 @@ static long bch2_fcollapse(struct bch_inode_info *inode,

		bkey_reassemble(&copy.k, k);

-		if (bkey_deleted(&copy.k.k))
-			copy.k.k.type = KEY_TYPE_DISCARD;
-
		bch2_cut_front(src.pos, &copy.k);
		copy.k.k.p.offset -= len >> 9;
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 048b5c10..edf714f7 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -252,9 +252,6 @@ static int check_extents(struct bch_fs *c)
	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
			   POS(BCACHEFS_ROOT_INO, 0), 0, k) {
-		if (k.k->type == KEY_TYPE_DISCARD)
-			continue;
-
		ret = walk_inode(c, &w, k.k->p.inode);
		if (ret)
			break;
diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c
index 567289e2..dd0e8d2f 100644
--- a/libbcachefs/journal_seq_blacklist.c
+++ b/libbcachefs/journal_seq_blacklist.c
@@ -72,7 +72,8 @@ static void journal_seq_blacklist_flush(struct journal *j,
		n = bl->entries[i];
		mutex_unlock(&j->blacklist_lock);

-		__bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, 0, 0, 0);
+		__bch2_btree_iter_init(&iter, c, n.btree_id, n.pos,
+				       0, 0, BTREE_ITER_NODES);

		b = bch2_btree_iter_peek_node(&iter);
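The range_has_data() helper added to fs-io.c above factors a common question -- "is there any data extent in [start, end)?" -- out of __bch2_truncate_page(), scanning keys in order and stopping as soon as it passes the range or finds data. The same shape over a sorted array instead of a btree, as a standalone sketch with hypothetical types:

/* range_has_data() sketch over a sorted array; not bcachefs code. */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct key {
	uint64_t start, end;	/* extent covers [start, end) */
	bool	 is_data;	/* data extent vs. hole/reservation */
};

static bool range_has_data(const struct key *keys, size_t nr,
			   uint64_t start, uint64_t end)
{
	for (size_t i = 0; i < nr; i++) {
		if (keys[i].start >= end)
			break;		/* past the range: stop early */
		if (keys[i].end > start && keys[i].is_data)
			return true;
	}
	return false;
}

int main(void)
{
	const struct key keys[] = {
		{ 0,  8,  false },	/* hole */
		{ 8,  16, true  },	/* data */
	};

	assert( range_has_data(keys, 2, 10, 12));
	assert(!range_has_data(keys, 2, 0, 8));
	return 0;
}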
diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c
index 9dcadd20..31847a94 100644
--- a/libbcachefs/tests.c
+++ b/libbcachefs/tests.c
@@ -2,11 +2,29 @@

 #include "bcachefs.h"
 #include "btree_update.h"
+#include "journal_reclaim.h"
 #include "tests.h"

 #include "linux/kthread.h"
 #include "linux/random.h"

+static void delete_test_keys(struct bch_fs *c)
+{
+	int ret;
+
+	ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
+				      POS(0, 0), POS(0, U64_MAX),
+				      ZERO_VERSION, NULL, NULL, NULL);
+	BUG_ON(ret);
+
+	ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
+				      POS(0, 0), POS(0, U64_MAX),
+				      ZERO_VERSION, NULL, NULL, NULL);
+	BUG_ON(ret);
+}
+
+/* unit tests */
+
 static void test_delete(struct bch_fs *c, u64 nr)
 {
	struct btree_iter iter;
@@ -36,6 +54,224 @@ static void test_delete(struct bch_fs *c, u64 nr)
	bch2_btree_iter_unlock(&iter);
 }

+static void test_delete_written(struct bch_fs *c, u64 nr)
+{
+	struct btree_iter iter;
+	struct bkey_i_cookie k;
+	int ret;
+
+	bkey_cookie_init(&k.k_i);
+
+	bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, k.k.p,
+			     BTREE_ITER_INTENT);
+
+	ret = bch2_btree_iter_traverse(&iter);
+	BUG_ON(ret);
+
+	ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+				   BTREE_INSERT_ENTRY(&iter, &k.k_i));
+	BUG_ON(ret);
+
+	bch2_journal_flush_all_pins(&c->journal);
+
+	ret = bch2_btree_delete_at(&iter, 0);
+	BUG_ON(ret);
+
+	bch2_btree_iter_unlock(&iter);
+}
+
+static void test_iterate(struct bch_fs *c, u64 nr)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	u64 i;
+	int ret;
+
+	delete_test_keys(c);
+
+	pr_info("inserting test keys");
+
+	for (i = 0; i < nr; i++) {
+		struct bkey_i_cookie k;
+
+		bkey_cookie_init(&k.k_i);
+		k.k.p.offset = i;
+
+		ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i,
+					NULL, NULL, NULL, 0);
+		BUG_ON(ret);
+	}
+
+	pr_info("iterating forwards");
+
+	i = 0;
+
+	for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(0, 0), 0, k)
+		BUG_ON(k.k->p.offset != i++);
+	bch2_btree_iter_unlock(&iter);
+
+	BUG_ON(i != nr);
+
+	pr_info("iterating backwards");
+
+	while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k))
+		BUG_ON(k.k->p.offset != --i);
+	bch2_btree_iter_unlock(&iter);
+
+	BUG_ON(i);
+}
+
+static void test_iterate_extents(struct bch_fs *c, u64 nr)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	u64 i;
+	int ret;
+
+	delete_test_keys(c);
+
+	pr_info("inserting test extents");
+
+	for (i = 0; i < nr; i += 8) {
+		struct bkey_i_cookie k;
+
+		bkey_cookie_init(&k.k_i);
+		k.k.p.offset = i + 8;
+		k.k.size = 8;
+
+		ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
+					NULL, NULL, NULL, 0);
+		BUG_ON(ret);
+	}
+
+	pr_info("iterating forwards");
+
+	i = 0;
+
+	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(0, 0), 0, k) {
+		BUG_ON(bkey_start_offset(k.k) != i);
+		i = k.k->p.offset;
+	}
+	bch2_btree_iter_unlock(&iter);
+
+	BUG_ON(i != nr);
+
+	pr_info("iterating backwards");
+
+	while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) {
+		BUG_ON(k.k->p.offset != i);
+		i = bkey_start_offset(k.k);
+	}
+	bch2_btree_iter_unlock(&iter);
+
+	BUG_ON(i);
+}
+
+static void test_iterate_slots(struct bch_fs *c, u64 nr)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	u64 i;
+	int ret;
+
+	delete_test_keys(c);
+
+	pr_info("inserting test keys");
+
+	for (i = 0; i < nr; i++) {
+		struct bkey_i_cookie k;
+
+		bkey_cookie_init(&k.k_i);
+		k.k.p.offset = i * 2;
+
+		ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i,
+					NULL, NULL, NULL, 0);
+		BUG_ON(ret);
+	}
+
+	pr_info("iterating forwards");
+
+	i = 0;
+
+	for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(0, 0), 0, k) {
+		BUG_ON(k.k->p.offset != i);
+		i += 2;
+	}
+	bch2_btree_iter_unlock(&iter);
+
+	BUG_ON(i != nr * 2);
+
+	pr_info("iterating forwards by slots");
+
+	i = 0;
+
+	for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(0, 0),
+			   BTREE_ITER_SLOTS, k) {
+		BUG_ON(bkey_deleted(k.k) != (i & 1));
+		BUG_ON(k.k->p.offset != i++);
+
+		if (i == nr * 2)
+			break;
+	}
+	bch2_btree_iter_unlock(&iter);
+}
+
+static void test_iterate_slots_extents(struct bch_fs *c, u64 nr)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	u64 i;
+	int ret;
+
+	delete_test_keys(c);
+
+	pr_info("inserting test keys");
+
+	for (i = 0; i < nr; i += 16) {
+		struct bkey_i_cookie k;
+
+		bkey_cookie_init(&k.k_i);
+		k.k.p.offset = i + 16;
+		k.k.size = 8;
+
+		ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
+					NULL, NULL, NULL, 0);
+		BUG_ON(ret);
+	}
+
+	pr_info("iterating forwards");
+
+	i = 0;
+
+	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(0, 0), 0, k) {
+		BUG_ON(bkey_start_offset(k.k) != i + 8);
+		BUG_ON(k.k->size != 8);
+		i += 16;
+	}
+	bch2_btree_iter_unlock(&iter);
+
+	BUG_ON(i != nr);
+
+	pr_info("iterating forwards by slots");
+
+	i = 0;
+
+	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(0, 0),
+			   BTREE_ITER_SLOTS, k) {
+		BUG_ON(bkey_deleted(k.k) != !(i % 16));
+
+		BUG_ON(bkey_start_offset(k.k) != i);
+		BUG_ON(k.k->size != 8);
+		i = k.k->p.offset;
+
+		if (i == nr)
+			break;
+	}
+	bch2_btree_iter_unlock(&iter);
+}
+
+/* perf tests */
+
 static u64 test_rand(void)
 {
	u64 v;
@@ -183,7 +419,7 @@ static void seq_delete(struct bch_fs *c, u64 nr)
	int ret;

	ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
-				      POS_MIN, POS_MAX,
+				      POS(0, 0), POS(0, U64_MAX),
				      ZERO_VERSION, NULL, NULL, NULL);
	BUG_ON(ret);
 }
@@ -256,6 +492,11 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname,

	/* a unit test, not a perf test: */
	perf_test(test_delete);
+	perf_test(test_delete_written);
+	perf_test(test_iterate);
+	perf_test(test_iterate_extents);
+	perf_test(test_iterate_slots);
+	perf_test(test_iterate_slots_extents);

	if (!j.fn) {
		pr_err("unknown test %s", testname);
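The slot-iteration tests above pin down the BTREE_ITER_SLOTS contract: iteration visits every position in order, returning a deleted (whiteout) key for empty slots rather than skipping them -- hence test_iterate_slots' check that every odd position comes back deleted. A tiny model of that contract, with hypothetical names rather than the bcachefs iterator:

/* BTREE_ITER_SLOTS contract sketch; hypothetical names, not bcachefs code. */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define NR_SLOTS 8

struct slot_key {
	uint64_t pos;
	bool	 deleted;
};

/* live keys exist at even positions, matching test_iterate_slots above */
static struct slot_key peek_slot(uint64_t pos)
{
	return (struct slot_key) {
		.pos	 = pos,
		.deleted = pos & 1,	/* odd slots are empty */
	};
}

int main(void)
{
	for (uint64_t i = 0; i < NR_SLOTS; i++) {
		struct slot_key k = peek_slot(i);

		assert(k.pos == i);			/* every slot visited */
		assert(k.deleted == (bool) (i & 1));	/* holes show as deleted */
	}
	return 0;
}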
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index de95480c..c6b5015a 100644
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -13,24 +13,8 @@
 #include
 #include

-static unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
-{
-	return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) +
-			    name_len + val_len, sizeof(u64));
-}
-
-#define xattr_val(_xattr) ((_xattr)->x_name + (_xattr)->x_name_len)
-
 static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned);

-struct xattr_search_key {
-	u8		type;
-	struct qstr	name;
-};
-
-#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key)	\
-	{ .type = _type, .name = QSTR_INIT(_name, _len) })
-
 static u64 bch2_xattr_hash(const struct bch_hash_info *info,
			    const struct xattr_search_key *key)
 {
@@ -158,6 +142,17 @@ void bch2_xattr_to_text(struct bch_fs *c, char *buf,
	}
 }

+struct bkey_s_c bch2_xattr_get_iter(struct bch_fs *c,
+				    struct btree_iter *iter,
+				    struct bch_inode_info *inode,
+				    const char *name, int type)
+{
+	return bch2_hash_lookup(bch2_xattr_hash_desc,
+				&inode->ei_str_hash,
+				c, inode->v.i_ino, iter,
+				&X_SEARCH(type, name, strlen(name)));
+}
+
 int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
		    const char *name, void *buffer, size_t size, int type)
 {
@@ -185,19 +180,15 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
	return ret;
 }

-int __bch2_xattr_set(struct bch_fs *c, u64 inum,
-		     const struct bch_hash_info *hash_info,
-		     const char *name, const void *value, size_t size,
-		     int flags, int type, u64 *journal_seq)
+int bch2_xattr_set(struct bch_fs *c, u64 inum,
+		   const struct bch_hash_info *hash_info,
+		   const char *name, const void *value, size_t size,
+		   int flags, int type, u64 *journal_seq)
 {
	struct xattr_search_key search = X_SEARCH(type, name, strlen(name));
	int ret;

-	if (!value) {
-		ret = bch2_hash_delete(bch2_xattr_hash_desc, hash_info,
-				       c, inum,
-				       journal_seq, &search);
-	} else {
+	if (value) {
		struct bkey_i_xattr *xattr;
		unsigned u64s = BKEY_U64s +
			xattr_val_u64s(search.name.len, size);
@@ -223,6 +214,9 @@ int __bch2_xattr_set(struct bch_fs *c, u64 inum,
			(flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
			(flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0));
		kfree(xattr);
+	} else {
+		ret = bch2_hash_delete(bch2_xattr_hash_desc, hash_info,
+				       c, inum, journal_seq, &search);
	}

	if (ret == -ENOENT)
@@ -231,15 +225,6 @@ int __bch2_xattr_set(struct bch_fs *c, u64 inum,
	return ret;
 }

-int bch2_xattr_set(struct bch_fs *c, struct bch_inode_info *inode,
-		   const char *name, const void *value, size_t size,
-		   int flags, int type)
-{
-	return __bch2_xattr_set(c, inode->v.i_ino, &inode->ei_str_hash,
-				name, value, size, flags, type,
-				&inode->ei_journal_seq);
-}
-
 static size_t bch2_xattr_emit(struct dentry *dentry,
			       const struct bch_xattr *xattr,
			       char *buffer, size_t buffer_size)
@@ -323,8 +308,9 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler,
	struct bch_inode_info *inode = to_bch_ei(vinode);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;

-	return bch2_xattr_set(c, inode, name, value, size, flags,
-			      handler->flags);
+	return bch2_xattr_set(c, inode->v.i_ino, &inode->ei_str_hash,
+			      name, value, size, flags, handler->flags,
+			      &inode->ei_journal_seq);
 }

 static const struct xattr_handler bch_xattr_user_handler = {
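The consolidated bch2_xattr_set() above keeps the usual xattr convention that a NULL value means "delete the attribute", so one entry point covers create, replace, and removal. A minimal in-memory sketch of that single set/delete API, with a hypothetical one-slot store rather than anything from bcachefs:

/* NULL-value-deletes sketch; hypothetical store, not bcachefs code. */
#include <assert.h>
#include <stdlib.h>
#include <string.h>

struct store {
	char *val;	/* one slot is enough for the sketch */
};

/* Returns 0 on success, negative errno-style value on failure. */
static int store_set(struct store *s, const char *value)
{
	char *copy = NULL;

	if (value) {
		copy = strdup(value);
		if (!copy)
			return -12;	/* -ENOMEM */
	}

	free(s->val);		/* NULL value: plain deletion */
	s->val = copy;
	return 0;
}

int main(void)
{
	struct store s = { NULL };

	assert(!store_set(&s, "hello"));
	assert(!strcmp(s.val, "hello"));

	assert(!store_set(&s, NULL));	/* delete */
	assert(!s.val);
	return 0;
}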
diff --git a/libbcachefs/xattr.h b/libbcachefs/xattr.h
index a58e7e30..1365032d 100644
--- a/libbcachefs/xattr.h
+++ b/libbcachefs/xattr.h
@@ -13,17 +13,37 @@ void bch2_xattr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
	.val_to_text	= bch2_xattr_to_text,	\
 }

+static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
+{
+	return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) +
+			    name_len + val_len, sizeof(u64));
+}
+
+#define xattr_val(_xattr)					\
+	((void *) (_xattr)->x_name + (_xattr)->x_name_len)
+
+struct xattr_search_key {
+	u8		type;
+	struct qstr	name;
+};
+
+#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key)	\
+	{ .type = _type, .name = QSTR_INIT(_name, _len) })
+
 struct dentry;
 struct xattr_handler;
 struct bch_hash_info;
 struct bch_inode_info;

+struct bkey_s_c bch2_xattr_get_iter(struct bch_fs *,
+				    struct btree_iter *,
+				    struct bch_inode_info *,
+				    const char *, int);
 int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *,
		    const char *, void *, size_t, int);
-int __bch2_xattr_set(struct bch_fs *, u64, const struct bch_hash_info *,
-		     const char *, const void *, size_t, int, int, u64 *);
-int bch2_xattr_set(struct bch_fs *, struct bch_inode_info *,
-		   const char *, const void *, size_t, int, int);
+
+int bch2_xattr_set(struct bch_fs *, u64, const struct bch_hash_info *,
+		   const char *, const void *, size_t, int, int, u64 *);

 ssize_t bch2_xattr_list(struct dentry *, char *, size_t);

 extern const struct xattr_handler *bch2_xattr_handlers[];
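The xattr_val_u64s() helper moved into xattr.h above rounds the header-plus-name-plus-value size up to whole u64s, since bkey values are stored in 64-bit units. The same computation standalone; the struct layout here is illustrative only, not the on-disk bch_xattr format:

/* Size-in-u64s sketch; illustrative struct, not the bcachefs layout. */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

struct xattr {
	uint8_t  type;
	uint8_t  name_len;
	uint16_t val_len;
	uint8_t  name[];	/* name bytes, then value bytes */
};

static unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
{
	return DIV_ROUND_UP(offsetof(struct xattr, name) +
			    name_len + val_len, sizeof(uint64_t));
}

int main(void)
{
	/* 4-byte header + 4-byte name + 5-byte value = 13 bytes -> 2 u64s */
	assert(xattr_val_u64s(4, 5) == 2);
	return 0;
}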