diff --git a/.bcachefs_revision b/.bcachefs_revision index 654eedda..fafb2859 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -1592eaa60418d460021ecf44bc405ec49ef14adf +dfc7b2c9433ed926cad881b8fbee55e36105989b diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h new file mode 100644 index 00000000..43a7b9dc --- /dev/null +++ b/include/linux/min_heap.h @@ -0,0 +1,236 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MIN_HEAP_H +#define _LINUX_MIN_HEAP_H + +#include +#include +#include + +/** + * Data structure to hold a min-heap. + * @nr: Number of elements currently in the heap. + * @size: Maximum number of elements that can be held in current storage. + * @data: Pointer to the start of array holding the heap elements. + * @preallocated: Start of the static preallocated array holding the heap elements. + */ +#define MIN_HEAP_PREALLOCATED(_type, _name, _nr) \ +struct _name { \ + int nr; \ + int size; \ + _type *data; \ + _type preallocated[_nr]; \ +} + +#define DEFINE_MIN_HEAP(_type, _name) MIN_HEAP_PREALLOCATED(_type, _name, 0) + +typedef DEFINE_MIN_HEAP(char, min_heap_char) min_heap_char; + +#define __minheap_cast(_heap) (typeof((_heap)->data[0]) *) +#define __minheap_obj_size(_heap) sizeof((_heap)->data[0]) + +/** + * struct min_heap_callbacks - Data/functions to customise the min_heap. + * @less: Partial order function for this heap. + * @swp: Swap elements function. + */ +struct min_heap_callbacks { + bool (*less)(const void *lhs, const void *rhs, void *args); + void (*swp)(void *lhs, void *rhs, void *args); +}; + +/* Initialize a min-heap. */ +static __always_inline +void __min_heap_init(min_heap_char *heap, void *data, int size) +{ + heap->nr = 0; + heap->size = size; + if (data) + heap->data = data; + else + heap->data = heap->preallocated; +} + +#define min_heap_init(_heap, _data, _size) \ + __min_heap_init((min_heap_char *)_heap, _data, _size) + +/* Get the minimum element from the heap. */ +static __always_inline +void *__min_heap_peek(struct min_heap_char *heap) +{ + return heap->nr ? heap->data : NULL; +} + +#define min_heap_peek(_heap) \ + (__minheap_cast(_heap) __min_heap_peek((min_heap_char *)_heap)) + +/* Check if the heap is full. */ +static __always_inline +bool __min_heap_full(min_heap_char *heap) +{ + return heap->nr == heap->size; +} + +#define min_heap_full(_heap) \ + __min_heap_full((min_heap_char *)_heap) + +/* Sift the element at pos down the heap. */ +static __always_inline +void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + void *left, *right; + void *data = heap->data; + void *root = data + pos * elem_size; + int i = pos, j; + + /* Find the sift-down path all the way to the leaves. */ + for (;;) { + if (i * 2 + 2 >= heap->nr) + break; + left = data + (i * 2 + 1) * elem_size; + right = data + (i * 2 + 2) * elem_size; + i = func->less(left, right, args) ? i * 2 + 1 : i * 2 + 2; + } + + /* Special case for the last leaf with no sibling. */ + if (i * 2 + 2 == heap->nr) + i = i * 2 + 1; + + /* Backtrack to the correct location. */ + while (i != pos && func->less(root, data + i * elem_size, args)) + i = (i - 1) / 2; + + /* Shift the element into its correct place. */ + j = i; + while (i != pos) { + i = (i - 1) / 2; + func->swp(data + i * elem_size, data + j * elem_size, args); + } +} + +#define min_heap_sift_down(_heap, _pos, _func, _args) \ + __min_heap_sift_down((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args) + +/* Sift up ith element from the heap, O(log2(nr)). */ +static __always_inline +void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) +{ + void *data = heap->data; + size_t parent; + + while (idx) { + parent = (idx - 1) / 2; + if (func->less(data + parent * elem_size, data + idx * elem_size, args)) + break; + func->swp(data + parent * elem_size, data + idx * elem_size, args); + idx = parent; + } +} + +#define min_heap_sift_up(_heap, _idx, _func, _args) \ + __min_heap_sift_up((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) + +/* Floyd's approach to heapification that is O(nr). */ +static __always_inline +void __min_heapify_all(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + int i; + + for (i = heap->nr / 2 - 1; i >= 0; i--) + __min_heap_sift_down(heap, i, elem_size, func, args); +} + +#define min_heapify_all(_heap, _func, _args) \ + __min_heapify_all((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) + +/* Remove minimum element from the heap, O(log2(nr)). */ +static __always_inline +bool __min_heap_pop(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + void *data = heap->data; + + if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap")) + return false; + + /* Place last element at the root (position 0) and then sift down. */ + heap->nr--; + memcpy(data, data + (heap->nr * elem_size), elem_size); + __min_heap_sift_down(heap, 0, elem_size, func, args); + + return true; +} + +#define min_heap_pop(_heap, _func, _args) \ + __min_heap_pop((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) + +/* + * Remove the minimum element and then push the given element. The + * implementation performs 1 sift (O(log2(nr))) and is therefore more + * efficient than a pop followed by a push that does 2. + */ +static __always_inline +void __min_heap_pop_push(min_heap_char *heap, + const void *element, size_t elem_size, + const struct min_heap_callbacks *func, + void *args) +{ + memcpy(heap->data, element, elem_size); + __min_heap_sift_down(heap, 0, elem_size, func, args); +} + +#define min_heap_pop_push(_heap, _element, _func, _args) \ + __min_heap_pop_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) + +/* Push an element on to the heap, O(log2(nr)). */ +static __always_inline +bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + void *data = heap->data; + int pos; + + if (WARN_ONCE(heap->nr >= heap->size, "Pushing on a full heap")) + return false; + + /* Place at the end of data. */ + pos = heap->nr; + memcpy(data + (pos * elem_size), element, elem_size); + heap->nr++; + + /* Sift child at pos up. */ + __min_heap_sift_up(heap, elem_size, pos, func, args); + + return true; +} + +#define min_heap_push(_heap, _element, _func, _args) \ + __min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) + +/* Remove ith element from the heap, O(log2(nr)). */ +static __always_inline +bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) +{ + void *data = heap->data; + + if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap")) + return false; + + /* Place last element at the root (position 0) and then sift down. */ + heap->nr--; + if (idx == heap->nr) + return true; + func->swp(data + (idx * elem_size), data + (heap->nr * elem_size), args); + __min_heap_sift_up(heap, elem_size, idx, func, args); + __min_heap_sift_down(heap, idx, elem_size, func, args); + + return true; +} + +#define min_heap_del(_heap, _idx, _func, _args) \ + __min_heap_del((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) + +#endif /* _LINUX_MIN_HEAP_H */ diff --git a/include/linux/poison.h b/include/linux/poison.h index 1f0ee245..331a9a99 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -38,11 +38,8 @@ * Magic nums for obj red zoning. * Placed in the first word before and the first word after an obj. */ -#define RED_INACTIVE 0x09F911029D74E35BULL /* when obj is inactive */ -#define RED_ACTIVE 0xD84156C5635688C0ULL /* when obj is active */ - -#define SLUB_RED_INACTIVE 0xbb -#define SLUB_RED_ACTIVE 0xcc +#define SLUB_RED_INACTIVE 0xbb /* when obj is inactive */ +#define SLUB_RED_ACTIVE 0xcc /* when obj is active */ /* ...and for poisoning */ #define POISON_INUSE 0x5a /* for use-uninitialised poisoning */ @@ -52,12 +49,6 @@ /********** arch/$ARCH/mm/init.c **********/ #define POISON_FREE_INITMEM 0xcc -/********** arch/ia64/hp/common/sba_iommu.c **********/ -/* - * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a - * value of "SBAIOMMU POISON\0" for spill-over poisoning. - */ - /********** fs/jbd/journal.c **********/ #define JBD_POISON_FREE 0x5b #define JBD2_POISON_FREE 0x5c diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c index a7b425d3..87f1be9d 100644 --- a/libbcachefs/acl.c +++ b/libbcachefs/acl.c @@ -272,16 +272,19 @@ bch2_acl_to_xattr(struct btree_trans *trans, return xattr; } -struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, - struct dentry *dentry, int type) +struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) { - struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); - struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct posix_acl *acl = NULL; + + if (rcu) + return ERR_PTR(-ECHILD); + + struct btree_trans *trans = bch2_trans_get(c); retry: bch2_trans_begin(trans); @@ -358,7 +361,7 @@ retry: bch2_trans_begin(trans); acl = _acl; - ret = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?: bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), BTREE_ITER_intent); if (ret) diff --git a/libbcachefs/acl.h b/libbcachefs/acl.h index 27e7eec0..fe730a6b 100644 --- a/libbcachefs/acl.h +++ b/libbcachefs/acl.h @@ -28,7 +28,7 @@ void bch2_acl_to_text(struct printbuf *, const void *, size_t); #ifdef CONFIG_BCACHEFS_POSIX_ACL -struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int); +struct posix_acl *bch2_get_acl(struct inode *, int, bool); int bch2_set_acl_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index 8d2b62c9..96a0444e 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -82,6 +82,14 @@ static inline bool bucket_data_type_mismatch(enum bch_data_type bucket, bucket_data_type(bucket) != bucket_data_type(ptr); } +/* + * It is my general preference to use unsigned types for unsigned quantities - + * however, these helpers are used in disk accounting calculations run by + * triggers where the output will be negated and added to an s64. unsigned is + * right out even though all these quantities will fit in 32 bits, since it + * won't be sign extended correctly; u64 will negate "correctly", but s64 is the + * simpler option here. + */ static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a) { return a.stripe_sectors + a.dirty_sectors + a.cached_sectors; @@ -166,8 +174,8 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, * avoid overflowing LRU_TIME_BITS on a corrupted fs, when * bucket_sectors_dirty is (much) bigger than bucket_size */ - u64 d = min(bch2_bucket_sectors_dirty(a), - ca->mi.bucket_size); + u64 d = min_t(s64, bch2_bucket_sectors_dirty(a), + ca->mi.bucket_size); return div_u64(d * (1ULL << 31), ca->mi.bucket_size); } diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index cabf866c..922bf2fb 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -496,12 +496,6 @@ again: for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k)); alloc_cursor < k.k->p.offset; alloc_cursor++) { - ret = btree_trans_too_many_iters(trans); - if (ret) { - ob = ERR_PTR(ret); - break; - } - s->buckets_seen++; u64 bucket = alloc_cursor & ~(~0ULL << 56); @@ -1028,9 +1022,6 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, open_bucket_for_each(c, ptrs, ob, i) __clear_bit(ob->dev, devs.d); - if (erasure_code && ec_open_bucket(c, ptrs)) - return 0; - ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs, nr_replicas, nr_effective, have_cache, erasure_code, flags); @@ -1085,7 +1076,7 @@ static int open_bucket_add_buckets(struct btree_trans *trans, { int ret; - if (erasure_code) { + if (erasure_code && !ec_open_bucket(trans->c, ptrs)) { ret = __open_bucket_add_buckets(trans, ptrs, wp, devs_have, target, erasure_code, nr_replicas, nr_effective, have_cache, @@ -1609,7 +1600,8 @@ void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct ope prt_newline(out); } -void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) +void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_dev *ca) { struct open_bucket *ob; @@ -1619,7 +1611,8 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { spin_lock(&ob->lock); - if (ob->valid && !ob->on_partial_list) + if (ob->valid && !ob->on_partial_list && + (!ca || ob->dev == ca->dev_idx)) bch2_open_bucket_to_text(out, c, ob); spin_unlock(&ob->lock); } @@ -1762,11 +1755,12 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats)); } -void bch2_print_allocator_stuck(struct bch_fs *c) +static noinline void bch2_print_allocator_stuck(struct bch_fs *c) { struct printbuf buf = PRINTBUF; - prt_printf(&buf, "Allocator stuck? Waited for 10 seconds\n"); + prt_printf(&buf, "Allocator stuck? Waited for %u seconds\n", + c->opts.allocator_stuck_timeout); prt_printf(&buf, "Allocator debug:\n"); printbuf_indent_add(&buf, 2); @@ -1796,3 +1790,24 @@ void bch2_print_allocator_stuck(struct bch_fs *c) bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); } + +static inline unsigned allocator_wait_timeout(struct bch_fs *c) +{ + if (c->allocator_last_stuck && + time_after(c->allocator_last_stuck + HZ * 60 * 2, jiffies)) + return 0; + + return c->opts.allocator_stuck_timeout * HZ; +} + +void __bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl) +{ + unsigned t = allocator_wait_timeout(c); + + if (t && closure_sync_timeout(cl, t)) { + c->allocator_last_stuck = jiffies; + bch2_print_allocator_stuck(c); + } + + closure_sync(cl); +} diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h index 6da9e7e2..386d231c 100644 --- a/libbcachefs/alloc_foreground.h +++ b/libbcachefs/alloc_foreground.h @@ -223,7 +223,7 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp void bch2_fs_allocator_foreground_init(struct bch_fs *); void bch2_open_bucket_to_text(struct printbuf *, struct bch_fs *, struct open_bucket *); -void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); +void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *, struct bch_dev *); void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *); void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); @@ -231,6 +231,11 @@ void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *); void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *); -void bch2_print_allocator_stuck(struct bch_fs *); +void __bch2_wait_on_allocator(struct bch_fs *, struct closure *); +static inline void bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl) +{ + if (cl->closure_get_happened) + __bch2_wait_on_allocator(c, cl); +} #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 91361a16..43cfb5da 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -893,6 +893,8 @@ struct bch_fs { struct bch_fs_usage_base __percpu *usage; u64 __percpu *online_reserved; + unsigned long allocator_last_stuck; + struct io_clock io_clock[2]; /* JOURNAL SEQ BLACKLIST */ @@ -1020,6 +1022,7 @@ struct bch_fs { /* fs.c */ struct list_head vfs_inodes_list; struct mutex vfs_inodes_lock; + struct rhashtable vfs_inodes_table; /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; @@ -1082,7 +1085,6 @@ struct bch_fs { u64 __percpu *counters; unsigned copy_gc_enabled:1; - bool promote_whole_extents; struct bch2_time_stats times[BCH_TIME_STAT_NR]; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 74a60b1a..7861473b 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -792,6 +792,8 @@ LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62); LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); +LE64_BITMASK(BCH_SB_PROMOTE_WHOLE_EXTENTS, + struct bch_sb, flags[0], 63, 64); LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8); @@ -836,6 +838,8 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI, LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, struct bch_sb, flags[5], 0, 16); +LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT, + struct bch_sb, flags[5], 16, 32); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index fb04795c..afbb35ab 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -1934,6 +1934,11 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) bch2_trans_verify_not_in_restart(trans); bch2_btree_iter_verify(iter); + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (ret) + goto err; + + struct btree_path *path = btree_iter_path(trans, iter); /* already at end? */ @@ -2720,6 +2725,7 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) return bch2_btree_iter_peek_slot(iter); } +/* Obsolete, but still used by rust wrapper in -tools */ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter) { struct bkey_s_c k; diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 6c7a5034..b241f544 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -811,20 +811,6 @@ transaction_restart: \ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); -static inline struct bkey_s_c -__bch2_btree_iter_peek_and_restart(struct btree_trans *trans, - struct btree_iter *iter, unsigned flags) -{ - struct bkey_s_c k; - - while (btree_trans_too_many_iters(trans) || - (k = bch2_btree_iter_peek_type(iter, flags), - bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) - bch2_trans_begin(trans); - - return k; -} - #define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ _start, _end, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ @@ -865,7 +851,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, \ if (bch2_err_matches(_ret, ENOMEM)) { \ _gfp = GFP_KERNEL; \ - _ret = drop_locks_do(trans, _do); \ + _ret = drop_locks_do(_trans, _do); \ } \ _ret; \ }) @@ -878,7 +864,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, _ret = 0; \ if (unlikely(!_p)) { \ _gfp = GFP_KERNEL; \ - _ret = drop_locks_do(trans, ((_p = _do), 0)); \ + _ret = drop_locks_do(_trans, ((_p = _do), 0)); \ } \ _p; \ }) diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index fbcfa5ce..b4f76784 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -1264,7 +1264,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); bch2_trans_unlock(trans); - closure_sync(&cl); + bch2_wait_on_allocator(c, &cl); } while (bch2_err_matches(ret, BCH_ERR_operation_blocked)); } diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c index df3763c1..1d6b691e 100644 --- a/libbcachefs/clock.c +++ b/libbcachefs/clock.c @@ -6,15 +6,29 @@ #include #include -static inline long io_timer_cmp(io_timer_heap *h, - struct io_timer *l, - struct io_timer *r) +static inline bool io_timer_cmp(const void *l, const void *r, void __always_unused *args) { - return l->expire - r->expire; + struct io_timer **_l = (struct io_timer **)l; + struct io_timer **_r = (struct io_timer **)r; + + return (*_l)->expire < (*_r)->expire; +} + +static inline void io_timer_swp(void *l, void *r, void __always_unused *args) +{ + struct io_timer **_l = (struct io_timer **)l; + struct io_timer **_r = (struct io_timer **)r; + + swap(*_l, *_r); } void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) { + const struct min_heap_callbacks callbacks = { + .less = io_timer_cmp, + .swp = io_timer_swp, + }; + spin_lock(&clock->timer_lock); if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) { @@ -23,22 +37,27 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) return; } - for (size_t i = 0; i < clock->timers.used; i++) + for (size_t i = 0; i < clock->timers.nr; i++) if (clock->timers.data[i] == timer) goto out; - BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL)); + BUG_ON(!min_heap_push(&clock->timers, &timer, &callbacks, NULL)); out: spin_unlock(&clock->timer_lock); } void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) { + const struct min_heap_callbacks callbacks = { + .less = io_timer_cmp, + .swp = io_timer_swp, + }; + spin_lock(&clock->timer_lock); - for (size_t i = 0; i < clock->timers.used; i++) + for (size_t i = 0; i < clock->timers.nr; i++) if (clock->timers.data[i] == timer) { - heap_del(&clock->timers, i, io_timer_cmp, NULL); + min_heap_del(&clock->timers, i, &callbacks, NULL); break; } @@ -123,10 +142,17 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now) { struct io_timer *ret = NULL; + const struct min_heap_callbacks callbacks = { + .less = io_timer_cmp, + .swp = io_timer_swp, + }; + + if (clock->timers.nr && + time_after_eq64(now, clock->timers.data[0]->expire)) { + ret = *min_heap_peek(&clock->timers); + min_heap_pop(&clock->timers, &callbacks, NULL); + } - if (clock->timers.used && - time_after_eq64(now, clock->timers.data[0]->expire)) - heap_pop(&clock->timers, ret, io_timer_cmp, NULL); return ret; } @@ -150,7 +176,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) printbuf_tabstop_push(out, 40); prt_printf(out, "current time:\t%llu\n", now); - for (unsigned i = 0; i < clock->timers.used; i++) + for (unsigned i = 0; i < clock->timers.nr; i++) prt_printf(out, "%ps %ps:\t%llu\n", clock->timers.data[i]->fn, clock->timers.data[i]->fn2, diff --git a/libbcachefs/clock.h b/libbcachefs/clock.h index 85c975df..82c79c8b 100644 --- a/libbcachefs/clock.h +++ b/libbcachefs/clock.h @@ -20,15 +20,6 @@ static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors, void bch2_io_clock_schedule_timeout(struct io_clock *, u64); -#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ -({ \ - long __ret = timeout; \ - might_sleep(); \ - if (!___wait_cond_timeout(condition)) \ - __ret = __wait_event_timeout(wq, condition, timeout); \ - __ret; \ -}) - void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); void bch2_io_clock_exit(struct io_clock *); diff --git a/libbcachefs/clock_types.h b/libbcachefs/clock_types.h index 9c25d0fc..37554e45 100644 --- a/libbcachefs/clock_types.h +++ b/libbcachefs/clock_types.h @@ -24,7 +24,7 @@ struct io_timer { /* Amount to buffer up on a percpu counter */ #define IO_CLOCK_PCPU_SECTORS 128 -typedef HEAP(struct io_timer *) io_timer_heap; +typedef DEFINE_MIN_HEAP(struct io_timer *, io_timer_heap) io_timer_heap; struct io_clock { atomic64_t now; diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index d743da89..6c200401 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -553,62 +553,30 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - subvol_inum target; - u32 snapshot; struct bkey_buf sk; - int ret; - bch2_bkey_buf_init(&sk); -retry: - bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; + int ret = bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_dirents, + POS(inum.inum, ctx->pos), + POS(inum.inum, U64_MAX), + inum.subvol, 0, k, ({ + if (k.k->type != KEY_TYPE_dirent) + continue; - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, - SPOS(inum.inum, ctx->pos, snapshot), - POS(inum.inum, U64_MAX), 0, k, ret) { - if (k.k->type != KEY_TYPE_dirent) - continue; + /* dir_emit() can fault and block: */ + bch2_bkey_buf_reassemble(&sk, c, k); + struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k); - /* dir_emit() can fault and block: */ - bch2_bkey_buf_reassemble(&sk, c, k); - struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k); + subvol_inum target; + int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target); + if (ret2 > 0) + continue; - ret = bch2_dirent_read_target(trans, inum, dirent, &target); - if (ret < 0) - break; - if (ret) - continue; + ret2 ?: drop_locks_do(trans, bch2_dir_emit(ctx, dirent, target)); + }))); - /* - * read_target looks up subvolumes, we can overflow paths if the - * directory has many subvolumes in it - * - * XXX: btree_trans_too_many_iters() is something we'd like to - * get rid of, and there's no good reason to be using it here - * except that we don't yet have a for_each_btree_key() helper - * that does subvolume_get_snapshot(). - */ - ret = drop_locks_do(trans, - bch2_dir_emit(ctx, dirent, target)) ?: - btree_trans_too_many_iters(trans); - if (ret) { - ret = ret < 0 ? ret : 0; - break; - } - } - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); - return ret; + return ret < 0 ? ret : 0; } diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 86948d11..84f1cbf6 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -901,8 +901,8 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) mutex_lock(&c->ec_stripes_heap_lock); if (n.size > h->size) { - memcpy(n.data, h->data, h->used * sizeof(h->data[0])); - n.used = h->used; + memcpy(n.data, h->data, h->nr * sizeof(h->data[0])); + n.nr = h->nr; swap(*h, n); } mutex_unlock(&c->ec_stripes_heap_lock); @@ -993,7 +993,7 @@ static u64 stripe_idx_to_delete(struct bch_fs *c) lockdep_assert_held(&c->ec_stripes_heap_lock); - if (h->used && + if (h->nr && h->data[0].blocks_nonempty == 0 && !bch2_stripe_is_open(c, h->data[0].idx)) return h->data[0].idx; @@ -1001,14 +1001,6 @@ static u64 stripe_idx_to_delete(struct bch_fs *c) return 0; } -static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, - struct ec_stripe_heap_entry l, - struct ec_stripe_heap_entry r) -{ - return ((l.blocks_nonempty > r.blocks_nonempty) - - (l.blocks_nonempty < r.blocks_nonempty)); -} - static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, size_t i) { @@ -1017,39 +1009,71 @@ static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; } +static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args) +{ + struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l; + struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r; + + return ((_l->blocks_nonempty > _r->blocks_nonempty) < + (_l->blocks_nonempty < _r->blocks_nonempty)); +} + +static inline void ec_stripes_heap_swap(void *l, void *r, void *h) +{ + struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l; + struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r; + ec_stripes_heap *_h = (ec_stripes_heap *)h; + size_t i = _l - _h->data; + size_t j = _r - _h->data; + + swap(*_l, *_r); + + ec_stripes_heap_set_backpointer(_h, i); + ec_stripes_heap_set_backpointer(_h, j); +} + static void heap_verify_backpointer(struct bch_fs *c, size_t idx) { ec_stripes_heap *h = &c->ec_stripes_heap; struct stripe *m = genradix_ptr(&c->stripes, idx); - BUG_ON(m->heap_idx >= h->used); + BUG_ON(m->heap_idx >= h->nr); BUG_ON(h->data[m->heap_idx].idx != idx); } void bch2_stripes_heap_del(struct bch_fs *c, struct stripe *m, size_t idx) { + const struct min_heap_callbacks callbacks = { + .less = ec_stripes_heap_cmp, + .swp = ec_stripes_heap_swap, + }; + mutex_lock(&c->ec_stripes_heap_lock); heap_verify_backpointer(c, idx); - heap_del(&c->ec_stripes_heap, m->heap_idx, - ec_stripes_heap_cmp, - ec_stripes_heap_set_backpointer); + min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap); mutex_unlock(&c->ec_stripes_heap_lock); } void bch2_stripes_heap_insert(struct bch_fs *c, struct stripe *m, size_t idx) { - mutex_lock(&c->ec_stripes_heap_lock); - BUG_ON(heap_full(&c->ec_stripes_heap)); + const struct min_heap_callbacks callbacks = { + .less = ec_stripes_heap_cmp, + .swp = ec_stripes_heap_swap, + }; - heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { + mutex_lock(&c->ec_stripes_heap_lock); + BUG_ON(min_heap_full(&c->ec_stripes_heap)); + + genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr; + min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) { .idx = idx, .blocks_nonempty = m->blocks_nonempty, }), - ec_stripes_heap_cmp, - ec_stripes_heap_set_backpointer); + &callbacks, + &c->ec_stripes_heap); heap_verify_backpointer(c, idx); mutex_unlock(&c->ec_stripes_heap_lock); @@ -1058,6 +1082,10 @@ void bch2_stripes_heap_insert(struct bch_fs *c, void bch2_stripes_heap_update(struct bch_fs *c, struct stripe *m, size_t idx) { + const struct min_heap_callbacks callbacks = { + .less = ec_stripes_heap_cmp, + .swp = ec_stripes_heap_swap, + }; ec_stripes_heap *h = &c->ec_stripes_heap; bool do_deletes; size_t i; @@ -1068,10 +1096,8 @@ void bch2_stripes_heap_update(struct bch_fs *c, h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; i = m->heap_idx; - heap_sift_up(h, i, ec_stripes_heap_cmp, - ec_stripes_heap_set_backpointer); - heap_sift_down(h, i, ec_stripes_heap_cmp, - ec_stripes_heap_set_backpointer); + min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap); + min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap); heap_verify_backpointer(c, idx); @@ -1783,6 +1809,9 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity); BUG_ON(v->nr_redundant != h->s->nr_parity); + /* * We bypass the sector allocator which normally does this: */ + bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); + for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { __clear_bit(v->ptrs[i].dev, devs.d); if (i < h->s->nr_data) @@ -1864,7 +1893,7 @@ static s64 get_existing_stripe(struct bch_fs *c, return -1; mutex_lock(&c->ec_stripes_heap_lock); - for (heap_idx = 0; heap_idx < h->used; heap_idx++) { + for (heap_idx = 0; heap_idx < h->nr; heap_idx++) { /* No blocks worth reusing, stripe will just be deleted: */ if (!h->data[heap_idx].blocks_nonempty) continue; @@ -2195,7 +2224,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) size_t i; mutex_lock(&c->ec_stripes_heap_lock); - for (i = 0; i < min_t(size_t, h->used, 50); i++) { + for (i = 0; i < min_t(size_t, h->nr, 50); i++) { m = genradix_ptr(&c->stripes, h->data[i].idx); prt_printf(out, "%zu %u/%u+%u", h->data[i].idx, @@ -2209,6 +2238,23 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) mutex_unlock(&c->ec_stripes_heap_lock); } +static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c, + struct ec_stripe_new *s) +{ + prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs", + s->idx, s->nr_data, s->nr_parity, + bitmap_weight(s->blocks_allocated, s->nr_data), + atomic_read(&s->ref[STRIPE_REF_io]), + atomic_read(&s->ref[STRIPE_REF_stripe]), + bch2_watermarks[s->h->watermark]); + + struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; + unsigned i; + for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) + prt_printf(out, " %u", s->blocks[i]); + prt_newline(out); +} + void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) { struct ec_stripe_head *h; @@ -2221,23 +2267,15 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) bch2_watermarks[h->watermark]); if (h->s) - prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n", - h->s->idx, h->s->nr_data, h->s->nr_parity, - bitmap_weight(h->s->blocks_allocated, - h->s->nr_data)); + bch2_new_stripe_to_text(out, c, h->s); } mutex_unlock(&c->ec_stripe_head_lock); prt_printf(out, "in flight:\n"); mutex_lock(&c->ec_stripe_new_lock); - list_for_each_entry(s, &c->ec_stripe_new_list, list) { - prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n", - s->idx, s->nr_data, s->nr_parity, - atomic_read(&s->ref[STRIPE_REF_io]), - atomic_read(&s->ref[STRIPE_REF_stripe]), - bch2_watermarks[s->h->watermark]); - } + list_for_each_entry(s, &c->ec_stripe_new_list, list) + bch2_new_stripe_to_text(out, c, s); mutex_unlock(&c->ec_stripe_new_lock); } diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h index 976426da..1df03dcc 100644 --- a/libbcachefs/ec_types.h +++ b/libbcachefs/ec_types.h @@ -36,6 +36,6 @@ struct ec_stripe_heap_entry { unsigned blocks_nonempty; }; -typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; +typedef DEFINE_MIN_HEAP(struct ec_stripe_heap_entry, ec_stripes_heap) ec_stripes_heap; #endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index facdb8a8..68f8c30b 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -357,7 +357,7 @@ out: \ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ _ptr, _entry) -#define bkey_crc_next(_k, _start, _end, _crc, _iter) \ +#define bkey_crc_next(_k, _end, _crc, _iter) \ ({ \ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ if (extent_entry_is_crc(_iter)) { \ @@ -372,7 +372,7 @@ out: \ #define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ (_iter) = (_start); \ - bkey_crc_next(_k, _start, _end, _crc, _iter); \ + bkey_crc_next(_k, _end, _crc, _iter); \ (_iter) = extent_entry_next(_iter)) #define bkey_for_each_crc(_k, _p, _crc, _iter) \ diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c index cc33d763..0b900a4a 100644 --- a/libbcachefs/fs-io-buffered.c +++ b/libbcachefs/fs-io-buffered.c @@ -151,7 +151,6 @@ static void bchfs_read(struct btree_trans *trans, struct bkey_buf sk; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; - u32 snapshot; int ret = 0; rbio->c = c; @@ -159,29 +158,23 @@ static void bchfs_read(struct btree_trans *trans, rbio->subvol = inum.subvol; bch2_bkey_buf_init(&sk); -retry: bch2_trans_begin(trans); - iter = (struct btree_iter) { NULL }; - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), + POS(inum.inum, rbio->bio.bi_iter.bi_sector), BTREE_ITER_slots); while (1) { struct bkey_s_c k; unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; - /* - * read_extent -> io_time_reset may cause a transaction restart - * without returning an error, we need to check for that here: - */ - ret = bch2_trans_relock(trans); + bch2_trans_begin(trans); + + u32 snapshot; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) - break; + goto err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); bch2_btree_iter_set_pos(&iter, POS(inum.inum, rbio->bio.bi_iter.bi_sector)); @@ -189,7 +182,7 @@ retry: k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) - break; + goto err; offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); @@ -200,7 +193,7 @@ retry: ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, &sk); if (ret) - break; + goto err; k = bkey_i_to_s_c(sk.k); @@ -210,7 +203,7 @@ retry: ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, extent_partial_reads_expensive(k)); if (ret) - break; + goto err; } bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; @@ -229,17 +222,13 @@ retry: swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); - - ret = btree_trans_too_many_iters(trans); - if (ret) +err: + if (ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; } -err: bch2_trans_iter_exit(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - if (ret) { bch_err_inum_offset_ratelimited(c, iter.pos.inode, @@ -486,7 +475,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->nr_replicas = nr_replicas; op->res.nr_replicas = nr_replicas; op->write_point = writepoint_hashed(inode->ei_last_dirtied); - op->subvol = inode->ei_subvol; + op->subvol = inode->ei_inum.subvol; op->pos = POS(inode->v.i_ino, sector); op->end_io = bch2_writepage_io_done; op->devs_need_flush = &inode->ei_devs_need_flush; diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c index e246b1e0..ee1c0325 100644 --- a/libbcachefs/fs-io-direct.c +++ b/libbcachefs/fs-io-direct.c @@ -500,7 +500,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) dio->op.target = dio->op.opts.foreground_target; dio->op.write_point = writepoint_hashed((unsigned long) current); dio->op.nr_replicas = dio->op.opts.data_replicas; - dio->op.subvol = inode->ei_subvol; + dio->op.subvol = inode->ei_inum.subvol; dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); dio->op.devs_need_flush = &inode->ei_devs_need_flush; diff --git a/libbcachefs/fs-io-pagecache.c b/libbcachefs/fs-io-pagecache.c index a9cc5cad..af3a2454 100644 --- a/libbcachefs/fs-io-pagecache.c +++ b/libbcachefs/fs-io-pagecache.c @@ -182,18 +182,11 @@ static void __bch2_folio_set(struct folio *folio, int bch2_folio_set(struct bch_fs *c, subvol_inum inum, struct folio **fs, unsigned nr_folios) { - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_folio *s; u64 offset = folio_sector(fs[0]); - unsigned folio_idx; - u32 snapshot; bool need_set = false; - int ret; - for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { - s = bch2_folio_create(fs[folio_idx], GFP_KERNEL); + for (unsigned folio_idx = 0; folio_idx < nr_folios; folio_idx++) { + struct bch_folio *s = bch2_folio_create(fs[folio_idx], GFP_KERNEL); if (!s) return -ENOMEM; @@ -203,53 +196,40 @@ int bch2_folio_set(struct bch_fs *c, subvol_inum inum, if (!need_set) return 0; - folio_idx = 0; - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); + unsigned folio_idx = 0; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; + return bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, + POS(inum.inum, offset), + POS(inum.inum, U64_MAX), + inum.subvol, BTREE_ITER_slots, k, ({ + unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k); - for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, - SPOS(inum.inum, offset, snapshot), - BTREE_ITER_slots, k, ret) { - unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = bkey_to_sector_state(k); + while (folio_idx < nr_folios) { + struct folio *folio = fs[folio_idx]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - + folio_start; + unsigned folio_len = min(k.k->p.offset, folio_end) - + folio_offset - folio_start; - while (folio_idx < nr_folios) { - struct folio *folio = fs[folio_idx]; - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - - folio_start; - unsigned folio_len = min(k.k->p.offset, folio_end) - - folio_offset - folio_start; + BUG_ON(k.k->p.offset < folio_start); + BUG_ON(bkey_start_offset(k.k) > folio_end); - BUG_ON(k.k->p.offset < folio_start); - BUG_ON(bkey_start_offset(k.k) > folio_end); + if (!bch2_folio(folio)->uptodate) + __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); - if (!bch2_folio(folio)->uptodate) - __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); + if (k.k->p.offset < folio_end) + break; + folio_idx++; + } - if (k.k->p.offset < folio_end) + if (folio_idx == nr_folios) break; - folio_idx++; - } - - if (folio_idx == nr_folios) - break; - } - - offset = iter.pos.offset; - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - bch2_trans_put(trans); - - return ret; + 0; + }))); } void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 77b85da3..71d0fa38 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -221,30 +221,11 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol, struct bpos start, struct bpos end) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot); - if (ret) - goto err; - - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret) - if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) { - ret = 1; - break; - } - start = iter.pos; - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); - return ret; + return bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, start, end, + subvol, 0, k, ({ + bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k); + }))); } static int __bch2_truncate_folio(struct bch_inode_info *inode, @@ -267,7 +248,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, * XXX: we're doing two index lookups when we end up reading the * folio */ - ret = range_has_data(c, inode->ei_subvol, + ret = range_has_data(c, inode->ei_inum.subvol, POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)), POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS)); if (ret <= 0) @@ -618,7 +599,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_trans_begin(trans); ret = bch2_subvolume_get_snapshot(trans, - inode->ei_subvol, &snapshot); + inode->ei_inum.subvol, &snapshot); if (ret) goto bkey_err; @@ -813,41 +794,23 @@ static int quota_reserve_range(struct bch_inode_info *inode, u64 start, u64 end) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - u32 snapshot; u64 sectors = end - start; - u64 pos = start; - int ret; -retry: - bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); - if (ret) - goto err; + int ret = bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, + BTREE_ID_extents, + POS(inode->v.i_ino, start), + POS(inode->v.i_ino, end - 1), + inode->ei_inum.subvol, 0, k, ({ + if (bkey_extent_is_allocation(k.k)) { + u64 s = min(end, k.k->p.offset) - + max(start, bkey_start_offset(k.k)); + BUG_ON(s > sectors); + sectors -= s; + } - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(inode->v.i_ino, pos, snapshot), 0); - - while (!(ret = btree_trans_too_many_iters(trans)) && - (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k && - !(ret = bkey_err(k))) { - if (bkey_extent_is_allocation(k.k)) { - u64 s = min(end, k.k->p.offset) - - max(start, bkey_start_offset(k.k)); - BUG_ON(s > sectors); - sectors -= s; - } - bch2_btree_iter_advance(&iter); - } - pos = iter.pos.offset; - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); + 0; + }))); return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true); } @@ -942,42 +905,25 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; subvol_inum inum = inode_inum(inode); u64 isize, next_data = MAX_LFS_FILESIZE; - u32 snapshot; - int ret; isize = i_size_read(&inode->v); if (offset >= isize) return -ENXIO; - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, - SPOS(inode->v.i_ino, offset >> 9, snapshot), - POS(inode->v.i_ino, U64_MAX), - 0, k, ret) { - if (bkey_extent_is_data(k.k)) { - next_data = max(offset, bkey_start_offset(k.k) << 9); - break; - } else if (k.k->p.offset >> 9 > isize) - break; - } - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); + int ret = bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, + POS(inode->v.i_ino, offset >> 9), + POS(inode->v.i_ino, U64_MAX), + inum.subvol, 0, k, ({ + if (bkey_extent_is_data(k.k)) { + next_data = max(offset, bkey_start_offset(k.k) << 9); + break; + } else if (k.k->p.offset >> 9 > isize) + break; + 0; + }))); if (ret) return ret; @@ -995,50 +941,34 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; subvol_inum inum = inode_inum(inode); u64 isize, next_hole = MAX_LFS_FILESIZE; - u32 snapshot; - int ret; isize = i_size_read(&inode->v); if (offset >= isize) return -ENXIO; - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, - SPOS(inode->v.i_ino, offset >> 9, snapshot), - BTREE_ITER_slots, k, ret) { - if (k.k->p.inode != inode->v.i_ino) { - next_hole = bch2_seek_pagecache_hole(&inode->v, - offset, MAX_LFS_FILESIZE, 0, false); - break; - } else if (!bkey_extent_is_data(k.k)) { - next_hole = bch2_seek_pagecache_hole(&inode->v, - max(offset, bkey_start_offset(k.k) << 9), - k.k->p.offset << 9, 0, false); - - if (next_hole < k.k->p.offset << 9) + int ret = bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, + POS(inode->v.i_ino, offset >> 9), + POS(inode->v.i_ino, U64_MAX), + inum.subvol, BTREE_ITER_slots, k, ({ + if (k.k->p.inode != inode->v.i_ino) { + next_hole = bch2_seek_pagecache_hole(&inode->v, + offset, MAX_LFS_FILESIZE, 0, false); break; - } else { - offset = max(offset, bkey_start_offset(k.k) << 9); - } - } - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; + } else if (!bkey_extent_is_data(k.k)) { + next_hole = bch2_seek_pagecache_hole(&inode->v, + max(offset, bkey_start_offset(k.k) << 9), + k.k->p.offset << 9, 0, false); - bch2_trans_put(trans); + if (next_hole < k.k->p.offset << 9) + break; + } else { + offset = max(offset, bkey_start_offset(k.k) << 9); + } + 0; + }))); if (ret) return ret; diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index aea8132d..f5dfb7e4 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -100,7 +100,7 @@ static int bch2_ioc_setflags(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: bch2_write_inode(c, inode, bch2_inode_flags_set, &s, ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); @@ -184,7 +184,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: bch2_set_projid(c, inode, fa.fsx_projid) ?: bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, ATTR_CTIME); diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index bdd9183b..c50457ba 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -152,42 +152,80 @@ int bch2_fs_quota_transfer(struct bch_fs *c, return ret; } -static int bch2_iget5_test(struct inode *vinode, void *p) +static bool subvol_inum_eq(subvol_inum a, subvol_inum b) { - struct bch_inode_info *inode = to_bch_ei(vinode); - subvol_inum *inum = p; - - return inode->ei_subvol == inum->subvol && - inode->ei_inode.bi_inum == inum->inum; + return a.subvol == b.subvol && a.inum == b.inum; } -static int bch2_iget5_set(struct inode *vinode, void *p) +static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg, + const void *obj) { - struct bch_inode_info *inode = to_bch_ei(vinode); - subvol_inum *inum = p; + const struct bch_inode_info *inode = obj; + const subvol_inum *v = arg->key; - inode->v.i_ino = inum->inum; - inode->ei_subvol = inum->subvol; - inode->ei_inode.bi_inum = inum->inum; - return 0; + return !subvol_inum_eq(inode->ei_inum, *v); } -static unsigned bch2_inode_hash(subvol_inum inum) +static const struct rhashtable_params bch2_vfs_inodes_params = { + .head_offset = offsetof(struct bch_inode_info, hash), + .key_offset = offsetof(struct bch_inode_info, ei_inum), + .key_len = sizeof(subvol_inum), + .obj_cmpfn = bch2_vfs_inode_cmp_fn, + .automatic_shrinking = true, +}; + +static void __wait_on_freeing_inode(struct inode *inode) { - return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + wq = bit_waitqueue(&inode->i_state, __I_NEW); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + schedule(); + finish_wait(wq, &wait.wq_entry); } -static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode) +static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) { - subvol_inum inum = inode_inum(inode); - struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v, - bch2_inode_hash(inum), - bch2_iget5_test, - bch2_iget5_set, - &inum)); - BUG_ON(!old); + struct bch_inode_info *inode; +repeat: + inode = rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, + bch2_vfs_inodes_params); + if (inode) { + spin_lock(&inode->v.i_lock); + if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) { + __wait_on_freeing_inode(&inode->v); + goto repeat; + } + __iget(&inode->v); + spin_unlock(&inode->v.i_lock); + } + + return inode; +} + +static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode) +{ + if (test_bit(EI_INODE_HASHED, &inode->ei_flags)) { + int ret = rhashtable_remove_fast(&c->vfs_inodes_table, + &inode->hash, bch2_vfs_inodes_params); + BUG_ON(ret); + clear_bit(EI_INODE_HASHED, &inode->ei_flags); + inode->v.i_hash.pprev = NULL; + } +} + +static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, struct bch_inode_info *inode) +{ + struct bch_inode_info *old = inode; +retry: + if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table, + &inode->hash, + bch2_vfs_inodes_params))) { + old = bch2_inode_hash_find(c, inode->ei_inum); + if (!old) + goto retry; - if (unlikely(old != inode)) { /* * bcachefs doesn't use I_NEW; we have no use for it since we * only insert fully created inodes in the inode hash table. But @@ -203,16 +241,14 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino discard_new_inode(&inode->v); inode = old; } else { + set_bit(EI_INODE_HASHED, &inode->ei_flags); + inode_fake_hash(&inode->v); + + inode_sb_list_add(&inode->v); + mutex_lock(&c->vfs_inodes_lock); list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); mutex_unlock(&c->vfs_inodes_lock); - /* - * Again, I_NEW makes no sense for bcachefs. This is only needed - * for clearing I_NEW, but since the inode was already fully - * created and initialized we didn't actually want - * inode_insert5() to set it for us. - */ - unlock_new_inode(&inode->v); } return inode; @@ -245,7 +281,6 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c) inode->ei_flags = 0; mutex_init(&inode->ei_quota_lock); memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); - inode->v.i_state = 0; if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) { kmem_cache_free(bch2_inode_cache, inode); @@ -279,11 +314,7 @@ static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) { - struct bch_inode_info *inode = - to_bch_ei(ilookup5_nowait(c->vfs_sb, - bch2_inode_hash(inum), - bch2_iget5_test, - &inum)); + struct bch_inode_info *inode = bch2_inode_hash_find(c, inum); if (inode) return &inode->v; @@ -297,7 +328,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); if (!ret) { bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); - inode = bch2_inode_insert(c, inode); + inode = bch2_inode_hash_insert(c, inode); } bch2_trans_put(trans); @@ -345,7 +376,7 @@ __bch2_create(struct mnt_idmap *idmap, retry: bch2_trans_begin(trans); - ret = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?: + ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?: bch2_create_trans(trans, inode_inum(dir), &dir_u, &inode_u, !(flags & BCH_CREATE_TMPFILE) @@ -359,7 +390,7 @@ retry: if (unlikely(ret)) goto err_before_quota; - inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; + inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol; inum.inum = inode_u.bi_inum; ret = bch2_subvolume_get(trans, inum.subvol, true, @@ -390,7 +421,7 @@ err_before_quota: * bch2_trans_exit() and dropping locks, else we could race with another * thread pulling the inode in and modifying it: */ - inode = bch2_inode_insert(c, inode); + inode = bch2_inode_hash_insert(c, inode); bch2_trans_put(trans); err: posix_acl_release(default_acl); @@ -430,11 +461,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, if (ret) goto err; - struct bch_inode_info *inode = - to_bch_ei(ilookup5_nowait(c->vfs_sb, - bch2_inode_hash(inum), - bch2_iget5_test, - &inum)); + struct bch_inode_info *inode = bch2_inode_hash_find(c, inum); if (inode) goto out; @@ -464,7 +491,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, } bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); - inode = bch2_inode_insert(c, inode); + inode = bch2_inode_hash_insert(c, inode); out: bch2_trans_iter_exit(trans, &dirent_iter); printbuf_exit(&buf); @@ -551,8 +578,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, lockdep_assert_held(&inode->v.i_rwsem); - ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?: - bch2_subvol_is_ro(c, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: + bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) return bch2_err_class(ret); @@ -608,7 +635,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) struct bch_inode_info *dir= to_bch_ei(vdir); struct bch_fs *c = dir->v.i_sb->s_fs_info; - int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?: + int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: __bch2_unlink(vdir, dentry, false); return bch2_err_class(ret); } @@ -691,8 +718,8 @@ static int bch2_rename2(struct mnt_idmap *idmap, trans = bch2_trans_get(c); - ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?: - bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol); + ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?: + bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol); if (ret) goto err; @@ -893,7 +920,7 @@ static int bch2_getattr(struct mnt_idmap *idmap, stat->blksize = block_bytes(c); stat->blocks = inode->v.i_blocks; - stat->subvol = inode->ei_subvol; + stat->subvol = inode->ei_inum.subvol; stat->result_mask |= STATX_SUBVOL; if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) { @@ -935,7 +962,7 @@ static int bch2_setattr(struct mnt_idmap *idmap, lockdep_assert_held(&inode->v.i_rwsem); - ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: setattr_prepare(idmap, dentry, iattr); if (ret) return ret; @@ -1028,7 +1055,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct bkey_buf cur, prev; unsigned offset_into_extent, sectors; bool have_extent = false; - u32 snapshot; int ret = 0; ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); @@ -1044,21 +1070,30 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_bkey_buf_init(&cur); bch2_bkey_buf_init(&prev); trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot); - if (ret) - goto err; bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(ei->v.i_ino, start, snapshot), 0); + POS(ei->v.i_ino, start), 0); - while (!(ret = btree_trans_too_many_iters(trans)) && - (k = bch2_btree_iter_peek_upto(&iter, end)).k && - !(ret = bkey_err(k))) { + while (true) { enum btree_id data_btree = BTREE_ID_extents; + bch2_trans_begin(trans); + + u32 snapshot; + ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); + + k = bch2_btree_iter_peek_upto(&iter, end); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k) + break; + if (!bkey_extent_is_data(k.k) && k.k->type != KEY_TYPE_reservation) { bch2_btree_iter_advance(&iter); @@ -1102,16 +1137,12 @@ retry: bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, iter.pos.offset + sectors)); - - ret = bch2_trans_relock(trans); - if (ret) +err: + if (ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; } - start = iter.pos.offset; bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; if (!ret && have_extent) { bch2_trans_unlock(trans); @@ -1167,7 +1198,7 @@ static int bch2_open(struct inode *vinode, struct file *file) struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret = bch2_subvol_is_ro(c, inode->ei_subvol); + int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol); if (ret) return ret; } @@ -1201,7 +1232,7 @@ static const struct inode_operations bch_file_inode_operations = { .fiemap = bch2_fiemap, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_acl = bch2_get_acl, + .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; @@ -1221,7 +1252,7 @@ static const struct inode_operations bch_dir_inode_operations = { .tmpfile = bch2_tmpfile, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_acl = bch2_get_acl, + .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; @@ -1243,7 +1274,7 @@ static const struct inode_operations bch_symlink_inode_operations = { .setattr = bch2_setattr, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_acl = bch2_get_acl, + .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; @@ -1253,7 +1284,7 @@ static const struct inode_operations bch_special_inode_operations = { .setattr = bch2_setattr, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_acl = bch2_get_acl, + .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; @@ -1299,8 +1330,8 @@ static int bcachefs_fid_valid(int fh_len, int fh_type) static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) { return (struct bcachefs_fid) { - .inum = inode->ei_inode.bi_inum, - .subvol = inode->ei_subvol, + .inum = inode->ei_inum.inum, + .subvol = inode->ei_inum.subvol, .gen = inode->ei_inode.bi_generation, }; } @@ -1385,7 +1416,7 @@ static struct dentry *bch2_get_parent(struct dentry *child) struct bch_fs *c = inode->v.i_sb->s_fs_info; subvol_inum parent_inum = { .subvol = inode->ei_inode.bi_parent_subvol ?: - inode->ei_subvol, + inode->ei_inum.subvol, .inum = inode->ei_inode.bi_dir, }; @@ -1421,7 +1452,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child retry: bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot); if (ret) goto err; @@ -1452,8 +1483,7 @@ retry: if (ret) goto err; - if (target.subvol == inode->ei_subvol && - target.inum == inode->ei_inode.bi_inum) + if (subvol_inum_eq(target, inode->ei_inum)) goto found; } else { /* @@ -1474,8 +1504,7 @@ retry: if (ret) continue; - if (target.subvol == inode->ei_subvol && - target.inum == inode->ei_inode.bi_inum) + if (subvol_inum_eq(target, inode->ei_inum)) goto found; } } @@ -1512,7 +1541,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *bi, struct bch_subvolume *subvol) { - bch2_iget5_set(&inode->v, &inum); + inode->v.i_ino = inum.inum; + inode->ei_inum = inum; + inode->ei_inode.bi_inum = inum.inum; bch2_inode_update_after_write(trans, inode, bi, ~0); inode->v.i_blocks = bi->bi_sectors; @@ -1524,7 +1555,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, inode->ei_flags = 0; inode->ei_quota_reserved = 0; inode->ei_qid = bch_qid(bi); - inode->ei_subvol = inum.subvol; if (BCH_SUBVOLUME_SNAP(subvol)) set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); @@ -1592,6 +1622,8 @@ static void bch2_evict_inode(struct inode *vinode) struct bch_fs *c = vinode->i_sb->s_fs_info; struct bch_inode_info *inode = to_bch_ei(vinode); + bch2_inode_hash_remove(c, inode); + truncate_inode_pages_final(&inode->v.i_data); clear_inode(&inode->v); @@ -1633,7 +1665,7 @@ again: mutex_lock(&c->vfs_inodes_lock); list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) { - if (!snapshot_list_has_id(s, inode->ei_subvol)) + if (!snapshot_list_has_id(s, inode->ei_inum.subvol)) continue; if (!(inode->v.i_state & I_DONTCACHE) && @@ -2121,6 +2153,17 @@ static int bch2_init_fs_context(struct fs_context *fc) return 0; } +void bch2_fs_vfs_exit(struct bch_fs *c) +{ + if (c->vfs_inodes_table.tbl) + rhashtable_destroy(&c->vfs_inodes_table); +} + +int bch2_fs_vfs_init(struct bch_fs *c) +{ + return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params); +} + static struct file_system_type bcache_fs_type = { .owner = THIS_MODULE, .name = "bcachefs", diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index c3af7225..f8f88781 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -13,6 +13,9 @@ struct bch_inode_info { struct inode v; + struct rhash_head hash; + subvol_inum ei_inum; + struct list_head ei_vfs_inode_list; unsigned long ei_flags; @@ -24,8 +27,6 @@ struct bch_inode_info { struct mutex ei_quota_lock; struct bch_qid ei_qid; - u32 ei_subvol; - /* * When we've been doing nocow writes we'll need to issue flushes to the * underlying block devices @@ -50,10 +51,7 @@ struct bch_inode_info { static inline subvol_inum inode_inum(struct bch_inode_info *inode) { - return (subvol_inum) { - .subvol = inode->ei_subvol, - .inum = inode->ei_inode.bi_inum, - }; + return inode->ei_inum; } /* @@ -67,6 +65,7 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode) * those: */ #define EI_INODE_SNAPSHOT 1 +#define EI_INODE_HASHED 2 #define to_bch_ei(_inode) \ container_of_or_null(_inode, struct bch_inode_info, v) @@ -187,6 +186,9 @@ int __bch2_unlink(struct inode *, struct dentry *, bool); void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *); +void bch2_fs_vfs_exit(struct bch_fs *); +int bch2_fs_vfs_init(struct bch_fs *); + void bch2_vfs_exit(void); int bch2_vfs_init(void); @@ -196,6 +198,10 @@ int bch2_vfs_init(void); static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) {} + +static inline void bch2_fs_vfs_exit(struct bch_fs *c) {} +static inline int bch2_fs_vfs_init(struct bch_fs *c) { return 0; } + static inline void bch2_vfs_exit(void) {} static inline int bch2_vfs_init(void) { return 0; } diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index cc4f0963..9138944c 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -283,6 +283,7 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 inode_snapshot) { + struct bch_fs *c = trans->c; struct bch_hash_info dir_hash; struct bch_inode_unpacked lostfound; char name_buf[20]; @@ -317,7 +318,7 @@ static int reattach_inode(struct btree_trans *trans, return ret; } - dir_hash = bch2_hash_info_init(trans->c, &lostfound); + dir_hash = bch2_hash_info_init(c, &lostfound); name = (struct qstr) QSTR(name_buf); @@ -330,8 +331,10 @@ static int reattach_inode(struct btree_trans *trans, inode->bi_subvol ?: inode->bi_inum, &dir_offset, STR_HASH_must_create); - if (ret) + if (ret) { + bch_err_msg(c, ret, "error creating dirent"); return ret; + } inode->bi_dir = lostfound.bi_inum; inode->bi_dir_offset = dir_offset; diff --git a/libbcachefs/io_misc.c b/libbcachefs/io_misc.c index 2cf62977..177ed331 100644 --- a/libbcachefs/io_misc.c +++ b/libbcachefs/io_misc.c @@ -126,11 +126,7 @@ err_noprint: if (closure_nr_remaining(&cl) != 1) { bch2_trans_unlock_long(trans); - - if (closure_sync_timeout(&cl, HZ * 10)) { - bch2_print_allocator_stuck(c); - closure_sync(&cl); - } + bch2_wait_on_allocator(c, &cl); } return ret; diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c index 4531c9ab..ce27ba1f 100644 --- a/libbcachefs/io_read.c +++ b/libbcachefs/io_read.c @@ -286,7 +286,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, */ bool promote_full = (failed || *read_full || - READ_ONCE(c->promote_whole_extents)); + READ_ONCE(c->opts.promote_whole_extents)); /* data might have to be decompressed in the write path: */ unsigned sectors = promote_full ? max(pick->crc.compressed_size, pick->crc.live_size) @@ -406,6 +406,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->read_pos, BTREE_ITER_slots); retry: + bch2_trans_begin(trans); rbio->bio.bi_status = 0; k = bch2_btree_iter_peek_slot(&iter); @@ -1213,10 +1214,6 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, swap(bvec_iter.bi_size, bytes); bio_advance_iter(&rbio->bio, &bvec_iter, bytes); - - ret = btree_trans_too_many_iters(trans); - if (ret) - goto err; err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c index d31c8d00..1d4761d1 100644 --- a/libbcachefs/io_write.c +++ b/libbcachefs/io_write.c @@ -1503,10 +1503,7 @@ err: if ((op->flags & BCH_WRITE_SYNC) || (!(op->flags & BCH_WRITE_SUBMITTED) && !(op->flags & BCH_WRITE_IN_WORKER))) { - if (closure_sync_timeout(&op->cl, HZ * 10)) { - bch2_print_allocator_stuck(c); - closure_sync(&op->cl); - } + bch2_wait_on_allocator(c, &op->cl); __bch2_write_index(op); diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index d8a63074..70b998d9 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -206,6 +206,7 @@ void bch2_journal_space_available(struct journal *j) if (nr_online < metadata_replicas_required(c)) { struct printbuf buf = PRINTBUF; + buf.atomic++; prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" "rw journal devs:", nr_online, metadata_replicas_required(c)); diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 1d4ffd01..3d83bcdc 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -265,6 +265,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Enable inline data extents") \ + x(promote_whole_extents, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_PROMOTE_WHOLE_EXTENTS, true, \ + NULL, "Promote whole extents, instead of just part being read")\ x(acl, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ @@ -393,6 +398,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ NULL, "Log transaction function names in journal") \ + x(allocator_stuck_timeout, u16, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(0, U16_MAX), \ + BCH_SB_ALLOCATOR_STUCK_TIMEOUT, 30, \ + NULL, "Default timeout in seconds for stuck allocator messages")\ x(noexcl, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/libbcachefs/sb-members.c b/libbcachefs/sb-members.c index 39196f2a..b4ea6490 100644 --- a/libbcachefs/sb-members.c +++ b/libbcachefs/sb-members.c @@ -464,3 +464,12 @@ void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k) __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c)); } } + +unsigned bch2_sb_nr_devices(const struct bch_sb *sb) +{ + unsigned nr = 0; + + for (unsigned i = 0; i < sb->nr_devices; i++) + nr += bch2_member_exists((struct bch_sb *) sb, i); + return nr; +} diff --git a/libbcachefs/sb-members.h b/libbcachefs/sb-members.h index dd93192e..f307f285 100644 --- a/libbcachefs/sb-members.h +++ b/libbcachefs/sb-members.h @@ -307,6 +307,8 @@ static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev) return false; } +unsigned bch2_sb_nr_devices(const struct bch_sb *); + static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) { return (struct bch_member_cpu) { diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h index afa5e871..03b8c6fc 100644 --- a/libbcachefs/subvolume.h +++ b/libbcachefs/subvolume.h @@ -32,6 +32,51 @@ int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); int bch2_subvol_is_ro_trans(struct btree_trans *, u32); int bch2_subvol_is_ro(struct bch_fs *, u32); +static inline struct bkey_s_c +bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos end, + u32 subvolid, unsigned flags) +{ + u32 snapshot; + int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot); + if (ret) + return bkey_s_c_err(ret); + + bch2_btree_iter_set_snapshot(iter, snapshot); + return bch2_btree_iter_peek_upto_type(iter, end, flags); +} + +#define for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \ + _end, _subvolid, _flags, _k, _do) \ +({ \ + struct bkey_s_c _k; \ + int _ret3 = 0; \ + \ + do { \ + _ret3 = lockrestart_do(_trans, ({ \ + (_k) = bch2_btree_iter_peek_in_subvolume_upto_type(&(_iter), \ + _end, _subvolid, (_flags)); \ + if (!(_k).k) \ + break; \ + \ + bkey_err(_k) ?: (_do); \ + })); \ + } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret3; \ +}) + +#define for_each_btree_key_in_subvolume_upto(_trans, _iter, _btree_id, \ + _start, _end, _subvolid, _flags, _k, _do) \ +({ \ + struct btree_iter _iter; \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ + for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \ + _end, _subvolid, _flags, _k, _do); \ +}) + int bch2_delete_dead_snapshots(struct bch_fs *); void bch2_delete_dead_snapshots_async(struct bch_fs *); diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 8bc81983..d86d5dae 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -414,6 +414,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb)) SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version)); + + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 && + !BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb)) + SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30); + + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2) + SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); } for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { @@ -1288,15 +1295,9 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, bool print_layout, unsigned fields) { - u64 fields_have = 0; - unsigned nr_devices = 0; - if (!out->nr_tabstops) printbuf_tabstop_push(out, 44); - for (int i = 0; i < sb->nr_devices; i++) - nr_devices += bch2_member_exists(sb, i); - prt_printf(out, "External UUID:\t"); pr_uuid(out, sb->user_uuid.b); prt_newline(out); @@ -1352,9 +1353,10 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, prt_newline(out); prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb)); - prt_printf(out, "Devices:\t%u\n", nr_devices); + prt_printf(out, "Devices:\t%u\n", bch2_sb_nr_devices(sb)); prt_printf(out, "Sections:\t"); + u64 fields_have = 0; vstruct_for_each(sb, f) fields_have |= 1 << le32_to_cpu(f->type); prt_bitflags(out, bch2_sb_fields, fields_have); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 0455a100..d8adf465 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -543,6 +543,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_fs_fs_io_direct_exit(c); bch2_fs_fs_io_buffered_exit(c); bch2_fs_fsio_exit(c); + bch2_fs_vfs_exit(c); bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); bch2_fs_nocow_locking_exit(c); @@ -810,7 +811,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->copy_gc_enabled = 1; c->rebalance.enabled = 1; - c->promote_whole_extents = true; c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; @@ -926,6 +926,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_encryption_init(c) ?: bch2_fs_compress_init(c) ?: bch2_fs_ec_init(c) ?: + bch2_fs_vfs_init(c) ?: bch2_fs_fsio_init(c) ?: bch2_fs_fs_io_buffered_init(c) ?: bch2_fs_fs_io_direct_init(c); @@ -1193,7 +1194,6 @@ static void bch2_dev_free(struct bch_dev *ca) if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); - kfree(ca->buckets_nouse); bch2_free_super(&ca->disk_sb); bch2_dev_allocator_background_exit(ca); bch2_dev_journal_exit(ca); diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index d4c5bfca..9f8ca6a5 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -219,7 +219,6 @@ read_attribute(copy_gc_wait); rw_attribute(rebalance_enabled); sysfs_pd_controller_attribute(rebalance); read_attribute(rebalance_status); -rw_attribute(promote_whole_extents); read_attribute(new_stripes); @@ -347,8 +346,6 @@ SHOW(bch2_fs) if (attr == &sysfs_rebalance_status) bch2_rebalance_status_to_text(out, c); - sysfs_print(promote_whole_extents, c->promote_whole_extents); - /* Debugging: */ if (attr == &sysfs_journal_debug) @@ -367,7 +364,7 @@ SHOW(bch2_fs) bch2_stripes_heap_to_text(out, c); if (attr == &sysfs_open_buckets) - bch2_open_buckets_to_text(out, c); + bch2_open_buckets_to_text(out, c, NULL); if (attr == &sysfs_open_buckets_partial) bch2_open_buckets_partial_to_text(out, c); @@ -436,8 +433,6 @@ STORE(bch2_fs) sysfs_pd_controller_store(rebalance, &c->rebalance.pd); - sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); - /* Debugging: */ if (!test_bit(BCH_FS_started, &c->flags)) @@ -514,7 +509,7 @@ struct attribute *bch2_fs_files[] = { &sysfs_btree_cache_size, &sysfs_btree_write_stats, - &sysfs_promote_whole_extents, + &sysfs_rebalance_status, &sysfs_compression_stats, @@ -614,7 +609,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_copy_gc_wait, &sysfs_rebalance_enabled, - &sysfs_rebalance_status, sysfs_pd_controller_files(rebalance), &sysfs_moving_ctxts, @@ -811,6 +805,9 @@ SHOW(bch2_dev) if (attr == &sysfs_alloc_debug) bch2_dev_alloc_debug_to_text(out, ca); + if (attr == &sysfs_open_buckets) + bch2_open_buckets_to_text(out, c, ca); + return 0; } @@ -877,6 +874,7 @@ struct attribute *bch2_dev_files[] = { /* debug: */ &sysfs_alloc_debug, + &sysfs_open_buckets, NULL }; diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 89fdaf2b..332bc17f 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * random utiility code, for bcache but in theory not specific to bcache + * random utility code, for bcache but in theory not specific to bcache * * Copyright 2010, 2011 Kent Overstreet * Copyright 2012 Google, Inc. diff --git a/libbcachefs/util.h b/libbcachefs/util.h index bd9db6bf..fb02c1c3 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -54,17 +55,9 @@ static inline size_t buf_pages(void *p, size_t len) PAGE_SIZE); } -#define HEAP(type) \ -struct { \ - size_t size, used; \ - type *data; \ -} - -#define DECLARE_HEAP(type, name) HEAP(type) name - #define init_heap(heap, _size, gfp) \ ({ \ - (heap)->used = 0; \ + (heap)->nr = 0; \ (heap)->size = (_size); \ (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\ (gfp)); \ @@ -76,113 +69,6 @@ do { \ (heap)->data = NULL; \ } while (0) -#define heap_set_backpointer(h, i, _fn) \ -do { \ - void (*fn)(typeof(h), size_t) = _fn; \ - if (fn) \ - fn(h, i); \ -} while (0) - -#define heap_swap(h, i, j, set_backpointer) \ -do { \ - swap((h)->data[i], (h)->data[j]); \ - heap_set_backpointer(h, i, set_backpointer); \ - heap_set_backpointer(h, j, set_backpointer); \ -} while (0) - -#define heap_peek(h) \ -({ \ - EBUG_ON(!(h)->used); \ - (h)->data[0]; \ -}) - -#define heap_full(h) ((h)->used == (h)->size) - -#define heap_sift_down(h, i, cmp, set_backpointer) \ -do { \ - size_t _c, _j = i; \ - \ - for (; _j * 2 + 1 < (h)->used; _j = _c) { \ - _c = _j * 2 + 1; \ - if (_c + 1 < (h)->used && \ - cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \ - _c++; \ - \ - if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \ - break; \ - heap_swap(h, _c, _j, set_backpointer); \ - } \ -} while (0) - -#define heap_sift_up(h, i, cmp, set_backpointer) \ -do { \ - while (i) { \ - size_t p = (i - 1) / 2; \ - if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \ - break; \ - heap_swap(h, i, p, set_backpointer); \ - i = p; \ - } \ -} while (0) - -#define __heap_add(h, d, cmp, set_backpointer) \ -({ \ - size_t _i = (h)->used++; \ - (h)->data[_i] = d; \ - heap_set_backpointer(h, _i, set_backpointer); \ - \ - heap_sift_up(h, _i, cmp, set_backpointer); \ - _i; \ -}) - -#define heap_add(h, d, cmp, set_backpointer) \ -({ \ - bool _r = !heap_full(h); \ - if (_r) \ - __heap_add(h, d, cmp, set_backpointer); \ - _r; \ -}) - -#define heap_add_or_replace(h, new, cmp, set_backpointer) \ -do { \ - if (!heap_add(h, new, cmp, set_backpointer) && \ - cmp(h, new, heap_peek(h)) >= 0) { \ - (h)->data[0] = new; \ - heap_set_backpointer(h, 0, set_backpointer); \ - heap_sift_down(h, 0, cmp, set_backpointer); \ - } \ -} while (0) - -#define heap_del(h, i, cmp, set_backpointer) \ -do { \ - size_t _i = (i); \ - \ - BUG_ON(_i >= (h)->used); \ - (h)->used--; \ - if ((_i) < (h)->used) { \ - heap_swap(h, _i, (h)->used, set_backpointer); \ - heap_sift_up(h, _i, cmp, set_backpointer); \ - heap_sift_down(h, _i, cmp, set_backpointer); \ - } \ -} while (0) - -#define heap_pop(h, d, cmp, set_backpointer) \ -({ \ - bool _r = (h)->used; \ - if (_r) { \ - (d) = (h)->data[0]; \ - heap_del(h, 0, cmp, set_backpointer); \ - } \ - _r; \ -}) - -#define heap_resort(heap, cmp, set_backpointer) \ -do { \ - ssize_t _i; \ - for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ - heap_sift_down(heap, _i, cmp, set_backpointer); \ -} while (0) - #define ANYSINT_MAX(t) \ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index c11bf6da..780781f5 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -296,54 +296,23 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct bch_fs *c = dentry->d_sb->s_fs_info; struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; u64 offset = 0, inum = inode->ei_inode.bi_inum; - u32 snapshot; - int ret; -retry: - bch2_trans_begin(trans); - iter = (struct btree_iter) { NULL }; - ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); - if (ret) - goto err; + int ret = bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_xattrs, + POS(inum, offset), + POS(inum, U64_MAX), + inode->ei_inum.subvol, 0, k, ({ + if (k.k->type != KEY_TYPE_xattr) + continue; - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs, - SPOS(inum, offset, snapshot), - POS(inum, U64_MAX), 0, k, ret) { - if (k.k->type != KEY_TYPE_xattr) - continue; + bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); + }))) ?: + bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false) ?: + bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); - ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); - if (ret) - break; - } - - offset = iter.pos.offset; - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); - - if (ret) - goto out; - - ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false); - if (ret) - goto out; - - ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); - if (ret) - goto out; - - return buf.used; -out: - return bch2_err_class(ret); + return ret ? bch2_err_class(ret) : buf.used; } static int bch2_xattr_get_handler(const struct xattr_handler *handler,