From d2a118d921dfdf43adfa37aed1d9df62925bda66 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Nov 2020 11:26:00 -0500 Subject: [PATCH] Update bcachefs sources to 1d669389f7 bcachefs: use a radix tree for inum bitmap in fsck --- .bcachefs_revision | 2 +- cmd_migrate.c | 2 +- include/linux/cpumask.h | 2 + include/linux/page.h | 2 + include/trace/events/bcachefs.h | 43 +++- libbcachefs/alloc_background.h | 2 +- libbcachefs/bcachefs.h | 24 ++- libbcachefs/bcachefs_format.h | 17 +- libbcachefs/bkey.c | 8 +- libbcachefs/bkey.h | 47 +---- libbcachefs/bkey_methods.c | 2 +- libbcachefs/bkey_sort.c | 10 +- libbcachefs/bset.c | 32 ++- libbcachefs/bset.h | 27 +-- libbcachefs/btree_cache.c | 19 +- libbcachefs/btree_cache.h | 2 +- libbcachefs/btree_gc.c | 12 +- libbcachefs/btree_io.c | 8 +- libbcachefs/btree_iter.c | 197 +++++++++++------- libbcachefs/btree_iter.h | 7 +- libbcachefs/btree_key_cache.c | 13 +- libbcachefs/btree_key_cache.h | 3 + libbcachefs/btree_locking.h | 8 +- libbcachefs/btree_types.h | 13 +- libbcachefs/btree_update_interior.c | 2 +- libbcachefs/btree_update_leaf.c | 21 +- libbcachefs/compress.c | 2 +- libbcachefs/debug.c | 2 +- libbcachefs/debug.h | 33 +-- libbcachefs/ec.c | 2 +- libbcachefs/extents.c | 4 +- libbcachefs/fs-common.c | 4 +- libbcachefs/fs-io.c | 112 ++-------- libbcachefs/fs-io.h | 3 +- libbcachefs/fs.c | 62 ++++-- libbcachefs/fsck.c | 45 ++-- libbcachefs/inode.c | 311 +++++++++++++++++++--------- libbcachefs/inode.h | 21 +- libbcachefs/io.c | 4 +- libbcachefs/journal.c | 8 +- libbcachefs/journal_reclaim.c | 82 ++++---- libbcachefs/recovery.c | 2 +- libbcachefs/super.c | 18 +- libbcachefs/sysfs.c | 19 -- libbcachefs/util.c | 2 +- libbcachefs/util.h | 11 - libbcachefs/varint.c | 42 ++++ libbcachefs/varint.h | 8 + 48 files changed, 723 insertions(+), 599 deletions(-) create mode 100644 libbcachefs/varint.c create mode 100644 libbcachefs/varint.h diff --git a/.bcachefs_revision b/.bcachefs_revision index dc226f84..dc583047 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -8436db7aac9ced2118bf19b8f1bf3682f479d17e +1d669389f79de8571732c13fdf4d23039e2308fd diff --git a/cmd_migrate.c b/cmd_migrate.c index 797c51e0..42fbc2bc 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -122,7 +122,7 @@ static void update_inode(struct bch_fs *c, struct bkey_inode_buf packed; int ret; - bch2_inode_pack(&packed, inode); + bch2_inode_pack(c, &packed, inode); ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, NULL, NULL, 0); if (ret) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 024d645c..bfab7ea7 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -10,6 +10,8 @@ #define cpu_present(cpu) ((cpu) == 0) #define cpu_active(cpu) ((cpu) == 0) +#define raw_smp_processor_id() 0U + #define for_each_cpu(cpu, mask) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_not(cpu, mask) \ diff --git a/include/linux/page.h b/include/linux/page.h index 87be064f..310b3eda 100644 --- a/include/linux/page.h +++ b/include/linux/page.h @@ -21,6 +21,8 @@ struct page; #define kmap_atomic(page) page_address(page) #define kunmap_atomic(addr) do {} while (0) +#define PageHighMem(page) false + static const char zero_page[PAGE_SIZE]; #define ZERO_PAGE(o) ((struct page *) &zero_page[0]) diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index 9b4e8295..ba2c5555 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -536,9 +536,46 @@ 
DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, TP_ARGS(ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_would_deadlock, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) +TRACE_EVENT(trans_restart_would_deadlock, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, + unsigned reason, + enum btree_id have_btree_id, + unsigned have_iter_type, + enum btree_id want_btree_id, + unsigned want_iter_type), + TP_ARGS(trans_ip, caller_ip, reason, + have_btree_id, have_iter_type, + want_btree_id, want_iter_type), + + TP_STRUCT__entry( + __field(unsigned long, trans_ip ) + __field(unsigned long, caller_ip ) + __field(u8, reason ) + __field(u8, have_btree_id ) + __field(u8, have_iter_type ) + __field(u8, want_btree_id ) + __field(u8, want_iter_type ) + ), + + TP_fast_assign( + __entry->trans_ip = trans_ip; + __entry->caller_ip = caller_ip; + __entry->reason = reason; + __entry->have_btree_id = have_btree_id; + __entry->have_iter_type = have_iter_type; + __entry->want_btree_id = want_btree_id; + __entry->want_iter_type = want_iter_type; + ), + + TP_printk("%pF %pF because %u have %u:%u want %u:%u", + (void *) __entry->trans_ip, + (void *) __entry->caller_ip, + __entry->reason, + __entry->have_btree_id, + __entry->have_iter_type, + __entry->want_btree_id, + __entry->want_iter_type) ); TRACE_EVENT(trans_restart_iters_realloced, diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index cbaff56f..d10ff56e 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -76,7 +76,7 @@ static inline void bch2_wake_allocator(struct bch_dev *ca) static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, size_t bucket) { - if (expensive_debug_checks(c)) { + if (bch2_expensive_debug_checks) { size_t iter; long i; unsigned j; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 29f41163..35311dbb 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -265,6 +265,8 @@ do { \ BCH_DEBUG_PARAM(debug_check_bkeys, \ "Run bkey_debugcheck (primarily checking GC/allocation "\ "information) when iterating over keys") \ + BCH_DEBUG_PARAM(debug_check_btree_accounting, \ + "Verify btree accounting for keys within a node") \ BCH_DEBUG_PARAM(verify_btree_ondisk, \ "Reread btree nodes at various points to verify the " \ "mergesort in the read path against modifications " \ @@ -295,6 +297,16 @@ do { \ #define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() #endif +#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; +BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + +#ifndef CONFIG_BCACHEFS_DEBUG +#define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name; +BCH_DEBUG_PARAMS_DEBUG() +#undef BCH_DEBUG_PARAM +#endif + #define BCH_TIME_STATS() \ x(btree_node_mem_alloc) \ x(btree_node_split) \ @@ -529,6 +541,10 @@ struct journal_keys { u64 journal_seq_base; }; +struct btree_iter_buf { + struct btree_iter *iter; +}; + struct bch_fs { struct closure cl; @@ -624,6 +640,7 @@ struct bch_fs { struct mutex btree_trans_lock; struct list_head btree_trans_list; mempool_t btree_iters_pool; + struct btree_iter_buf __percpu *btree_iters_bufs; struct btree_key_cache btree_key_cache; @@ -801,7 +818,8 @@ struct bch_fs { struct mutex verify_lock; #endif - u64 unused_inode_hint; + u64 *unused_inode_hints; + unsigned inode_shard_bits; /* * A btree node on disk could have too many bsets for an iterator to fit @@ -826,10 +844,6 @@ struct bch_fs { unsigned copy_gc_enabled:1; bool promote_whole_extents; -#define 
BCH_DEBUG_PARAM(name, description) bool name; - BCH_DEBUG_PARAMS_ALL() -#undef BCH_DEBUG_PARAM - struct time_stats times[BCH_TIME_STAT_NR]; }; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 2926c648..94b54185 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -669,10 +669,10 @@ struct bch_inode_generation { } __attribute__((packed, aligned(8))); #define BCH_INODE_FIELDS() \ - x(bi_atime, 64) \ - x(bi_ctime, 64) \ - x(bi_mtime, 64) \ - x(bi_otime, 64) \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ + x(bi_otime, 96) \ x(bi_size, 64) \ x(bi_sectors, 64) \ x(bi_uid, 32) \ @@ -739,7 +739,8 @@ enum { #define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); -LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32); +LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); +LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); /* Dirents */ @@ -1330,13 +1331,15 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); x(btree_ptr_v2, 11) \ x(extents_above_btree_updates, 12) \ x(btree_updates_journalled, 13) \ - x(reflink_inline_data, 14) + x(reflink_inline_data, 14) \ + x(new_varint, 15) #define BCH_SB_FEATURES_ALL \ ((1ULL << BCH_FEATURE_new_siphash)| \ (1ULL << BCH_FEATURE_new_extent_overwrite)| \ (1ULL << BCH_FEATURE_btree_ptr_v2)| \ - (1ULL << BCH_FEATURE_extents_above_btree_updates)) + (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ + (1ULL << BCH_FEATURE_new_varint))\ enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c index 4d0c9129..c06d0a96 100644 --- a/libbcachefs/bkey.c +++ b/libbcachefs/bkey.c @@ -411,7 +411,7 @@ static bool bkey_packed_successor(struct bkey_packed *out, if ((*p & mask) != mask) { *p += 1ULL << offset; - EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0); + EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0); return true; } @@ -1054,9 +1054,9 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, } __pure __flatten -int __bch2_bkey_cmp_packed(const struct bkey_packed *l, - const struct bkey_packed *r, - const struct btree *b) +int bch2_bkey_cmp_packed(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) { struct bkey unpacked; diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 80ea488d..2d2c6403 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -67,13 +67,6 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) #define bkey_whiteout(_k) \ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard) -#define bkey_packed_typecheck(_k) \ -({ \ - BUILD_BUG_ON(!type_is(_k, struct bkey *) && \ - !type_is(_k, struct bkey_packed *)); \ - type_is(_k, struct bkey_packed *); \ -}) - enum bkey_lr_packed { BKEY_PACKED_BOTH, BKEY_PACKED_RIGHT, @@ -81,9 +74,6 @@ enum bkey_lr_packed { BKEY_PACKED_NONE, }; -#define bkey_lr_packed_typecheck(_l, _r) \ - (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1)) - #define bkey_lr_packed(_l, _r) \ ((_l)->format + ((_r)->format << 1)) @@ -132,9 +122,9 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, const struct bpos *); __pure -int __bch2_bkey_cmp_packed(const struct bkey_packed *, - const struct bkey_packed *, - const struct btree *); +int bch2_bkey_cmp_packed(const struct btree *, + const struct bkey_packed *, + const struct bkey_packed *); __pure int __bch2_bkey_cmp_left_packed(const 
struct btree *, @@ -160,37 +150,6 @@ static inline int bkey_cmp_left_packed_byval(const struct btree *b, return bkey_cmp_left_packed(b, l, &r); } -/* - * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to - * skip dispatching on k->format: - */ -#define bkey_cmp_packed(_b, _l, _r) \ -({ \ - int _cmp; \ - \ - switch (bkey_lr_packed_typecheck(_l, _r)) { \ - case BKEY_PACKED_NONE: \ - _cmp = bkey_cmp(((struct bkey *) (_l))->p, \ - ((struct bkey *) (_r))->p); \ - break; \ - case BKEY_PACKED_LEFT: \ - _cmp = bkey_cmp_left_packed((_b), \ - (struct bkey_packed *) (_l), \ - &((struct bkey *) (_r))->p); \ - break; \ - case BKEY_PACKED_RIGHT: \ - _cmp = -bkey_cmp_left_packed((_b), \ - (struct bkey_packed *) (_r), \ - &((struct bkey *) (_l))->p); \ - break; \ - case BKEY_PACKED_BOTH: \ - _cmp = __bch2_bkey_cmp_packed((void *) (_l), \ - (void *) (_r), (_b)); \ - break; \ - } \ - _cmp; \ -}) - #if 1 static __always_inline int bkey_cmp(struct bpos l, struct bpos r) { diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 32849229..99b7fce2 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -236,7 +236,7 @@ enum merge_result bch2_bkey_merge(struct bch_fs *c, const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; enum merge_result ret; - if (key_merging_disabled(c) || + if (bch2_key_merging_disabled || !ops->key_merge || l.k->type != r.k->type || bversion_cmp(l.k->version, r.k->version) || diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c index 839e78d1..99e0a401 100644 --- a/libbcachefs/bkey_sort.c +++ b/libbcachefs/bkey_sort.c @@ -86,7 +86,7 @@ static inline int key_sort_fix_overlapping_cmp(struct btree *b, struct bkey_packed *l, struct bkey_packed *r) { - return bkey_cmp_packed(b, l, r) ?: + return bch2_bkey_cmp_packed(b, l, r) ?: cmp_int((unsigned long) l, (unsigned long) r); } @@ -98,7 +98,7 @@ static inline bool should_drop_next_key(struct sort_iter *iter) * and should be dropped. 
*/ return iter->used >= 2 && - !bkey_cmp_packed(iter->b, + !bch2_bkey_cmp_packed(iter->b, iter->data[0].k, iter->data[1].k); } @@ -223,7 +223,7 @@ static inline int sort_keys_cmp(struct btree *b, struct bkey_packed *l, struct bkey_packed *r) { - return bkey_cmp_packed(b, l, r) ?: + return bch2_bkey_cmp_packed(b, l, r) ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: (int) l->needs_whiteout - (int) r->needs_whiteout; } @@ -245,7 +245,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, continue; while ((next = sort_iter_peek(iter)) && - !bkey_cmp_packed(iter->b, in, next)) { + !bch2_bkey_cmp_packed(iter->b, in, next)) { BUG_ON(in->needs_whiteout && next->needs_whiteout); needs_whiteout |= in->needs_whiteout; @@ -406,7 +406,7 @@ static inline int sort_extents_cmp(struct btree *b, struct bkey_packed *l, struct bkey_packed *r) { - return bkey_cmp_packed(b, l, r) ?: + return bch2_bkey_cmp_packed(b, l, r) ?: (int) bkey_deleted(l) - (int) bkey_deleted(r); } diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index f7c2841e..26716657 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -369,10 +369,10 @@ static struct bkey_float *bkey_float(const struct btree *b, return ro_aux_tree_base(b, t)->f + idx; } -static void bset_aux_tree_verify(struct btree *b) +static void bset_aux_tree_verify(const struct btree *b) { #ifdef CONFIG_BCACHEFS_DEBUG - struct bset_tree *t; + const struct bset_tree *t; for_each_bset(b, t) { if (t->aux_data_offset == U16_MAX) @@ -388,15 +388,13 @@ static void bset_aux_tree_verify(struct btree *b) #endif } -void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks) +void bch2_btree_keys_init(struct btree *b) { unsigned i; b->nsets = 0; memset(&b->nr, 0, sizeof(b->nr)); -#ifdef CONFIG_BCACHEFS_DEBUG - b->expensive_debug_checks = expensive_debug_checks; -#endif + for (i = 0; i < MAX_BSETS; i++) b->set[i].data_offset = U16_MAX; @@ -522,7 +520,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b, struct bkey_packed *k = btree_bkey_first(b, t); unsigned j = 0; - if (!btree_keys_expensive_checks(b)) + if (!bch2_expensive_debug_checks) return; BUG_ON(bset_has_ro_aux_tree(t)); @@ -710,20 +708,20 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, } /* bytes remaining - only valid for last bset: */ -static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t) +static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) { bset_aux_tree_verify(b); return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); } -static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) +static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t) { return __bset_tree_capacity(b, t) / (sizeof(struct bkey_float) + sizeof(u8)); } -static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) +static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t) { return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); } @@ -922,7 +920,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, k = p; } - if (btree_keys_expensive_checks(b)) { + if (bch2_expensive_debug_checks) { BUG_ON(ret >= orig_k); for (i = ret @@ -1227,8 +1225,8 @@ static inline bool bkey_mantissa_bits_dropped(const struct btree *b, __flatten static struct bkey_packed *bset_search_tree(const struct btree *b, - struct bset_tree *t, - struct bpos *search, + const struct bset_tree *t, + const struct bpos *search, const struct bkey_packed *packed_search) { struct 
ro_aux_tree *base = ro_aux_tree_base(b, t); @@ -1345,7 +1343,7 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, bkey_iter_pos_cmp(b, m, search) < 0) m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); - if (btree_keys_expensive_checks(b)) { + if (bch2_expensive_debug_checks) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); BUG_ON(prev && @@ -1601,7 +1599,7 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, void bch2_btree_node_iter_advance(struct btree_node_iter *iter, struct btree *b) { - if (btree_keys_expensive_checks(b)) { + if (bch2_expensive_debug_checks) { bch2_btree_node_iter_verify(iter, b); bch2_btree_node_iter_next_check(iter, b); } @@ -1620,7 +1618,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct bset_tree *t; unsigned end = 0; - if (btree_keys_expensive_checks(b)) + if (bch2_expensive_debug_checks) bch2_btree_node_iter_verify(iter, b); for_each_bset(b, t) { @@ -1656,7 +1654,7 @@ found: iter->data[0].k = __btree_node_key_to_offset(b, prev); iter->data[0].end = end; - if (btree_keys_expensive_checks(b)) + if (bch2_expensive_debug_checks) bch2_btree_node_iter_verify(iter, b); return prev; } diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h index 5921cf68..469294cc 100644 --- a/libbcachefs/bset.h +++ b/libbcachefs/bset.h @@ -5,7 +5,7 @@ #include #include -#include "bcachefs_format.h" +#include "bcachefs.h" #include "bkey.h" #include "bkey_methods.h" #include "btree_types.h" @@ -147,17 +147,6 @@ * first key in that range of bytes again. */ -extern bool bch2_expensive_debug_checks; - -static inline bool btree_keys_expensive_checks(const struct btree *b) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - return bch2_expensive_debug_checks || *b->expensive_debug_checks; -#else - return false; -#endif -} - enum bset_aux_tree_type { BSET_NO_AUX_TREE, BSET_RO_AUX_TREE, @@ -201,17 +190,17 @@ static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree #define BSET_CACHELINE 128 -static inline size_t btree_keys_cachelines(struct btree *b) +static inline size_t btree_keys_cachelines(const struct btree *b) { return (1U << b->byte_order) / BSET_CACHELINE; } -static inline size_t btree_aux_data_bytes(struct btree *b) +static inline size_t btree_aux_data_bytes(const struct btree *b) { return btree_keys_cachelines(b) * 8; } -static inline size_t btree_aux_data_u64s(struct btree *b) +static inline size_t btree_aux_data_u64s(const struct btree *b) { return btree_aux_data_bytes(b) / sizeof(u64); } @@ -228,7 +217,7 @@ __bkey_unpack_key_format_checked(const struct btree *b, compiled_unpack_fn unpack_fn = b->aux_data; unpack_fn(dst, src); - if (btree_keys_expensive_checks(b)) { + if (bch2_expensive_debug_checks) { struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); @@ -366,7 +355,7 @@ static inline struct bset *bset_next_set(struct btree *b, return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); } -void bch2_btree_keys_init(struct btree *, bool *); +void bch2_btree_keys_init(struct btree *); void bch2_bset_init_first(struct btree *, struct bset *); void bch2_bset_init_next(struct bch_fs *, struct btree *, @@ -477,7 +466,7 @@ static inline int bkey_iter_cmp(const struct btree *b, const struct bkey_packed *l, const struct bkey_packed *r) { - return bkey_cmp_packed(b, l, r) + return bch2_bkey_cmp_packed(b, l, r) ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: cmp_int(l, r); } @@ -654,7 +643,7 @@ static inline void 
bch2_verify_insert_pos(struct btree *b, static inline void bch2_verify_btree_nr_keys(struct btree *b) { - if (btree_keys_expensive_checks(b)) + if (bch2_debug_check_btree_accounting) __bch2_verify_btree_nr_keys(b); } diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index bb94fa23..325a1661 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -211,7 +211,7 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) * - unless btree verify mode is enabled, since it runs out of * the post write cleanup: */ - if (verify_btree_ondisk(c)) + if (bch2_verify_btree_ondisk) bch2_btree_node_write(c, b, SIX_LOCK_intent); else __bch2_btree_node_write(c, b, SIX_LOCK_read); @@ -254,7 +254,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, unsigned long freed = 0; unsigned i, flags; - if (btree_shrinker_disabled(c)) + if (bch2_btree_shrinker_disabled) return SHRINK_STOP; /* Return -1 if we can't do anything right now */ @@ -341,7 +341,7 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, btree_cache.shrink); struct btree_cache *bc = &c->btree_cache; - if (btree_shrinker_disabled(c)) + if (bch2_btree_shrinker_disabled) return 0; return btree_cache_can_free(bc) * btree_pages(c); @@ -590,7 +590,7 @@ out: b->sib_u64s[0] = 0; b->sib_u64s[1] = 0; b->whiteout_u64s = 0; - bch2_btree_keys_init(b, &c->expensive_debug_checks); + bch2_btree_keys_init(b); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], start_time); @@ -705,7 +705,8 @@ static int lock_node_check_fn(struct six_lock *lock, void *p) */ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, const struct bkey_i *k, unsigned level, - enum six_lock_type lock_type) + enum six_lock_type lock_type, + unsigned long trace_ip) { struct btree_cache *bc = &c->btree_cache; struct btree *b; @@ -767,7 +768,7 @@ lock_node: btree_node_unlock(iter, level + 1); if (!btree_node_lock(b, k->k.p, level, iter, lock_type, - lock_node_check_fn, (void *) k)) { + lock_node_check_fn, (void *) k, trace_ip)) { if (b->hash_val != btree_ptr_hash_val(k)) goto retry; return ERR_PTR(-EINTR); @@ -935,7 +936,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, bch2_bkey_unpack(parent, &tmp.k, k); ret = bch2_btree_node_get(c, iter, &tmp.k, level, - SIX_LOCK_intent); + SIX_LOCK_intent, _THIS_IP_); if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { struct btree_iter *linked; @@ -948,14 +949,14 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, * holding other locks that would cause us to deadlock: */ trans_for_each_iter(trans, linked) - if (btree_iter_cmp(iter, linked) < 0) + if (btree_iter_lock_cmp(iter, linked) < 0) __bch2_btree_iter_unlock(linked); if (sib == btree_prev_sib) btree_node_unlock(iter, level); ret = bch2_btree_node_get(c, iter, &tmp.k, level, - SIX_LOCK_intent); + SIX_LOCK_intent, _THIS_IP_); /* * before btree_iter_relock() calls btree_iter_verify_locks(): diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index d0d3a85b..8a19e60e 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -23,7 +23,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, const struct bkey_i *, unsigned, - enum six_lock_type); + enum six_lock_type, unsigned long); struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, enum btree_id, unsigned); diff --git a/libbcachefs/btree_gc.c 
b/libbcachefs/btree_gc.c index e8c1e752..ba4acc11 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -101,7 +101,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, int ret = 0; if (initial) { - BUG_ON(journal_seq_verify(c) && + BUG_ON(bch2_journal_seq_verify && k.k->version.lo > journal_cur_seq(&c->journal)); /* XXX change to fsck check */ @@ -209,7 +209,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, struct btree_iter *iter; struct btree *b; unsigned depth = metadata_only ? 1 - : expensive_debug_checks(c) ? 0 + : bch2_expensive_debug_checks ? 0 : !btree_node_type_needs_gc(btree_id) ? 1 : 0; u8 max_stale = 0; @@ -236,8 +236,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, BTREE_INSERT_USE_RESERVE| BTREE_INSERT_NOWAIT| BTREE_INSERT_GC_LOCK_HELD); - else if (!btree_gc_rewrite_disabled(c) && - (btree_gc_always_rewrite(c) || max_stale > 16)) + else if (!bch2_btree_gc_rewrite_disabled && + (bch2_btree_gc_always_rewrite || max_stale > 16)) bch2_btree_node_rewrite(c, iter, b->data->keys.seq, BTREE_INSERT_NOWAIT| @@ -328,7 +328,7 @@ static int bch2_gc_btree_init(struct bch_fs *c, { struct btree *b; unsigned target_depth = metadata_only ? 1 - : expensive_debug_checks(c) ? 0 + : bch2_expensive_debug_checks ? 0 : !btree_node_type_needs_gc(btree_id) ? 1 : 0; u8 max_stale = 0; @@ -835,7 +835,7 @@ again: out: if (!ret && (test_bit(BCH_FS_FIXED_GENS, &c->flags) || - (!iter && test_restart_gc(c)))) { + (!iter && bch2_test_restart_gc))) { /* * XXX: make sure gens we fixed got saved */ diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 682f599c..10a00085 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -42,7 +42,7 @@ static void verify_no_dups(struct btree *b, BUG_ON(extents ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0 : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); - //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0); + //BUG_ON(bch2_bkey_cmp_packed(&b->format, p, k) >= 0); } #endif } @@ -102,14 +102,14 @@ static void sort_bkey_ptrs(const struct btree *bt, break; for (b = a; c = 2 * b + 1, (d = c + 1) < n;) - b = bkey_cmp_packed(bt, + b = bch2_bkey_cmp_packed(bt, ptrs[c], ptrs[d]) >= 0 ? 
c : d; if (d == n) b = c; while (b != a && - bkey_cmp_packed(bt, + bch2_bkey_cmp_packed(bt, ptrs[a], ptrs[b]) >= 0) b = (b - 1) / 2; @@ -1044,7 +1044,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry const char *invalid = bch2_bkey_val_invalid(c, u.s_c); if (invalid || - (inject_invalid_keys(c) && + (bch2_inject_invalid_keys && !bversion_cmp(u.k->version, MAX_VERSION))) { char buf[160]; diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 6fab76c3..58f1a3dd 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -197,13 +197,13 @@ static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, unsigned level, struct btree_iter *iter, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, - void *p) + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { struct btree_trans *trans = iter->trans; - struct btree_iter *linked; + struct btree_iter *linked, *deadlock_iter = NULL; u64 start_time = local_clock(); - bool ret = true; + unsigned reason = 9; /* Check if it's safe to block: */ trans_for_each_iter(trans, linked) { @@ -228,42 +228,64 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, linked->locks_want = max_t(unsigned, linked->locks_want, __fls(linked->nodes_locked) + 1); - if (!btree_iter_get_locks(linked, true, false)) - ret = false; + if (!btree_iter_get_locks(linked, true, false)) { + deadlock_iter = linked; + reason = 1; + } } else { - ret = false; + deadlock_iter = linked; + reason = 2; } } + if (linked->btree_id != iter->btree_id) { + if (linked->btree_id > iter->btree_id) { + deadlock_iter = linked; + reason = 3; + } + continue; + } + + /* + * Within the same btree, cached iterators come before non + * cached iterators: + */ + if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) { + if (btree_iter_is_cached(iter)) { + deadlock_iter = linked; + reason = 4; + } + continue; + } + /* * Interior nodes must be locked before their descendants: if * another iterator has possible descendants locked of the node * we're about to lock, it must have the ancestors locked too: */ - if (linked->btree_id == iter->btree_id && - level > __fls(linked->nodes_locked)) { + if (level > __fls(linked->nodes_locked)) { if (!(trans->nounlock)) { linked->locks_want = max(level + 1, max_t(unsigned, linked->locks_want, iter->locks_want)); - if (!btree_iter_get_locks(linked, true, false)) - ret = false; + if (!btree_iter_get_locks(linked, true, false)) { + deadlock_iter = linked; + reason = 5; + } } else { - ret = false; + deadlock_iter = linked; + reason = 6; } } /* Must lock btree nodes in key order: */ - if ((cmp_int(iter->btree_id, linked->btree_id) ?: - -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0) - ret = false; - - if (iter->btree_id == linked->btree_id && - btree_node_locked(linked, level) && + if (btree_node_locked(linked, level) && bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b, - btree_iter_type(linked))) <= 0) - ret = false; + btree_iter_type(linked))) <= 0) { + deadlock_iter = linked; + reason = 7; + } /* * Recheck if this is a node we already have locked - since one @@ -277,8 +299,13 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, } } - if (unlikely(!ret)) { - trace_trans_restart_would_deadlock(iter->trans->ip); + if (unlikely(deadlock_iter)) { + trace_trans_restart_would_deadlock(iter->trans->ip, ip, + reason, + deadlock_iter->btree_id, + 
btree_iter_type(deadlock_iter), + iter->btree_id, + btree_iter_type(iter)); return false; } @@ -471,7 +498,7 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, char buf1[100], buf2[100]; const char *msg; - if (!debug_check_iterators(iter->trans->c)) + if (!bch2_debug_check_iterators) return; if (btree_iter_type(iter) == BTREE_ITER_CACHED) { @@ -567,7 +594,7 @@ void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) { struct btree_iter *iter; - if (!debug_check_iterators(trans->c)) + if (!bch2_debug_check_iterators) return; trans_for_each_iter_with_node(trans, b, iter) @@ -739,7 +766,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, __bch2_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, new_u64s); - if (debug_check_iterators(iter->trans->c)) + if (bch2_debug_check_iterators) bch2_btree_node_iter_verify(node_iter, b); } @@ -769,7 +796,7 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, ret = bkey_disassemble(l->b, k, u); - if (debug_check_bkeys(iter->trans->c)) + if (bch2_debug_check_bkeys) bch2_bkey_debugcheck(iter->trans->c, l->b, ret); return ret; @@ -945,7 +972,8 @@ static int lock_root_check_fn(struct six_lock *lock, void *p) } static inline int btree_iter_lock_root(struct btree_iter *iter, - unsigned depth_want) + unsigned depth_want, + unsigned long trace_ip) { struct bch_fs *c = iter->trans->c; struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; @@ -974,7 +1002,8 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, lock_type = __btree_lock_want(iter, iter->level); if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, iter, lock_type, - lock_root_check_fn, rootp))) + lock_root_check_fn, rootp, + trace_ip))) return -EINTR; if (likely(b == READ_ONCE(*rootp) && @@ -1046,7 +1075,8 @@ static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, btree_node_unlock(iter, plevel); } -static __always_inline int btree_iter_down(struct btree_iter *iter) +static __always_inline int btree_iter_down(struct btree_iter *iter, + unsigned long trace_ip) { struct bch_fs *c = iter->trans->c; struct btree_iter_level *l = &iter->l[iter->level]; @@ -1060,7 +1090,7 @@ static __always_inline int btree_iter_down(struct btree_iter *iter) bch2_bkey_unpack(l->b, &tmp.k, bch2_btree_node_iter_peek(&l->iter, l->b)); - b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type); + b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, trace_ip); if (unlikely(IS_ERR(b))) return PTR_ERR(b); @@ -1084,7 +1114,7 @@ static void btree_iter_up(struct btree_iter *iter) btree_node_unlock(iter, iter->level++); } -static int btree_iter_traverse_one(struct btree_iter *); +static int btree_iter_traverse_one(struct btree_iter *, unsigned long); static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) { @@ -1104,11 +1134,12 @@ retry_all: sorted[nr_sorted++] = iter->idx; #define btree_iter_cmp_by_idx(_l, _r) \ - btree_iter_cmp(&trans->iters[_l], &trans->iters[_r]) + btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r]) bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); #undef btree_iter_cmp_by_idx bch2_trans_unlock(trans); + cond_resched(); if (unlikely(ret == -ENOMEM)) { struct closure cl; @@ -1139,7 +1170,7 @@ retry_all: if (!(trans->iters_linked & (1ULL << idx))) continue; - ret = btree_iter_traverse_one(&trans->iters[idx]); + ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_); if (ret) goto retry_all; } @@ -1202,7 +1233,8 @@ static inline unsigned 
btree_iter_up_until_good_node(struct btree_iter *iter, * On error, caller (peek_node()/peek_key()) must return NULL; the error is * stashed in the iterator and returned from bch2_trans_exit(). */ -static int btree_iter_traverse_one(struct btree_iter *iter) +static int btree_iter_traverse_one(struct btree_iter *iter, + unsigned long trace_ip) { unsigned depth_want = iter->level; @@ -1249,8 +1281,8 @@ static int btree_iter_traverse_one(struct btree_iter *iter) */ while (iter->level > depth_want) { int ret = btree_iter_node(iter, iter->level) - ? btree_iter_down(iter) - : btree_iter_lock_root(iter, depth_want); + ? btree_iter_down(iter, trace_ip) + : btree_iter_lock_root(iter, depth_want, trace_ip); if (unlikely(ret)) { if (ret == 1) return 0; @@ -1281,7 +1313,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) int ret; ret = bch2_trans_cond_resched(trans) ?: - btree_iter_traverse_one(iter); + btree_iter_traverse_one(iter, _RET_IP_); if (unlikely(ret)) ret = __btree_iter_traverse_all(trans, ret); @@ -1545,13 +1577,13 @@ static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) ret.v = bkeyp_val(&l->b->format, _k); - if (debug_check_iterators(iter->trans->c)) { + if (bch2_debug_check_iterators) { struct bkey k = bkey_unpack_key(l->b, _k); BUG_ON(memcmp(&k, &iter->k, sizeof(k))); } - if (debug_check_bkeys(iter->trans->c)) + if (bch2_debug_check_bkeys) bch2_bkey_debugcheck(iter->trans->c, l->b, ret); } @@ -1970,6 +2002,7 @@ int bch2_trans_iter_free(struct btree_trans *trans, return bch2_trans_iter_put(trans, iter); } +#if 0 static int bch2_trans_realloc_iters(struct btree_trans *trans, unsigned new_size) { @@ -2018,8 +2051,7 @@ success: sizeof(struct btree_iter) * trans->nr_iters + sizeof(struct btree_insert_entry) * trans->nr_iters); - if (trans->iters != trans->iters_onstack) - kfree(trans->iters); + kfree(trans->iters); trans->iters = new_iters; trans->updates = new_updates; @@ -2033,6 +2065,7 @@ success: return 0; } +#endif static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) { @@ -2042,28 +2075,27 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) goto got_slot; if (trans->nr_iters == trans->size) { - int ret; + struct btree_iter *iter; - if (trans->nr_iters >= BTREE_ITER_MAX) { - struct btree_iter *iter; + BUG_ON(trans->size < BTREE_ITER_MAX); - trans_for_each_iter(trans, iter) { - pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", - bch2_btree_ids[iter->btree_id], - iter->pos.inode, - iter->pos.offset, - (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", - (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", - iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "", - (void *) iter->ip_allocated); - } - - panic("trans iter oveflow\n"); + trans_for_each_iter(trans, iter) { + pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", + bch2_btree_ids[iter->btree_id], + iter->pos.inode, + iter->pos.offset, + (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", + (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", + iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", + (void *) iter->ip_allocated); } + panic("trans iter oveflow\n"); +#if 0 ret = bch2_trans_realloc_iters(trans, trans->size * 2); if (ret) return ERR_PTR(ret); +#endif } idx = trans->nr_iters++; @@ -2305,28 +2337,37 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) bch2_btree_iter_traverse_all(trans); } +static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) +{ + unsigned new_size = BTREE_ITER_MAX; + size_t iters_bytes = sizeof(struct btree_iter) * new_size; + size_t updates_bytes = sizeof(struct btree_insert_entry) * new_size; + void *p; + + BUG_ON(trans->used_mempool); + + p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL) ?: + mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); + + trans->iters = p; p += iters_bytes; + trans->updates = p; p += updates_bytes; + trans->updates2 = p; p += updates_bytes; + trans->size = new_size; +} + void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned expected_nr_iters, size_t expected_mem_bytes) { - memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); + memset(trans, 0, sizeof(*trans)); + trans->c = c; + trans->ip = _RET_IP_; /* * reallocating iterators currently completely breaks - * bch2_trans_iter_put(): + * bch2_trans_iter_put(), we always allocate the max: */ - expected_nr_iters = BTREE_ITER_MAX; - - trans->c = c; - trans->ip = _RET_IP_; - trans->size = ARRAY_SIZE(trans->iters_onstack); - trans->iters = trans->iters_onstack; - trans->updates = trans->updates_onstack; - trans->updates2 = trans->updates2_onstack; - trans->fs_usage_deltas = NULL; - - if (expected_nr_iters > trans->size) - bch2_trans_realloc_iters(trans, expected_nr_iters); + bch2_trans_alloc_iters(trans, c); if (expected_mem_bytes) bch2_trans_preload_mem(trans, expected_mem_bytes); @@ -2341,6 +2382,8 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, int bch2_trans_exit(struct btree_trans *trans) { + struct bch_fs *c = trans->c; + bch2_trans_unlock(trans); #ifdef CONFIG_BCACHEFS_DEBUG @@ -2353,19 +2396,21 @@ int bch2_trans_exit(struct btree_trans *trans) kfree(trans->fs_usage_deltas); kfree(trans->mem); - if (trans->used_mempool) + + trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters); + if (trans->iters) mempool_free(trans->iters, &trans->c->btree_iters_pool); - else if (trans->iters != trans->iters_onstack) - kfree(trans->iters); + trans->mem = (void *) 0x1; trans->iters = (void *) 0x1; return trans->error ? 
-EIO : 0; } -static void bch2_btree_iter_node_to_text(struct printbuf *out, - struct btree_bkey_cached_common *_b, - enum btree_iter_type type) +static void __maybe_unused +bch2_btree_iter_node_to_text(struct printbuf *out, + struct btree_bkey_cached_common *_b, + enum btree_iter_type type) { pr_buf(out, " %px l=%u %s:", _b, _b->level, bch2_btree_ids[_b->btree_id]); diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index bd9ec3ec..f7a73619 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -177,11 +177,12 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); -static inline int btree_iter_cmp(const struct btree_iter *l, - const struct btree_iter *r) +/* Sort order for locking btree iterators: */ +static inline int btree_iter_lock_cmp(const struct btree_iter *l, + const struct btree_iter *r) { return cmp_int(l->btree_id, r->btree_id) ?: - -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?: + -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?: bkey_cmp(l->pos, r->pos); } diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 61662750..0ee4f78c 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -29,8 +29,8 @@ static const struct rhashtable_params bch2_btree_key_cache_params = { }; __flatten -static inline struct bkey_cached * -btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) +inline struct bkey_cached * +bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) { struct bkey_cached_key key = { .btree_id = btree_id, @@ -204,6 +204,7 @@ static int bkey_cached_check_fn(struct six_lock *lock, void *p) !bkey_cmp(ck->key.pos, iter->pos) ? 
0 : -1; } +__flatten int bch2_btree_iter_traverse_cached(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; @@ -218,7 +219,7 @@ int bch2_btree_iter_traverse_cached(struct btree_iter *iter) goto fill; } retry: - ck = btree_key_cache_find(c, iter->btree_id, iter->pos); + ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos); if (!ck) { if (iter->flags & BTREE_ITER_CACHED_NOCREATE) { iter->l[0].b = NULL; @@ -242,7 +243,7 @@ retry: enum six_lock_type lock_want = __btree_lock_want(iter, 0); if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, - bkey_cached_check_fn, iter)) { + bkey_cached_check_fn, iter, _THIS_IP_)) { if (ck->key.btree_id != iter->btree_id || bkey_cmp(ck->key.pos, iter->pos)) { goto retry; @@ -415,7 +416,7 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans, struct bkey_cached_key key = { id, pos }; /* Fastpath - assume it won't be found: */ - if (!btree_key_cache_find(c, id, pos)) + if (!bch2_btree_key_cache_find(c, id, pos)) return 0; return btree_key_cache_flush_pos(trans, key, 0, true); @@ -462,7 +463,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, enum btree_id id, struct bpos pos) { - BUG_ON(btree_key_cache_find(trans->c, id, pos)); + BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos)); } #endif diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h index b1756c6c..d448264a 100644 --- a/libbcachefs/btree_key_cache.h +++ b/libbcachefs/btree_key_cache.h @@ -1,6 +1,9 @@ #ifndef _BCACHEFS_BTREE_KEY_CACHE_H #define _BCACHEFS_BTREE_KEY_CACHE_H +struct bkey_cached * +bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); + int bch2_btree_iter_traverse_cached(struct btree_iter *); bool bch2_btree_insert_key_cached(struct btree_trans *, diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 81fbf3e1..38323e32 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -176,13 +176,15 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, struct btree_iter *, enum six_lock_type, - six_lock_should_sleep_fn, void *); + six_lock_should_sleep_fn, void *, + unsigned long); static inline bool btree_node_lock(struct btree *b, struct bpos pos, unsigned level, struct btree_iter *iter, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p) + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { struct btree_trans *trans = iter->trans; bool ret; @@ -200,7 +202,7 @@ static inline bool btree_node_lock(struct btree *b, ret = likely(six_trylock_type(&b->c.lock, type)) || btree_node_lock_increment(trans, b, level, type) || __bch2_btree_node_lock(b, pos, level, iter, type, - should_sleep_fn, p); + should_sleep_fn, p, ip); #ifdef CONFIG_BCACHEFS_DEBUG trans->locking = NULL; diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index cc01baee..93721fbc 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -130,10 +130,6 @@ struct btree { struct btree_write writes[2]; -#ifdef CONFIG_BCACHEFS_DEBUG - bool *expensive_debug_checks; -#endif - /* Key/pointer for this btree node */ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); }; @@ -283,6 +279,11 @@ btree_iter_type(const struct btree_iter *iter) return iter->flags & BTREE_ITER_TYPE; } +static inline bool btree_iter_is_cached(const struct btree_iter *iter) +{ + return btree_iter_type(iter) == 
BTREE_ITER_CACHED; +} + static inline struct btree_iter_level *iter_l(struct btree_iter *iter) { return iter->l + iter->level; @@ -380,10 +381,6 @@ struct btree_trans { unsigned journal_u64s; unsigned journal_preres_u64s; struct replicas_delta_list *fs_usage_deltas; - - struct btree_iter iters_onstack[2]; - struct btree_insert_entry updates_onstack[2]; - struct btree_insert_entry updates2_onstack[2]; }; #define BTREE_FLAG(flag) \ diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index a2604b0c..4ddd1697 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -1313,7 +1313,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, * the node the iterator points to: */ while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && - (bkey_cmp_packed(b, k, &insert->k) >= 0)) + (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) ; for_each_keylist_key(keys, insert) diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 49995cd0..e386f8ed 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -72,7 +72,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); k = bch2_btree_node_iter_peek_all(node_iter, b); - if (k && bkey_cmp_packed(b, k, &insert->k)) + if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) k = NULL; /* @k is the key being overwritten/deleted, if any: */ @@ -220,7 +220,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, struct bch_fs *c = trans->c; BUG_ON(bkey_cmp(insert->k.p, iter->pos)); - BUG_ON(debug_check_bkeys(c) && + BUG_ON(bch2_debug_check_bkeys && bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(iter->level, iter->btree_id))); } @@ -440,10 +440,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, */ if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { - if (journal_seq_verify(c)) + if (bch2_journal_seq_verify) trans_for_each_update2(trans, i) i->k->k.version.lo = trans->journal_res.seq; - else if (inject_invalid_keys(c)) + else if (bch2_inject_invalid_keys) trans_for_each_update2(trans, i) i->k->k.version = MAX_VERSION; } @@ -680,6 +680,13 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; } +static inline int btree_iter_pos_cmp(const struct btree_iter *l, + const struct btree_iter *r) +{ + return cmp_int(l->btree_id, r->btree_id) ?: + bkey_cmp(l->pos, r->pos); +} + static void bch2_trans_update2(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *insert) @@ -697,12 +704,12 @@ static void bch2_trans_update2(struct btree_trans *trans, iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; trans_for_each_update2(trans, i) { - if (btree_iter_cmp(n.iter, i->iter) == 0) { + if (btree_iter_pos_cmp(n.iter, i->iter) == 0) { *i = n; return; } - if (btree_iter_cmp(n.iter, i->iter) <= 0) + if (btree_iter_pos_cmp(n.iter, i->iter) <= 0) break; } @@ -986,7 +993,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, * Pending updates are kept sorted: first, find position of new update: */ trans_for_each_update(trans, i) - if (btree_iter_cmp(iter, i->iter) <= 0) + if (btree_iter_pos_cmp(iter, i->iter) <= 0) break; /* diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c index b50d2b0d..aebf46bb 100644 --- a/libbcachefs/compress.c +++ b/libbcachefs/compress.c @@ -70,7 +70,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, BUG_ON(bvec_iter_sectors(start) > 
c->sb.encoded_extent_max); - if (!IS_ENABLED(CONFIG_HIGHMEM) && + if (!PageHighMem(bio_iter_page(bio, start)) && bio_phys_contig(bio, start)) return (struct bbuf) { .b = page_address(bio_iter_page(bio, start)) + diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index aa10591a..bbe3fefa 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -54,7 +54,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) v->written = 0; v->c.level = b->c.level; v->c.btree_id = b->c.btree_id; - bch2_btree_keys_init(v, &c->expensive_debug_checks); + bch2_btree_keys_init(v); if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) diff --git a/libbcachefs/debug.h b/libbcachefs/debug.h index 56c2d1ab..7ac1615e 100644 --- a/libbcachefs/debug.h +++ b/libbcachefs/debug.h @@ -8,44 +8,15 @@ struct bio; struct btree; struct bch_fs; -#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; -BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - -#define BCH_DEBUG_PARAM(name, description) \ - static inline bool name(struct bch_fs *c) \ - { return bch2_##name || c->name; } -BCH_DEBUG_PARAMS_ALWAYS() -#undef BCH_DEBUG_PARAM - #ifdef CONFIG_BCACHEFS_DEBUG - -#define BCH_DEBUG_PARAM(name, description) \ - static inline bool name(struct bch_fs *c) \ - { return bch2_##name || c->name; } -BCH_DEBUG_PARAMS_DEBUG() -#undef BCH_DEBUG_PARAM - void __bch2_btree_verify(struct bch_fs *, struct btree *); - -#define bypass_torture_test(d) ((d)->bypass_torture_test) - -#else /* DEBUG */ - -#define BCH_DEBUG_PARAM(name, description) \ - static inline bool name(struct bch_fs *c) { return false; } -BCH_DEBUG_PARAMS_DEBUG() -#undef BCH_DEBUG_PARAM - +#else static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {} - -#define bypass_torture_test(d) 0 - #endif static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) { - if (verify_btree_ondisk(c)) + if (bch2_verify_btree_ondisk) __bch2_btree_verify(c, b); } diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index e4a4805e..d7ba0e7f 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -1586,7 +1586,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) size_t i; spin_lock(&c->ec_stripes_heap_lock); - for (i = 0; i < min(h->used, 20UL); i++) { + for (i = 0; i < min_t(size_t, h->used, 20); i++) { m = genradix_ptr(&c->stripes[0], h->data[i].idx); pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 88297b30..7fae6a4b 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -89,7 +89,7 @@ static inline bool ptr_better(struct bch_fs *c, return bch2_rand_range(l1 + l2) > l1; } - if (force_reconstruct_read(c)) + if (bch2_force_reconstruct_read) return p1.idx > p2.idx; return p1.idx < p2.idx; @@ -137,7 +137,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, !bch2_dev_is_readable(ca)) p.idx++; - if (force_reconstruct_read(c) && + if (bch2_force_reconstruct_read && !p.idx && p.has_ec) p.idx++; diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c index 878419d4..503ce192 100644 --- a/libbcachefs/fs-common.c +++ b/libbcachefs/fs-common.c @@ -34,9 +34,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, if (!name) new_inode->bi_flags |= BCH_INODE_UNLINKED; - ret = bch2_inode_create(trans, new_inode, - BLOCKDEV_INODE_MAX, 0, - &c->unused_inode_hint); + ret = bch2_inode_create(trans, new_inode); if (ret) goto err; diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 3aed2ca4..1eb69ed3 100644 
--- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -265,28 +265,13 @@ static inline struct bch_page_state *bch2_page_state(struct page *page) /* for newly allocated pages: */ static void __bch2_page_state_release(struct page *page) { - struct bch_page_state *s = __bch2_page_state(page); - - if (!s) - return; - - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - kfree(s); + kfree(detach_page_private(page)); } static void bch2_page_state_release(struct page *page) { - struct bch_page_state *s = bch2_page_state(page); - - if (!s) - return; - - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - kfree(s); + EBUG_ON(!PageLocked(page)); + __bch2_page_state_release(page); } /* for newly allocated pages: */ @@ -300,13 +285,7 @@ static struct bch_page_state *__bch2_page_state_create(struct page *page, return NULL; spin_lock_init(&s->lock); - /* - * migrate_page_move_mapping() assumes that pages with private data - * have their count elevated by 1. - */ - get_page(page); - set_page_private(page, (unsigned long) s); - SetPagePrivate(page); + attach_page_private(page, s); return s; } @@ -608,14 +587,8 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (PagePrivate(page)) { - ClearPagePrivate(page); - get_page(newpage); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - SetPagePrivate(newpage); - } + if (PagePrivate(page)) + attach_page_private(newpage, detach_page_private(page)); if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); @@ -647,41 +620,33 @@ static void bch2_readpages_end_io(struct bio *bio) bio_put(bio); } -static inline void page_state_init_for_read(struct page *page) -{ - SetPagePrivate(page); - page->private = 0; -} - struct readpages_iter { struct address_space *mapping; struct page **pages; unsigned nr_pages; - unsigned nr_added; unsigned idx; pgoff_t offset; }; static int readpages_iter_init(struct readpages_iter *iter, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) + struct readahead_control *ractl) { + unsigned i, nr_pages = readahead_count(ractl); + memset(iter, 0, sizeof(*iter)); - iter->mapping = mapping; - iter->offset = list_last_entry(pages, struct page, lru)->index; + iter->mapping = ractl->mapping; + iter->offset = readahead_index(ractl); + iter->nr_pages = nr_pages; iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); if (!iter->pages) return -ENOMEM; - while (!list_empty(pages)) { - struct page *page = list_last_entry(pages, struct page, lru); - - __bch2_page_state_create(page, __GFP_NOFAIL); - - iter->pages[iter->nr_pages++] = page; - list_del(&page->lru); + __readahead_batch(ractl, iter->pages, nr_pages); + for (i = 0; i < nr_pages; i++) { + __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL); + put_page(iter->pages[i]); } return 0; @@ -689,41 +654,9 @@ static int readpages_iter_init(struct readpages_iter *iter, static inline struct page *readpage_iter_next(struct readpages_iter *iter) { - struct page *page; - unsigned i; - int ret; + if (iter->idx >= iter->nr_pages) + return NULL; - BUG_ON(iter->idx > iter->nr_added); - BUG_ON(iter->nr_added > iter->nr_pages); - - if (iter->idx < iter->nr_added) - goto out; - - while (1) { - if (iter->idx == iter->nr_pages) - return NULL; - - ret = add_to_page_cache_lru_vec(iter->mapping, - iter->pages + iter->nr_added, - iter->nr_pages - iter->nr_added, - iter->offset + iter->nr_added, - GFP_NOFS); 
- if (ret > 0) - break; - - page = iter->pages[iter->nr_added]; - iter->idx++; - iter->nr_added++; - - __bch2_page_state_release(page); - put_page(page); - } - - iter->nr_added += ret; - - for (i = iter->idx; i < iter->nr_added; i++) - put_page(iter->pages[i]); -out: EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); return iter->pages[iter->idx]; @@ -889,10 +822,9 @@ retry: bkey_on_stack_exit(&sk, c); } -int bch2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +void bch2_readahead(struct readahead_control *ractl) { - struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct btree_trans trans; @@ -901,7 +833,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping, struct readpages_iter readpages_iter; int ret; - ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages); + ret = readpages_iter_init(&readpages_iter, ractl); BUG_ON(ret); bch2_trans_init(&trans, c, 0, 0); @@ -936,8 +868,6 @@ int bch2_readpages(struct file *file, struct address_space *mapping, bch2_trans_exit(&trans); kfree(readpages_iter.pages); - - return 0; } static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h index 7063556d..2537a3d2 100644 --- a/libbcachefs/fs-io.h +++ b/libbcachefs/fs-io.h @@ -19,8 +19,7 @@ int bch2_writepage(struct page *, struct writeback_control *); int bch2_readpage(struct file *, struct page *); int bch2_writepages(struct address_space *, struct writeback_control *); -int bch2_readpages(struct file *, struct address_space *, - struct list_head *, unsigned); +void bch2_readahead(struct readahead_control *); int bch2_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **); diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 1d66acac..3ac57ba2 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -42,6 +42,11 @@ static void journal_seq_copy(struct bch_fs *c, struct bch_inode_info *dst, u64 journal_seq) { + /* + * atomic64_cmpxchg has a fallback for archs that don't support it, + * cmpxchg does not: + */ + atomic64_t *dst_seq = (void *) &dst->ei_journal_seq; u64 old, v = READ_ONCE(dst->ei_journal_seq); do { @@ -49,7 +54,7 @@ static void journal_seq_copy(struct bch_fs *c, if (old >= journal_seq) break; - } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); + } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old); bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq); } @@ -225,6 +230,13 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) return &inode->v; } +static int inum_test(struct inode *inode, void *p) +{ + unsigned long *ino = p; + + return *ino == inode->i_ino; +} + static struct bch_inode_info * __bch2_create(struct bch_inode_info *dir, struct dentry *dentry, umode_t mode, dev_t rdev, bool tmpfile) @@ -304,8 +316,12 @@ err_before_quota: * thread pulling the inode in and modifying it: */ - old = to_bch_ei(insert_inode_locked2(&inode->v)); - if (unlikely(old)) { + inode->v.i_state |= I_CREATING; + old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino, + inum_test, NULL, &inode->v.i_ino)); + BUG_ON(!old); + + if (unlikely(old != inode)) { /* * We raced, another process pulled the new inode into cache * before us: @@ -807,7 +823,7 @@ static int 
bch2_fill_extent(struct bch_fs *c, struct fiemap_extent_info *info, struct bkey_s_c k, unsigned flags) { - if (bkey_extent_is_data(k.k)) { + if (bkey_extent_is_direct_data(k.k)) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -838,6 +854,12 @@ static int bch2_fill_extent(struct bch_fs *c, } return 0; + } else if (bkey_extent_is_inline_data(k.k)) { + return fiemap_fill_next_extent(info, + bkey_start_offset(k.k) << 9, + 0, k.k->size << 9, + flags| + FIEMAP_EXTENT_DATA_INLINE); } else if (k.k->type == KEY_TYPE_reservation) { return fiemap_fill_next_extent(info, bkey_start_offset(k.k) << 9, @@ -891,9 +913,7 @@ retry: bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; - bkey_on_stack_realloc(&cur, c, k.k->u64s); - bkey_on_stack_realloc(&prev, c, k.k->u64s); - bkey_reassemble(cur.k, k); + bkey_on_stack_reassemble(&cur, c, k); ret = bch2_read_indirect_extent(&trans, &offset_into_extent, &cur); @@ -901,14 +921,14 @@ retry: break; k = bkey_i_to_s_c(cur.k); + bkey_on_stack_realloc(&prev, c, k.k->u64s); sectors = min(sectors, k.k->size - offset_into_extent); - if (offset_into_extent) - bch2_cut_front(POS(k.k->p.inode, - bkey_start_offset(k.k) + - offset_into_extent), - cur.k); + bch2_cut_front(POS(k.k->p.inode, + bkey_start_offset(k.k) + + offset_into_extent), + cur.k); bch2_key_resize(&cur.k->k, sectors); cur.k->k.p = iter->pos; cur.k->k.p.offset += cur.k->k.size; @@ -923,10 +943,8 @@ retry: bkey_copy(prev.k, cur.k); have_extent = true; - if (k.k->type == KEY_TYPE_reflink_v) - bch2_btree_iter_set_pos(iter, k.k->p); - else - bch2_btree_iter_next(iter); + bch2_btree_iter_set_pos(iter, + POS(iter->pos.inode, iter->pos.offset + sectors)); } if (ret == -EINTR) @@ -1062,7 +1080,7 @@ static const struct address_space_operations bch_address_space_operations = { .writepage = bch2_writepage, .readpage = bch2_readpage, .writepages = bch2_writepages, - .readpages = bch2_readpages, + .readahead = bch2_readahead, .set_page_dirty = __set_page_dirty_nobuffers, .write_begin = bch2_write_begin, .write_end = bch2_write_end, @@ -1238,6 +1256,11 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) struct bch_fs *c = sb->s_fs_info; struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); unsigned shift = sb->s_blocksize_bits - 9; + /* + * this assumes inodes take up 64 bytes, which is a decent average + * number: + */ + u64 avail_inodes = ((usage.capacity - usage.used) << 3); u64 fsid; buf->f_type = BCACHEFS_STATFS_MAGIC; @@ -1245,8 +1268,9 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = usage.capacity >> shift; buf->f_bfree = (usage.capacity - usage.used) >> shift; buf->f_bavail = buf->f_bfree; - buf->f_files = 0; - buf->f_ffree = 0; + + buf->f_files = usage.nr_inodes + avail_inodes; + buf->f_ffree = avail_inodes; fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 5a6df3d1..0c503527 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -537,7 +537,7 @@ retry: bch2_trans_unlock(&trans); - bch2_inode_pack(&p, &w.inode); + bch2_inode_pack(c, &p, &w.inode); ret = bch2_btree_insert(c, BTREE_ID_INODES, &p.inode.k_i, NULL, NULL, @@ -808,7 +808,7 @@ create_root: 0, NULL); root_inode->bi_inum = BCACHEFS_ROOT_INO; - bch2_inode_pack(&packed, root_inode); + bch2_inode_pack(c, &packed, root_inode); return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, NULL, 
NULL, @@ -866,36 +866,22 @@ create_lostfound: return ret; } -struct inode_bitmap { - unsigned long *bits; - size_t size; -}; +typedef GENRADIX(unsigned long) inode_bitmap; -static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr) +static inline bool inode_bitmap_test(inode_bitmap *b, size_t nr) { - return nr < b->size ? test_bit(nr, b->bits) : false; + unsigned long *w = genradix_ptr(b, nr / BITS_PER_LONG); + return w ? test_bit(nr & (BITS_PER_LONG - 1), w) : false; } -static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr) +static inline int inode_bitmap_set(inode_bitmap *b, size_t nr) { - if (nr >= b->size) { - size_t new_size = max_t(size_t, max_t(size_t, - PAGE_SIZE * 8, - b->size * 2), - nr + 1); - void *n; + unsigned long *w = genradix_ptr_alloc(b, nr / BITS_PER_LONG, GFP_KERNEL); - new_size = roundup_pow_of_two(new_size); - n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO); - if (!n) { - return -ENOMEM; - } + if (!w) + return -ENOMEM; - b->bits = n; - b->size = new_size; - } - - __set_bit(nr, b->bits); + *w |= 1UL << (nr & (BITS_PER_LONG - 1)); return 0; } @@ -934,7 +920,7 @@ noinline_for_stack static int check_directory_structure(struct bch_fs *c, struct bch_inode_unpacked *lostfound_inode) { - struct inode_bitmap dirs_done = { NULL, 0 }; + inode_bitmap dirs_done; struct pathbuf path = { 0, 0, NULL }; struct pathbuf_entry *e; struct btree_trans trans; @@ -951,6 +937,7 @@ static int check_directory_structure(struct bch_fs *c, /* DFS: */ restart_dfs: + genradix_init(&dirs_done); had_unreachable = false; ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO); @@ -1057,7 +1044,7 @@ retry: if (had_unreachable) { bch_info(c, "reattached unreachable directories, restarting pass to check for loops"); - kfree(dirs_done.bits); + genradix_free(&dirs_done); kfree(path.entries); memset(&dirs_done, 0, sizeof(dirs_done)); memset(&path, 0, sizeof(path)); @@ -1066,7 +1053,7 @@ retry: err: fsck_err: ret = bch2_trans_exit(&trans) ?: ret; - kfree(dirs_done.bits); + genradix_free(&dirs_done); kfree(path.entries); return ret; } @@ -1326,7 +1313,7 @@ static int check_inode(struct btree_trans *trans, if (do_update) { struct bkey_inode_buf p; - bch2_inode_pack(&p, &u); + bch2_inode_pack(c, &p, &u); ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 7d20f082..42371de7 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -1,12 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_key_cache.h" #include "bkey_methods.h" #include "btree_update.h" #include "error.h" #include "extents.h" #include "inode.h" #include "str_hash.h" +#include "varint.h" + +#include <linux/random.h> @@ -88,22 +90,17 @@ static int inode_decode_field(const u8 *in, const u8 *end, return bytes; } -void bch2_inode_pack(struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) +static noinline void bch2_inode_pack_v1(struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) { - u8 *out = packed->inode.v.fields; + struct bkey_i_inode *k = &packed->inode; + u8 *out = k->v.fields; u8 *end = (void *) &packed[1]; u8 *last_nonzero_field = out; unsigned nr_fields = 0, last_nonzero_fieldnr = 0; unsigned bytes; - bkey_inode_init(&packed->inode.k_i); - packed->inode.k.p.offset = inode->bi_inum; - packed->inode.v.bi_hash_seed = inode->bi_hash_seed; - packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); - packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); - -#define x(_name, _bits) \
+#define x(_name, _bits) \ out += inode_encode_field(out, end, 0, inode->_name); \ nr_fields++; \ \ @@ -122,7 +119,69 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, set_bkey_val_bytes(&packed->inode.k, bytes); memset_u64s_tail(&packed->inode.v, 0, bytes); - SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields); + SET_INODE_NR_FIELDS(&k->v, nr_fields); +} + +static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) +{ + struct bkey_i_inode *k = &packed->inode; + u8 *out = k->v.fields; + u8 *end = (void *) &packed[1]; + u8 *last_nonzero_field = out; + unsigned nr_fields = 0, last_nonzero_fieldnr = 0; + unsigned bytes; + int ret; + +#define x(_name, _bits) \ + nr_fields++; \ + \ + if (inode->_name) { \ + ret = bch2_varint_encode(out, inode->_name); \ + out += ret; \ + \ + if (_bits > 64) \ + *out++ = 0; \ + \ + last_nonzero_field = out; \ + last_nonzero_fieldnr = nr_fields; \ + } else { \ + *out++ = 0; \ + \ + if (_bits > 64) \ + *out++ = 0; \ + } + + BCH_INODE_FIELDS() +#undef x + BUG_ON(out > end); + + out = last_nonzero_field; + nr_fields = last_nonzero_fieldnr; + + bytes = out - (u8 *) &packed->inode.v; + set_bkey_val_bytes(&packed->inode.k, bytes); + memset_u64s_tail(&packed->inode.v, 0, bytes); + + SET_INODE_NR_FIELDS(&k->v, nr_fields); +} + +void bch2_inode_pack(struct bch_fs *c, + struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) +{ + bkey_inode_init(&packed->inode.k_i); + packed->inode.k.p.offset = inode->bi_inum; + packed->inode.v.bi_hash_seed = inode->bi_hash_seed; + packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); + packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); + + if (c->sb.features & (1ULL << BCH_FEATURE_new_varint)) { + SET_INODE_NEW_VARINT(&packed->inode.v, true); + bch2_inode_pack_v2(packed, inode); + } else { + bch2_inode_pack_v1(packed, inode); + } if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { struct bch_inode_unpacked unpacked; @@ -134,26 +193,23 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); BUG_ON(unpacked.bi_mode != inode->bi_mode); -#define x(_name, _bits) BUG_ON(unpacked._name != inode->_name); +#define x(_name, _bits) if (unpacked._name != inode->_name) \ + panic("unpacked %llu should be %llu", \ + (u64) unpacked._name, (u64) inode->_name); BCH_INODE_FIELDS() #undef x } } -int bch2_inode_unpack(struct bkey_s_c_inode inode, - struct bch_inode_unpacked *unpacked) +static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, + struct bch_inode_unpacked *unpacked) { const u8 *in = inode.v->fields; - const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k); + const u8 *end = bkey_val_end(inode); u64 field[2]; unsigned fieldnr = 0, field_bits; int ret; - unpacked->bi_inum = inode.k->p.offset; - unpacked->bi_hash_seed = inode.v->bi_hash_seed; - unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); - unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); - #define x(_name, _bits) \ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ memset(&unpacked->_name, 0, \ @@ -176,6 +232,62 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, #undef x /* XXX: signal if there were more fields than expected? 
*/ + return 0; +} + +static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode, + struct bch_inode_unpacked *unpacked) +{ + const u8 *in = inode.v->fields; + const u8 *end = bkey_val_end(inode); + unsigned fieldnr = 0; + int ret; + u64 v[2]; + +#define x(_name, _bits) \ + if (fieldnr < INODE_NR_FIELDS(inode.v)) { \ + ret = bch2_varint_decode(in, end, &v[0]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + \ + if (_bits > 64) { \ + ret = bch2_varint_decode(in, end, &v[1]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v[1] = 0; \ + } \ + } else { \ + v[0] = v[1] = 0; \ + } \ + \ + unpacked->_name = v[0]; \ + if (v[1] || v[0] != unpacked->_name) \ + return -1; \ + fieldnr++; + + BCH_INODE_FIELDS() +#undef x + + /* XXX: signal if there were more fields than expected? */ + return 0; +} + +int bch2_inode_unpack(struct bkey_s_c_inode inode, + struct bch_inode_unpacked *unpacked) +{ + unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); + unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); + + if (INODE_NEW_VARINT(inode.v)) { + return bch2_inode_unpack_v2(inode, unpacked); + } else { + return bch2_inode_unpack_v1(inode, unpacked); + } return 0; } @@ -189,11 +301,11 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans, int ret; iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum), - BTREE_ITER_SLOTS|flags); + BTREE_ITER_CACHED|flags); if (IS_ERR(iter)) return iter; - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_cached(iter); ret = bkey_err(k); if (ret) goto err; @@ -222,7 +334,7 @@ int bch2_inode_write(struct btree_trans *trans, if (IS_ERR(inode_p)) return PTR_ERR(inode_p); - bch2_inode_pack(inode_p, inode); + bch2_inode_pack(trans->c, inode_p, inode); bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); return 0; } @@ -271,6 +383,8 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, return; } + pr_buf(out, "mode: %o ", unpacked.bi_mode); + #define x(_name, _bits) \ pr_buf(out, #_name ": %llu ", (u64) unpacked._name); BCH_INODE_FIELDS() @@ -359,20 +473,24 @@ static inline u32 bkey_generation(struct bkey_s_c k) } int bch2_inode_create(struct btree_trans *trans, - struct bch_inode_unpacked *inode_u, - u64 min, u64 max, u64 *hint) + struct bch_inode_unpacked *inode_u) { + struct bch_fs *c = trans->c; struct bkey_inode_buf *inode_p; struct btree_iter *iter = NULL; struct bkey_s_c k; - u64 start; + u64 min, max, start, *hint; int ret; - if (!max) - max = ULLONG_MAX; + unsigned cpu = raw_smp_processor_id(); + unsigned bits = (c->opts.inodes_32bit + ? 31 : 63) - c->inode_shard_bits; - if (trans->c->opts.inodes_32bit) - max = min_t(u64, max, U32_MAX); + min = (cpu << bits); + max = (cpu << bits) | ~(ULLONG_MAX << bits); + + min = max_t(u64, min, BLOCKDEV_INODE_MAX); + hint = c->unused_inode_hints + cpu; start = READ_ONCE(*hint); @@ -388,7 +506,17 @@ again: if (bkey_cmp(iter->pos, POS(0, max)) > 0) break; - if (k.k->type != KEY_TYPE_inode) + /* + * There's a potential cache coherency issue with the btree key + * cache code here - we're iterating over the btree, skipping + * that cache. 
We should never see an empty slot that isn't + * actually empty due to a pending update in the key cache + * because the update that creates the inode isn't done with a + * cached iterator, but - better safe than sorry, check the + * cache before using a slot: + */ + if (k.k->type != KEY_TYPE_inode && + !bch2_btree_key_cache_find(c, BTREE_ID_INODES, iter->pos)) goto found_slot; } @@ -409,10 +537,7 @@ found_slot: inode_u->bi_inum = k.k->p.offset; inode_u->bi_generation = bkey_generation(k); - bch2_inode_pack(inode_p, inode_u); - bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); - bch2_trans_iter_put(trans, iter); - return 0; + return bch2_inode_write(trans, iter, inode_u); } int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) @@ -422,6 +547,8 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) struct bkey_i_inode_generation delete; struct bpos start = POS(inode_nr, 0); struct bpos end = POS(inode_nr + 1, 0); + struct bkey_s_c k; + u64 bi_generation; int ret; /* @@ -442,51 +569,62 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) return ret; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + bi_generation = 0; + + ret = bch2_btree_key_cache_flush(&trans, BTREE_ID_INODES, POS(0, inode_nr)); + if (ret) { + if (ret != -EINTR) + bch_err(c, "error flushing btree key cache: %i", ret); + goto err; + } iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - do { - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - u32 bi_generation = 0; + k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - break; + ret = bkey_err(k); + if (ret) + goto err; - bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, - "inode %llu not found when deleting", - inode_nr); + bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, + "inode %llu not found when deleting", + inode_nr); - switch (k.k->type) { - case KEY_TYPE_inode: { - struct bch_inode_unpacked inode_u; + switch (k.k->type) { + case KEY_TYPE_inode: { + struct bch_inode_unpacked inode_u; - if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) - bi_generation = inode_u.bi_generation + 1; - break; - } - case KEY_TYPE_inode_generation: { - struct bkey_s_c_inode_generation g = - bkey_s_c_to_inode_generation(k); - bi_generation = le32_to_cpu(g.v->bi_generation); - break; - } - } + if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) + bi_generation = inode_u.bi_generation + 1; + break; + } + case KEY_TYPE_inode_generation: { + struct bkey_s_c_inode_generation g = + bkey_s_c_to_inode_generation(k); + bi_generation = le32_to_cpu(g.v->bi_generation); + break; + } + } - if (!bi_generation) { - bkey_init(&delete.k); - delete.k.p.offset = inode_nr; - } else { - bkey_inode_generation_init(&delete.k_i); - delete.k.p.offset = inode_nr; - delete.v.bi_generation = cpu_to_le32(bi_generation); - } + if (!bi_generation) { + bkey_init(&delete.k); + delete.k.p.offset = inode_nr; + } else { + bkey_inode_generation_init(&delete.k_i); + delete.k.p.offset = inode_nr; + delete.v.bi_generation = cpu_to_le32(bi_generation); + } - bch2_trans_update(&trans, iter, &delete.k_i, 0); + bch2_trans_update(&trans, iter, &delete.k_i, 0); - ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL); - } while (ret == -EINTR); + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: + if (ret == -EINTR) + goto retry; bch2_trans_exit(&trans); return ret; @@ -500,11 +638,11 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, int ret; iter = 
bch2_trans_get_iter(trans, BTREE_ID_INODES, - POS(0, inode_nr), BTREE_ITER_SLOTS); + POS(0, inode_nr), BTREE_ITER_CACHED); if (IS_ERR(iter)) return PTR_ERR(iter); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_cached(iter); ret = bkey_err(k); if (ret) goto err; @@ -523,32 +661,3 @@ int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, return bch2_trans_do(c, NULL, NULL, 0, bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); } - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_inode_pack_test(void) -{ - struct bch_inode_unpacked *u, test_inodes[] = { - { - .bi_atime = U64_MAX, - .bi_ctime = U64_MAX, - .bi_mtime = U64_MAX, - .bi_otime = U64_MAX, - .bi_size = U64_MAX, - .bi_sectors = U64_MAX, - .bi_uid = U32_MAX, - .bi_gid = U32_MAX, - .bi_nlink = U32_MAX, - .bi_generation = U32_MAX, - .bi_dev = U32_MAX, - }, - }; - - for (u = test_inodes; - u < test_inodes + ARRAY_SIZE(test_inodes); - u++) { - struct bkey_inode_buf p; - - bch2_inode_pack(&p, u); - } -} -#endif diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index bb759a46..ef7e885d 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -24,6 +24,14 @@ void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, .val_to_text = bch2_inode_generation_to_text, \ } +#if 0 +typedef struct { + u64 lo; + u32 hi; +} __packed __aligned(4) u96; +#endif +typedef u64 u96; + struct bch_inode_unpacked { u64 bi_inum; __le64 bi_hash_seed; @@ -43,7 +51,8 @@ struct bkey_inode_buf { #undef x } __attribute__((packed, aligned(8))); -void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); +void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *, + const struct bch_inode_unpacked *); int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); struct btree_iter *bch2_inode_peek(struct btree_trans *, @@ -60,9 +69,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, uid_t, gid_t, umode_t, dev_t, struct bch_inode_unpacked *); -int bch2_inode_create(struct btree_trans *, - struct bch_inode_unpacked *, - u64, u64, u64 *); +int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *); int bch2_inode_rm(struct bch_fs *, u64); @@ -168,10 +175,4 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, } } -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_inode_pack_test(void); -#else -static inline void bch2_inode_pack_test(void) {} -#endif - #endif /* _BCACHEFS_INODE_H */ diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 8add8ccd..21087d11 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -171,7 +171,7 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, while (size) { struct page *page = __bio_alloc_page_pool(c, &using_mempool); - unsigned len = min(PAGE_SIZE, size); + unsigned len = min_t(size_t, PAGE_SIZE, size); BUG_ON(!bio_add_page(bio, page, len, 0)); size -= len; @@ -301,7 +301,7 @@ int bch2_extent_update(struct btree_trans *trans, inode_u.bi_sectors += delta; if (delta || new_i_size) { - bch2_inode_pack(&inode_p, &inode_u); + bch2_inode_pack(trans->c, &inode_p, &inode_u); bch2_trans_update(trans, inode_iter, &inode_p.inode.k_i, 0); } diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index b8b71990..c2cafd38 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -980,9 +980,11 @@ void bch2_fs_journal_stop(struct journal *j) wait_event(j->wait, journal_entry_close(j)); - /* do we need to write another journal entry? 
*/ - if (test_bit(JOURNAL_NOT_EMPTY, &j->flags)) - bch2_journal_meta(j); + /* + * Always write a new journal entry, to make sure the clock hands are up + * to date (and match the superblock) + */ + bch2_journal_meta(j); journal_quiesce(j); diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 57591983..18e45296 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -465,34 +465,12 @@ static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, return ret; } -/** - * bch2_journal_reclaim - free up journal buckets - * - * Background journal reclaim writes out btree nodes. It should be run - * early enough so that we never completely run out of journal buckets. - * - * High watermarks for triggering background reclaim: - * - FIFO has fewer than 512 entries left - * - fewer than 25% journal buckets free - * - * Background reclaim runs until low watermarks are reached: - * - FIFO has more than 1024 entries left - * - more than 50% journal buckets free - * - * As long as a reclaim can complete in the time it takes to fill up - * 512 journal entries or 25% of all journal buckets, then - * journal_next_bucket() should not stall. - */ -void bch2_journal_reclaim(struct journal *j) +static u64 journal_seq_to_flush(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; - unsigned iter, min_nr = 0; u64 seq_to_flush = 0; - - lockdep_assert_held(&j->reclaim_lock); - - bch2_journal_do_discards(j); + unsigned iter; spin_lock(&j->lock); @@ -524,20 +502,52 @@ void bch2_journal_reclaim(struct journal *j) (j->pin.size >> 1)); spin_unlock(&j->lock); - /* - * If it's been longer than j->reclaim_delay_ms since we last flushed, - * make sure to flush at least one journal pin: - */ - if (time_after(jiffies, j->last_flushed + - msecs_to_jiffies(j->reclaim_delay_ms))) - min_nr = 1; + return seq_to_flush; +} - if (j->prereserved.reserved * 2 > j->prereserved.remaining) { - seq_to_flush = max(seq_to_flush, journal_last_seq(j)); - min_nr = 1; - } +/** + * bch2_journal_reclaim - free up journal buckets + * + * Background journal reclaim writes out btree nodes. It should be run + * early enough so that we never completely run out of journal buckets. + * + * High watermarks for triggering background reclaim: + * - FIFO has fewer than 512 entries left + * - fewer than 25% journal buckets free + * + * Background reclaim runs until low watermarks are reached: + * - FIFO has more than 1024 entries left + * - more than 50% journal buckets free + * + * As long as a reclaim can complete in the time it takes to fill up + * 512 journal entries or 25% of all journal buckets, then + * journal_next_bucket() should not stall. 
+ */ +void bch2_journal_reclaim(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + unsigned min_nr = 0; + u64 seq_to_flush = 0; - journal_flush_pins(j, seq_to_flush, min_nr); + lockdep_assert_held(&j->reclaim_lock); + + do { + bch2_journal_do_discards(j); + + seq_to_flush = journal_seq_to_flush(j); + min_nr = 0; + + /* + * If it's been longer than j->reclaim_delay_ms since we last flushed, + * make sure to flush at least one journal pin: + */ + if (time_after(jiffies, j->last_flushed + + msecs_to_jiffies(j->reclaim_delay_ms))) + min_nr = 1; + + if (j->prereserved.reserved * 2 > j->prereserved.remaining) + min_nr = 1; + } while (journal_flush_pins(j, seq_to_flush, min_nr)); if (!bch2_journal_error(j)) queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 32fed6b8..1745cfac 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -1320,7 +1320,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); root_inode.bi_inum = BCACHEFS_ROOT_INO; - bch2_inode_pack(&packed_inode, &root_inode); + bch2_inode_pack(c, &packed_inode, &root_inode); err = "error creating root directory"; ret = bch2_btree_insert(c, BTREE_ID_INODES, diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 015bbd9f..8673e974 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -451,6 +451,7 @@ int bch2_fs_read_write_early(struct bch_fs *c) static void __bch2_fs_free(struct bch_fs *c) { unsigned i; + int cpu; for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); @@ -475,6 +476,12 @@ static void __bch2_fs_free(struct bch_fs *c) free_percpu(c->usage[1]); free_percpu(c->usage[0]); kfree(c->usage_base); + + if (c->btree_iters_bufs) + for_each_possible_cpu(cpu) + kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter); + + free_percpu(c->btree_iters_bufs); free_percpu(c->pcpu); mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); @@ -485,6 +492,7 @@ static void __bch2_fs_free(struct bch_fs *c) kfree(c->replicas_gc.entries); kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); + kfree(c->unused_inode_hints); free_heap(&c->copygc_heap); if (c->journal_reclaim_wq) @@ -736,11 +744,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) (btree_blocks(c) + 1) * 2 * sizeof(struct sort_iter_set); + c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); + if (!(c->wq = alloc_workqueue("bcachefs", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->copygc_wq = alloc_workqueue("bcache_copygc", + !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal", + !(c->journal_reclaim_wq = alloc_workqueue("bcachefs_journal_reclaim", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || percpu_ref_init(&c->writes, bch2_writes_disabled, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || @@ -750,9 +760,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || + !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || + !(c->unused_inode_hints = kcalloc(1U << 
c->inode_shard_bits, + sizeof(u64), GFP_KERNEL)) || bch2_io_clock_init(&c->io_clock[READ]) || bch2_io_clock_init(&c->io_clock[WRITE]) || bch2_fs_journal_init(&c->journal) || @@ -2012,7 +2025,6 @@ static void bcachefs_exit(void) static int __init bcachefs_init(void) { bch2_bkey_pack_test(); - bch2_inode_pack_test(); if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || bch2_chardev_init() || diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 0cb29f43..d7ad293a 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -208,12 +208,6 @@ read_attribute(io_timers_write); write_attribute(perf_test); #endif /* CONFIG_BCACHEFS_TESTS */ -#define BCH_DEBUG_PARAM(name, description) \ - rw_attribute(name); - - BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - #define x(_name) \ static struct attribute sysfs_time_stat_##_name = \ { .name = #_name, .mode = S_IRUGO }; @@ -414,10 +408,6 @@ SHOW(bch2_fs) return out.pos - buf; } -#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); - BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - return 0; } @@ -462,10 +452,6 @@ STORE(bch2_fs) /* Debugging: */ -#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name); - BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - if (!test_bit(BCH_FS_STARTED, &c->flags)) return -EPERM; @@ -590,11 +576,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_io_timers_write, &sysfs_internal_uuid, - -#define BCH_DEBUG_PARAM(name, description) &sysfs_##name, - BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - NULL }; diff --git a/libbcachefs/util.c b/libbcachefs/util.c index fd4044a6..2709163e 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -520,7 +520,7 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) { while (size) { struct page *page = alloc_page(gfp_mask); - unsigned len = min(PAGE_SIZE, size); + unsigned len = min_t(size_t, PAGE_SIZE, size); if (!page) return -ENOMEM; diff --git a/libbcachefs/util.h b/libbcachefs/util.h index f48c6380..6e533544 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -37,17 +37,6 @@ struct closure; #define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) #define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0) -#define memcpy(dst, src, len) \ -({ \ - void *_dst = (dst); \ - const void *_src = (src); \ - size_t _len = (len); \ - \ - BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \ - (void *) (_dst) + (_len) <= (void *) (_src))); \ - memcpy(_dst, _src, _len); \ -}) - #else /* DEBUG */ #define EBUG_ON(cond) diff --git a/libbcachefs/varint.c b/libbcachefs/varint.c new file mode 100644 index 00000000..a3d252c7 --- /dev/null +++ b/libbcachefs/varint.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bitops.h> +#include <asm/unaligned.h> + +#include "varint.h" + +int bch2_varint_encode(u8 *out, u64 v) +{ + unsigned bits = fls64(v|1); + unsigned bytes = DIV_ROUND_UP(bits, 7); + + if (likely(bytes < 9)) { + v <<= bytes; + v |= ~(~0 << (bytes - 1)); + } else { + *out++ = 255; + bytes = 9; + } + + put_unaligned_le64(v, out); + return bytes; +} + +int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) +{ + u64 v = get_unaligned_le64(in); + unsigned bytes = ffz(v & 255) + 1; + + if (unlikely(in + bytes > end)) + return -1; + + if (likely(bytes < 9)) { + v >>= bytes; + v &= ~(~0ULL << (7 * bytes)); + } else { + v = get_unaligned_le64(++in); + } + + *out = v; + return bytes; +} diff --git a/libbcachefs/varint.h b/libbcachefs/varint.h new file mode 100644 index 00000000..8daf8135 --- /dev/null +++
b/libbcachefs/varint.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_VARINT_H +#define _BCACHEFS_VARINT_H + +int bch2_varint_encode(u8 *, u64); +int bch2_varint_decode(const u8 *, const u8 *, u64 *); + +#endif /* _BCACHEFS_VARINT_H */
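
The fs-io.c hunks above replace bcachefs's open-coded page-private management with the attach_page_private()/detach_page_private() helpers added to the core kernel in v5.8. Judging by the code being deleted, the helpers amount to the following (a sketch of the upstream pagemap.h definitions; the get_page()/put_page() pair keeps the page refcount elevated by one while private data is attached, which migrate_page_move_mapping() depends on):

/* sketch of the upstream helpers, per the open-coded sequences removed above: */
static inline void attach_page_private(struct page *page, void *data)
{
	get_page(page);
	set_page_private(page, (unsigned long) data);
	SetPagePrivate(page);
}

static inline void *detach_page_private(struct page *page)
{
	void *data = (void *) page_private(page);

	if (!PagePrivate(page))
		return NULL;

	ClearPagePrivate(page);
	set_page_private(page, 0);
	put_page(page);

	return data;
}

That pairing is what lets bch2_migrate_page() collapse to a single attach_page_private(newpage, detach_page_private(page)): the bch_page_state pointer and both refcount adjustments move in one expression.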
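
journal_seq_copy() in fs.c is a lock-free monotonic-max update: dst->ei_journal_seq only ever moves forward, and the loop retries whenever another thread advances it first. Casting to atomic64_t and using atomic64_cmpxchg() instead of raw cmpxchg() matters on 32-bit architectures, which have no native 64-bit cmpxchg but do get a generic fallback through the atomic64 API. A standalone C11 sketch of the same pattern (hypothetical names, not bcachefs code):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* advance *dst to journal_seq, but never move it backwards: */
static void seq_advance(_Atomic uint64_t *dst, uint64_t journal_seq)
{
	uint64_t old = atomic_load(dst);

	do {
		if (old >= journal_seq)
			return;		/* someone else already went further */
	} while (!atomic_compare_exchange_weak(dst, &old, journal_seq));
}

int main(void)
{
	_Atomic uint64_t seq = 10;

	seq_advance(&seq, 7);	/* no-op: would move backwards */
	seq_advance(&seq, 42);	/* advances */

	printf("%llu\n", (unsigned long long) atomic_load(&seq));	/* 42 */
	return 0;
}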
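
The statfs change derives the free-inode estimate purely from free space. usage.capacity and usage.used are in 512-byte sectors, so at the assumed 64 bytes per inode, free inodes = free bytes / 64 = (free sectors << 9) >> 6 = free sectors << 3, which is exactly the << 3 in the hunk. A quick check of that arithmetic:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t free_sectors = 1000000;		/* 512-byte sectors */
	uint64_t avail_inodes = free_sectors << 3;	/* the hunk's << 3 */

	/* same thing, spelled out: free bytes / (64 bytes per inode) */
	assert(avail_inodes == (free_sectors * 512) / 64);

	printf("%llu free inodes\n", (unsigned long long) avail_inodes);
	return 0;
}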
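
The fsck change that gives this update its name replaces the krealloc-grown flat bitmap with one unsigned long per 64 inode numbers stored in a genradix. Once inode numbers are sharded per CPU into the high bits of a 63-bit space (see the bch2_inode_create() change), a flat bitmap sized to the largest inode number seen would be enormous, while a radix tree allocates only the leaves that actually contain set bits. A userspace sketch of the idea, with a grow-on-demand leaf table standing in for the kernel's GENRADIX (hypothetical names; the real genradix also grows in depth, so very large sparse indices stay cheap in a way this flat top-level table does not):

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define BITS_PER_LONG	64
#define LEAF_WORDS	512	/* one leaf covers 512 * 64 = 32768 bits */

struct sparse_bitmap {
	uint64_t	**leaves;
	size_t		nr_leaves;
};

/* return the word holding bit nr, allocating its leaf on demand: */
static uint64_t *bitmap_word(struct sparse_bitmap *b, uint64_t nr, bool alloc)
{
	uint64_t word = nr / BITS_PER_LONG;
	size_t leaf = word / LEAF_WORDS;

	if (leaf >= b->nr_leaves) {
		uint64_t **n;

		if (!alloc)
			return NULL;

		n = realloc(b->leaves, (leaf + 1) * sizeof(*n));
		if (!n)
			return NULL;
		b->leaves = n;
		while (b->nr_leaves <= leaf)
			b->leaves[b->nr_leaves++] = NULL;
	}

	if (!b->leaves[leaf] && alloc)
		b->leaves[leaf] = calloc(LEAF_WORDS, sizeof(uint64_t));

	return b->leaves[leaf] ? &b->leaves[leaf][word % LEAF_WORDS] : NULL;
}

static bool bitmap_test(struct sparse_bitmap *b, uint64_t nr)
{
	uint64_t *w = bitmap_word(b, nr, false);

	return w ? (*w >> (nr & (BITS_PER_LONG - 1))) & 1 : false;
}

static int bitmap_set(struct sparse_bitmap *b, uint64_t nr)
{
	uint64_t *w = bitmap_word(b, nr, true);

	if (!w)
		return -ENOMEM;

	*w |= 1ULL << (nr & (BITS_PER_LONG - 1));
	return 0;
}

int main(void)
{
	struct sparse_bitmap b = { NULL, 0 };

	/* bits far apart: only two 4KiB leaves actually get allocated */
	bitmap_set(&b, 100);
	bitmap_set(&b, 1 << 20);

	printf("%d %d %d\n",
	       bitmap_test(&b, 100),
	       bitmap_test(&b, 1 << 20),
	       bitmap_test(&b, 101));	/* 1 1 0 */
	return 0;
}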
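
bch2_inode_create() no longer takes a min/max range and a single shared allocation hint. Instead the inode-number space is partitioned per CPU: inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())) bits are reserved at the top of the 31- or 63-bit space, each CPU searches only its own slice, and each slice gets its own hint in c->unused_inode_hints, so concurrent creates on different CPUs stop contending on one hint. With 8 CPUs and 64-bit inode numbers, for instance, bits = 63 - 3 = 60 and CPU 2 allocates from [2 << 60, (3 << 60) - 1]; the max_t() against BLOCKDEV_INODE_MAX only ever clamps CPU 0's slice. A check that the slices tile the space without overlap:

#include <assert.h>
#include <inttypes.h>
#include <stdio.h>

/* range of inode numbers a given cpu allocates from, mirroring the
 * bch2_inode_create() arithmetic (names are ours, not bcachefs's): */
static void shard_range(unsigned cpu, unsigned shard_bits, int inodes_32bit,
			uint64_t *min, uint64_t *max)
{
	unsigned bits = (inodes_32bit ? 31 : 63) - shard_bits;

	*min = (uint64_t) cpu << bits;
	*max = *min | ~(~0ULL << bits);		/* low `bits` bits all set */
}

int main(void)
{
	uint64_t min, max, prev_max = 0;

	/* 8 possible cpus -> shard_bits = ilog2(8) = 3, bits = 60: */
	for (unsigned cpu = 0; cpu < 8; cpu++) {
		shard_range(cpu, 3, 0, &min, &max);

		/* shards tile the space: each begins right after the last */
		assert(cpu == 0 || min == prev_max + 1);
		prev_max = max;

		printf("cpu %u: 0x%016" PRIx64 " .. 0x%016" PRIx64 "\n",
		       cpu, min, max);
	}
	return 0;
}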
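
varint.c implements a prefix varint rather than LEB128: the total length is encoded in the first byte as (bytes - 1) one bits followed by a zero, so the decoder recovers the length with a single ffz() on the first byte, and both directions use one unaligned 64-bit load/store instead of a per-byte loop. Values needing more than 8 payload bytes are emitted as a 0xff marker byte plus the raw little-endian u64, 9 bytes total. Worked example: v = 300 occupies 9 significant bits, so bytes = 2, and the encoding is (300 << 2) | 0b01 = 0x4b1, stored little-endian as b1 04; decoding sees 0xb1 = 0b10110001, ffz gives 1, hence 2 bytes, and 0x4b1 >> 2 recovers 300. A userspace round-trip of the same scheme (byte loops stand in for the kernel's unaligned helpers; note both sides touch a full 8 bytes past the varint itself, so callers need slack after the encoded bytes, which the in-kernel callers presumably guarantee through their buffer sizing):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static void put_le64(uint8_t *out, uint64_t v)
{
	for (unsigned i = 0; i < 8; i++)
		out[i] = v >> (8 * i);
}

static uint64_t get_le64(const uint8_t *in)
{
	uint64_t v = 0;

	for (unsigned i = 0; i < 8; i++)
		v |= (uint64_t) in[i] << (8 * i);
	return v;
}

static int varint_encode(uint8_t *out, uint64_t v)
{
	unsigned bits = 64 - __builtin_clzll(v | 1);	/* fls64(v|1) */
	unsigned bytes = (bits + 6) / 7;		/* DIV_ROUND_UP(bits, 7) */

	if (bytes < 9) {
		v <<= bytes;
		v |= ~(~0ULL << (bytes - 1));	/* bytes-1 trailing one bits */
	} else {
		*out++ = 255;			/* marker: raw u64 follows */
		bytes = 9;
	}

	put_le64(out, v);
	return bytes;
}

static int varint_decode(const uint8_t *in, const uint8_t *end, uint64_t *out)
{
	uint64_t v = get_le64(in);
	unsigned bytes = __builtin_ctzll(~v | 256) + 1;	/* ffz(v & 255) + 1 */

	if (in + bytes > end)
		return -1;

	if (bytes < 9) {
		v >>= bytes;			/* drop the length bits */
		v &= ~(~0ULL << (7 * bytes));
	} else {
		v = get_le64(++in);
	}

	*out = v;
	return bytes;
}

int main(void)
{
	uint64_t tests[] = { 0, 1, 127, 128, 300, 1ULL << 32, UINT64_MAX };

	for (unsigned i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
		uint8_t buf[9 + 8] = { 0 };	/* payload + store slack */
		uint64_t got;
		int enc = varint_encode(buf, tests[i]);
		int dec = varint_decode(buf, buf + sizeof(buf), &got);

		assert(enc == dec && got == tests[i]);
		printf("%llu -> %d byte(s)\n",
		       (unsigned long long) tests[i], enc);
	}
	return 0;
}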