From abfdc593a532abaa40ac6ca87c1e0c86459f8c87 Mon Sep 17 00:00:00 2001 From: Kent Overstreet <kent.overstreet@linux.dev> Date: Sat, 16 Mar 2024 19:29:22 -0400 Subject: [PATCH] Update bcachefs sources to 83338f5b2cb8 bcachefs: fix for building in userspace --- .bcachefs_revision | 2 +- Makefile | 14 - c_src/cmd_format.c | 2 +- c_src/cmd_fs.c | 2 +- c_src/cmd_list_journal.c | 2 +- c_src/tools-util.h | 2 +- include/linux/darray_types.h | 22 - include/linux/generic-radix-tree.h | 29 +- libbcachefs/alloc_background.c | 62 ++- libbcachefs/backpointers.c | 3 +- libbcachefs/bcachefs.h | 12 +- libbcachefs/bkey.h | 207 +------- libbcachefs/bkey_types.h | 213 +++++++++ libbcachefs/bset.c | 2 +- libbcachefs/btree_cache.c | 2 +- libbcachefs/btree_gc.c | 132 +++-- libbcachefs/btree_io.c | 19 +- libbcachefs/btree_iter.c | 12 +- libbcachefs/btree_journal_iter.c | 2 +- libbcachefs/btree_key_cache.c | 8 +- libbcachefs/btree_locking.h | 2 +- libbcachefs/btree_types.h | 2 +- libbcachefs/btree_update.c | 4 +- libbcachefs/btree_update_interior.c | 86 +++- libbcachefs/btree_update_interior.h | 2 + libbcachefs/btree_write_buffer_types.h | 2 +- libbcachefs/chardev.c | 20 +- {linux => libbcachefs}/darray.c | 7 +- {include/linux => libbcachefs}/darray.h | 59 +-- libbcachefs/errcode.h | 7 +- libbcachefs/error.c | 4 +- libbcachefs/extents.h | 11 +- {include/linux => libbcachefs}/eytzinger.h | 58 +-- libbcachefs/fs-io-buffered.c | 151 ++++-- libbcachefs/fs-io-pagecache.h | 9 +- libbcachefs/fs.c | 4 +- libbcachefs/fsck.c | 11 +- libbcachefs/inode.c | 15 +- libbcachefs/io_read.c | 6 +- libbcachefs/io_write.c | 4 +- libbcachefs/journal.c | 22 +- libbcachefs/journal_io.c | 42 +- libbcachefs/journal_io.h | 10 +- libbcachefs/journal_sb.c | 2 +- libbcachefs/journal_seq_blacklist.c | 73 +-- libbcachefs/journal_types.h | 6 +- libbcachefs/lru.c | 3 +- {linux => libbcachefs}/mean_and_variance.c | 3 +- .../linux => libbcachefs}/mean_and_variance.h | 0 libbcachefs/nocow_locking.c | 2 +- libbcachefs/opts.c | 
8 +- libbcachefs/opts.h | 5 + libbcachefs/recovery.c | 71 ++- libbcachefs/replicas.c | 19 +- libbcachefs/replicas.h | 3 +- libbcachefs/sb-downgrade.c | 5 +- libbcachefs/sb-errors_types.h | 5 +- libbcachefs/sb-members.h | 2 +- libbcachefs/subvolume.h | 1 + libbcachefs/subvolume_types.h | 2 +- libbcachefs/super-io.c | 8 + libbcachefs/super-io.h | 2 +- libbcachefs/super.c | 77 +-- libbcachefs/thread_with_file.c | 450 ++++++++++++++++++ libbcachefs/thread_with_file.h | 76 +++ libbcachefs/thread_with_file_types.h | 23 + libbcachefs/time_stats.c | 165 +++++++ {include/linux => libbcachefs}/time_stats.h | 62 ++- libbcachefs/util.c | 157 +++++- libbcachefs/util.h | 61 ++- libbcachefs/xattr.c | 5 +- linux/generic-radix-tree.c | 35 +- linux/sort.c | 89 ---- linux/time_stats.c | 373 --------------- 74 files changed, 1807 insertions(+), 1273 deletions(-) delete mode 100644 include/linux/darray_types.h create mode 100644 libbcachefs/bkey_types.h rename {linux => libbcachefs}/darray.c (68%) rename {include/linux => libbcachefs}/darray.h (66%) rename {include/linux => libbcachefs}/eytzinger.h (77%) rename {linux => libbcachefs}/mean_and_variance.c (99%) rename {include/linux => libbcachefs}/mean_and_variance.h (100%) create mode 100644 libbcachefs/thread_with_file.c create mode 100644 libbcachefs/thread_with_file.h create mode 100644 libbcachefs/thread_with_file_types.h create mode 100644 libbcachefs/time_stats.c rename {include/linux => libbcachefs}/time_stats.h (63%) delete mode 100644 linux/time_stats.c diff --git a/.bcachefs_revision b/.bcachefs_revision index f1553698..c9361961 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -26494335d114f7813a7fc499bbacb4a74d613b6f +83338f5b2cb8406cda8bf7be3f566ab97c696917 diff --git a/Makefile b/Makefile index 37101aff..26cbcb91 100644 --- a/Makefile +++ b/Makefile @@ -274,20 +274,6 @@ update-bcachefs-sources: git add include/linux/kmemleak.h cp $(LINUX_DIR)/lib/math/int_sqrt.c linux/ git add linux/int_sqrt.c - cp 
$(LINUX_DIR)/lib/math/mean_and_variance.c linux/ - git add linux/mean_and_variance.c - cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/ - git add include/linux/mean_and_variance.h - cp $(LINUX_DIR)/lib/time_stats.c linux/ - git add linux/time_stats.c - cp $(LINUX_DIR)/include/linux/time_stats.h include/linux/ - git add include/linux/time_stats.h - cp $(LINUX_DIR)/include/linux/darray.h include/linux/ - git add include/linux/darray.h - cp $(LINUX_DIR)/include/linux/darray_types.h include/linux/ - git add include/linux/darray_types.h - cp $(LINUX_DIR)/include/linux/eytzinger.h include/linux/ - git add include/linux/eytzinger.h cp $(LINUX_DIR)/scripts/Makefile.compiler ./ git add Makefile.compiler $(RM) libbcachefs/*.mod.c diff --git a/c_src/cmd_format.c b/c_src/cmd_format.c index 3d29f413..4d6c2dc1 100644 --- a/c_src/cmd_format.c +++ b/c_src/cmd_format.c @@ -28,7 +28,7 @@ #include "libbcachefs/super-io.h" #include "libbcachefs/util.h" -#include "linux/darray.h" +#include "libbcachefs/darray.h" #define OPTS \ x(0, replicas, required_argument) \ diff --git a/c_src/cmd_fs.c b/c_src/cmd_fs.c index d8542d61..e08054e8 100644 --- a/c_src/cmd_fs.c +++ b/c_src/cmd_fs.c @@ -15,7 +15,7 @@ #include "cmds.h" #include "libbcachefs.h" -#include "linux/darray.h" +#include "libbcachefs/darray.h" static void __dev_usage_type_to_text(struct printbuf *out, enum bch_data_type type, diff --git a/c_src/cmd_list_journal.c b/c_src/cmd_list_journal.c index d104f50a..38ec8a28 100644 --- a/c_src/cmd_list_journal.c +++ b/c_src/cmd_list_journal.c @@ -128,7 +128,7 @@ static void journal_entries_print(struct bch_fs *c, unsigned nr_entries, if (le64_to_cpu(p->j.seq) + nr_entries < atomic64_read(&c->journal.seq)) continue; - bool blacklisted = p->ignore || + bool blacklisted = p->ignore_blacklisted || bch2_journal_seq_is_blacklisted(c, le64_to_cpu(p->j.seq), false); diff --git a/c_src/tools-util.h b/c_src/tools-util.h index 4682406e..bff3bc65 100644 --- a/c_src/tools-util.h +++ 
b/c_src/tools-util.h @@ -20,7 +20,7 @@ #include <linux/uuid.h> #include "libbcachefs/bcachefs.h" #include "libbcachefs/bbpos.h" -#include "linux/darray.h" +#include "libbcachefs/darray.h" #define noreturn __attribute__((noreturn)) diff --git a/include/linux/darray_types.h b/include/linux/darray_types.h deleted file mode 100644 index a400a0c3..00000000 --- a/include/linux/darray_types.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev> - */ -#ifndef _LINUX_DARRAY_TYpES_H -#define _LINUX_DARRAY_TYpES_H - -#include <linux/types.h> - -#define DARRAY_PREALLOCATED(_type, _nr) \ -struct { \ - size_t nr, size; \ - _type *data; \ - _type preallocated[_nr]; \ -} - -#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) - -typedef DARRAY(char) darray_char; -typedef DARRAY(char *) darray_str; - -#endif /* _LINUX_DARRAY_TYpES_H */ diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index 84741316..f3512fdd 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -5,7 +5,7 @@ * DOC: Generic radix trees/sparse arrays * * Very simple and minimalistic, supporting arbitrary size entries up to - * PAGE_SIZE. + * GENRADIX_NODE_SIZE. 
* * A genradix is defined with the type it will store, like so: * @@ -45,12 +45,15 @@ struct genradix_root; +#define GENRADIX_NODE_SHIFT 9 +#define GENRADIX_NODE_SIZE (1U << GENRADIX_NODE_SHIFT) + struct __genradix { struct genradix_root *root; }; /* - * NOTE: currently, sizeof(_type) must not be larger than PAGE_SIZE: + * NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE: */ #define __GENRADIX_INITIALIZER \ @@ -101,14 +104,14 @@ void __genradix_free(struct __genradix *); static inline size_t __idx_to_offset(size_t idx, size_t obj_size) { if (__builtin_constant_p(obj_size)) - BUILD_BUG_ON(obj_size > PAGE_SIZE); + BUILD_BUG_ON(obj_size > GENRADIX_NODE_SIZE); else - BUG_ON(obj_size > PAGE_SIZE); + BUG_ON(obj_size > GENRADIX_NODE_SIZE); if (!is_power_of_2(obj_size)) { - size_t objs_per_page = PAGE_SIZE / obj_size; + size_t objs_per_page = GENRADIX_NODE_SIZE / obj_size; - return (idx / objs_per_page) * PAGE_SIZE + + return (idx / objs_per_page) * GENRADIX_NODE_SIZE + (idx % objs_per_page) * obj_size; } else { return idx * obj_size; @@ -118,9 +121,9 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size) #define __genradix_cast(_radix) (typeof((_radix)->type[0]) *) #define __genradix_obj_size(_radix) sizeof((_radix)->type[0]) #define __genradix_objs_per_page(_radix) \ - (PAGE_SIZE / sizeof((_radix)->type[0])) + (GENRADIX_NODE_SIZE / sizeof((_radix)->type[0])) #define __genradix_page_remainder(_radix) \ - (PAGE_SIZE % sizeof((_radix)->type[0])) + (GENRADIX_NODE_SIZE % sizeof((_radix)->type[0])) #define __genradix_idx_to_offset(_radix, _idx) \ __idx_to_offset(_idx, __genradix_obj_size(_radix)) @@ -217,8 +220,8 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter, iter->offset += obj_size; if (!is_power_of_2(obj_size) && - (iter->offset & (PAGE_SIZE - 1)) + obj_size > PAGE_SIZE) - iter->offset = round_up(iter->offset, PAGE_SIZE); + (iter->offset & (GENRADIX_NODE_SIZE - 1)) + obj_size > GENRADIX_NODE_SIZE) + 
iter->offset = round_up(iter->offset, GENRADIX_NODE_SIZE); iter->pos++; } @@ -235,8 +238,8 @@ static inline void __genradix_iter_rewind(struct genradix_iter *iter, return; } - if ((iter->offset & (PAGE_SIZE - 1)) == 0) - iter->offset -= PAGE_SIZE % obj_size; + if ((iter->offset & (GENRADIX_NODE_SIZE - 1)) == 0) + iter->offset -= GENRADIX_NODE_SIZE % obj_size; iter->offset -= obj_size; iter->pos--; @@ -263,7 +266,7 @@ static inline void __genradix_iter_rewind(struct genradix_iter *iter, genradix_for_each_from(_radix, _iter, _p, 0) #define genradix_last_pos(_radix) \ - (SIZE_MAX / PAGE_SIZE * __genradix_objs_per_page(_radix) - 1) + (SIZE_MAX / GENRADIX_NODE_SIZE * __genradix_objs_per_page(_radix) - 1) /** * genradix_for_each_reverse - iterate over entry in a genradix, reverse order diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index ccd6cbfd..c47f72f2 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -1052,14 +1052,13 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; - if (k.k->type != discard_key_type && - (c->opts.reconstruct_alloc || - fsck_err(c, need_discard_key_wrong, - "incorrect key in need_discard btree (got %s should be %s)\n" - " %s", - bch2_bkey_types[k.k->type], - bch2_bkey_types[discard_key_type], - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + if (fsck_err_on(k.k->type != discard_key_type, + c, need_discard_key_wrong, + "incorrect key in need_discard btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[discard_key_type], + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); @@ -1083,15 +1082,14 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; - if (k.k->type != freespace_key_type && - (c->opts.reconstruct_alloc || - fsck_err(c, freespace_key_wrong, - "incorrect key in freespace btree (got %s should be %s)\n" - " %s", - 
bch2_bkey_types[k.k->type], - bch2_bkey_types[freespace_key_type], - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + if (fsck_err_on(k.k->type != freespace_key_type, + c, freespace_key_wrong, + "incorrect key in freespace btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[freespace_key_type], + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); @@ -1115,14 +1113,13 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; - if (a->gen != alloc_gen(k, gens_offset) && - (c->opts.reconstruct_alloc || - fsck_err(c, bucket_gens_key_wrong, - "incorrect gen in bucket_gens btree (got %u should be %u)\n" - " %s", - alloc_gen(k, gens_offset), a->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + if (fsck_err_on(a->gen != alloc_gen(k, gens_offset), + c, bucket_gens_key_wrong, + "incorrect gen in bucket_gens btree (got %u should be %u)\n" + " %s", + alloc_gen(k, gens_offset), a->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i_bucket_gens *g = bch2_trans_kmalloc(trans, sizeof(*g)); @@ -1174,14 +1171,13 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, *end = bkey_min(k.k->p, *end); - if (k.k->type != KEY_TYPE_set && - (c->opts.reconstruct_alloc || - fsck_err(c, freespace_hole_missing, - "hole in alloc btree missing in freespace btree\n" - " device %llu buckets %llu-%llu", - freespace_iter->pos.inode, - freespace_iter->pos.offset, - end->offset))) { + if (fsck_err_on(k.k->type != KEY_TYPE_set, + c, freespace_hole_missing, + "hole in alloc btree missing in freespace btree\n" + " device %llu buckets %llu-%llu", + freespace_iter->pos.inode, + freespace_iter->pos.offset, + end->offset)) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); diff --git a/libbcachefs/backpointers.c 
b/libbcachefs/backpointers.c index f2b33fe4..8cb35ea5 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -477,8 +477,7 @@ missing: prt_printf(&buf, "\nbp pos "); bch2_bpos_to_text(&buf, bp_iter.pos); - if (c->opts.reconstruct_alloc || - fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) + if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); goto out; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 0bee9dab..339dc3e1 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -200,8 +200,6 @@ #include <linux/seqlock.h> #include <linux/shrinker.h> #include <linux/srcu.h> -#include <linux/thread_with_file_types.h> -#include <linux/time_stats.h> #include <linux/types.h> #include <linux/workqueue.h> #include <linux/zstd.h> @@ -214,6 +212,7 @@ #include "recovery_types.h" #include "sb-errors_types.h" #include "seqmutex.h" +#include "time_stats.h" #include "util.h" #ifdef CONFIG_BCACHEFS_DEBUG @@ -470,6 +469,7 @@ enum bch_time_stats { #include "replicas_types.h" #include "subvolume_types.h" #include "super_types.h" +#include "thread_with_file_types.h" /* Number of nodes btree coalesce will try to coalesce at once */ #define GC_MERGE_NODES 4U @@ -598,7 +598,7 @@ struct bch_dev { /* The rest of this all shows up in sysfs */ atomic64_t cur_latency[2]; - struct time_stats_quantiles io_latency[2]; + struct bch2_time_stats_quantiles io_latency[2]; #define CONGESTED_MAX 1024 atomic_t congested; @@ -645,8 +645,8 @@ struct btree_debug { #define BCH_TRANSACTIONS_NR 128 struct btree_transaction_stats { - struct time_stats duration; - struct time_stats lock_hold_times; + struct bch2_time_stats duration; + struct bch2_time_stats lock_hold_times; struct mutex lock; unsigned nr_max_paths; unsigned journal_entries_size; @@ -1111,7 +1111,7 @@ struct bch_fs { unsigned copy_gc_enabled:1; bool promote_whole_extents; - struct time_stats times[BCH_TIME_STAT_NR]; + 
struct bch2_time_stats times[BCH_TIME_STAT_NR]; struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 831be018..cf23ff47 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -4,7 +4,7 @@ #include <linux/bug.h> #include "bcachefs_format.h" - +#include "bkey_types.h" #include "btree_types.h" #include "util.h" #include "vstructs.h" @@ -31,57 +31,6 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *, const struct bkey_format *, const struct bkey_packed *); -/* bkey with split value, const */ -struct bkey_s_c { - const struct bkey *k; - const struct bch_val *v; -}; - -/* bkey with split value */ -struct bkey_s { - union { - struct { - struct bkey *k; - struct bch_val *v; - }; - struct bkey_s_c s_c; - }; -}; - -#define bkey_p_next(_k) vstruct_next(_k) - -static inline struct bkey_i *bkey_next(struct bkey_i *k) -{ - return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s); -} - -#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) - -static inline size_t bkey_val_bytes(const struct bkey *k) -{ - return bkey_val_u64s(k) * sizeof(u64); -} - -static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) -{ - unsigned u64s = BKEY_U64s + val_u64s; - - BUG_ON(u64s > U8_MAX); - k->u64s = u64s; -} - -static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) -{ - set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); -} - -#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) - -#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) - -#define bkey_whiteout(_k) \ - ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) - enum bkey_lr_packed { BKEY_PACKED_BOTH, BKEY_PACKED_RIGHT, @@ -362,10 +311,7 @@ static inline struct bpos bkey_start_pos(const struct bkey *k) static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, const struct bkey_packed *k) { - unsigned ret = bkey_packed(k) ? 
format->key_u64s : BKEY_U64s; - - EBUG_ON(k->u64s < ret); - return ret; + return bkey_packed(k) ? format->key_u64s : BKEY_U64s; } static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, @@ -553,155 +499,6 @@ static inline void bkey_reassemble(struct bkey_i *dst, memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); } -#define bkey_s_null ((struct bkey_s) { .k = NULL }) -#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) - -#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) -#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) - -static inline struct bkey_s bkey_to_s(struct bkey *k) -{ - return (struct bkey_s) { .k = k, .v = NULL }; -} - -static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) -{ - return (struct bkey_s_c) { .k = k, .v = NULL }; -} - -static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) -{ - return (struct bkey_s) { .k = &k->k, .v = &k->v }; -} - -static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) -{ - return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; -} - -/* - * For a given type of value (e.g. struct bch_extent), generates the types for - * bkey + bch_extent - inline, split, split const - and also all the conversion - * functions, which also check that the value is of the correct type. - * - * We use anonymous unions for upcasting - e.g. converting from e.g. a - * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion - * functions. - */ -#define x(name, ...) 
\ -struct bkey_i_##name { \ - union { \ - struct bkey k; \ - struct bkey_i k_i; \ - }; \ - struct bch_##name v; \ -}; \ - \ -struct bkey_s_c_##name { \ - union { \ - struct { \ - const struct bkey *k; \ - const struct bch_##name *v; \ - }; \ - struct bkey_s_c s_c; \ - }; \ -}; \ - \ -struct bkey_s_##name { \ - union { \ - struct { \ - struct bkey *k; \ - struct bch_##name *v; \ - }; \ - struct bkey_s_c_##name c; \ - struct bkey_s s; \ - struct bkey_s_c s_c; \ - }; \ -}; \ - \ -static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return container_of(&k->k, struct bkey_i_##name, k); \ -} \ - \ -static inline const struct bkey_i_##name * \ -bkey_i_to_##name##_c(const struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return container_of(&k->k, struct bkey_i_##name, k); \ -} \ - \ -static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ - return (struct bkey_s_##name) { \ - .k = k.k, \ - .v = container_of(k.v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ - return (struct bkey_s_c_##name) { \ - .k = k.k, \ - .v = container_of(k.v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ -{ \ - return (struct bkey_s_##name) { \ - .k = &k->k, \ - .v = &k->v, \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name \ -name##_i_to_s_c(const struct bkey_i_##name *k) \ -{ \ - return (struct bkey_s_c_##name) { \ - .k = &k->k, \ - .v = &k->v, \ - }; \ -} \ - \ -static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return (struct bkey_s_##name) { \ - .k = &k->k, \ - .v = 
container_of(&k->v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name \ -bkey_i_to_s_c_##name(const struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return (struct bkey_s_c_##name) { \ - .k = &k->k, \ - .v = container_of(&k->v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ -{ \ - struct bkey_i_##name *k = \ - container_of(&_k->k, struct bkey_i_##name, k); \ - \ - bkey_init(&k->k); \ - memset(&k->v, 0, sizeof(k->v)); \ - k->k.type = KEY_TYPE_##name; \ - set_bkey_val_bytes(&k->k, sizeof(k->v)); \ - \ - return k; \ -} - -BCH_BKEY_TYPES(); -#undef x - /* byte order helpers */ #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ diff --git a/libbcachefs/bkey_types.h b/libbcachefs/bkey_types.h new file mode 100644 index 00000000..c9ae9e42 --- /dev/null +++ b/libbcachefs/bkey_types.h @@ -0,0 +1,213 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_TYPES_H +#define _BCACHEFS_BKEY_TYPES_H + +#include "bcachefs_format.h" + +/* + * bkey_i - bkey with inline value + * bkey_s - bkey with split value + * bkey_s_c - bkey with split value, const + */ + +#define bkey_p_next(_k) vstruct_next(_k) + +static inline struct bkey_i *bkey_next(struct bkey_i *k) +{ + return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s); +} + +#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) + +static inline size_t bkey_val_bytes(const struct bkey *k) +{ + return bkey_val_u64s(k) * sizeof(u64); +} + +static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) +{ + unsigned u64s = BKEY_U64s + val_u64s; + + BUG_ON(u64s > U8_MAX); + k->u64s = u64s; +} + +static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) +{ + set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); +} + +#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) + +#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) + +#define 
bkey_whiteout(_k) \ + ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) + +/* bkey with split value, const */ +struct bkey_s_c { + const struct bkey *k; + const struct bch_val *v; +}; + +/* bkey with split value */ +struct bkey_s { + union { + struct { + struct bkey *k; + struct bch_val *v; + }; + struct bkey_s_c s_c; + }; +}; + +#define bkey_s_null ((struct bkey_s) { .k = NULL }) +#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) + +#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) +#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) + +static inline struct bkey_s bkey_to_s(struct bkey *k) +{ + return (struct bkey_s) { .k = k, .v = NULL }; +} + +static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) +{ + return (struct bkey_s_c) { .k = k, .v = NULL }; +} + +static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) +{ + return (struct bkey_s) { .k = &k->k, .v = &k->v }; +} + +static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) +{ + return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; +} + +/* + * For a given type of value (e.g. struct bch_extent), generates the types for + * bkey + bch_extent - inline, split, split const - and also all the conversion + * functions, which also check that the value is of the correct type. + * + * We use anonymous unions for upcasting - e.g. converting from e.g. a + * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion + * functions. + */ +#define x(name, ...) 
\ +struct bkey_i_##name { \ + union { \ + struct bkey k; \ + struct bkey_i k_i; \ + }; \ + struct bch_##name v; \ +}; \ + \ +struct bkey_s_c_##name { \ + union { \ + struct { \ + const struct bkey *k; \ + const struct bch_##name *v; \ + }; \ + struct bkey_s_c s_c; \ + }; \ +}; \ + \ +struct bkey_s_##name { \ + union { \ + struct { \ + struct bkey *k; \ + struct bch_##name *v; \ + }; \ + struct bkey_s_c_##name c; \ + struct bkey_s s; \ + struct bkey_s_c s_c; \ + }; \ +}; \ + \ +static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return container_of(&k->k, struct bkey_i_##name, k); \ +} \ + \ +static inline const struct bkey_i_##name * \ +bkey_i_to_##name##_c(const struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return container_of(&k->k, struct bkey_i_##name, k); \ +} \ + \ +static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ + return (struct bkey_s_##name) { \ + .k = k.k, \ + .v = container_of(k.v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ + return (struct bkey_s_c_##name) { \ + .k = k.k, \ + .v = container_of(k.v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ +{ \ + return (struct bkey_s_##name) { \ + .k = &k->k, \ + .v = &k->v, \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name \ +name##_i_to_s_c(const struct bkey_i_##name *k) \ +{ \ + return (struct bkey_s_c_##name) { \ + .k = &k->k, \ + .v = &k->v, \ + }; \ +} \ + \ +static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return (struct bkey_s_##name) { \ + .k = &k->k, \ + .v = 
container_of(&k->v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name \ +bkey_i_to_s_c_##name(const struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return (struct bkey_s_c_##name) { \ + .k = &k->k, \ + .v = container_of(&k->v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ +{ \ + struct bkey_i_##name *k = \ + container_of(&_k->k, struct bkey_i_##name, k); \ + \ + bkey_init(&k->k); \ + memset(&k->v, 0, sizeof(k->v)); \ + k->k.type = KEY_TYPE_##name; \ + set_bkey_val_bytes(&k->k, sizeof(k->v)); \ + \ + return k; \ +} + +BCH_BKEY_TYPES(); +#undef x + +#endif /* _BCACHEFS_BKEY_TYPES_H */ diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index 1d77aa55..3fd1085b 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -9,12 +9,12 @@ #include "bcachefs.h" #include "btree_cache.h" #include "bset.h" +#include "eytzinger.h" #include "trace.h" #include "util.h" #include <asm/unaligned.h> #include <linux/console.h> -#include <linux/eytzinger.h> #include <linux/random.h> #include <linux/prefetch.h> diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 79975046..562561a9 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -661,7 +661,7 @@ out: bch2_btree_keys_init(b); set_btree_node_accessed(b); - time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], start_time); memalloc_nofs_restore(flags); diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 6c52f116..584aee70 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -593,16 +593,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr); - if (!g->gen_valid && - (c->opts.reconstruct_alloc || - 
fsck_err(c, ptr_to_missing_alloc_key, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { + if (fsck_err_on(!g->gen_valid, + c, ptr_to_missing_alloc_key, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (!p.ptr.cached) { g->gen_valid = true; g->gen = p.ptr.gen; @@ -611,16 +610,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id } } - if (gen_cmp(p.ptr.gen, g->gen) > 0 && - (c->opts.reconstruct_alloc || - fsck_err(c, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, + c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (!p.ptr.cached) { g->gen_valid = true; g->gen = p.ptr.gen; @@ -633,28 +631,26 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id } } - if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX && - (c->opts.reconstruct_alloc || - fsck_err(c, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - 
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) + if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, + c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; - if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 && - (c->opts.reconstruct_alloc || - fsck_err(c, stale_dirty_ptr, - "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) + if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, + c, stale_dirty_ptr, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) @@ -1366,11 +1362,10 @@ static int bch2_alloc_write_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); - struct bucket gc, *b; + struct bucket old_gc, gc, *b; struct bkey_i_alloc_v4 *a; struct bch_alloc_v4 old_convert, new; const struct bch_alloc_v4 *old; - enum bch_data_type type; int ret; old = bch2_alloc_to_v4(k, &old_convert); @@ -1378,30 +1373,31 @@ static int bch2_alloc_write_key(struct btree_trans *trans, percpu_down_read(&c->mark_lock); b = gc_bucket(ca, iter->pos.offset); + old_gc = *b; + + if ((old->data_type == BCH_DATA_sb || + old->data_type == BCH_DATA_journal) && 
+ !bch2_dev_is_online(ca)) { + b->data_type = old->data_type; + b->dirty_sectors = old->dirty_sectors; + } /* * b->data_type doesn't yet include need_discard & need_gc_gen states - * fix that here: */ - type = __alloc_data_type(b->dirty_sectors, - b->cached_sectors, - b->stripe, - *old, - b->data_type); - if (b->data_type != type) { - struct bch_dev_usage *u; - - preempt_disable(); - u = this_cpu_ptr(ca->usage_gc); - u->d[b->data_type].buckets--; - b->data_type = type; - u->d[b->data_type].buckets++; - preempt_enable(); - } - + b->data_type = __alloc_data_type(b->dirty_sectors, + b->cached_sectors, + b->stripe, + *old, + b->data_type); gc = *b; percpu_up_read(&c->mark_lock); + if (gc.data_type != old_gc.data_type || + gc.dirty_sectors != old_gc.dirty_sectors) + bch2_dev_usage_update_m(c, ca, &old_gc, &gc); + if (metadata_only && gc.data_type != BCH_DATA_sb && gc.data_type != BCH_DATA_journal && @@ -1411,8 +1407,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, if (gen_after(old->gen, gc.gen)) return 0; - if (c->opts.reconstruct_alloc || - fsck_err_on(new.data_type != gc.data_type, c, + if (fsck_err_on(new.data_type != gc.data_type, c, alloc_key_data_type_wrong, "bucket %llu:%llu gen %u has wrong data_type" ": got %s, should be %s", @@ -1423,8 +1418,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, new.data_type = gc.data_type; #define copy_bucket_field(_errtype, _f) \ - if (c->opts.reconstruct_alloc || \ - fsck_err_on(new._f != gc._f, c, _errtype, \ + if (fsck_err_on(new._f != gc._f, c, _errtype, \ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ ": got %u, should be %u", \ iter->pos.inode, iter->pos.offset, \ @@ -1586,8 +1580,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, " should be %u", (bch2_bkey_val_to_text(&buf, c, k), buf.buf), r->refcount)) { - struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0); - + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); ret = PTR_ERR_OR_ZERO(new); if 
(ret) return ret; @@ -1596,6 +1589,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, new->k.type = KEY_TYPE_deleted; else *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); + ret = bch2_trans_update(trans, iter, new, 0); } fsck_err: printbuf_exit(&buf); @@ -1818,10 +1812,10 @@ out: if (!ret) { bch2_journal_block(&c->journal); - ret = bch2_gc_stripes_done(c, metadata_only) ?: - bch2_gc_reflink_done(c, metadata_only) ?: - bch2_gc_alloc_done(c, metadata_only) ?: - bch2_gc_done(c, initial, metadata_only); + ret = bch2_gc_alloc_done(c, metadata_only) ?: + bch2_gc_done(c, initial, metadata_only) ?: + bch2_gc_stripes_done(c, metadata_only) ?: + bch2_gc_reflink_done(c, metadata_only); bch2_journal_unblock(&c->journal); } @@ -1971,7 +1965,7 @@ int bch2_gc_gens(struct bch_fs *c) c->gc_count++; - time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); + bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); trace_and_count(c, gc_gens_end, c); err: for_each_member_device(c, ca) { diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 86415701..624c8287 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -327,7 +327,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes); if (sorting_entire_node) - time_stats_update(&c->times[BCH_TIME_btree_node_sort], + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], start_time); /* Make sure we preserve bset journal_seq: */ @@ -397,7 +397,7 @@ void bch2_btree_sort_into(struct bch_fs *c, &dst->format, true); - time_stats_update(&c->times[BCH_TIME_btree_node_sort], + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], start_time); set_btree_bset_end(dst, dst->set); @@ -839,6 +839,9 @@ static bool __bkey_valid(struct bch_fs *c, struct btree *b, if (k->format > KEY_FORMAT_CURRENT) return false; + if (k->u64s < bkeyp_key_u64s(&b->format, k)) + return false; + struct printbuf buf = PRINTBUF; 
struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); @@ -880,7 +883,13 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, "invalid bkey format %u", k->format)) goto drop_this_key; - /* XXX: validate k->u64s */ + if (btree_err_on(k->u64s < bkeyp_key_u64s(&b->format, k), + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bkey_bad_u64s, + "k->u64s too small (%u < %u)", k->u64s, bkeyp_key_u64s(&b->format, k))) + goto drop_this_key; + if (!write) bch2_bkey_compat(b->c.level, b->c.btree_id, version, BSET_BIG_ENDIAN(i), write, @@ -1250,7 +1259,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, out: mempool_free(iter, &c->fill_iter); printbuf_exit(&buf); - time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time); + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time); return retry_read; fsck_err: if (ret == -BCH_ERR_btree_node_read_err_want_retry || @@ -1322,7 +1331,7 @@ start: } } - time_stats_update(&c->times[BCH_TIME_btree_node_read], + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], rb->start_time); bio_put(&rb->bio); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 2357af3e..51bcdc6c 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -1729,7 +1729,9 @@ bch2_btree_iter_traverse(struct btree_iter *iter) if (ret) return ret; - btree_path_set_should_be_locked(trans->paths + iter->path); + struct btree_path *path = btree_iter_path(trans, iter); + if (btree_path_node(path, path->level)) + btree_path_set_should_be_locked(path); return 0; } @@ -2905,7 +2907,7 @@ u32 bch2_trans_begin(struct btree_trans *trans) if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) && time_after64(now, trans->last_begin_time + 10)) - __time_stats_update(&btree_trans_stats(trans)->duration, + __bch2_time_stats_update(&btree_trans_stats(trans)->duration, trans->last_begin_time, now); if (!trans->restarted && @@ -3230,7 +3232,7 @@ void 
bch2_fs_btree_iter_exit(struct bch_fs *c) s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { kfree(s->max_paths_text); - time_stats_exit(&s->lock_hold_times); + bch2_time_stats_exit(&s->lock_hold_times); } if (c->btree_trans_barrier_initialized) @@ -3246,8 +3248,8 @@ void bch2_fs_btree_iter_init_early(struct bch_fs *c) for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { - time_stats_init(&s->duration); - time_stats_init(&s->lock_hold_times); + bch2_time_stats_init(&s->duration); + bch2_time_stats_init(&s->lock_hold_times); mutex_init(&s->lock); } diff --git a/libbcachefs/btree_journal_iter.c b/libbcachefs/btree_journal_iter.c index 207dd32e..50e04356 100644 --- a/libbcachefs/btree_journal_iter.c +++ b/libbcachefs/btree_journal_iter.c @@ -512,7 +512,7 @@ int bch2_journal_keys_sort(struct bch_fs *c) genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; cond_resched(); diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 74e52fd2..8a71d434 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -380,9 +380,11 @@ static int btree_key_cache_fill(struct btree_trans *trans, struct bkey_i *new_k = NULL; int ret; - k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos, - BTREE_ITER_KEY_CACHE_FILL| - BTREE_ITER_CACHED_NOFILL); + bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos, + BTREE_ITER_KEY_CACHE_FILL| + BTREE_ITER_CACHED_NOFILL); + iter.flags &= ~BTREE_ITER_WITH_JOURNAL; + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index f2e2c588..4bd72c85 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -122,7 +122,7 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans, struct btree_path 
*path, unsigned level) { #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - __time_stats_update(&btree_trans_stats(trans)->lock_hold_times, + __bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times, path->l[level].lock_taken_time, local_clock()); #endif diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index b2ebf143..9404d96c 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -2,13 +2,13 @@ #ifndef _BCACHEFS_BTREE_TYPES_H #define _BCACHEFS_BTREE_TYPES_H -#include <linux/darray_types.h> #include <linux/list.h> #include <linux/rhashtable.h> #include "bbpos_types.h" #include "btree_key_cache_types.h" #include "buckets_types.h" +#include "darray.h" #include "errcode.h" #include "journal_types.h" #include "replicas_types.h" diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c index cbb7cf21..a4b40c16 100644 --- a/libbcachefs/btree_update.c +++ b/libbcachefs/btree_update.c @@ -14,8 +14,6 @@ #include "snapshot.h" #include "trace.h" -#include <linux/darray.h> - static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, const struct btree_insert_entry *r) { @@ -454,7 +452,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, * the key cache - but the key has to exist in the btree for that to * work: */ - if (path->cached && bkey_deleted(&i->old_k)) + if (path->cached && !i->old_btree_u64s) return flush_new_cached_update(trans, i, flags, ip); return 0; diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 70da4fa2..642213ef 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -25,8 +25,7 @@ #include <linux/random.h> static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, - btree_path_idx_t, struct btree *, - struct keylist *, unsigned); + btree_path_idx_t, struct btree *, struct keylist *); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree 
*); static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans, @@ -517,7 +516,7 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans * bch2_disk_reservation_put(c, &as->disk_res); bch2_btree_reserve_put(as, trans); - time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], + bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], as->start_time); mutex_lock(&c->btree_interior_update_lock); @@ -1039,7 +1038,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * continue_at(&as->cl, btree_update_set_nodes_written, as->c->btree_interior_update_worker); - time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground], + bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground], start_time); } @@ -1208,10 +1207,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) mutex_unlock(&c->btree_cache.lock); mutex_lock(&c->btree_root_lock); - BUG_ON(btree_node_root(c, b) && - (b->c.level < btree_node_root(c, b)->c.level || - !btree_node_dying(btree_node_root(c, b)))); - bch2_btree_id_root(c, b->c.btree_id)->b = b; mutex_unlock(&c->btree_root_lock); @@ -1477,7 +1472,7 @@ static void btree_split_insert_keys(struct btree_update *as, static int btree_split(struct btree_update *as, struct btree_trans *trans, btree_path_idx_t path, struct btree *b, - struct keylist *keys, unsigned flags) + struct keylist *keys) { struct bch_fs *c = as->c; struct btree *parent = btree_node_parent(trans->paths + path, b); @@ -1578,7 +1573,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (parent) { /* Split a non root node */ - ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); if (ret) goto err; } else if (n3) { @@ -1630,7 +1625,7 @@ out: bch2_trans_verify_locks(trans); - time_stats_update(&c->times[n2 + 
bch2_time_stats_update(&c->times[n2 ? BCH_TIME_btree_node_split : BCH_TIME_btree_node_compact], start_time); @@ -1673,7 +1668,6 @@ bch2_btree_insert_keys_interior(struct btree_update *as, * @path_idx: path that points to current node * @b: node to insert keys into * @keys: list of keys to insert - * @flags: transaction commit flags * * Returns: 0 on success, typically transaction restart error on failure * @@ -1683,7 +1677,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, */ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, btree_path_idx_t path_idx, struct btree *b, - struct keylist *keys, unsigned flags) + struct keylist *keys) { struct bch_fs *c = as->c; struct btree_path *path = trans->paths + path_idx; @@ -1739,7 +1733,7 @@ split: return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); } - return btree_split(as, trans, path_idx, b, keys, flags); + return btree_split(as, trans, path_idx, b, keys); } int bch2_btree_split_leaf(struct btree_trans *trans, @@ -1747,7 +1741,6 @@ int bch2_btree_split_leaf(struct btree_trans *trans, unsigned flags) { /* btree_split & merge may both cause paths array to be reallocated */ - struct btree *b = path_l(trans->paths + path)->b; struct btree_update *as; unsigned l; @@ -1759,7 +1752,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans, if (IS_ERR(as)) return PTR_ERR(as); - ret = btree_split(as, trans, path, b, NULL, flags); + ret = btree_split(as, trans, path, b, NULL); if (ret) { bch2_btree_update_free(as, trans); return ret; @@ -1775,6 +1768,60 @@ int bch2_btree_split_leaf(struct btree_trans *trans, return ret; } +static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans, + btree_path_idx_t path_idx) +{ + struct bch_fs *c = as->c; + struct btree_path *path = trans->paths + path_idx; + struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b; + + BUG_ON(!btree_node_locked(path, b->c.level)); + + n = __btree_root_alloc(as, 
trans, b->c.level + 1); + + bch2_btree_update_add_new_node(as, n); + six_unlock_write(&n->c.lock); + + path->locks_want++; + BUG_ON(btree_node_locked(path, n->c.level)); + six_lock_increment(&n->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, path, n); + + n->sib_u64s[0] = U16_MAX; + n->sib_u64s[1] = U16_MAX; + + bch2_keylist_add(&as->parent_keys, &b->key); + btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys); + + bch2_btree_set_root(as, trans, path, n); + bch2_btree_update_get_open_buckets(as, n); + bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + bch2_trans_node_add(trans, path, n); + six_unlock_intent(&n->c.lock); + + mutex_lock(&c->btree_cache.lock); + list_add_tail(&b->list, &c->btree_cache.live); + mutex_unlock(&c->btree_cache.lock); + + bch2_trans_verify_locks(trans); +} + +int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b; + struct btree_update *as = + bch2_btree_update_start(trans, trans->paths + path, + b->c.level, true, flags); + if (IS_ERR(as)) + return PTR_ERR(as); + + __btree_increase_depth(as, trans, path); + bch2_btree_update_done(as, trans); + return 0; +} + int __bch2_foreground_maybe_merge(struct btree_trans *trans, btree_path_idx_t path, unsigned level, @@ -1915,7 +1962,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_trans_verify_paths(trans); - ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); if (ret) goto err_free_update; @@ -1935,7 +1982,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_update_done(as, trans); - time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); + 
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); out: err: if (new_path) @@ -1986,8 +2033,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); - ret = bch2_btree_insert_node(as, trans, iter->path, - parent, &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys); if (ret) goto err; } else { diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index c593c925..3439b037 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -119,6 +119,8 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned); +int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned); + int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t, unsigned, unsigned, enum btree_node_sibling); diff --git a/libbcachefs/btree_write_buffer_types.h b/libbcachefs/btree_write_buffer_types.h index 5f248873..9b9433de 100644 --- a/libbcachefs/btree_write_buffer_types.h +++ b/libbcachefs/btree_write_buffer_types.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H #define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H -#include <linux/darray_types.h> +#include "darray.h" #include "journal_types.h" #define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4 diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 99293915..38defa19 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -11,6 +11,7 @@ #include "replicas.h" #include "super.h" #include "super-io.h" +#include "thread_with_file.h" #include <linux/cdev.h> #include <linux/device.h> @@ -19,15 +20,8 @@ #include <linux/major.h> #include <linux/sched/task.h> #include <linux/slab.h> -#include <linux/thread_with_file.h> #include <linux/uaccess.h> -__must_check -static int copy_to_user_errcode(void __user *to, const 
void *from, unsigned long n) -{ - return copy_to_user(to, from, n) ? -EFAULT : 0; -} - /* returns with ref on ca->ref */ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, unsigned flags) @@ -172,9 +166,9 @@ static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) bch2_fs_stop(c); if (ret & 1) - stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); + bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); if (ret & 4) - stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); + bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); return ret; } @@ -236,7 +230,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); - ret = run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops); + ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops); err: if (ret < 0) { if (thr) @@ -439,7 +433,7 @@ static int bch2_data_job_release(struct inode *inode, struct file *file) { struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); - thread_with_file_exit(&ctx->thr); + bch2_thread_with_file_exit(&ctx->thr); kfree(ctx); return 0; } @@ -489,7 +483,7 @@ static long bch2_ioctl_data(struct bch_fs *c, ctx->c = c; ctx->arg = arg; - ret = run_thread_with_file(&ctx->thr, + ret = bch2_run_thread_with_file(&ctx->thr, &bcachefs_data_ops, bch2_data_thread); if (ret < 0) @@ -857,7 +851,7 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c, goto err; } - ret = run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); + ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); err: if (ret < 0) { bch_err_fn(c, ret); diff --git a/linux/darray.c b/libbcachefs/darray.c similarity index 68% rename from linux/darray.c rename to libbcachefs/darray.c index 80e77959..ac35b8b7 100644 --- a/linux/darray.c +++ b/libbcachefs/darray.c 
@@ -1,13 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 -/* - * (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev> - */ -#include <linux/darray.h> #include <linux/log2.h> #include <linux/slab.h> +#include "darray.h" -int __darray_resize_slowpath(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) +int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) { if (new_size > d->size) { new_size = roundup_pow_of_two(new_size); diff --git a/include/linux/darray.h b/libbcachefs/darray.h similarity index 66% rename from include/linux/darray.h rename to libbcachefs/darray.h index ff167eb7..4b340d13 100644 --- a/include/linux/darray.h +++ b/libbcachefs/darray.h @@ -1,26 +1,34 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* - * (C) 2022-2024 Kent Overstreet <kent.overstreet@linux.dev> - */ -#ifndef _LINUX_DARRAY_H -#define _LINUX_DARRAY_H +#ifndef _BCACHEFS_DARRAY_H +#define _BCACHEFS_DARRAY_H /* - * Dynamic arrays + * Dynamic arrays: * * Inspired by CCAN's darray */ -#include <linux/darray_types.h> #include <linux/slab.h> -int __darray_resize_slowpath(darray_char *, size_t, size_t, gfp_t); +#define DARRAY_PREALLOCATED(_type, _nr) \ +struct { \ + size_t nr, size; \ + _type *data; \ + _type preallocated[_nr]; \ +} + +#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) + +typedef DARRAY(char) darray_char; +typedef DARRAY(char *) darray_str; + +int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t); static inline int __darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) { return unlikely(new_size > d->size) - ? __darray_resize_slowpath(d, element_size, new_size, gfp) + ? 
__bch2_darray_resize(d, element_size, new_size, gfp) : 0; } @@ -61,28 +69,6 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, #define darray_first(_d) ((_d).data[0]) #define darray_last(_d) ((_d).data[(_d).nr - 1]) -/* Insert/remove items into the middle of a darray: */ - -#define array_insert_item(_array, _nr, _pos, _new_item) \ -do { \ - memmove(&(_array)[(_pos) + 1], \ - &(_array)[(_pos)], \ - sizeof((_array)[0]) * ((_nr) - (_pos))); \ - (_nr)++; \ - (_array)[(_pos)] = (_new_item); \ -} while (0) - -#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ -do { \ - (_nr) -= (_nr_to_remove); \ - memmove(&(_array)[(_pos)], \ - &(_array)[(_pos) + (_nr_to_remove)], \ - sizeof((_array)[0]) * ((_nr) - (_pos))); \ -} while (0) - -#define array_remove_item(_array, _nr, _pos) \ - array_remove_items(_array, _nr, _pos, 1) - #define darray_insert_item(_d, pos, _item) \ ({ \ size_t _pos = (pos); \ @@ -93,15 +79,10 @@ do { \ _ret; \ }) -#define darray_remove_items(_d, _pos, _nr_to_remove) \ - array_remove_items((_d)->data, (_d)->nr, (_pos) - (_d)->data, _nr_to_remove) - #define darray_remove_item(_d, _pos) \ - darray_remove_items(_d, _pos, 1) + array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data) -/* Iteration: */ - -#define __darray_for_each(_d, _i) \ +#define __darray_for_each(_d, _i) \ for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++) #define darray_for_each(_d, _i) \ @@ -125,4 +106,4 @@ do { \ darray_init(_d); \ } while (0) -#endif /* _LINUX_DARRAY_H */ +#endif /* _BCACHEFS_DARRAY_H */ diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index e960a6ea..af25d8ec 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -5,6 +5,10 @@ #define BCH_ERRCODES() \ x(ERANGE, ERANGE_option_too_small) \ x(ERANGE, ERANGE_option_too_big) \ + x(EINVAL, mount_option) \ + x(BCH_ERR_mount_option, option_name) \ + x(BCH_ERR_mount_option, option_value) \ + x(BCH_ERR_mount_option, option_not_bool) \ x(ENOMEM, 
ENOMEM_stripe_buf) \ x(ENOMEM, ENOMEM_replicas_table) \ x(ENOMEM, ENOMEM_cpu_replicas) \ @@ -247,7 +251,8 @@ x(BCH_ERR_nopromote, nopromote_congested) \ x(BCH_ERR_nopromote, nopromote_in_flight) \ x(BCH_ERR_nopromote, nopromote_no_writes) \ - x(BCH_ERR_nopromote, nopromote_enomem) + x(BCH_ERR_nopromote, nopromote_enomem) \ + x(0, need_inode_lock) enum bch_errcode { BCH_ERR_START = 2048, diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 8ae95b21..04343120 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -3,7 +3,7 @@ #include "error.h" #include "recovery.h" #include "super.h" -#include <linux/thread_with_file.h> +#include "thread_with_file.h" #define FSCK_ERR_RATELIMIT_NR 10 @@ -111,7 +111,7 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) do { bch2_print(c, " (y,n, or Y,N for all errors of this type) "); - int r = stdio_redirect_readline(stdio, buf, sizeof(buf) - 1); + int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1); if (r < 0) return YN_NO; buf[r] = '\0'; diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 6bf839d6..6219f2c0 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -43,6 +43,11 @@ enum bkey_invalid_flags; #define extent_entry_next(_entry) \ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) +#define extent_entry_next_safe(_entry, _end) \ + (likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX) \ + ? 
extent_entry_next(_entry) \ + : _end) + static inline unsigned __extent_entry_type(const union bch_extent_entry *e) { @@ -280,7 +285,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) #define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ for ((_entry) = (_start); \ (_entry) < (_end); \ - (_entry) = extent_entry_next(_entry)) + (_entry) = extent_entry_next_safe(_entry, _end)) #define __bkey_ptr_next(_ptr, _end) \ ({ \ @@ -318,7 +323,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) (_ptr).has_ec = false; \ \ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ - switch (extent_entry_type(_entry)) { \ + switch (__extent_entry_type(_entry)) { \ case BCH_EXTENT_ENTRY_ptr: \ (_ptr).ptr = _entry->ptr; \ goto out; \ @@ -344,7 +349,7 @@ out: \ for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ (_entry) = _start; \ __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ - (_entry) = extent_entry_next(_entry)) + (_entry) = extent_entry_next_safe(_entry, _end)) #define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ diff --git a/include/linux/eytzinger.h b/libbcachefs/eytzinger.h similarity index 77% rename from include/linux/eytzinger.h rename to libbcachefs/eytzinger.h index 10315010..b04750db 100644 --- a/include/linux/eytzinger.h +++ b/libbcachefs/eytzinger.h @@ -1,37 +1,27 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_EYTZINGER_H -#define _LINUX_EYTZINGER_H +#ifndef _EYTZINGER_H +#define _EYTZINGER_H #include <linux/bitops.h> #include <linux/log2.h> -#ifdef EYTZINGER_DEBUG -#define EYTZINGER_BUG_ON(cond) BUG_ON(cond) -#else -#define EYTZINGER_BUG_ON(cond) -#endif +#include "util.h" /* * Traversal for trees in eytzinger layout - a full binary tree layed out in an - * array. + * array + */ + +/* + * One based indexing version: * - * Consider using an eytzinger tree any time you would otherwise be doing binary - * search over an array. 
Binary search is a worst case scenario for branch - * prediction and prefetching, but in an eytzinger tree every node's children - * are adjacent in memory, thus we can prefetch children before knowing the - * result of the comparison, assuming multiple nodes fit on a cacheline. - * - * Two variants are provided, for one based indexing and zero based indexing. - * - * Zero based indexing is more convenient, but one based indexing has better - * alignment and thus better performance because each new level of the tree - * starts at a power of two, and thus if element 0 was cacheline aligned, each - * new level will be as well. + * With one based indexing each level of the tree starts at a power of two - + * good for cacheline alignment: */ static inline unsigned eytzinger1_child(unsigned i, unsigned child) { - EYTZINGER_BUG_ON(child > 1); + EBUG_ON(child > 1); return (i << 1) + child; } @@ -68,7 +58,7 @@ static inline unsigned eytzinger1_last(unsigned size) static inline unsigned eytzinger1_next(unsigned i, unsigned size) { - EYTZINGER_BUG_ON(i > size); + EBUG_ON(i > size); if (eytzinger1_right_child(i) <= size) { i = eytzinger1_right_child(i); @@ -84,7 +74,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) static inline unsigned eytzinger1_prev(unsigned i, unsigned size) { - EYTZINGER_BUG_ON(i > size); + EBUG_ON(i > size); if (eytzinger1_left_child(i) <= size) { i = eytzinger1_left_child(i) + 1; @@ -111,7 +101,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, unsigned shift = __fls(size) - b; int s; - EYTZINGER_BUG_ON(!i || i > size); + EBUG_ON(!i || i > size); i ^= 1U << b; i <<= 1; @@ -136,7 +126,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, unsigned shift; int s; - EYTZINGER_BUG_ON(!i || i > size); + EBUG_ON(!i || i > size); /* * sign bit trick: @@ -174,7 +164,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) static inline unsigned 
eytzinger0_child(unsigned i, unsigned child) { - EYTZINGER_BUG_ON(child > 1); + EBUG_ON(child > 1); return (i << 1) + 1 + child; } @@ -241,9 +231,11 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) (_i) != -1; \ (_i) = eytzinger0_next((_i), (_size))) +typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); + /* return greatest node <= @search, or -1 if not found */ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) + eytzinger_cmp_fn cmp, const void *search) { unsigned i, n = 0; @@ -252,7 +244,7 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, do { i = n; - n = eytzinger0_child(i, cmp(search, base + i * size) >= 0); + n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); } while (n < nr); if (n & 1) { @@ -277,13 +269,13 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, int _res; \ \ while (_i < _nr && \ - (_res = _cmp(_search, _base + _i * _size))) \ + (_res = _cmp(_search, _base + _i * _size, _size))) \ _i = eytzinger0_child(_i, _res > 0); \ _i; \ }) -void eytzinger0_sort_r(void *, size_t, size_t, - cmp_r_func_t, swap_r_func_t, const void *); -void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t); +void eytzinger0_sort(void *, size_t, size_t, + int (*cmp_func)(const void *, const void *, size_t), + void (*swap_func)(void *, void *, size_t)); -#endif /* _LINUX_EYTZINGER_H */ +#endif /* _EYTZINGER_H */ diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c index 27710cdd..39292e7e 100644 --- a/libbcachefs/fs-io-buffered.c +++ b/libbcachefs/fs-io-buffered.c @@ -810,7 +810,8 @@ static noinline void folios_trunc(folios *fs, struct folio **fi) static int __bch2_buffered_write(struct bch_inode_info *inode, struct address_space *mapping, struct iov_iter *iter, - loff_t pos, unsigned len) + loff_t pos, unsigned len, + bool inode_locked) { struct bch_fs *c = 
inode->v.i_sb->s_fs_info; struct bch2_folio_reservation res; @@ -835,6 +836,15 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, BUG_ON(!fs.nr); + /* + * If we're not using the inode lock, we need to lock all the folios for + * atomiticity of writes vs. other writes: + */ + if (!inode_locked && folio_end_pos(darray_last(fs)) < end) { + ret = -BCH_ERR_need_inode_lock; + goto out; + } + f = darray_first(fs); if (pos != folio_pos(f) && !folio_test_uptodate(f)) { ret = bch2_read_single_folio(f, mapping); @@ -929,8 +939,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, end = pos + copied; spin_lock(&inode->v.i_lock); - if (end > inode->v.i_size) + if (end > inode->v.i_size) { + BUG_ON(!inode_locked); i_size_write(&inode->v, end); + } spin_unlock(&inode->v.i_lock); f_pos = pos; @@ -974,12 +986,68 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct bch_inode_info *inode = file_bch_inode(file); - loff_t pos = iocb->ki_pos; - ssize_t written = 0; - int ret = 0; + loff_t pos; + bool inode_locked = false; + ssize_t written = 0, written2 = 0, ret = 0; + + /* + * We don't take the inode lock unless i_size will be changing. Folio + * locks provide exclusion with other writes, and the pagecache add lock + * provides exclusion with truncate and hole punching. + * + * There is one nasty corner case where atomicity would be broken + * without great care: when copying data from userspace to the page + * cache, we do that with faults disable - a page fault would recurse + * back into the filesystem, taking filesystem locks again, and + * deadlock; so it's done with faults disabled, and we fault in the user + * buffer when we aren't holding locks. 
+ * + * If we do part of the write, but we then race and pages in the userspace + * buffer have been evicted and are no longer resident, then we have to + * drop our folio locks to re-fault them in, breaking write atomicity. + * + * To fix this, we restart the write from the start, if we weren't + * holding the inode lock. + * + * There is another wrinkle after that; if we restart the write from the + * start, and then get an unrecoverable error, we _cannot_ claim to + * userspace that we did not write data we actually did - so we must + * track (written2) the most we ever wrote. + */ + + if ((iocb->ki_flags & IOCB_APPEND) || + (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) { + inode_lock(&inode->v); + inode_locked = true; + } + + ret = generic_write_checks(iocb, iter); + if (ret <= 0) + goto unlock; + + ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0); + if (ret) { + if (!inode_locked) { + inode_lock(&inode->v); + inode_locked = true; + ret = file_remove_privs_flags(file, 0); + } + if (ret) + goto unlock; + } + + ret = file_update_time(file); + if (ret) + goto unlock; + + pos = iocb->ki_pos; bch2_pagecache_add_get(inode); + if (!inode_locked && + (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) + goto get_inode_lock; + do { unsigned offset = pos & (PAGE_SIZE - 1); unsigned bytes = iov_iter_count(iter); @@ -1004,12 +1072,17 @@ again: } } + if (unlikely(bytes != iov_iter_count(iter) && !inode_locked)) + goto get_inode_lock; + if (unlikely(fatal_signal_pending(current))) { ret = -EINTR; break; } - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked); + if (ret == -BCH_ERR_need_inode_lock) + goto get_inode_lock; if (unlikely(ret < 0)) break; @@ -1030,50 +1103,46 @@ again: } pos += ret; written += ret; + written2 = max(written, written2); + + if (ret != bytes && !inode_locked) + goto get_inode_lock; ret = 0; 
balance_dirty_pages_ratelimited(mapping); + + if (0) { +get_inode_lock: + bch2_pagecache_add_put(inode); + inode_lock(&inode->v); + inode_locked = true; + bch2_pagecache_add_get(inode); + + iov_iter_revert(iter, written); + pos -= written; + written = 0; + ret = 0; + } } while (iov_iter_count(iter)); - bch2_pagecache_add_put(inode); - - return written ? written : ret; -} - -ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - ssize_t ret; - - if (iocb->ki_flags & IOCB_DIRECT) { - ret = bch2_direct_write(iocb, from); - goto out; - } - - inode_lock(&inode->v); - - ret = generic_write_checks(iocb, from); - if (ret <= 0) - goto unlock; - - ret = file_remove_privs(file); - if (ret) - goto unlock; - - ret = file_update_time(file); - if (ret) - goto unlock; - - ret = bch2_buffered_write(iocb, from); - if (likely(ret > 0)) - iocb->ki_pos += ret; unlock: - inode_unlock(&inode->v); + if (inode_locked) + inode_unlock(&inode->v); + iocb->ki_pos += written; + + ret = max(written, written2) ?: ret; if (ret > 0) ret = generic_write_sync(iocb, ret); -out: + return ret; +} + +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + ssize_t ret = iocb->ki_flags & IOCB_DIRECT + ? 
bch2_direct_write(iocb, iter) + : bch2_buffered_write(iocb, iter); + return bch2_err_class(ret); } diff --git a/libbcachefs/fs-io-pagecache.h b/libbcachefs/fs-io-pagecache.h index 8cbaba65..828c3d7c 100644 --- a/libbcachefs/fs-io-pagecache.h +++ b/libbcachefs/fs-io-pagecache.h @@ -51,13 +51,10 @@ enum bch_folio_sector_state { struct bch_folio_sector { /* Uncompressed, fully allocated replicas (or on disk reservation): */ - unsigned nr_replicas:4; - + u8 nr_replicas:4, /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ - unsigned replicas_reserved:4; - - /* i_sectors: */ - enum bch_folio_sector_state state:8; + replicas_reserved:4; + u8 state; }; struct bch_folio { diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 093f5404..3f073845 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -1870,8 +1870,10 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, opt_set(opts, read_only, (flags & SB_RDONLY) != 0); ret = bch2_parse_mount_opts(NULL, &opts, data); - if (ret) + if (ret) { + ret = bch2_err_class(ret); return ERR_PTR(ret); + } if (!dev_name || strlen(dev_name) == 0) return ERR_PTR(-EINVAL); diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 144f074b..f48033be 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -5,6 +5,7 @@ #include "btree_cache.h" #include "btree_update.h" #include "buckets.h" +#include "darray.h" #include "dirent.h" #include "error.h" #include "fs-common.h" @@ -17,7 +18,6 @@ #include "xattr.h" #include <linux/bsearch.h> -#include <linux/darray.h> #include <linux/dcache.h> /* struct qstr */ /* @@ -849,12 +849,9 @@ static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) { struct btree_iter iter; struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); - int ret = bkey_err(k); - if (ret) - return ret; - + int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set; bch2_trans_iter_exit(trans, &iter); - return k.k->type == KEY_TYPE_set; + return 
ret; } static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k, @@ -970,7 +967,7 @@ static int check_inode(struct btree_trans *trans, if (ret < 0) return ret; - fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list, + fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list, "inode %llu:%u unlinked, but not on deleted list", u.bi_inum, k.k->p.snapshot); ret = 0; diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index a3139bb6..2b5e0677 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -1181,6 +1181,15 @@ int bch2_delete_dead_inodes(struct bch_fs *c) bool need_another_pass; int ret; again: + /* + * if we ran check_inodes() unlinked inodes will have already been + * cleaned up but the write buffer will be out of sync; therefore we + * always need a write buffer flush + */ + ret = bch2_btree_write_buffer_flush_sync(trans); + if (ret) + goto err; + need_another_pass = false; /* @@ -1213,12 +1222,8 @@ again: ret; })); - if (!ret && need_another_pass) { - ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; + if (!ret && need_another_pass) goto again; - } err: bch2_trans_put(trans); return ret; diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c index dce136cd..8a556e6d 100644 --- a/libbcachefs/io_read.c +++ b/libbcachefs/io_read.c @@ -134,7 +134,7 @@ static void promote_done(struct bch_write_op *wop) container_of(wop, struct promote_op, write.op); struct bch_fs *c = op->write.op.c; - time_stats_update(&c->times[BCH_TIME_data_promote], + bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); promote_free(c, op); } @@ -174,7 +174,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) return ERR_PTR(-BCH_ERR_nopromote_no_writes); - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL); + op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL); if (!op) { ret = 
-BCH_ERR_nopromote_enomem; goto err; @@ -356,7 +356,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) static void bch2_rbio_done(struct bch_read_bio *rbio) { if (rbio->start_time) - time_stats_update(&rbio->c->times[BCH_TIME_data_read], + bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], rbio->start_time); bio_endio(&rbio->bio); } diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c index f7c4a428..f137252b 100644 --- a/libbcachefs/io_write.c +++ b/libbcachefs/io_write.c @@ -88,7 +88,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) bch2_congested_acct(ca, io_latency, now, rw); - __time_stats_update(&ca->io_latency[rw].stats, submit_time, now); + __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now); } #endif @@ -457,7 +457,7 @@ static void bch2_write_done(struct closure *cl) EBUG_ON(op->open_buckets.nr); - time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); + bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); bch2_disk_reservation_put(c, &op->res); if (!(op->flags & BCH_WRITE_MOVE)) diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 46dc25ad..f314b2e7 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -84,12 +84,8 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 prt_str(out, "separate_flush "); if (buf->need_flush_to_write_buffer) prt_str(out, "need_flush_to_write_buffer "); - if (buf->need_flush_to_write_buffer) - prt_str(out, "need_flush_to_write_buffer "); - if (buf->write_done) - prt_str(out, "write done "); if (buf->write_started) - prt_str(out, "write started "); + prt_str(out, "write_started "); if (buf->write_allocated) prt_str(out, "write allocated "); if (buf->write_done) @@ -715,7 +711,7 @@ recheck_need_open: return ret; seq = res.seq; - buf = j->buf + (seq & JOURNAL_BUF_MASK); + buf = journal_seq_to_buf(j, seq); buf->must_flush = true; if (!buf->flush_time) { @@ 
-733,8 +729,8 @@ recheck_need_open: } /* - * if write was kicked off without a flush, flush the next sequence - * number instead + * if write was kicked off without a flush, or if we promised it + * wouldn't be a flush, flush the next sequence number instead */ buf = journal_seq_to_buf(j, seq); if (buf->noflush) { @@ -768,7 +764,7 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); if (!ret) - time_stats_update(j->flush_seq_time, start_time); + bch2_time_stats_update(j->flush_seq_time, start_time); return ret ?: ret2 < 0 ? ret2 : 0; } @@ -812,8 +808,8 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) unwritten_seq++) { struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); - /* journal write is already in flight, and was a flush write: */ - if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush) + /* journal flush already in flight, or flush requested */ + if (buf->must_flush) goto out; buf->noflush = true; @@ -1203,7 +1199,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) genradix_for_each_reverse(&c->journal_entries, iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; last_seq = le64_to_cpu(i->j.last_seq); @@ -1236,7 +1232,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; seq = le64_to_cpu(i->j.seq); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index b37b75cc..d76c3c0c 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -86,9 +86,12 @@ static void __journal_replay_free(struct bch_fs *c, kvfree(i); } -static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) +static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted) { - i->ignore = true; + if (blacklisted) + 
i->ignore_blacklisted = true; + else + i->ignore_not_dirty = true; if (!c->opts.read_entire_journal) __journal_replay_free(c, i); @@ -138,12 +141,13 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, journal_entry_radix_idx(c, jlist->last_seq)) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; if (le64_to_cpu(i->j.seq) >= last_seq) break; - journal_replay_free(c, i); + + journal_replay_free(c, i, false); } } @@ -199,8 +203,9 @@ replace: return -BCH_ERR_ENOMEM_journal_entry_add; darray_init(&i->ptrs); - i->csum_good = entry_ptr.csum_good; - i->ignore = false; + i->csum_good = entry_ptr.csum_good; + i->ignore_blacklisted = false; + i->ignore_not_dirty = false; unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); if (dup) { @@ -1255,20 +1260,20 @@ int bch2_journal_read(struct bch_fs *c, i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; if (!*start_seq) *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; if (JSET_NO_FLUSH(&i->j)) { - i->ignore = true; + i->ignore_blacklisted = true; continue; } if (!last_write_torn && !i->csum_good) { last_write_torn = true; - i->ignore = true; + i->ignore_blacklisted = true; continue; } @@ -1307,12 +1312,12 @@ int bch2_journal_read(struct bch_fs *c, genradix_for_each(&c->journal_entries, radix_iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; seq = le64_to_cpu(i->j.seq); if (seq < *last_seq) { - journal_replay_free(c, i); + journal_replay_free(c, i, false); continue; } @@ -1320,7 +1325,7 @@ int bch2_journal_read(struct bch_fs *c, fsck_err_on(!JSET_NO_FLUSH(&i->j), c, jset_seq_blacklisted, "found blacklisted journal entry %llu", seq); - i->ignore = true; + i->ignore_blacklisted = true; } } @@ -1329,7 +1334,7 @@ int bch2_journal_read(struct bch_fs *c, genradix_for_each(&c->journal_entries, radix_iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; BUG_ON(seq > 
le64_to_cpu(i->j.seq)); @@ -1382,7 +1387,7 @@ int bch2_journal_read(struct bch_fs *c, }; i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; darray_for_each(i->ptrs, ptr) { @@ -1602,9 +1607,9 @@ static CLOSURE_CALLBACK(journal_write_done) u64 v, seq = le64_to_cpu(w->data->seq); int err = 0; - time_stats_update(!JSET_NO_FLUSH(w->data) - ? j->flush_write_time - : j->noflush_write_time, j->write_start_time); + bch2_time_stats_update(!JSET_NO_FLUSH(w->data) + ? j->flush_write_time + : j->noflush_write_time, j->write_start_time); if (!w->devs_written.nr) { bch_err(c, "unable to write journal to sufficient devices"); @@ -1667,6 +1672,7 @@ static CLOSURE_CALLBACK(journal_write_done) new.unwritten_idx++; } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); + closure_wake_up(&w->wait); completed = true; } @@ -1676,7 +1682,6 @@ static CLOSURE_CALLBACK(journal_write_done) track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); - closure_wake_up(&w->wait); journal_wake(j); } @@ -1930,6 +1935,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * j->nr_noflush_writes++; } else { + w->must_flush = true; j->last_flush_write = jiffies; j->nr_flush_writes++; clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h index 59790456..4f1e763a 100644 --- a/libbcachefs/journal_io.h +++ b/libbcachefs/journal_io.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_JOURNAL_IO_H #define _BCACHEFS_JOURNAL_IO_H -#include <linux/darray_types.h> +#include "darray.h" struct journal_ptr { bool csum_good; @@ -20,11 +20,17 @@ struct journal_replay { DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs; bool csum_good; - bool ignore; + bool ignore_blacklisted; + bool ignore_not_dirty; /* must be last: */ struct jset j; }; +static inline bool journal_replay_ignore(struct journal_replay *i) +{ + return !i || i->ignore_blacklisted || i->ignore_not_dirty; +} + static 
inline struct jset_entry *__jset_entry_type_next(struct jset *jset, struct jset_entry *entry, unsigned type) { diff --git a/libbcachefs/journal_sb.c b/libbcachefs/journal_sb.c index 156691c2..ae4fb8c3 100644 --- a/libbcachefs/journal_sb.c +++ b/libbcachefs/journal_sb.c @@ -2,8 +2,8 @@ #include "bcachefs.h" #include "journal_sb.h" +#include "darray.h" -#include <linux/darray.h> #include <linux/sort.h> /* BCH_SB_FIELD_journal: */ diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c index 024c9b1b..b5303874 100644 --- a/libbcachefs/journal_seq_blacklist.c +++ b/libbcachefs/journal_seq_blacklist.c @@ -2,11 +2,10 @@ #include "bcachefs.h" #include "btree_iter.h" +#include "eytzinger.h" #include "journal_seq_blacklist.h" #include "super-io.h" -#include <linux/eytzinger.h> - /* * journal_seq_blacklist machinery: * @@ -44,61 +43,36 @@ static unsigned sb_blacklist_u64s(unsigned nr) return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); } -static struct bch_sb_field_journal_seq_blacklist * -blacklist_entry_try_merge(struct bch_fs *c, - struct bch_sb_field_journal_seq_blacklist *bl, - unsigned i) -{ - unsigned nr = blacklist_nr_entries(bl); - - if (le64_to_cpu(bl->start[i].end) >= - le64_to_cpu(bl->start[i + 1].start)) { - bl->start[i].end = bl->start[i + 1].end; - --nr; - memmove(&bl->start[i], - &bl->start[i + 1], - sizeof(bl->start[0]) * (nr - i)); - - bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, - sb_blacklist_u64s(nr)); - BUG_ON(!bl); - } - - return bl; -} - -static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e, - u64 start, u64 end) -{ - return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start); -} - int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) { struct bch_sb_field_journal_seq_blacklist *bl; - unsigned i, nr; + unsigned i = 0, nr; int ret = 0; mutex_lock(&c->sb_lock); bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); nr = 
blacklist_nr_entries(bl); - for (i = 0; i < nr; i++) { + while (i < nr) { struct journal_seq_blacklist_entry *e = bl->start + i; - if (bl_entry_contig_or_overlaps(e, start, end)) { - e->start = cpu_to_le64(min(start, le64_to_cpu(e->start))); - e->end = cpu_to_le64(max(end, le64_to_cpu(e->end))); + if (end < le64_to_cpu(e->start)) + break; - if (i + 1 < nr) - bl = blacklist_entry_try_merge(c, - bl, i); - if (i) - bl = blacklist_entry_try_merge(c, - bl, i - 1); - goto out_write_sb; + if (start > le64_to_cpu(e->end)) { + i++; + continue; } + + /* + * Entry is contiguous or overlapping with new entry: merge it + * with new entry, and delete: + */ + + start = min(start, le64_to_cpu(e->start)); + end = max(end, le64_to_cpu(e->end)); + array_remove_item(bl->start, nr, i); } bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, @@ -108,9 +82,10 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) goto out; } - bl->start[nr].start = cpu_to_le64(start); - bl->start[nr].end = cpu_to_le64(end); -out_write_sb: + array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) { + .start = cpu_to_le64(start), + .end = cpu_to_le64(end), + })); c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); ret = bch2_write_super(c); @@ -120,7 +95,8 @@ out: return ret ?: bch2_blacklist_table_initialize(c); } -static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r) +static int journal_seq_blacklist_table_cmp(const void *_l, + const void *_r, size_t size) { const struct journal_seq_blacklist_table_entry *l = _l; const struct journal_seq_blacklist_table_entry *r = _r; @@ -165,8 +141,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) if (!bl) return 0; - t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, - GFP_KERNEL); + t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL); if (!t) return -BCH_ERR_ENOMEM_blacklist_table_init; diff --git a/libbcachefs/journal_types.h 
b/libbcachefs/journal_types.h index 011f7a0d..8c053cb6 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -287,9 +287,9 @@ struct journal { u64 nr_noflush_writes; u64 entry_bytes_written; - struct time_stats *flush_write_time; - struct time_stats *noflush_write_time; - struct time_stats *flush_seq_time; + struct bch2_time_stats *flush_write_time; + struct bch2_time_stats *noflush_write_time; + struct bch2_time_stats *flush_seq_time; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map res_map; diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c index ed7577cd..26569043 100644 --- a/libbcachefs/lru.c +++ b/libbcachefs/lru.c @@ -125,8 +125,7 @@ static int bch2_check_lru_key(struct btree_trans *trans, goto out; } - if (c->opts.reconstruct_alloc || - fsck_err(c, lru_entry_bad, + if (fsck_err(c, lru_entry_bad, "incorrect lru entry: lru %s time %llu\n" " %s\n" " for %s", diff --git a/linux/mean_and_variance.c b/libbcachefs/mean_and_variance.c similarity index 99% rename from linux/mean_and_variance.c rename to libbcachefs/mean_and_variance.c index 21ec6afc..0ea9f308 100644 --- a/linux/mean_and_variance.c +++ b/libbcachefs/mean_and_variance.c @@ -40,9 +40,10 @@ #include <linux/limits.h> #include <linux/math.h> #include <linux/math64.h> -#include <linux/mean_and_variance.h> #include <linux/module.h> +#include "mean_and_variance.h" + u128_u u128_div(u128_u n, u64 d) { u128_u r; diff --git a/include/linux/mean_and_variance.h b/libbcachefs/mean_and_variance.h similarity index 100% rename from include/linux/mean_and_variance.h rename to libbcachefs/mean_and_variance.h diff --git a/libbcachefs/nocow_locking.c b/libbcachefs/nocow_locking.c index 181efa4a..3c21981a 100644 --- a/libbcachefs/nocow_locking.c +++ b/libbcachefs/nocow_locking.c @@ -85,7 +85,7 @@ void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, u64 start_time = local_clock(); __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags)); - 
time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); + bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); } } diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index b1ed0b9a..08ea0cfc 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -314,7 +314,7 @@ int bch2_opt_parse(struct bch_fs *c, if (ret < 0 || (*res != 0 && *res != 1)) { if (err) prt_printf(err, "%s: must be bool", opt->attr.name); - return ret; + return ret < 0 ? ret : -BCH_ERR_option_not_bool; } break; case BCH_OPT_UINT: @@ -456,7 +456,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, copied_opts = kstrdup(options, GFP_KERNEL); if (!copied_opts) - return -1; + return -ENOMEM; copied_opts_start = copied_opts; while ((opt = strsep(&copied_opts, ",")) != NULL) { @@ -501,11 +501,11 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, bad_opt: pr_err("Bad mount option %s", name); - ret = -1; + ret = -BCH_ERR_option_name; goto out; bad_val: pr_err("Invalid mount option %s", err.buf); - ret = -1; + ret = -BCH_ERR_option_value; goto out; out: kfree(copied_opts_start); diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index f8c2341e..136083c1 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -290,6 +290,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Allow mounting in when data will be missing") \ + x(no_splitbrain_check, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Don't kick drives out when splitbrain detected")\ x(discard, u8, \ OPT_FS|OPT_MOUNT|OPT_DEVICE, \ OPT_BOOL(), \ diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 96e7a1ec..2af219ae 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -52,14 +52,47 @@ static bool btree_id_is_alloc(enum btree_id id) } /* for -o reconstruct_alloc: */ -static void drop_alloc_keys(struct journal_keys *keys) +static void do_reconstruct_alloc(struct bch_fs *c) { + 
bch2_journal_log_msg(c, "dropping alloc info"); + bch_info(c, "dropping and reconstructing all alloc info"); + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required); + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required); + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required); + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required); + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required); + + __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + + struct journal_keys 
*keys = &c->journal_keys; size_t src, dst; for (src = 0, dst = 0; src < keys->nr; src++) if (!btree_id_is_alloc(keys->data[src].btree_id)) keys->data[dst++] = keys->data[src]; - keys->nr = dst; } @@ -122,6 +155,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans, if (ret) goto out; + struct btree_path *path = btree_iter_path(trans, &iter); + if (unlikely(!btree_path_node(path, k->level))) { + bch2_trans_iter_exit(trans, &iter); + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, 0, iter_flags); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_btree_increase_depth(trans, iter.path, 0) ?: + -BCH_ERR_transaction_restart_nested; + goto out; + } + /* Must be checked with btree locked: */ if (k->overwritten) goto out; @@ -355,7 +399,7 @@ static int journal_replay_early(struct bch_fs *c, genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; vstruct_for_each(&i->j, entry) { @@ -384,11 +428,8 @@ static int read_btree_roots(struct bch_fs *c) if (!r->alive) continue; - if (btree_id_is_alloc(i) && - c->opts.reconstruct_alloc) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) continue; - } if (r->error) { __fsck_err(c, @@ -857,7 +898,7 @@ int bch2_fs_recovery(struct bch_fs *c) goto out; genradix_for_each_reverse(&c->journal_entries, iter, i) - if (*i && !(*i)->ignore) { + if (!journal_replay_ignore(*i)) { last_journal_entry = &(*i)->j; break; } @@ -882,7 +923,8 @@ int bch2_fs_recovery(struct bch_fs *c) genradix_for_each_reverse(&c->journal_entries, iter, i) if (*i) { last_journal_entry = &(*i)->j; - (*i)->ignore = false; + (*i)->ignore_blacklisted = false; + (*i)->ignore_not_dirty= false; /* * This was probably a NO_FLUSH entry, * so last_seq was garbage - but we know @@ -918,10 +960,8 @@ use_clean: c->journal_replay_seq_start = last_seq; c->journal_replay_seq_end = blacklist_seq - 1; - if 
(c->opts.reconstruct_alloc) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - drop_alloc_keys(&c->journal_keys); - } + if (c->opts.reconstruct_alloc) + do_reconstruct_alloc(c); zero_out_btree_mem_ptr(&c->journal_keys); @@ -945,7 +985,7 @@ use_clean: bch2_journal_seq_blacklist_add(c, blacklist_seq, journal_seq); if (ret) { - bch_err(c, "error creating new journal seq blacklist entry"); + bch_err_msg(c, ret, "error creating new journal seq blacklist entry"); goto err; } } @@ -956,9 +996,6 @@ use_clean: if (ret) goto err; - if (c->opts.reconstruct_alloc) - bch2_journal_log_msg(c, "dropping alloc info"); - /* * Skip past versions that might have possibly been used (as nonces), * but hadn't had their pointers written: diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index 678b9c20..cc2672c1 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -6,15 +6,12 @@ #include "replicas.h" #include "super-io.h" -#include <linux/sort.h> - static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, struct bch_replicas_cpu *); /* Some (buggy!) 
compilers don't allow memcmp to be passed as a pointer */ -static int bch2_memcmp(const void *l, const void *r, const void *priv) +static int bch2_memcmp(const void *l, const void *r, size_t size) { - size_t size = (size_t) priv; return memcmp(l, r, size); } @@ -42,8 +39,7 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) { - eytzinger0_sort_r(r->entries, r->nr, r->entry_size, - bch2_memcmp, NULL, (void *)(size_t)r->entry_size); + eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL); } static void bch2_replicas_entry_v0_to_text(struct printbuf *out, @@ -232,7 +228,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, verify_replicas_entry(search); -#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size) +#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) idx = eytzinger0_find(r->entries, r->nr, r->entry_size, entry_cmp, search); #undef entry_cmp @@ -828,11 +824,10 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, { unsigned i; - sort_r(cpu_r->entries, - cpu_r->nr, - cpu_r->entry_size, - bch2_memcmp, NULL, - (void *)(size_t)cpu_r->entry_size); + sort_cmp_size(cpu_r->entries, + cpu_r->nr, + cpu_r->entry_size, + bch2_memcmp, NULL); for (i = 0; i < cpu_r->nr; i++) { struct bch_replicas_entry_v1 *e = diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h index 983cce78..654a4b26 100644 --- a/libbcachefs/replicas.h +++ b/libbcachefs/replicas.h @@ -3,10 +3,9 @@ #define _BCACHEFS_REPLICAS_H #include "bkey.h" +#include "eytzinger.h" #include "replicas_types.h" -#include <linux/eytzinger.h> - void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *); void bch2_replicas_entry_to_text(struct printbuf *, struct bch_replicas_entry_v1 *); diff --git a/libbcachefs/sb-downgrade.c b/libbcachefs/sb-downgrade.c index 3337419f..e4396cb0 100644 --- a/libbcachefs/sb-downgrade.c +++ b/libbcachefs/sb-downgrade.c @@ -6,13 +6,12 @@ */ #include 
"bcachefs.h" +#include "darray.h" #include "recovery.h" #include "sb-downgrade.h" #include "sb-errors.h" #include "super-io.h" -#include <linux/darray.h> - #define RECOVERY_PASS_ALL_FSCK BIT_ULL(63) /* @@ -260,7 +259,7 @@ void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_mi if (e < BCH_SB_ERR_MAX) __set_bit(e, c->sb.errors_silent); if (e < sizeof(ext->errors_silent) * 8) - ext->errors_silent[e / 64] |= cpu_to_le64(BIT_ULL(e % 64)); + __set_bit_le64(e, ext->errors_silent); } } } diff --git a/libbcachefs/sb-errors_types.h b/libbcachefs/sb-errors_types.h index 0df4b0e7..5178bf57 100644 --- a/libbcachefs/sb-errors_types.h +++ b/libbcachefs/sb-errors_types.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_SB_ERRORS_TYPES_H #define _BCACHEFS_SB_ERRORS_TYPES_H -#include <linux/darray_types.h> +#include "darray.h" #define BCH_SB_ERRS() \ x(clean_but_journal_not_empty, 0) \ @@ -264,7 +264,8 @@ x(subvol_children_not_set, 256) \ x(subvol_children_bad, 257) \ x(subvol_loop, 258) \ - x(subvol_unreachable, 259) + x(subvol_unreachable, 259) \ + x(btree_node_bkey_bad_u64s, 260) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/libbcachefs/sb-members.h b/libbcachefs/sb-members.h index e4d4d842..be0a9418 100644 --- a/libbcachefs/sb-members.h +++ b/libbcachefs/sb-members.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_SB_MEMBERS_H #define _BCACHEFS_SB_MEMBERS_H -#include <linux/darray.h> +#include "darray.h" extern char * const bch2_member_error_strs[]; diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h index 4045a180..903c0516 100644 --- a/libbcachefs/subvolume.h +++ b/libbcachefs/subvolume.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_SUBVOLUME_H #define _BCACHEFS_SUBVOLUME_H +#include "darray.h" #include "subvolume_types.h" enum bkey_invalid_flags; diff --git a/libbcachefs/subvolume_types.h b/libbcachefs/subvolume_types.h index 40f16e3a..ae644adf 100644 --- a/libbcachefs/subvolume_types.h +++ b/libbcachefs/subvolume_types.h @@ -2,7 +2,7 @@ #ifndef 
_BCACHEFS_SUBVOLUME_TYPES_H #define _BCACHEFS_SUBVOLUME_TYPES_H -#include <linux/darray_types.h> +#include "darray.h" typedef DARRAY(u32) snapshot_id_list; diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 38a50732..010daebf 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -470,6 +470,14 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, return ret; } + if (rw == WRITE && + bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) { + prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu", + le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq), + le64_to_cpu(sb->seq)); + return -BCH_ERR_invalid_sb_members_missing; + } + return 0; } diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index f3762091..95e80e06 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -3,12 +3,12 @@ #define _BCACHEFS_SUPER_IO_H #include "extents.h" +#include "eytzinger.h" #include "super_types.h" #include "super.h" #include "sb-members.h" #include <asm/byteorder.h> -#include <linux/eytzinger.h> static inline bool bch2_version_compatible(u16 version) { diff --git a/libbcachefs/super.c b/libbcachefs/super.c index a7f9de22..1cabdd47 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -56,6 +56,7 @@ #include "super.h" #include "super-io.h" #include "sysfs.h" +#include "thread_with_file.h" #include "trace.h" #include <linux/backing-dev.h> @@ -67,7 +68,6 @@ #include <linux/percpu.h> #include <linux/random.h> #include <linux/sysfs.h> -#include <linux/thread_with_file.h> #include <crypto/hash.h> MODULE_LICENSE("GPL"); @@ -87,20 +87,27 @@ const char * const bch2_fs_flag_strs[] = { NULL }; +static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args) +{ +#ifdef __KERNEL__ + if (unlikely(stdio)) { + if (fmt[0] == KERN_SOH[0]) + fmt += 2; + + bch2_stdio_redirect_vprintf(stdio, true, fmt, args); + return; + } +#endif + vprintk(fmt, args); +} + void 
bch2_print_opts(struct bch_opts *opts, const char *fmt, ...) { struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio; va_list args; va_start(args, fmt); - if (likely(!stdio)) { - vprintk(fmt, args); - } else { - if (fmt[0] == KERN_SOH[0]) - fmt += 2; - - stdio_redirect_vprintf(stdio, true, fmt, args); - } + bch2_print_maybe_redirect(stdio, fmt, args); va_end(args); } @@ -110,14 +117,7 @@ void __bch2_print(struct bch_fs *c, const char *fmt, ...) va_list args; va_start(args, fmt); - if (likely(!stdio)) { - vprintk(fmt, args); - } else { - if (fmt[0] == KERN_SOH[0]) - fmt += 2; - - stdio_redirect_vprintf(stdio, true, fmt, args); - } + bch2_print_maybe_redirect(stdio, fmt, args); va_end(args); } @@ -532,7 +532,7 @@ static void __bch2_fs_free(struct bch_fs *c) unsigned i; for (i = 0; i < BCH_TIME_STAT_NR; i++) - time_stats_exit(&c->times[i]); + bch2_time_stats_exit(&c->times[i]); bch2_free_pending_node_rewrites(c); bch2_fs_sb_errors_exit(c); @@ -765,7 +765,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->journal_keys.initial_ref_held = true; for (i = 0; i < BCH_TIME_STAT_NR; i++) - time_stats_init(&c->times[i]); + bch2_time_stats_init(&c->times[i]); bch2_fs_copygc_init(c); bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); @@ -830,13 +830,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; pr_uuid(&name, c->sb.user_uuid.b); - strscpy(c->name, name.buf, sizeof(c->name)); - printbuf_exit(&name); - ret = name.allocation_failure ? 
-BCH_ERR_ENOMEM_fs_name_alloc : 0; if (ret) goto err; + strscpy(c->name, name.buf, sizeof(c->name)); + printbuf_exit(&name); + /* Compat: */ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) @@ -1073,7 +1073,8 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) } static int bch2_dev_in_fs(struct bch_sb_handle *fs, - struct bch_sb_handle *sb) + struct bch_sb_handle *sb, + struct bch_opts *opts) { if (fs == sb) return 0; @@ -1114,11 +1115,14 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));; prt_newline(&buf); - prt_printf(&buf, "Not using older sb"); + if (!opts->no_splitbrain_check) + prt_printf(&buf, "Not using older sb"); pr_err("%s", buf.buf); printbuf_exit(&buf); - return -BCH_ERR_device_splitbrain; + + if (!opts->no_splitbrain_check) + return -BCH_ERR_device_splitbrain; } struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx); @@ -1141,12 +1145,17 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, prt_printf(&buf, " to be %llu, but ", seq_from_fs); prt_bdevname(&buf, sb->bdev); prt_printf(&buf, " has %llu\n", seq_from_member); - prt_str(&buf, "Not using "); - prt_bdevname(&buf, sb->bdev); + + if (!opts->no_splitbrain_check) { + prt_str(&buf, "Not using "); + prt_bdevname(&buf, sb->bdev); + } pr_err("%s", buf.buf); printbuf_exit(&buf); - return -BCH_ERR_device_splitbrain; + + if (!opts->no_splitbrain_check) + return -BCH_ERR_device_splitbrain; } return 0; @@ -1180,8 +1189,8 @@ static void bch2_dev_free(struct bch_dev *ca) bch2_dev_buckets_free(ca); free_page((unsigned long) ca->sb_read_scratch); - time_stats_quantiles_exit(&ca->io_latency[WRITE]); - time_stats_quantiles_exit(&ca->io_latency[READ]); + bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); + bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); percpu_ref_exit(&ca->io_ref); percpu_ref_exit(&ca->ref); @@ -1272,8 +1281,8 @@ static struct bch_dev 
*__bch2_dev_alloc(struct bch_fs *c, INIT_WORK(&ca->io_error_work, bch2_io_error_work); - time_stats_quantiles_init(&ca->io_latency[READ]); - time_stats_quantiles_init(&ca->io_latency[WRITE]); + bch2_time_stats_quantiles_init(&ca->io_latency[READ]); + bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]); ca->mi = bch2_mi_to_cpu(member); @@ -1847,7 +1856,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) dev_idx = sb.sb->dev_idx; - ret = bch2_dev_in_fs(&c->disk_sb, &sb); + ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts); bch_err_msg(c, ret, "bringing %s online", path); if (ret) goto err; @@ -2035,7 +2044,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, best = sb; darray_for_each_reverse(sbs, sb) { - ret = bch2_dev_in_fs(best, sb); + ret = bch2_dev_in_fs(best, sb, &opts); if (ret == -BCH_ERR_device_has_been_removed || ret == -BCH_ERR_device_splitbrain) { diff --git a/libbcachefs/thread_with_file.c b/libbcachefs/thread_with_file.c new file mode 100644 index 00000000..940db15d --- /dev/null +++ b/libbcachefs/thread_with_file.c @@ -0,0 +1,450 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "thread_with_file.h" + +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/kthread.h> +#include <linux/pagemap.h> +#include <linux/poll.h> +#include <linux/sched/sysctl.h> + +void bch2_thread_with_file_exit(struct thread_with_file *thr) +{ + if (thr->task) { + kthread_stop(thr->task); + put_task_struct(thr->task); + } +} + +int bch2_run_thread_with_file(struct thread_with_file *thr, + const struct file_operations *fops, + int (*fn)(void *)) +{ + struct file *file = NULL; + int ret, fd = -1; + unsigned fd_flags = O_CLOEXEC; + + if (fops->read && fops->write) + fd_flags |= O_RDWR; + else if (fops->read) + fd_flags |= O_RDONLY; + else if (fops->write) + fd_flags |= O_WRONLY; + + char name[TASK_COMM_LEN]; + get_task_comm(name, current); + + thr->ret = 0; + thr->task = 
kthread_create(fn, thr, "%s", name); + ret = PTR_ERR_OR_ZERO(thr->task); + if (ret) + return ret; + + ret = get_unused_fd_flags(fd_flags); + if (ret < 0) + goto err; + fd = ret; + + file = anon_inode_getfile(name, fops, thr, fd_flags); + ret = PTR_ERR_OR_ZERO(file); + if (ret) + goto err; + + get_task_struct(thr->task); + wake_up_process(thr->task); + fd_install(fd, file); + return fd; +err: + if (fd >= 0) + put_unused_fd(fd); + if (thr->task) + kthread_stop(thr->task); + return ret; +} + +/* stdio_redirect */ + +static bool stdio_redirect_has_input(struct stdio_redirect *stdio) +{ + return stdio->input.buf.nr || stdio->done; +} + +static bool stdio_redirect_has_output(struct stdio_redirect *stdio) +{ + return stdio->output.buf.nr || stdio->done; +} + +#define STDIO_REDIRECT_BUFSIZE 4096 + +static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio) +{ + return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; +} + +static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio) +{ + return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; +} + +static void stdio_buf_init(struct stdio_buf *buf) +{ + spin_lock_init(&buf->lock); + init_waitqueue_head(&buf->wait); + darray_init(&buf->buf); +} + +/* thread_with_stdio */ + +static void thread_with_stdio_done(struct thread_with_stdio *thr) +{ + thr->thr.done = true; + thr->stdio.done = true; + wake_up(&thr->stdio.input.wait); + wake_up(&thr->stdio.output.wait); +} + +static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf, + size_t len, loff_t *ppos) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + struct stdio_buf *buf = &thr->stdio.output; + size_t copied = 0, b; + int ret = 0; + + if (!(file->f_flags & O_NONBLOCK)) { + ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio)); + if (ret) + return ret; + } else if (!stdio_redirect_has_output(&thr->stdio)) + return -EAGAIN; + 
+ while (len && buf->buf.nr) { + if (fault_in_writeable(ubuf, len) == len) { + ret = -EFAULT; + break; + } + + spin_lock_irq(&buf->lock); + b = min_t(size_t, len, buf->buf.nr); + + if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) { + ubuf += b; + len -= b; + copied += b; + buf->buf.nr -= b; + memmove(buf->buf.data, + buf->buf.data + b, + buf->buf.nr); + } + spin_unlock_irq(&buf->lock); + } + + return copied ?: ret; +} + +static int thread_with_stdio_release(struct inode *inode, struct file *file) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + thread_with_stdio_done(thr); + bch2_thread_with_file_exit(&thr->thr); + darray_exit(&thr->stdio.input.buf); + darray_exit(&thr->stdio.output.buf); + thr->ops->exit(thr); + return 0; +} + +static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf, + size_t len, loff_t *ppos) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + struct stdio_buf *buf = &thr->stdio.input; + size_t copied = 0; + ssize_t ret = 0; + + while (len) { + if (thr->thr.done) { + ret = -EPIPE; + break; + } + + size_t b = len - fault_in_readable(ubuf, len); + if (!b) { + ret = -EFAULT; + break; + } + + spin_lock(&buf->lock); + if (buf->buf.nr < STDIO_REDIRECT_BUFSIZE) + darray_make_room_gfp(&buf->buf, + min(b, STDIO_REDIRECT_BUFSIZE - buf->buf.nr), GFP_NOWAIT); + b = min(len, darray_room(buf->buf)); + + if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) { + buf->buf.nr += b; + ubuf += b; + len -= b; + copied += b; + } + spin_unlock(&buf->lock); + + if (b) { + wake_up(&buf->wait); + } else { + if ((file->f_flags & O_NONBLOCK)) { + ret = -EAGAIN; + break; + } + + ret = wait_event_interruptible(buf->wait, + stdio_redirect_has_input_space(&thr->stdio)); + if (ret) + break; + } + } + + return copied ?: ret; +} + +static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait) +{ + 
struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + poll_wait(file, &thr->stdio.output.wait, wait); + poll_wait(file, &thr->stdio.input.wait, wait); + + __poll_t mask = 0; + + if (stdio_redirect_has_output(&thr->stdio)) + mask |= EPOLLIN; + if (stdio_redirect_has_input_space(&thr->stdio)) + mask |= EPOLLOUT; + if (thr->thr.done) + mask |= EPOLLHUP|EPOLLERR; + return mask; +} + +static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + poll_wait(file, &thr->stdio.output.wait, wait); + + __poll_t mask = 0; + + if (stdio_redirect_has_output(&thr->stdio)) + mask |= EPOLLIN; + if (thr->thr.done) + mask |= EPOLLHUP|EPOLLERR; + return mask; +} + +static int thread_with_stdio_flush(struct file *file, fl_owner_t id) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + return thr->thr.ret; +} + +static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + if (thr->ops->unlocked_ioctl) + return thr->ops->unlocked_ioctl(thr, cmd, p); + return -ENOTTY; +} + +static const struct file_operations thread_with_stdio_fops = { + .llseek = no_llseek, + .read = thread_with_stdio_read, + .write = thread_with_stdio_write, + .poll = thread_with_stdio_poll, + .flush = thread_with_stdio_flush, + .release = thread_with_stdio_release, + .unlocked_ioctl = thread_with_stdio_ioctl, +}; + +static const struct file_operations thread_with_stdout_fops = { + .llseek = no_llseek, + .read = thread_with_stdio_read, + .poll = thread_with_stdout_poll, + .flush = thread_with_stdio_flush, + .release = thread_with_stdio_release, + .unlocked_ioctl = thread_with_stdio_ioctl, +}; + +static int thread_with_stdio_fn(void *arg) +{ + 
struct thread_with_stdio *thr = arg; + + thr->thr.ret = thr->ops->fn(thr); + + thread_with_stdio_done(thr); + return 0; +} + +int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, + const struct thread_with_stdio_ops *ops) +{ + stdio_buf_init(&thr->stdio.input); + stdio_buf_init(&thr->stdio.output); + thr->ops = ops; + + return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn); +} + +int bch2_run_thread_with_stdout(struct thread_with_stdio *thr, + const struct thread_with_stdio_ops *ops) +{ + stdio_buf_init(&thr->stdio.input); + stdio_buf_init(&thr->stdio.output); + thr->ops = ops; + + return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn); +} +EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout); + +int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len) +{ + struct stdio_buf *buf = &stdio->input; + + /* + * we're waiting on user input (or for the file descriptor to be + * closed), don't want a hung task warning: + */ + do { + wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio), + sysctl_hung_task_timeout_secs * HZ / 2); + } while (!stdio_redirect_has_input(stdio)); + + if (stdio->done) + return -1; + + spin_lock(&buf->lock); + int ret = min(len, buf->buf.nr); + buf->buf.nr -= ret; + memcpy(ubuf, buf->buf.data, ret); + memmove(buf->buf.data, + buf->buf.data + ret, + buf->buf.nr); + spin_unlock(&buf->lock); + + wake_up(&buf->wait); + return ret; +} + +int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len) +{ + struct stdio_buf *buf = &stdio->input; + size_t copied = 0; + ssize_t ret = 0; +again: + do { + wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio), + sysctl_hung_task_timeout_secs * HZ / 2); + } while (!stdio_redirect_has_input(stdio)); + + if (stdio->done) { + ret = -1; + goto out; + } + + spin_lock(&buf->lock); + size_t b = min(len, buf->buf.nr); + char *n = memchr(buf->buf.data, '\n', b); + if (n) + b = 
min_t(size_t, b, n + 1 - buf->buf.data); + buf->buf.nr -= b; + memcpy(ubuf, buf->buf.data, b); + memmove(buf->buf.data, + buf->buf.data + b, + buf->buf.nr); + ubuf += b; + len -= b; + copied += b; + spin_unlock(&buf->lock); + + wake_up(&buf->wait); + + if (!n && len) + goto again; +out: + return copied ?: ret; +} + +__printf(3, 0) +static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args) +{ + ssize_t ret; + + do { + va_list args2; + size_t len; + + va_copy(args2, args); + len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2); + va_end(args2); + + if (len + 1 <= darray_room(*out)) { + out->nr += len; + return len; + } + + ret = darray_make_room_gfp(out, len + 1, gfp); + } while (ret == 0); + + return ret; +} + +ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking, + const char *fmt, va_list args) +{ + struct stdio_buf *buf = &stdio->output; + unsigned long flags; + ssize_t ret; + +again: + spin_lock_irqsave(&buf->lock, flags); + ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args); + spin_unlock_irqrestore(&buf->lock, flags); + + if (ret < 0) { + if (nonblocking) + return -EAGAIN; + + ret = wait_event_interruptible(buf->wait, + stdio_redirect_has_output_space(stdio)); + if (ret) + return ret; + goto again; + } + + wake_up(&buf->wait); + return ret; +} + +ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking, + const char *fmt, ...) 
+{ + va_list args; + ssize_t ret; + + va_start(args, fmt); + ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args); + va_end(args); + + return ret; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/libbcachefs/thread_with_file.h b/libbcachefs/thread_with_file.h new file mode 100644 index 00000000..af54ea8f --- /dev/null +++ b/libbcachefs/thread_with_file.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_THREAD_WITH_FILE_H +#define _BCACHEFS_THREAD_WITH_FILE_H + +#include "thread_with_file_types.h" + +/* + * Thread with file: Run a kthread and connect it to a file descriptor, so that + * it can be interacted with via fd read/write methods and closing the file + * descriptor stops the kthread. + * + * We have two different APIs: + * + * thread_with_file, the low level version. + * You get to define the full file_operations, including your release function, + * which means that you must call bch2_thread_with_file_exit() from your + * .release method + * + * thread_with_stdio, the higher level version + * This implements full piping of input and output, including .poll. + * + * Notes on behaviour: + * - kthread shutdown behaves like writing or reading from a pipe that has been + * closed + * - Input and output buffers are 4096 bytes, although buffers may in some + * situations slightly exceed that limit so as to avoid chopping off a + * message in the middle in nonblocking mode. + * - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations - + * should be fine but might change in future revisions. + * - Output buffer may grow past 4096 bytes to deal with messages that are + * bigger than 4096 bytes + * - Writing may be done blocking or nonblocking; in nonblocking mode, we only + * drop entire messages. 
+ * + * To write, use bch2_stdio_redirect_printf() + * To read, use bch2_stdio_redirect_read() or bch2_stdio_redirect_readline() + */ + +struct task_struct; + +struct thread_with_file { + struct task_struct *task; + int ret; + bool done; +}; + +void bch2_thread_with_file_exit(struct thread_with_file *); +int bch2_run_thread_with_file(struct thread_with_file *, + const struct file_operations *, + int (*fn)(void *)); + +struct thread_with_stdio; + +struct thread_with_stdio_ops { + void (*exit)(struct thread_with_stdio *); + int (*fn)(struct thread_with_stdio *); + long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long); +}; + +struct thread_with_stdio { + struct thread_with_file thr; + struct stdio_redirect stdio; + const struct thread_with_stdio_ops *ops; +}; + +int bch2_run_thread_with_stdio(struct thread_with_stdio *, + const struct thread_with_stdio_ops *); +int bch2_run_thread_with_stdout(struct thread_with_stdio *, + const struct thread_with_stdio_ops *); +int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); +int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t); + +__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list); +__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...); + +#endif /* _BCACHEFS_THREAD_WITH_FILE_H */ diff --git a/libbcachefs/thread_with_file_types.h b/libbcachefs/thread_with_file_types.h new file mode 100644 index 00000000..e0daf4ee --- /dev/null +++ b/libbcachefs/thread_with_file_types.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H +#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H + +#include "darray.h" + +struct stdio_buf { + spinlock_t lock; + wait_queue_head_t wait; + darray_char buf; +}; + +struct stdio_redirect { + struct stdio_buf input; + struct stdio_buf output; + + spinlock_t input_lock; + wait_queue_head_t input_wait; + darray_char 
input_buf; + bool done; +}; + +#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */ diff --git a/libbcachefs/time_stats.c b/libbcachefs/time_stats.c new file mode 100644 index 00000000..4508e9dc --- /dev/null +++ b/libbcachefs/time_stats.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/jiffies.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/preempt.h> +#include <linux/time.h> +#include <linux/spinlock.h> + +#include "eytzinger.h" +#include "time_stats.h" + +static const struct time_unit time_units[] = { + { "ns", 1 }, + { "us", NSEC_PER_USEC }, + { "ms", NSEC_PER_MSEC }, + { "s", NSEC_PER_SEC }, + { "m", (u64) NSEC_PER_SEC * 60}, + { "h", (u64) NSEC_PER_SEC * 3600}, + { "d", (u64) NSEC_PER_SEC * 3600 * 24}, + { "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7}, + { "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 365) + (3600 * (24 / 4)))}, /* 365.25d */ + { "eon", U64_MAX }, +}; + +const struct time_unit *bch2_pick_time_units(u64 ns) +{ + const struct time_unit *u; + + for (u = time_units; + u + 1 < time_units + ARRAY_SIZE(time_units) && + ns >= u[1].nsecs << 1; + u++) + ; + + return u; +} + +static void quantiles_update(struct quantiles *q, u64 v) +{ + unsigned i = 0; + + while (i < ARRAY_SIZE(q->entries)) { + struct quantile_entry *e = q->entries + i; + + if (unlikely(!e->step)) { + e->m = v; + e->step = max_t(unsigned, v / 2, 1024); + } else if (e->m > v) { + e->m = e->m >= e->step + ? e->m - e->step + : 0; + } else if (e->m < v) { + e->m = e->m + e->step > e->m + ? e->m + e->step + : U32_MAX; + } + + if ((e->m > v ? 
e->m - v : v - e->m) < e->step) + e->step = max_t(unsigned, e->step / 2, 1); + + if (v >= e->m) + break; + + i = eytzinger0_child(i, v > e->m); + } +} + +static inline void time_stats_update_one(struct bch2_time_stats *stats, + u64 start, u64 end) +{ + u64 duration, freq; + bool initted = stats->last_event != 0; + + if (time_after64(end, start)) { + struct quantiles *quantiles = time_stats_to_quantiles(stats); + + duration = end - start; + mean_and_variance_update(&stats->duration_stats, duration); + mean_and_variance_weighted_update(&stats->duration_stats_weighted, + duration, initted, TIME_STATS_MV_WEIGHT); + stats->max_duration = max(stats->max_duration, duration); + stats->min_duration = min(stats->min_duration, duration); + stats->total_duration += duration; + + if (quantiles) + quantiles_update(quantiles, duration); + } + + if (stats->last_event && time_after64(end, stats->last_event)) { + freq = end - stats->last_event; + mean_and_variance_update(&stats->freq_stats, freq); + mean_and_variance_weighted_update(&stats->freq_stats_weighted, + freq, initted, TIME_STATS_MV_WEIGHT); + stats->max_freq = max(stats->max_freq, freq); + stats->min_freq = min(stats->min_freq, freq); + } + + stats->last_event = end; +} + +void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, + struct time_stat_buffer *b) +{ + for (struct time_stat_buffer_entry *i = b->entries; + i < b->entries + b->nr; /* only the filled slots; the rest are stale or zeroed */ + i++) + time_stats_update_one(stats, i->start, i->end); + b->nr = 0; +} + +static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats, + struct time_stat_buffer *b) +{ + unsigned long flags; + + spin_lock_irqsave(&stats->lock, flags); + __bch2_time_stats_clear_buffer(stats, b); + spin_unlock_irqrestore(&stats->lock, flags); +} + +void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) +{ + unsigned long flags; + + if (!stats->buffer) { + spin_lock_irqsave(&stats->lock, flags); + time_stats_update_one(stats, start, end); 
+ + if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 && + stats->duration_stats.n > 1024) + stats->buffer = + alloc_percpu_gfp(struct time_stat_buffer, + GFP_ATOMIC); + spin_unlock_irqrestore(&stats->lock, flags); + } else { + struct time_stat_buffer *b; + + preempt_disable(); + b = this_cpu_ptr(stats->buffer); + + BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); + b->entries[b->nr++] = (struct time_stat_buffer_entry) { + .start = start, + .end = end + }; + + if (unlikely(b->nr == ARRAY_SIZE(b->entries))) + time_stats_clear_buffer(stats, b); + preempt_enable(); + } +} + +void bch2_time_stats_exit(struct bch2_time_stats *stats) +{ + free_percpu(stats->buffer); +} + +void bch2_time_stats_init(struct bch2_time_stats *stats) +{ + memset(stats, 0, sizeof(*stats)); + stats->min_duration = U64_MAX; + stats->min_freq = U64_MAX; + spin_lock_init(&stats->lock); +} diff --git a/include/linux/time_stats.h b/libbcachefs/time_stats.h similarity index 63% rename from include/linux/time_stats.h rename to libbcachefs/time_stats.h index 6df2b34a..5df61403 100644 --- a/include/linux/time_stats.h +++ b/libbcachefs/time_stats.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * time_stats - collect statistics on events that have a duration, with nicely + * bch2_time_stats - collect statistics on events that have a duration, with nicely * formatted textual output on demand * * - percpu buffering of event collection: cheap enough to shotgun @@ -21,14 +21,15 @@ * * Particularly useful for tracking down latency issues. 
*/ -#ifndef _LINUX_TIME_STATS_H -#define _LINUX_TIME_STATS_H +#ifndef _BCACHEFS_TIME_STATS_H +#define _BCACHEFS_TIME_STATS_H -#include <linux/mean_and_variance.h> #include <linux/sched/clock.h> #include <linux/spinlock_types.h> #include <linux/string.h> +#include "mean_and_variance.h" + struct time_unit { const char *name; u64 nsecs; @@ -37,12 +38,12 @@ struct time_unit { /* * given a nanosecond value, pick the preferred time units for printing: */ -const struct time_unit *pick_time_units(u64 ns); +const struct time_unit *bch2_pick_time_units(u64 ns); /* * quantiles - do not use: * - * Only enabled if time_stats->quantiles_enabled has been manually set - don't + * Only enabled if bch2_time_stats->have_quantiles has been manually set - don't * use in new code. */ @@ -66,7 +67,7 @@ struct time_stat_buffer { } entries[31]; }; -struct time_stats { +struct bch2_time_stats { spinlock_t lock; bool have_quantiles; /* all fields are in nanoseconds */ @@ -87,52 +88,50 @@ struct time_stats { struct mean_and_variance_weighted duration_stats_weighted; struct mean_and_variance_weighted freq_stats_weighted; struct time_stat_buffer __percpu *buffer; - - u64 start_time; }; -struct time_stats_quantiles { - struct time_stats stats; +struct bch2_time_stats_quantiles { + struct bch2_time_stats stats; struct quantiles quantiles; }; -static inline struct quantiles *time_stats_to_quantiles(struct time_stats *stats) +static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats) { return stats->have_quantiles - ? &container_of(stats, struct time_stats_quantiles, stats)->quantiles + ? 
&container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles : NULL; } -void __time_stats_clear_buffer(struct time_stats *, struct time_stat_buffer *); -void __time_stats_update(struct time_stats *stats, u64, u64); +void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *); +void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); /** * time_stats_update - collect a new event being tracked * - * @stats - time_stats to update + * @stats - bch2_time_stats to update * @start - start time of event, recorded with local_clock() * * The end duration of the event will be the current time */ -static inline void time_stats_update(struct time_stats *stats, u64 start) +static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) { - __time_stats_update(stats, start, local_clock()); + __bch2_time_stats_update(stats, start, local_clock()); } /** * track_event_change - track state change events * - * @stats - time_stats to update + * @stats - bch2_time_stats to update * @v - new state, true or false * * Use this when tracking time stats for state changes, i.e. resource X becoming * blocked/unblocked. 
*/ -static inline bool track_event_change(struct time_stats *stats, bool v) +static inline bool track_event_change(struct bch2_time_stats *stats, bool v) { if (v != !!stats->last_event_start) { if (!v) { - time_stats_update(stats, stats->last_event_start); + bch2_time_stats_update(stats, stats->last_event_start); stats->last_event_start = 0; } else { stats->last_event_start = local_clock() ?: 1; @@ -143,25 +142,18 @@ static inline bool track_event_change(struct time_stats *stats, bool v) return false; } -#define TIME_STATS_PRINT_NO_ZEROES (1U << 0) /* print nothing if zero count */ -struct seq_buf; -void time_stats_to_seq_buf(struct seq_buf *, struct time_stats *, - const char *epoch_name, unsigned int flags); -void time_stats_to_json(struct seq_buf *, struct time_stats *, - const char *epoch_name, unsigned int flags); +void bch2_time_stats_exit(struct bch2_time_stats *); +void bch2_time_stats_init(struct bch2_time_stats *); -void time_stats_exit(struct time_stats *); -void time_stats_init(struct time_stats *); - -static inline void time_stats_quantiles_exit(struct time_stats_quantiles *statq) +static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq) { - time_stats_exit(&statq->stats); + bch2_time_stats_exit(&statq->stats); } -static inline void time_stats_quantiles_init(struct time_stats_quantiles *statq) +static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq) { - time_stats_init(&statq->stats); + bch2_time_stats_init(&statq->stats); statq->stats.have_quantiles = true; memset(&statq->quantiles, 0, sizeof(statq->quantiles)); } -#endif /* _LINUX_TIME_STATS_H */ +#endif /* _BCACHEFS_TIME_STATS_H */ diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 098ebbe4..216fadf1 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -11,7 +11,6 @@ #include <linux/console.h> #include <linux/ctype.h> #include <linux/debugfs.h> -#include <linux/eytzinger.h> #include <linux/freezer.h> #include 
<linux/kthread.h> #include <linux/log2.h> @@ -23,8 +22,9 @@ #include <linux/string.h> #include <linux/types.h> #include <linux/sched/clock.h> -#include <linux/mean_and_variance.h> +#include "eytzinger.h" +#include "mean_and_variance.h" #include "util.h" static const char si_units[] = "?kMGTPEZY"; @@ -339,14 +339,14 @@ void bch2_prt_datetime(struct printbuf *out, time64_t sec) void bch2_pr_time_units(struct printbuf *out, u64 ns) { - const struct time_unit *u = pick_time_units(ns); + const struct time_unit *u = bch2_pick_time_units(ns); prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); } static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) { - const struct time_unit *u = pick_time_units(ns); + const struct time_unit *u = bch2_pick_time_units(ns); prt_printf(out, "%llu ", div64_u64(ns, u->nsecs)); prt_tab_rjust(out); @@ -363,7 +363,7 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 #define TABSTOP_SIZE 12 -void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) +void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) { struct quantiles *quantiles = time_stats_to_quantiles(stats); s64 f_mean = 0, d_mean = 0; @@ -374,7 +374,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) spin_lock_irq(&stats->lock); for_each_possible_cpu(cpu) - __time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); + __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); spin_unlock_irq(&stats->lock); } @@ -469,7 +469,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) if (quantiles) { int i = eytzinger0_first(NR_QUANTILES); const struct time_unit *u = - pick_time_units(quantiles->entries[i].m); + bch2_pick_time_units(quantiles->entries[i].m); u64 last_q = 0; prt_printf(out, "quantiles (%s):\t", u->name); @@ -707,6 +707,149 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } 
+static int alignment_ok(const void *base, size_t align) +{ + return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || + ((unsigned long)base & (align - 1)) == 0; +} + +static void u32_swap(void *a, void *b, size_t size) +{ + u32 t = *(u32 *)a; + *(u32 *)a = *(u32 *)b; + *(u32 *)b = t; +} + +static void u64_swap(void *a, void *b, size_t size) +{ + u64 t = *(u64 *)a; + *(u64 *)a = *(u64 *)b; + *(u64 *)b = t; +} + +static void generic_swap(void *a, void *b, size_t size) +{ + char t; + + do { + t = *(char *)a; + *(char *)a++ = *(char *)b; + *(char *)b++ = t; + } while (--size > 0); +} + +static inline int do_cmp(void *base, size_t n, size_t size, + int (*cmp_func)(const void *, const void *, size_t), + size_t l, size_t r) +{ + return cmp_func(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + size); +} + +static inline void do_swap(void *base, size_t n, size_t size, + void (*swap_func)(void *, void *, size_t), + size_t l, size_t r) +{ + swap_func(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + size); +} + +void eytzinger0_sort(void *base, size_t n, size_t size, + int (*cmp_func)(const void *, const void *, size_t), + void (*swap_func)(void *, void *, size_t)) +{ + int i, c, r; + + if (!swap_func) { + if (size == 4 && alignment_ok(base, 4)) + swap_func = u32_swap; + else if (size == 8 && alignment_ok(base, 8)) + swap_func = u64_swap; + else + swap_func = generic_swap; + } + + /* heapify */ + for (i = n / 2 - 1; i >= 0; --i) { + for (r = i; r * 2 + 1 < n; r = c) { + c = r * 2 + 1; + + if (c + 1 < n && + do_cmp(base, n, size, cmp_func, c, c + 1) < 0) + c++; + + if (do_cmp(base, n, size, cmp_func, r, c) >= 0) + break; + + do_swap(base, n, size, swap_func, r, c); + } + } + + /* sort */ + for (i = n - 1; i > 0; --i) { + do_swap(base, n, size, swap_func, 0, i); + + for (r = 0; r * 2 + 1 < i; r = c) { + c = r * 2 + 1; + + if (c + 1 < i && + do_cmp(base, n, size, cmp_func, c, c + 1) < 0) + 
c++; + + if (do_cmp(base, n, size, cmp_func, r, c) >= 0) + break; + + do_swap(base, n, size, swap_func, r, c); + } + } +} + +void sort_cmp_size(void *base, size_t num, size_t size, + int (*cmp_func)(const void *, const void *, size_t), + void (*swap_func)(void *, void *, size_t size)) +{ + /* pre-scale counters for performance */ + int i = (num/2 - 1) * size, n = num * size, c, r; + + if (!swap_func) { + if (size == 4 && alignment_ok(base, 4)) + swap_func = u32_swap; + else if (size == 8 && alignment_ok(base, 8)) + swap_func = u64_swap; + else + swap_func = generic_swap; + } + + /* heapify */ + for ( ; i >= 0; i -= size) { + for (r = i; r * 2 + size < n; r = c) { + c = r * 2 + size; + if (c < n - size && + cmp_func(base + c, base + c + size, size) < 0) + c += size; + if (cmp_func(base + r, base + c, size) >= 0) + break; + swap_func(base + r, base + c, size); + } + } + + /* sort */ + for (i = n - size; i > 0; i -= size) { + swap_func(base, base + i, size); + for (r = 0; r * 2 + size < i; r = c) { + c = r * 2 + size; + if (c < i - size && + cmp_func(base + c, base + c + size, size) < 0) + c += size; + if (cmp_func(base + r, base + c, size) >= 0) + break; + swap_func(base + r, base + c, size); + } + } +} + #if 0 void eytzinger1_test(void) { diff --git a/libbcachefs/util.h b/libbcachefs/util.h index c5fec87d..7ffbddb8 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -5,21 +5,23 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/closure.h> -#include <linux/darray.h> #include <linux/errno.h> #include <linux/freezer.h> #include <linux/kernel.h> +#include <linux/sched/clock.h> #include <linux/llist.h> #include <linux/log2.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <linux/ratelimit.h> -#include <linux/sched/clock.h> #include <linux/slab.h> -#include <linux/time_stats.h> #include <linux/vmalloc.h> #include <linux/workqueue.h> -#include <linux/mean_and_variance.h> + +#include "mean_and_variance.h" + +#include "darray.h" 
+#include "time_stats.h" struct closure; @@ -328,7 +330,7 @@ static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev) #endif } -void bch2_time_stats_to_text(struct printbuf *, struct time_stats *); +void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); #define ewma_add(ewma, val, weight) \ ({ \ @@ -629,6 +631,34 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes) memset(s + bytes, c, rem); } +void sort_cmp_size(void *base, size_t num, size_t size, + int (*cmp_func)(const void *, const void *, size_t), + void (*swap_func)(void *, void *, size_t)); + +/* just the memmove, doesn't update @_nr */ +#define __array_insert_item(_array, _nr, _pos) \ + memmove(&(_array)[(_pos) + 1], \ + &(_array)[(_pos)], \ + sizeof((_array)[0]) * ((_nr) - (_pos))) + +#define array_insert_item(_array, _nr, _pos, _new_item) \ +do { \ + __array_insert_item(_array, _nr, _pos); \ + (_nr)++; \ + (_array)[(_pos)] = (_new_item); \ +} while (0) + +#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ +do { \ + (_nr) -= (_nr_to_remove); \ + memmove(&(_array)[(_pos)], \ + &(_array)[(_pos) + (_nr_to_remove)], \ + sizeof((_array)[0]) * ((_nr) - (_pos))); \ +} while (0) + +#define array_remove_item(_array, _nr, _pos) \ + array_remove_items(_array, _nr, _pos, 1) + static inline void __move_gap(void *array, size_t element_size, size_t nr, size_t size, size_t old_gap, size_t new_gap) @@ -743,4 +773,25 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r) void bch2_darray_str_exit(darray_str *); int bch2_split_devs(const char *, darray_str *); +#ifdef __KERNEL__ + +__must_check +static inline int copy_to_user_errcode(void __user *to, const void *from, unsigned long n) +{ + return copy_to_user(to, from, n) ? -EFAULT : 0; +} + +__must_check +static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n) +{ + return copy_from_user(to, from, n) ? 
-EFAULT : 0; +} + +#endif + +static inline void __set_bit_le64(size_t bit, __le64 *addr) +{ + addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64)); +} + #endif /* _BCACHEFS_UTIL_H */ diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index 9c0d2316..754f17bb 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -544,11 +544,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, kfree(buf); if (ret < 0) - return ret; + goto err_class_exit; ret = bch2_opt_check_may_set(c, opt_id, v); if (ret < 0) - return ret; + goto err_class_exit; s.v = v + 1; s.defined = true; @@ -595,6 +595,7 @@ err: (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression)))) bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); +err_class_exit: return bch2_err_class(ret); } diff --git a/linux/generic-radix-tree.c b/linux/generic-radix-tree.c index 41f1bcdc..aaefb9b6 100644 --- a/linux/generic-radix-tree.c +++ b/linux/generic-radix-tree.c @@ -5,7 +5,7 @@ #include <linux/gfp.h> #include <linux/kmemleak.h> -#define GENRADIX_ARY (PAGE_SIZE / sizeof(struct genradix_node *)) +#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *)) #define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY) struct genradix_node { @@ -14,13 +14,13 @@ struct genradix_node { struct genradix_node *children[GENRADIX_ARY]; /* Leaf: */ - u8 data[PAGE_SIZE]; + u8 data[GENRADIX_NODE_SIZE]; }; }; static inline int genradix_depth_shift(unsigned depth) { - return PAGE_SHIFT + GENRADIX_ARY_SHIFT * depth; + return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth; } /* @@ -33,7 +33,7 @@ static inline size_t genradix_depth_size(unsigned depth) /* depth that's needed for a genradix that can address up to ULONG_MAX: */ #define GENRADIX_MAX_DEPTH \ - DIV_ROUND_UP(BITS_PER_LONG - PAGE_SHIFT, GENRADIX_ARY_SHIFT) + DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT) #define GENRADIX_DEPTH_MASK \ ((unsigned long) 
(roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1)) @@ -79,23 +79,12 @@ EXPORT_SYMBOL(__genradix_ptr); static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask) { - struct genradix_node *node; - - node = (struct genradix_node *)__get_free_page(gfp_mask|__GFP_ZERO); - - /* - * We're using pages (not slab allocations) directly for kernel data - * structures, so we need to explicitly inform kmemleak of them in order - * to avoid false positive memory leak reports. - */ - kmemleak_alloc(node, PAGE_SIZE, 1, gfp_mask); - return node; + return kzalloc(GENRADIX_NODE_SIZE, gfp_mask); } static inline void genradix_free_node(struct genradix_node *node) { - kmemleak_free(node); - free_page((unsigned long)node); + kfree(node); } /* @@ -200,7 +189,7 @@ restart: i++; iter->offset = round_down(iter->offset + objs_per_ptr, objs_per_ptr); - iter->pos = (iter->offset >> PAGE_SHIFT) * + iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page; if (i == GENRADIX_ARY) goto restart; @@ -209,7 +198,7 @@ restart: n = n->children[i]; } - return &n->data[iter->offset & (PAGE_SIZE - 1)]; + return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)]; } EXPORT_SYMBOL(__genradix_iter_peek); @@ -235,7 +224,7 @@ restart: if (ilog2(iter->offset) >= genradix_depth_shift(level)) { iter->offset = genradix_depth_size(level); - iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; + iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page; iter->offset -= obj_size_plus_page_remainder; iter->pos--; @@ -251,7 +240,7 @@ restart: size_t objs_per_ptr = genradix_depth_size(level); iter->offset = round_down(iter->offset, objs_per_ptr); - iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; + iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page; if (!iter->offset) return NULL; @@ -267,7 +256,7 @@ restart: n = n->children[i]; } - return &n->data[iter->offset & (PAGE_SIZE - 1)]; + return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)]; } 
EXPORT_SYMBOL(__genradix_iter_peek_prev); @@ -289,7 +278,7 @@ int __genradix_prealloc(struct __genradix *radix, size_t size, { size_t offset; - for (offset = 0; offset < size; offset += PAGE_SIZE) + for (offset = 0; offset < size; offset += GENRADIX_NODE_SIZE) if (!__genradix_ptr_alloc(radix, offset, gfp_mask)) return -ENOMEM; diff --git a/linux/sort.c b/linux/sort.c index ffa4817b..ecce71c5 100644 --- a/linux/sort.c +++ b/linux/sort.c @@ -277,92 +277,3 @@ void sort_r(void *base, size_t num, size_t size, } } EXPORT_SYMBOL(sort_r); - -#include <linux/eytzinger.h> - -static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size, - cmp_r_func_t cmp_func, const void *priv, - size_t l, size_t r) -{ - return do_cmp(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, - cmp_func, priv); -} - -static inline void eytzinger0_do_swap(void *base, size_t n, size_t size, - swap_r_func_t swap_func, const void *priv, - size_t l, size_t r) -{ - do_swap(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, - size, swap_func, priv); -} - -void eytzinger0_sort_r(void *base, size_t n, size_t size, - cmp_r_func_t cmp_func, - swap_r_func_t swap_func, - const void *priv) -{ - int i, c, r; - - /* called from 'sort' without swap function, let's pick the default */ - if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap) - swap_func = NULL; - - if (!swap_func) { - if (is_aligned(base, size, 8)) - swap_func = SWAP_WORDS_64; - else if (is_aligned(base, size, 4)) - swap_func = SWAP_WORDS_32; - else - swap_func = SWAP_BYTES; - } - - /* heapify */ - for (i = n / 2 - 1; i >= 0; --i) { - for (r = i; r * 2 + 1 < n; r = c) { - c = r * 2 + 1; - - if (c + 1 < n && - eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0) - c++; - - if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0) - break; - - eytzinger0_do_swap(base, n, size, swap_func, priv, r, c); - } - } - - /* sort */ - for (i = n - 1; i 
> 0; --i) { - eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i); - - for (r = 0; r * 2 + 1 < i; r = c) { - c = r * 2 + 1; - - if (c + 1 < i && - eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0) - c++; - - if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0) - break; - - eytzinger0_do_swap(base, n, size, swap_func, priv, r, c); - } - } -} -EXPORT_SYMBOL_GPL(eytzinger0_sort_r); - -void eytzinger0_sort(void *base, size_t n, size_t size, - cmp_func_t cmp_func, - swap_func_t swap_func) -{ - struct wrapper w = { - .cmp = cmp_func, - .swap = swap_func, - }; - - return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); -} -EXPORT_SYMBOL_GPL(eytzinger0_sort); diff --git a/linux/time_stats.c b/linux/time_stats.c deleted file mode 100644 index d7dd64ba..00000000 --- a/linux/time_stats.c +++ /dev/null @@ -1,373 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include <linux/eytzinger.h> -#include <linux/jiffies.h> -#include <linux/module.h> -#include <linux/percpu.h> -#include <linux/preempt.h> -#include <linux/time.h> -#include <linux/time_stats.h> -#include <linux/spinlock.h> - -static const struct time_unit time_units[] = { - { "ns", 1 }, - { "us", NSEC_PER_USEC }, - { "ms", NSEC_PER_MSEC }, - { "s", NSEC_PER_SEC }, - { "m", (u64) NSEC_PER_SEC * 60}, - { "h", (u64) NSEC_PER_SEC * 3600}, - { "d", (u64) NSEC_PER_SEC * 3600 * 24}, - { "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7}, - { "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */ - { "eon", U64_MAX }, -}; - -const struct time_unit *pick_time_units(u64 ns) -{ - const struct time_unit *u; - - for (u = time_units; - u + 1 < time_units + ARRAY_SIZE(time_units) && - ns >= u[1].nsecs << 1; - u++) - ; - - return u; -} -EXPORT_SYMBOL_GPL(pick_time_units); - -static void quantiles_update(struct quantiles *q, u64 v) -{ - unsigned i = 0; - - while (i < ARRAY_SIZE(q->entries)) { - struct quantile_entry *e = q->entries + i; - - if (unlikely(!e->step)) { 
- e->m = v; - e->step = max_t(unsigned, v / 2, 1024); - } else if (e->m > v) { - e->m = e->m >= e->step - ? e->m - e->step - : 0; - } else if (e->m < v) { - e->m = e->m + e->step > e->m - ? e->m + e->step - : U32_MAX; - } - - if ((e->m > v ? e->m - v : v - e->m) < e->step) - e->step = max_t(unsigned, e->step / 2, 1); - - if (v >= e->m) - break; - - i = eytzinger0_child(i, v > e->m); - } -} - -static inline void time_stats_update_one(struct time_stats *stats, - u64 start, u64 end) -{ - u64 duration, freq; - bool initted = stats->last_event != 0; - - if (time_after64(end, start)) { - struct quantiles *quantiles = time_stats_to_quantiles(stats); - - duration = end - start; - mean_and_variance_update(&stats->duration_stats, duration); - mean_and_variance_weighted_update(&stats->duration_stats_weighted, - duration, initted, TIME_STATS_MV_WEIGHT); - stats->max_duration = max(stats->max_duration, duration); - stats->min_duration = min(stats->min_duration, duration); - stats->total_duration += duration; - - if (quantiles) - quantiles_update(quantiles, duration); - } - - if (stats->last_event && time_after64(end, stats->last_event)) { - freq = end - stats->last_event; - mean_and_variance_update(&stats->freq_stats, freq); - mean_and_variance_weighted_update(&stats->freq_stats_weighted, - freq, initted, TIME_STATS_MV_WEIGHT); - stats->max_freq = max(stats->max_freq, freq); - stats->min_freq = min(stats->min_freq, freq); - } - - stats->last_event = end; -} - -void __time_stats_clear_buffer(struct time_stats *stats, - struct time_stat_buffer *b) -{ - for (struct time_stat_buffer_entry *i = b->entries; - i < b->entries + ARRAY_SIZE(b->entries); - i++) - time_stats_update_one(stats, i->start, i->end); - b->nr = 0; -} -EXPORT_SYMBOL_GPL(__time_stats_clear_buffer); - -static noinline void time_stats_clear_buffer(struct time_stats *stats, - struct time_stat_buffer *b) -{ - unsigned long flags; - - spin_lock_irqsave(&stats->lock, flags); - __time_stats_clear_buffer(stats, b); - 
spin_unlock_irqrestore(&stats->lock, flags); -} - -void __time_stats_update(struct time_stats *stats, u64 start, u64 end) -{ - unsigned long flags; - - if (!stats->buffer) { - spin_lock_irqsave(&stats->lock, flags); - time_stats_update_one(stats, start, end); - - if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 && - stats->duration_stats.n > 1024) - stats->buffer = - alloc_percpu_gfp(struct time_stat_buffer, - GFP_ATOMIC); - spin_unlock_irqrestore(&stats->lock, flags); - } else { - struct time_stat_buffer *b; - - preempt_disable(); - b = this_cpu_ptr(stats->buffer); - - BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); - b->entries[b->nr++] = (struct time_stat_buffer_entry) { - .start = start, - .end = end - }; - - if (unlikely(b->nr == ARRAY_SIZE(b->entries))) - time_stats_clear_buffer(stats, b); - preempt_enable(); - } -} -EXPORT_SYMBOL_GPL(__time_stats_update); - -#include <linux/seq_buf.h> - -static void seq_buf_time_units_aligned(struct seq_buf *out, u64 ns) -{ - const struct time_unit *u = pick_time_units(ns); - - seq_buf_printf(out, "%8llu %s", div64_u64(ns, u->nsecs), u->name); -} - -static inline u64 time_stats_lifetime(const struct time_stats *stats) -{ - return local_clock() - stats->start_time; -} - -void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats, - const char *epoch_name, unsigned int flags) -{ - struct quantiles *quantiles = time_stats_to_quantiles(stats); - s64 f_mean = 0, d_mean = 0; - u64 f_stddev = 0, d_stddev = 0; - u64 lifetime = time_stats_lifetime(stats); - - if (stats->buffer) { - int cpu; - - spin_lock_irq(&stats->lock); - for_each_possible_cpu(cpu) - __time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); - spin_unlock_irq(&stats->lock); - } - - if (stats->freq_stats.n) { - /* avoid divide by zero */ - f_mean = mean_and_variance_get_mean(stats->freq_stats); - f_stddev = mean_and_variance_get_stddev(stats->freq_stats); - d_mean = 
mean_and_variance_get_mean(stats->duration_stats); - d_stddev = mean_and_variance_get_stddev(stats->duration_stats); - } else if (flags & TIME_STATS_PRINT_NO_ZEROES) { - /* unless we didn't want zeroes anyway */ - return; - } - - seq_buf_printf(out, "count: %llu\n", stats->duration_stats.n); - seq_buf_printf(out, "lifetime: "); - seq_buf_time_units_aligned(out, lifetime); - seq_buf_printf(out, "\n"); - - seq_buf_printf(out, " since %-12s recent\n", epoch_name); - - seq_buf_printf(out, "duration of events\n"); - - seq_buf_printf(out, " min: "); - seq_buf_time_units_aligned(out, stats->min_duration); - seq_buf_printf(out, "\n"); - - seq_buf_printf(out, " max: "); - seq_buf_time_units_aligned(out, stats->max_duration); - seq_buf_printf(out, "\n"); - - seq_buf_printf(out, " total: "); - seq_buf_time_units_aligned(out, stats->total_duration); - seq_buf_printf(out, "\n"); - - seq_buf_printf(out, " mean: "); - seq_buf_time_units_aligned(out, d_mean); - seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); - seq_buf_printf(out, "\n"); - - seq_buf_printf(out, " stddev: "); - seq_buf_time_units_aligned(out, d_stddev); - seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); - seq_buf_printf(out, "\n"); - - seq_buf_printf(out, "time between events\n"); - - seq_buf_printf(out, " min: "); - seq_buf_time_units_aligned(out, stats->min_freq); - seq_buf_printf(out, "\n"); - - seq_buf_printf(out, " max: "); - seq_buf_time_units_aligned(out, stats->max_freq); - seq_buf_printf(out, "\n"); - - seq_buf_printf(out, " mean: "); - seq_buf_time_units_aligned(out, f_mean); - seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); - seq_buf_printf(out, "\n"); - - seq_buf_printf(out, " stddev: "); - seq_buf_time_units_aligned(out, f_stddev); - seq_buf_time_units_aligned(out, 
mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); - seq_buf_printf(out, "\n"); - - if (quantiles) { - int i = eytzinger0_first(NR_QUANTILES); - const struct time_unit *u = - pick_time_units(quantiles->entries[i].m); - u64 last_q = 0; - - seq_buf_printf(out, "quantiles (%s):\t", u->name); - eytzinger0_for_each(i, NR_QUANTILES) { - bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; - - u64 q = max(quantiles->entries[i].m, last_q); - seq_buf_printf(out, "%llu ", div_u64(q, u->nsecs)); - if (is_last) - seq_buf_printf(out, "\n"); - last_q = q; - } - } -} -EXPORT_SYMBOL_GPL(time_stats_to_seq_buf); - -void time_stats_to_json(struct seq_buf *out, struct time_stats *stats, - const char *epoch_name, unsigned int flags) -{ - struct quantiles *quantiles = time_stats_to_quantiles(stats); - s64 f_mean = 0, d_mean = 0; - u64 f_stddev = 0, d_stddev = 0; - - if (stats->buffer) { - int cpu; - - spin_lock_irq(&stats->lock); - for_each_possible_cpu(cpu) - __time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); - spin_unlock_irq(&stats->lock); - } - - if (stats->freq_stats.n) { - /* avoid divide by zero */ - f_mean = mean_and_variance_get_mean(stats->freq_stats); - f_stddev = mean_and_variance_get_stddev(stats->freq_stats); - d_mean = mean_and_variance_get_mean(stats->duration_stats); - d_stddev = mean_and_variance_get_stddev(stats->duration_stats); - } else if (flags & TIME_STATS_PRINT_NO_ZEROES) { - /* unless we didn't want zeroes anyway */ - return; - } - - seq_buf_printf(out, "{\n"); - seq_buf_printf(out, " \"epoch\": \"%s\",\n", epoch_name); - seq_buf_printf(out, " \"count\": %llu,\n", stats->duration_stats.n); - - seq_buf_printf(out, " \"duration_ns\": {\n"); - seq_buf_printf(out, " \"min\": %llu,\n", stats->min_duration); - seq_buf_printf(out, " \"max\": %llu,\n", stats->max_duration); - seq_buf_printf(out, " \"total\": %llu,\n", stats->total_duration); - seq_buf_printf(out, " \"mean\": %llu,\n", d_mean); - 
seq_buf_printf(out, " \"stddev\": %llu\n", d_stddev); - seq_buf_printf(out, " },\n"); - - d_mean = mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT); - d_stddev = mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT); - - seq_buf_printf(out, " \"duration_ewma_ns\": {\n"); - seq_buf_printf(out, " \"mean\": %llu,\n", d_mean); - seq_buf_printf(out, " \"stddev\": %llu\n", d_stddev); - seq_buf_printf(out, " },\n"); - - seq_buf_printf(out, " \"between_ns\": {\n"); - seq_buf_printf(out, " \"min\": %llu,\n", stats->min_freq); - seq_buf_printf(out, " \"max\": %llu,\n", stats->max_freq); - seq_buf_printf(out, " \"mean\": %llu,\n", f_mean); - seq_buf_printf(out, " \"stddev\": %llu\n", f_stddev); - seq_buf_printf(out, " },\n"); - - f_mean = mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT); - f_stddev = mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT); - - seq_buf_printf(out, " \"between_ewma_ns\": {\n"); - seq_buf_printf(out, " \"mean\": %llu,\n", f_mean); - seq_buf_printf(out, " \"stddev\": %llu\n", f_stddev); - - if (quantiles) { - u64 last_q = 0; - - /* close between_ewma_ns but signal more items */ - seq_buf_printf(out, " },\n"); - - seq_buf_printf(out, " \"quantiles_ns\": [\n"); - eytzinger0_for_each(i, NR_QUANTILES) { - bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; - - u64 q = max(quantiles->entries[i].m, last_q); - seq_buf_printf(out, " %llu", q); - if (!is_last) - seq_buf_printf(out, ", "); - last_q = q; - } - seq_buf_printf(out, " ]\n"); - } else { - /* close between_ewma_ns without dumping further */ - seq_buf_printf(out, " }\n"); - } - - seq_buf_printf(out, "}\n"); -} -EXPORT_SYMBOL_GPL(time_stats_to_json); - -void time_stats_exit(struct time_stats *stats) -{ - free_percpu(stats->buffer); -} -EXPORT_SYMBOL_GPL(time_stats_exit); - -void time_stats_init(struct time_stats *stats) -{ - memset(stats, 0, 
sizeof(*stats)); - stats->min_duration = U64_MAX; - stats->min_freq = U64_MAX; - stats->start_time = local_clock(); - spin_lock_init(&stats->lock); -} -EXPORT_SYMBOL_GPL(time_stats_init); - -MODULE_AUTHOR("Kent Overstreet"); -MODULE_LICENSE("GPL");