diff --git a/.bcachefs_revision b/.bcachefs_revision index 615d94b8..3ca1265b 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -0939e1c73231c779c961e1143e1ba489ef2b168c +ea93c26e98081d8e1a5fc138e6334b3631983d77 diff --git a/Makefile b/Makefile index 743f6ca9..bce10d5b 100644 --- a/Makefile +++ b/Makefile @@ -221,14 +221,6 @@ update-bcachefs-sources: git add linux/generic-radix-tree.c cp $(LINUX_DIR)/include/linux/kmemleak.h include/linux/ git add include/linux/kmemleak.h - cp $(LINUX_DIR)/include/linux/printbuf.h include/linux/ - git add include/linux/printbuf.h - cp $(LINUX_DIR)/lib/printbuf.c linux/ - git add linux/printbuf.c - cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/ - git add linux/mean_and_variance.c - cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/ - git add include/linux/mean_and_variance.h cp $(LINUX_DIR)/lib/math/int_sqrt.c linux/ git add linux/int_sqrt.c cp $(LINUX_DIR)/scripts/Makefile.compiler ./ diff --git a/Makefile.compiler b/Makefile.compiler index 94d0d40c..20d353dc 100644 --- a/Makefile.compiler +++ b/Makefile.compiler @@ -61,9 +61,13 @@ cc-option-yn = $(call try-run,\ cc-disable-warning = $(call try-run,\ $(CC) -Werror $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) -# cc-ifversion -# Usage: EXTRA_CFLAGS += $(call cc-ifversion, -lt, 0402, -O1) -cc-ifversion = $(shell [ $(CONFIG_GCC_VERSION)0 $(1) $(2)000 ] && echo $(3) || echo $(4)) +# gcc-min-version +# Usage: cflags-$(call gcc-min-version, 70100) += -foo +gcc-min-version = $(shell [ $(CONFIG_GCC_VERSION)0 -ge $(1)0 ] && echo y) + +# clang-min-version +# Usage: cflags-$(call clang-min-version, 110000) += -foo +clang-min-version = $(shell [ $(CONFIG_CLANG_VERSION)0 -ge $(1)0 ] && echo y) # ld-option # Usage: KBUILD_LDFLAGS += $(call ld-option, -X, -Y) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index b2c1751c..a21b7cc3 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -264,4 +264,7 @@ struct qstr { static inline void dump_stack(void) {} +#define unsafe_memcpy(dst, src, bytes, justification) \ + memcpy(dst, src, bytes) + #endif diff --git a/include/linux/mean_and_variance.h b/include/linux/mean_and_variance.h index b7fa5e96..756eb3d1 100644 --- a/include/linux/mean_and_variance.h +++ b/include/linux/mean_and_variance.h @@ -2,13 +2,35 @@ #ifndef MEAN_AND_VARIANCE_H_ #define MEAN_AND_VARIANCE_H_ +#include <linux/kernel.h> #include <linux/types.h> #include <linux/limits.h> #include <linux/math64.h> -#include <linux/printbuf.h> #define SQRT_U64_MAX 4294967295ULL +/** + * abs - return absolute value of an argument + * @x: the value. If it is unsigned type, it is converted to signed type first. + * char is treated as if it was signed (regardless of whether it really is) + * but the macro's return type is preserved as char. + * + * Return: an absolute value of x. + */ +#define abs(x) __abs_choose_expr(x, long long, \ + __abs_choose_expr(x, long, \ + __abs_choose_expr(x, int, \ + __abs_choose_expr(x, short, \ + __abs_choose_expr(x, char, \ + __builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(x), char), \ + (char)({ signed char __x = (x); __x<0?-__x:__x; }), \ + ((void)0))))))) + +#define __abs_choose_expr(x, type, other) __builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(x), signed type) || \ + __builtin_types_compatible_p(typeof(x), unsigned type), \ + ({ signed type __x = (x); __x < 0 ? 
-__x : __x; }), other) #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) diff --git a/include/linux/poison.h b/include/linux/poison.h index d62ef5a6..2d3249eb 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -81,4 +81,7 @@ /********** net/core/page_pool.c **********/ #define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA) +/********** kernel/bpf/ **********/ +#define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA)) + #endif diff --git a/include/linux/prandom.h b/include/linux/prandom.h index 6f177cdd..9aea22dc 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -23,5 +23,11 @@ prandom_type(u32); prandom_type(u64); #undef prandom_type +static inline u32 prandom_u32_max(u32 max) +{ + return prandom_u32() % max; + +} + #endif /* _LINUX_PRANDOM_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index ac6d27bb..fef7e323 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -28,6 +28,7 @@ #define TASK_NEW 2048 #define TASK_IDLE_WORKER 4096 #define TASK_STATE_MAX 8192 +#define TASK_FREEZABLE (1U << 14) /* Convenience macros for the sake of set_task_state */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h new file mode 100644 index 00000000..8c9c0dd7 --- /dev/null +++ b/include/linux/seq_buf.h @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SEQ_BUF_H +#define _LINUX_SEQ_BUF_H + +#include <linux/kernel.h> +#include <stdarg.h> +#include <string.h> + +/* + * Trace sequences are used to allow a function to call several other functions + * to create a string of data to use. + */ + +/** + * seq_buf - seq buffer structure + * @buffer: pointer to the buffer + * @size: size of the buffer + * @len: the amount of data inside the buffer + * @readpos: The next position to read in the buffer. + */ +struct seq_buf { + char *buffer; + size_t size; + size_t len; + loff_t readpos; +}; + +static inline void seq_buf_clear(struct seq_buf *s) +{ + s->len = 0; + s->readpos = 0; +} + +static inline void +seq_buf_init(struct seq_buf *s, char *buf, unsigned int size) +{ + s->buffer = buf; + s->size = size; + seq_buf_clear(s); +} + +/* + * seq_buf have a buffer that might overflow. When this happens + * the len and size are set to be equal. + */ +static inline bool +seq_buf_has_overflowed(struct seq_buf *s) +{ + return s->len > s->size; +} + +static inline void +seq_buf_set_overflow(struct seq_buf *s) +{ + s->len = s->size + 1; +} + +/* + * How much buffer is left on the seq_buf? + */ +static inline unsigned int +seq_buf_buffer_left(struct seq_buf *s) +{ + if (seq_buf_has_overflowed(s)) + return 0; + + return s->size - s->len; +} + +/* How much buffer was written? */ +static inline unsigned int seq_buf_used(struct seq_buf *s) +{ + return min(s->len, s->size); +} + +/** + * seq_buf_terminate - Make sure buffer is nul terminated + * @s: the seq_buf descriptor to terminate. + * + * This makes sure that the buffer in @s is nul terminated and + * safe to read as a string. + * + * Note, if this is called when the buffer has overflowed, then + * the last byte of the buffer is zeroed, and the len will still + * point passed it. + * + * After this function is called, s->buffer is safe to use + * in string operations. 
+ */ +static inline void seq_buf_terminate(struct seq_buf *s) +{ + if (WARN_ON(s->size == 0)) + return; + + if (seq_buf_buffer_left(s)) + s->buffer[s->len] = 0; + else + s->buffer[s->size - 1] = 0; +} + +/** + * seq_buf_get_buf - get buffer to write arbitrary data to + * @s: the seq_buf handle + * @bufp: the beginning of the buffer is stored here + * + * Return the number of bytes available in the buffer, or zero if + * there's no space. + */ +static inline size_t seq_buf_get_buf(struct seq_buf *s, char **bufp) +{ + WARN_ON(s->len > s->size + 1); + + if (s->len < s->size) { + *bufp = s->buffer + s->len; + return s->size - s->len; + } + + *bufp = NULL; + return 0; +} + +/** + * seq_buf_commit - commit data to the buffer + * @s: the seq_buf handle + * @num: the number of bytes to commit + * + * Commit @num bytes of data written to a buffer previously acquired + * by seq_buf_get. To signal an error condition, or that the data + * didn't fit in the available space, pass a negative @num value. + */ +static inline void seq_buf_commit(struct seq_buf *s, int num) +{ + if (num < 0) { + seq_buf_set_overflow(s); + } else { + /* num must be negative on overflow */ + BUG_ON(s->len + num > s->size); + s->len += num; + } +} + +extern __printf(2, 3) +int seq_buf_printf(struct seq_buf *s, const char *fmt, ...); +extern __printf(2, 0) +int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args); +extern int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, + int cnt); +extern int seq_buf_puts(struct seq_buf *s, const char *str); +extern int seq_buf_putc(struct seq_buf *s, unsigned char c); + +void seq_buf_human_readable_u64(struct seq_buf *, u64); + +#endif /* _LINUX_SEQ_BUF_H */ diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index ebbab7a6..bca00d61 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -11,13 +11,13 @@ struct shrink_control { #define SHRINK_STOP (~0UL) -struct printbuf; +struct seq_buf; struct shrinker { unsigned long (*count_objects)(struct shrinker *, struct shrink_control *sc); unsigned long (*scan_objects)(struct shrinker *, struct shrink_control *sc); - void (*to_text)(struct printbuf *, struct shrinker *); + void (*to_text)(struct seq_buf *, struct shrinker *); int seeks; /* seeks to recreate an obj */ long batch; /* reclaim batch size, 0 = default */ diff --git a/include/linux/six.h b/include/linux/six.h index 362a577b..16ad2073 100644 --- a/include/linux/six.h +++ b/include/linux/six.h @@ -59,6 +59,7 @@ */ #include <linux/lockdep.h> +#include <linux/osq_lock.h> #include <linux/sched.h> #include <linux/types.h> @@ -79,9 +80,10 @@ union six_lock_state { }; struct { - unsigned read_lock:27; + unsigned read_lock:26; unsigned write_locking:1; unsigned intent_lock:1; + unsigned nospin:1; unsigned waiters:3; /* * seq works much like in seqlocks: it's incremented every time @@ -104,10 +106,10 @@ enum six_lock_type { struct six_lock { union six_lock_state state; + unsigned intent_lock_recurse; struct task_struct *owner; unsigned __percpu *readers; - unsigned intent_lock_recurse; - unsigned long ip; + struct optimistic_spin_queue osq; raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -148,12 +150,37 @@ do { \ #define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) #define __SIX_LOCK(type) \ -bool six_trylock_##type(struct six_lock *); \ -bool six_relock_##type(struct six_lock *, u32); \ -int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\ -int 
six_lock_waiter_##type(struct six_lock *, struct six_lock_waiter *, \ - six_lock_should_sleep_fn, void *); \ -void six_unlock_##type(struct six_lock *); +bool six_trylock_ip_##type(struct six_lock *, unsigned long); \ +bool six_relock_ip_##type(struct six_lock *, u32, unsigned long); \ +int six_lock_ip_##type(struct six_lock *, six_lock_should_sleep_fn, \ + void *, unsigned long); \ +int six_lock_ip_waiter_##type(struct six_lock *, struct six_lock_waiter *,\ + six_lock_should_sleep_fn, void *, unsigned long);\ +void six_unlock_ip_##type(struct six_lock *, unsigned long); \ + \ +static inline bool six_trylock_##type(struct six_lock *lock) \ +{ \ + return six_trylock_ip_##type(lock, _THIS_IP_); \ +} \ +static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \ +{ \ + return six_relock_ip_##type(lock, seq, _THIS_IP_); \ +} \ +static inline int six_lock_##type(struct six_lock *lock, \ + six_lock_should_sleep_fn fn, void *p)\ +{ \ + return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \ +} \ +static inline int six_lock_waiter_##type(struct six_lock *lock, \ + struct six_lock_waiter *wait, \ + six_lock_should_sleep_fn fn, void *p) \ +{ \ + return six_lock_ip_waiter_##type(lock, wait, fn, p, _THIS_IP_); \ +} \ +static inline void six_unlock_##type(struct six_lock *lock) \ +{ \ + return six_unlock_ip_##type(lock, _THIS_IP_); \ +} __SIX_LOCK(read) __SIX_LOCK(intent) @@ -189,6 +216,14 @@ static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p); } +static inline int six_lock_type_ip_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) +{ + SIX_LOCK_DISPATCH(type, six_lock_ip_waiter, lock, wait, should_sleep_fn, p, ip); +} + static inline int six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type, struct six_lock_waiter *wait, six_lock_should_sleep_fn should_sleep_fn, void *p) diff --git a/include/linux/slab.h b/include/linux/slab.h index cf48570c..ff122ff9 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -174,6 +174,11 @@ static inline void *kmem_cache_alloc(struct kmem_cache *c, gfp_t gfp) return kmalloc(c->obj_size, gfp); } +static inline void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t gfp) +{ + return kzalloc(c->obj_size, gfp); +} + static inline void kmem_cache_free(struct kmem_cache *c, void *p) { kfree(p); diff --git a/include/linux/wait.h b/include/linux/wait.h index d30fb10d..4b9cbf38 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -18,10 +18,12 @@ struct __wait_queue { struct list_head task_list; }; -typedef struct { +struct wait_queue_head { spinlock_t lock; struct list_head task_list; -} wait_queue_head_t; +}; + +typedef struct wait_queue_head wait_queue_head_t; void wake_up(wait_queue_head_t *); void wake_up_all(wait_queue_head_t *); @@ -42,7 +44,7 @@ int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *ke .task_list = { &(name).task_list, &(name).task_list } } #define DECLARE_WAIT_QUEUE_HEAD(name) \ - wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name) + struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name) static inline void init_waitqueue_head(wait_queue_head_t *q) { diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index f699146a..ca5d6c8a 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -514,34 +514,10 @@ DEFINE_EVENT(bch_fs, 
gc_gens_end, /* Allocator */ -TRACE_EVENT(bucket_alloc, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, - bool user, u64 bucket), - TP_ARGS(ca, alloc_reserve, user, bucket), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __array(char, reserve, 16 ) - __field(bool, user ) - __field(u64, bucket ) - ), - - TP_fast_assign( - __entry->dev = ca->dev; - strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); - __entry->user = user; - __entry->bucket = bucket; - ), - - TP_printk("%d,%d reserve %s user %u bucket %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->reserve, - __entry->user, - __entry->bucket) -); - -TRACE_EVENT(bucket_alloc_fail, +DECLARE_EVENT_CLASS(bucket_alloc, TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + bool user, + u64 bucket, u64 free, u64 avail, u64 copygc_wait_amount, @@ -549,12 +525,15 @@ TRACE_EVENT(bucket_alloc_fail, struct bucket_alloc_state *s, bool nonblocking, const char *err), - TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for, + TP_ARGS(ca, alloc_reserve, user, bucket, free, avail, + copygc_wait_amount, copygc_waiting_for, s, nonblocking, err), TP_STRUCT__entry( __field(dev_t, dev ) __array(char, reserve, 16 ) + __field(bool, user ) + __field(u64, bucket ) __field(u64, free ) __field(u64, avail ) __field(u64, copygc_wait_amount ) @@ -571,6 +550,8 @@ TRACE_EVENT(bucket_alloc_fail, TP_fast_assign( __entry->dev = ca->dev; strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + __entry->user = user; + __entry->bucket = bucket; __entry->free = free; __entry->avail = avail; __entry->copygc_wait_amount = copygc_wait_amount; @@ -584,9 +565,11 @@ TRACE_EVENT(bucket_alloc_fail, strscpy(__entry->err, err, sizeof(__entry->err)); ), - TP_printk("%d,%d reserve %s free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u nocow %llu err %s", + TP_printk("%d,%d reserve %s user %u bucket %llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->reserve, + __entry->user, + __entry->bucket, __entry->free, __entry->avail, __entry->copygc_wait_amount, @@ -595,11 +578,43 @@ TRACE_EVENT(bucket_alloc_fail, __entry->open, __entry->need_journal_commit, __entry->nouse, - __entry->nonblocking, __entry->nocow, + __entry->nonblocking, __entry->err) ); +DEFINE_EVENT(bucket_alloc, bucket_alloc, + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + bool user, + u64 bucket, + u64 free, + u64 avail, + u64 copygc_wait_amount, + s64 copygc_waiting_for, + struct bucket_alloc_state *s, + bool nonblocking, + const char *err), + TP_ARGS(ca, alloc_reserve, user, bucket, free, avail, + copygc_wait_amount, copygc_waiting_for, + s, nonblocking, err) +); + +DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + bool user, + u64 bucket, + u64 free, + u64 avail, + u64 copygc_wait_amount, + s64 copygc_waiting_for, + struct bucket_alloc_state *s, + bool nonblocking, + const char *err), + TP_ARGS(ca, alloc_reserve, user, bucket, free, avail, + copygc_wait_amount, copygc_waiting_for, + s, nonblocking, err) +); + TRACE_EVENT(discard_buckets, TP_PROTO(struct bch_fs *c, u64 seen, u64 open, u64 need_journal_commit, u64 discarded, const char *err), @@ -673,7 +688,7 @@ DEFINE_EVENT(bkey, move_extent_finish, TP_ARGS(k) ); -DEFINE_EVENT(bkey, move_extent_race, +DEFINE_EVENT(bkey, move_extent_fail, 
TP_PROTO(const struct bkey *k), TP_ARGS(k) ); diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h new file mode 100644 index 00000000..9ebd081e --- /dev/null +++ b/include/trace/events/lock.h @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM lock + +#if !defined(_TRACE_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_LOCK_H + +#include <linux/sched.h> +#include <linux/tracepoint.h> + +/* flags for lock:contention_begin */ +#define LCB_F_SPIN (1U << 0) +#define LCB_F_READ (1U << 1) +#define LCB_F_WRITE (1U << 2) +#define LCB_F_RT (1U << 3) +#define LCB_F_PERCPU (1U << 4) +#define LCB_F_MUTEX (1U << 5) + + +#ifdef CONFIG_LOCKDEP + +#include <linux/lockdep.h> + +TRACE_EVENT(lock_acquire, + + TP_PROTO(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, + struct lockdep_map *next_lock, unsigned long ip), + + TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), + + TP_STRUCT__entry( + __field(unsigned int, flags) + __string(name, lock->name) + __field(void *, lockdep_addr) + ), + + TP_fast_assign( + __entry->flags = (trylock ? 1 : 0) | (read ? 2 : 0); + __assign_str(name, lock->name); + __entry->lockdep_addr = lock; + ), + + TP_printk("%p %s%s%s", __entry->lockdep_addr, + (__entry->flags & 1) ? "try " : "", + (__entry->flags & 2) ? "read " : "", + __get_str(name)) +); + +DECLARE_EVENT_CLASS(lock, + + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + + TP_ARGS(lock, ip), + + TP_STRUCT__entry( + __string( name, lock->name ) + __field( void *, lockdep_addr ) + ), + + TP_fast_assign( + __assign_str(name, lock->name); + __entry->lockdep_addr = lock; + ), + + TP_printk("%p %s", __entry->lockdep_addr, __get_str(name)) +); + +DEFINE_EVENT(lock, lock_release, + + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + + TP_ARGS(lock, ip) +); + +#ifdef CONFIG_LOCK_STAT + +DEFINE_EVENT(lock, lock_contended, + + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + + TP_ARGS(lock, ip) +); + +DEFINE_EVENT(lock, lock_acquired, + + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + + TP_ARGS(lock, ip) +); + +#endif /* CONFIG_LOCK_STAT */ +#endif /* CONFIG_LOCKDEP */ + +TRACE_EVENT(contention_begin, + + TP_PROTO(void *lock, unsigned int flags), + + TP_ARGS(lock, flags), + + TP_STRUCT__entry( + __field(void *, lock_addr) + __field(unsigned int, flags) + ), + + TP_fast_assign( + __entry->lock_addr = lock; + __entry->flags = flags; + ), + + TP_printk("%p (flags=%s)", __entry->lock_addr, + __print_flags(__entry->flags, "|", + { LCB_F_SPIN, "SPIN" }, + { LCB_F_READ, "READ" }, + { LCB_F_WRITE, "WRITE" }, + { LCB_F_RT, "RT" }, + { LCB_F_PERCPU, "PERCPU" }, + { LCB_F_MUTEX, "MUTEX" } + )) +); + +TRACE_EVENT(contention_end, + + TP_PROTO(void *lock, int ret), + + TP_ARGS(lock, ret), + + TP_STRUCT__entry( + __field(void *, lock_addr) + __field(int, ret) + ), + + TP_fast_assign( + __entry->lock_addr = lock; + __entry->ret = ret; + ), + + TP_printk("%p (ret=%d)", __entry->lock_addr, __entry->ret) +); + +#endif /* _TRACE_LOCK_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index a78232ed..6fd948f1 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -222,7 +222,7 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) } int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, 
struct printbuf *err) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); @@ -237,7 +237,7 @@ int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, } int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_alloc_unpacked u; @@ -250,7 +250,7 @@ int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, } int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_alloc_unpacked u; @@ -263,9 +263,10 @@ int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, } int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); + int rw = flags & WRITE; if (alloc_v4_u64s(a.v) != bkey_val_u64s(k.k)) { prt_printf(err, "bad val size (%lu != %u)", @@ -279,11 +280,9 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } - /* - * XXX this is wrong, we'll be checking updates that happened from - * before BCH_FS_CHECK_BACKPOINTERS_DONE - */ - if (rw == WRITE && test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { + if (rw == WRITE && + !(flags & BKEY_INVALID_FROM_JOURNAL) && + test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { unsigned i, bp_len = 0; for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) @@ -621,7 +620,7 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) } int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) { prt_printf(err, "bad val size (%lu != %zu)", @@ -1607,7 +1606,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, struct bch_dev *ca; struct bkey_i_alloc_v4 *a; struct printbuf buf = PRINTBUF; - bool did_discard = false; int ret = 0; ca = bch_dev_bkey_exists(c, pos.inode); @@ -1683,15 +1681,13 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, k.k->p.offset * ca->mi.bucket_size, ca->mi.bucket_size, GFP_KERNEL); + *discard_pos_done = iter.pos; - ret = bch2_trans_relock(trans); + ret = bch2_trans_relock_notrace(trans); if (ret) goto out; } - *discard_pos_done = iter.pos; - did_discard = true; - SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); a->v.data_type = alloc_data_type(a->v, a->v.data_type); write: @@ -1701,11 +1697,10 @@ write: if (ret) goto out; - if (did_discard) { - this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); - (*discarded)++; - } + this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); + (*discarded)++; out: + (*seen)++; bch2_trans_iter_exit(trans, &iter); percpu_ref_put(&ca->io_ref); printbuf_exit(&buf); @@ -1742,7 +1737,7 @@ static void bch2_do_discards_work(struct work_struct *work) if (need_journal_commit * 2 > seen) bch2_journal_flush_async(&c->journal, NULL); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_discard); trace_discard_buckets(c, seen, open, need_journal_commit, discarded, bch2_err_str(ret)); @@ -1750,44 +1745,45 @@ static void bch2_do_discards_work(struct work_struct *work) void bch2_do_discards(struct bch_fs *c) { - if (percpu_ref_tryget_live(&c->writes) && + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) && !queue_work(system_long_wq, &c->discard_work)) - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, 
BCH_WRITE_REF_discard); } static int invalidate_one_bucket(struct btree_trans *trans, struct btree_iter *lru_iter, - struct bpos bucket, + struct bkey_s_c lru_k, s64 *nr_to_invalidate) { struct bch_fs *c = trans->c; struct btree_iter alloc_iter = { NULL }; - struct bkey_i_alloc_v4 *a; + struct bkey_i_alloc_v4 *a = NULL; struct printbuf buf = PRINTBUF; + struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); unsigned cached_sectors; int ret = 0; if (*nr_to_invalidate <= 0) return 1; + if (!bch2_dev_bucket_exists(c, bucket)) { + prt_str(&buf, "lru entry points to invalid bucket"); + goto err; + } + a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; if (lru_pos_time(lru_iter->pos) != alloc_lru_idx(a->v)) { - prt_printf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n "); - bch2_bpos_to_text(&buf, lru_iter->pos); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); + prt_str(&buf, "alloc key does not point back to lru entry when invalidating bucket:"); + goto err; + } - bch_err(c, "%s", buf.buf); - if (test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { - bch2_inconsistent_error(c); - ret = -EINVAL; - } - - goto out; + if (a->v.data_type != BCH_DATA_cached) { + prt_str(&buf, "lru entry points to non cached bucket:"); + goto err; } if (!a->v.cached_sectors) @@ -1816,6 +1812,26 @@ out: bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; +err: + prt_str(&buf, "\n lru key: "); + bch2_bkey_val_to_text(&buf, c, lru_k); + + prt_str(&buf, "\n lru entry: "); + bch2_lru_pos_to_text(&buf, lru_iter->pos); + + prt_str(&buf, "\n alloc key: "); + if (!a) + bch2_bpos_to_text(&buf, bucket); + else + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); + + bch_err(c, "%s", buf.buf); + if (test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { + bch2_inconsistent_error(c); + ret = -EINVAL; + } + + goto out; } static void bch2_do_invalidates_work(struct work_struct *work) @@ -1838,9 +1854,7 @@ static void bch2_do_invalidates_work(struct work_struct *work) lru_pos(ca->dev_idx, 0, 0), lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), BTREE_ITER_INTENT, k, - invalidate_one_bucket(&trans, &iter, - u64_to_bucket(k.k->p.offset), - &nr_to_invalidate)); + invalidate_one_bucket(&trans, &iter, k, &nr_to_invalidate)); if (ret < 0) { percpu_ref_put(&ca->ref); @@ -1849,14 +1863,14 @@ static void bch2_do_invalidates_work(struct work_struct *work) } bch2_trans_exit(&trans); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } void bch2_do_invalidates(struct bch_fs *c) { - if (percpu_ref_tryget_live(&c->writes) && + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) && !queue_work(system_long_wq, &c->invalidate_work)) - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index a0c3c47b..b3c2f1e0 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -122,10 +122,10 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); -int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); -int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, 
int, struct printbuf *); -int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -158,7 +158,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .atomic_trigger = bch2_mark_alloc, \ }) -int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index f1cfb90b..6eeeaec1 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -58,6 +58,17 @@ const char * const bch2_alloc_reserves[] = { * reference _after_ doing the index update that makes its allocation reachable. */ +void bch2_reset_alloc_cursors(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) + ca->alloc_cursor = 0; + rcu_read_unlock(); +} + static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) { open_bucket_idx_t idx = ob - c->open_buckets; @@ -272,7 +283,6 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * } spin_unlock(&c->freelist_lock); - return ob; } @@ -418,12 +428,11 @@ bch2_bucket_alloc_early(struct btree_trans *trans, struct btree_iter iter; struct bkey_s_c k; struct open_bucket *ob = NULL; + u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); + u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor)); int ret; - - s->cur_bucket = max_t(u64, s->cur_bucket, ca->mi.first_bucket); - s->cur_bucket = max_t(u64, s->cur_bucket, ca->new_fs_bucket_idx); - - for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, s->cur_bucket), +again: + for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), BTREE_ITER_SLOTS, k, ret) { struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; @@ -448,9 +457,17 @@ bch2_bucket_alloc_early(struct btree_trans *trans, } bch2_trans_iter_exit(trans, &iter); - s->cur_bucket = iter.pos.offset; + ca->alloc_cursor = alloc_cursor; - return ob ?: ERR_PTR(ret ?: -BCH_ERR_no_buckets_found); + if (!ob && ret) + ob = ERR_PTR(ret); + + if (!ob && alloc_cursor > alloc_start) { + alloc_cursor = alloc_start; + goto again; + } + + return ob; } static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, @@ -462,33 +479,34 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, struct btree_iter iter; struct bkey_s_c k; struct open_bucket *ob = NULL; + u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor)); + u64 alloc_cursor = alloc_start; int ret; BUG_ON(ca->new_fs_bucket_idx); - - /* - * XXX: - * On transaction restart, we'd like to restart from the bucket we were - * at previously - */ +again: 
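The hunks above and below replace the per-allocation `cur_bucket` field (dropped from `struct bucket_alloc_state`) with a persisted per-device `alloc_cursor`: the scan resumes where the previous allocation stopped and, if nothing is found from there to the end of the device, wraps around for one more pass from the first bucket. A minimal, self-contained sketch of that cursor-with-wraparound pattern follows; the names (`demo_dev`, `demo_alloc_bucket`, the in-use bitmap) are purely illustrative and are not the bcachefs API.

#include <stdbool.h>
#include <stdint.h>

/* Illustrative device: a bitmap of in-use buckets plus the persisted cursor. */
struct demo_dev {
	uint64_t	first_bucket;
	uint64_t	nbuckets;
	uint64_t	alloc_cursor;	/* where the last search left off */
	bool		*bucket_in_use;
};

static int64_t demo_alloc_bucket(struct demo_dev *ca)
{
	uint64_t alloc_start = ca->first_bucket > ca->alloc_cursor
				? ca->first_bucket : ca->alloc_cursor;
	uint64_t alloc_cursor = alloc_start;
	int64_t ret = -1;
again:
	/* Scan forward from the cursor for the first free bucket. */
	for (; alloc_cursor < ca->nbuckets; alloc_cursor++)
		if (!ca->bucket_in_use[alloc_cursor]) {
			ca->bucket_in_use[alloc_cursor] = true;
			ret = (int64_t) alloc_cursor;
			break;
		}

	ca->alloc_cursor = alloc_cursor;	/* persist for the next call */

	if (ret < 0 && alloc_start > ca->first_bucket) {
		/* Nothing at or after the cursor: retry once from the start. */
		alloc_cursor = alloc_start = ca->first_bucket;
		goto again;
	}

	return ret;
}

Keeping the cursor in the device rather than in `bucket_alloc_state` is what lets successive allocations continue from where the last one stopped instead of rescanning from bucket zero each time, while the single wraparound pass preserves the guarantee that a free bucket anywhere on the device can still be found.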
for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace, - POS(ca->dev_idx, s->cur_bucket), 0, k, ret) { + POS(ca->dev_idx, alloc_cursor), 0, k, ret) { if (k.k->p.inode != ca->dev_idx) break; - for (s->cur_bucket = max(s->cur_bucket, bkey_start_offset(k.k)); - s->cur_bucket < k.k->p.offset; - s->cur_bucket++) { + for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k)); + alloc_cursor < k.k->p.offset; + alloc_cursor++) { ret = btree_trans_too_many_iters(trans); - if (ret) + if (ret) { + ob = ERR_PTR(ret); break; + } s->buckets_seen++; ob = try_alloc_bucket(trans, ca, reserve, - s->cur_bucket, s, k, cl); - if (ob) + alloc_cursor, s, k, cl); + if (ob) { + iter.path->preserve = false; break; + } } if (ob || ret) @@ -496,7 +514,17 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, } bch2_trans_iter_exit(trans, &iter); - return ob ?: ERR_PTR(ret); + ca->alloc_cursor = alloc_cursor; + + if (!ob && ret) + ob = ERR_PTR(ret); + + if (!ob && alloc_start > ca->mi.first_bucket) { + alloc_cursor = alloc_start = ca->mi.first_bucket; + goto again; + } + + return ob; } /** @@ -514,9 +542,8 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct bch_fs *c = trans->c; struct open_bucket *ob = NULL; bool freespace = READ_ONCE(ca->mi.freespace_initialized); - u64 start = freespace ? 0 : ca->bucket_alloc_trans_early_cursor; u64 avail; - struct bucket_alloc_state s = { .cur_bucket = start }; + struct bucket_alloc_state s = { 0 }; bool waiting = false; again: bch2_dev_usage_read_fast(ca, usage); @@ -561,28 +588,31 @@ alloc: if (s.skipped_need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); - if (!ob && !freespace && start) { - start = s.cur_bucket = 0; - goto alloc; - } - if (!ob && freespace && !test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) { freespace = false; goto alloc; } - - if (!freespace) - ca->bucket_alloc_trans_early_cursor = s.cur_bucket; err: if (!ob) ob = ERR_PTR(-BCH_ERR_no_buckets_found); if (!IS_ERR(ob)) - trace_and_count(c, bucket_alloc, ca, bch2_alloc_reserves[reserve], - may_alloc_partial, ob->bucket); + trace_and_count(c, bucket_alloc, ca, + bch2_alloc_reserves[reserve], + may_alloc_partial, + ob->bucket, + usage->d[BCH_DATA_free].buckets, + avail, + bch2_copygc_wait_amount(c), + c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), + &s, + cl == NULL, + ""); else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart)) - trace_and_count(c, bucket_alloc_fail, - ca, bch2_alloc_reserves[reserve], + trace_and_count(c, bucket_alloc_fail, ca, + bch2_alloc_reserves[reserve], + may_alloc_partial, + 0, usage->d[BCH_DATA_free].buckets, avail, bch2_copygc_wait_amount(c), @@ -1130,16 +1160,16 @@ out: * Get us an open_bucket we can allocate from, return with it locked: */ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, - unsigned target, - unsigned erasure_code, - struct write_point_specifier write_point, - struct bch_devs_list *devs_have, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - unsigned flags, - struct closure *cl, - struct write_point **wp_ret) + unsigned target, + unsigned erasure_code, + struct write_point_specifier write_point, + struct bch_devs_list *devs_have, + unsigned nr_replicas, + unsigned nr_replicas_required, + enum alloc_reserve reserve, + unsigned flags, + struct closure *cl, + struct write_point **wp_ret) { struct bch_fs *c = trans->c; struct write_point *wp; @@ -1336,3 +1366,33 @@ void 
bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) spin_unlock(&ob->lock); } } + +static const char * const bch2_write_point_states[] = { +#define x(n) #n, + WRITE_POINT_STATES() +#undef x + NULL +}; + +void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct write_point *wp; + unsigned i; + + for (wp = c->write_points; + wp < c->write_points + ARRAY_SIZE(c->write_points); + wp++) { + prt_printf(out, "%lu: ", wp->write_point); + prt_human_readable_u64(out, wp->sectors_allocated); + + prt_printf(out, " last wrote: "); + bch2_pr_time_units(out, sched_clock() - wp->last_used); + + for (i = 0; i < WRITE_POINT_STATE_NR; i++) { + prt_printf(out, " %s: ", bch2_write_point_states[i]); + bch2_pr_time_units(out, wp->time[i]); + } + + prt_newline(out); + } +} diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h index 62fbf1c7..26e986f2 100644 --- a/libbcachefs/alloc_foreground.h +++ b/libbcachefs/alloc_foreground.h @@ -16,6 +16,8 @@ struct bch_devs_List; extern const char * const bch2_alloc_reserves[]; +void bch2_reset_alloc_cursors(struct bch_fs *); + struct dev_alloc_list { unsigned nr; u8 devs[BCH_SB_MEMBERS_MAX]; @@ -178,7 +180,8 @@ bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp, unsigned i; BUG_ON(sectors > wp->sectors_free); - wp->sectors_free -= sectors; + wp->sectors_free -= sectors; + wp->sectors_allocated += sectors; open_bucket_for_each(c, &wp->ptrs, ob, i) { struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); @@ -219,4 +222,6 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *); void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); +void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); + #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index 33026734..2e6f4806 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -9,7 +9,6 @@ #include "fifo.h" struct bucket_alloc_state { - u64 cur_bucket; u64 buckets_seen; u64 skipped_open; u64 skipped_need_journal_commit; @@ -75,6 +74,19 @@ struct dev_stripe_state { u64 next_alloc[BCH_SB_MEMBERS_MAX]; }; +#define WRITE_POINT_STATES() \ + x(stopped) \ + x(waiting_io) \ + x(waiting_work) \ + x(running) + +enum write_point_state { +#define x(n) WRITE_POINT_##n, + WRITE_POINT_STATES() +#undef x + WRITE_POINT_STATE_NR +}; + struct write_point { struct { struct hlist_node node; @@ -88,6 +100,8 @@ struct write_point { struct open_buckets ptrs; struct dev_stripe_state stripe; + + u64 sectors_allocated; } __attribute__((__aligned__(SMP_CACHE_BYTES))); struct { @@ -95,6 +109,10 @@ struct write_point { struct list_head writes; spinlock_t writes_lock; + + enum write_point_state state; + u64 last_state_change; + u64 time[WRITE_POINT_STATE_NR]; } __attribute__((__aligned__(SMP_CACHE_BYTES))); }; diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index 405823d1..0f8ffdf4 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -69,7 +69,7 @@ static bool extent_matches_bp(struct bch_fs *c, } int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); diff --git a/libbcachefs/backpointers.h b/libbcachefs/backpointers.h index 48a48b75..ac7b0932 100644 --- a/libbcachefs/backpointers.h +++ b/libbcachefs/backpointers.h @@ -6,7 +6,7 @@ 
#include "super.h" int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k, - int, struct printbuf *); + unsigned, struct printbuf *); void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_backpointer_swab(struct bkey_s); diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index febef9ac..f5131733 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -210,6 +210,10 @@ #include "opts.h" #include "util.h" +#ifdef CONFIG_BCACHEFS_DEBUG +#define BCH_WRITE_REF_DEBUG +#endif + #define dynamic_fault(...) 0 #define race_fault(...) 0 @@ -503,7 +507,7 @@ struct bch_dev { /* Allocator: */ u64 new_fs_bucket_idx; - u64 bucket_alloc_trans_early_cursor; + u64 alloc_cursor; unsigned nr_open_buckets; unsigned nr_btree_reserve; @@ -524,7 +528,7 @@ struct bch_dev { /* The rest of this all shows up in sysfs */ atomic64_t cur_latency[2]; - struct time_stats io_latency[2]; + struct bch2_time_stats io_latency[2]; #define CONGESTED_MAX 1024 atomic_t congested; @@ -543,6 +547,7 @@ enum { /* shutdown: */ BCH_FS_STOPPING, BCH_FS_EMERGENCY_RO, + BCH_FS_GOING_RO, BCH_FS_WRITE_DISABLE_COMPLETE, BCH_FS_CLEAN_SHUTDOWN, @@ -573,8 +578,8 @@ struct btree_debug { #define BCH_TRANSACTIONS_NR 128 struct btree_transaction_stats { + struct bch2_time_stats lock_hold_times; struct mutex lock; - struct time_stats lock_hold_times; unsigned nr_max_paths; unsigned max_mem; char *max_paths_text; @@ -634,6 +639,29 @@ typedef struct { #define BCACHEFS_ROOT_SUBVOL_INUM \ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) +#define BCH_WRITE_REFS() \ + x(trans) \ + x(write) \ + x(promote) \ + x(node_rewrite) \ + x(stripe_create) \ + x(stripe_delete) \ + x(reflink) \ + x(fallocate) \ + x(discard) \ + x(invalidate) \ + x(move) \ + x(delete_dead_snapshots) \ + x(snapshot_delete_pagecache) \ + x(sysfs) + +enum bch_write_ref { +#define x(n) BCH_WRITE_REF_##n, + BCH_WRITE_REFS() +#undef x + BCH_WRITE_REF_NR, +}; + struct bch_fs { struct closure cl; @@ -655,7 +683,11 @@ struct bch_fs { struct rw_semaphore state_lock; /* Counts outstanding writes, for clean transition to read-only */ +#ifdef BCH_WRITE_REF_DEBUG + atomic_long_t writes[BCH_WRITE_REF_NR]; +#else struct percpu_ref writes; +#endif struct work_struct read_only_work; struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; @@ -857,6 +889,7 @@ struct bch_fs { struct mutex gc_gens_lock; /* IO PATH */ + struct semaphore io_in_flight; struct bio_set bio_read; struct bio_set bio_read_split; struct bio_set bio_write; @@ -969,11 +1002,51 @@ struct bch_fs { unsigned copy_gc_enabled:1; bool promote_whole_extents; - struct time_stats times[BCH_TIME_STAT_NR]; + struct bch2_time_stats times[BCH_TIME_STAT_NR]; struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; }; +extern struct wait_queue_head bch2_read_only_wait; + +static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref) +{ +#ifdef BCH_WRITE_REF_DEBUG + atomic_long_inc(&c->writes[ref]); +#else + percpu_ref_get(&c->writes); +#endif +} + +static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) +{ +#ifdef BCH_WRITE_REF_DEBUG + return !test_bit(BCH_FS_GOING_RO, &c->flags) && + atomic_long_inc_not_zero(&c->writes[ref]); +#else + return percpu_ref_tryget_live(&c->writes); +#endif +} + +static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref) +{ +#ifdef BCH_WRITE_REF_DEBUG + long v = 
atomic_long_dec_return(&c->writes[ref]); + + BUG_ON(v < 0); + if (v) + return; + for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) + if (atomic_long_read(&c->writes[i])) + return; + + set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + wake_up(&bch2_read_only_wait); +#else + percpu_ref_put(&c->writes); +#endif +} + static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) { #ifndef NO_BCACHEFS_FS diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 48438e67..ffd91373 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1357,7 +1357,7 @@ struct bch_replicas_entry { struct bch_sb_field_replicas { struct bch_sb_field field; - struct bch_replicas_entry entries[0]; + struct bch_replicas_entry entries[]; } __packed __aligned(8); /* BCH_SB_FIELD_quota: */ @@ -1436,7 +1436,7 @@ struct bch_sb_field_disk_groups { x(move_extent_read, 35) \ x(move_extent_write, 36) \ x(move_extent_finish, 37) \ - x(move_extent_race, 38) \ + x(move_extent_fail, 38) \ x(move_extent_alloc_mem_fail, 39) \ x(copygc, 40) \ x(copygc_wait, 41) \ @@ -1705,7 +1705,6 @@ LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); -/* Obsolete, always enabled: */ LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index e13ce07f..72d95831 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -24,7 +24,7 @@ const char * const bch2_bkey_types[] = { }; static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { return 0; } @@ -38,7 +38,7 @@ static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, }) static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (bkey_val_bytes(k.k)) { prt_printf(err, "incorrect value size (%zu != 0)", @@ -54,7 +54,7 @@ static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, }) static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) { prt_printf(err, "incorrect value size (%zu != %zu)", @@ -74,7 +74,7 @@ static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, }) static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { return 0; } @@ -95,7 +95,7 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, }) static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (bkey_val_bytes(k.k)) { prt_printf(err, "incorrect value size (%zu != %zu)", @@ -124,14 +124,14 @@ const struct bkey_ops bch2_bkey_ops[] = { }; int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (k.k->type >= KEY_TYPE_MAX) { prt_printf(err, "invalid type (%u >= %u)", k.k->type, 
KEY_TYPE_MAX); return -BCH_ERR_invalid_bkey; } - return bch2_bkey_ops[k.k->type].key_invalid(c, k, rw, err); + return bch2_bkey_ops[k.k->type].key_invalid(c, k, flags, err); } static unsigned bch2_key_types_allowed[] = { @@ -207,7 +207,7 @@ static unsigned bch2_key_types_allowed[] = { int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (k.k->u64s < BKEY_U64s) { prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); @@ -216,7 +216,7 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) { prt_printf(err, "invalid key type for btree %s (%s)", - bch2_btree_ids[type], bch2_bkey_types[type]); + bch2_btree_ids[type], bch2_bkey_types[k.k->type]); return -BCH_ERR_invalid_bkey; } @@ -263,10 +263,10 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { - return __bch2_bkey_invalid(c, k, type, rw, err) ?: - bch2_bkey_val_invalid(c, k, rw, err); + return __bch2_bkey_invalid(c, k, type, flags, err) ?: + bch2_bkey_val_invalid(c, k, flags, err); } int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, @@ -374,7 +374,11 @@ bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; - return bch2_bkey_maybe_mergable(l.k, r.k) && ops->key_merge(c, l, r); + return bch2_bkey_maybe_mergable(l.k, r.k) && + (u64) l.k->size + r.k->size <= KEY_SIZE_MAX && + bch2_bkey_ops[l.k->type].key_merge && + !bch2_key_merging_disabled && + ops->key_merge(c, l, r); } static const struct old_bkey_type { diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h index 2cbb0f39..9a6afab8 100644 --- a/libbcachefs/bkey_methods.h +++ b/libbcachefs/bkey_methods.h @@ -21,7 +21,7 @@ extern const char * const bch2_bkey_types[]; */ struct bkey_ops { int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err); + unsigned flags, struct printbuf *err); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(struct bkey_s); @@ -38,11 +38,13 @@ struct bkey_ops { extern const struct bkey_ops bch2_bkey_ops[]; -int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +#define BKEY_INVALID_FROM_JOURNAL (1 << 1) + +int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, - enum btree_node_type, int, struct printbuf *); + enum btree_node_type, unsigned, struct printbuf *); int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, - enum btree_node_type, int, struct printbuf *); + enum btree_node_type, unsigned, struct printbuf *); int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *); void bch2_bpos_to_text(struct printbuf *, struct bpos); @@ -60,10 +62,7 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b { return l->type == r->type && !bversion_cmp(l->version, r->version) && - bpos_eq(l->p, bkey_start_pos(r)) && - (u64) l->size + r->size <= KEY_SIZE_MAX && - bch2_bkey_ops[l->type].key_merge && - !bch2_key_merging_disabled; + bpos_eq(l->p, bkey_start_pos(r)); } bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -82,7 +81,9 @@ static inline int 
bch2_mark_key(struct btree_trans *trans, enum btree_update_flags { __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, + __BTREE_UPDATE_NOJOURNAL, __BTREE_UPDATE_KEY_CACHE_RECLAIM, + __BTREE_UPDATE_NO_KEY_CACHE_COHERENCY, __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ @@ -95,7 +96,10 @@ enum btree_update_flags { }; #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) +#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL) #define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) +#define BTREE_UPDATE_NO_KEY_CACHE_COHERENCY \ + (1U << __BTREE_UPDATE_NO_KEY_CACHE_COHERENCY) #define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index 544e2dfb..89478fc5 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -36,16 +36,7 @@ static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) { - unsigned offset = __btree_node_key_to_offset(b, k); - struct bset_tree *t; - - for_each_bset(b, t) - if (offset <= t->end_offset) { - EBUG_ON(offset < btree_bkey_first_offset(t)); - return t; - } - - BUG(); + return bch2_bkey_to_bset_inlined(b, k); } /* diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h index acef1430..fd2915a1 100644 --- a/libbcachefs/bset.h +++ b/libbcachefs/bset.h @@ -291,6 +291,21 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b, return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); } +static inline struct bset_tree * +bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k) +{ + unsigned offset = __btree_node_key_to_offset(b, k); + struct bset_tree *t; + + for_each_bset(b, t) + if (offset <= t->end_offset) { + EBUG_ON(offset < btree_bkey_first_offset(t)); + return t; + } + + BUG(); +} + struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index b5e78042..d10257e1 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -12,6 +12,7 @@ #include <linux/prefetch.h> #include <linux/sched/mm.h> +#include <linux/seq_buf.h> #include <trace/events/bcachefs.h> #define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \ @@ -427,12 +428,16 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, return btree_cache_can_free(bc); } -static void bch2_btree_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink) +static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) { struct bch_fs *c = container_of(shrink, struct bch_fs, btree_cache.shrink); + char *cbuf; + size_t buflen = seq_buf_get_buf(s, &cbuf); + struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); - bch2_btree_cache_to_text(out, &c->btree_cache); + bch2_btree_cache_to_text(&out, &c->btree_cache); + seq_buf_commit(s, out.pos); } void bch2_fs_btree_cache_exit(struct bch_fs *c) @@ -1090,7 +1095,7 @@ retry: goto out; } else { lock_node: - ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read); + ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ERR_PTR(ret); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 68796e19..0145746c 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -526,11 +526,10 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, 
struct btree *b, struct bset *i, unsigned offset, int write) { - prt_printf(out, bch2_log_msg(c, "")); - if (!write) - prt_str(out, "error validating btree node "); - else - prt_str(out, "corrupt btree node before write "); + prt_printf(out, bch2_log_msg(c, "%s"), + write == READ + ? "error validating btree node " + : "corrupt btree node before write "); if (ca) prt_printf(out, "on %s ", ca->name); prt_printf(out, "at btree "); @@ -543,63 +542,96 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, } enum btree_err_type { + /* + * We can repair this locally, and we're after the checksum check so + * there's no need to try another replica: + */ BTREE_ERR_FIXABLE, + /* + * We can repair this if we have to, but we should try reading another + * replica if we can: + */ BTREE_ERR_WANT_RETRY, + /* + * Read another replica if we have one, otherwise consider the whole + * node bad: + */ BTREE_ERR_MUST_RETRY, - BTREE_ERR_FATAL, + BTREE_ERR_BAD_NODE, + BTREE_ERR_INCOMPATIBLE, }; enum btree_validate_ret { BTREE_RETRY_READ = 64, }; +static int __btree_err(enum btree_err_type type, + struct bch_fs *c, + struct bch_dev *ca, + struct btree *b, + struct bset *i, + int write, + bool have_retry, + const char *fmt, ...) +{ + struct printbuf out = PRINTBUF; + va_list args; + int ret = -BCH_ERR_fsck_fix; + + btree_err_msg(&out, c, ca, b, i, b->written, write); + + va_start(args, fmt); + prt_vprintf(&out, fmt, args); + va_end(args); + + if (write == WRITE) { + bch2_print_string_as_lines(KERN_ERR, out.buf); + ret = c->opts.errors == BCH_ON_ERROR_continue + ? 0 + : -BCH_ERR_fsck_errors_not_fixed; + goto out; + } + + if (!have_retry && type == BTREE_ERR_WANT_RETRY) + type = BTREE_ERR_FIXABLE; + if (!have_retry && type == BTREE_ERR_MUST_RETRY) + type = BTREE_ERR_BAD_NODE; + + switch (type) { + case BTREE_ERR_FIXABLE: + mustfix_fsck_err(c, "%s", out.buf); + ret = -BCH_ERR_fsck_fix; + break; + case BTREE_ERR_WANT_RETRY: + case BTREE_ERR_MUST_RETRY: + bch2_print_string_as_lines(KERN_ERR, out.buf); + ret = BTREE_RETRY_READ; + break; + case BTREE_ERR_BAD_NODE: + bch2_print_string_as_lines(KERN_ERR, out.buf); + bch2_topology_error(c); + ret = -BCH_ERR_need_topology_repair; + break; + case BTREE_ERR_INCOMPATIBLE: + bch2_print_string_as_lines(KERN_ERR, out.buf); + ret = -BCH_ERR_fsck_errors_not_fixed; + break; + default: + BUG(); + } +out: +fsck_err: + printbuf_exit(&out); + return ret; +} + #define btree_err(type, c, ca, b, i, msg, ...) 
\ ({ \ - __label__ out; \ - struct printbuf out = PRINTBUF; \ + int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\ \ - btree_err_msg(&out, c, ca, b, i, b->written, write); \ - prt_printf(&out, msg, ##__VA_ARGS__); \ - \ - if (type == BTREE_ERR_FIXABLE && \ - write == READ && \ - !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ - mustfix_fsck_err(c, "%s", out.buf); \ - goto out; \ - } \ - \ - bch2_print_string_as_lines(KERN_ERR, out.buf); \ - \ - switch (write) { \ - case READ: \ - switch (type) { \ - case BTREE_ERR_FIXABLE: \ - ret = -BCH_ERR_fsck_errors_not_fixed; \ - goto fsck_err; \ - case BTREE_ERR_WANT_RETRY: \ - if (have_retry) { \ - ret = BTREE_RETRY_READ; \ - goto fsck_err; \ - } \ - break; \ - case BTREE_ERR_MUST_RETRY: \ - ret = BTREE_RETRY_READ; \ - goto fsck_err; \ - case BTREE_ERR_FATAL: \ - ret = -BCH_ERR_fsck_errors_not_fixed; \ - goto fsck_err; \ - } \ - break; \ - case WRITE: \ - if (bch2_fs_inconsistent(c)) { \ - ret = -BCH_ERR_fsck_errors_not_fixed; \ - goto fsck_err; \ - } \ - break; \ - } \ -out: \ - printbuf_exit(&out); \ - true; \ + if (_ret != -BCH_ERR_fsck_fix) \ + goto fsck_err; \ + *saw_error = true; \ }) #define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false) @@ -608,6 +640,7 @@ out: \ * When btree topology repair changes the start or end of a node, that might * mean we have to drop keys that are no longer inside the node: */ +__cold void bch2_btree_node_drop_keys_outside_node(struct btree *b) { struct bset_tree *t; @@ -658,7 +691,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) static int validate_bset(struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, unsigned offset, unsigned sectors, - int write, bool have_retry) + int write, bool have_retry, bool *saw_error) { unsigned version = le16_to_cpu(i->version); const char *err; @@ -669,7 +702,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on((version != BCH_BSET_VERSION_OLD && version < bcachefs_metadata_version_min) || version >= bcachefs_metadata_version_max, - BTREE_ERR_FATAL, c, ca, b, i, + BTREE_ERR_INCOMPATIBLE, c, ca, b, i, "unsupported bset version"); if (btree_err_on(version < c->sb.version_min, @@ -693,7 +726,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, } btree_err_on(BSET_SEPARATE_WHITEOUTS(i), - BTREE_ERR_FATAL, c, ca, b, i, + BTREE_ERR_INCOMPATIBLE, c, ca, b, i, "BSET_SEPARATE_WHITEOUTS no longer supported"); if (btree_err_on(offset + sectors > btree_sectors(c), @@ -770,7 +803,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, err = bch2_bkey_format_validate(&bn->format); btree_err_on(err, - BTREE_ERR_FATAL, c, ca, b, i, + BTREE_ERR_BAD_NODE, c, ca, b, i, "invalid bkey format: %s", err); compat_bformat(b->c.level, b->c.btree_id, version, @@ -795,7 +828,8 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b, } static int validate_bset_keys(struct bch_fs *c, struct btree *b, - struct bset *i, int write, bool have_retry) + struct bset *i, int write, + bool have_retry, bool *saw_error) { unsigned version = le16_to_cpu(i->version); struct bkey_packed *k, *prev = NULL; @@ -882,7 +916,7 @@ fsck_err: } int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - struct btree *b, bool have_retry) + struct btree *b, bool have_retry, bool *saw_error) { struct btree_node_entry *bne; struct sort_iter *iter; @@ -897,7 +931,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, unsigned blacklisted_written, 
nonblacklisted_written = 0; unsigned ptr_written = btree_ptr_sectors_written(&b->key); struct printbuf buf = PRINTBUF; - int ret, retry_read = 0, write = READ; + int ret = 0, retry_read = 0, write = READ; b->version_ondisk = U16_MAX; /* We might get called multiple times on read retry: */ @@ -958,7 +992,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), - BTREE_ERR_FATAL, c, NULL, b, NULL, + BTREE_ERR_INCOMPATIBLE, c, NULL, b, NULL, "btree node does not have NEW_EXTENT_OVERWRITE set"); sectors = vstruct_sectors(b->data, c->block_bits); @@ -993,14 +1027,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, le16_to_cpu(i->version)); ret = validate_bset(c, ca, b, i, b->written, sectors, - READ, have_retry); + READ, have_retry, saw_error); if (ret) goto fsck_err; if (!b->written) btree_node_set_format(b, b->data->format); - ret = validate_bset_keys(c, b, i, READ, have_retry); + ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error); if (ret) goto fsck_err; @@ -1140,12 +1174,10 @@ out: printbuf_exit(&buf); return retry_read; fsck_err: - if (ret == BTREE_RETRY_READ) { + if (ret == BTREE_RETRY_READ) retry_read = 1; - } else { - bch2_inconsistent_error(c); + else set_btree_node_read_error(b); - } goto out; } @@ -1195,7 +1227,7 @@ start: &failed, &rb->pick) > 0; if (!bio->bi_status && - !bch2_btree_node_read_done(c, ca, b, can_retry)) { + !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) { if (retry) bch_info(c, "retry success"); break; @@ -1301,6 +1333,7 @@ static void btree_node_read_all_replicas_done(struct closure *cl) unsigned i, written = 0, written2 = 0; __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 ? 
bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; + bool _saw_error = false, *saw_error = &_saw_error; for (i = 0; i < ra->nr; i++) { struct btree_node *bn = ra->buf[i]; @@ -1387,13 +1420,15 @@ fsck_err: if (best >= 0) { memcpy(b->data, ra->buf[best], btree_bytes(c)); - ret = bch2_btree_node_read_done(c, NULL, b, false); + ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error); } else { ret = -1; } if (ret) set_btree_node_read_error(b); + else if (*saw_error) + bch2_btree_node_rewrite_async(c, b); for (i = 0; i < ra->nr; i++) { mempool_free(ra->buf[i], &c->btree_bounce_pool); @@ -1770,6 +1805,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, struct bset *i, unsigned sectors) { struct printbuf buf = PRINTBUF; + bool saw_error; int ret; ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), @@ -1781,8 +1817,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, if (ret) return ret; - ret = validate_bset_keys(c, b, i, WRITE, false) ?: - validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false); + ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?: + validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error); if (ret) { bch2_inconsistent_error(c); dump_stack(); diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index a720dd74..c43fb60b 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -129,7 +129,7 @@ void bch2_btree_build_aux_trees(struct btree *); void bch2_btree_init_next(struct btree_trans *, struct btree *); int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, - struct btree *, bool); + struct btree *, bool, bool *); void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 9c139a7b..077d72bf 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -20,18 +20,13 @@ #include <linux/prefetch.h> #include <trace/events/bcachefs.h> -static void btree_trans_verify_sorted(struct btree_trans *); -inline void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int); -static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *, - struct btree_path *, int); - static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, struct btree_path *); static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) { -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef TRACK_PATH_ALLOCATED return iter->ip_allocated; #else return 0; @@ -353,6 +348,8 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, unsigned idx; struct printbuf buf = PRINTBUF; + btree_trans_sort_paths(trans); + trans_for_each_path_inorder(trans, path, idx) { int cmp = cmp_int(path->btree_id, id) ?: cmp_int(path->cached, key_cache); @@ -540,7 +537,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, unsigned clobber_u64s, unsigned new_u64s) { - struct bset_tree *t = bch2_bkey_to_bset(b, where); + struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where); struct btree_path *linked; if (node_iter != &path->l[b->c.level].iter) { @@ -595,6 +592,7 @@ static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, bch2_btree_node_iter_peek(&l->iter, l->b)); path->pos = k.k ? 
k.k->p : l->b->key.k.p; + trans->paths_sorted = false; bch2_btree_path_verify_level(trans, path, l - path->l); return k; } @@ -608,6 +606,7 @@ static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, bch2_btree_node_iter_prev(&l->iter, l->b)); path->pos = k.k ? k.k->p : l->b->data->min_key; + trans->paths_sorted = false; bch2_btree_path_verify_level(trans, path, l - path->l); return k; } @@ -963,15 +962,13 @@ err: return ret; } -static int btree_path_traverse_one(struct btree_trans *, struct btree_path *, - unsigned, unsigned long); static int bch2_btree_path_traverse_all(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct btree_path *path; unsigned long trace_ip = _RET_IP_; - int ret = 0; + int i, ret = 0; if (trans->in_traverse_all) return -BCH_ERR_transaction_restart_in_traverse_all; @@ -979,12 +976,11 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) trans->in_traverse_all = true; retry_all: trans->restarted = 0; - trans->traverse_all_idx = U8_MAX; trans_for_each_path(trans, path) path->should_be_locked = false; - btree_trans_verify_sorted(trans); + btree_trans_sort_paths(trans); bch2_trans_unlock(trans); cond_resched(); @@ -1001,34 +997,35 @@ retry_all: } /* Now, redo traversals in correct order: */ - trans->traverse_all_idx = 0; - while (trans->traverse_all_idx < trans->nr_sorted) { - path = trans->paths + trans->sorted[trans->traverse_all_idx]; + i = 0; + while (i < trans->nr_sorted) { + path = trans->paths + trans->sorted[i]; /* * Traversing a path can cause another path to be added at about * the same position: */ if (path->uptodate) { - ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); + __btree_path_get(path, false); + ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_); + __btree_path_put(path, false); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || ret == -ENOMEM) goto retry_all; if (ret) goto err; - BUG_ON(path->uptodate); } else { - trans->traverse_all_idx++; + i++; } } /* - * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock() - * and relock(), relock() won't relock since path->should_be_locked - * isn't set yet, which is all fine + * We used to assert that all paths had been traversed here + * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since + * path->Should_be_locked is not set yet, we we might have unlocked and + * then failed to relock a path - that's fine. */ - trans_for_each_path(trans, path) - BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE); err: bch2_btree_cache_cannibalize_unlock(c); @@ -1115,10 +1112,10 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, * On error, caller (peek_node()/peek_key()) must return NULL; the error is * stashed in the iterator and returned from bch2_trans_exit(). 
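 *
 * Illustrative sketch only, not part of this patch: a peek_node()/peek_key()
 * style caller is expected to follow roughly this pattern, using the
 * bch2_btree_path_traverse() wrapper declared in btree_iter.h:
 *
 *	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
 *	if (ret)
 *		return NULL;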
*/ -static int btree_path_traverse_one(struct btree_trans *trans, - struct btree_path *path, - unsigned flags, - unsigned long trace_ip) +int bch2_btree_path_traverse_one(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, + unsigned long trace_ip) { unsigned depth_want = path->level; int ret = -((int) trans->restarted); @@ -1177,31 +1174,14 @@ static int btree_path_traverse_one(struct btree_trans *trans, path->uptodate = BTREE_ITER_UPTODATE; out: - BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted) + panic("ret %s (%i) trans->restarted %s (%i)\n", + bch2_err_str(ret), ret, + bch2_err_str(trans->restarted), trans->restarted); bch2_btree_path_verify(trans, path); return ret; } -int __must_check bch2_btree_path_traverse(struct btree_trans *trans, - struct btree_path *path, unsigned flags) -{ - if (0 && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { - unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U); - u64 mask = ~(~0ULL << restart_probability_bits); - - if ((prandom_u32() & mask) == mask) { - trace_and_count(trans->c, trans_restart_injected, trans, _RET_IP_); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); - } - } - - if (path->uptodate < BTREE_ITER_NEED_RELOCK) - return 0; - - return bch2_trans_cond_resched(trans) ?: - btree_path_traverse_one(trans, path, flags, _RET_IP_); -} - static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, struct btree_path *src) { @@ -1237,10 +1217,6 @@ struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans, __btree_path_put(path, intent); path = btree_path_clone(trans, path, intent); path->preserve = false; -#ifdef CONFIG_BCACHEFS_DEBUG - path->ip_allocated = ip; -#endif - btree_trans_verify_sorted(trans); return path; } @@ -1251,14 +1227,13 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, { unsigned level = path->level; - EBUG_ON(trans->restarted); + bch2_trans_verify_not_in_restart(trans); EBUG_ON(!path->ref); path = bch2_btree_path_make_mut(trans, path, intent, ip); - path->pos = new_pos; - - bch2_btree_path_check_sort_fast(trans, path, cmp); + path->pos = new_pos; + trans->paths_sorted = false; if (unlikely(path->cached)) { btree_node_unlock(trans, path, 0); @@ -1381,6 +1356,21 @@ static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *p __bch2_path_free(trans, path); } +void bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) +{ + panic("trans->restart_count %u, should be %u, last restarted by %pS\n", + trans->restart_count, restart_count, + (void *) trans->last_restarted_ip); +} + +void bch2_trans_in_restart_error(struct btree_trans *trans) +{ + panic("in transaction restart: %s, last restarted by %pS\n", + bch2_err_str(trans->restarted), + (void *) trans->last_restarted_ip); +} + +noinline __cold void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) { struct btree_insert_entry *i; @@ -1421,6 +1411,7 @@ void bch2_dump_trans_updates(struct btree_trans *trans) printbuf_exit(&buf); } +noinline __cold void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path) { prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ", @@ -1432,39 +1423,59 @@ void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path) bch2_bpos_to_text(out, path->pos); prt_printf(out, " locks %u", path->nodes_locked); -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef 
TRACK_PATH_ALLOCATED prt_printf(out, " %pS", (void *) path->ip_allocated); #endif prt_newline(out); } -void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans) +noinline __cold +void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans, + bool nosort) { struct btree_path *path; unsigned idx; + if (!nosort) + btree_trans_sort_paths(trans); + trans_for_each_path_inorder(trans, path, idx) bch2_btree_path_to_text(out, path); } noinline __cold -void bch2_dump_trans_paths_updates(struct btree_trans *trans) +void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans) +{ + __bch2_trans_paths_to_text(out, trans, false); +} + +noinline __cold +void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) { struct printbuf buf = PRINTBUF; - bch2_trans_paths_to_text(&buf, trans); + __bch2_trans_paths_to_text(&buf, trans, nosort); bch2_trans_updates_to_text(&buf, trans); bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); } -noinline +noinline __cold +void bch2_dump_trans_paths_updates(struct btree_trans *trans) +{ + __bch2_dump_trans_paths_updates(trans, false); +} + +noinline __cold static void bch2_trans_update_max_paths(struct btree_trans *trans) { struct btree_transaction_stats *s = btree_trans_stats(trans); struct printbuf buf = PRINTBUF; + if (!s) + return; + bch2_trans_paths_to_text(&buf, trans); if (!buf.allocation_failure) { @@ -1478,6 +1489,8 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans) } printbuf_exit(&buf); + + trans->nr_max_paths = hweight64(trans->paths_allocated); } static noinline void btree_path_overflow(struct btree_trans *trans) @@ -1497,19 +1510,24 @@ static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, btree_path_overflow(trans); idx = __ffs64(~trans->paths_allocated); - trans->paths_allocated |= 1ULL << idx; + /* + * Do this before marking the new path as allocated, since it won't be + * initialized yet: + */ if (unlikely(idx > trans->nr_max_paths)) bch2_trans_update_max_paths(trans); - path = &trans->paths[idx]; + trans->paths_allocated |= 1ULL << idx; + path = &trans->paths[idx]; path->idx = idx; path->ref = 0; path->intent_ref = 0; path->nodes_locked = 0; btree_path_list_add(trans, pos, path); + trans->paths_sorted = false; return path; } @@ -1523,10 +1541,11 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool intent = flags & BTREE_ITER_INTENT; int i; - EBUG_ON(trans->restarted); - btree_trans_verify_sorted(trans); + bch2_trans_verify_not_in_restart(trans); bch2_trans_verify_locks(trans); + btree_trans_sort_paths(trans); + trans_for_each_path_inorder(trans, path, i) { if (__btree_path_cmp(path, btree_id, @@ -1559,10 +1578,10 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, path->nodes_locked = 0; for (i = 0; i < ARRAY_SIZE(path->l); i++) path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init); -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef TRACK_PATH_ALLOCATED path->ip_allocated = ip; #endif - btree_trans_verify_sorted(trans); + trans->paths_sorted = false; } if (!(flags & BTREE_ITER_NOPRESERVE)) @@ -1613,7 +1632,8 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * EBUG_ON(ck && (path->btree_id != ck->key.btree_id || !bkey_eq(path->pos, ck->key.pos))); - EBUG_ON(!ck || !ck->valid); + if (!ck || !ck->valid) + return bkey_s_c_null; *u = ck->k->k; k = bkey_i_to_s_c(ck->k); @@ -1697,7 +1717,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) struct btree *b = NULL; int 
ret; - BUG_ON(trans->restarted); + bch2_trans_verify_not_in_restart(trans); EBUG_ON(iter->path->cached); bch2_btree_iter_verify(iter); @@ -1798,19 +1818,18 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) return ret; } -static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, - enum btree_id btree_id, - struct bpos pos) +static noinline +struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter) { struct btree_insert_entry *i; struct bkey_i *ret = NULL; - trans_for_each_update(trans, i) { - if (i->btree_id < btree_id) + trans_for_each_update(iter->trans, i) { + if (i->btree_id < iter->btree_id) continue; - if (i->btree_id > btree_id) + if (i->btree_id > iter->btree_id) break; - if (bpos_lt(i->k->k.p, pos)) + if (bpos_lt(i->k->k.p, iter->path->pos)) continue; if (i->key_cache_already_flushed) continue; @@ -1821,30 +1840,44 @@ static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, return ret; } +static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) +{ + return iter->flags & BTREE_ITER_WITH_UPDATES + ? __bch2_btree_trans_peek_updates(iter) + : NULL; +} + struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, struct btree_iter *iter, - struct bpos start_pos, struct bpos end_pos) { struct bkey_i *k; - if (bpos_lt(start_pos, iter->journal_pos)) + if (bpos_lt(iter->path->pos, iter->journal_pos)) iter->journal_idx = 0; k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, iter->path->level, - start_pos, end_pos, + iter->path->pos, + end_pos, &iter->journal_idx); iter->journal_pos = k ? k->k.p : end_pos; return k; } -struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos pos) +static noinline +struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, + struct btree_iter *iter) { - return bch2_btree_journal_peek(trans, iter, pos, pos); + struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos); + + if (k) { + iter->k = k->k; + return bkey_i_to_s_c(k); + } else { + return bkey_s_c_null; + } } static noinline @@ -1853,7 +1886,7 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, struct bkey_s_c k) { struct bkey_i *next_journal = - bch2_btree_journal_peek(trans, iter, iter->path->pos, + bch2_btree_journal_peek(trans, iter, k.k ? 
k.k->p : path_l(iter->path)->b->key.k.p); if (next_journal) { @@ -1869,42 +1902,46 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, * bkey_s_c_null: */ static noinline -struct bkey_s_c __btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) +struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) { struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; struct bkey u; + struct bkey_s_c k; int ret; + if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) && + bpos_eq(iter->pos, pos)) + return bkey_s_c_null; + if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) return bkey_s_c_null; if (!iter->key_cache_path) iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, iter->flags & BTREE_ITER_INTENT, 0, - iter->flags|BTREE_ITER_CACHED, + iter->flags|BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL, _THIS_IP_); iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED); + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, + iter->flags|BTREE_ITER_CACHED) ?: + bch2_btree_path_relock(trans, iter->path, _THIS_IP_); if (unlikely(ret)) return bkey_s_c_err(ret); btree_path_set_should_be_locked(iter->key_cache_path); - return bch2_btree_path_peek_slot(iter->key_cache_path, &u); -} - -static noinline -struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) -{ - struct bkey_s_c ret = __btree_trans_peek_key_cache(iter, pos); - int err = bkey_err(ret) ?: bch2_btree_path_relock(iter->trans, iter->path, _THIS_IP_); - - return err ? bkey_s_c_err(err) : ret; + k = bch2_btree_path_peek_slot(iter->key_cache_path, &u); + if (k.k && !bkey_err(k)) { + iter->k = u; + k.k = &iter->k; + } + return k; } static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) @@ -1959,9 +1996,8 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) k = btree_trans_peek_journal(trans, iter, k); - next_update = iter->flags & BTREE_ITER_WITH_UPDATES - ? btree_trans_peek_updates(trans, iter->btree_id, search_key) - : NULL; + next_update = btree_trans_peek_updates(iter); + if (next_update && bpos_le(next_update->k.p, k.k ? 
k.k->p : l->b->key.k.p)) { @@ -2114,8 +2150,8 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e btree_path_set_should_be_locked(iter->path); out_no_locked: if (iter->update_path) { - if (iter->update_path->uptodate && - (ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_))) + ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_); + if (unlikely(ret)) k = bkey_s_c_err(ret); else btree_path_set_should_be_locked(iter->update_path); @@ -2293,8 +2329,6 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) k = btree_path_level_prev(trans, iter->path, &iter->path->l[0], &iter->k); - bch2_btree_path_check_sort(trans, iter->path, 0); - if (likely(k.k)) { if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { if (k.k->p.snapshot == iter->snapshot) @@ -2419,9 +2453,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { struct bkey_i *next_update; - if ((iter->flags & BTREE_ITER_WITH_UPDATES) && - (next_update = btree_trans_peek_updates(trans, - iter->btree_id, search_key)) && + if ((next_update = btree_trans_peek_updates(iter)) && bpos_eq(next_update->k.p, iter->pos)) { iter->k = next_update->k; k = bkey_i_to_s_c(next_update); @@ -2429,15 +2461,11 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && - (next_update = bch2_btree_journal_peek_slot(trans, - iter, iter->pos))) { - iter->k = next_update->k; - k = bkey_i_to_s_c(next_update); + (k = btree_trans_peek_slot_journal(trans, iter)).k) goto out; - } if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && - (k = __btree_trans_peek_key_cache(iter, iter->pos)).k) { + (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { if (!bkey_err(k)) iter->k = *k.k; /* We're not returning a key from iter->path: */ @@ -2529,27 +2557,29 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) /* new transactional stuff: */ -static inline void btree_path_verify_sorted_ref(struct btree_trans *trans, - struct btree_path *path) -{ - EBUG_ON(path->sorted_idx >= trans->nr_sorted); - EBUG_ON(trans->sorted[path->sorted_idx] != path->idx); - EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); -} - -static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) -{ #ifdef CONFIG_BCACHEFS_DEBUG +static void btree_trans_verify_sorted_refs(struct btree_trans *trans) +{ + struct btree_path *path; unsigned i; - for (i = 0; i < trans->nr_sorted; i++) - btree_path_verify_sorted_ref(trans, trans->paths + trans->sorted[i]); -#endif + BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated)); + + trans_for_each_path(trans, path) { + BUG_ON(path->sorted_idx >= trans->nr_sorted); + BUG_ON(trans->sorted[path->sorted_idx] != path->idx); + } + + for (i = 0; i < trans->nr_sorted; i++) { + unsigned idx = trans->sorted[i]; + + EBUG_ON(!(trans->paths_allocated & (1ULL << idx))); + BUG_ON(trans->paths[idx].sorted_idx != i); + } } static void btree_trans_verify_sorted(struct btree_trans *trans) { -#ifdef CONFIG_BCACHEFS_DEBUG struct btree_path *path, *prev = NULL; unsigned i; @@ -2558,80 +2588,54 @@ static void btree_trans_verify_sorted(struct btree_trans *trans) trans_for_each_path_inorder(trans, path, i) { if (prev && btree_path_cmp(prev, path) > 0) { - bch2_dump_trans_paths_updates(trans); + __bch2_dump_trans_paths_updates(trans, true); panic("trans paths out of order!\n"); } prev = path; } +} +#else +static inline void 
btree_trans_verify_sorted_refs(struct btree_trans *trans) {} +static inline void btree_trans_verify_sorted(struct btree_trans *trans) {} #endif -} -static inline void btree_path_swap(struct btree_trans *trans, - struct btree_path *l, struct btree_path *r) +void __bch2_btree_trans_sort_paths(struct btree_trans *trans) { - swap(l->sorted_idx, r->sorted_idx); - swap(trans->sorted[l->sorted_idx], - trans->sorted[r->sorted_idx]); + int i, l = 0, r = trans->nr_sorted, inc = 1; + bool swapped; - btree_path_verify_sorted_ref(trans, l); - btree_path_verify_sorted_ref(trans, r); -} + btree_trans_verify_sorted_refs(trans); -static inline struct btree_path *sib_btree_path(struct btree_trans *trans, - struct btree_path *path, int sib) -{ - unsigned idx = (unsigned) path->sorted_idx + sib; + if (trans->paths_sorted) + goto out; - EBUG_ON(sib != -1 && sib != 1); + /* + * Cocktail shaker sort: this is efficient because iterators will be + * mostly sorted. + */ + do { + swapped = false; - return idx < trans->nr_sorted - ? trans->paths + trans->sorted[idx] - : NULL; -} - -static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *trans, - struct btree_path *path, - int cmp) -{ - struct btree_path *n; - int cmp2; - - EBUG_ON(!cmp); - - while ((n = sib_btree_path(trans, path, cmp)) && - (cmp2 = btree_path_cmp(n, path)) && - cmp2 != cmp) - btree_path_swap(trans, n, path); - - btree_trans_verify_sorted(trans); -} - -inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path, - int cmp) -{ - struct btree_path *n; - - if (cmp <= 0) { - n = prev_btree_path(trans, path); - if (n && btree_path_cmp(n, path) > 0) { - do { - btree_path_swap(trans, n, path); - n = prev_btree_path(trans, path); - } while (n && btree_path_cmp(n, path) > 0); - - goto out; + for (i = inc > 0 ? l : r - 2; + i + 1 < r && i >= l; + i += inc) { + if (btree_path_cmp(trans->paths + trans->sorted[i], + trans->paths + trans->sorted[i + 1]) > 0) { + swap(trans->sorted[i], trans->sorted[i + 1]); + trans->paths[trans->sorted[i]].sorted_idx = i; + trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1; + swapped = true; + } } - } - if (cmp >= 0) { - n = next_btree_path(trans, path); - if (n && btree_path_cmp(path, n) > 0) { - do { - btree_path_swap(trans, path, n); - n = next_btree_path(trans, path); - } while (n && btree_path_cmp(path, n) > 0); - } - } + if (inc > 0) + --r; + else + l++; + inc = -inc; + } while (swapped); + + trans->paths_sorted = true; out: btree_trans_verify_sorted(trans); } @@ -2642,15 +2646,18 @@ static inline void btree_path_list_remove(struct btree_trans *trans, unsigned i; EBUG_ON(path->sorted_idx >= trans->nr_sorted); - +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + trans->nr_sorted--; + memmove_u64s_down_small(trans->sorted + path->sorted_idx, + trans->sorted + path->sorted_idx + 1, + DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); +#else array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); - +#endif for (i = path->sorted_idx; i < trans->nr_sorted; i++) trans->paths[trans->sorted[i]].sorted_idx = i; path->sorted_idx = U8_MAX; - - btree_trans_verify_sorted_refs(trans); } static inline void btree_path_list_add(struct btree_trans *trans, @@ -2659,16 +2666,17 @@ static inline void btree_path_list_add(struct btree_trans *trans, { unsigned i; - btree_trans_verify_sorted_refs(trans); - - path->sorted_idx = pos ? 
pos->sorted_idx + 1 : 0; - - if (unlikely(trans->in_traverse_all) && - trans->traverse_all_idx != U8_MAX && - trans->traverse_all_idx >= path->sorted_idx) - trans->traverse_all_idx++; + path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted; +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1, + trans->sorted + path->sorted_idx, + DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); + trans->nr_sorted++; + trans->sorted[path->sorted_idx] = path->idx; +#else array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); +#endif for (i = path->sorted_idx; i < trans->nr_sorted; i++) trans->paths[trans->sorted[i]].sorted_idx = i; @@ -2812,14 +2820,6 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->restart_count++; trans->mem_top = 0; - if (trans->fs_usage_deltas) { - trans->fs_usage_deltas->used = 0; - memset((void *) trans->fs_usage_deltas + - offsetof(struct replicas_delta_list, memset_start), 0, - (void *) &trans->fs_usage_deltas->memset_end - - (void *) &trans->fs_usage_deltas->memset_start); - } - trans_for_each_path(trans, path) { path->should_be_locked = false; @@ -2850,25 +2850,19 @@ u32 bch2_trans_begin(struct btree_trans *trans) bch2_trans_relock(trans); } - if (unlikely(time_after(jiffies, trans->srcu_lock_time + HZ))) + if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10)))) bch2_trans_reset_srcu_lock(trans); trans->last_restarted_ip = _RET_IP_; - if (trans->restarted) + if (trans->restarted) { bch2_btree_path_traverse_all(trans); + trans->notrace_relock_fail = false; + } trans->last_begin_time = local_clock(); return trans->restart_count; } -void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count) -{ - if (trans_was_restarted(trans, restart_count)) - panic("trans->restart_count %u, should be %u, last restarted by %pS\n", - trans->restart_count, restart_count, - (void *) trans->last_restarted_ip); -} - static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) { size_t paths_bytes = sizeof(struct btree_path) * BTREE_ITER_MAX; @@ -2908,7 +2902,6 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_ __acquires(&c->btree_trans_barrier) { struct btree_transaction_stats *s; - struct btree_trans *pos; BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); @@ -2944,16 +2937,20 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); trans->srcu_lock_time = jiffies; - mutex_lock(&c->btree_trans_lock); - list_for_each_entry(pos, &c->btree_trans_list, list) { - if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) { - list_add_tail(&trans->list, &pos->list); - goto list_add_done; + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { + struct btree_trans *pos; + + mutex_lock(&c->btree_trans_lock); + list_for_each_entry(pos, &c->btree_trans_list, list) { + if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) { + list_add_tail(&trans->list, &pos->list); + goto list_add_done; + } } - } - list_add_tail(&trans->list, &c->btree_trans_list); + list_add_tail(&trans->list, &c->btree_trans_list); list_add_done: - mutex_unlock(&c->btree_trans_lock); + mutex_unlock(&c->btree_trans_lock); + } } static void check_btree_paths_leaked(struct btree_trans *trans) @@ -2998,9 +2995,11 @@ void bch2_trans_exit(struct btree_trans *trans) check_btree_paths_leaked(trans); - mutex_lock(&c->btree_trans_lock); - 
list_del(&trans->list); - mutex_unlock(&c->btree_trans_lock); + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { + mutex_lock(&c->btree_trans_lock); + list_del(&trans->list); + mutex_unlock(&c->btree_trans_lock); + } srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); @@ -3098,7 +3097,9 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) b = READ_ONCE(trans->locking); if (b) { - prt_str(out, " want"); + prt_printf(out, " blocked for %lluus on", + div_u64(local_clock() - trans->locking_wait.start_time, + 1000)); prt_newline(out); prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]); bch2_btree_bkey_cached_common_to_text(out, b); @@ -3112,8 +3113,10 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); - s++) + s++) { kfree(s->max_paths_text); + bch2_time_stats_exit(&s->lock_hold_times); + } if (c->btree_trans_barrier_initialized) cleanup_srcu_struct(&c->btree_trans_barrier); @@ -3123,11 +3126,16 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) int bch2_fs_btree_iter_init(struct bch_fs *c) { - unsigned i, nr = BTREE_ITER_MAX; + struct btree_transaction_stats *s; + unsigned nr = BTREE_ITER_MAX; int ret; - for (i = 0; i < ARRAY_SIZE(c->btree_transaction_stats); i++) - mutex_init(&c->btree_transaction_stats[i].lock); + for (s = c->btree_transaction_stats; + s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); + s++) { + bch2_time_stats_init(&s->lock_hold_times); + mutex_init(&s->lock); + } INIT_LIST_HEAD(&c->btree_trans_list); mutex_init(&c->btree_trans_lock); diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 07c415d5..0ede02c3 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -54,6 +54,16 @@ static inline struct btree *btree_node_parent(struct btree_path *path, /* Iterate over paths within a transaction: */ +void __bch2_btree_trans_sort_paths(struct btree_trans *); + +static inline void btree_trans_sort_paths(struct btree_trans *trans) +{ + if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + trans->paths_sorted) + return; + __bch2_btree_trans_sort_paths(trans); +} + static inline struct btree_path * __trans_next_path(struct btree_trans *trans, unsigned idx) { @@ -72,8 +82,6 @@ __trans_next_path(struct btree_trans *trans, unsigned idx) return &trans->paths[idx]; } -void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int); - #define trans_for_each_path_from(_trans, _path, _start) \ for (_path = __trans_next_path((_trans), _start); \ (_path); \ @@ -95,9 +103,10 @@ static inline struct btree_path *next_btree_path(struct btree_trans *trans, stru static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path) { - EBUG_ON(path->sorted_idx >= trans->nr_sorted); - return path->sorted_idx - ? trans->paths + trans->sorted[path->sorted_idx - 1] + unsigned idx = path ? path->sorted_idx : trans->nr_sorted; + + return idx + ? 
trans->paths + trans->sorted[idx - 1] : NULL; } @@ -106,6 +115,11 @@ static inline struct btree_path *prev_btree_path(struct btree_trans *trans, stru ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\ _i++) +#define trans_for_each_path_inorder_reverse(_trans, _path, _i) \ + for (_i = trans->nr_sorted - 1; \ + ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) >= 0;\ + --_i) + static inline bool __path_has_node(const struct btree_path *path, const struct btree *b) { @@ -161,6 +175,18 @@ bch2_btree_path_set_pos(struct btree_trans *trans, : path; } +int __must_check bch2_btree_path_traverse_one(struct btree_trans *, struct btree_path *, + unsigned, unsigned long); + +static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, + struct btree_path *path, unsigned flags) +{ + if (path->uptodate < BTREE_ITER_NEED_RELOCK) + return 0; + + return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_); +} + int __must_check bch2_btree_path_traverse(struct btree_trans *, struct btree_path *, unsigned); struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, @@ -193,6 +219,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); void bch2_path_put(struct btree_trans *, struct btree_path *, bool); int bch2_trans_relock(struct btree_trans *); +int bch2_trans_relock_notrace(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); bool bch2_trans_locked(struct btree_trans *); @@ -201,7 +228,22 @@ static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_co return restart_count != trans->restart_count; } -void bch2_trans_verify_not_restarted(struct btree_trans *, u32); +void bch2_trans_restart_error(struct btree_trans *, u32); + +static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans, + u32 restart_count) +{ + if (trans_was_restarted(trans, restart_count)) + bch2_trans_restart_error(trans, restart_count); +} + +void bch2_trans_in_restart_error(struct btree_trans *); + +static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans) +{ + if (trans->restarted) + bch2_trans_in_restart_error(trans); +} __always_inline static inline int btree_trans_restart_nounlock(struct btree_trans *trans, int err) diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index dcd1a479..13df0d40 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -12,6 +12,7 @@ #include "journal_reclaim.h" #include <linux/sched/mm.h> +#include <linux/seq_buf.h> #include <trace/events/bcachefs.h> static inline bool btree_uses_pcpu_readers(enum btree_id id) @@ -56,13 +57,12 @@ static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) if (!six_trylock_intent(&ck->c.lock)) return false; - if (!six_trylock_write(&ck->c.lock)) { + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { six_unlock_intent(&ck->c.lock); return false; } - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - six_unlock_write(&ck->c.lock); + if (!six_trylock_write(&ck->c.lock)) { six_unlock_intent(&ck->c.lock); return false; } @@ -197,6 +197,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck = NULL; bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); + int ret; if (!pcpu_readers) { #ifdef __KERNEL__ @@ -244,7 +245,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, if (ck) { int ret; - ret = btree_node_lock_nopath(trans, &ck->c, 
SIX_LOCK_intent); + ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_); if (unlikely(ret)) { bkey_cached_move_to_freelist(bc, ck); return ERR_PTR(ret); @@ -264,22 +265,33 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, return ck; } - /* GFP_NOFS because we're holding btree locks: */ - ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); - if (likely(ck)) { - INIT_LIST_HEAD(&ck->list); - __six_lock_init(&ck->c.lock, "b->c.lock", &bch2_btree_node_lock_key); - if (pcpu_readers) - six_lock_pcpu_alloc(&ck->c.lock); + ck = kmem_cache_zalloc(bch2_key_cache, GFP_NOWAIT|__GFP_NOWARN); + if (likely(ck)) + goto init; - ck->c.cached = true; - BUG_ON(!six_trylock_intent(&ck->c.lock)); - BUG_ON(!six_trylock_write(&ck->c.lock)); - *was_new = true; - return ck; + bch2_trans_unlock(trans); + + ck = kmem_cache_zalloc(bch2_key_cache, GFP_KERNEL); + + ret = bch2_trans_relock(trans); + if (ret) { + kmem_cache_free(bch2_key_cache, ck); + return ERR_PTR(ret); } - return NULL; + if (!ck) + return NULL; +init: + INIT_LIST_HEAD(&ck->list); + __six_lock_init(&ck->c.lock, "b->c.lock", &bch2_btree_node_lock_key); + if (pcpu_readers) + six_lock_pcpu_alloc(&ck->c.lock); + + ck->c.cached = true; + BUG_ON(!six_trylock_intent(&ck->c.lock)); + BUG_ON(!six_trylock_write(&ck->c.lock)); + *was_new = true; + return ck; } static struct bkey_cached * @@ -369,24 +381,23 @@ static int btree_key_cache_fill(struct btree_trans *trans, struct btree_path *ck_path, struct bkey_cached *ck) { - struct btree_path *path; + struct btree_iter iter; struct bkey_s_c k; unsigned new_u64s = 0; struct bkey_i *new_k = NULL; - struct bkey u; int ret; - path = bch2_path_get(trans, ck->key.btree_id, - ck->key.pos, 0, 0, 0, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, path, 0); + bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos, + BTREE_ITER_KEY_CACHE_FILL| + BTREE_ITER_CACHED_NOFILL); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); if (ret) goto err; - k = bch2_btree_path_peek_slot(path, &u); - if (!bch2_btree_node_relock(trans, ck_path, 0)) { trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); goto err; } @@ -405,12 +416,30 @@ static int btree_key_cache_fill(struct btree_trans *trans, if (new_u64s > ck->u64s) { new_u64s = roundup_pow_of_two(new_u64s); - new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); + new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); if (!new_k) { - bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_ids[ck->key.btree_id], new_u64s); - ret = -ENOMEM; - goto err; + bch2_trans_unlock(trans); + + new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); + if (!new_k) { + bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_ids[ck->key.btree_id], new_u64s); + ret = -ENOMEM; + goto err; + } + + if (!bch2_btree_node_relock(trans, ck_path, 0)) { + kfree(new_k); + trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); + goto err; + } + + ret = bch2_trans_relock(trans); + if (ret) { + kfree(new_k); + goto err; + } } } @@ -431,9 +460,9 @@ static int btree_key_cache_fill(struct btree_trans *trans, bch2_btree_node_unlock_write(trans, ck_path, 
ck_path->l[0].b); /* We're not likely to need this iterator again: */ - path->preserve = false; + set_btree_iter_dontneed(&iter); err: - bch2_path_put(trans, path, 0); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -449,7 +478,7 @@ bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree path->l[1].b = NULL; - if (bch2_btree_node_relock(trans, path, 0)) { + if (bch2_btree_node_relock_notrace(trans, path, 0)) { ck = (void *) path->l[0].b; goto fill; } @@ -487,7 +516,9 @@ retry: path->l[0].lock_seq = ck->c.lock.state.seq; path->l[0].b = (void *) ck; fill: - if (!ck->valid) { + path->uptodate = BTREE_ITER_UPTODATE; + + if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { /* * Using the underscore version because we haven't set * path->uptodate yet: @@ -502,17 +533,23 @@ fill: ret = btree_key_cache_fill(trans, path, ck); if (ret) goto err; + + ret = bch2_btree_path_relock(trans, path, _THIS_IP_); + if (ret) + goto err; + + path->uptodate = BTREE_ITER_UPTODATE; } if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) set_bit(BKEY_CACHED_ACCESSED, &ck->flags); - path->uptodate = BTREE_ITER_UPTODATE; - BUG_ON(!ck->valid); BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); + BUG_ON(path->uptodate); return ret; err: + path->uptodate = BTREE_ITER_NEED_TRAVERSE; if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { btree_node_unlock(trans, path, 0); path->l[0].b = ERR_PTR(ret); @@ -531,7 +568,7 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path path->l[1].b = NULL; - if (bch2_btree_node_relock(trans, path, 0)) { + if (bch2_btree_node_relock_notrace(trans, path, 0)) { ck = (void *) path->l[0].b; goto fill; } @@ -696,6 +733,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, six_unlock_read(&ck->c.lock); goto unlock; } + + if (ck->seq != seq) { + bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal, + bch2_btree_key_cache_journal_flush); + six_unlock_read(&ck->c.lock); + goto unlock; + } six_unlock_read(&ck->c.lock); ret = commit_do(&trans, NULL, NULL, 0, @@ -725,6 +769,7 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans, } bool bch2_btree_insert_key_cached(struct btree_trans *trans, + unsigned flags, struct btree_path *path, struct bkey_i *insert) { @@ -734,7 +779,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, BUG_ON(insert->u64s > ck->u64s); - if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { int difference; BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s); @@ -757,8 +802,9 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, kick_reclaim = true; } - bch2_journal_pin_update(&c->journal, trans->journal_res.seq, - &ck->journal, bch2_btree_key_cache_journal_flush); + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, + &ck->journal, bch2_btree_key_cache_journal_flush); + ck->seq = trans->journal_res.seq; if (kick_reclaim) journal_reclaim_kick(&c->journal); @@ -978,12 +1024,16 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) INIT_LIST_HEAD(&c->freed_nonpcpu); } -static void bch2_btree_key_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink) +static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) { struct btree_key_cache *bc = container_of(shrink, struct btree_key_cache, shrink); + char *cbuf; + size_t buflen = seq_buf_get_buf(s, &cbuf); + struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); - 
bch2_btree_key_cache_to_text(out, bc); + bch2_btree_key_cache_to_text(&out, bc); + seq_buf_commit(s, out.pos); } int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h index eccea15f..c86d5e48 100644 --- a/libbcachefs/btree_key_cache.h +++ b/libbcachefs/btree_key_cache.h @@ -29,7 +29,7 @@ bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *, unsigned); -bool bch2_btree_insert_key_cached(struct btree_trans *, +bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned, struct btree_path *, struct bkey_i *); int bch2_btree_key_cache_flush(struct btree_trans *, enum btree_id, struct bpos); diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c index dce2dc0c..1ddac23c 100644 --- a/libbcachefs/btree_locking.c +++ b/libbcachefs/btree_locking.c @@ -99,6 +99,12 @@ static void lock_graph_up(struct lock_graph *g) closure_put(&g->g[--g->nr].trans->ref); } +static noinline void lock_graph_pop_all(struct lock_graph *g) +{ + while (g->nr) + lock_graph_up(g); +} + static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) { closure_get(&trans->ref); @@ -274,7 +280,25 @@ next: b = &READ_ONCE(path->l[top->level].b)->c; if (IS_ERR_OR_NULL(b)) { - BUG_ON(!lock_graph_remove_non_waiters(&g)); + /* + * If we get here, it means we raced with the + * other thread updating its btree_path + * structures - which means it can't be blocked + * waiting on a lock: + */ + if (!lock_graph_remove_non_waiters(&g)) { + /* + * If lock_graph_remove_non_waiters() + * didn't do anything, it must be + * because we're being called by debugfs + * checking for lock cycles, which + * invokes us on btree_transactions that + * aren't actually waiting on anything. 
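+ * (i.e. such a transaction cannot be part of a lock cycle,
+ * so there is nothing to do here.)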
+ * Just bail out: + */ + lock_graph_pop_all(&g); + } + goto next; } @@ -335,7 +359,8 @@ int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *p * locked: */ six_lock_readers_add(&b->lock, -readers); - ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, lock_may_not_fail); + ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, + lock_may_not_fail, _RET_IP_); six_lock_readers_add(&b->lock, readers); if (ret) @@ -407,7 +432,7 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, return true; } fail: - if (trace) + if (trace && !trans->notrace_relock_fail) trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level); return false; } @@ -504,6 +529,17 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans, return btree_path_get_locks(trans, path, false); } +int __bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) { + trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); + } + + return 0; +} + __flatten bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans, struct btree_path *path, unsigned long trace_ip) @@ -615,6 +651,21 @@ int bch2_trans_relock(struct btree_trans *trans) return 0; } +int bch2_trans_relock_notrace(struct btree_trans *trans) +{ + struct btree_path *path; + + if (unlikely(trans->restarted)) + return -((int) trans->restarted); + + trans_for_each_path(trans, path) + if (path->should_be_locked && + !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + } + return 0; +} + void bch2_trans_unlock(struct btree_trans *trans) { struct btree_path *path; diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index fb237c95..3e14fe60 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -191,7 +191,8 @@ int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); static inline int __btree_node_lock_nopath(struct btree_trans *trans, struct btree_bkey_cached_common *b, enum six_lock_type type, - bool lock_may_not_fail) + bool lock_may_not_fail, + unsigned long ip) { int ret; @@ -199,8 +200,8 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans, trans->lock_must_abort = false; trans->locking = b; - ret = six_lock_type_waiter(&b->lock, type, &trans->locking_wait, - bch2_six_check_for_deadlock, trans); + ret = six_lock_type_ip_waiter(&b->lock, type, &trans->locking_wait, + bch2_six_check_for_deadlock, trans, ip); WRITE_ONCE(trans->locking, NULL); WRITE_ONCE(trans->locking_wait.start_time, 0); return ret; @@ -209,16 +210,17 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans, static inline int __must_check btree_node_lock_nopath(struct btree_trans *trans, struct btree_bkey_cached_common *b, - enum six_lock_type type) + enum six_lock_type type, + unsigned long ip) { - return __btree_node_lock_nopath(trans, b, type, false); + return __btree_node_lock_nopath(trans, b, type, false, ip); } static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans, struct btree_bkey_cached_common *b, enum six_lock_type type) { - int ret = __btree_node_lock_nopath(trans, b, type, true); + int ret = __btree_node_lock_nopath(trans, b, type, true, _THIS_IP_); BUG_ON(ret); } @@ -258,7 +260,7 @@ static inline int btree_node_lock(struct btree_trans 
*trans, if (likely(six_trylock_type(&b->lock, type)) || btree_node_lock_increment(trans, b, level, type) || - !(ret = btree_node_lock_nopath(trans, b, type))) { + !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) { #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS path->l[b->level].lock_taken_time = local_clock(); #endif @@ -312,6 +314,17 @@ bch2_btree_node_lock_write(struct btree_trans *trans, bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *, unsigned long); +int __bch2_btree_path_relock(struct btree_trans *, + struct btree_path *, unsigned long); + +static inline int bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + return btree_node_locked(path, path->level) + ? 0 + : __bch2_btree_path_relock(trans, path, trace_ip); +} + bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace); static inline bool bch2_btree_node_relock(struct btree_trans *trans, @@ -338,17 +351,6 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, __bch2_btree_node_relock(trans, path, level, false)); } -static inline int bch2_btree_path_relock(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) -{ - if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) { - trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); - } - - return 0; -} - /* upgrade */ bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index af86ba12..23e7f0ca 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -10,6 +10,7 @@ #include "buckets_types.h" #include "darray.h" #include "journal_types.h" +#include "replicas_types.h" struct open_bucket; struct btree_update; @@ -217,6 +218,8 @@ struct btree_node_iter { #define BTREE_ITER_ALL_SNAPSHOTS (1 << 11) #define BTREE_ITER_FILTER_SNAPSHOTS (1 << 12) #define BTREE_ITER_NOPRESERVE (1 << 13) +#define BTREE_ITER_CACHED_NOFILL (1 << 14) +#define BTREE_ITER_KEY_CACHE_FILL (1 << 15) enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, @@ -224,6 +227,10 @@ enum btree_path_uptodate { BTREE_ITER_NEED_TRAVERSE = 2, }; +#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG) +#define TRACK_PATH_ALLOCATED +#endif + struct btree_path { u8 idx; u8 sorted_idx; @@ -254,7 +261,7 @@ struct btree_path { u64 lock_taken_time; #endif } l[BTREE_MAX_DEPTH]; -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef TRACK_PATH_ALLOCATED unsigned long ip_allocated; #endif }; @@ -264,6 +271,15 @@ static inline struct btree_path_level *path_l(struct btree_path *path) return path->l + path->level; } +static inline unsigned long btree_path_ip_allocated(struct btree_path *path) +{ +#ifdef TRACK_PATH_ALLOCATED + return path->ip_allocated; +#else + return _THIS_IP_; +#endif +} + /* * @pos - iterator's current position * @level - current btree depth @@ -297,7 +313,7 @@ struct btree_iter { /* BTREE_ITER_WITH_JOURNAL: */ size_t journal_idx; struct bpos journal_pos; -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef TRACK_PATH_ALLOCATED unsigned long ip_allocated; #endif }; @@ -344,6 +360,7 @@ struct bkey_cached { struct journal_preres res; struct journal_entry_pin journal; + u64 seq; struct bkey_i *k; }; @@ -412,12 +429,14 @@ struct btree_trans { u8 fn_idx; u8 nr_sorted; u8 nr_updates; - u8 traverse_all_idx; bool used_mempool:1; bool in_traverse_all:1; + 
bool paths_sorted:1; bool memory_allocation_failure:1; - bool is_initial_gc:1; + bool journal_transaction_names:1; bool journal_replay_not_finished:1; + bool is_initial_gc:1; + bool notrace_relock_fail:1; enum bch_errcode restarted:16; u32 restart_count; unsigned long last_restarted_ip; @@ -437,7 +456,7 @@ struct btree_trans { unsigned mem_bytes; void *mem; - u8 sorted[BTREE_ITER_MAX]; + u8 sorted[BTREE_ITER_MAX + 8]; struct btree_path *paths; struct btree_insert_entry *updates; @@ -450,7 +469,6 @@ struct btree_trans { struct journal_preres journal_preres; u64 *journal_seq; struct disk_reservation *disk_res; - unsigned flags; unsigned journal_u64s; unsigned journal_preres_u64s; struct replicas_delta_list *fs_usage_deltas; diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 7e9f1f17..673c3a78 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -80,7 +80,7 @@ int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); -int __bch2_trans_commit(struct btree_trans *); +int __bch2_trans_commit(struct btree_trans *, unsigned); int bch2_trans_log_msg(struct btree_trans *, const char *, ...); int bch2_fs_log_msg(struct bch_fs *, const char *, ...); @@ -101,9 +101,8 @@ static inline int bch2_trans_commit(struct btree_trans *trans, { trans->disk_res = disk_res; trans->journal_seq = journal_seq; - trans->flags = flags; - return __bch2_trans_commit(trans); + return __bch2_trans_commit(trans, flags); } #define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ @@ -154,6 +153,14 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans) trans->nr_updates = 0; trans->hooks = NULL; trans->extra_journal_entries.nr = 0; + + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; + memset((void *) trans->fs_usage_deltas + + offsetof(struct replicas_delta_list, memset_start), 0, + (void *) &trans->fs_usage_deltas->memset_end - + (void *) &trans->fs_usage_deltas->memset_start); + } } #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index a49e7b6b..09aeee06 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -2032,7 +2032,7 @@ void async_btree_node_rewrite_work(struct work_struct *work) bch2_trans_do(c, NULL, NULL, 0, async_btree_node_rewrite_trans(&trans, a)); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); kfree(a); } @@ -2040,12 +2040,12 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) { struct async_btree_rewrite *a; - if (!percpu_ref_tryget_live(&c->writes)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) return; a = kmalloc(sizeof(*a), GFP_NOFS); if (!a) { - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); return; } @@ -2102,7 +2102,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, btree_path_set_level_up(trans, iter2.path); - bch2_btree_path_check_sort(trans, iter2.path, 0); + trans->paths_sorted = false; ret = bch2_btree_iter_traverse(&iter2) ?: bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN); diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index a2b37dd4..f01a2e90 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -24,12 +24,28 @@ #include <linux/sort.h> #include <trace/events/bcachefs.h> +/* + * 
bch2_btree_path_peek_slot() for a cached iterator might return a key in a + * different snapshot: + */ +struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) +{ + struct bkey_s_c k = bch2_btree_path_peek_slot(path, u); + + if (k.k && bpos_eq(path->pos, k.k->p)) + return k; + + bkey_init(u); + u->p = path->pos; + return (struct bkey_s_c) { u, NULL }; +} + static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) { #ifdef CONFIG_BCACHEFS_DEBUG struct bch_fs *c = trans->c; struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u); + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u); if (unlikely(trans->journal_replay_not_finished)) { struct bkey_i *j_k = @@ -314,17 +330,15 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, } static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, - unsigned flags) + unsigned flags) { return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, - trans->journal_u64s, - flags| - (trans->flags & JOURNAL_WATERMARK_MASK)); + trans->journal_u64s, flags); } #define JSET_ENTRY_LOG_U64s 4 -static void journal_transaction_name(struct btree_trans *trans) +static noinline void journal_transaction_name(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct journal *j = &c->journal; @@ -349,9 +363,8 @@ static inline int btree_key_can_insert(struct btree_trans *trans, return 0; } -static int btree_key_can_insert_cached(struct btree_trans *trans, - struct btree_path *path, - unsigned u64s) +static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, + struct btree_path *path, unsigned u64s) { struct bch_fs *c = trans->c; struct bkey_cached *ck = (void *) path->l[0].b; @@ -363,7 +376,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bch2_btree_key_cache_must_wait(c) && - !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)) + !(flags & BTREE_INSERT_JOURNAL_RECLAIM)) return -BCH_ERR_btree_insert_need_journal_reclaim; /* @@ -573,7 +586,7 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) } static inline int -bch2_trans_commit_write_locked(struct btree_trans *trans, +bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct btree_insert_entry **stopped_at, unsigned long trace_ip) { @@ -613,7 +626,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, u64s += i->k->k.u64s; ret = !i->cached ? 
btree_key_can_insert(trans, insert_l(i)->b, u64s) - : btree_key_can_insert_cached(trans, i->path, u64s); + : btree_key_can_insert_cached(trans, flags, i->path, u64s); if (ret) { *stopped_at = i; return ret; @@ -627,13 +640,15 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, * Don't get journal reservation until after we know insert will * succeed: */ - if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { ret = bch2_trans_journal_res_get(trans, + (flags & JOURNAL_WATERMARK_MASK)| JOURNAL_RES_GET_NONBLOCK); if (ret) return ret; - journal_transaction_name(trans); + if (unlikely(trans->journal_transaction_names)) + journal_transaction_name(trans); } else { trans->journal_res.seq = c->journal.replay_journal_seq; } @@ -644,7 +659,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, */ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { + !(flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (bch2_journal_seq_verify) trans_for_each_update(trans, i) i->k->k.version.lo = trans->journal_res.seq; @@ -679,7 +694,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, trans->journal_res.u64s -= trans->extra_journal_entries.nr; } - if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { trans_for_each_update(trans, i) { struct journal *j = &c->journal; struct jset_entry *entry; @@ -687,14 +702,19 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (i->key_cache_already_flushed) continue; + if (i->flags & BTREE_UPDATE_NOJOURNAL) + continue; + verify_update_old_key(trans, i); - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_overwrite, - i->btree_id, i->level, - i->old_k.u64s); - bkey_reassemble(&entry->start[0], - (struct bkey_s_c) { &i->old_k, i->old_v }); + if (trans->journal_transaction_names) { + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_overwrite, + i->btree_id, i->level, + i->old_k.u64s); + bkey_reassemble(&entry->start[0], + (struct bkey_s_c) { &i->old_k, i->old_v }); + } entry = bch2_journal_add_entry(j, &trans->journal_res, BCH_JSET_ENTRY_btree_keys, @@ -713,7 +733,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (!i->cached) btree_insert_key_leaf(trans, i); else if (!i->key_cache_already_flushed) - bch2_btree_insert_key_cached(trans, i->path, i->k); + bch2_btree_insert_key_cached(trans, flags, i->path, i->k); else { bch2_btree_key_cache_drop(trans, i->path); btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); @@ -762,12 +782,12 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans } #ifdef CONFIG_BCACHEFS_DEBUG -static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, +static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags, struct btree_insert_entry *i, struct printbuf *err) { struct bch_fs *c = trans->c; - int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; + int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? 
READ : WRITE; printbuf_reset(err); prt_printf(err, "invalid bkey on insert from %s -> %ps", @@ -793,7 +813,7 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ -static inline int do_bch2_trans_commit(struct btree_trans *trans, +static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags, struct btree_insert_entry **stopped_at, unsigned long trace_ip) { @@ -804,11 +824,11 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) { - int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; + int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, rw, &buf))) - return bch2_trans_commit_bkey_invalid(trans, i, &buf); + return bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); btree_insert_entry_checks(trans, i); } #endif @@ -824,7 +844,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (!same_leaf_as_next(trans, i)) { if (u64s_delta <= 0) { ret = bch2_foreground_maybe_merge(trans, i->path, - i->level, trans->flags); + i->level, flags); if (unlikely(ret)) return ret; } @@ -835,8 +855,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, ret = bch2_journal_preres_get(&c->journal, &trans->journal_preres, trans->journal_preres_u64s, - JOURNAL_RES_GET_NONBLOCK| - (trans->flags & JOURNAL_WATERMARK_MASK)); + (flags & JOURNAL_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) ret = bch2_trans_journal_preres_get_cold(trans, trans->journal_preres_u64s, trace_ip); @@ -847,7 +866,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (unlikely(ret)) return ret; - ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); + ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip); if (!ret && unlikely(trans->journal_replay_not_finished)) bch2_drop_overwrites_from_journal(trans); @@ -886,7 +905,7 @@ static int journal_reclaim_wait_done(struct bch_fs *c) } static noinline -int bch2_trans_commit_error(struct btree_trans *trans, +int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, struct btree_insert_entry *i, int ret, unsigned long trace_ip) { @@ -894,7 +913,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, switch (ret) { case -BCH_ERR_btree_insert_btree_node_full: - ret = bch2_btree_split_leaf(trans, i->path, trans->flags); + ret = bch2_btree_split_leaf(trans, i->path, flags); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); break; @@ -912,8 +931,8 @@ int bch2_trans_commit_error(struct btree_trans *trans, case -BCH_ERR_journal_res_get_blocked: bch2_trans_unlock(trans); - if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && - !(trans->flags & JOURNAL_WATERMARK_reserved)) { + if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && + !(flags & JOURNAL_WATERMARK_reserved)) { ret = -BCH_ERR_journal_reclaim_would_deadlock; break; } @@ -948,20 +967,20 @@ int bch2_trans_commit_error(struct btree_trans *trans, BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && - !(trans->flags & BTREE_INSERT_NOWAIT) && - (trans->flags & BTREE_INSERT_NOFAIL), c, + !(flags & BTREE_INSERT_NOWAIT) && + (flags & 
BTREE_INSERT_NOFAIL), c, "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); return ret; } static noinline int -bch2_trans_commit_get_rw_cold(struct btree_trans *trans) +bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; int ret; - if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)) || + if (likely(!(flags & BTREE_INSERT_LAZY_RW)) || test_bit(BCH_FS_STARTED, &c->flags)) return -BCH_ERR_erofs_trans_commit; @@ -972,7 +991,7 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) if (ret) return ret; - percpu_ref_get(&c->writes); + bch2_write_ref_get(c, BCH_WRITE_REF_trans); return 0; } @@ -997,7 +1016,7 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) return ret; } -int __bch2_trans_commit(struct btree_trans *trans) +int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; struct btree_insert_entry *i = NULL; @@ -1008,7 +1027,7 @@ int __bch2_trans_commit(struct btree_trans *trans) !trans->extra_journal_entries.nr) goto out_reset; - if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) + if (flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&c->gc_lock); ret = bch2_trans_commit_run_triggers(trans); @@ -1020,9 +1039,9 @@ int __bch2_trans_commit(struct btree_trans *trans) goto out_reset; } - if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && - unlikely(!percpu_ref_tryget_live(&c->writes))) { - ret = bch2_trans_commit_get_rw_cold(trans); + if (!(flags & BTREE_INSERT_NOCHECK_RW) && + unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { + ret = bch2_trans_commit_get_rw_cold(trans, flags); if (ret) goto out_reset; } @@ -1034,8 +1053,10 @@ int __bch2_trans_commit(struct btree_trans *trans) trans->journal_u64s = trans->extra_journal_entries.nr; trans->journal_preres_u64s = 0; - /* For journalling transaction name: */ - trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); + trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); + + if (trans->journal_transaction_names) + trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); trans_for_each_update(trans, i) { EBUG_ON(!i->path->should_be_locked); @@ -1052,27 +1073,32 @@ int __bch2_trans_commit(struct btree_trans *trans) /* we're going to journal the key being updated: */ u64s = jset_u64s(i->k->k.u64s); if (i->cached && - likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) + likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) trans->journal_preres_u64s += u64s; + + if (i->flags & BTREE_UPDATE_NOJOURNAL) + continue; + trans->journal_u64s += u64s; /* and we're also going to log the overwrite: */ - trans->journal_u64s += jset_u64s(i->old_k.u64s); + if (trans->journal_transaction_names) + trans->journal_u64s += jset_u64s(i->old_k.u64s); } if (trans->extra_journal_res) { ret = bch2_disk_reservation_add(c, trans->disk_res, trans->extra_journal_res, - (trans->flags & BTREE_INSERT_NOFAIL) + (flags & BTREE_INSERT_NOFAIL) ? 
BCH_DISK_RESERVATION_NOFAIL : 0); if (ret) goto err; } retry: - EBUG_ON(trans->restarted); + bch2_trans_verify_not_in_restart(trans); memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - ret = do_bch2_trans_commit(trans, &i, _RET_IP_); + ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_); /* make sure we didn't drop or screw up locks: */ bch2_trans_verify_locks(trans); @@ -1084,22 +1110,14 @@ retry: out: bch2_journal_preres_put(&c->journal, &trans->journal_preres); - if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) - percpu_ref_put(&c->writes); + if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) + bch2_write_ref_put(c, BCH_WRITE_REF_trans); out_reset: bch2_trans_reset_updates(trans); - if (trans->fs_usage_deltas) { - trans->fs_usage_deltas->used = 0; - memset((void *) trans->fs_usage_deltas + - offsetof(struct replicas_delta_list, memset_start), 0, - (void *) &trans->fs_usage_deltas->memset_end - - (void *) &trans->fs_usage_deltas->memset_start); - } - return ret; err: - ret = bch2_trans_commit_error(trans, i, ret, _RET_IP_); + ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_); if (ret) goto out; @@ -1152,12 +1170,63 @@ static inline int check_pos_snapshot_overwritten(struct btree_trans *trans, return __check_pos_snapshot_overwritten(trans, id, pos); } +static noinline int extent_front_merge(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct bkey_i **insert, + enum btree_update_flags flags) +{ + struct bch_fs *c = trans->c; + struct bkey_i *update; + int ret; + + update = bch2_bkey_make_mut(trans, k); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + return ret; + + if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) + return 0; + + ret = check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p) ?: + check_pos_snapshot_overwritten(trans, iter->btree_id, (*insert)->k.p); + if (ret < 0) + return ret; + if (ret) + return 0; + + ret = bch2_btree_delete_at(trans, iter, flags); + if (ret) + return ret; + + *insert = update; + return 0; +} + +static noinline int extent_back_merge(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + int ret; + + ret = check_pos_snapshot_overwritten(trans, iter->btree_id, insert->k.p) ?: + check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p); + if (ret < 0) + return ret; + if (ret) + return 0; + + bch2_bkey_merge(c, bkey_i_to_s(insert), k); + return 0; +} + int bch2_trans_update_extent(struct btree_trans *trans, struct btree_iter *orig_iter, struct bkey_i *insert, enum btree_update_flags flags) { - struct bch_fs *c = trans->c; struct btree_iter iter, update_iter; struct bpos start = bkey_start_pos(&insert->k); struct bkey_i *update; @@ -1175,46 +1244,15 @@ int bch2_trans_update_extent(struct btree_trans *trans, if (!k.k) goto out; - if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { - /* - * We can't merge extents if they belong to interior snapshot - * tree nodes, and there's a snapshot in which one extent is - * visible and the other is not - i.e. if visibility is - * different. 
- * - * Instead of checking if visibilitiy of the two extents is - * different, for now we just check if either has been - * overwritten: - */ - ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p); - if (ret < 0) - goto err; - if (ret) - goto nomerge1; - - ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p); - if (ret < 0) - goto err; - if (ret) - goto nomerge1; - - update = bch2_bkey_make_mut(trans, k); - if ((ret = PTR_ERR_OR_ZERO(update))) - goto err; - - if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(insert))) { - ret = bch2_btree_delete_at(trans, &iter, flags); + if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) { + if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { + ret = extent_front_merge(trans, &iter, k, &insert, flags); if (ret) goto err; - - insert = update; - goto next; } - } -nomerge1: - ret = 0; - if (bkey_eq(k.k->p, start)) + goto next; + } while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { bool front_split = bkey_lt(bkey_start_pos(k.k), start); @@ -1323,22 +1361,10 @@ next: } if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { - ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p); - if (ret < 0) - goto err; + ret = extent_back_merge(trans, &iter, insert, k); if (ret) - goto nomerge2; - - ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p); - if (ret < 0) goto err; - if (ret) - goto nomerge2; - - bch2_bkey_merge(c, bkey_i_to_s(insert), k); } -nomerge2: - ret = 0; out: if (!bkey_deleted(&insert->k)) { /* @@ -1476,7 +1502,7 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa array_insert_item(trans->updates, trans->nr_updates, i - trans->updates, n); - i->old_v = bch2_btree_path_peek_slot(path, &i->old_k).v; + i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v; i->old_btree_u64s = !bkey_deleted(&i->old_k) ? 
i->old_k.u64s : 0; if (unlikely(trans->journal_replay_not_finished)) { @@ -1499,7 +1525,9 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa * the key cache - but the key has to exist in the btree for that to * work: */ - if (unlikely(path->cached && bkey_deleted(&i->old_k))) + if (path->cached && + bkey_deleted(&i->old_k) && + !(flags & BTREE_UPDATE_NO_KEY_CACHE_COHERENCY)) return flush_new_cached_update(trans, path, i, flags, ip); return 0; @@ -1671,18 +1699,10 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, */ delete.k.p = iter.pos; - if (iter.flags & BTREE_ITER_IS_EXTENTS) { - unsigned max_sectors = - KEY_SIZE_MAX & (~0 << trans->c->block_bits); - - /* create the biggest key we can */ - bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end, &delete); - - ret = bch2_extent_trim_atomic(trans, &iter, &delete); - if (ret) - goto err; - } + if (iter.flags & BTREE_ITER_IS_EXTENTS) + bch2_key_resize(&delete.k, + bpos_min(end, k.k->p).offset - + iter.pos.offset); ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: bch2_trans_commit(trans, &disk_res, journal_seq, diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 428c2be6..ac0328c4 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -137,23 +137,28 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) { struct bch_fs_usage_online *ret; - unsigned seq, i, u64s; + unsigned seq, i, v, u64s = fs_usage_u64s(c) + 1; +retry: + ret = kmalloc(u64s * sizeof(u64), GFP_NOFS); + if (unlikely(!ret)) + return NULL; percpu_down_read(&c->mark_lock); - ret = kmalloc(sizeof(struct bch_fs_usage_online) + - sizeof(u64) * c->replicas.nr, GFP_NOFS); - if (unlikely(!ret)) { + v = fs_usage_u64s(c) + 1; + if (unlikely(u64s != v)) { + u64s = v; percpu_up_read(&c->mark_lock); - return NULL; + kfree(ret); + goto retry; } ret->online_reserved = percpu_u64_get(c->online_reserved); - u64s = fs_usage_u64s(c); do { seq = read_seqcount_begin(&c->usage_lock); - memcpy(&ret->u, c->usage_base, u64s * sizeof(u64)); + unsafe_memcpy(&ret->u, c->usage_base, u64s * sizeof(u64), + "embedded variable length struct"); for (i = 0; i < ARRAY_SIZE(c->usage); i++) acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s); } while (read_seqcount_retry(&c->usage_lock, seq)); @@ -1203,17 +1208,23 @@ not_found: " missing range %llu-%llu", (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), *idx, next_idx)) { - struct bkey_i_error new; + struct bkey_i_error *new; - bkey_init(&new.k); - new.k.type = KEY_TYPE_error; - new.k.p = bkey_start_pos(p.k); - new.k.p.offset += *idx - start; - bch2_key_resize(&new.k, next_idx - *idx); - ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new.k_i); + new = bch2_trans_kmalloc(trans, sizeof(*new)); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto err; + + bkey_init(&new->k); + new->k.type = KEY_TYPE_error; + new->k.p = bkey_start_pos(p.k); + new->k.p.offset += *idx - start; + bch2_key_resize(&new->k, next_idx - *idx); + ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new->k_i); } *idx = next_idx; +err: fsck_err: printbuf_exit(&buf); return ret; @@ -1258,36 +1269,6 @@ int bch2_mark_reflink_p(struct btree_trans *trans, return ret; } -static noinline __cold -void fs_usage_apply_warn(struct btree_trans *trans, - unsigned disk_res_sectors, - s64 should_not_have_added) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - struct printbuf buf = PRINTBUF; - - 
prt_printf(&buf, - bch2_fmt(c, "disk usage increased %lli more than %u sectors reserved)"), - should_not_have_added, disk_res_sectors); - - trans_for_each_update(trans, i) { - struct bkey_s_c old = { &i->old_k, i->old_v }; - - prt_str(&buf, "new "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); - prt_newline(&buf); - - prt_str(&buf, "old "); - bch2_bkey_val_to_text(&buf, c, old); - prt_newline(&buf); - } - - __WARN(); - bch2_print_string_as_lines(KERN_ERR, buf.buf); - printbuf_exit(&buf); -} - int bch2_trans_fs_usage_apply(struct btree_trans *trans, struct replicas_delta_list *deltas) { @@ -1352,7 +1333,9 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, percpu_up_read(&c->mark_lock); if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) - fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added); + bch2_trans_inconsistent(trans, + "disk usage increased %lli more than %u sectors reserved)", + should_not_have_added, disk_res_sectors); return 0; need_mark: /* revert changes: */ diff --git a/libbcachefs/buckets_waiting_for_journal.c b/libbcachefs/buckets_waiting_for_journal.c index 0f4ef9e5..f3774e30 100644 --- a/libbcachefs/buckets_waiting_for_journal.c +++ b/libbcachefs/buckets_waiting_for_journal.c @@ -2,28 +2,24 @@ #include "bcachefs.h" #include "buckets_waiting_for_journal.h" +#include <linux/hash.h> #include <linux/random.h> -#include <linux/siphash.h> static inline struct bucket_hashed * bucket_hash(struct buckets_waiting_for_journal_table *t, unsigned hash_seed_idx, u64 dev_bucket) { - unsigned h = siphash_1u64(dev_bucket, &t->hash_seeds[hash_seed_idx]); - - EBUG_ON(!is_power_of_2(t->size)); - - return t->d + (h & (t->size - 1)); + return t->d + hash_64(dev_bucket ^ t->hash_seeds[hash_seed_idx], t->bits); } -static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t size) +static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t bits) { unsigned i; - t->size = size; + t->bits = bits; for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i])); - memset(t->d, 0, sizeof(t->d[0]) * size); + memset(t->d, 0, sizeof(t->d[0]) << t->bits); } bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, @@ -97,7 +93,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, .dev_bucket = (u64) dev << 56 | bucket, .journal_seq = journal_seq, }; - size_t i, new_size, nr_elements = 1, nr_rehashes = 0; + size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0; int ret = 0; mutex_lock(&b->lock); @@ -106,12 +102,13 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, goto out; t = b->t; - for (i = 0; i < t->size; i++) + size = 1UL << t->bits; + for (i = 0; i < size; i++) nr_elements += t->d[i].journal_seq > flushed_seq; - new_size = nr_elements < t->size / 3 ? 
t->size : t->size * 2; + new_bits = t->bits + (nr_elements * 3 > size); - n = kvmalloc(sizeof(*n) + sizeof(n->d[0]) * new_size, GFP_KERNEL); + n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); if (!n) { ret = -ENOMEM; goto out; @@ -119,12 +116,12 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, retry_rehash: nr_rehashes++; - bucket_table_init(n, new_size); + bucket_table_init(n, new_bits); tmp = new; BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq)); - for (i = 0; i < t->size; i++) { + for (i = 0; i < 1UL << t->bits; i++) { if (t->d[i].journal_seq <= flushed_seq) continue; @@ -137,7 +134,7 @@ retry_rehash: kvfree(t); pr_debug("took %zu rehashes, table at %zu/%zu elements", - nr_rehashes, nr_elements, b->t->size); + nr_rehashes, nr_elements, 1UL << b->t->bits); out: mutex_unlock(&b->lock); @@ -151,7 +148,7 @@ void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c) kvfree(b->t); } -#define INITIAL_TABLE_SIZE 8 +#define INITIAL_TABLE_BITS 3 int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c) { @@ -159,10 +156,11 @@ int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c) mutex_init(&b->lock); - b->t = kvmalloc(sizeof(*b->t) + sizeof(b->t->d[0]) * INITIAL_TABLE_SIZE, GFP_KERNEL); + b->t = kvmalloc(sizeof(*b->t) + + (sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL); if (!b->t) return -ENOMEM; - bucket_table_init(b->t, INITIAL_TABLE_SIZE); + bucket_table_init(b->t, INITIAL_TABLE_BITS); return 0; } diff --git a/libbcachefs/buckets_waiting_for_journal_types.h b/libbcachefs/buckets_waiting_for_journal_types.h index fea7f944..e593db06 100644 --- a/libbcachefs/buckets_waiting_for_journal_types.h +++ b/libbcachefs/buckets_waiting_for_journal_types.h @@ -10,8 +10,8 @@ struct bucket_hashed { }; struct buckets_waiting_for_journal_table { - size_t size; - siphash_key_t hash_seeds[3]; + unsigned bits; + u64 hash_seeds[3]; struct bucket_hashed d[]; }; diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index 7ef7bb61..9df958b4 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -182,7 +182,17 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, /* Add new ptrs: */ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { - if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { + const struct bch_extent_ptr *existing_ptr = + bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev); + + if (existing_ptr && existing_ptr->cached) { + /* + * We're replacing a cached pointer with a non + * cached pointer: + */ + bch2_bkey_drop_device_noerror(bkey_i_to_s(insert), + existing_ptr->dev); + } else if (existing_ptr) { /* * raced with another move op? 
extent already * has a pointer to the device we just wrote @@ -253,8 +263,8 @@ nomatch: &m->ctxt->stats->sectors_raced); } - this_cpu_add(c->counters[BCH_COUNTER_move_extent_race], new->k.size); - trace_move_extent_race(&new->k); + this_cpu_add(c->counters[BCH_COUNTER_move_extent_fail], new->k.size); + trace_move_extent_fail(&new->k); bch2_btree_iter_advance(&iter); goto next; @@ -388,17 +398,21 @@ void bch2_update_unwritten_extent(struct btree_trans *trans, } } -int bch2_data_update_init(struct bch_fs *c, struct data_update *m, +int bch2_data_update_init(struct btree_trans *trans, + struct moving_context *ctxt, + struct data_update *m, struct write_point_specifier wp, struct bch_io_opts io_opts, struct data_update_opts data_opts, enum btree_id btree_id, struct bkey_s_c k) { + struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; + unsigned int ptrs_locked = 0; int ret; bch2_bkey_buf_init(&m->k); @@ -424,11 +438,14 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m, i = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + bool locked; + if (((1U << i) & m->data_opts.rewrite_ptrs) && p.ptr.cached) BUG(); - if (!((1U << i) & m->data_opts.rewrite_ptrs)) + if (!((1U << i) & m->data_opts.rewrite_ptrs) && + !p.ptr.cached) bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); if (((1U << i) & m->data_opts.rewrite_ptrs) && @@ -448,10 +465,24 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m, if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) m->op.incompressible = true; - i++; + if (ctxt) { + move_ctxt_wait_event(ctxt, trans, + (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0)) || + !atomic_read(&ctxt->read_sectors)); - bch2_bucket_nocow_lock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0); + if (!locked) + bch2_bucket_nocow_lock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0); + } else { + if (!bch2_bucket_nocow_trylock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0)) { + ret = -BCH_ERR_nocow_lock_blocked; + goto err; + } + } + ptrs_locked |= (1U << i); + i++; } if (reserve_sectors) { @@ -473,9 +504,13 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m, return -BCH_ERR_unwritten_extent_update; return 0; err: - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0); + i = 0; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if ((1U << i) & ptrs_locked) + bch2_bucket_nocow_unlock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0); + i++; + } bch2_bkey_buf_exit(&m->k, c); bch2_bio_free_pages_pool(c, &m->op.wbio.bio); diff --git a/libbcachefs/data_update.h b/libbcachefs/data_update.h index f304c336..49e9055c 100644 --- a/libbcachefs/data_update.h +++ b/libbcachefs/data_update.h @@ -33,7 +33,8 @@ void bch2_data_update_read_done(struct data_update *, void bch2_data_update_exit(struct data_update *); void bch2_update_unwritten_extent(struct btree_trans *, struct data_update *); -int bch2_data_update_init(struct bch_fs *, struct data_update *, +int bch2_data_update_init(struct btree_trans *, struct moving_context *, + struct data_update *, struct write_point_specifier, struct bch_io_opts, struct data_update_opts, enum btree_id, struct bkey_s_c); diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index 3c2f008d..fcefd55a 100644 --- a/libbcachefs/debug.c +++ 
b/libbcachefs/debug.c @@ -25,7 +25,6 @@ #include <linux/console.h> #include <linux/debugfs.h> #include <linux/module.h> -#include <linux/pretty-printers.h> #include <linux/random.h> #include <linux/seq_file.h> @@ -40,7 +39,7 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, struct bset *sorted, *inmemory = &b->data->keys; struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); struct bio *bio; - bool failed = false; + bool failed = false, saw_error = false; if (!bch2_dev_get_ioref(ca, READ)) return false; @@ -61,7 +60,7 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, memcpy(n_ondisk, n_sorted, btree_bytes(c)); v->written = 0; - if (bch2_btree_node_read_done(c, ca, v, false)) + if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error) return false; n_sorted = c->verify_data->data; @@ -501,6 +500,7 @@ static const struct file_operations cached_btree_nodes_ops = { .read = bch2_cached_btree_nodes_read, }; +#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { @@ -520,7 +520,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, ret = flush_buf(i); if (ret) - return ret; + break; bch2_btree_trans_to_text(&i->buf, trans); @@ -550,6 +550,7 @@ static const struct file_operations btree_transactions_ops = { .release = bch2_dump_release, .read = bch2_btree_transactions_read, }; +#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) @@ -710,7 +711,7 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, ret = flush_buf(i); if (ret) - return ret; + break; bch2_check_for_deadlock(trans, &i->buf); @@ -756,8 +757,10 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, c->btree_debug, &cached_btree_nodes_ops); +#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir, c->btree_debug, &btree_transactions_ops); +#endif debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, c->btree_debug, &journal_pins_ops); diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index f1838b7c..4c85d339 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -84,7 +84,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { }; int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); unsigned len; diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h index 1a2c9108..ad131e8e 100644 --- a/libbcachefs/dirent.h +++ b/libbcachefs/dirent.h @@ -6,7 +6,7 @@ extern const struct bch_hash_desc bch2_dirent_hash_desc; -int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_dirent ((struct bkey_ops) { \ diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index c234c8d5..9f5a27da 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -104,7 +104,7 @@ struct ec_bio { /* Stripes btree keys: */ int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { const struct 
bch_stripe *s = bkey_s_c_to_stripe(k).v; @@ -130,7 +130,7 @@ int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } - return bch2_bkey_ptrs_invalid(c, k, rw, err); + return bch2_bkey_ptrs_invalid(c, k, flags, err); } void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, @@ -673,9 +673,8 @@ void bch2_stripes_heap_update(struct bch_fs *c, heap_verify_backpointer(c, idx); - if (stripe_idx_to_delete(c) >= 0 && - !percpu_ref_is_dying(&c->writes)) - schedule_work(&c->ec_stripe_delete_work); + if (stripe_idx_to_delete(c) >= 0) + bch2_do_stripe_deletes(c); } /* stripe deletion */ @@ -708,6 +707,15 @@ static void ec_stripe_delete_work(struct work_struct *work) if (ec_stripe_delete(c, idx)) break; } + + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); +} + +void bch2_do_stripe_deletes(struct bch_fs *c) +{ + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) && + !schedule_work(&c->ec_stripe_delete_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); } /* stripe creation: */ @@ -965,7 +973,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) BUG_ON(!s->allocated); - if (!percpu_ref_tryget_live(&c->writes)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_create)) goto err; ec_generate_ec(&s->new_stripe); @@ -1003,7 +1011,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset); spin_unlock(&c->ec_stripes_heap_lock); err_put_writes: - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); err: bch2_disk_reservation_put(c, &s->res); diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h index aba1e82b..d47da7d8 100644 --- a/libbcachefs/ec.h +++ b/libbcachefs/ec.h @@ -4,9 +4,10 @@ #include "ec_types.h" #include "buckets_types.h" +#include "extents_types.h" int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, - int rw, struct printbuf *); + unsigned, struct printbuf *); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -206,6 +207,8 @@ void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); +void bch2_do_stripe_deletes(struct bch_fs *); + void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_ec_flush_new_stripes(struct bch_fs *); diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index 62170964..b8c24f51 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -118,6 +118,7 @@ x(BCH_ERR_invalid_sb, invalid_sb_clean) \ x(BCH_ERR_invalid_sb, invalid_sb_quota) \ x(BCH_ERR_invalid, invalid_bkey) \ + x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ enum bch_errcode { BCH_ERR_START = 2048, diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 3e49d72d..c2882c59 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -27,8 +27,11 @@ bool bch2_inconsistent_error(struct bch_fs *c) void bch2_topology_error(struct bch_fs *c) { + if (!test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) + return; + set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); - if (test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) bch2_inconsistent_error(c); } diff --git a/libbcachefs/error.h b/libbcachefs/error.h index dae72620..9991879d 100644 --- a/libbcachefs/error.h +++ b/libbcachefs/error.h @@ -73,8 +73,8 @@ do { \ #define bch2_trans_inconsistent(trans, ...) 
\ ({ \ bch_err(trans->c, __VA_ARGS__); \ - bch2_inconsistent_error(trans->c); \ bch2_dump_trans_updates(trans); \ + bch2_inconsistent_error(trans->c); \ }) #define bch2_trans_inconsistent_on(cond, trans, ...) \ diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 9b197db7..d01cec89 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -166,7 +166,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { prt_printf(err, "value too big (%zu > %u)", @@ -174,7 +174,7 @@ int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } - return bch2_bkey_ptrs_invalid(c, k, rw, err); + return bch2_bkey_ptrs_invalid(c, k, flags, err); } void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, @@ -184,7 +184,7 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); @@ -207,7 +207,7 @@ int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } - return bch2_bkey_ptrs_invalid(c, k, rw, err); + return bch2_bkey_ptrs_invalid(c, k, flags, err); } void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, @@ -389,7 +389,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); @@ -715,7 +715,7 @@ static inline void __extent_entry_insert(struct bkey_i *k, memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), dst, (u64 *) end - (u64 *) dst); k->k.u64s += extent_entry_u64s(new); - memcpy(dst, new, extent_entry_bytes(new)); + memcpy_u64s_small(dst, new, extent_entry_u64s(new)); } void bch2_extent_ptr_decoded_append(struct bkey_i *k, @@ -1086,7 +1086,7 @@ static int extent_ptr_invalid(const struct bch_fs *c, } int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index e27d39b7..1d8f3b30 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -371,11 +371,11 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ -int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); @@ -414,7 +414,7 @@ bool bch2_extent_merge(struct bch_fs 
*, struct bkey_s, struct bkey_s_c); /* KEY_TYPE_reservation: */ int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c, - int, struct printbuf *); + unsigned, struct printbuf *); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -659,7 +659,7 @@ bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c, - int, struct printbuf *); + unsigned, struct printbuf *); void bch2_ptr_swab(struct bkey_s); diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index e7ebb01b..e088bbcc 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -812,7 +812,7 @@ static void bch2_set_page_dirty(struct bch_fs *c, i_sectors_acct(c, inode, &res->quota, dirty_sectors); if (!PageDirty(page)) - __set_page_dirty_nobuffers(page); + filemap_dirty_folio(inode->v.i_mapping, page_folio(page)); } vm_fault_t bch2_page_fault(struct vm_fault *vmf) @@ -2715,7 +2715,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, * redirty the full page: */ page_mkclean(page); - __set_page_dirty_nobuffers(page); + filemap_dirty_folio(mapping, page_folio(page)); unlock: unlock_page(page); put_page(page); @@ -3280,7 +3280,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, struct bch_fs *c = inode->v.i_sb->s_fs_info; long ret; - if (!percpu_ref_tryget_live(&c->writes)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate)) return -EROFS; inode_lock(&inode->v); @@ -3304,7 +3304,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, err: bch2_pagecache_block_put(inode); inode_unlock(&inode->v); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_fallocate); return bch2_err_class(ret); } @@ -3448,9 +3448,9 @@ err: /* fseek: */ -static int page_data_offset(struct page *page, unsigned offset) +static int folio_data_offset(struct folio *folio, unsigned offset) { - struct bch_page_state *s = bch2_page_state(page); + struct bch_page_state *s = bch2_page_state(&folio->page); unsigned i; if (s) @@ -3481,8 +3481,7 @@ static loff_t bch2_seek_pagecache_data(struct inode *vinode, struct folio *folio = fbatch.folios[i]; folio_lock(folio); - - offset = page_data_offset(&folio->page, + offset = folio_data_offset(folio, folio->index == start_index ? 
start_offset & (PAGE_SIZE - 1) : 0); @@ -3494,7 +3493,6 @@ static loff_t bch2_seek_pagecache_data(struct inode *vinode, folio_batch_release(&fbatch); return ret; } - folio_unlock(folio); } folio_batch_release(&fbatch); diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index cbd9b1e7..c23309f1 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -667,10 +667,10 @@ int bch2_setattr_nonsize(struct user_namespace *mnt_userns, qid = inode->ei_qid; if (attr->ia_valid & ATTR_UID) - qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid); + qid.q[QTYP_USR] = from_kuid(mnt_userns, attr->ia_uid); if (attr->ia_valid & ATTR_GID) - qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid); + qid.q[QTYP_GRP] = from_kgid(mnt_userns, attr->ia_gid); ret = bch2_fs_quota_transfer(c, inode, qid, ~0, KEY_TYPE_QUOTA_PREALLOC); @@ -779,18 +779,19 @@ static int bch2_setattr(struct user_namespace *mnt_userns, } static int bch2_tmpfile(struct user_namespace *mnt_userns, - struct inode *vdir, struct dentry *dentry, umode_t mode) + struct inode *vdir, struct file *file, umode_t mode) { struct bch_inode_info *inode = - __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0, + __bch2_create(mnt_userns, to_bch_ei(vdir), + file->f_path.dentry, mode, 0, (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (IS_ERR(inode)) return bch2_err_class(PTR_ERR(inode)); - d_mark_tmpfile(dentry, &inode->v); - d_instantiate(dentry, &inode->v); - return 0; + d_mark_tmpfile(file, &inode->v); + d_instantiate(file->f_path.dentry, &inode->v); + return finish_open_simple(file, 0); } static int bch2_fill_extent(struct bch_fs *c, diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index 6b91bbe9..783e77c3 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -18,7 +18,6 @@ struct bch_inode_info { struct mutex ei_update_lock; u64 ei_quota_reserved; unsigned long ei_last_dirtied; - two_state_lock_t ei_pagecache_lock; struct mutex ei_quota_lock; diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 3b71eedb..5887d781 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -817,7 +817,7 @@ static int hash_check_key(struct btree_trans *trans, goto bad_hash; for_each_btree_key_norestart(trans, iter, desc.btree_id, - POS(hash_k.k->p.inode, hash), + SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), BTREE_ITER_SLOTS, k, ret) { if (bkey_eq(k.k->p, hash_k.k->p)) break; diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 585d16ac..560545a7 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -433,7 +433,7 @@ static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) } int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); @@ -453,7 +453,7 @@ int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, } int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); @@ -473,7 +473,7 @@ int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, } int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); @@ -536,7 +536,7 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c } int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, 
struct printbuf *err) + unsigned flags, struct printbuf *err) { if (k.k->p.inode) { prt_printf(err, "nonzero k.p.inode"); @@ -663,19 +663,8 @@ again: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && bkey_lt(k.k->p, POS(0, max))) { - while (pos < iter->pos.offset) { - if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos))) - goto found_slot; - - pos++; - } - - if (k.k->p.snapshot == snapshot && - !bkey_is_inode(k.k) && - !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) { - bch2_btree_iter_advance(iter); - continue; - } + if (pos < iter->pos.offset) + goto found_slot; /* * We don't need to iterate over keys in every snapshot once @@ -685,12 +674,8 @@ again: bch2_btree_iter_set_pos(iter, POS(0, pos)); } - while (!ret && pos < max) { - if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos))) - goto found_slot; - - pos++; - } + if (!ret && pos < max) + goto found_slot; if (!ret && start == min) ret = -BCH_ERR_ENOSPC_inode_create; @@ -713,11 +698,6 @@ found_slot: return ret; } - /* We may have raced while the iterator wasn't pointing at pos: */ - if (bkey_is_inode(k.k) || - bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p)) - goto again; - *hint = k.k->p.offset; inode_u->bi_inum = k.k->p.offset; inode_u->bi_generation = bkey_generation(k); @@ -734,11 +714,11 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, int ret = 0; /* - * We're never going to be deleting extents, no need to use an extent - * iterator: + * We're never going to be deleting partial extents, no need to use an + * extent iterator: */ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), - BTREE_ITER_INTENT); + BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); while (1) { bch2_trans_begin(trans); @@ -760,14 +740,6 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, bkey_init(&delete.k); delete.k.p = iter.pos; - if (iter.flags & BTREE_ITER_IS_EXTENTS) { - bch2_key_resize(&delete.k, k.k->p.offset - iter.pos.offset); - - ret = bch2_extent_trim_atomic(trans, &iter, &delete); - if (ret) - goto err; - } - ret = bch2_trans_update(trans, &iter, &delete, 0) ?: bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); @@ -823,8 +795,8 @@ retry: if (!bkey_is_inode(k.k)) { bch2_fs_inconsistent(trans.c, - "inode %llu not found when deleting", - inum.inum); + "inode %llu:%u not found when deleting", + inum.inum, snapshot); ret = -EIO; goto err; } diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index b753e1b2..f5066afb 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -7,9 +7,9 @@ extern const char * const bch2_inode_opts[]; -int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); -int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); -int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode ((struct bkey_ops) { \ @@ -41,7 +41,7 @@ static inline bool bkey_is_inode(const struct bkey *k) } int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, - int, struct printbuf *); + unsigned, struct printbuf *); void 
bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ diff --git a/libbcachefs/io.c b/libbcachefs/io.c index d215973a..c0c33f78 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -34,6 +34,7 @@ #include "super-io.h" #include <linux/blkdev.h> +#include <linux/prefetch.h> #include <linux/random.h> #include <linux/sched/mm.h> @@ -46,6 +47,8 @@ const char *bch2_blk_status_to_str(blk_status_t status) return blk_status_to_str(status); } +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT + static bool bch2_target_congested(struct bch_fs *c, u16 target) { const struct bch_devs_mask *devs; @@ -134,6 +137,15 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); } +#else + +static bool bch2_target_congested(struct bch_fs *c, u16 target) +{ + return false; +} + +#endif + /* Allocate, free from mempool: */ void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) @@ -242,6 +254,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, struct btree_iter iter; struct bkey_i *k; struct bkey_i_inode_v3 *inode; + unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, @@ -264,15 +277,24 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, inode = bkey_i_to_inode_v3(k); if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > le64_to_cpu(inode->v.bi_size)) + new_i_size > le64_to_cpu(inode->v.bi_size)) { inode->v.bi_size = cpu_to_le64(new_i_size); + inode_update_flags = 0; + } - le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta); + if (i_sectors_delta) { + le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta); + inode_update_flags = 0; + } - inode->k.p.snapshot = iter.snapshot; + if (inode->k.p.snapshot != iter.snapshot) { + inode->k.p.snapshot = iter.snapshot; + inode_update_flags = 0; + } ret = bch2_trans_update(trans, &iter, &inode->k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + inode_update_flags); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -513,8 +535,6 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, if (ret) continue; - BUG_ON(bkey_ge(iter->pos, end_pos)); - bkey_init(&delete.k); delete.k.p = iter->pos; @@ -527,8 +547,6 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_disk_reservation_put(c, &disk_res); } - BUG_ON(bkey_gt(iter->pos, end_pos)); - return ret ?: ret2; } @@ -665,6 +683,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, bio_sectors(&n->bio)); bio_set_dev(&n->bio, ca->disk_sb.bdev); + + if (IS_ENABLED(CONFIG_BCACHEFS_NO_IO) && type != BCH_DATA_btree) { + bio_endio(&n->bio); + continue; + } + submit_bio(&n->bio); } else { n->bio.bi_status = BLK_STS_REMOVED; @@ -681,11 +705,12 @@ static void bch2_write_done(struct closure *cl) struct bch_fs *c = op->c; bch2_disk_reservation_put(c, &op->res); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_write); bch2_keylist_free(&op->insert_keys, op->inline_keys); bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); + EBUG_ON(cl->parent); closure_debug_destroy(cl); if (op->end_io) op->end_io(op); @@ -780,6 +805,30 @@ err: goto out; } +static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) +{ + if (state != wp->state) { + u64 now = 
ktime_get_ns(); + + if (wp->last_state_change && + time_after64(now, wp->last_state_change)) + wp->time[wp->state] += now - wp->last_state_change; + wp->state = state; + wp->last_state_change = now; + } +} + +static inline void wp_update_state(struct write_point *wp, bool running) +{ + enum write_point_state state; + + state = running ? WRITE_POINT_running : + !list_empty(&wp->writes) ? WRITE_POINT_waiting_io + : WRITE_POINT_stopped; + + __wp_update_state(wp, state); +} + static void bch2_write_index(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); @@ -787,6 +836,16 @@ static void bch2_write_index(struct closure *cl) struct workqueue_struct *wq = index_update_wq(op); barrier(); + + /* + * We're not using wp->writes_lock here, so this is racey: that's ok, + * because this is just for diagnostic purposes, and we're running out + * of interrupt context here so if we were to take the log we'd have to + * switch to spin_lock_irq()/irqsave(), which is not free: + */ + if (wp->state == WRITE_POINT_waiting_io) + __wp_update_state(wp, WRITE_POINT_waiting_work); + op->btree_update_ready = true; queue_work(wq, &wp->index_update_work); } @@ -799,16 +858,21 @@ void bch2_write_point_do_index_updates(struct work_struct *work) while (1) { spin_lock(&wp->writes_lock); - op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); - if (op && !op->btree_update_ready) - op = NULL; - if (op) - list_del(&op->wp_list); + list_for_each_entry(op, &wp->writes, wp_list) + if (op->btree_update_ready) { + list_del(&op->wp_list); + goto unlock; + } + op = NULL; +unlock: + wp_update_state(wp, op != NULL); spin_unlock(&wp->writes_lock); if (!op) break; + op->flags |= BCH_WRITE_IN_WORKER; + __bch2_write_index(op); if (!(op->flags & BCH_WRITE_DONE)) @@ -850,12 +914,10 @@ static void bch2_write_endio(struct bio *bio) if (wbio->put_bio) bio_put(bio); - if (parent) { + if (parent) bio_endio(&parent->bio); - return; - } - - closure_put(cl); + else + closure_put(cl); } static void init_append_extent(struct bch_write_op *op, @@ -863,7 +925,6 @@ static void init_append_extent(struct bch_write_op *op, struct bversion version, struct bch_extent_crc_unpacked crc) { - struct bch_fs *c = op->c; struct bkey_i_extent *e; op->pos.offset += crc.uncompressed_size; @@ -878,7 +939,7 @@ static void init_append_extent(struct bch_write_op *op, crc.nonce) bch2_extent_crc_append(&e->k_i, crc); - bch2_alloc_sectors_append_ptrs_inlined(c, wp, &e->k_i, crc.compressed_size, + bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, op->flags & BCH_WRITE_CACHED); bch2_keylist_push(&op->insert_keys); @@ -1360,8 +1421,6 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) bkey_start_pos(&orig->k), orig->k.p, BTREE_ITER_INTENT, k, NULL, NULL, BTREE_INSERT_NOFAIL, ({ - BUG_ON(bkey_ge(bkey_start_pos(k.k), orig->k.p)); - bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size); })); @@ -1641,10 +1700,11 @@ again: goto err; } + EBUG_ON(!wp); + + bch2_open_bucket_get(c, wp, &op->open_buckets); ret = bch2_write_extent(op, wp, &bio); - if (ret >= 0) - bch2_open_bucket_get(c, wp, &op->open_buckets); bch2_alloc_sectors_done_inlined(c, wp); err: if (ret <= 0) { @@ -1652,6 +1712,8 @@ err: spin_lock(&wp->writes_lock); op->wp = wp; list_add_tail(&op->wp_list, &wp->writes); + if (wp->state == WRITE_POINT_stopped) + __wp_update_state(wp, WRITE_POINT_waiting_io); spin_unlock(&wp->writes_lock); } @@ -1683,7 +1745,9 @@ err: * synchronously here if we 
weren't able to submit all of the IO at * once, as that signals backpressure to the caller. */ - if ((op->flags & BCH_WRITE_SYNC) || !(op->flags & BCH_WRITE_DONE)) { + if ((op->flags & BCH_WRITE_SYNC) || + (!(op->flags & BCH_WRITE_DONE) && + !(op->flags & BCH_WRITE_IN_WORKER))) { closure_sync(&op->cl); __bch2_write_index(op); @@ -1705,6 +1769,9 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) unsigned sectors; int ret; + op->flags |= BCH_WRITE_WROTE_DATA_INLINE; + op->flags |= BCH_WRITE_DONE; + bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, @@ -1732,9 +1799,6 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) set_bkey_val_bytes(&id->k, data_len); bch2_keylist_push(&op->insert_keys); - op->flags |= BCH_WRITE_WROTE_DATA_INLINE; - op->flags |= BCH_WRITE_DONE; - __bch2_write_index(op); err: bch2_write_done(&op->cl); @@ -1782,7 +1846,7 @@ void bch2_write(struct closure *cl) } if (c->opts.nochanges || - !percpu_ref_tryget_live(&c->writes)) { + !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { op->error = -BCH_ERR_erofs_no_writes; goto err; } @@ -1861,10 +1925,12 @@ static void promote_free(struct bch_fs *c, struct promote_op *op) { int ret; + bch2_data_update_exit(&op->write); + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, bch_promote_params); BUG_ON(ret); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); kfree_rcu(op, rcu); } @@ -1876,8 +1942,6 @@ static void promote_done(struct bch_write_op *wop) bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); - - bch2_data_update_exit(&op->write); promote_free(c, op); } @@ -1898,7 +1962,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) bch2_data_update_read_done(&op->write, rbio->pick.crc); } -static struct promote_op *__promote_alloc(struct bch_fs *c, +static struct promote_op *__promote_alloc(struct btree_trans *trans, enum btree_id btree_id, struct bkey_s_c k, struct bpos pos, @@ -1907,12 +1971,13 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, unsigned sectors, struct bch_read_bio **rbio) { + struct bch_fs *c = trans->c; struct promote_op *op = NULL; struct bio *bio; unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); int ret; - if (!percpu_ref_tryget_live(&c->writes)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) return NULL; op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); @@ -1950,7 +2015,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, bio = &op->write.op.wbio.bio; bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); - ret = bch2_data_update_init(c, &op->write, + ret = bch2_data_update_init(trans, NULL, &op->write, writepoint_hashed((unsigned long) current), opts, (struct data_update_opts) { @@ -1959,6 +2024,13 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, }, btree_id, k); + if (ret == -BCH_ERR_nocow_lock_blocked) { + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + goto err; + } + BUG_ON(ret); op->write.op.end_io = promote_done; @@ -1969,21 +2041,22 @@ err: kfree(*rbio); *rbio = NULL; kfree(op); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); return NULL; } noinline -static struct promote_op *promote_alloc(struct bch_fs *c, - struct bvec_iter iter, - struct bkey_s_c k, - struct extent_ptr_decoded *pick, - 
struct bch_io_opts opts, - unsigned flags, - struct bch_read_bio **rbio, - bool *bounce, - bool *read_full) +static struct promote_op *promote_alloc(struct btree_trans *trans, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_ptr_decoded *pick, + struct bch_io_opts opts, + unsigned flags, + struct bch_read_bio **rbio, + bool *bounce, + bool *read_full) { + struct bch_fs *c = trans->c; bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); /* data might have to be decompressed in the write path: */ unsigned sectors = promote_full @@ -1997,7 +2070,7 @@ static struct promote_op *promote_alloc(struct bch_fs *c, if (!should_promote(c, k, pos, opts, flags)) return NULL; - promote = __promote_alloc(c, + promote = __promote_alloc(trans, k.k->type == KEY_TYPE_reflink_v ? BTREE_ID_reflink : BTREE_ID_extents, @@ -2283,7 +2356,8 @@ static void __bch2_read_endio(struct work_struct *work) } csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); - if (bch2_crc_cmp(csum, rbio->pick.crc.csum)) + if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && + !IS_ENABLED(CONFIG_BCACHEFS_NO_IO)) goto csum_err; /* @@ -2604,7 +2678,7 @@ retry_pick: } if (orig->opts.promote_target) - promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, + promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, &rbio, &bounce, &read_full); if (!read_full) { @@ -2734,10 +2808,21 @@ get_bio: bio_sectors(&rbio->bio)); bio_set_dev(&rbio->bio, ca->disk_sb.bdev); - if (likely(!(flags & BCH_READ_IN_RETRY))) - submit_bio(&rbio->bio); - else - submit_bio_wait(&rbio->bio); + if (IS_ENABLED(CONFIG_BCACHEFS_NO_IO)) { + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } else { + if (likely(!(flags & BCH_READ_IN_RETRY))) + submit_bio(&rbio->bio); + else + submit_bio_wait(&rbio->bio); + } + + /* + * We just submitted IO which may block, we expect relock fail + * events and shouldn't count them: + */ + trans->notrace_relock_fail = true; } else { /* Attempting reconstruct read: */ if (bch2_ec_read_extent(c, rbio)) { diff --git a/libbcachefs/io.h b/libbcachefs/io.h index 68e4d764..166ad681 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -15,7 +15,11 @@ void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT void bch2_latency_acct(struct bch_dev *, u64, int); +#else +static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} +#endif void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *, bool); @@ -25,23 +29,41 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, const char *bch2_blk_status_to_str(blk_status_t); enum bch_write_flags { - BCH_WRITE_ALLOC_NOWAIT = (1 << 0), - BCH_WRITE_CACHED = (1 << 1), - BCH_WRITE_DATA_ENCODED = (1 << 2), - BCH_WRITE_PAGES_STABLE = (1 << 3), - BCH_WRITE_PAGES_OWNED = (1 << 4), - BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 5), - BCH_WRITE_WROTE_DATA_INLINE = (1 << 6), - BCH_WRITE_CHECK_ENOSPC = (1 << 7), - BCH_WRITE_SYNC = (1 << 8), - BCH_WRITE_MOVE = (1 << 9), - - /* Internal: */ - BCH_WRITE_DONE = (1 << 10), - BCH_WRITE_IO_ERROR = (1 << 11), - BCH_WRITE_CONVERT_UNWRITTEN = (1 << 12), + __BCH_WRITE_ALLOC_NOWAIT, + __BCH_WRITE_CACHED, + __BCH_WRITE_DATA_ENCODED, + __BCH_WRITE_PAGES_STABLE, + __BCH_WRITE_PAGES_OWNED, + __BCH_WRITE_ONLY_SPECIFIED_DEVS, + __BCH_WRITE_WROTE_DATA_INLINE, + __BCH_WRITE_FROM_INTERNAL, + __BCH_WRITE_CHECK_ENOSPC, + 
__BCH_WRITE_SYNC, + __BCH_WRITE_MOVE, + __BCH_WRITE_IN_WORKER, + __BCH_WRITE_DONE, + __BCH_WRITE_IO_ERROR, + __BCH_WRITE_CONVERT_UNWRITTEN, }; +#define BCH_WRITE_ALLOC_NOWAIT (1U << __BCH_WRITE_ALLOC_NOWAIT) +#define BCH_WRITE_CACHED (1U << __BCH_WRITE_CACHED) +#define BCH_WRITE_DATA_ENCODED (1U << __BCH_WRITE_DATA_ENCODED) +#define BCH_WRITE_PAGES_STABLE (1U << __BCH_WRITE_PAGES_STABLE) +#define BCH_WRITE_PAGES_OWNED (1U << __BCH_WRITE_PAGES_OWNED) +#define BCH_WRITE_ONLY_SPECIFIED_DEVS (1U << __BCH_WRITE_ONLY_SPECIFIED_DEVS) +#define BCH_WRITE_WROTE_DATA_INLINE (1U << __BCH_WRITE_WROTE_DATA_INLINE) +#define BCH_WRITE_FROM_INTERNAL (1U << __BCH_WRITE_FROM_INTERNAL) +#define BCH_WRITE_CHECK_ENOSPC (1U << __BCH_WRITE_CHECK_ENOSPC) +#define BCH_WRITE_SYNC (1U << __BCH_WRITE_SYNC) +#define BCH_WRITE_MOVE (1U << __BCH_WRITE_MOVE) + +/* Internal: */ +#define BCH_WRITE_IN_WORKER (1U << __BCH_WRITE_IN_WORKER) +#define BCH_WRITE_DONE (1U << __BCH_WRITE_DONE) +#define BCH_WRITE_IO_ERROR (1U << __BCH_WRITE_IO_ERROR) +#define BCH_WRITE_CONVERT_UNWRITTEN (1U << __BCH_WRITE_CONVERT_UNWRITTEN) + static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { return op->alloc_reserve == RESERVE_movinggc diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 1cbca187..6214a919 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -225,7 +225,7 @@ static int journal_entry_open(struct journal *j) if (!fifo_free(&j->pin)) return JOURNAL_ERR_journal_pin_full; - if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) - 1) + if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) return JOURNAL_ERR_max_in_flight; BUG_ON(!j->cur_entry_sectors); diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index ee37f90a..024cea9f 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -110,7 +110,6 @@ */ #include <linux/hash.h> -#include <linux/prefetch.h> #include "journal_types.h" @@ -305,26 +304,15 @@ static inline int journal_res_get_fast(struct journal *j, { union journal_res_state old, new; u64 v = atomic64_read(&j->reservations.counter); - unsigned u64s, offset; do { old.v = new.v = v; - /* - * Round up the end of the journal reservation to the next - * cacheline boundary: - */ - u64s = res->u64s; - offset = sizeof(struct jset) / sizeof(u64) + - new.cur_entry_offset + u64s; - u64s += ((offset - 1) & ((SMP_CACHE_BYTES / sizeof(u64)) - 1)) + 1; - - /* * Check if there is still room in the current journal * entry: */ - if (new.cur_entry_offset + u64s > j->cur_entry_u64s) + if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) return 0; EBUG_ON(!journal_state_count(new, new.idx)); @@ -332,7 +320,7 @@ static inline int journal_res_get_fast(struct journal *j, if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) return 0; - new.cur_entry_offset += u64s; + new.cur_entry_offset += res->u64s; journal_state_inc(&new); /* @@ -349,15 +337,8 @@ static inline int journal_res_get_fast(struct journal *j, res->ref = true; res->idx = old.idx; - res->u64s = u64s; res->offset = old.cur_entry_offset; res->seq = le64_to_cpu(j->buf[old.idx].data->seq); - - offset = res->offset; - while (offset < res->offset + res->u64s) { - prefetchw(vstruct_idx(j->buf[res->idx].data, offset)); - offset += SMP_CACHE_BYTES / sizeof(u64); - } return 1; } diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index d6f25934..f6374a2b 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -154,7 +154,7 @@ replace: i->nr_ptrs = 0; i->csum_good = entry_ptr.csum_good; 
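/*
 * (Editorial aside, not part of the patch.)  The BCH_WRITE_* rework a few
 * hunks above splits each flag into a bit index (__BCH_WRITE_*) plus a
 * derived (1U << __BCH_WRITE_*) mask.  One plausible motivation -- an
 * assumption on my part, the patch doesn't say -- is that a bit index can
 * also index a table of flag names, e.g. for the bch2_prt_bitflags() helper
 * added elsewhere in this patch.  A minimal, self-contained sketch of that
 * pattern, using purely hypothetical names:
 */
#include <stdio.h>

enum { __EX_FLAG_sync, __EX_FLAG_cached, __EX_FLAG_NR };

#define EX_FLAG_sync	(1U << __EX_FLAG_sync)
#define EX_FLAG_cached	(1U << __EX_FLAG_cached)

static const char * const ex_flag_names[] = { "sync", "cached" };

static void ex_print_flags(unsigned flags)
{
	/* walk set bits and print the name stored at the matching index */
	for (unsigned i = 0; i < __EX_FLAG_NR; i++)
		if (flags & (1U << i))
			printf("%s ", ex_flag_names[i]);
	printf("\n");
}

int main(void)
{
	ex_print_flags(EX_FLAG_sync | EX_FLAG_cached);	/* prints "sync cached" */
	return 0;
}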
i->ignore = false; - memcpy(&i->j, j, bytes); + unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); i->ptrs[i->nr_ptrs++] = entry_ptr; if (dup) { @@ -341,7 +341,7 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, int ret = journal_validate_key(c, jset, entry, entry->level, entry->btree_id, - k, version, big_endian, write); + k, version, big_endian, write|BKEY_INVALID_FROM_JOURNAL); if (ret == FSCK_DELETED_KEY) continue; @@ -662,7 +662,8 @@ static int journal_entry_overwrite_validate(struct bch_fs *c, struct jset_entry *entry, unsigned version, int big_endian, int write) { - return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, write); + return journal_entry_btree_keys_validate(c, jset, entry, + version, big_endian, READ); } static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, @@ -1498,6 +1499,8 @@ static void journal_write_done(struct closure *cl) bch2_do_discards(c); closure_wake_up(&c->freelist_wait); + + bch2_reset_alloc_cursors(c); } } else if (!j->err_seq || seq < j->err_seq) j->err_seq = seq; diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index e873ce2a..8744581d 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -347,13 +347,13 @@ void bch2_journal_pin_put(struct journal *j, u64 seq) } } -static inline void __journal_pin_drop(struct journal *j, +static inline bool __journal_pin_drop(struct journal *j, struct journal_entry_pin *pin) { struct journal_entry_pin_list *pin_list; if (!journal_pin_active(pin)) - return; + return false; if (j->flush_in_progress == pin) j->flush_in_progress_dropped = true; @@ -363,19 +363,19 @@ static inline void __journal_pin_drop(struct journal *j, list_del_init(&pin->list); /* - * Unpinning a journal entry may make journal_next_bucket() succeed if + * Unpinning a journal entry may make journal_next_bucket() succeed, if * writing a new last_seq will now make another bucket available: */ - if (atomic_dec_and_test(&pin_list->count) && - pin_list == &fifo_peek_front(&j->pin)) - bch2_journal_reclaim_fast(j); + return atomic_dec_and_test(&pin_list->count) && + pin_list == &fifo_peek_front(&j->pin); } void bch2_journal_pin_drop(struct journal *j, struct journal_entry_pin *pin) { spin_lock(&j->lock); - __journal_pin_drop(j, pin); + if (__journal_pin_drop(j, pin)) + bch2_journal_reclaim_fast(j); spin_unlock(&j->lock); } @@ -384,6 +384,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, journal_pin_flush_fn flush_fn) { struct journal_entry_pin_list *pin_list; + bool reclaim; spin_lock(&j->lock); @@ -400,7 +401,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, pin_list = journal_seq_pin(j, seq); - __journal_pin_drop(j, pin); + reclaim = __journal_pin_drop(j, pin); atomic_inc(&pin_list->count); pin->seq = seq; @@ -412,6 +413,9 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, list_add(&pin->list, &pin_list->list); else list_add(&pin->list, &pin_list->flushed); + + if (reclaim) + bch2_journal_reclaim_fast(j); spin_unlock(&j->lock); /* @@ -703,7 +707,7 @@ static int bch2_journal_reclaim_thread(void *arg) j->next_reclaim = now + delay; while (1) { - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); if (kthread_should_stop()) break; if (j->reclaim_kicked) @@ -714,9 +718,9 @@ static int bch2_journal_reclaim_thread(void *arg) spin_unlock(&j->lock); if (journal_empty) - freezable_schedule(); + schedule(); else if (time_after(j->next_reclaim, jiffies)) -
freezable_schedule_timeout(j->next_reclaim - jiffies); + schedule_timeout(j->next_reclaim - jiffies); else break; } diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index c8729cb3..0e6bde66 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -182,29 +182,32 @@ typedef DARRAY(u64) darray_u64; /* Embedded in struct bch_fs */ struct journal { /* Fastpath stuff up front: */ - - unsigned long flags; + struct { union journal_res_state reservations; enum journal_watermark watermark; + union journal_preres_state prereserved; + + } __aligned(SMP_CACHE_BYTES); + + unsigned long flags; + /* Max size of current journal entry */ unsigned cur_entry_u64s; unsigned cur_entry_sectors; + /* Reserved space in journal entry to be used just prior to write */ + unsigned entry_u64s_reserved; + + /* * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if * insufficient devices: */ enum journal_errors cur_entry_error; - union journal_preres_state prereserved; - - /* Reserved space in journal entry to be used just prior to write */ - unsigned entry_u64s_reserved; - unsigned buf_size_want; - /* * We may queue up some things to be journalled (log messages) before * the journal has actually started - stash them here: @@ -298,15 +301,15 @@ struct journal { u64 nr_flush_writes; u64 nr_noflush_writes; - struct time_stats *flush_write_time; - struct time_stats *noflush_write_time; - struct time_stats *blocked_time; - struct time_stats *flush_seq_time; + struct bch2_time_stats *flush_write_time; + struct bch2_time_stats *noflush_write_time; + struct bch2_time_stats *blocked_time; + struct bch2_time_stats *flush_seq_time; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map res_map; #endif -}; +} __aligned(SMP_CACHE_BYTES); /* * Embedded in struct bch_dev. 
First three fields refer to the array of journal diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c index e542cd3d..07fb41ca 100644 --- a/libbcachefs/lru.c +++ b/libbcachefs/lru.c @@ -10,7 +10,7 @@ /* KEY_TYPE_lru is obsolete: */ int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { const struct bch_lru *lru = bkey_s_c_to_lru(k).v; @@ -20,6 +20,12 @@ int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } + if (!lru_pos_time(k.k->p)) { + prt_printf(err, "lru entry at time=0"); + return -BCH_ERR_invalid_bkey; + + } + return 0; } @@ -31,6 +37,15 @@ void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "idx %llu", le64_to_cpu(lru->idx)); } +void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru) +{ + prt_printf(out, "%llu:%llu -> %llu:%llu", + lru_pos_id(lru), + lru_pos_time(lru), + u64_to_bucket(lru.offset).inode, + u64_to_bucket(lru.offset).offset); +} + static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time, unsigned key_type) { diff --git a/libbcachefs/lru.h b/libbcachefs/lru.h index 2e22f139..b8d9848c 100644 --- a/libbcachefs/lru.h +++ b/libbcachefs/lru.h @@ -22,9 +22,11 @@ static inline u64 lru_pos_time(struct bpos pos) return pos.inode & ~(~0ULL << LRU_TIME_BITS); } -int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_lru_pos_to_text(struct printbuf *, struct bpos); + #define bch2_bkey_ops_lru ((struct bkey_ops) { \ .key_invalid = bch2_lru_invalid, \ .val_to_text = bch2_lru_to_text, \ diff --git a/libbcachefs/move.c b/libbcachefs/move.c index b308354a..8321563d 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -61,7 +61,7 @@ static void move_free(struct moving_io *io) bch2_data_update_exit(&io->write); wake_up(&ctxt->wait); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_move); kfree(io); } @@ -74,6 +74,7 @@ static void move_write_done(struct bch_write_op *op) ctxt->write_error = true; atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); + atomic_dec(&io->write.ctxt->write_ios); move_free(io); closure_put(&ctxt->cl); } @@ -87,11 +88,12 @@ static void move_write(struct moving_io *io) closure_get(&io->write.ctxt->cl); atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); + atomic_inc(&io->write.ctxt->write_ios); bch2_data_update_read_done(&io->write, io->rbio.pick.crc); } -static inline struct moving_io *next_pending_write(struct moving_context *ctxt) +struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) { struct moving_io *io = list_first_entry_or_null(&ctxt->reads, struct moving_io, list); @@ -105,35 +107,27 @@ static void move_read_endio(struct bio *bio) struct moving_context *ctxt = io->write.ctxt; atomic_sub(io->read_sectors, &ctxt->read_sectors); + atomic_dec(&ctxt->read_ios); io->read_completed = true; wake_up(&ctxt->wait); closure_put(&ctxt->cl); } -static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *trans) +void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt, + struct btree_trans *trans) { struct moving_io *io; if (trans) bch2_trans_unlock(trans); - while ((io = next_pending_write(ctxt))) { + while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) { 
list_del(&io->list); move_write(io); } } -#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ -do { \ - do_pending_writes(_ctxt, _trans); \ - \ - if (_cond) \ - break; \ - __wait_event((_ctxt)->wait, \ - next_pending_write(_ctxt) || (_cond)); \ -} while (1) - static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, struct btree_trans *trans) { @@ -148,7 +142,11 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt) { move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); closure_sync(&ctxt->cl); + EBUG_ON(atomic_read(&ctxt->write_sectors)); + EBUG_ON(atomic_read(&ctxt->write_ios)); + EBUG_ON(atomic_read(&ctxt->read_sectors)); + EBUG_ON(atomic_read(&ctxt->read_ios)); if (ctxt->stats) { progress_list_del(ctxt->c, ctxt->stats); @@ -257,7 +255,7 @@ static int bch2_move_extent(struct btree_trans *trans, return 0; } - if (!percpu_ref_tryget_live(&c->writes)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_move)) return -BCH_ERR_erofs_no_writes; /* @@ -299,8 +297,8 @@ static int bch2_move_extent(struct btree_trans *trans, io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); io->rbio.bio.bi_end_io = move_read_endio; - ret = bch2_data_update_init(c, &io->write, ctxt->wp, io_opts, - data_opts, btree_id, k); + ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp, + io_opts, data_opts, btree_id, k); if (ret && ret != -BCH_ERR_unwritten_extent_update) goto err_free_pages; @@ -323,6 +321,7 @@ static int bch2_move_extent(struct btree_trans *trans, trace_move_extent_read(k.k); atomic_add(io->read_sectors, &ctxt->read_sectors); + atomic_inc(&ctxt->read_ios); list_add_tail(&io->list, &ctxt->reads); /* @@ -341,7 +340,7 @@ err_free_pages: err_free: kfree(io); err: - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_move); trace_and_count(c, move_extent_alloc_mem_fail, k.k); return ret; } @@ -412,13 +411,15 @@ static int move_ratelimit(struct btree_trans *trans, } } while (delay); + /* + * XXX: these limits really ought to be per device, SSDs and hard drives + * will want different limits + */ move_ctxt_wait_event(ctxt, trans, - atomic_read(&ctxt->write_sectors) < - c->opts.move_bytes_in_flight >> 9); - - move_ctxt_wait_event(ctxt, trans, - atomic_read(&ctxt->read_sectors) < - c->opts.move_bytes_in_flight >> 9); + atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 && + atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 && + atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight && + atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight); return 0; } diff --git a/libbcachefs/move.h b/libbcachefs/move.h index b14f679f..aef61380 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -24,10 +24,26 @@ struct moving_context { /* in flight sectors: */ atomic_t read_sectors; atomic_t write_sectors; + atomic_t read_ios; + atomic_t write_ios; wait_queue_head_t wait; }; +#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ +do { \ + bool cond_finished = false; \ + bch2_moving_ctxt_do_pending_writes(_ctxt, _trans); \ + \ + if (_cond) \ + break; \ + __wait_event((_ctxt)->wait, \ + bch2_moving_ctxt_next_pending_write(_ctxt) || \ + (cond_finished = (_cond))); \ + if (cond_finished) \ + break; \ +} while (1) + typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, struct bch_io_opts *, struct data_update_opts *); @@ -35,6 +51,9 @@ void bch2_moving_ctxt_exit(struct moving_context *); void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *, struct bch_ratelimit *, struct bch_move_stats *, struct 
write_point_specifier, bool); +struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *); +void bch2_moving_ctxt_do_pending_writes(struct moving_context *, + struct btree_trans *); int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); diff --git a/libbcachefs/nocow_locking.c b/libbcachefs/nocow_locking.c index bff62671..396357cd 100644 --- a/libbcachefs/nocow_locking.c +++ b/libbcachefs/nocow_locking.c @@ -18,6 +18,8 @@ bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos return false; } +#define sign(v) (v < 0 ? -1 : v > 0 ? 1 : 0) + void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags) { u64 dev_bucket = bucket_to_u64(bucket); @@ -27,6 +29,8 @@ void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos buc for (i = 0; i < ARRAY_SIZE(l->b); i++) if (l->b[i] == dev_bucket) { + BUG_ON(sign(atomic_read(&l->l[i])) != lock_val); + if (!atomic_sub_return(lock_val, &l->l[i])) closure_wake_up(&l->wait); return; @@ -35,8 +39,8 @@ void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos buc BUG(); } -static bool bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l, - u64 dev_bucket, int flags) +bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l, + u64 dev_bucket, int flags) { int v, lock_val = flags ? 1 : -1; unsigned i; @@ -69,11 +73,11 @@ void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, struct nocow_lock_bucket *l, u64 dev_bucket, int flags) { - if (!bch2_bucket_nocow_trylock(l, dev_bucket, flags)) { + if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) { struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks); u64 start_time = local_clock(); - __closure_wait_event(&l->wait, bch2_bucket_nocow_trylock(l, dev_bucket, flags)); + __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags)); bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); } } diff --git a/libbcachefs/nocow_locking.h b/libbcachefs/nocow_locking.h index 45258cc3..ff8e4af5 100644 --- a/libbcachefs/nocow_locking.h +++ b/libbcachefs/nocow_locking.h @@ -20,6 +20,7 @@ static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lo bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos); void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int); +bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int); void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *, struct nocow_lock_bucket *, u64, int); @@ -32,6 +33,15 @@ static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, __bch2_bucket_nocow_lock(t, l, dev_bucket, flags); } +static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t, + struct bpos bucket, int flags) +{ + u64 dev_bucket = bucket_to_u64(bucket); + struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); + + return __bch2_bucket_nocow_trylock(l, dev_bucket, flags); +} + void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *); int bch2_fs_nocow_locking_init(struct bch_fs *); diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index 555e6373..04e2989c 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -9,8 +9,6 @@ #include "super-io.h" #include "util.h" -#include <linux/pretty-printers.h> - #define x(t, n) [n] = #t, const char * const bch2_metadata_versions[] = { @@ -284,7 +282,7 @@ int bch2_opt_parse(struct bch_fs *c, if (ret < 0) 
{ if (err) prt_printf(err, "%s: must be a number", - opt->attr.name); + opt->attr.name); return ret; } break; @@ -293,7 +291,7 @@ int bch2_opt_parse(struct bch_fs *c, if (ret < 0) { if (err) prt_printf(err, "%s: invalid selection", - opt->attr.name); + opt->attr.name); return ret; } @@ -307,7 +305,7 @@ int bch2_opt_parse(struct bch_fs *c, if (ret < 0) { if (err) prt_printf(err, "%s: parse error", - opt->attr.name); + opt->attr.name); return ret; } } diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index ee3055cf..9d4bbec7 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -294,7 +294,12 @@ enum opt_type { OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(1024, U32_MAX), \ BCH2_NO_SB_OPT, 1U << 20, \ - NULL, "Amount of IO in flight to keep in flight by the move path")\ + NULL, "Maximum Amount of IO to keep in flight by the move path")\ + x(move_ios_in_flight, u32, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, 1024), \ + BCH2_NO_SB_OPT, 32, \ + NULL, "Maximum number of IOs to keep in flight by the move path")\ x(fsck, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ @@ -336,6 +341,11 @@ enum opt_type { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Only read the journal, skip the rest of recovery")\ + x(journal_transaction_names, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ + NULL, "Log transaction function names in journal") \ x(noexcl, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/linux/printbuf.c b/libbcachefs/printbuf.c similarity index 72% rename from linux/printbuf.c rename to libbcachefs/printbuf.c index 5cf79d43..c41daa18 100644 --- a/linux/printbuf.c +++ b/libbcachefs/printbuf.c @@ -4,16 +4,17 @@ #include <linux/err.h> #include <linux/export.h> #include <linux/kernel.h> -#include <linux/printbuf.h> #include <linux/slab.h> #include <linux/string_helpers.h> +#include "printbuf.h" + static inline unsigned printbuf_linelen(struct printbuf *buf) { return buf->pos - buf->last_newline; } -int printbuf_make_room(struct printbuf *out, unsigned extra) +int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) { unsigned new_size; char *buf; @@ -44,13 +45,46 @@ int printbuf_make_room(struct printbuf *out, unsigned extra) out->size = new_size; return 0; } -EXPORT_SYMBOL(printbuf_make_room); + +void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) +{ + int len; + + do { + va_list args2; + + va_copy(args2, args); + len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2); + } while (len + 1 >= printbuf_remaining(out) && + !bch2_printbuf_make_room(out, len + 1)); + + len = min_t(size_t, len, + printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); + out->pos += len; +} + +void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) +{ + va_list args; + int len; + + do { + va_start(args, fmt); + len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args); + va_end(args); + } while (len + 1 >= printbuf_remaining(out) && + !bch2_printbuf_make_room(out, len + 1)); + + len = min_t(size_t, len, + printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); + out->pos += len; +} /** * printbuf_str - returns printbuf's buf as a C string, guaranteed to be null * terminated */ -const char *printbuf_str(const struct printbuf *buf) +const char *bch2_printbuf_str(const struct printbuf *buf) { /* * If we've written to a printbuf then it's guaranteed to be a null @@ -61,33 +95,29 @@ const char *printbuf_str(const struct printbuf *buf) ? 
buf->buf : ""; } -EXPORT_SYMBOL(printbuf_str); /** * printbuf_exit - exit a printbuf, freeing memory it owns and poisoning it * against accidental use. */ -void printbuf_exit(struct printbuf *buf) +void bch2_printbuf_exit(struct printbuf *buf) { if (buf->heap_allocated) { kfree(buf->buf); buf->buf = ERR_PTR(-EINTR); /* poison value */ } } -EXPORT_SYMBOL(printbuf_exit); -void printbuf_tabstops_reset(struct printbuf *buf) +void bch2_printbuf_tabstops_reset(struct printbuf *buf) { buf->nr_tabstops = 0; } -EXPORT_SYMBOL(printbuf_tabstops_reset); -void printbuf_tabstop_pop(struct printbuf *buf) +void bch2_printbuf_tabstop_pop(struct printbuf *buf) { if (buf->nr_tabstops) --buf->nr_tabstops; } -EXPORT_SYMBOL(printbuf_tabstop_pop); /* * printbuf_tabstop_set - add a tabstop, n spaces from the previous tabstop @@ -99,7 +129,7 @@ EXPORT_SYMBOL(printbuf_tabstop_pop); * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start * of line. */ -int printbuf_tabstop_push(struct printbuf *buf, unsigned spaces) +int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces) { unsigned prev_tabstop = buf->nr_tabstops ? buf->_tabstops[buf->nr_tabstops - 1] @@ -112,7 +142,6 @@ int printbuf_tabstop_push(struct printbuf *buf, unsigned spaces) buf->has_indent_or_tabstops = true; return 0; } -EXPORT_SYMBOL(printbuf_tabstop_push); /** * printbuf_indent_add - add to the current indent level @@ -123,7 +152,7 @@ EXPORT_SYMBOL(printbuf_tabstop_push); * Subsequent lines, and the current line if the output position is at the start * of the current line, will be indented by @spaces more spaces. */ -void printbuf_indent_add(struct printbuf *buf, unsigned spaces) +void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces) { if (WARN_ON_ONCE(buf->indent + spaces < buf->indent)) spaces = 0; @@ -133,7 +162,6 @@ void printbuf_indent_add(struct printbuf *buf, unsigned spaces) buf->has_indent_or_tabstops = true; } -EXPORT_SYMBOL(printbuf_indent_add); /** * printbuf_indent_sub - subtract from the current indent level @@ -144,7 +172,7 @@ EXPORT_SYMBOL(printbuf_indent_add); * Subsequent lines, and the current line if the output position is at the start * of the current line, will be indented by @spaces less spaces. */ -void printbuf_indent_sub(struct printbuf *buf, unsigned spaces) +void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces) { if (WARN_ON_ONCE(spaces > buf->indent)) spaces = buf->indent; @@ -158,13 +186,12 @@ void printbuf_indent_sub(struct printbuf *buf, unsigned spaces) if (!buf->indent && !buf->nr_tabstops) buf->has_indent_or_tabstops = false; } -EXPORT_SYMBOL(printbuf_indent_sub); -void prt_newline(struct printbuf *buf) +void bch2_prt_newline(struct printbuf *buf) { unsigned i; - printbuf_make_room(buf, 1 + buf->indent); + bch2_printbuf_make_room(buf, 1 + buf->indent); __prt_char(buf, '\n'); @@ -178,7 +205,6 @@ void prt_newline(struct printbuf *buf) buf->last_field = buf->pos; buf->cur_tabstop = 0; } -EXPORT_SYMBOL(prt_newline); /* * Returns spaces from start of line, if set, or 0 if unset: @@ -207,14 +233,13 @@ static void __prt_tab(struct printbuf *out) * * Advance output to the next tabstop by printing spaces. 
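 *
 * (Editorial usage sketch, not part of the patch; it assumes the bch2_-prefixed
 * printbuf API declared in printbuf.h below, plus the prt_str()/prt_printf()
 * helpers used throughout this diff, and the output column is illustrative.)
 *
 *	struct printbuf buf = PRINTBUF;
 *
 *	bch2_printbuf_tabstop_push(&buf, 16);	// first tabstop, 16 chars in
 *	prt_str(&buf, "free:");
 *	bch2_prt_tab(&buf);			// pad with spaces to column 16
 *	prt_printf(&buf, "%u", 42);		// "42" starts at column 16
 *	bch2_prt_newline(&buf);
 *	bch2_printbuf_exit(&buf);		// frees the heap-allocated buffer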
*/ -void prt_tab(struct printbuf *out) +void bch2_prt_tab(struct printbuf *out) { if (WARN_ON(!cur_tabstop(out))) return; __prt_tab(out); } -EXPORT_SYMBOL(prt_tab); static void __prt_tab_rjust(struct printbuf *buf) { @@ -222,7 +247,7 @@ static void __prt_tab_rjust(struct printbuf *buf) int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf); if (pad > 0) { - printbuf_make_room(buf, pad); + bch2_printbuf_make_room(buf, pad); if (buf->last_field + pad < buf->size) memmove(buf->buf + buf->last_field + pad, @@ -250,14 +275,13 @@ static void __prt_tab_rjust(struct printbuf *buf) * Advance output to the next tabstop by inserting spaces immediately after the * previous tabstop, right justifying previously outputted text. */ -void prt_tab_rjust(struct printbuf *buf) +void bch2_prt_tab_rjust(struct printbuf *buf) { if (WARN_ON(!cur_tabstop(buf))) return; __prt_tab_rjust(buf); } -EXPORT_SYMBOL(prt_tab_rjust); /** * prt_bytes_indented - Print an array of chars, handling embedded control characters @@ -271,7 +295,7 @@ EXPORT_SYMBOL(prt_tab_rjust); * \t: prt_tab advance to next tabstop * \r: prt_tab_rjust advance to next tabstop, with right justification */ -void prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) +void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) { const char *unprinted_start = str; const char *end = str + count; @@ -286,7 +310,7 @@ void prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) case '\n': prt_bytes(out, unprinted_start, str - unprinted_start); unprinted_start = str + 1; - prt_newline(out); + bch2_prt_newline(out); break; case '\t': if (likely(cur_tabstop(out))) { @@ -309,34 +333,31 @@ void prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) prt_bytes(out, unprinted_start, str - unprinted_start); } -EXPORT_SYMBOL(prt_bytes_indented); /** * prt_human_readable_u64 - Print out a u64 in human readable units * * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units */ -void prt_human_readable_u64(struct printbuf *buf, u64 v) +void bch2_prt_human_readable_u64(struct printbuf *buf, u64 v) { - printbuf_make_room(buf, 10); + bch2_printbuf_make_room(buf, 10); buf->pos += string_get_size(v, 1, !buf->si_units, buf->buf + buf->pos, printbuf_remaining_size(buf)); } -EXPORT_SYMBOL(prt_human_readable_u64); /** * prt_human_readable_s64 - Print out a s64 in human readable units * * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units */ -void prt_human_readable_s64(struct printbuf *buf, s64 v) +void bch2_prt_human_readable_s64(struct printbuf *buf, s64 v) { if (v < 0) prt_char(buf, '-'); - prt_human_readable_u64(buf, abs(v)); + bch2_prt_human_readable_u64(buf, abs(v)); } -EXPORT_SYMBOL(prt_human_readable_s64); /** * prt_units_u64 - Print out a u64 according to printbuf unit options @@ -344,14 +365,13 @@ EXPORT_SYMBOL(prt_human_readable_s64); * Units are either raw (default), or human reabable units (controlled via * @buf->human_readable_units) */ -void prt_units_u64(struct printbuf *out, u64 v) +void bch2_prt_units_u64(struct printbuf *out, u64 v) { if (out->human_readable_units) - prt_human_readable_u64(out, v); + bch2_prt_human_readable_u64(out, v); else - prt_printf(out, "%llu", v); + bch2_prt_printf(out, "%llu", v); } -EXPORT_SYMBOL(prt_units_u64); /** * prt_units_s64 - Print out a s64 according to printbuf unit options @@ -359,10 +379,37 @@ EXPORT_SYMBOL(prt_units_u64); * Units are either raw (default), or human reabable units (controlled via * 
@buf->human_readable_units) */ -void prt_units_s64(struct printbuf *out, s64 v) +void bch2_prt_units_s64(struct printbuf *out, s64 v) { if (v < 0) prt_char(out, '-'); - prt_units_u64(out, abs(v)); + bch2_prt_units_u64(out, abs(v)); +} + +void bch2_prt_string_option(struct printbuf *out, + const char * const list[], + size_t selected) +{ + size_t i; + + for (i = 0; list[i]; i++) + bch2_prt_printf(out, i == selected ? "[%s] " : "%s ", list[i]); +} + +void bch2_prt_bitflags(struct printbuf *out, + const char * const list[], u64 flags) +{ + unsigned bit, nr = 0; + bool first = true; + + while (list[nr]) + nr++; + + while (flags && (bit = __ffs(flags)) < nr) { + if (!first) + bch2_prt_printf(out, ","); + first = false; + bch2_prt_printf(out, "%s", list[bit]); + flags ^= 1 << bit; + } } -EXPORT_SYMBOL(prt_units_s64); diff --git a/include/linux/printbuf.h b/libbcachefs/printbuf.h similarity index 76% rename from include/linux/printbuf.h rename to libbcachefs/printbuf.h index 24e62e56..2e993995 100644 --- a/include/linux/printbuf.h +++ b/libbcachefs/printbuf.h @@ -1,8 +1,8 @@ /* SPDX-License-Identifier: LGPL-2.1+ */ /* Copyright (C) 2022 Kent Overstreet */ -#ifndef _LINUX_PRINTBUF_H -#define _LINUX_PRINTBUF_H +#ifndef _BCACHEFS_PRINTBUF_H +#define _BCACHEFS_PRINTBUF_H /* * Printbufs: Simple strings for printing to, with optional heap allocation @@ -100,26 +100,30 @@ struct printbuf { u8 _tabstops[PRINTBUF_INLINE_TABSTOPS]; }; -int printbuf_make_room(struct printbuf *, unsigned); -const char *printbuf_str(const struct printbuf *); -void printbuf_exit(struct printbuf *); +int bch2_printbuf_make_room(struct printbuf *, unsigned); +__printf(2, 3) void bch2_prt_printf(struct printbuf *out, const char *fmt, ...); +__printf(2, 0) void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list); +const char *bch2_printbuf_str(const struct printbuf *); +void bch2_printbuf_exit(struct printbuf *); -void printbuf_tabstops_reset(struct printbuf *); -void printbuf_tabstop_pop(struct printbuf *); -int printbuf_tabstop_push(struct printbuf *, unsigned); +void bch2_printbuf_tabstops_reset(struct printbuf *); +void bch2_printbuf_tabstop_pop(struct printbuf *); +int bch2_printbuf_tabstop_push(struct printbuf *, unsigned); -void printbuf_indent_add(struct printbuf *, unsigned); -void printbuf_indent_sub(struct printbuf *, unsigned); +void bch2_printbuf_indent_add(struct printbuf *, unsigned); +void bch2_printbuf_indent_sub(struct printbuf *, unsigned); -void prt_newline(struct printbuf *); -void prt_tab(struct printbuf *); -void prt_tab_rjust(struct printbuf *); +void bch2_prt_newline(struct printbuf *); +void bch2_prt_tab(struct printbuf *); +void bch2_prt_tab_rjust(struct printbuf *); -void prt_bytes_indented(struct printbuf *, const char *, unsigned); -void prt_human_readable_u64(struct printbuf *, u64); -void prt_human_readable_s64(struct printbuf *, s64); -void prt_units_u64(struct printbuf *, u64); -void prt_units_s64(struct printbuf *, s64); +void bch2_prt_bytes_indented(struct printbuf *, const char *, unsigned); +void bch2_prt_human_readable_u64(struct printbuf *, u64); +void bch2_prt_human_readable_s64(struct printbuf *, s64); +void bch2_prt_units_u64(struct printbuf *, u64); +void bch2_prt_units_s64(struct printbuf *, s64); +void bch2_prt_string_option(struct printbuf *, const char * const[], size_t); +void bch2_prt_bitflags(struct printbuf *, const char * const[], u64); /* Initializer for a heap allocated printbuf: */ #define PRINTBUF ((struct printbuf) { .heap_allocated = true }) @@ -163,7 +167,7 
@@ static inline bool printbuf_overflowed(struct printbuf *out) static inline void printbuf_nul_terminate(struct printbuf *out) { - printbuf_make_room(out, 1); + bch2_printbuf_make_room(out, 1); if (out->pos < out->size) out->buf[out->pos] = 0; @@ -171,7 +175,7 @@ static inline void printbuf_nul_terminate(struct printbuf *out) out->buf[out->size - 1] = 0; } -/* Doesn't call printbuf_make_room(), doesn't nul terminate: */ +/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */ static inline void __prt_char_reserved(struct printbuf *out, char c) { if (printbuf_remaining(out)) @@ -182,7 +186,7 @@ static inline void __prt_char_reserved(struct printbuf *out, char c) /* Doesn't nul terminate: */ static inline void __prt_char(struct printbuf *out, char c) { - printbuf_make_room(out, 1); + bch2_printbuf_make_room(out, 1); __prt_char_reserved(out, c); } @@ -203,7 +207,7 @@ static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n static inline void prt_chars(struct printbuf *out, char c, unsigned n) { - printbuf_make_room(out, n); + bch2_printbuf_make_room(out, n); __prt_chars_reserved(out, c, n); printbuf_nul_terminate(out); } @@ -212,7 +216,7 @@ static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n) { unsigned i, can_print; - printbuf_make_room(out, n); + bch2_printbuf_make_room(out, n); can_print = min(n, printbuf_remaining(out)); @@ -230,12 +234,12 @@ static inline void prt_str(struct printbuf *out, const char *str) static inline void prt_str_indented(struct printbuf *out, const char *str) { - prt_bytes_indented(out, str, strlen(str)); + bch2_prt_bytes_indented(out, str, strlen(str)); } static inline void prt_hex_byte(struct printbuf *out, u8 byte) { - printbuf_make_room(out, 2); + bch2_printbuf_make_room(out, 2); __prt_char_reserved(out, hex_asc_hi(byte)); __prt_char_reserved(out, hex_asc_lo(byte)); printbuf_nul_terminate(out); @@ -243,7 +247,7 @@ static inline void prt_hex_byte(struct printbuf *out, u8 byte) static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte) { - printbuf_make_room(out, 2); + bch2_printbuf_make_room(out, 2); __prt_char_reserved(out, hex_asc_upper_hi(byte)); __prt_char_reserved(out, hex_asc_upper_lo(byte)); printbuf_nul_terminate(out); @@ -277,30 +281,4 @@ static inline void printbuf_atomic_dec(struct printbuf *buf) buf->atomic--; } -/* - * This is used for the %pf(%p) sprintf format extension, where we pass a pretty - * printer and arguments to the pretty-printer to sprintf - * - * Instead of passing a pretty-printer function to sprintf directly, we pass it - * a pointer to a struct call_pp, so that sprintf can check that the magic - * number is present, which in turn ensures that the CALL_PP() macro has been - * used in order to typecheck the arguments to the pretty printer function - * - * Example usage: - * sprintf("%pf(%p)", CALL_PP(prt_bdev, bdev)); - */ -struct call_pp { - unsigned long magic; - void *fn; -}; - -#define PP_TYPECHECK(fn, ...) \ - ({ while (0) fn((struct printbuf *) NULL, ##__VA_ARGS__); }) - -#define CALL_PP_MAGIC (unsigned long) 0xce0b92d22f6b6be4 - -#define CALL_PP(fn, ...) 
\ - (PP_TYPECHECK(fn, ##__VA_ARGS__), \ - &((struct call_pp) { CALL_PP_MAGIC, fn })), ##__VA_ARGS__ - -#endif /* _LINUX_PRINTBUF_H */ +#endif /* _BCACHEFS_PRINTBUF_H */ diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c index 4b663f32..331f2283 100644 --- a/libbcachefs/quota.c +++ b/libbcachefs/quota.c @@ -59,7 +59,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { }; int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (k.k->p.inode >= QTYP_NR) { prt_printf(err, "invalid quota type (%llu >= %u)", diff --git a/libbcachefs/quota.h b/libbcachefs/quota.h index 59bed114..146264fd 100644 --- a/libbcachefs/quota.h +++ b/libbcachefs/quota.h @@ -7,7 +7,7 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_quota ((struct bkey_ops) { \ diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 8df94ad5..8a78377b 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -969,7 +969,7 @@ static int read_btree_roots(struct bch_fs *c) ? FSCK_CAN_IGNORE : 0, "error reading btree root %s", bch2_btree_ids[i]); - if (i == BTREE_ID_alloc) + if (btree_id_is_alloc(i)) c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); } } @@ -1217,6 +1217,9 @@ use_clean: if (ret) goto err; + if (c->opts.reconstruct_alloc) + bch2_fs_log_msg(c, "dropping alloc info"); + /* * Skip past versions that might have possibly been used (as nonces), * but hadn't had their pointers written: @@ -1250,6 +1253,20 @@ use_clean: bch2_stripes_heap_start(c); + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { + err = "error creating root snapshot node"; + ret = bch2_fs_initialize_subvolumes(c); + if (ret) + goto err; + } + + bch_verbose(c, "reading snapshots table"); + err = "error reading snapshots table"; + ret = bch2_fs_snapshots_start(c); + if (ret) + goto err; + bch_verbose(c, "reading snapshots done"); + if (c->opts.fsck) { bool metadata_only = c->opts.norecovery; @@ -1262,20 +1279,6 @@ use_clean: set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); - if (c->sb.version < bcachefs_metadata_version_snapshot_2) { - err = "error creating root snapshot node"; - ret = bch2_fs_initialize_subvolumes(c); - if (ret) - goto err; - } - - bch_verbose(c, "reading snapshots table"); - err = "error reading snapshots table"; - ret = bch2_fs_snapshots_start(c); - if (ret) - goto err; - bch_verbose(c, "reading snapshots done"); - set_bit(BCH_FS_MAY_GO_RW, &c->flags); bch_info(c, "starting journal replay, %zu keys", c->journal_keys.nr); @@ -1343,20 +1346,6 @@ use_clean: if (c->opts.norecovery) goto out; - if (c->sb.version < bcachefs_metadata_version_snapshot_2) { - err = "error creating root snapshot node"; - ret = bch2_fs_initialize_subvolumes(c); - if (ret) - goto err; - } - - bch_verbose(c, "reading snapshots table"); - err = "error reading snapshots table"; - ret = bch2_fs_snapshots_start(c); - if (ret) - goto err; - bch_verbose(c, "reading snapshots done"); - set_bit(BCH_FS_MAY_GO_RW, &c->flags); bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); @@ -1632,6 +1621,6 @@ int bch2_fs_initialize(struct bch_fs *c) return 0; err: - pr_err("Error initializing new filesystem: %s (%i)", err, ret); + pr_err("Error initializing new 
filesystem: %s (%s)", err, bch2_err_str(ret)); return ret; } diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index e89a9a1a..87446f7b 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -26,7 +26,7 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) /* reflink pointers */ int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); @@ -78,7 +78,7 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r /* indirect extents */ int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); @@ -88,7 +88,7 @@ int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } - return bch2_bkey_ptrs_invalid(c, k, rw, err); + return bch2_bkey_ptrs_invalid(c, k, flags, err); } void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, @@ -131,7 +131,7 @@ int bch2_trans_mark_reflink_v(struct btree_trans *trans, /* indirect inline data */ int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) { prt_printf(err, "incorrect value size (%zu < %zu)", @@ -282,7 +282,7 @@ s64 bch2_remap_range(struct bch_fs *c, u32 dst_snapshot, src_snapshot; int ret = 0, ret2 = 0; - if (!percpu_ref_tryget_live(&c->writes)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) return -BCH_ERR_erofs_no_writes; bch2_check_set_feature(c, BCH_FEATURE_reflink); @@ -416,7 +416,7 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_bkey_buf_exit(&new_src, c); bch2_bkey_buf_exit(&new_dst, c); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_reflink); return dst_done ?: ret ?: ret2; } diff --git a/libbcachefs/reflink.h b/libbcachefs/reflink.h index ce0012aa..2391037c 100644 --- a/libbcachefs/reflink.h +++ b/libbcachefs/reflink.h @@ -3,7 +3,7 @@ #define _BCACHEFS_REFLINK_H int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c, - int, struct printbuf *); + unsigned, struct printbuf *); void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -17,7 +17,7 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); }) int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, - int, struct printbuf *); + unsigned, struct printbuf *); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, @@ -32,7 +32,7 @@ int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, }) int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, - int, struct printbuf *); + unsigned, struct printbuf *); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trans_mark_indirect_inline_data(struct btree_trans *, diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index 3bff2195..482bedf4 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -299,6 +299,13 @@ static int replicas_table_update(struct bch_fs *c, memset(new_usage, 0, sizeof(new_usage)); + 
for (i = 0; i < ARRAY_SIZE(new_usage); i++) + if (!(new_usage[i] = __alloc_percpu_gfp(bytes, + sizeof(u64), GFP_KERNEL))) + goto err; + + memset(new_usage, 0, sizeof(new_usage)); + for (i = 0; i < ARRAY_SIZE(new_usage); i++) if (!(new_usage[i] = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))) diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h index cc34b380..4887675a 100644 --- a/libbcachefs/replicas.h +++ b/libbcachefs/replicas.h @@ -27,22 +27,6 @@ bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); int bch2_mark_replicas(struct bch_fs *, struct bch_replicas_entry *); -struct replicas_delta { - s64 delta; - struct bch_replicas_entry r; -} __packed; - -struct replicas_delta_list { - unsigned size; - unsigned used; - - struct {} memset_start; - u64 nr_inodes; - u64 persistent_reserved[BCH_REPLICAS_MAX]; - struct {} memset_end; - struct replicas_delta d[0]; -}; - static inline struct replicas_delta * replicas_delta_next(struct replicas_delta *d) { diff --git a/libbcachefs/replicas_types.h b/libbcachefs/replicas_types.h index f12a35b3..5cfff489 100644 --- a/libbcachefs/replicas_types.h +++ b/libbcachefs/replicas_types.h @@ -8,4 +8,20 @@ struct bch_replicas_cpu { struct bch_replicas_entry *entries; }; +struct replicas_delta { + s64 delta; + struct bch_replicas_entry r; +} __packed; + +struct replicas_delta_list { + unsigned size; + unsigned used; + + struct {} memset_start; + u64 nr_inodes; + u64 persistent_reserved[BCH_REPLICAS_MAX]; + struct {} memset_end; + struct replicas_delta d[0]; +}; + #endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c index d090a74b..1805c854 100644 --- a/libbcachefs/subvolume.c +++ b/libbcachefs/subvolume.c @@ -25,7 +25,7 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { struct bkey_s_c_snapshot s; u32 i, id; @@ -706,16 +706,14 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); bch2_delete_dead_snapshots(c); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } void bch2_delete_dead_snapshots_async(struct bch_fs *c) { - if (!percpu_ref_tryget_live(&c->writes)) - return; - - if (!queue_work(system_long_wq, &c->snapshot_delete_work)) - percpu_ref_put(&c->writes); + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && + !queue_work(system_long_wq, &c->snapshot_delete_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, @@ -735,7 +733,7 @@ static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, /* Subvolumes: */ int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (bkey_lt(k.k->p, SUBVOL_POS_MIN) || bkey_gt(k.k->p, SUBVOL_POS_MAX)) { @@ -900,7 +898,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) darray_exit(&s); } - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); } struct subvolume_unlink_hook { @@ -923,11 +921,11 @@ int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, if (ret) return ret; - if (unlikely(!percpu_ref_tryget_live(&c->writes))) + if (!bch2_write_ref_tryget(c, 
BCH_WRITE_REF_snapshot_delete_pagecache)) return -EROFS; if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); return 0; } diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h index c694c1c2..b6740eab 100644 --- a/libbcachefs/subvolume.h +++ b/libbcachefs/subvolume.h @@ -7,7 +7,7 @@ void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, - int rw, struct printbuf *); + unsigned, struct printbuf *); #define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ .key_invalid = bch2_snapshot_invalid, \ @@ -106,7 +106,7 @@ void bch2_fs_snapshots_exit(struct bch_fs *); int bch2_fs_snapshots_start(struct bch_fs *); int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, - int rw, struct printbuf *); + unsigned, struct printbuf *); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 738b68b5..00c1f69b 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -20,7 +20,6 @@ #include "counters.h" #include <linux/backing-dev.h> -#include <linux/pretty-printers.h> #include <linux/sort.h> #include <trace/events/bcachefs.h> @@ -1261,7 +1260,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, u->entry.type = BCH_JSET_ENTRY_data_usage; u->v = cpu_to_le64(c->usage_base->replicas[i]); - memcpy(&u->r, e, replicas_entry_bytes(e)); + unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), + "embedded variable length struct"); } for_each_member_device(ca, c, dev) { diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 95c16f70..08bfed1b 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -55,7 +55,6 @@ #include <linux/idr.h> #include <linux/module.h> #include <linux/percpu.h> -#include <linux/pretty-printers.h> #include <linux/random.h> #include <linux/sysfs.h> #include <crypto/hash.h> @@ -110,7 +109,7 @@ static struct kset *bcachefs_kset; static LIST_HEAD(bch_fs_list); static DEFINE_MUTEX(bch_fs_list_lock); -static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); +DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait); static void bch2_dev_free(struct bch_dev *); static int bch2_dev_alloc(struct bch_fs *, unsigned); @@ -238,13 +237,15 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch2_dev_allocator_remove(c, ca); } +#ifndef BCH_WRITE_REF_DEBUG static void bch2_writes_disabled(struct percpu_ref *writes) { struct bch_fs *c = container_of(writes, struct bch_fs, writes); set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); - wake_up(&bch_read_only_wait); + wake_up(&bch2_read_only_wait); } +#endif void bch2_fs_read_only(struct bch_fs *c) { @@ -259,9 +260,13 @@ void bch2_fs_read_only(struct bch_fs *c) * Block new foreground-end write operations from starting - any new * writes will return -EROFS: */ + set_bit(BCH_FS_GOING_RO, &c->flags); +#ifndef BCH_WRITE_REF_DEBUG percpu_ref_kill(&c->writes); - - cancel_work_sync(&c->ec_stripe_delete_work); +#else + for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) + bch2_write_ref_put(c, i); +#endif /* * If we're not doing an emergency shutdown, we want to wait on @@ -274,16 +279,17 @@ void bch2_fs_read_only(struct bch_fs *c) * we do need to wait on them before returning and signalling * that going RO is complete: */ - wait_event(bch_read_only_wait, + wait_event(bch2_read_only_wait, 
test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); __bch2_fs_read_only(c); - wait_event(bch_read_only_wait, + wait_event(bch2_read_only_wait, test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + clear_bit(BCH_FS_GOING_RO, &c->flags); if (!bch2_journal_error(&c->journal) && !test_bit(BCH_FS_ERROR, &c->flags) && @@ -320,7 +326,7 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) bch2_journal_halt(&c->journal); bch2_fs_read_only_async(c); - wake_up(&bch_read_only_wait); + wake_up(&bch2_read_only_wait); return ret; } @@ -392,20 +398,26 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) return ret; } - schedule_work(&c->ec_stripe_delete_work); - - bch2_do_discards(c); - bch2_do_invalidates(c); - if (!early) { ret = bch2_fs_read_write_late(c); if (ret) goto err; } +#ifndef BCH_WRITE_REF_DEBUG percpu_ref_reinit(&c->writes); +#else + for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) { + BUG_ON(atomic_long_read(&c->writes[i])); + atomic_long_inc(&c->writes[i]); + } +#endif set_bit(BCH_FS_RW, &c->flags); set_bit(BCH_FS_WAS_RW, &c->flags); + + bch2_do_discards(c); + bch2_do_invalidates(c); + bch2_do_stripe_deletes(c); return 0; err: __bch2_fs_read_only(c); @@ -454,19 +466,21 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_journal_keys_free(&c->journal_keys); bch2_journal_entries_free(c); percpu_free_rwsem(&c->mark_lock); + free_percpu(c->online_reserved); if (c->btree_paths_bufs) for_each_possible_cpu(cpu) kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path); - free_percpu(c->online_reserved); free_percpu(c->btree_paths_bufs); free_percpu(c->pcpu); mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); mempool_exit(&c->fill_iter); +#ifndef BCH_WRITE_REF_DEBUG percpu_ref_exit(&c->writes); +#endif kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); kfree(c->unused_inode_hints); @@ -695,6 +709,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) seqcount_init(&c->usage_lock); + sema_init(&c->io_in_flight, 128); + c->copy_gc_enabled = 1; c->rebalance.enabled = 1; c->promote_whole_extents = true; @@ -743,9 +759,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_opts_apply(&c->opts, opts); - /* key cache currently disabled for inodes, because of snapshots: */ - c->opts.inodes_use_key_cache = 0; - c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; if (c->opts.inodes_use_key_cache) c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; @@ -766,23 +779,25 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); if (!(c->btree_update_wq = alloc_workqueue("bcachefs", - WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) || !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", - WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || !(c->io_complete_wq = alloc_workqueue("bcachefs_io", WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) || +#ifndef BCH_WRITE_REF_DEBUG percpu_ref_init(&c->writes, bch2_writes_disabled, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || +#endif mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || bioset_init(&c->btree_bio, 1, max(offsetof(struct 
btree_read_bio, bio), offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || - !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) || !(c->online_reserved = alloc_percpu(u64)) || + !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || @@ -850,9 +865,12 @@ static void print_mount_opts(struct bch_fs *c) struct printbuf p = PRINTBUF; bool first = true; + prt_printf(&p, "mounted version=%s", bch2_metadata_versions[c->sb.version]); + if (c->opts.read_only) { - prt_printf(&p, "ro"); + prt_str(&p, " opts="); first = false; + prt_printf(&p, "ro"); } for (i = 0; i < bch2_opts_nr; i++) { @@ -865,16 +883,12 @@ static void print_mount_opts(struct bch_fs *c) if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) continue; - if (!first) - prt_printf(&p, ","); + prt_str(&p, first ? " opts=" : ","); first = false; bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); } - if (!p.pos) - prt_printf(&p, "(null)"); - - bch_info(c, "mounted version=%s opts=%s", bch2_metadata_versions[c->sb.version], p.buf); + bch_info(c, "%s", p.buf); printbuf_exit(&p); } @@ -1955,5 +1969,8 @@ err: BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM +unsigned bch2_metadata_version = bcachefs_metadata_version_current; +module_param_named(version, bch2_metadata_version, uint, 0400); + module_exit(bcachefs_exit); module_init(bcachefs_init); diff --git a/libbcachefs/super.h b/libbcachefs/super.h index 3c83e9b9..d4e939c8 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -251,7 +251,8 @@ int bch2_fs_read_write_early(struct bch_fs *); */ static inline void bch2_fs_lazy_rw(struct bch_fs *c) { - if (percpu_ref_is_zero(&c->writes)) + if (!test_bit(BCH_FS_RW, &c->flags) && + !test_bit(BCH_FS_WAS_RW, &c->flags)) bch2_fs_read_write_early(c); } diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 6e49cf98..ebd10cd5 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -35,7 +35,6 @@ #include "tests.h" #include <linux/blkdev.h> -#include <linux/pretty-printers.h> #include <linux/sort.h> #include <linux/sched/clock.h> @@ -195,8 +194,32 @@ read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(stripes_heap); read_attribute(open_buckets); +read_attribute(write_points); read_attribute(nocow_lock_table); +#ifdef BCH_WRITE_REF_DEBUG +read_attribute(write_refs); + +const char * const bch2_write_refs[] = { +#define x(n) #n, + BCH_WRITE_REFS() +#undef x + NULL +}; + +static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c) +{ + bch2_printbuf_tabstop_push(out, 24); + + for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) { + prt_str(out, bch2_write_refs[i]); + prt_tab(out); + prt_printf(out, "%li", atomic_long_read(&c->writes[i])); + prt_newline(out); + } +} +#endif + read_attribute(internal_uuid); read_attribute(has_data); @@ -432,6 +455,9 @@ SHOW(bch2_fs) if (attr == &sysfs_open_buckets) bch2_open_buckets_to_text(out, c); + if (attr == &sysfs_write_points) + bch2_write_points_to_text(out, c); + if (attr == &sysfs_compression_stats) bch2_compression_stats_to_text(out, c); @@ -450,6 +476,11 @@ SHOW(bch2_fs) if (attr == &sysfs_nocow_lock_table) bch2_nocow_locks_to_text(out, &c->nocow_locks); +#ifdef BCH_WRITE_REF_DEBUG + if (attr == &sysfs_write_refs) + bch2_write_refs_to_text(out, c); +#endif + return 0; } @@ -632,7 +663,11 @@ struct attribute *bch2_fs_internal_files[] = { 
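/*
 * Editor's note -- illustrative sketch, not part of the patch: under
 * BCH_WRITE_REF_DEBUG the single percpu_ref c->writes becomes an array of
 * named atomic_long counters (one per BCH_WRITE_REFS() entry), so a leaked
 * or stuck write ref can be attributed to a specific user via the new
 * write_refs sysfs file.  The helpers used throughout this patch,
 * bch2_write_ref_tryget() and bch2_write_ref_put(), are defined elsewhere in
 * bcachefs; they plausibly look something like the following, with the
 * going-RO handshake (waking bch2_read_only_wait once every counter has
 * dropped to zero) omitted for brevity:
 */
#ifdef BCH_WRITE_REF_DEBUG
static inline bool bch2_write_ref_tryget(struct bch_fs *c, unsigned ref)
{
	/* fail once the filesystem has started going read-only */
	return !test_bit(BCH_FS_GOING_RO, &c->flags) &&
		atomic_long_inc_not_zero(&c->writes[ref]);
}

static inline void bch2_write_ref_put(struct bch_fs *c, unsigned ref)
{
	long v = atomic_long_dec_return(&c->writes[ref]);

	BUG_ON(v < 0);
}
#else
static inline bool bch2_write_ref_tryget(struct bch_fs *c, unsigned ref)
{
	return percpu_ref_tryget_live(&c->writes);
}

static inline void bch2_write_ref_put(struct bch_fs *c, unsigned ref)
{
	percpu_ref_put(&c->writes);
}
#endif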
&sysfs_new_stripes, &sysfs_stripes_heap, &sysfs_open_buckets, + &sysfs_write_points, &sysfs_nocow_lock_table, +#ifdef BCH_WRITE_REF_DEBUG + &sysfs_write_refs, +#endif &sysfs_io_timers_read, &sysfs_io_timers_write, @@ -684,7 +719,7 @@ STORE(bch2_fs_opts_dir) * We don't need to take c->writes for correctness, but it eliminates an * unsightly error message in the dmesg log when we're RO: */ - if (unlikely(!percpu_ref_tryget_live(&c->writes))) + if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))) return -EROFS; tmp = kstrdup(buf, GFP_KERNEL); @@ -714,7 +749,7 @@ STORE(bch2_fs_opts_dir) ret = size; err: - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); return ret; } SYSFS_OPS(bch2_fs_opts_dir); diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c index b99a9e42..80fce1c9 100644 --- a/libbcachefs/tests.c +++ b/libbcachefs/tests.c @@ -573,7 +573,7 @@ static u64 test_rand(void) { u64 v; - prandom_bytes(&v, sizeof(v)); + get_random_bytes(&v, sizeof(v)); return v; } diff --git a/libbcachefs/util.c b/libbcachefs/util.c index bb8a495e..9939bf2a 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -240,12 +240,12 @@ bool bch2_is_zero(const void *_p, size_t n) return true; } -static void bch2_quantiles_update(struct quantiles *q, u64 v) +static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v) { unsigned i = 0; while (i < ARRAY_SIZE(q->entries)) { - struct quantile_entry *e = q->entries + i; + struct bch2_quantile_entry *e = q->entries + i; if (unlikely(!e->step)) { e->m = v; @@ -292,7 +292,6 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines) if (!*p) break; lines = p + 1; - prefix = KERN_CONT; } console_unlock(); } @@ -301,11 +300,9 @@ int bch2_prt_backtrace(struct printbuf *out, struct task_struct *task) { unsigned long entries[32]; unsigned i, nr_entries; - int ret; - ret = down_read_killable(&task->signal->exec_update_lock); - if (ret) - return ret; + if (!down_read_trylock(&task->signal->exec_update_lock)) + return 0; nr_entries = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0); for (i = 0; i < nr_entries; i++) { @@ -319,7 +316,8 @@ int bch2_prt_backtrace(struct printbuf *out, struct task_struct *task) /* time stats: */ -static inline void bch2_time_stats_update_one(struct time_stats *stats, +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, u64 start, u64 end) { u64 duration, freq; @@ -348,10 +346,10 @@ static inline void bch2_time_stats_update_one(struct time_stats *stats, } } -static noinline void bch2_time_stats_clear_buffer(struct time_stats *stats, - struct time_stat_buffer *b) +static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, + struct bch2_time_stat_buffer *b) { - struct time_stat_buffer_entry *i; + struct bch2_time_stat_buffer_entry *i; unsigned long flags; spin_lock_irqsave(&stats->lock, flags); @@ -364,7 +362,7 @@ static noinline void bch2_time_stats_clear_buffer(struct time_stats *stats, b->nr = 0; } -void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) +void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) { unsigned long flags; @@ -379,17 +377,17 @@ void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && stats->duration_stats.n > 1024) stats->buffer = - alloc_percpu_gfp(struct time_stat_buffer, + alloc_percpu_gfp(struct bch2_time_stat_buffer, GFP_ATOMIC); 
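/*
 * Editor's note -- usage sketch, not part of the patch: printbufs now come
 * from bcachefs' own "printbuf.h" (wrapped by the prt_*() macros added to
 * util.h below) instead of <linux/printbuf.h>.  Typical usage follows the
 * print_mount_opts() rework above; the function name here is hypothetical:
 */
static void example_log_mount(struct bch_fs *c)
{
	struct printbuf p = PRINTBUF;	/* empty printbuf; buffer allocated on demand */

	prt_printf(&p, "mounted version=%s", bch2_metadata_versions[c->sb.version]);
	prt_str(&p, " opts=");
	prt_printf(&p, "ro");

	bch_info(c, "%s", p.buf);	/* p.buf is the accumulated, nul-terminated string */
	printbuf_exit(&p);		/* frees any allocated buffer */
}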
spin_unlock_irqrestore(&stats->lock, flags); } else { - struct time_stat_buffer *b; + struct bch2_time_stat_buffer *b; preempt_disable(); b = this_cpu_ptr(stats->buffer); BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); - b->entries[b->nr++] = (struct time_stat_buffer_entry) { + b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) { .start = start, .end = end }; @@ -399,6 +397,7 @@ void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) preempt_enable(); } } +#endif static const struct time_unit { const char *name; @@ -426,7 +425,14 @@ static const struct time_unit *pick_time_units(u64 ns) return u; } -static void pr_time_units(struct printbuf *out, u64 ns) +void bch2_pr_time_units(struct printbuf *out, u64 ns) +{ + const struct time_unit *u = pick_time_units(ns); + + prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); +} + +static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) { const struct time_unit *u = pick_time_units(ns); @@ -441,11 +447,11 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 { prt_str(out, name); prt_tab(out); - pr_time_units(out, ns); + bch2_pr_time_units_aligned(out, ns); prt_newline(out); } -void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) +void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) { const struct time_unit *u; s64 f_mean = 0, d_mean = 0; @@ -499,16 +505,16 @@ void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) prt_printf(out, "mean:"); prt_tab(out); - pr_time_units(out, d_mean); + bch2_pr_time_units_aligned(out, d_mean); prt_tab(out); - pr_time_units(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); prt_newline(out); prt_printf(out, "stddev:"); prt_tab(out); - pr_time_units(out, d_stddev); + bch2_pr_time_units_aligned(out, d_stddev); prt_tab(out); - pr_time_units(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); printbuf_indent_sub(out, 2); prt_newline(out); @@ -522,16 +528,16 @@ void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) prt_printf(out, "mean:"); prt_tab(out); - pr_time_units(out, f_mean); + bch2_pr_time_units_aligned(out, f_mean); prt_tab(out); - pr_time_units(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); prt_newline(out); prt_printf(out, "stddev:"); prt_tab(out); - pr_time_units(out, f_stddev); + bch2_pr_time_units_aligned(out, f_stddev); prt_tab(out); - pr_time_units(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); printbuf_indent_sub(out, 2); prt_newline(out); @@ -554,12 +560,12 @@ void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) } } -void bch2_time_stats_exit(struct time_stats *stats) +void bch2_time_stats_exit(struct bch2_time_stats *stats) { free_percpu(stats->buffer); } -void bch2_time_stats_init(struct time_stats *stats) +void bch2_time_stats_init(struct bch2_time_stats *stats) { memset(stats, 0, sizeof(*stats)); stats->duration_stats_weighted.w = 8; diff --git a/libbcachefs/util.h b/libbcachefs/util.h index 473c9696..09e27293 
100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -11,7 +11,6 @@ #include <linux/sched/clock.h> #include <linux/llist.h> #include <linux/log2.h> -#include <linux/printbuf.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <linux/ratelimit.h> @@ -215,6 +214,34 @@ do { \ #define ANYSINT_MAX(t) \ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) +#include "printbuf.h" + +#define prt_vprintf(_out, ...) bch2_prt_vprintf(_out, __VA_ARGS__) +#define prt_printf(_out, ...) bch2_prt_printf(_out, __VA_ARGS__) +#define printbuf_str(_buf) bch2_printbuf_str(_buf) +#define printbuf_exit(_buf) bch2_printbuf_exit(_buf) + +#define printbuf_tabstops_reset(_buf) bch2_printbuf_tabstops_reset(_buf) +#define printbuf_tabstop_pop(_buf) bch2_printbuf_tabstop_pop(_buf) +#define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n) + +#define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n) +#define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n) + +#define prt_newline(_out) bch2_prt_newline(_out) +#define prt_tab(_out) bch2_prt_tab(_out) +#define prt_tab_rjust(_out) bch2_prt_tab_rjust(_out) + +#define prt_bytes_indented(...) bch2_prt_bytes_indented(__VA_ARGS__) +#define prt_u64(_out, _v) prt_printf(_out, "%llu", _v) +#define prt_human_readable_u64(...) bch2_prt_human_readable_u64(__VA_ARGS__) +#define prt_human_readable_s64(...) bch2_prt_human_readable_s64(__VA_ARGS__) +#define prt_units_u64(...) bch2_prt_units_u64(__VA_ARGS__) +#define prt_units_s64(...) bch2_prt_units_s64(__VA_ARGS__) +#define prt_string_option(...) bch2_prt_string_option(__VA_ARGS__) +#define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__) + +void bch2_pr_time_units(struct printbuf *, u64); #ifdef __KERNEL__ static inline void pr_time(struct printbuf *out, u64 time) @@ -340,22 +367,22 @@ int bch2_prt_backtrace(struct printbuf *, struct task_struct *); #define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) #define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) -struct quantiles { - struct quantile_entry { +struct bch2_quantiles { + struct bch2_quantile_entry { u64 m; u64 step; } entries[NR_QUANTILES]; }; -struct time_stat_buffer { +struct bch2_time_stat_buffer { unsigned nr; - struct time_stat_buffer_entry { + struct bch2_time_stat_buffer_entry { u64 start; u64 end; } entries[32]; }; -struct time_stats { +struct bch2_time_stats { spinlock_t lock; /* all fields are in nanoseconds */ u64 max_duration; @@ -363,26 +390,30 @@ struct time_stats { u64 max_freq; u64 min_freq; u64 last_event; - struct quantiles quantiles; + struct bch2_quantiles quantiles; struct mean_and_variance duration_stats; struct mean_and_variance_weighted duration_stats_weighted; struct mean_and_variance freq_stats; struct mean_and_variance_weighted freq_stats_weighted; - struct time_stat_buffer __percpu *buffer; + struct bch2_time_stat_buffer __percpu *buffer; }; -void __bch2_time_stats_update(struct time_stats *stats, u64, u64); +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); +#else +static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {} +#endif -static inline void bch2_time_stats_update(struct time_stats *stats, u64 start) +static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) { __bch2_time_stats_update(stats, start, local_clock()); } -void bch2_time_stats_to_text(struct printbuf *, struct time_stats *); +void bch2_time_stats_to_text(struct printbuf *, struct 
bch2_time_stats *); -void bch2_time_stats_exit(struct time_stats *); -void bch2_time_stats_init(struct time_stats *); +void bch2_time_stats_exit(struct bch2_time_stats *); +void bch2_time_stats_init(struct bch2_time_stats *); #define ewma_add(ewma, val, weight) \ ({ \ @@ -582,6 +613,20 @@ static inline void memmove_u64s_down(void *dst, const void *src, __memmove_u64s_down(dst, src, u64s); } +static inline void __memmove_u64s_down_small(void *dst, const void *src, + unsigned u64s) +{ + memcpy_u64s_small(dst, src, u64s); +} + +static inline void memmove_u64s_down_small(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(dst > src); + + __memmove_u64s_down_small(dst, src, u64s); +} + static inline void __memmove_u64s_up_small(void *_dst, const void *_src, unsigned u64s) { diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index b5022a8b..9f77bb2e 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -70,7 +70,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { }; int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { const struct xattr_handler *handler; struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); diff --git a/libbcachefs/xattr.h b/libbcachefs/xattr.h index 03f1b73f..1a4cff3a 100644 --- a/libbcachefs/xattr.h +++ b/libbcachefs/xattr.h @@ -6,7 +6,7 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; -int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_xattr ((struct bkey_ops) { \ diff --git a/linux/blkdev.c b/linux/blkdev.c index 54cd6e9c..0a5cedfe 100644 --- a/linux/blkdev.c +++ b/linux/blkdev.c @@ -184,7 +184,7 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, if (buffered_fd < 0) return ERR_PTR(-errno); - fd = open(path, flags|O_DIRECT); + fd = open(path, flags); if (fd < 0) fd = dup(buffered_fd); if (fd < 0) { @@ -192,7 +192,7 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, return ERR_PTR(-errno); } - sync_fd = open(path, flags|O_DIRECT|O_SYNC); + sync_fd = open(path, flags|O_SYNC); if (sync_fd < 0) sync_fd = open(path, flags|O_SYNC); if (sync_fd < 0) { diff --git a/linux/mean_and_variance.c b/linux/mean_and_variance.c index 55d46c9d..bd08da5f 100644 --- a/linux/mean_and_variance.c +++ b/linux/mean_and_variance.c @@ -42,8 +42,6 @@ #include <linux/math64.h> #include <linux/mean_and_variance.h> #include <linux/module.h> -#include <linux/printbuf.h> - /** * fast_divpow2() - fast approximation for n / (1 << d) diff --git a/linux/pretty-printers.c b/linux/pretty-printers.c deleted file mode 100644 index addbac95..00000000 --- a/linux/pretty-printers.c +++ /dev/null @@ -1,60 +0,0 @@ -// SPDX-License-Identifier: LGPL-2.1+ -/* Copyright (C) 2022 Kent Overstreet */ - -#include <linux/bitops.h> -#include <linux/kernel.h> -#include <linux/printbuf.h> -#include <linux/pretty-printers.h> - -/** - * prt_string_option - Given a list of strings, print out the list and indicate - * which option is selected, with square brackets (sysfs style) - * - * @out: The printbuf to output to - * @list: List of strings to choose from - * @selected: The option to highlight, with square brackets - */ -void prt_string_option(struct printbuf *out, - const char * const list[], - size_t selected) -{ - size_t i; - - for (i = 0; 
list[i]; i++) { - if (i) - prt_char(out, ' '); - if (i == selected) - prt_char(out, '['); - prt_str(out, list[i]); - if (i == selected) - prt_char(out, ']'); - } -} -EXPORT_SYMBOL(prt_string_option); - -/** - * prt_bitflags: Given a bitmap and a list of names for each bit, print out which - * bits are on, comma separated - * - * @out: The printbuf to output to - * @list: List of names for each bit - * @flags: Bits to print - */ -void prt_bitflags(struct printbuf *out, - const char * const list[], u64 flags) -{ - unsigned bit, nr = 0; - bool first = true; - - while (list[nr]) - nr++; - - while (flags && (bit = __ffs(flags)) < nr) { - if (!first) - prt_char(out, ','); - first = false; - prt_str(out, list[bit]); - flags ^= 1 << bit; - } -} -EXPORT_SYMBOL(prt_bitflags); diff --git a/linux/printbuf_userspace.c b/linux/printbuf_userspace.c deleted file mode 100644 index 0ae56ee1..00000000 --- a/linux/printbuf_userspace.c +++ /dev/null @@ -1,34 +0,0 @@ - -#include <stdio.h> -#include <linux/printbuf.h> - -void prt_vprintf(struct printbuf *out, const char *fmt, va_list args) -{ - int len; - - do { - va_list args2; - - va_copy(args2, args); - len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2); - } while (len + 1 >= printbuf_remaining(out) && - !printbuf_make_room(out, len + 1)); - - len = min_t(size_t, len, - printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); - out->pos += len; -} - -void prt_printf(struct printbuf *out, const char *fmt, ...) -{ - va_list args; - - va_start(args, fmt); - prt_vprintf(out, fmt, args); - va_end(args); -} - -void prt_u64(struct printbuf *out, u64 v) -{ - prt_printf(out, "%llu", v); -} diff --git a/linux/seq_buf.c b/linux/seq_buf.c new file mode 100644 index 00000000..cf8709ad --- /dev/null +++ b/linux/seq_buf.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * seq_buf.c + * + * Copyright (C) 2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> + * + * The seq_buf is a handy tool that allows you to pass a descriptor around + * to a buffer that other functions can write to. It is similar to the + * seq_file functionality but has some differences. + * + * To use it, the seq_buf must be initialized with seq_buf_init(). + * This will set up the counters within the descriptor. You can call + * seq_buf_init() more than once to reset the seq_buf to start + * from scratch. + */ +#include <linux/seq_buf.h> +#include <stdio.h> + +/** + * seq_buf_can_fit - can the new data fit in the current buffer? + * @s: the seq_buf descriptor + * @len: The length to see if it can fit in the current buffer + * + * Returns true if there's enough unused space in the seq_buf buffer + * to fit the amount of new data according to @len. + */ +static bool seq_buf_can_fit(struct seq_buf *s, size_t len) +{ + return s->len + len <= s->size; +} + +/** + * seq_buf_vprintf - sequence printing of information. + * @s: seq_buf descriptor + * @fmt: printf format string + * @args: va_list of arguments from a printf() type function + * + * Writes a vnprintf() format into the sequencce buffer. + * + * Returns zero on success, -1 on overflow. 
+ */ +int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args) +{ + int len; + + WARN_ON(s->size == 0); + + if (s->len < s->size) { + len = vsnprintf(s->buffer + s->len, s->size - s->len, fmt, args); + if (s->len + len < s->size) { + s->len += len; + return 0; + } + } + seq_buf_set_overflow(s); + return -1; +} + +/** + * seq_buf_printf - sequence printing of information + * @s: seq_buf descriptor + * @fmt: printf format string + * + * Writes a printf() format into the sequence buffer. + * + * Returns zero on success, -1 on overflow. + */ +int seq_buf_printf(struct seq_buf *s, const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = seq_buf_vprintf(s, fmt, ap); + va_end(ap); + + return ret; +} + +/** + * seq_buf_puts - sequence printing of simple string + * @s: seq_buf descriptor + * @str: simple string to record + * + * Copy a simple string into the sequence buffer. + * + * Returns zero on success, -1 on overflow + */ +int seq_buf_puts(struct seq_buf *s, const char *str) +{ + size_t len = strlen(str); + + WARN_ON(s->size == 0); + + /* Add 1 to len for the trailing null byte which must be there */ + len += 1; + + if (seq_buf_can_fit(s, len)) { + memcpy(s->buffer + s->len, str, len); + /* Don't count the trailing null byte against the capacity */ + s->len += len - 1; + return 0; + } + seq_buf_set_overflow(s); + return -1; +} + +/** + * seq_buf_putc - sequence printing of simple character + * @s: seq_buf descriptor + * @c: simple character to record + * + * Copy a single character into the sequence buffer. + * + * Returns zero on success, -1 on overflow + */ +int seq_buf_putc(struct seq_buf *s, unsigned char c) +{ + WARN_ON(s->size == 0); + + if (seq_buf_can_fit(s, 1)) { + s->buffer[s->len++] = c; + return 0; + } + seq_buf_set_overflow(s); + return -1; +} + +/** + * seq_buf_putmem - write raw data into the sequenc buffer + * @s: seq_buf descriptor + * @mem: The raw memory to copy into the buffer + * @len: The length of the raw memory to copy (in bytes) + * + * There may be cases where raw memory needs to be written into the + * buffer and a strcpy() would not work. Using this function allows + * for such cases. 
+ * + * Returns zero on success, -1 on overflow + */ +int seq_buf_putmem(struct seq_buf *s, const void *mem, unsigned int len) +{ + WARN_ON(s->size == 0); + + if (seq_buf_can_fit(s, len)) { + memcpy(s->buffer + s->len, mem, len); + s->len += len; + return 0; + } + seq_buf_set_overflow(s); + return -1; +} diff --git a/linux/six.c b/linux/six.c index 39a9bd6e..41337a7f 100644 --- a/linux/six.c +++ b/linux/six.c @@ -11,14 +11,16 @@ #include <linux/six.h> #include <linux/slab.h> +#include <trace/events/lock.h> + #ifdef DEBUG #define EBUG_ON(cond) BUG_ON(cond) #else #define EBUG_ON(cond) do {} while (0) #endif -#define six_acquire(l, t, r) lock_acquire(l, 0, t, r, 1, NULL, _RET_IP_) -#define six_release(l) lock_release(l, _RET_IP_) +#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip) +#define six_release(l, ip) lock_release(l, ip) static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); @@ -278,19 +280,20 @@ static bool do_six_trylock_type(struct six_lock *lock, } __always_inline __flatten -static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) +static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type, + unsigned long ip) { if (!do_six_trylock_type(lock, type, true)) return false; if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read); + six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); return true; } __always_inline __flatten static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, - unsigned seq) + unsigned seq, unsigned long ip) { const struct six_lock_vals l[] = LOCK_VALS; union six_lock_state old; @@ -321,7 +324,7 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, six_lock_wakeup(lock, old, SIX_LOCK_write); if (ret) - six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read); + six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); return ret; } @@ -338,36 +341,48 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, six_set_owner(lock, type, old, current); if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read); + six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); return true; } -/* - * We don't see stable performance with SIX_LOCK_SPIN_ON_OWNER enabled, so it's - * off for now: - */ -#ifdef SIX_LOCK_SPIN_ON_OWNER +#ifdef CONFIG_LOCK_SPIN_ON_OWNER -static inline bool six_optimistic_spin(struct six_lock *lock, - struct six_lock_waiter *wait) +static inline bool six_can_spin_on_owner(struct six_lock *lock) { - struct task_struct *owner, *task = current; + struct task_struct *owner; + bool ret; - switch (wait->lock_want) { - case SIX_LOCK_read: - break; - case SIX_LOCK_intent: - if (lock->wait_list.next != &wait->list) - return false; - break; - case SIX_LOCK_write: + if (need_resched()) return false; - } rcu_read_lock(); owner = READ_ONCE(lock->owner); + ret = !owner || owner_on_cpu(owner); + rcu_read_unlock(); - while (owner && lock->owner == owner) { + return ret; +} + +static inline void six_set_nospin(struct six_lock *lock) +{ + union six_lock_state old, new; + u64 v = READ_ONCE(lock->state.v); + + do { + new.v = old.v = v; + new.nospin = true; + } while ((v = atomic64_cmpxchg(&lock->state.counter, old.v, new.v)) != old.v); +} + +static inline bool six_spin_on_owner(struct six_lock *lock, + struct task_struct *owner, + u64 end_time) +{ + bool ret = true; + unsigned loop = 0; + + rcu_read_lock(); + while (lock->owner == owner) { /* * Ensure we emit the 
owner->on_cpu, dereference _after_ * checking lock->owner still matches owner. If that fails, @@ -376,27 +391,94 @@ static inline bool six_optimistic_spin(struct six_lock *lock, */ barrier(); - /* - * If we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (wait->lock_acquired || - !owner->on_cpu || - rt_task(task) || - need_resched()) + if (!owner_on_cpu(owner) || need_resched()) { + ret = false; break; + } + + if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { + six_set_nospin(lock); + ret = false; + break; + } cpu_relax(); } rcu_read_unlock(); - return wait->lock_acquired; + return ret; +} + +static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) +{ + struct task_struct *task = current; + u64 end_time; + + if (type == SIX_LOCK_write) + return false; + + preempt_disable(); + if (!six_can_spin_on_owner(lock)) + goto fail; + + if (!osq_lock(&lock->osq)) + goto fail; + + end_time = sched_clock() + 10 * NSEC_PER_USEC; + + while (1) { + struct task_struct *owner; + + /* + * If there's an owner, wait for it to either + * release the lock or go to sleep. + */ + owner = READ_ONCE(lock->owner); + if (owner && !six_spin_on_owner(lock, owner, end_time)) + break; + + if (do_six_trylock_type(lock, type, false)) { + osq_unlock(&lock->osq); + preempt_enable(); + return true; + } + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(task))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax(); + } + + osq_unlock(&lock->osq); +fail: + preempt_enable(); + + /* + * If we fell out of the spin path because of need_resched(), + * reschedule now, before we try-lock again. This avoids getting + * scheduled out right after we obtained the lock. 
+ */ + if (need_resched()) + schedule(); + + return false; } #else /* CONFIG_LOCK_SPIN_ON_OWNER */ -static inline bool six_optimistic_spin(struct six_lock *lock, - struct six_lock_waiter *wait) +static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) { return false; } @@ -406,7 +488,8 @@ static inline bool six_optimistic_spin(struct six_lock *lock, noinline static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type, struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p) + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { union six_lock_state old; int ret = 0; @@ -417,7 +500,11 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty smp_mb__after_atomic(); } - lock_contended(&lock->dep_map, _RET_IP_); + trace_contention_begin(lock, 0); + lock_contended(&lock->dep_map, ip); + + if (six_optimistic_spin(lock, type)) + goto out; wait->task = current; wait->lock_want = type; @@ -457,9 +544,6 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty ret = 0; } - if (six_optimistic_spin(lock, wait)) - goto out; - while (1) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -488,6 +572,7 @@ out: &lock->state.counter); six_lock_wakeup(lock, old, SIX_LOCK_read); } + trace_contention_end(lock, 0); return ret; } @@ -495,33 +580,35 @@ out: __always_inline __flatten static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type, struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p) + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { int ret; wait->start_time = 0; if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read); + six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip); ret = do_six_trylock_type(lock, type, true) ? 
0 - : __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p); + : __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p, ip); if (ret && type != SIX_LOCK_write) - six_release(&lock->dep_map); + six_release(&lock->dep_map, ip); if (!ret) - lock_acquired(&lock->dep_map, _RET_IP_); + lock_acquired(&lock->dep_map, ip); return ret; } __always_inline static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p) + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { struct six_lock_waiter wait; - return __six_lock_type_waiter(lock, type, &wait, should_sleep_fn, p); + return __six_lock_type_waiter(lock, type, &wait, should_sleep_fn, p, ip); } __always_inline __flatten @@ -540,16 +627,21 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) smp_mb(); /* between unlocking and checking for waiters */ state.v = READ_ONCE(lock->state.v); } else { + u64 v = l[type].unlock_val; + + if (type != SIX_LOCK_read) + v -= lock->state.v & __SIX_VAL(nospin, 1); + EBUG_ON(!(lock->state.v & l[type].held_mask)); - state.v = atomic64_add_return_release(l[type].unlock_val, - &lock->state.counter); + state.v = atomic64_add_return_release(v, &lock->state.counter); } six_lock_wakeup(lock, state, l[type].unlock_wakeup); } __always_inline __flatten -static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) +static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type, + unsigned long ip) { EBUG_ON(type == SIX_LOCK_write && !(lock->state.v & __SIX_LOCK_HELD_intent)); @@ -558,7 +650,7 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) lock->owner != current); if (type != SIX_LOCK_write) - six_release(&lock->dep_map); + six_release(&lock->dep_map, ip); if (type == SIX_LOCK_intent && lock->intent_lock_recurse) { @@ -570,38 +662,40 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) } #define __SIX_LOCK(type) \ -bool six_trylock_##type(struct six_lock *lock) \ +bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip) \ { \ - return __six_trylock_type(lock, SIX_LOCK_##type); \ + return __six_trylock_type(lock, SIX_LOCK_##type, ip); \ } \ -EXPORT_SYMBOL_GPL(six_trylock_##type); \ +EXPORT_SYMBOL_GPL(six_trylock_ip_##type); \ \ -bool six_relock_##type(struct six_lock *lock, u32 seq) \ +bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\ { \ - return __six_relock_type(lock, SIX_LOCK_##type, seq); \ + return __six_relock_type(lock, SIX_LOCK_##type, seq, ip); \ } \ -EXPORT_SYMBOL_GPL(six_relock_##type); \ +EXPORT_SYMBOL_GPL(six_relock_ip_##type); \ \ -int six_lock_##type(struct six_lock *lock, \ - six_lock_should_sleep_fn should_sleep_fn, void *p) \ +int six_lock_ip_##type(struct six_lock *lock, \ + six_lock_should_sleep_fn should_sleep_fn, void *p, \ + unsigned long ip) \ { \ - return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\ + return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\ } \ -EXPORT_SYMBOL_GPL(six_lock_##type); \ +EXPORT_SYMBOL_GPL(six_lock_ip_##type); \ \ -int six_lock_waiter_##type(struct six_lock *lock, \ +int six_lock_ip_waiter_##type(struct six_lock *lock, \ struct six_lock_waiter *wait, \ - six_lock_should_sleep_fn should_sleep_fn, void *p)\ + six_lock_should_sleep_fn should_sleep_fn, void *p,\ + unsigned long ip) \ { \ - return __six_lock_type_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p);\ + return 
__six_lock_type_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\ } \ -EXPORT_SYMBOL_GPL(six_lock_waiter_##type); \ +EXPORT_SYMBOL_GPL(six_lock_ip_waiter_##type); \ \ -void six_unlock_##type(struct six_lock *lock) \ +void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \ { \ - __six_unlock_type(lock, SIX_LOCK_##type); \ + __six_unlock_type(lock, SIX_LOCK_##type, ip); \ } \ -EXPORT_SYMBOL_GPL(six_unlock_##type); +EXPORT_SYMBOL_GPL(six_unlock_ip_##type); __SIX_LOCK(read) __SIX_LOCK(intent) @@ -672,7 +766,7 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) { const struct six_lock_vals l[] = LOCK_VALS; - six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read); + six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_); /* XXX: assert already locked, and that we don't overflow: */ diff --git a/linux/string_helpers.c b/linux/string_helpers.c index 29c498ad..0810ca13 100644 --- a/linux/string_helpers.c +++ b/linux/string_helpers.c @@ -14,7 +14,6 @@ #include <linux/errno.h> #include <linux/fs.h> #include <linux/limits.h> -#include <linux/printbuf.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/string_helpers.h>
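/*
 * Editor's note -- usage sketch, not part of the patch: the time-stats types
 * gain a bch2_ prefix in this patch, and the update path is compiled out when
 * CONFIG_BCACHEFS_NO_LATENCY_ACCT is set.  A caller owning a
 * struct bch2_time_stats (the names below are hypothetical) would use it as:
 */
static struct bch2_time_stats example_op_stats;

static void example_timed_op(void)
{
	u64 start = local_clock();

	/* ... the work being measured ... */

	/*
	 * Records duration and inter-event frequency; effectively a no-op
	 * when latency accounting is configured out.
	 */
	bch2_time_stats_update(&example_op_stats, start);
}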
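/*
 * Editor's note -- usage sketch, not part of the patch: the newly vendored
 * seq_buf API writes into a fixed-size, caller-supplied buffer and records
 * overflow rather than failing partway through, so callers can format
 * unconditionally and check once at the end:
 */
static void example_seq_buf_user(void)
{
	char buf[128];
	struct seq_buf s;

	seq_buf_init(&s, buf, sizeof(buf));

	seq_buf_puts(&s, "write refs: ");
	seq_buf_printf(&s, "%d", 3);

	if (seq_buf_has_overflowed(&s)) {
		/* buf was too small; contents are truncated */
	}
}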
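/*
 * Editor's note -- assumption, not shown in this patch: the exported six-lock
 * entry points now take an explicit instruction pointer so that lockdep and
 * the new contention tracepoints can attribute events to the real call site.
 * The old names presumably survive in six.h as thin wrappers along these
 * lines (signatures inferred from the renames above):
 */
static inline bool six_trylock_read(struct six_lock *lock)
{
	return six_trylock_ip_read(lock, _RET_IP_);
}

static inline void six_unlock_read(struct six_lock *lock)
{
	six_unlock_ip_read(lock, _RET_IP_);
}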