Update bcachefs sources to 92092a772970 bcachefs: fix bch2_can_do_write_btree()

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet 2025-12-02 22:02:26 -05:00
parent 96aa355c1d
commit b601a0f2c3
41 changed files with 632 additions and 523 deletions

View File

@ -1 +1 @@
2a26443359de230e360b7de6531db938bfb0cbd8
92092a7729703f2285902b56aacaae199a3517eb

View File

@ -9,25 +9,30 @@
#define DEFAULT_RATELIMIT_BURST 10
/* issue num suppressed message on exit */
#define RATELIMIT_MSG_ON_RELEASE 1
#define RATELIMIT_MSG_ON_RELEASE BIT(0)
#define RATELIMIT_INITIALIZED BIT(1)
struct ratelimit_state {
raw_spinlock_t lock; /* protect the state */
int interval;
int burst;
int printed;
int missed;
atomic_t rs_n_left;
atomic_t missed;
unsigned int flags;
unsigned long begin;
unsigned long flags;
};
#define RATELIMIT_STATE_INIT(name, interval_init, burst_init) { \
.lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
.interval = interval_init, \
.burst = burst_init, \
#define RATELIMIT_STATE_INIT_FLAGS(name, interval_init, burst_init, flags_init) { \
.lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
.interval = interval_init, \
.burst = burst_init, \
.flags = flags_init, \
}
#define RATELIMIT_STATE_INIT(name, interval_init, burst_init) \
RATELIMIT_STATE_INIT_FLAGS(name, interval_init, burst_init, 0)
#define RATELIMIT_STATE_INIT_DISABLED \
RATELIMIT_STATE_INIT(ratelimit_state, 0, DEFAULT_RATELIMIT_BURST)
@ -36,6 +41,9 @@ struct ratelimit_state {
struct ratelimit_state name = \
RATELIMIT_STATE_INIT(name, interval_init, burst_init) \
extern int ___ratelimit(struct ratelimit_state *rs, const char *func);
#define __ratelimit(state) ___ratelimit(state, __func__)
static inline void ratelimit_state_init(struct ratelimit_state *rs,
int interval, int burst)
{
@ -52,16 +60,43 @@ static inline void ratelimit_default_init(struct ratelimit_state *rs)
DEFAULT_RATELIMIT_BURST);
}
static inline void ratelimit_state_inc_miss(struct ratelimit_state *rs)
{
atomic_inc(&rs->missed);
}
static inline int ratelimit_state_get_miss(struct ratelimit_state *rs)
{
return atomic_read(&rs->missed);
}
static inline int ratelimit_state_reset_miss(struct ratelimit_state *rs)
{
return atomic_xchg(&rs->missed, 0);
}
static inline void ratelimit_state_reset_interval(struct ratelimit_state *rs, int interval_init)
{
unsigned long flags;
raw_spin_lock_irqsave(&rs->lock, flags);
rs->interval = interval_init;
rs->flags &= ~RATELIMIT_INITIALIZED;
atomic_set(&rs->rs_n_left, rs->burst);
ratelimit_state_reset_miss(rs);
raw_spin_unlock_irqrestore(&rs->lock, flags);
}
static inline void ratelimit_state_exit(struct ratelimit_state *rs)
{
int m;
if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE))
return;
if (rs->missed) {
pr_warn("%s: %d output lines suppressed due to ratelimiting\n",
current->comm, rs->missed);
rs->missed = 0;
}
m = ratelimit_state_reset_miss(rs);
if (m)
pr_warn("%s: %d output lines suppressed due to ratelimiting\n", current->comm, m);
}
static inline void
@ -72,13 +107,13 @@ ratelimit_set_flags(struct ratelimit_state *rs, unsigned long flags)
extern struct ratelimit_state printk_ratelimit_state;
extern int ___ratelimit(struct ratelimit_state *rs, const char *func);
#define __ratelimit(state) ___ratelimit(state, __func__)
#ifdef CONFIG_PRINTK
#define WARN_ON_RATELIMIT(condition, state) \
WARN_ON((condition) && __ratelimit(state))
#define WARN_ON_RATELIMIT(condition, state) ({ \
bool __rtn_cond = !!(condition); \
WARN_ON(__rtn_cond && __ratelimit(state)); \
__rtn_cond; \
})
#define WARN_RATELIMIT(condition, format, ...) \
({ \
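A minimal sketch of how the reworked interface above is meant to be used together — the state name, the sketch_ helpers and the message text are illustrative, not taken from this commit; only DEFINE_RATELIMIT_STATE(), __ratelimit() and ratelimit_state_reset_miss() come from the header:

#include <linux/ratelimit.h>
#include <linux/printk.h>

static DEFINE_RATELIMIT_STATE(sketch_rs, DEFAULT_RATELIMIT_INTERVAL,
			      DEFAULT_RATELIMIT_BURST);

static void sketch_report_error(int err)
{
	/* suppressed calls are now counted in the atomic 'missed' field */
	if (__ratelimit(&sketch_rs))
		pr_err("sketch: error %d\n", err);
}

static void sketch_teardown(void)
{
	/*
	 * With RATELIMIT_MSG_ON_RELEASE set, ratelimit_state_exit() prints
	 * the suppressed count itself; here we read and clear it by hand.
	 */
	int missed = ratelimit_state_reset_miss(&sketch_rs);

	if (missed)
		pr_warn("sketch: %d errors were suppressed\n", missed);
}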

View File

@ -50,6 +50,10 @@ DEFINE_LOCK_GUARD_1(spinlock_irq, spinlock_t,
spin_lock_irq(_T->lock),
spin_unlock_irq(_T->lock))
DEFINE_LOCK_GUARD_1(raw_spinlock, spinlock_t,
spin_lock(_T->lock),
spin_unlock(_T->lock))
#if 0
DEFINE_LOCK_GUARD_1_COND(spinlock_irq, _try,
spin_trylock_irq(_T->lock))

View File

@ -824,7 +824,6 @@ static int accounting_read_mem_fixups(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct bch_accounting_mem *acc = &c->accounting;
CLASS(printbuf, underflow_err)();
darray_for_each_reverse(acc->k, i) {
struct disk_accounting_pos acc_k;
@ -863,6 +862,10 @@ static int accounting_read_mem_fixups(struct btree_trans *trans)
eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
accounting_pos_cmp, NULL);
CLASS(bch_log_msg, underflow_err)(c);
prt_printf(&underflow_err.m, "Accounting underflow for\n");
underflow_err.m.suppress = true;
for (unsigned i = 0; i < acc->k.nr; i++) {
struct disk_accounting_pos k;
bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
@ -883,15 +886,12 @@ static int accounting_read_mem_fixups(struct btree_trans *trans)
underflow |= (s64) v[j] < 0;
if (underflow) {
if (!underflow_err.pos) {
bch2_log_msg_start(c, &underflow_err);
prt_printf(&underflow_err, "Accounting underflow for\n");
}
bch2_accounting_key_to_text(&underflow_err, c, &k);
bch2_accounting_key_to_text(&underflow_err.m, c, &k);
for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++)
prt_printf(&underflow_err, " %lli", v[j]);
prt_newline(&underflow_err);
prt_printf(&underflow_err.m, " %lli", v[j]);
prt_newline(&underflow_err.m);
underflow_err.m.suppress = false;
}
guard(preempt)();
@ -922,17 +922,10 @@ static int accounting_read_mem_fixups(struct btree_trans *trans)
}
}
if (underflow_err.pos) {
bool print = bch2_count_fsck_err(c, accounting_key_underflow, &underflow_err);
unsigned pos = underflow_err.pos;
int ret = bch2_run_explicit_recovery_pass(c, &underflow_err,
BCH_RECOVERY_PASS_check_allocations, 0);
print |= underflow_err.pos != pos;
if (print)
bch2_print_str(c, KERN_ERR, underflow_err.buf);
if (ret)
return ret;
if (!underflow_err.m.suppress) {
bch2_count_fsck_err(c, accounting_key_underflow, &underflow_err.m);
try(bch2_run_explicit_recovery_pass(c, &underflow_err.m,
BCH_RECOVERY_PASS_check_allocations, 0));
}
return 0;

View File

@ -897,32 +897,30 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
sectors[ALLOC_cached] > a->cached_sectors ||
sectors[ALLOC_stripe] > a->stripe_sectors) {
if (*nr_iters) {
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
CLASS(bch_log_msg, msg)(c);
prt_printf(&buf, "backpointer sectors > bucket sectors, but found no bad backpointers\n"
prt_printf(&msg.m, "backpointer sectors > bucket sectors, but found no bad backpointers\n"
"bucket %llu:%llu data type %s, counters\n",
alloc_k.k->p.inode,
alloc_k.k->p.offset,
__bch2_data_types[a->data_type]);
if (sectors[ALLOC_dirty] > a->dirty_sectors)
prt_printf(&buf, "dirty: %u > %u\n",
prt_printf(&msg.m, "dirty: %u > %u\n",
sectors[ALLOC_dirty], a->dirty_sectors);
if (sectors[ALLOC_cached] > a->cached_sectors)
prt_printf(&buf, "cached: %u > %u\n",
prt_printf(&msg.m, "cached: %u > %u\n",
sectors[ALLOC_cached], a->cached_sectors);
if (sectors[ALLOC_stripe] > a->stripe_sectors)
prt_printf(&buf, "stripe: %u > %u\n",
prt_printf(&msg.m, "stripe: %u > %u\n",
sectors[ALLOC_stripe], a->stripe_sectors);
for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers,
bucket_pos_to_bp_start(ca, alloc_k.k->p),
bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) {
bch2_bkey_val_to_text(&buf, c, bp_k);
prt_newline(&buf);
bch2_bkey_val_to_text(&msg.m, c, bp_k);
prt_newline(&msg.m);
}
bch2_print_str(c, KERN_ERR, buf.buf);
__WARN();
return ret;
}

View File

@ -718,13 +718,12 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
if (!m || !m->alive) {
gc_stripe_unlock(m);
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
prt_printf(&buf, "pointer to nonexistent stripe %llu\n while marking ",
CLASS(bch_log_msg, msg)(c);
prt_printf(&msg.m, "pointer to nonexistent stripe %llu\n while marking ",
(u64) p.ec.idx);
bch2_bkey_val_to_text(&buf, c, k);
__bch2_inconsistent_error(c, &buf);
bch2_print_str(c, KERN_ERR, buf.buf);
bch2_bkey_val_to_text(&msg.m, c, k);
__bch2_inconsistent_error(c, &msg.m);
return bch_err_throw(c, trigger_stripe_pointer);
}
@ -931,23 +930,20 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
return PTR_ERR(a);
if (a->v.data_type && type && a->v.data_type != type) {
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
prt_printf(&buf, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
CLASS(bch_log_msg, msg)(c);
prt_printf(&msg.m, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s\n",
iter.pos.inode, iter.pos.offset, a->v.gen,
bch2_data_type_str(a->v.data_type),
bch2_data_type_str(type),
bch2_data_type_str(type));
bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf);
bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &msg.m);
ret = bch2_run_explicit_recovery_pass(c, &buf,
BCH_RECOVERY_PASS_check_allocations, 0);
try(bch2_run_explicit_recovery_pass(c, &msg.m,
BCH_RECOVERY_PASS_check_allocations, 0));
/* Always print, this is always fatal */
bch2_print_str(c, KERN_ERR, buf.buf);
return ret ?: bch_err_throw(c, metadata_bucket_inconsistency);
return bch_err_throw(c, metadata_bucket_inconsistency);
}
if (a->v.data_type != type ||

View File

@ -304,6 +304,7 @@
#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
void bch2_print_str_loglevel(struct bch_fs *, int, const char *);
void bch2_print_str(struct bch_fs *, const char *, const char *);
__printf(2, 3)
@ -318,27 +319,24 @@ void __bch2_print(struct bch_fs *c, const char *fmt, ...);
#define bch2_print(_c, ...) __bch2_print(maybe_dev_to_fs(_c), __VA_ARGS__)
#define bch2_ratelimit() \
#define __bch2_ratelimit(_c, _rs) \
(!(_c)->opts.ratelimit_errors || !__ratelimit(_rs))
#define bch2_ratelimit(_c) \
({ \
static DEFINE_RATELIMIT_STATE(rs, \
DEFAULT_RATELIMIT_INTERVAL, \
DEFAULT_RATELIMIT_BURST); \
\
!__ratelimit(&rs); \
__bch2_ratelimit(_c, &rs); \
})
#define bch2_print_ratelimited(_c, ...) \
do { \
if (!bch2_ratelimit()) \
if (!bch2_ratelimit(_c)) \
bch2_print(_c, __VA_ARGS__); \
} while (0)
#define bch2_print_str_ratelimited(_c, ...) \
do { \
if (!bch2_ratelimit()) \
bch2_print_str(_c, __VA_ARGS__); \
} while (0)
#define bch_log(c, loglevel, fmt, ...) \
bch2_print(c, loglevel bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_log_ratelimited(c, loglevel, fmt, ...) \
@ -362,21 +360,11 @@ do { \
#define bch_info_dev(ca, ...) bch_dev_log(ca, KERN_INFO, __VA_ARGS__)
#define bch_verbose_dev(ca, ...) bch_dev_log(ca, KERN_DEBUG, __VA_ARGS__)
#define bch_err_dev_offset(ca, _offset, fmt, ...) \
bch2_print(c, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
#define bch_err_inum(c, _inum, fmt, ...) \
bch2_print(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \
bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
#define bch_err_dev_ratelimited(ca, fmt, ...) \
bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \
bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \
bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
#define bch_err_dev_ratelimited(ca, ...) \
do { \
if (!bch2_ratelimit(ca->fs)) \
bch_err_dev(ca, __VA_ARGS__); \
} while (0)
static inline bool should_print_err(int err)
{
@ -894,7 +882,9 @@ struct bch_fs {
reflink_gc_table reflink_gc_table;
size_t reflink_gc_nr;
#ifndef NO_BCACHEFS_FS
struct bch_fs_vfs vfs;
#endif
/* QUOTAS */
struct bch_memquota_type quotas[QTYP_NR];
@ -1057,4 +1047,57 @@ static inline bool bch2_dev_rotational(struct bch_fs *c, unsigned dev)
return dev != BCH_SB_MEMBER_INVALID && test_bit(dev, c->devs_rotational.d);
}
void __bch2_log_msg_start(const char *, struct printbuf *);
static inline void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out)
{
__bch2_log_msg_start(c->name, out);
}
struct bch_log_msg {
struct bch_fs *c;
u8 loglevel;
struct printbuf m;
};
static inline void bch2_log_msg_exit(struct bch_log_msg *msg)
{
if (!msg->m.suppress)
bch2_print_str_loglevel(msg->c, msg->loglevel, msg->m.buf);
printbuf_exit(&msg->m);
}
static inline struct bch_log_msg bch2_log_msg_init(struct bch_fs *c,
unsigned loglevel,
bool suppress)
{
struct printbuf buf = PRINTBUF;
bch2_log_msg_start(c, &buf);
return (struct bch_log_msg) {
.c = c,
.loglevel = loglevel,
.m = buf,
};
}
DEFINE_CLASS(bch_log_msg, struct bch_log_msg,
bch2_log_msg_exit(&_T),
bch2_log_msg_init(c, 3, false), /* 3 == KERN_ERR */
struct bch_fs *c)
EXTEND_CLASS(bch_log_msg, _level,
bch2_log_msg_init(c, loglevel, false),
struct bch_fs *c, unsigned loglevel)
/*
* Open coded EXTEND_CLASS, because we need the constructor to be a macro for
* ratelimiting to work correctly
*/
typedef class_bch_log_msg_t class_bch_log_msg_ratelimited_t;
static inline void class_bch_log_msg_ratelimited_destructor(class_bch_log_msg_t *p)
{ bch2_log_msg_exit(p); }
#define class_bch_log_msg_ratelimited_constructor(_c) bch2_log_msg_init(_c, 3, bch2_ratelimit(_c))
#endif /* _BCACHEFS_H */
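The bch_log_msg class above replaces the open-coded printbuf + bch2_log_msg_start() + bch2_print_str() pattern that the rest of this commit removes from callers. A rough sketch of the calling convention, assuming it lives in a bcachefs .c file; the function name and the quiet condition are made up for illustration:

static void sketch_report_bucket(struct bch_fs *c, u64 bucket, bool quiet)
{
	/* constructor prefixes the filesystem name, destructor prints at KERN_ERR */
	CLASS(bch_log_msg, msg)(c);

	prt_printf(&msg.m, "something odd about bucket %llu\n", bucket);

	/* setting suppress makes the destructor drop the message instead */
	if (quiet)
		msg.m.suppress = true;
}

The bch_log_msg_ratelimited variant used later in btree_io.c starts out suppressed whenever bch2_ratelimit(c) says so; helpers such as bch2_fs_emergency_read_only2() or bch2_run_explicit_recovery_pass() can still force the message out by clearing suppress.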

View File

@ -537,14 +537,10 @@ static int bch2_topology_check_root(struct btree_trans *trans, enum btree_id btr
if (!r->error)
return 0;
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
prt_printf(&buf, "btree root ");
bch2_btree_id_to_text(&buf, btree);
prt_printf(&buf, " unreadable: %s\n", bch2_err_str(r->error));
int ret = 0;
bool print = true;
CLASS(bch_log_msg, msg)(c);
prt_printf(&msg.m, "btree root ");
bch2_btree_id_to_text(&msg.m, btree);
prt_printf(&msg.m, " unreadable: %s\n", bch2_err_str(r->error));
if (!btree_id_recovers_from_scan(btree)) {
r->alive = false;
@ -552,22 +548,19 @@ static int bch2_topology_check_root(struct btree_trans *trans, enum btree_id btr
bch2_btree_root_alloc_fake_trans(trans, btree, 0);
*reconstructed_root = true;
ret = bch2_btree_lost_data(c, &buf, btree);
try(bch2_btree_lost_data(c, &msg.m, btree));
} else {
ret = bch2_btree_has_scanned_nodes(c, btree, &buf);
int ret = bch2_btree_has_scanned_nodes(c, btree, &msg.m);
if (ret < 0)
return ret;
if (ret < 0) {
/*
* just log our message, we'll be rewinding to run
* btree node scan
*/
} else if (!ret) {
print = false;
if (!ret) {
msg.m.suppress = true;
__fsck_err(trans,
FSCK_CAN_FIX|(btree_id_can_reconstruct(btree) ? FSCK_AUTOFIX : 0),
btree_root_unreadable_and_scan_found_nothing,
"%sbtree node scan found no nodes, continue?", buf.buf);
__ret_fsck_err(trans,
FSCK_CAN_FIX|(btree_id_can_reconstruct(btree) ? FSCK_AUTOFIX : 0),
btree_root_unreadable_and_scan_found_nothing,
"%sbtree node scan found no nodes, continue?", msg.m.buf);
r->alive = false;
r->error = 0;
@ -582,37 +575,39 @@ static int bch2_topology_check_root(struct btree_trans *trans, enum btree_id btr
bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
size_t nodes_found = 0;
try(bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX, &buf, &nodes_found));
try(bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX, &msg.m, &nodes_found));
}
}
if (print)
bch2_print_str(c, KERN_NOTICE, buf.buf);
fsck_err:
bch_err_fn(c, ret);
return ret;
return 0;
}
static void ratelimit_reset(struct ratelimit_state *rs)
{
guard(raw_spinlock)(&rs->lock);
atomic_set(&rs->rs_n_left, 0);
atomic_set(&rs->missed, 0);
rs->flags = 0;
rs->begin = 0;
}
int bch2_check_topology(struct bch_fs *c)
{
CLASS(btree_trans, trans)(c);
int ret = 0;
bch2_trans_srcu_unlock(trans);
for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
bool reconstructed_root = false;
recover:
ret = lockrestart_do(trans, bch2_topology_check_root(trans, i, &reconstructed_root));
if (ret)
break;
try(lockrestart_do(trans, bch2_topology_check_root(trans, i, &reconstructed_root)));
struct btree_root *r = bch2_btree_id_root(c, i);
struct btree *b = r->b;
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
ret = btree_check_root_boundaries(trans, b) ?:
bch2_btree_repair_topology_recurse(trans, b);
int ret = btree_check_root_boundaries(trans, b) ?:
bch2_btree_repair_topology_recurse(trans, b);
six_unlock_read(&b->c.lock);
if (bch2_err_matches(ret, BCH_ERR_topology_repair_drop_this_node)) {
@ -633,9 +628,19 @@ recover:
r->alive = false;
ret = 0;
}
if (ret)
return ret;
}
return ret;
/*
* post topology repair there should be no errored nodes; reset
* ratelimiters so we see new unexpected errors
*/
ratelimit_reset(&c->btree.read_errors_soft);
ratelimit_reset(&c->btree.read_errors_hard);
return 0;
}
/* marking of btree keys/nodes: */

View File

@ -64,6 +64,15 @@ int bch2_fs_btree_init(struct bch_fs *c)
try(bch2_fs_btree_iter_init(c));
try(bch2_fs_btree_key_cache_init(&c->btree.key_cache));
c->btree.read_errors_soft = (struct ratelimit_state)
RATELIMIT_STATE_INIT(btree_read_error_soft,
DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
c->btree.read_errors_hard = (struct ratelimit_state)
RATELIMIT_STATE_INIT(btree_read_error_hard,
DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
return 0;
}

View File

@ -53,6 +53,13 @@ static void bch2_btree_update_to_text(struct printbuf *, struct btree_update *);
static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
btree_path_idx_t, struct btree *, struct keylist *);
static int btree_node_topology_err(struct bch_fs *c, struct btree *b, struct printbuf *out)
{
bch2_btree_pos_to_text(out, c, b);
prt_newline(out);
return __bch2_topology_error(c, out);
}
/*
* Verify that child nodes correctly span parent node's range:
*/
@ -62,8 +69,6 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2
? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
: b->data->min_key;
CLASS(printbuf, buf)();
int ret = 0;
BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
!bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
@ -72,7 +77,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
struct bkey_buf prev __cleanup(bch2_bkey_buf_exit);
bch2_bkey_buf_init(&prev);
struct btree_and_journal_iter iter;
struct btree_and_journal_iter iter __cleanup(bch2_btree_and_journal_iter_exit);
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
/*
@ -81,33 +86,33 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
*/
if (b == btree_node_root(c, b)) {
if (!bpos_eq(b->data->min_key, POS_MIN)) {
bch2_log_msg_start(c, &buf);
prt_printf(&buf, "btree root with incorrect min_key: ");
bch2_bpos_to_text(&buf, b->data->min_key);
prt_newline(&buf);
CLASS(bch_log_msg, msg)(c);
prt_printf(&msg.m, "btree root with incorrect min_key: ");
bch2_bpos_to_text(&msg.m, b->data->min_key);
prt_newline(&msg.m);
bch2_count_fsck_err(c, btree_root_bad_min_key, &buf);
goto err;
bch2_count_fsck_err(c, btree_root_bad_min_key, &msg.m);
return btree_node_topology_err(c, b, &msg.m);
}
if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
bch2_log_msg_start(c, &buf);
prt_printf(&buf, "btree root with incorrect max_key: ");
bch2_bpos_to_text(&buf, b->data->max_key);
prt_newline(&buf);
CLASS(bch_log_msg, msg)(c);
prt_printf(&msg.m, "btree root with incorrect max_key: ");
bch2_bpos_to_text(&msg.m, b->data->max_key);
prt_newline(&msg.m);
bch2_count_fsck_err(c, btree_root_bad_max_key, &buf);
goto err;
bch2_count_fsck_err(c, btree_root_bad_max_key, &msg.m);
return btree_node_topology_err(c, b, &msg.m);
}
}
if (!b->c.level)
goto out;
return 0;
struct bkey_s_c k;
while ((k = bch2_btree_and_journal_iter_peek(c, &iter)).k) {
if (k.k->type != KEY_TYPE_btree_ptr_v2)
goto out;
return 0;
struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
@ -116,15 +121,16 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
: bpos_successor(prev.k->k.p);
if (!bpos_eq(expected_min, bp.v->min_key)) {
prt_str(&buf, "end of prev node doesn't match start of next node");
prt_str(&buf, "\nprev ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
prt_str(&buf, "\nnext ");
bch2_bkey_val_to_text(&buf, c, k);
prt_newline(&buf);
CLASS(bch_log_msg, msg)(c);
prt_str(&msg.m, "end of prev node doesn't match start of next node");
prt_str(&msg.m, "\nprev ");
bch2_bkey_val_to_text(&msg.m, c, bkey_i_to_s_c(prev.k));
prt_str(&msg.m, "\nnext ");
bch2_bkey_val_to_text(&msg.m, c, k);
prt_newline(&msg.m);
bch2_count_fsck_err(c, btree_node_topology_bad_min_key, &buf);
goto err;
bch2_count_fsck_err(c, btree_node_topology_bad_min_key, &msg.m);
return btree_node_topology_err(c, b, &msg.m);
}
bch2_bkey_buf_reassemble(&prev, k);
@ -132,32 +138,23 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
}
if (bkey_deleted(&prev.k->k)) {
prt_printf(&buf, "empty interior node\n");
bch2_count_fsck_err(c, btree_node_topology_empty_interior_node, &buf);
goto err;
CLASS(bch_log_msg, msg)(c);
prt_printf(&msg.m, "empty interior node\n");
bch2_count_fsck_err(c, btree_node_topology_empty_interior_node, &msg.m);
return btree_node_topology_err(c, b, &msg.m);
}
if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
prt_str(&buf, "last child node doesn't end at end of parent node\nchild: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
prt_newline(&buf);
CLASS(bch_log_msg, msg)(c);
prt_str(&msg.m, "last child node doesn't end at end of parent node\nchild: ");
bch2_bkey_val_to_text(&msg.m, c, bkey_i_to_s_c(prev.k));
prt_newline(&msg.m);
bch2_count_fsck_err(c, btree_node_topology_bad_max_key, &buf);
goto err;
bch2_count_fsck_err(c, btree_node_topology_bad_max_key, &msg.m);
return btree_node_topology_err(c, b, &msg.m);
}
out:
bch2_btree_and_journal_iter_exit(&iter);
return ret;
err:
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
prt_char(&buf, ' ');
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
prt_newline(&buf);
ret = __bch2_topology_error(c, &buf);
bch2_print_str(c, KERN_ERR, buf.buf);
BUG_ON(!ret);
goto out;
return 0;
}
/* Calculate ideal packed bkey format for new btree nodes: */
@ -1880,15 +1877,12 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
bch2_verify_keylist_sorted(keys);
if (!btree_node_intent_locked(path, b->c.level)) {
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
prt_printf(&buf, "%s(): node not locked at level %u\n",
CLASS(bch_log_msg, msg)(c);
prt_printf(&msg.m, "%s(): node not locked at level %u\n",
__func__, b->c.level);
bch2_btree_update_to_text(&buf, as);
bch2_btree_path_to_text(&buf, trans, path_idx, path);
bch2_fs_emergency_read_only2(c, &buf);
bch2_print_str(c, KERN_ERR, buf.buf);
bch2_btree_update_to_text(&msg.m, as);
bch2_btree_path_to_text(&msg.m, trans, path_idx, path);
bch2_fs_emergency_read_only2(c, &msg.m);
return -EIO;
}
@ -2121,21 +2115,19 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
}
if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) {
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
CLASS(bch_log_msg, msg)(c);
prt_printf(&buf, "%s(): ", __func__);
ret = __bch2_topology_error(c, &buf);
prt_newline(&buf);
prt_str(&msg.m, "btree node merge: end of prev node doesn't match start of next node\n");
prt_printf(&buf, "prev ends at ");
bch2_bpos_to_text(&buf, prev->data->max_key);
prt_newline(&buf);
prt_printf(&msg.m, "prev ends at ");
bch2_bpos_to_text(&msg.m, prev->data->max_key);
prt_newline(&msg.m);
prt_printf(&buf, "next starts at ");
bch2_bpos_to_text(&buf, next->data->min_key);
prt_printf(&msg.m, "next starts at ");
bch2_bpos_to_text(&msg.m, next->data->min_key);
prt_newline(&msg.m);
bch2_print_str(c, KERN_ERR, buf.buf);
ret = __bch2_topology_error(c, &msg.m);
goto err;
}

View File

@ -735,16 +735,13 @@ void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
static noinline_for_stack int btree_node_root_err(struct btree_trans *trans, struct btree *b)
{
struct bch_fs *c = trans->c;
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
CLASS(bch_log_msg, msg)(c);
prt_str(&buf, "btree root doesn't cover expected range:\n");
bch2_btree_pos_to_text(&buf, c, b);
prt_newline(&buf);
prt_str(&msg.m, "btree root doesn't cover expected range:\n");
bch2_btree_pos_to_text(&msg.m, c, b);
prt_newline(&msg.m);
int ret = __bch2_topology_error(c, &buf);
bch2_print_str(trans->c, KERN_ERR, buf.buf);
return ret;
return __bch2_topology_error(c, &msg.m);
}
static inline int btree_path_lock_root(struct btree_trans *trans,
@ -910,17 +907,15 @@ static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans,
struct btree_path *path)
{
struct bch_fs *c = trans->c;
CLASS(printbuf, buf)();
CLASS(bch_log_msg, msg)(c);
prt_str(&buf, "node not found at pos: ");
bch2_bpos_to_text(&buf, path->pos);
prt_str(&buf, "\n within parent node ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key));
prt_newline(&buf);
prt_str(&msg.m, "node not found at pos: ");
bch2_bpos_to_text(&msg.m, path->pos);
prt_str(&msg.m, "\n within parent node ");
bch2_bkey_val_to_text(&msg.m, c, bkey_i_to_s_c(&path_l(path)->b->key));
prt_newline(&msg.m);
int ret = __bch2_topology_error(c, &buf);
bch2_print_str(trans->c, KERN_ERR, buf.buf);
return ret;
return __bch2_topology_error(c, &msg.m);
}
static noinline_for_stack int btree_node_gap_err(struct btree_trans *trans,
@ -928,19 +923,17 @@ static noinline_for_stack int btree_node_gap_err(struct btree_trans *trans,
struct bkey_i *k)
{
struct bch_fs *c = trans->c;
CLASS(printbuf, buf)();
CLASS(bch_log_msg, msg)(c);
prt_str(&buf, "node doesn't cover expected range at pos: ");
bch2_bpos_to_text(&buf, path->pos);
prt_str(&buf, "\n within parent node ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key));
prt_str(&buf, "\n but got node: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
prt_newline(&buf);
prt_str(&msg.m, "node doesn't cover expected range at pos: ");
bch2_bpos_to_text(&msg.m, path->pos);
prt_str(&msg.m, "\n within parent node ");
bch2_bkey_val_to_text(&msg.m, c, bkey_i_to_s_c(&path_l(path)->b->key));
prt_str(&msg.m, "\n but got node: ");
bch2_bkey_val_to_text(&msg.m, c, bkey_i_to_s_c(k));
prt_newline(&msg.m);
int ret = __bch2_topology_error(c, &buf);
bch2_print_str(trans->c, KERN_ERR, buf.buf);
return ret;
return __bch2_topology_error(c, &msg.m);
}
static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
@ -1673,13 +1666,10 @@ void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
static noinline __cold
void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
{
CLASS(printbuf, buf)();
bch2_log_msg_start(trans->c, &buf);
CLASS(bch_log_msg, msg)(trans->c);
__bch2_trans_paths_to_text(&buf, trans, nosort);
bch2_trans_updates_to_text(&buf, trans);
bch2_print_str(trans->c, KERN_ERR, buf.buf);
__bch2_trans_paths_to_text(&msg.m, trans, nosort);
bch2_trans_updates_to_text(&msg.m, trans);
}
noinline __cold
@ -3297,13 +3287,11 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long
if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) {
#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
prt_printf(&buf, "bump allocator exceeded BTREE_TRANS_MEM_MAX (%u)\n",
CLASS(bch_log_msg, msg)(c);
prt_printf(&msg.m, "bump allocator exceeded BTREE_TRANS_MEM_MAX (%u)\n",
BTREE_TRANS_MEM_MAX);
bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace);
bch2_print_str(c, KERN_ERR, buf.buf);
bch2_trans_kmalloc_trace_to_text(&msg.m, &trans->trans_kmalloc_trace);
#endif
}
@ -3655,18 +3643,16 @@ static void check_btree_paths_leaked(struct btree_trans *trans)
struct btree_path *path;
unsigned i;
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
CLASS(bch_log_msg, msg)(c);
prt_printf(&buf, "btree paths leaked from %s!\n", trans->fn);
prt_printf(&msg.m, "btree paths leaked from %s!\n", trans->fn);
trans_for_each_path(trans, path, i)
if (path->ref)
prt_printf(&buf, "btree %s %pS\n",
prt_printf(&msg.m, "btree %s %pS\n",
bch2_btree_id_str(path->btree_id),
(void *) path->ip_allocated);
bch2_fs_emergency_read_only2(c, &buf);
bch2_print_str(c, KERN_ERR, buf.buf);
bch2_fs_emergency_read_only2(c, &msg.m);
}
}
#else

View File

@ -1010,14 +1010,26 @@ start:
* only print retry success if we read from a replica with no errors
*/
if (ret) {
/*
* Initialize buf.suppress before btree_lost_data(); that will
* clear it if it did any work (scheduling recovery passes,
* marking superblock
* dirty)
*/
buf.suppress = !__bch2_ratelimit(c, &c->btree.read_errors_hard);
set_btree_node_read_error(b);
bch2_btree_lost_data(c, &buf, b->c.btree_id);
prt_printf(&buf, "ret %s", bch2_err_str(ret));
} else if (failed.nr) {
/* Separate ratelimit states for soft vs. hard errors */
buf.suppress = !__bch2_ratelimit(c, &c->btree.read_errors_soft);
if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev))
prt_printf(&buf, "retry success");
else
prt_printf(&buf, "repair success");
} else {
buf.suppress = true;
}
if ((failed.nr ||
@ -1029,8 +1041,8 @@ start:
}
prt_newline(&buf);
if (ret || failed.nr)
bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
if (!buf.suppress)
bch2_print_str(c, ret ? KERN_ERR : KERN_NOTICE, buf.buf);
/*
* Do this late; unlike other btree_node_need_rewrite() cases if a node
@ -1086,21 +1098,15 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
NULL, &pick, -1);
if (ret <= 0) {
bool print = !bch2_ratelimit();
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
CLASS(bch_log_msg_ratelimited, msg)(c);
prt_str(&buf, "btree node read error: no device to read from\n at ");
bch2_btree_pos_to_text(&buf, c, b);
prt_newline(&buf);
bch2_btree_lost_data(c, &buf, b->c.btree_id);
prt_str(&msg.m, "btree node read error: no device to read from\n at ");
bch2_btree_pos_to_text(&msg.m, c, b);
prt_newline(&msg.m);
bch2_btree_lost_data(c, &msg.m, b->c.btree_id);
if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
bch2_fs_emergency_read_only2(c, &buf))
print = true;
if (print)
bch2_print_str(c, KERN_ERR, buf.buf);
if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology))
bch2_fs_emergency_read_only2(c, &msg.m);
set_btree_node_read_error(b);
clear_btree_node_read_in_flight(b);

View File

@ -721,6 +721,8 @@ struct bch_fs_btree {
struct bio_set bio;
mempool_t fill_iter;
struct workqueue_struct *read_complete_wq;
struct ratelimit_state read_errors_soft;
struct ratelimit_state read_errors_hard;
struct workqueue_struct *write_submit_wq;
struct workqueue_struct *write_complete_wq;

View File

@ -154,28 +154,28 @@ static void btree_node_write_work(struct work_struct *work)
if ((ret && !bch2_err_matches(ret, EROFS)) ||
wbio->wbio.failed.nr) {
bool print = !bch2_ratelimit();
CLASS(bch_log_msg, msg)(c);
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
prt_printf(&buf, "error writing btree node at ");
bch2_btree_pos_to_text(&buf, c, b);
prt_newline(&buf);
/* Separate ratelimit_states for hard and soft errors */
msg.m.suppress = !ret
? bch2_ratelimit(c)
: bch2_ratelimit(c);
bch2_io_failures_to_text(&buf, c, &wbio->wbio.failed);
prt_printf(&msg.m, "error writing btree node at ");
bch2_btree_pos_to_text(&msg.m, c, b);
prt_newline(&msg.m);
bch2_io_failures_to_text(&msg.m, c, &wbio->wbio.failed);
if (!ret) {
prt_printf(&buf, "wrote degraded to ");
prt_printf(&msg.m, "wrote degraded to ");
struct bch_devs_list d = bch2_bkey_devs(c, bkey_i_to_s_c(&b->key));
bch2_devs_list_to_text(&buf, c, &d);
prt_newline(&buf);
bch2_devs_list_to_text(&msg.m, c, &d);
prt_newline(&msg.m);
} else {
prt_printf(&buf, "%s\n", bch2_err_str(ret));
print = bch2_fs_emergency_read_only2(c, &buf);
prt_printf(&msg.m, "%s\n", bch2_err_str(ret));
bch2_fs_emergency_read_only2(c, &msg.m);
}
if (print)
bch2_print_str(c, KERN_ERR, buf.buf);
}
}

View File

@ -985,6 +985,16 @@ void bch2_bkey_drop_ptr(const struct bch_fs *c, struct bkey_s k, struct bch_exte
}
}
void bch2_bkey_drop_ptrs_mask(const struct bch_fs *c, struct bkey_i *k, unsigned ptrs)
{
while (ptrs) {
unsigned i = 0, drop = __fls(ptrs);
bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(k), p, entry, i++ == drop);
ptrs ^= 1U << drop;
}
}
void bch2_bkey_drop_device_noerror(const struct bch_fs *c, struct bkey_s k, unsigned dev)
{
bch2_bkey_drop_ptrs_noerror(k, p, entry, p.ptr.dev == dev);
@ -995,7 +1005,7 @@ void bch2_bkey_drop_device(const struct bch_fs *c, struct bkey_s k, unsigned dev
bch2_bkey_drop_ptrs(k, p, entry, p.ptr.dev == dev);
}
void bch2_bkey_drop_ec(const struct bch_fs *c, struct bkey_i *k, unsigned dev)
static void bch2_bkey_drop_ec(const struct bch_fs *c, struct bkey_i *k, unsigned dev)
{
struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
union bch_extent_entry *entry, *ec = NULL;
@ -1011,6 +1021,22 @@ void bch2_bkey_drop_ec(const struct bch_fs *c, struct bkey_i *k, unsigned dev)
}
}
void bch2_bkey_drop_ec_mask(const struct bch_fs *c, struct bkey_i *k, unsigned mask)
{
while (mask) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
unsigned ptr_bit = 1;
bkey_for_each_ptr(ptrs, ptr) {
if (mask & ptr_bit) {
bch2_bkey_drop_ec(c, k, ptr->dev);
mask &= ~ptr_bit;
break;
}
ptr_bit <<= 1;
}
}
}
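Both _mask helpers take a bitmap indexed by the pointer's position within the key, matching data_update's ptrs_kill / ptrs_kill_ec fields (see the data_update.c hunk further down). A sketch of a caller building such a mask — the helper name and its drop-every-pointer-on-a-device policy are invented purely to show the indexing; bch2_bkey_drop_device() already does this directly:

static void sketch_drop_dev_ptrs(struct bch_fs *c, struct bkey_i *k, unsigned dev_idx)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
	unsigned mask = 0, i = 0;

	/* bit i of the mask names the i'th pointer in the key */
	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->dev == dev_idx)
			mask |= BIT(i);
		i++;
	}

	bch2_bkey_drop_ptrs_mask(c, k, mask);
}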
const struct bch_extent_ptr *bch2_bkey_has_device_c(const struct bch_fs *c, struct bkey_s_c k, unsigned dev)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

View File

@ -631,10 +631,11 @@ void bch2_extent_ptr_decoded_append(const struct bch_fs *, struct bkey_i *,
struct extent_ptr_decoded *);
void bch2_bkey_drop_ptr_noerror(const struct bch_fs *, struct bkey_s, struct bch_extent_ptr *);
void bch2_bkey_drop_ptr(const struct bch_fs *, struct bkey_s, struct bch_extent_ptr *);
void bch2_bkey_drop_ptrs_mask(const struct bch_fs *, struct bkey_i *, unsigned);
void bch2_bkey_drop_device_noerror(const struct bch_fs *, struct bkey_s, unsigned);
void bch2_bkey_drop_device(const struct bch_fs *, struct bkey_s, unsigned);
void bch2_bkey_drop_ec(const struct bch_fs *, struct bkey_i *k, unsigned);
void bch2_bkey_drop_ec_mask(const struct bch_fs *, struct bkey_i *k, unsigned);
#define bch2_bkey_drop_ptrs_noerror(_k, _p, _entry, _cond) \
do { \

View File

@ -320,7 +320,7 @@ int bch2_move_extent(struct moving_context *ctxt,
struct bch_devs_list devs_have = bch2_data_update_devs_keeping(c, &data_opts, k);
if (data_opts.type != BCH_DATA_UPDATE_copygc)
try(bch2_can_do_write(c, &data_opts, k, &devs_have));
try(bch2_can_do_write(c, &opts, &data_opts, k, &devs_have));
ret = bch2_btree_node_rewrite_pos(trans, iter->btree_id, level, k.k->p,
data_opts.target, 0, data_opts.write_flags);

View File

@ -723,36 +723,30 @@ static void bch2_rbio_retry(struct work_struct *work)
ret = 0;
if (failed.nr || ret) {
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
CLASS(bch_log_msg, msg)(c);
bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
/* Separate ratelimit_states for hard and soft errors */
msg.m.suppress = !ret
? bch2_ratelimit(c)
: bch2_ratelimit(c);
prt_str(&buf, "data read error, ");
bch2_read_err_msg_trans(trans, &msg.m, rbio, read_pos);
prt_str(&msg.m, "data read error, ");
if (!ret) {
prt_str(&buf, "successful retry");
prt_str(&msg.m, "successful retry");
if (rbio->self_healing)
prt_str(&buf, ", self healing");
prt_str(&msg.m, ", self healing");
} else
prt_str(&buf, bch2_err_str(ret));
prt_newline(&buf);
prt_str(&msg.m, bch2_err_str(ret));
prt_newline(&msg.m);
if (!bkey_deleted(&sk.k->k)) {
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k));
prt_newline(&buf);
bch2_bkey_val_to_text(&msg.m, c, bkey_i_to_s_c(sk.k));
prt_newline(&msg.m);
}
bch2_io_failures_to_text(&buf, c, &failed);
static struct ratelimit_state rs[2] = {
RATELIMIT_STATE_INIT("read_retry", DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST),
RATELIMIT_STATE_INIT("read_error", DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST),
};
struct ratelimit_state *r = &rs[ret != 0];
if (__ratelimit(r))
bch2_print_str(c, KERN_ERR, buf.buf);
bch2_io_failures_to_text(&msg.m, c, &failed);
}
/* drop trans before calling rbio_done() */

View File

@ -1465,7 +1465,7 @@ static int do_reconcile_extent(struct moving_context *ctxt,
reconcile_set_data_opts(trans, NULL, data_pos.btree, k, &opts, &data_opts);
struct bch_devs_list devs_have = bch2_data_update_devs_keeping(c, &data_opts, k);
int ret = bch2_can_do_write(c, &data_opts, k, &devs_have);
int ret = bch2_can_do_write(c, &opts, &data_opts, k, &devs_have);
if (ret) {
if (is_reconcile_pending_err(c, k, ret))
return 0;

View File

@ -610,21 +610,11 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans,
struct bkey_i *n = errptr_try(bch2_bkey_make_mut_noupdate(trans, k));
const union bch_extent_entry *entry;
struct extent_ptr_decoded p = {};
unsigned i = 0;
bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
if (data_opts->ptrs_kill_ec & BIT(i))
bch2_bkey_drop_ec(c, n, p.ptr.dev);
i++;
}
if (data_opts->ptrs_kill_ec)
bch2_bkey_drop_ec_mask(c, n, data_opts->ptrs_kill_ec);
while (data_opts->ptrs_kill) {
unsigned i = 0, drop = __fls(data_opts->ptrs_kill);
bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), p, entry, i++ == drop);
data_opts->ptrs_kill ^= 1U << drop;
}
if (data_opts->ptrs_kill)
bch2_bkey_drop_ptrs_mask(c, n, data_opts->ptrs_kill);
/*
* If the new extent no longer has any pointers, bch2_extent_normalize()
@ -740,39 +730,44 @@ static unsigned bch2_bkey_durability_on_target(struct bch_fs *c, struct bkey_s_c
return durability;
}
static int bch2_can_do_write_btree(struct bch_fs *c, struct data_update_opts *opts, struct bkey_s_c k)
static int bch2_can_do_write_btree(struct bch_fs *c,
struct bch_inode_opts *opts,
struct data_update_opts *data_opts, struct bkey_s_c k)
{
enum bch_watermark watermark = opts->commit_flags & BCH_WATERMARK_MASK;
enum bch_watermark watermark = data_opts->commit_flags & BCH_WATERMARK_MASK;
if (opts->target)
if (durability_available_on_target(c, watermark, opts->target) >
bch2_bkey_durability_on_target(c, k, opts->target))
return 0;
if (durability_available_on_target(c, watermark, data_opts->target) >
bch2_bkey_durability_on_target(c, k, data_opts->target))
return 0;
if (!opts->target || !(opts->write_flags & BCH_WRITE_only_specified_devs))
if (durability_available_on_target(c, watermark, 0) >
bch2_bkey_durability(c, k))
if (!(data_opts->write_flags & BCH_WRITE_only_specified_devs)) {
unsigned d = bch2_bkey_durability(c, k);
if (d < opts->data_replicas &&
d < durability_available_on_target(c, watermark, 0))
return 0;
}
return bch_err_throw(c, data_update_fail_no_rw_devs);
}
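Reading the changed fallback with opts->data_replicas == 2 as a concrete case: a btree key that already carries durability 2 no longer satisfies it, since d < opts->data_replicas fails, while a key with durability 1 still does as long as the unrestricted rw devices can offer durability greater than 1. The old code compared available durability only against the key's current durability, so a node that had already reached its replica count could still pass this check.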
int bch2_can_do_write(struct bch_fs *c, struct data_update_opts *opts,
int bch2_can_do_write(struct bch_fs *c,
struct bch_inode_opts *opts,
struct data_update_opts *data_opts,
struct bkey_s_c k, struct bch_devs_list *devs_have)
{
enum bch_watermark watermark = opts->commit_flags & BCH_WATERMARK_MASK;
enum bch_watermark watermark = data_opts->commit_flags & BCH_WATERMARK_MASK;
if ((opts->write_flags & BCH_WRITE_alloc_nowait) &&
if ((data_opts->write_flags & BCH_WRITE_alloc_nowait) &&
unlikely(c->allocator.open_buckets_nr_free <= bch2_open_buckets_reserved(watermark)))
return bch_err_throw(c, data_update_fail_would_block);
guard(rcu)();
if (bkey_is_btree_ptr(k.k))
return bch2_can_do_write_btree(c, opts, k);
return bch2_can_do_write_btree(c, opts, data_opts, k);
unsigned target = opts->write_flags & BCH_WRITE_only_specified_devs
? opts->target
unsigned target = data_opts->write_flags & BCH_WRITE_only_specified_devs
? data_opts->target
: 0;
struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
@ -1001,7 +996,7 @@ int bch2_data_update_init(struct btree_trans *trans,
* single durability=2 device)
*/
if (data_opts.type != BCH_DATA_UPDATE_copygc) {
ret = bch2_can_do_write(c, &m->opts, k, &m->op.devs_have);
ret = bch2_can_do_write(c, io_opts, &m->opts, k, &m->op.devs_have);
if (ret)
goto out;
}

View File

@ -88,7 +88,8 @@ void bch2_data_update_read_done(struct data_update *);
struct bch_devs_list bch2_data_update_devs_keeping(struct bch_fs *,
struct data_update_opts *,
struct bkey_s_c);
int bch2_can_do_write(struct bch_fs *, struct data_update_opts *,
int bch2_can_do_write(struct bch_fs *, struct bch_inode_opts *,
struct data_update_opts *,
struct bkey_s_c, struct bch_devs_list *);
void bch2_data_update_exit(struct data_update *, int);

View File

@ -257,14 +257,12 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
s64 bi_sectors = le64_to_cpu(inode->v.bi_sectors);
if (unlikely(bi_sectors + i_sectors_delta < 0)) {
struct bch_fs *c = trans->c;
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0",
CLASS(bch_log_msg, msg)(c);
prt_printf(&msg.m, "inode %llu i_sectors underflow: %lli + %lli < 0",
extent_iter->pos.inode, bi_sectors, i_sectors_delta);
bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf);
if (print)
bch2_print_str(c, KERN_ERR, buf.buf);
msg.m.suppress = !bch2_count_fsck_err(c, inode_i_sectors_underflow, &msg.m);
if (i_sectors_delta < 0)
i_sectors_delta = -bi_sectors;
@ -424,7 +422,6 @@ static int bch2_write_index_default(struct bch_write_op *op)
static void bch2_log_write_error_start(struct printbuf *out, struct bch_write_op *op, u64 offset)
{
bch2_log_msg_start(op->c, out);
prt_printf(out, "error writing data at ");
struct bpos pos = op->pos;
@ -445,16 +442,14 @@ static void bch2_log_write_error_start(struct printbuf *out, struct bch_write_op
void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...)
{
CLASS(printbuf, buf)();
bch2_log_write_error_start(&buf, op, offset);
CLASS(bch_log_msg_ratelimited, msg)(op->c);
bch2_log_write_error_start(&msg.m, op, offset);
va_list args;
va_start(args, fmt);
prt_vprintf(&buf, fmt, args);
prt_vprintf(&msg.m, fmt, args);
va_end(args);
prt_newline(&buf);
bch2_print_str_ratelimited(op->c, KERN_ERR, buf.buf);
}
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
@ -596,26 +591,27 @@ static void __bch2_write_index(struct bch_write_op *op)
int ret = 0;
if (unlikely(op->io_error)) {
struct bkey_i *k = bch2_keylist_front(&op->insert_keys);
bool print;
CLASS(printbuf, buf)();
bch2_log_write_error_start(&buf, op, bkey_start_offset(&k->k));
bch2_io_failures_to_text(&buf, c, &op->wbio.failed);
ret = bch2_write_drop_io_error_ptrs(op);
if (!ret) {
prt_printf(&buf, "wrote degraded to ");
struct bch_devs_list d = bch2_bkey_devs(c, bkey_i_to_s_c(k));
bch2_devs_list_to_text(&buf, c, &d);
prt_newline(&buf);
print = !bch2_ratelimit(); /* Different ratelimits for hard and soft errors */
} else {
prt_printf(&buf, "all replicated writes failed\n");
print = !bch2_ratelimit();
}
if (print)
bch2_print_str(c, KERN_ERR, buf.buf);
CLASS(bch_log_msg, msg)(c);
/* Separate ratelimit_states for hard and soft errors */
msg.m.suppress = !ret
? bch2_ratelimit(c)
: bch2_ratelimit(c);
struct bkey_i *k = bch2_keylist_front(&op->insert_keys);
bch2_log_write_error_start(&msg.m, op, bkey_start_offset(&k->k));
bch2_io_failures_to_text(&msg.m, c, &op->wbio.failed);
if (!ret) {
prt_printf(&msg.m, "wrote degraded to ");
struct bch_devs_list d = bch2_bkey_devs(c, bkey_i_to_s_c(k));
bch2_devs_list_to_text(&msg.m, c, &d);
prt_newline(&msg.m);
} else {
prt_printf(&msg.m, "all replicated writes failed\n");
}
if (ret)
goto err;

View File

@ -490,13 +490,10 @@ STORE(bch2_fs)
__bch2_delete_dead_snapshots(c);
if (attr == &sysfs_trigger_emergency_read_only) {
struct printbuf buf = PRINTBUF;
bch2_log_msg_start(c, &buf);
CLASS(bch_log_msg, msg)(c);
prt_printf(&buf, "shutdown by sysfs\n");
bch2_fs_emergency_read_only2(c, &buf);
bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
prt_printf(&msg.m, "shutdown by sysfs\n");
bch2_fs_emergency_read_only2(c, &msg.m);
}
#ifdef CONFIG_BCACHEFS_TESTS

View File

@ -112,17 +112,14 @@ int __bch2_topology_error(struct bch_fs *c, struct printbuf *out)
int bch2_fs_topology_error(struct bch_fs *c, const char *fmt, ...)
{
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
CLASS(bch_log_msg, msg)(c);
va_list args;
va_start(args, fmt);
prt_vprintf(&buf, fmt, args);
prt_vprintf(&msg.m, fmt, args);
va_end(args);
int ret = __bch2_topology_error(c, &buf);
bch2_print_str(c, KERN_ERR, buf.buf);
return ret;
return __bch2_topology_error(c, &msg.m);
}
void bch2_fatal_error(struct bch_fs *c)

View File

@ -18,13 +18,6 @@ struct work_struct;
/* Error messages: */
void __bch2_log_msg_start(const char *, struct printbuf *);
static inline void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out)
{
__bch2_log_msg_start(c->name, out);
}
/*
* Inconsistency errors: The on disk data is inconsistent. If these occur during
* initial recovery, they don't indicate a bug in the running code - we walk all

View File

@ -101,34 +101,44 @@ const char * const bch2_write_refs[] = {
};
#undef x
static bool should_print_loglevel(struct bch_fs *c, const char *fmt)
static int kern_soh_to_loglevel(const char *fmt)
{
unsigned loglevel_opt = c->loglevel ?: c->opts.verbose ? 7: 6;
bool have_soh = fmt[0] == KERN_SOH[0];
bool have_loglevel = have_soh && fmt[1] >= '0' && fmt[1] <= '9';
unsigned loglevel = have_loglevel
? fmt[1] - '0'
: c->prev_loglevel;
if (have_loglevel)
c->prev_loglevel = loglevel;
return loglevel <= loglevel_opt;
if (fmt[0] == KERN_SOH[0] &&
fmt[1] >= '0' && fmt[1] <= '9')
return fmt[1] - '0';
else
return -1;
}
void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str)
static unsigned loglevel_opt(struct bch_fs *c)
{
/* Nothing to print? Nothing to do: */
if (!str)
return c->loglevel ?: c->opts.verbose ? 7: 6;
}
void bch2_print_str_loglevel(struct bch_fs *c, int loglevel, const char *str)
{
if (loglevel < 0)
loglevel = c->prev_loglevel;
else
c->prev_loglevel = loglevel;
if (loglevel > loglevel_opt(c))
return;
if (!should_print_loglevel(c, prefix))
return;
#ifndef __KERNEL__
prefix = "";
#ifdef __KERNEL__
static const char *prefixes[] = {
KERN_SOH "0",
KERN_SOH "1",
KERN_SOH "2",
KERN_SOH "3",
KERN_SOH "4",
KERN_SOH "5",
KERN_SOH "6",
KERN_SOH "7",
};
const char *prefix = loglevel < ARRAY_SIZE(prefixes) ? prefixes[loglevel] : KERN_SOH;
#else
const char *prefix = "";
#endif
#ifdef __KERNEL__
@ -142,6 +152,15 @@ void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str)
bch2_print_string_as_lines(prefix, str);
}
void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str)
{
/* Nothing to print? Nothing to do: */
if (!str)
return;
bch2_print_str_loglevel(c, kern_soh_to_loglevel(prefix), str);
}
__printf(2, 0)
static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
{
@ -169,7 +188,13 @@ void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
void __bch2_print(struct bch_fs *c, const char *fmt, ...)
{
if (!should_print_loglevel(c, fmt))
int loglevel = kern_soh_to_loglevel(fmt);
if (loglevel < 0)
loglevel = c->prev_loglevel;
else
c->prev_loglevel = loglevel;
if (loglevel > loglevel_opt(c))
return;
#ifndef __KERNEL__
@ -426,9 +451,11 @@ static bool __bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *ou
bch2_fs_read_only_async(c);
wake_up(&bch2_read_only_wait);
if (ret)
if (ret) {
prt_printf(out, "emergency read only at seq %llu\n",
journal_cur_seq(&c->journal));
out->suppress = false;
}
return ret;
}
@ -1464,10 +1491,8 @@ struct bch_fs *bch2_fs_open(darray_const_str *devices,
prt_printf(&msg, "error starting filesystem: %s", bch2_err_str(ret));
bch2_print_string_as_lines(KERN_ERR, msg.buf);
} else if (msg.pos) {
CLASS(printbuf, msg_with_prefix)();
bch2_log_msg_start(c, &msg_with_prefix);
prt_str(&msg_with_prefix, msg.buf);
bch2_print_str(c, KERN_INFO, msg_with_prefix.buf);
CLASS(bch_log_msg_level, msg_with_prefix)(c, 6);
prt_str(&msg_with_prefix.m, msg.buf);
}
return c;
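A small illustration of the loglevel plumbing introduced above (these calls would have to live in super.c, since kern_soh_to_loglevel() is static there; the strings and the sketch_ name are arbitrary):

static void sketch_loglevels(struct bch_fs *c)
{
	int err  = kern_soh_to_loglevel(KERN_ERR  "read error");	/* 3 */
	int info = kern_soh_to_loglevel(KERN_INFO "mounted");		/* 6 */
	int none = kern_soh_to_loglevel("no prefix");			/* -1 */

	/* -1 makes bch2_print_str_loglevel() reuse c->prev_loglevel */
	bch2_print_str_loglevel(c, none, "continuation line\n");

	/* printed only if the level is <= c->loglevel, defaulting to 6 (7 with verbose) */
	bch2_print_str_loglevel(c, err, "read error\n");
	(void) info;
}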

View File

@ -353,6 +353,8 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
if (!recovery_pass_needs_set(c, pass, &flags))
return 0;
out->suppress = false;
bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
bool rewind = in_recovery &&
r->curr_pass > pass &&

View File

@ -123,8 +123,10 @@ int bch2_btree_lost_data(struct bch_fs *c,
break;
}
if (write_sb)
if (write_sb) {
bch2_write_super(c);
msg->suppress = false;
}
return ret;
}
@ -922,13 +924,9 @@ int bch2_fs_recovery(struct bch_fs *c)
bch2_flush_fsck_errs(c);
if (ret) {
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
prt_printf(&buf, "error in recovery: %s\n", bch2_err_str(ret));
bch2_fs_emergency_read_only2(c, &buf);
bch2_print_str(c, KERN_ERR, buf.buf);
CLASS(bch_log_msg, msg)(c);
prt_printf(&msg.m, "error in recovery: %s\n", bch2_err_str(ret));
bch2_fs_emergency_read_only2(c, &msg.m);
}
return ret;
}

View File

@ -1251,33 +1251,30 @@ err:
noinline_for_stack
static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j)
{
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
CLASS(bch_log_msg, msg)(c);
enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j);
bool have_good = false;
prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq));
bch2_journal_datetime_to_text(&buf, &j->j);
prt_newline(&buf);
prt_printf(&msg.m, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq));
bch2_journal_datetime_to_text(&msg.m, &j->j);
prt_newline(&msg.m);
darray_for_each(j->ptrs, ptr)
if (!ptr->csum_good) {
bch2_journal_ptr_to_text(&buf, c, ptr);
prt_char(&buf, ' ');
bch2_csum_to_text(&buf, csum_type, ptr->csum);
prt_newline(&buf);
bch2_journal_ptr_to_text(&msg.m, c, ptr);
prt_char(&msg.m, ' ');
bch2_csum_to_text(&msg.m, csum_type, ptr->csum);
prt_newline(&msg.m);
} else {
have_good = true;
}
prt_printf(&buf, "should be ");
bch2_csum_to_text(&buf, csum_type, j->j.csum);
prt_printf(&msg.m, "should be ");
bch2_csum_to_text(&msg.m, csum_type, j->j.csum);
if (have_good)
prt_printf(&buf, "\n(had good copy on another device)");
bch2_print_str(c, KERN_ERR, buf.buf);
prt_printf(&msg.m, "\n(had good copy on another device)");
}
struct u64_range bch2_journal_entry_missing_range(struct bch_fs *c, u64 start, u64 end)

View File

@ -231,32 +231,32 @@ static CLOSURE_CALLBACK(journal_write_done)
}
if (unlikely(w->failed.nr || err)) {
bool print = !bch2_ratelimit();
CLASS(bch_log_msg, msg)(c);
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
prt_printf(&buf, "error writing journal entry %llu\n", seq_wrote);
bch2_io_failures_to_text(&buf, c, &w->failed);
/* Separate ratelimit_states for hard and soft errors */
msg.m.suppress = !err
? bch2_ratelimit(c)
: bch2_ratelimit(c);
prt_printf(&msg.m, "error writing journal entry %llu\n", seq_wrote);
bch2_io_failures_to_text(&msg.m, c, &w->failed);
if (!w->devs_written.nr)
err = bch_err_throw(c, journal_write_err);
if (!err) {
prt_printf(&buf, "wrote degraded to ");
bch2_devs_list_to_text(&buf, c, &w->devs_written);
prt_newline(&buf);
prt_printf(&msg.m, "wrote degraded to ");
bch2_devs_list_to_text(&msg.m, c, &w->devs_written);
prt_newline(&msg.m);
} else {
if (err == -BCH_ERR_journal_write_err)
prt_printf(&buf, "unable to write journal to sufficient devices\n");
prt_printf(&msg.m, "unable to write journal to sufficient devices\n");
else
prt_printf(&buf, "journal write error marking replicas: %s\n",
prt_printf(&msg.m, "journal write error marking replicas: %s\n",
bch2_err_str(err));
print = bch2_fs_emergency_read_only2(c, &buf);
bch2_fs_emergency_read_only2(c, &msg.m);
}
if (print)
bch2_print_str(c, KERN_ERR, buf.buf);
}
closure_debug_destroy(cl);

View File

@ -98,7 +98,7 @@ int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version v
bch2_version_to_text(&buf, version);
prt_str(&buf, " currently not enabled, allowed up to ");
bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
prt_printf(&buf, "\n set version_upgrade=incompat to enable");
prt_printf(&buf, "\n set version_upgrade=incompatible to enable");
bch_notice(c, "%s", buf.buf);
}

View File

@ -704,8 +704,8 @@ static void bch2_maybe_schedule_btree_bitmap_gc_work(struct work_struct *work)
if (bch2_recovery_pass_want_ratelimit(c, BCH_RECOVERY_PASS_btree_bitmap_gc, 1000))
return;
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
CLASS(bch_log_msg_level, msg)(c, 5);
msg.m.suppress = true; /* run_explicit_recovery_pass will unsuppress */
bool want_schedule = false;
for_each_member_device(c, ca) {
@ -716,21 +716,19 @@ static void bch2_maybe_schedule_btree_bitmap_gc_work(struct work_struct *work)
u64 bitmap_sectors = hweight64(ca->mi.btree_allocated_bitmap) << ca->mi.btree_bitmap_shift;
if (btree_sectors * 4 < bitmap_sectors) {
prt_printf(&buf, "%s has ", ca->name);
prt_human_readable_u64(&buf, btree_sectors << 9);
prt_printf(&buf, " btree buckets and ");
prt_human_readable_u64(&buf, bitmap_sectors << 9);
prt_printf(&buf, " marked in bitmap\n");
prt_printf(&msg.m, "%s has ", ca->name);
prt_human_readable_u64(&msg.m, btree_sectors << 9);
prt_printf(&msg.m, " btree buckets and ");
prt_human_readable_u64(&msg.m, bitmap_sectors << 9);
prt_printf(&msg.m, " marked in bitmap\n");
want_schedule = true;
}
}
if (want_schedule) {
bch2_run_explicit_recovery_pass(c, &buf,
if (want_schedule)
bch2_run_explicit_recovery_pass(c, &msg.m,
BCH_RECOVERY_PASS_btree_bitmap_gc,
RUN_RECOVERY_PASS_ratelimit);
bch2_print_str(c, KERN_NOTICE, buf.buf);
}
queue_delayed_work(system_long_wq, &c->maybe_schedule_btree_bitmap_gc, HZ * 60 * 60 * 24);
}

View File

@ -21,17 +21,12 @@ static int bch2_subvolume_delete(struct btree_trans *, u32);
static int bch2_subvolume_missing(struct bch_fs *c, u32 subvolid)
{
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
CLASS(bch_log_msg, msg)(c);
prt_printf(&buf, "missing subvolume %u", subvolid);
bool print = bch2_count_fsck_err(c, subvol_missing, &buf);
prt_printf(&msg.m, "missing subvolume %u", subvolid);
msg.m.suppress = !bch2_count_fsck_err(c, subvol_missing, &msg.m);
int ret = bch2_run_explicit_recovery_pass(c, &buf,
BCH_RECOVERY_PASS_check_inodes, 0);
if (print)
bch2_print_str(c, KERN_ERR, buf.buf);
return ret;
return bch2_run_explicit_recovery_pass(c, &msg.m, BCH_RECOVERY_PASS_check_inodes, 0);
}
static struct bpos subvolume_children_pos(struct bkey_s_c k)

View File

@ -87,6 +87,7 @@ struct printbuf {
bool allocation_failure:1;
bool heap_allocated:1;
bool overflow:1;
bool suppress:1; /* Ratelimited or already printed */
enum printbuf_si si_units:1;
bool human_readable_units:1;
bool has_indent_or_tabstops:1;

View File

@ -598,6 +598,7 @@ static int __bch2_writepage(struct folio *folio,
do_io:
f_sectors = folio_sectors(folio);
s = bch2_folio(folio);
BUG_ON(!s);
if (f_sectors > w->tmp_sectors) {
kfree(w->tmp);
@ -829,7 +830,7 @@ int bch2_write_end(
struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_folio_reservation *res = fsdata;
unsigned offset = pos - folio_pos(folio);
size_t offset = pos - folio_pos(folio);
BUG_ON(offset + copied > folio_size(folio));
@ -886,8 +887,9 @@ static int __bch2_buffered_write(struct bch_fs *c,
struct bch2_folio_reservation res;
folios fs;
struct folio *f;
unsigned copied = 0, f_offset, f_copied;
u64 end = pos + len, f_pos, f_len;
unsigned copied = 0, f_copied;
size_t f_offset, f_len;
u64 end = pos + len, f_pos;
loff_t last_folio_pos = inode->v.i_size;
int ret = 0;

View File

@ -139,15 +139,12 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
struct quota_res *quota_res, s64 sectors)
{
if (unlikely((s64) inode->v.i_blocks + sectors < 0)) {
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
prt_printf(&buf, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
CLASS(bch_log_msg, msg)(c);
prt_printf(&msg.m, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
inode->ei_inode.bi_sectors);
bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf);
if (print)
bch2_print_str(c, KERN_ERR, buf.buf);
msg.m.suppress = !bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &msg.m);
if (sectors < 0)
sectors = -inode->v.i_blocks;
@ -532,16 +529,13 @@ int bchfs_truncate(struct mnt_idmap *idmap,
if (unlikely(!inode->v.i_size && inode->v.i_blocks &&
!bch2_journal_error(&c->journal))) {
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
prt_printf(&buf,
CLASS(bch_log_msg, msg)(c);
prt_printf(&msg.m,
"inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
inode->v.i_ino, (u64) inode->v.i_blocks,
inode->ei_inode.bi_sectors);
bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf);
if (print)
bch2_print_str(c, KERN_ERR, buf.buf);
msg.m.suppress = !bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &msg.m);
}
ret = bch2_setattr_nonsize(idmap, inode, iattr);

View File

@ -172,41 +172,35 @@ static int bch2_ioc_setlabel(struct bch_fs *c,
static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
{
u32 flags;
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (get_user(flags, arg))
return -EFAULT;
u32 flags;
try(get_user(flags, arg));
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
CLASS(bch_log_msg, msg)(c);
msg.m.suppress = true; /* cleared by ERO */
prt_printf(&buf, "shutdown by ioctl type %u", flags);
prt_printf(&msg.m, "shutdown by ioctl type %u", flags);
switch (flags) {
case FSOP_GOING_FLAGS_DEFAULT:
ret = bdev_freeze(c->vfs_sb->s_bdev);
if (ret)
break;
try(bdev_freeze(c->vfs_sb->s_bdev));
bch2_journal_flush(&c->journal);
bch2_fs_emergency_read_only2(c, &buf);
bch2_fs_emergency_read_only2(c, &msg.m);
bdev_thaw(c->vfs_sb->s_bdev);
break;
return 0;
case FSOP_GOING_FLAGS_LOGFLUSH:
bch2_journal_flush(&c->journal);
fallthrough;
case FSOP_GOING_FLAGS_NOLOGFLUSH:
bch2_fs_emergency_read_only2(c, &buf);
break;
bch2_fs_emergency_read_only2(c, &msg.m);
return 0;
default:
return -EINVAL;
}
bch2_print_str(c, KERN_ERR, buf.buf);
return ret;
}
static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,

View File

@ -361,14 +361,14 @@ int bch2_get_folio_disk_reservation(struct bch_fs *c,
struct bch_inode_info *inode,
struct folio *folio, bool check_enospc)
{
struct bch_folio *s = bch2_folio_create(folio, 0);
struct bch_folio *s = bch2_folio(folio);
unsigned nr_replicas = inode_nr_replicas(c, inode);
struct disk_reservation disk_res = { 0 };
unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
int ret;
if (!s)
return -ENOMEM;
BUG_ON(!s);
EBUG_ON(!s->uptodate);
for (i = 0; i < sectors; i++)
disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
@ -399,21 +399,19 @@ void bch2_folio_reservation_put(struct bch_fs *c,
bch2_quota_reservation_put(c, inode, &res->quota);
}
static int __bch2_folio_reservation_get(struct bch_fs *c,
static ssize_t __bch2_folio_reservation_get(struct bch_fs *c,
struct bch_inode_info *inode,
struct folio *folio,
struct bch2_folio_reservation *res,
size_t offset, size_t len,
bool partial)
{
struct bch_folio *s = bch2_folio_create(folio, 0);
struct bch_folio *s = bch2_folio(folio);
unsigned i, disk_sectors = 0, quota_sectors = 0;
size_t reserved = len;
int ret;
if (!s)
return -ENOMEM;
BUG_ON(!s);
BUG_ON(!s->uptodate);
for (i = round_down(offset, block_bytes(c)) >> 9;
@ -468,7 +466,7 @@ int bch2_folio_reservation_get(struct bch_fs *c,
struct bch2_folio_reservation *res,
size_t offset, size_t len)
{
return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, false);
return (int)__bch2_folio_reservation_get(c, inode, folio, res, offset, len, false);
}
ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c,
@ -512,7 +510,7 @@ void bch2_set_folio_dirty(struct bch_fs *c,
struct bch_inode_info *inode,
struct folio *folio,
struct bch2_folio_reservation *res,
unsigned offset, unsigned len)
size_t offset, size_t len)
{
struct bch_folio *s = bch2_folio(folio);
unsigned i, dirty_sectors = 0;
@ -520,7 +518,9 @@ void bch2_set_folio_dirty(struct bch_fs *c,
WARN_ON((u64) folio_pos(folio) + offset + len >
round_up((u64) i_size_read(&inode->v), block_bytes(c)));
BUG_ON(!s);
BUG_ON(!s->uptodate);
EBUG_ON(round_up(offset + len, block_bytes(c)) >> 9 > UINT_MAX);
scoped_guard(spinlock, &s->lock)
for (i = round_down(offset, block_bytes(c)) >> 9;
@ -598,7 +598,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
vm_fault_t ret;
loff_t file_offset = round_down(vmf->pgoff << PAGE_SHIFT, block_bytes(c));
unsigned offset = file_offset - folio_pos(folio);
size_t offset = file_offset - folio_pos(folio);
unsigned len = max(PAGE_SIZE, block_bytes(c));
BUG_ON(offset + len > folio_size(folio));

View File

@ -157,7 +157,7 @@ void bch2_set_folio_dirty(struct bch_fs *,
struct bch_inode_info *,
struct folio *,
struct bch2_folio_reservation *,
unsigned, unsigned);
size_t, size_t);
vm_fault_t bch2_page_fault(struct vm_fault *);
vm_fault_t bch2_page_mkwrite(struct vm_fault *);

View File

@ -3,7 +3,6 @@
#define _BCACHEFS_VFS_TYPES_H
struct bch_fs_vfs {
#ifndef NO_BCACHEFS_FS
struct list_head inodes_list;
struct mutex inodes_lock;
struct rhashtable inodes_table;
@ -14,7 +13,6 @@ struct bch_fs_vfs {
struct bio_set dio_read_bioset;
struct bio_set nocow_flush_bioset;
struct workqueue_struct *writeback_wq;
#endif
};
#endif /* _BCACHEFS_VFS_TYPES_H */

View File

@ -11,6 +11,7 @@
#include <linux/ratelimit.h>
#include <linux/jiffies.h>
#include <linux/export.h>
#include <linux/spinlock.h>
/*
* __ratelimit - rate limiting
@ -26,44 +27,79 @@
*/
int ___ratelimit(struct ratelimit_state *rs, const char *func)
{
int ret;
if (!rs->interval)
return 1;
/* Paired with WRITE_ONCE() in .proc_handler().
* Changing two values separately could be inconsistent
* and some message could be lost. (See: net_ratelimit_state).
*/
int interval = READ_ONCE(rs->interval);
int burst = READ_ONCE(rs->burst);
int ret = 0;
/*
* If we contend on this state's lock then almost
* by definition we are too busy to print a message,
* in addition to the one that will be printed by
* the entity that is holding the lock already:
* Zero interval says never limit, otherwise, non-positive burst
* says always limit.
*/
if (!raw_spin_trylock(&rs->lock))
return 0;
if (interval <= 0 || burst <= 0) {
WARN_ONCE(interval < 0 || burst < 0, "Negative interval (%d) or burst (%d): Uninitialized ratelimit_state structure?\n", interval, burst);
ret = interval == 0 || burst > 0;
if (!(READ_ONCE(rs->flags) & RATELIMIT_INITIALIZED) || (!interval && !burst) ||
!raw_spin_trylock(&rs->lock))
goto nolock_ret;
if (!rs->begin)
/* Force re-initialization once re-enabled. */
rs->flags &= ~RATELIMIT_INITIALIZED;
goto unlock_ret;
}
/*
* If we contend on this state's lock then just check if
* the current burst is used or not. It might cause
* false positive when we are past the interval and
* the current lock owner is just about to reset it.
*/
if (!raw_spin_trylock(&rs->lock)) {
if (READ_ONCE(rs->flags) & RATELIMIT_INITIALIZED &&
atomic_read(&rs->rs_n_left) > 0 && atomic_dec_return(&rs->rs_n_left) >= 0)
ret = 1;
goto nolock_ret;
}
if (!(rs->flags & RATELIMIT_INITIALIZED)) {
rs->begin = jiffies;
rs->flags |= RATELIMIT_INITIALIZED;
atomic_set(&rs->rs_n_left, rs->burst);
}
if (time_is_before_jiffies(rs->begin + interval)) {
int m;
/*
* Reset rs_n_left ASAP to reduce false positives
* in parallel calls, see above.
*/
atomic_set(&rs->rs_n_left, rs->burst);
rs->begin = jiffies;
if (time_is_before_jiffies(rs->begin + rs->interval)) {
if (rs->missed) {
if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) {
if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) {
m = ratelimit_state_reset_miss(rs);
if (m) {
printk(KERN_WARNING
"%s: %d callbacks suppressed\n",
func, rs->missed);
rs->missed = 0;
"%s: %d callbacks suppressed\n", func, m);
}
}
rs->begin = jiffies;
rs->printed = 0;
}
if (rs->burst && rs->burst > rs->printed) {
rs->printed++;
/* Note that the burst might be taken by a parallel call. */
if (atomic_read(&rs->rs_n_left) > 0 && atomic_dec_return(&rs->rs_n_left) >= 0)
ret = 1;
} else {
rs->missed++;
ret = 0;
}
unlock_ret:
raw_spin_unlock(&rs->lock);
nolock_ret:
if (!ret)
ratelimit_state_inc_miss(rs);
return ret;
}
EXPORT_SYMBOL(___ratelimit);
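For reference, a sketch of what the reworked ___ratelimit() gives a caller: up to 'burst' messages go through per interval, the rest are only counted via ratelimit_state_inc_miss(). The state name, interval and counts below are illustrative, and <linux/ratelimit.h> plus <linux/printk.h> are assumed to be included:

static DEFINE_RATELIMIT_STATE(burst_rs, HZ, 3);	/* at most 3 messages per second */

static void sketch_burst(void)
{
	for (int i = 0; i < 10; i++)
		if (__ratelimit(&burst_rs))
			pr_info("message %d\n", i);	/* only 0, 1 and 2 print */

	/*
	 * The other seven calls bump the atomic missed counter; once the
	 * interval has elapsed, the next __ratelimit() call resets the burst
	 * and (unless RATELIMIT_MSG_ON_RELEASE is set) prints how many
	 * callbacks were suppressed.
	 */
}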