Update bcachefs sources to e1f6739c4a bcachefs: Fix another iterator counting bug

Kent Overstreet 2020-05-14 21:46:09 -04:00
parent 024a01bf07
commit 34c9be19b3
30 changed files with 463 additions and 220 deletions

View File

@ -1 +1 @@
a27d7265e75f6d65c2b972ce4ac27abfc153c230
e1f6739c4a9fee1db7d94a5087a253041542cb62

View File

@ -143,10 +143,16 @@ update-bcachefs-sources:
git rm -rf --ignore-unmatch libbcachefs
test -d libbcachefs || mkdir libbcachefs
cp $(LINUX_DIR)/fs/bcachefs/*.[ch] libbcachefs/
git add libbcachefs/*.[ch]
cp $(LINUX_DIR)/include/trace/events/bcachefs.h include/trace/events/
git add include/trace/events/bcachefs.h
cp $(LINUX_DIR)/kernel/locking/six.c linux/
git add linux/six.c
cp $(LINUX_DIR)/include/linux/six.h include/linux/
git add include/linux/six.h
$(RM) libbcachefs/*.mod.c
git -C $(LINUX_DIR) rev-parse HEAD | tee .bcachefs_revision
git add libbcachefs/*.[ch] include/trace/events/bcachefs.h .bcachefs_revision
git add .bcachefs_revision
.PHONY: update-commit-bcachefs-sources
update-commit-bcachefs-sources: update-bcachefs-sources

View File

@ -5,7 +5,7 @@ struct lock_class_key {};
struct task_struct;
# define lock_acquire(l, s, t, r, c, n, i) do { } while (0)
# define lock_release(l, n, i) do { } while (0)
# define lock_release(l, i) do { } while (0)
# define lock_set_class(l, n, k, s, i) do { } while (0)
# define lock_set_subclass(l, s, i) do { } while (0)
# define lockdep_set_current_reclaim_state(g) do { } while (0)

View File

@ -499,6 +499,23 @@ TRACE_EVENT(copygc,
__entry->buckets_moved, __entry->buckets_not_moved)
);
TRACE_EVENT(transaction_restart_ip,
TP_PROTO(unsigned long caller, unsigned long ip),
TP_ARGS(caller, ip),
TP_STRUCT__entry(
__field(unsigned long, caller )
__field(unsigned long, ip )
),
TP_fast_assign(
__entry->caller = caller;
__entry->ip = ip;
),
TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip)
);
DECLARE_EVENT_CLASS(transaction_restart,
TP_PROTO(unsigned long ip),
TP_ARGS(ip),
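
The transaction_restart_ip tracepoint added above is emitted from the transaction-restart paths touched later in this commit. A minimal call site, mirroring the ones added in bch2_btree_split_leaf() further down (caller is the ip recorded when the transaction was started, ip the restart site):

	if (flags & BTREE_INSERT_NOUNLOCK) {
		trace_transaction_restart_ip(trans->ip, _THIS_IP_);
		return -EINTR;
	}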

View File

@ -315,7 +315,9 @@ retry:
bch2_trans_update(trans, iter, &a->k_i,
BTREE_TRIGGER_NORUN);
ret = bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|flags);
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
flags);
err:
if (ret == -EINTR)
goto retry;
@ -1033,7 +1035,16 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t
set_current_state(TASK_INTERRUPTIBLE);
spin_lock(&c->freelist_lock);
for (i = 0; i < RESERVE_NR; i++)
for (i = 0; i < RESERVE_NR; i++) {
/*
* Don't strand buckets on the copygc freelist until
* after recovery is finished:
*/
if (!test_bit(BCH_FS_STARTED, &c->flags) &&
i == RESERVE_MOVINGGC)
continue;
if (fifo_push(&ca->free[i], bucket)) {
fifo_pop(&ca->free_inc, bucket);
@ -1043,6 +1054,7 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t
spin_unlock(&c->freelist_lock);
goto out;
}
}
if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) {
ca->allocator_state = ALLOCATOR_BLOCKED_FULL;

View File

@ -179,7 +179,6 @@
#undef pr_fmt
#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
#include <linux/stddef.h>
#include <linux/bug.h>
#include <linux/bio.h>
#include <linux/closure.h>

View File

@ -72,7 +72,6 @@
* inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
*/
#include <linux/stddef.h>
#include <asm/types.h>
#include <asm/byteorder.h>
#include <linux/kernel.h>

View File

@ -283,49 +283,64 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
const struct bkey_ops *ops;
struct bkey uk;
struct bkey_s u;
int i;
if (big_endian != CPU_BIG_ENDIAN)
bch2_bkey_swab_key(f, k);
/*
* Do these operations in reverse order in the write path:
*/
if (version < bcachefs_metadata_version_bkey_renumber)
bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write);
for (i = 0; i < 4; i++)
switch (!write ? i : 3 - i) {
case 0:
if (big_endian != CPU_BIG_ENDIAN)
bch2_bkey_swab_key(f, k);
break;
case 1:
if (version < bcachefs_metadata_version_bkey_renumber)
bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write);
break;
case 2:
if (version < bcachefs_metadata_version_inode_btree_change &&
btree_id == BTREE_ID_INODES) {
if (!bkey_packed(k)) {
struct bkey_i *u = packed_to_bkey(k);
swap(u->k.p.inode, u->k.p.offset);
} else if (f->bits_per_field[BKEY_FIELD_INODE] &&
f->bits_per_field[BKEY_FIELD_OFFSET]) {
struct bkey_format tmp = *f, *in = f, *out = &tmp;
if (version < bcachefs_metadata_version_inode_btree_change &&
btree_id == BTREE_ID_INODES) {
if (!bkey_packed(k)) {
struct bkey_i *u = packed_to_bkey(k);
swap(u->k.p.inode, u->k.p.offset);
} else if (f->bits_per_field[BKEY_FIELD_INODE] &&
f->bits_per_field[BKEY_FIELD_OFFSET]) {
struct bkey_format tmp = *f, *in = f, *out = &tmp;
swap(tmp.bits_per_field[BKEY_FIELD_INODE],
tmp.bits_per_field[BKEY_FIELD_OFFSET]);
swap(tmp.field_offset[BKEY_FIELD_INODE],
tmp.field_offset[BKEY_FIELD_OFFSET]);
swap(tmp.bits_per_field[BKEY_FIELD_INODE],
tmp.bits_per_field[BKEY_FIELD_OFFSET]);
swap(tmp.field_offset[BKEY_FIELD_INODE],
tmp.field_offset[BKEY_FIELD_OFFSET]);
if (!write)
swap(in, out);
if (!write)
swap(in, out);
uk = __bch2_bkey_unpack_key(in, k);
swap(uk.p.inode, uk.p.offset);
BUG_ON(!bch2_bkey_pack_key(k, &uk, out));
uk = __bch2_bkey_unpack_key(in, k);
swap(uk.p.inode, uk.p.offset);
BUG_ON(!bch2_bkey_pack_key(k, &uk, out));
}
}
break;
case 3:
if (!bkey_packed(k)) {
u = bkey_i_to_s(packed_to_bkey(k));
} else {
uk = __bch2_bkey_unpack_key(f, k);
u.k = &uk;
u.v = bkeyp_val(f, k);
}
if (big_endian != CPU_BIG_ENDIAN)
bch2_bkey_swab_val(u);
ops = &bch2_bkey_ops[k->type];
if (ops->compat)
ops->compat(btree_id, version, big_endian, write, u);
break;
default:
BUG();
}
if (!bkey_packed(k)) {
u = bkey_i_to_s(packed_to_bkey(k));
} else {
uk = __bch2_bkey_unpack_key(f, k);
u.k = &uk;
u.v = bkeyp_val(f, k);
}
if (big_endian != CPU_BIG_ENDIAN)
bch2_bkey_swab_val(u);
ops = &bch2_bkey_ops[k->type];
if (ops->compat)
ops->compat(btree_id, version, big_endian, write, u);
}
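
The rewrite above folds the four compat transforms into a single loop so the write path can apply them in exactly the reverse order of the read path. A standalone sketch of that idiom (step bodies elided; the cases correspond to the swab, renumber, inode-btree-layout and per-type value-compat steps):

	#include <stdbool.h>

	/* Sketch only: illustrates the read-forward / write-reverse dispatch above. */
	static void apply_compat_steps(bool write)
	{
		int i;

		for (i = 0; i < 4; i++)
			switch (!write ? i : 3 - i) {
			case 0: /* byte-order swap of the packed key */          break;
			case 1: /* bkey renumbering for old metadata versions */ break;
			case 2: /* inode btree key-layout change */              break;
			case 3: /* per-key-type compat hook on the value */      break;
			}
	}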

View File

@ -94,7 +94,7 @@ static inline unsigned btree_blocks(struct bch_fs *c)
return c->opts.btree_node_size >> c->block_bits;
}
#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 3 / 4)
#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3)
#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3)
#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \

View File

@ -699,8 +699,10 @@ static int bch2_gc_start(struct bch_fs *c,
c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
sizeof(u64), GFP_KERNEL);
if (!c->usage_gc)
if (!c->usage_gc) {
bch_err(c, "error allocating c->usage_gc");
return -ENOMEM;
}
for_each_member_device(ca, c, i) {
BUG_ON(ca->buckets[1]);
@ -711,19 +713,23 @@ static int bch2_gc_start(struct bch_fs *c,
GFP_KERNEL|__GFP_ZERO);
if (!ca->buckets[1]) {
percpu_ref_put(&ca->ref);
bch_err(c, "error allocating ca->buckets[gc]");
return -ENOMEM;
}
ca->usage[1] = alloc_percpu(struct bch_dev_usage);
if (!ca->usage[1]) {
bch_err(c, "error allocating ca->usage[gc]");
percpu_ref_put(&ca->ref);
return -ENOMEM;
}
}
ret = bch2_ec_mem_alloc(c, true);
if (ret)
if (ret) {
bch_err(c, "error allocating ec gc mem");
return ret;
}
percpu_down_write(&c->mark_lock);
@ -933,7 +939,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
return;
}
as = bch2_btree_update_start(c, iter->btree_id,
as = bch2_btree_update_start(iter->trans, iter->btree_id,
btree_update_reserve_required(c, parent) + nr_old_nodes,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE,

View File

@ -736,6 +736,17 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
struct btree_node *bn =
container_of(i, struct btree_node, keys);
/* These indicate that we read the wrong btree node: */
if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
struct bch_btree_ptr_v2 *bp =
&bkey_i_to_btree_ptr_v2(&b->key)->v;
/* XXX endianness */
btree_err_on(bp->seq != bn->keys.seq,
BTREE_ERR_MUST_RETRY, c, b, NULL,
"incorrect sequence number (wrong btree node)");
}
btree_err_on(BTREE_NODE_ID(bn) != b->btree_id,
BTREE_ERR_MUST_RETRY, c, b, i,
"incorrect btree id");
@ -1626,6 +1637,11 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
* reflect that those writes were done and the data flushed from the
* journal:
*
* Also on journal error, the pending write may have updates that were
* never journalled (interior nodes, see btree_update_nodes_written()) -
* it's critical that we don't do the write in that case otherwise we
* will have updates visible that weren't in the journal:
*
* Make sure to update b->written so bch2_btree_init_next() doesn't
* break:
*/

View File

@ -165,8 +165,7 @@ static inline bool btree_node_lock_increment(struct btree_iter *iter,
struct btree_iter *linked;
trans_for_each_iter(iter->trans, linked)
if (linked != iter &&
linked->l[level].b == b &&
if (linked->l[level].b == b &&
btree_node_locked_type(linked, level) >= want) {
six_lock_increment(&b->lock, want);
return true;

View File

@ -586,12 +586,12 @@ static void __bch2_btree_update_free(struct btree_update *as)
bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_pin_flush(&c->journal, &as->journal);
BUG_ON((as->nr_new_nodes || as->nr_pending) &&
!bch2_journal_error(&c->journal));;
BUG_ON(as->nr_new_nodes || as->nr_pending);
if (as->reserve)
bch2_btree_reserve_put(c, as->reserve);
list_del(&as->unwritten_list);
list_del(&as->list);
closure_debug_destroy(&as->cl);
@ -609,37 +609,28 @@ static void bch2_btree_update_free(struct btree_update *as)
mutex_unlock(&c->btree_interior_update_lock);
}
static void btree_update_nodes_reachable(struct btree_update *as, u64 seq)
static inline bool six_trylock_intentwrite(struct six_lock *lock)
{
struct bch_fs *c = as->c;
if (!six_trylock_intent(lock))
return false;
while (as->nr_new_nodes) {
struct btree *b = as->new_nodes[--as->nr_new_nodes];
BUG_ON(b->will_make_reachable != (unsigned long) as);
b->will_make_reachable = 0;
/*
* b->will_make_reachable prevented it from being written, so
* write it now if it needs to be written:
*/
btree_node_lock_type(c, b, SIX_LOCK_read);
bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
six_unlock_read(&b->lock);
if (!six_trylock_write(lock)) {
six_unlock_intent(lock);
return false;
}
while (as->nr_pending)
bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending],
seq);
return true;
}
static void btree_update_nodes_written(struct closure *cl)
{
struct btree_update *as = container_of(cl, struct btree_update, cl);
struct btree *nodes_need_write[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES + 1];
unsigned nr_nodes_need_write;
struct journal_res res = { 0 };
struct bch_fs *c = as->c;
struct btree_root *r;
struct btree *b;
struct bset *i;
int ret;
/*
@ -650,6 +641,7 @@ static void btree_update_nodes_written(struct closure *cl)
mutex_lock(&c->btree_interior_update_lock);
as->nodes_written = true;
again:
nr_nodes_need_write = 0;
as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
struct btree_update, unwritten_list);
if (!as || !as->nodes_written) {
@ -658,31 +650,57 @@ again:
}
b = as->b;
if (b && !six_trylock_intent(&b->lock)) {
if (b && !six_trylock_intentwrite(&b->lock)) {
mutex_unlock(&c->btree_interior_update_lock);
btree_node_lock_type(c, b, SIX_LOCK_intent);
six_lock_write(&b->lock);
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
mutex_lock(&c->btree_interior_update_lock);
goto again;
}
list_del(&as->unwritten_list);
ret = bch2_journal_res_get(&c->journal, &res, as->journal_u64s,
JOURNAL_RES_GET_NONBLOCK|
JOURNAL_RES_GET_RESERVED);
if (ret) {
BUG_ON(!bch2_journal_error(&c->journal));
/* can't unblock btree writes */
goto free_update;
if (ret == -EAGAIN) {
unsigned u64s = as->journal_u64s;
if (b) {
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
}
mutex_unlock(&c->btree_interior_update_lock);
ret = bch2_journal_res_get(&c->journal, &res, u64s,
JOURNAL_RES_GET_CHECK|
JOURNAL_RES_GET_RESERVED);
if (!ret) {
mutex_lock(&c->btree_interior_update_lock);
goto again;
}
}
{
if (!ret) {
struct journal_buf *buf = &c->journal.buf[res.idx];
struct jset_entry *entry = vstruct_idx(buf->data, res.offset);
res.offset += as->journal_u64s;
res.u64s -= as->journal_u64s;
memcpy_u64s(entry, as->journal_entries, as->journal_u64s);
} else {
/*
* On journal error we have to run most of the normal path so
* that shutdown works - unblocking btree node writes in
* particular and writing them if needed - except for
* journalling the update:
*/
BUG_ON(!bch2_journal_error(&c->journal));
}
switch (as->mode) {
@ -690,26 +708,41 @@ again:
BUG();
case BTREE_INTERIOR_UPDATING_NODE:
/* @b is the node we did the final insert into: */
BUG_ON(!res.ref);
six_lock_write(&b->lock);
/*
* On failure to get a journal reservation, we still have to
* unblock the write and allow most of the write path to happen
* so that shutdown works, but the i->journal_seq mechanism
* won't work to prevent the btree write from being visible (we
* didn't get a journal sequence number) - instead
* __bch2_btree_node_write() doesn't do the actual write if
* we're in journal error state:
*/
list_del(&as->write_blocked_list);
i = btree_bset_last(b);
i->journal_seq = cpu_to_le64(
max(res.seq,
le64_to_cpu(i->journal_seq)));
if (!ret) {
struct bset *i = btree_bset_last(b);
i->journal_seq = cpu_to_le64(
max(res.seq,
le64_to_cpu(i->journal_seq)));
bch2_btree_add_journal_pin(c, b, res.seq);
}
nodes_need_write[nr_nodes_need_write++] = b;
bch2_btree_add_journal_pin(c, b, res.seq);
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
break;
case BTREE_INTERIOR_UPDATING_AS:
BUG_ON(b);
break;
case BTREE_INTERIOR_UPDATING_ROOT: {
struct btree_root *r = &c->btree_roots[as->btree_id];
case BTREE_INTERIOR_UPDATING_ROOT:
r = &c->btree_roots[as->btree_id];
BUG_ON(b);
@ -721,25 +754,24 @@ again:
mutex_unlock(&c->btree_root_lock);
break;
}
}
bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_res_put(&c->journal, &res);
bch2_journal_preres_put(&c->journal, &as->journal_preres);
free_update:
/* Do btree write after dropping journal res: */
if (b) {
/*
* b->write_blocked prevented it from being written, so
* write it now if it needs to be written:
*/
btree_node_write_if_need(c, b, SIX_LOCK_intent);
six_unlock_intent(&b->lock);
while (as->nr_new_nodes) {
b = as->new_nodes[--as->nr_new_nodes];
BUG_ON(b->will_make_reachable != (unsigned long) as);
b->will_make_reachable = 0;
nodes_need_write[nr_nodes_need_write++] = b;
}
if (!ret)
btree_update_nodes_reachable(as, res.seq);
while (as->nr_pending)
bch2_btree_node_free_ondisk(c,
&as->pending[--as->nr_pending], res.seq);
__bch2_btree_update_free(as);
/*
@ -747,6 +779,22 @@ free_update:
* nodes to be writeable:
*/
closure_wake_up(&c->btree_interior_update_wait);
/*
* Can't take btree node locks while holding btree_interior_update_lock:
* */
mutex_unlock(&c->btree_interior_update_lock);
/* Do btree writes after dropping journal res/locks: */
while (nr_nodes_need_write) {
b = nodes_need_write[--nr_nodes_need_write];
btree_node_lock_type(c, b, SIX_LOCK_read);
bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
six_unlock_read(&b->lock);
}
mutex_lock(&c->btree_interior_update_lock);
goto again;
}
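
The restructuring above follows one pattern throughout: btree node locks must not be taken while btree_interior_update_lock is held (see the comment added above the final mutex_unlock()), so nodes needing a write are collected into nodes_need_write[] under the mutex and only written after it is dropped. A generic, userspace-flavoured sketch of that idiom; every name below is hypothetical, not bcachefs API:

	#include <pthread.h>
	#include <stddef.h>

	#define MAX_PENDING 8

	struct node { int dirty; };

	static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

	/* Hypothetical stand-ins for the real list walk and node write: */
	static struct node *next_dirty_node(void) { return NULL; }
	static void write_node(struct node *n)    { (void) n; }

	static void flush_dirty_nodes(void)
	{
		struct node *pending[MAX_PENDING];
		unsigned nr = 0;

		/* gather work while holding the list lock; nothing here may block */
		pthread_mutex_lock(&list_lock);
		while (nr < MAX_PENDING && (pending[nr] = next_dirty_node()))
			nr++;
		pthread_mutex_unlock(&list_lock);

		/* heavier locks and IO happen only after the list lock is dropped */
		while (nr)
			write_node(pending[--nr]);
	}
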
@ -949,17 +997,41 @@ void bch2_btree_update_done(struct btree_update *as)
}
struct btree_update *
bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
unsigned nr_nodes, unsigned flags,
struct closure *cl)
{
struct bch_fs *c = trans->c;
struct journal_preres journal_preres = { 0 };
struct btree_reserve *reserve;
struct btree_update *as;
int ret;
ret = bch2_journal_preres_get(&c->journal, &journal_preres,
BTREE_UPDATE_JOURNAL_RES,
JOURNAL_RES_GET_NONBLOCK);
if (ret == -EAGAIN) {
if (flags & BTREE_INSERT_NOUNLOCK)
return ERR_PTR(-EINTR);
bch2_trans_unlock(trans);
ret = bch2_journal_preres_get(&c->journal, &journal_preres,
BTREE_UPDATE_JOURNAL_RES, 0);
if (ret)
return ERR_PTR(ret);
if (!bch2_trans_relock(trans)) {
bch2_journal_preres_put(&c->journal, &journal_preres);
return ERR_PTR(-EINTR);
}
}
reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
if (IS_ERR(reserve))
if (IS_ERR(reserve)) {
bch2_journal_preres_put(&c->journal, &journal_preres);
return ERR_CAST(reserve);
}
as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
memset(as, 0, sizeof(*as));
@ -969,18 +1041,11 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
as->btree_id = id;
as->reserve = reserve;
INIT_LIST_HEAD(&as->write_blocked_list);
INIT_LIST_HEAD(&as->unwritten_list);
as->journal_preres = journal_preres;
bch2_keylist_init(&as->parent_keys, as->inline_keys);
ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
ARRAY_SIZE(as->journal_entries), 0);
if (ret) {
bch2_btree_reserve_put(c, reserve);
closure_debug_destroy(&as->cl);
mempool_free(as, &c->btree_interior_update_pool);
return ERR_PTR(ret);
}
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->list, &c->btree_interior_update_list);
mutex_unlock(&c->btree_interior_update_lock);
@ -1531,8 +1596,10 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
/* Hack, because gc and splitting nodes doesn't mix yet: */
if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
!down_read_trylock(&c->gc_lock)) {
if (flags & BTREE_INSERT_NOUNLOCK)
if (flags & BTREE_INSERT_NOUNLOCK) {
trace_transaction_restart_ip(trans->ip, _THIS_IP_);
return -EINTR;
}
bch2_trans_unlock(trans);
down_read(&c->gc_lock);
@ -1551,7 +1618,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
goto out;
}
as = bch2_btree_update_start(c, iter->btree_id,
as = bch2_btree_update_start(trans, iter->btree_id,
btree_update_reserve_required(c, b), flags,
!(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
if (IS_ERR(as)) {
@ -1560,6 +1627,8 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
bch2_trans_unlock(trans);
ret = -EINTR;
trace_transaction_restart_ip(trans->ip, _THIS_IP_);
}
goto out;
}
@ -1663,8 +1732,9 @@ retry:
goto err_unlock;
}
as = bch2_btree_update_start(c, iter->btree_id,
as = bch2_btree_update_start(trans, iter->btree_id,
btree_update_reserve_required(c, parent) + 1,
flags|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE,
!(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
@ -1776,7 +1846,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
struct btree *n, *parent = btree_node_parent(iter, b);
struct btree_update *as;
as = bch2_btree_update_start(c, iter->btree_id,
as = bch2_btree_update_start(iter->trans, iter->btree_id,
(parent
? btree_update_reserve_required(c, parent)
: 0) + 1,
@ -2043,7 +2113,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
new_hash = bch2_btree_node_mem_alloc(c);
}
as = bch2_btree_update_start(c, iter->btree_id,
as = bch2_btree_update_start(iter->trans, iter->btree_id,
parent ? btree_update_reserve_required(c, parent) : 0,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|

View File

@ -32,6 +32,9 @@ struct pending_btree_node_free {
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
#define BTREE_UPDATE_JOURNAL_RES \
((BKEY_BTREE_PTR_U64s_MAX + 1) * (BTREE_MAX_DEPTH - 1) * 2)
/*
* Tracks an in progress split/rewrite of a btree node and the update to the
* parent node:
@ -105,8 +108,7 @@ struct btree_update {
unsigned nr_new_nodes;
unsigned journal_u64s;
u64 journal_entries[
(BKEY_BTREE_PTR_U64s_MAX + 1) * (BTREE_MAX_DEPTH - 1) * 2];
u64 journal_entries[BTREE_UPDATE_JOURNAL_RES];
/* Only here to reduce stack usage on recursive splits: */
struct keylist parent_keys;
@ -132,7 +134,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
void bch2_btree_update_done(struct btree_update *);
struct btree_update *
bch2_btree_update_start(struct bch_fs *, enum btree_id, unsigned,
bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned,
unsigned, struct closure *);
void bch2_btree_interior_update_will_free_node(struct btree_update *,

View File

@ -13,6 +13,7 @@
#include <crypto/chacha.h>
#include <crypto/hash.h>
#include <crypto/poly1305.h>
#include <crypto/skcipher.h>
#include <keys/user-type.h>
static u64 bch2_checksum_init(unsigned type)

View File

@ -39,6 +39,24 @@ static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
BUG();
}
static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
{
struct bio_vec bv;
struct bvec_iter iter;
void *expected_start = NULL;
__bio_for_each_bvec(bv, bio, iter, start) {
if (expected_start &&
expected_start != page_address(bv.bv_page) + bv.bv_offset)
return false;
expected_start = page_address(bv.bv_page) +
bv.bv_offset + bv.bv_len;
}
return true;
}
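
bio_phys_contig() answers whether the whole range maps to one contiguous region in the kernel direct map: each bvec has to start exactly where the previous one ended. A small worked illustration (page names hypothetical):

	/*
	 * A bio made of two bvecs
	 *   bv[0] = { page A, bv_offset 0, bv_len 4096 }
	 *   bv[1] = { page B, bv_offset 0, bv_len 2048 }
	 * is contiguous only if
	 *   page_address(B) == page_address(A) + 4096
	 * which is exactly the expected_start check above. __bio_map_or_bounce()
	 * below uses this to hand back the mapping directly instead of bouncing
	 * (lowmem only, hence the !IS_ENABLED(CONFIG_HIGHMEM) guard).
	 */
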
static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
struct bvec_iter start, int rw)
{
@ -48,27 +66,28 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
unsigned nr_pages = 0;
struct page *stack_pages[16];
struct page **pages = NULL;
bool first = true;
unsigned prev_end = PAGE_SIZE;
void *data;
BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
#ifndef CONFIG_HIGHMEM
__bio_for_each_bvec(bv, bio, iter, start) {
if (bv.bv_len == start.bi_size)
return (struct bbuf) {
.b = page_address(bv.bv_page) + bv.bv_offset,
.type = BB_NONE, .rw = rw
};
}
#endif
if (!IS_ENABLED(CONFIG_HIGHMEM) &&
bio_phys_contig(bio, start))
return (struct bbuf) {
.b = page_address(bio_iter_page(bio, start)) +
bio_iter_offset(bio, start),
.type = BB_NONE, .rw = rw
};
/* check if we can map the pages contiguously: */
__bio_for_each_segment(bv, bio, iter, start) {
if ((!first && bv.bv_offset) ||
prev_end != PAGE_SIZE)
if (iter.bi_size != start.bi_size &&
bv.bv_offset)
goto bounce;
if (bv.bv_len < iter.bi_size &&
bv.bv_offset + bv.bv_len < PAGE_SIZE)
goto bounce;
prev_end = bv.bv_offset + bv.bv_len;
nr_pages++;
}
@ -172,20 +191,21 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
}
case BCH_COMPRESSION_TYPE_zstd: {
ZSTD_DCtx *ctx;
size_t len;
size_t real_src_len = le32_to_cpup(src_data.b);
if (real_src_len > src_len - 4)
goto err;
workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound());
src_len = le32_to_cpup(src_data.b);
len = ZSTD_decompressDCtx(ctx,
ret = ZSTD_decompressDCtx(ctx,
dst_data, dst_len,
src_data.b + 4, src_len);
src_data.b + 4, real_src_len);
mempool_free(workspace, &c->decompress_workspace);
if (len != dst_len)
if (ret != dst_len)
goto err;
break;
}
@ -264,7 +284,8 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
if (ret)
goto err;
if (dst_data.type != BB_NONE)
if (dst_data.type != BB_NONE &&
dst_data.type != BB_VMAP)
memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9));
err:
bio_unmap_or_unbounce(c, dst_data);
@ -407,7 +428,8 @@ static unsigned __bio_compress(struct bch_fs *c,
memset(dst_data.b + *dst_len, 0, pad);
*dst_len += pad;
if (dst_data.type != BB_NONE)
if (dst_data.type != BB_NONE &&
dst_data.type != BB_VMAP)
memcpy_to_bio(dst, dst->bi_iter, dst_data.b);
BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
@ -512,7 +534,6 @@ void bch2_fs_compress_exit(struct bch_fs *c)
static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
{
size_t max_extent = c->sb.encoded_extent_max << 9;
size_t order = get_order(max_extent);
size_t decompress_workspace_size = 0;
bool decompress_workspace_needed;
ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0);
@ -547,14 +568,14 @@ have_compressed:
if (!mempool_initialized(&c->compression_bounce[READ])) {
ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
1, order);
1, max_extent);
if (ret)
goto out;
}
if (!mempool_initialized(&c->compression_bounce[WRITE])) {
ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
1, order);
1, max_extent);
if (ret)
goto out;
}

View File

@ -34,16 +34,10 @@ static int count_iters_for_insert(struct btree_trans *trans,
unsigned offset,
struct bpos *end,
unsigned *nr_iters,
unsigned max_iters,
bool overwrite)
unsigned max_iters)
{
int ret = 0;
int ret = 0, ret2 = 0;
/*
* The extent update path requires an _additional_ iterator for each
* extent we're inserting and overwriting:
*/
*nr_iters += 1;
if (*nr_iters >= max_iters) {
*end = bpos_min(*end, k.k->p);
ret = 1;
@ -70,16 +64,20 @@ static int count_iters_for_insert(struct btree_trans *trans,
for_each_btree_key(trans, iter,
BTREE_ID_REFLINK, POS(0, idx + offset),
BTREE_ITER_SLOTS, r_k, ret) {
BTREE_ITER_SLOTS, r_k, ret2) {
if (bkey_cmp(bkey_start_pos(r_k.k),
POS(0, idx + sectors)) >= 0)
break;
/* extent_update_to_keys(), for the reflink_v update */
*nr_iters += 1;
*nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);
if (*nr_iters >= max_iters) {
struct bpos pos = bkey_start_pos(k.k);
pos.offset += r_k.k->p.offset - idx;
pos.offset += min_t(u64, k.k->size,
r_k.k->p.offset - idx);
*end = bpos_min(*end, pos);
ret = 1;
@ -92,7 +90,7 @@ static int count_iters_for_insert(struct btree_trans *trans,
}
}
return ret;
return ret2 ?: ret;
}
#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3)
@ -121,8 +119,11 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
*end = bpos_min(insert->k.p, b->key.k.p);
/* extent_update_to_keys(): */
nr_iters += 1;
ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end,
&nr_iters, EXTENT_ITERS_MAX / 2, false);
&nr_iters, EXTENT_ITERS_MAX / 2);
if (ret < 0)
return ret;
@ -139,8 +140,20 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
offset = bkey_start_offset(&insert->k) -
bkey_start_offset(k.k);
/* extent_handle_overwrites(): */
switch (bch2_extent_overlap(&insert->k, k.k)) {
case BCH_EXTENT_OVERLAP_ALL:
case BCH_EXTENT_OVERLAP_FRONT:
nr_iters += 1;
break;
case BCH_EXTENT_OVERLAP_BACK:
case BCH_EXTENT_OVERLAP_MIDDLE:
nr_iters += 2;
break;
}
ret = count_iters_for_insert(trans, k, offset, end,
&nr_iters, EXTENT_ITERS_MAX, true);
&nr_iters, EXTENT_ITERS_MAX);
if (ret)
break;
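
This is the iterator-counting fix named in the commit message: the flat "+1 per key" is replaced by accounting keyed on bch2_extent_overlap(), sized to the updates the overwrite path will generate, and iteration errors are now propagated via ret2. A small worked example of the new arithmetic:

	/*
	 * Inserting one extent that covers the middle of the first existing
	 * key it meets and the front of the second:
	 *
	 *   the insert itself (extent_update_to_keys())        +1
	 *   1st overlapped key, BCH_EXTENT_OVERLAP_MIDDLE      +2
	 *   2nd overlapped key, BCH_EXTENT_OVERLAP_FRONT       +1
	 *   per indirect extent, if reflinked                  +1 + nr of alloc ptrs
	 *
	 * nr_iters is checked against EXTENT_ITERS_MAX; once the budget is hit,
	 * *end is clamped so the remainder is left for the next transaction.
	 */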

View File

@ -180,7 +180,7 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k)
return;
bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
!bch2_bkey_replicas_marked(c, k, false), c,
!bch2_bkey_replicas_marked_locked(c, k, false), c,
"btree key bad (replicas not marked in superblock):\n%s",
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));

View File

@ -1239,7 +1239,8 @@ do_io:
if (w->io &&
(w->io->op.res.nr_replicas != nr_replicas_this_write ||
bio_full(&w->io->op.wbio.bio, PAGE_SIZE) ||
w->io->op.wbio.bio.bi_iter.bi_size >= (256U << 20) ||
w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
(BIO_MAX_PAGES * PAGE_SIZE) ||
bio_end_sector(&w->io->op.wbio.bio) != sector))
bch2_writepage_do_io(w);
@ -1814,12 +1815,22 @@ static long bch2_dio_write_loop(struct dio_write *dio)
goto loop;
while (1) {
size_t extra = dio->iter.count -
min(BIO_MAX_PAGES * PAGE_SIZE, dio->iter.count);
if (kthread)
use_mm(dio->mm);
BUG_ON(current->faults_disabled_mapping);
current->faults_disabled_mapping = mapping;
/*
* Don't issue more than 2MB at once, the bcachefs io path in
* io.c can't bounce more than that:
*/
dio->iter.count -= extra;
ret = bio_iov_iter_get_pages(bio, &dio->iter);
dio->iter.count += extra;
current->faults_disabled_mapping = NULL;
if (kthread)

View File

@ -588,7 +588,9 @@ static void bch2_write_index(struct closure *cl)
__bch2_write_index(op);
if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
if (!(op->flags & BCH_WRITE_DONE)) {
continue_at(cl, __bch2_write, index_update_wq(op));
} else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
bch2_journal_flush_seq_async(&c->journal,
*op_journal_seq(op),
cl);
@ -1103,8 +1105,15 @@ again:
if (ret < 0)
goto err;
if (ret)
if (ret) {
skip_put = false;
} else {
/*
* for the skip_put optimization this has to be set
* before we submit the bio:
*/
op->flags |= BCH_WRITE_DONE;
}
bio->bi_end_io = bch2_write_endio;
bio->bi_private = &op->cl;
@ -1127,16 +1136,30 @@ again:
return;
err:
op->error = ret;
op->flags |= BCH_WRITE_DONE;
continue_at(cl, bch2_write_index, index_update_wq(op));
return;
flush_io:
/*
* If the write can't all be submitted at once, we generally want to
* block synchronously as that signals backpressure to the caller.
*
* However, if we're running out of a workqueue, we can't block here
* because we'll be blocking other work items from completing:
*/
if (current->flags & PF_WQ_WORKER) {
continue_at(cl, bch2_write_index, index_update_wq(op));
return;
}
closure_sync(cl);
if (!bch2_keylist_empty(&op->insert_keys)) {
__bch2_write_index(op);
if (op->error) {
op->flags |= BCH_WRITE_DONE;
continue_at_nobarrier(cl, bch2_write_done, NULL);
return;
}
@ -1182,6 +1205,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
bch2_keylist_push(&op->insert_keys);
op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
op->flags |= BCH_WRITE_DONE;
continue_at_nobarrier(cl, bch2_write_index, NULL);
return;
err:

View File

@ -36,6 +36,7 @@ enum bch_write_flags {
/* Internal: */
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10),
BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11),
BCH_WRITE_DONE = (1 << 12),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)

View File

@ -269,7 +269,7 @@ static inline void bch2_journal_res_put(struct journal *j,
if (!res->ref)
return;
lock_release(&j->res_map, 0, _THIS_IP_);
lock_release(&j->res_map, _THIS_IP_);
while (res->u64s)
bch2_journal_add_entry(j, res,
@ -344,7 +344,9 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re
return ret;
out:
if (!(flags & JOURNAL_RES_GET_CHECK)) {
lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_);
lock_acquire_shared(&j->res_map, 0,
(flags & JOURNAL_RES_GET_NONBLOCK) != 0,
NULL, _THIS_IP_);
EBUG_ON(!res->ref);
}
return 0;
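
Two lockdep details are tracked here: newer kernels dropped the nested argument from lock_release(), and a reservation taken with JOURNAL_RES_GET_NONBLOCK is now annotated as a trylock (the third argument), since it can fail rather than block. In sketch form:

	/* the journal reservation is annotated like a shared (read) lock */
	lock_acquire_shared(&j->res_map, 0,
			    (flags & JOURNAL_RES_GET_NONBLOCK) != 0, /* trylock? */
			    NULL, _THIS_IP_);
	/* ... use the reservation ... */
	lock_release(&j->res_map, _THIS_IP_);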

View File

@ -322,14 +322,12 @@ void bch2_journal_pin_drop(struct journal *j,
spin_unlock(&j->lock);
}
void __bch2_journal_pin_add(struct journal *j, u64 seq,
static void bch2_journal_pin_add_locked(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
spin_lock(&j->lock);
__journal_pin_drop(j, pin);
BUG_ON(!atomic_read(&pin_list->count));
@ -339,7 +337,14 @@ void __bch2_journal_pin_add(struct journal *j, u64 seq,
pin->flush = flush_fn;
list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
}
void __bch2_journal_pin_add(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
spin_lock(&j->lock);
bch2_journal_pin_add_locked(j, seq, pin, flush_fn);
spin_unlock(&j->lock);
/*
@ -354,9 +359,13 @@ void bch2_journal_pin_copy(struct journal *j,
struct journal_entry_pin *src,
journal_pin_flush_fn flush_fn)
{
spin_lock(&j->lock);
if (journal_pin_active(src) &&
(!journal_pin_active(dst) || src->seq < dst->seq))
__bch2_journal_pin_add(j, src->seq, dst, flush_fn);
bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn);
spin_unlock(&j->lock);
}
/**

View File

@ -70,19 +70,26 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
while (1) {
struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
struct bkey_s_c k;
struct bkey_i *insert;
struct bkey_i_extent *new =
bkey_i_to_extent(bch2_keylist_front(keys));
struct bkey_i_extent *new;
BKEY_PADDED(k) _new, _insert;
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
bool did_work = false;
int nr;
bch2_trans_reset(&trans, 0);
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
if (ret) {
if (ret == -EINTR)
continue;
break;
}
new = bkey_i_to_extent(bch2_keylist_front(keys));
if (bversion_cmp(k.k->version, new->k.version) ||
!bch2_bkey_matches_ptr(c, k, m->ptr, m->offset))

View File

@ -710,13 +710,43 @@ found:
return ret;
}
static int bch2_set_quota_trans(struct btree_trans *trans,
struct bkey_i_quota *new_quota,
struct qc_dqblk *qdq)
{
struct btree_iter *iter;
struct bkey_s_c k;
int ret;
iter = bch2_trans_get_iter(trans, BTREE_ID_QUOTAS, new_quota->k.p,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (unlikely(ret))
return ret;
if (k.k->type == KEY_TYPE_quota)
new_quota->v = *bkey_s_c_to_quota(k).v;
if (qdq->d_fieldmask & QC_SPC_SOFT)
new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9);
if (qdq->d_fieldmask & QC_SPC_HARD)
new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9);
if (qdq->d_fieldmask & QC_INO_SOFT)
new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
if (qdq->d_fieldmask & QC_INO_HARD)
new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
return bch2_trans_update(trans, iter, &new_quota->k_i, 0);
}
static int bch2_set_quota(struct super_block *sb, struct kqid qid,
struct qc_dqblk *qdq)
{
struct bch_fs *c = sb->s_fs_info;
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
struct bkey_i_quota new_quota;
int ret;
@ -728,41 +758,12 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
bch2_trans_init(&trans, c, 0, 0);
iter = bch2_trans_get_iter(&trans, BTREE_ID_QUOTAS, new_quota.k.p,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (unlikely(ret))
return ret;
switch (k.k->type) {
case KEY_TYPE_quota:
new_quota.v = *bkey_s_c_to_quota(k).v;
break;
}
if (qdq->d_fieldmask & QC_SPC_SOFT)
new_quota.v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9);
if (qdq->d_fieldmask & QC_SPC_HARD)
new_quota.v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9);
if (qdq->d_fieldmask & QC_INO_SOFT)
new_quota.v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
if (qdq->d_fieldmask & QC_INO_HARD)
new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
bch2_trans_update(&trans, iter, &new_quota.k_i, 0);
ret = bch2_trans_commit(&trans, NULL, NULL, 0);
ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK,
bch2_set_quota_trans(&trans, &new_quota, qdq)) ?:
__bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i));
bch2_trans_exit(&trans);
if (ret)
return ret;
ret = __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i));
return ret;
}
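
Moving the read-modify-write into bch2_set_quota_trans() lets it be driven by bch2_trans_do(), which, as used here, runs the body together with bch2_trans_commit() and retries on -EINTR. The body may therefore execute more than once, which is why the current quota is re-read inside the helper on every attempt. Rough shape of that contract, with do_update() and arg standing in for any such helper:

	/*
	 * Sketch, not bcachefs API: do_update() must rebuild its updates from
	 * scratch each time, because a transaction restart re-executes it.
	 */
	ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK,
			    do_update(&trans, arg));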

View File

@ -204,17 +204,21 @@ static int bch2_rebalance_thread(void *arg)
prev_run_time;
if (w.dev_most_full_percent < 20 && throttle > 0) {
r->state = REBALANCE_THROTTLED;
r->throttled_until_iotime = io_start +
div_u64(w.dev_most_full_capacity *
(20 - w.dev_most_full_percent),
50);
r->throttled_until_cputime = start + throttle;
bch2_kthread_io_clock_wait(clock,
r->throttled_until_iotime,
throttle);
continue;
if (atomic_long_read(&clock->now) + clock->max_slop <
r->throttled_until_iotime) {
r->throttled_until_cputime = start + throttle;
r->state = REBALANCE_THROTTLED;
bch2_kthread_io_clock_wait(clock,
r->throttled_until_iotime,
throttle);
continue;
}
}
/* minimum 1 mb/sec: */

View File

@ -299,8 +299,10 @@ static int replicas_table_update(struct bch_fs *c,
GFP_NOIO)) ||
!(new_scratch = kmalloc(bytes, GFP_NOIO)) ||
(c->usage_gc &&
!(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO))))
!(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) {
bch_err(c, "error updating replicas table: memory allocation failure");
goto err;
}
if (c->usage_base)
__replicas_table_update(new_base, new_r,
@ -362,7 +364,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
struct bch_replicas_entry *new_entry)
{
struct bch_replicas_cpu new_r, new_gc;
int ret = -ENOMEM;
int ret = 0;
verify_replicas_entry(new_entry);
@ -409,14 +411,16 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
swap(new_gc, c->replicas_gc);
percpu_up_write(&c->mark_lock);
out:
ret = 0;
err:
mutex_unlock(&c->sb_lock);
kfree(new_r.entries);
kfree(new_gc.entries);
return ret;
err:
bch_err(c, "error adding replicas entry: memory allocation failure");
ret = -ENOMEM;
goto out;
}
int bch2_mark_replicas(struct bch_fs *c,
@ -561,6 +565,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
GFP_NOIO);
if (!c->replicas_gc.entries) {
mutex_unlock(&c->sb_lock);
bch_err(c, "error allocating c->replicas_gc");
return -ENOMEM;
}
@ -586,8 +591,10 @@ retry:
nr = READ_ONCE(c->replicas.nr);
new.entry_size = READ_ONCE(c->replicas.entry_size);
new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL);
if (!new.entries)
if (!new.entries) {
bch_err(c, "error allocating c->replicas_gc");
return -ENOMEM;
}
mutex_lock(&c->sb_lock);
percpu_down_write(&c->mark_lock);

View File

@ -44,7 +44,6 @@
* https://131002.net/siphash/
*/
#include <linux/stddef.h>
#include <asm/byteorder.h>
#include <asm/unaligned.h>
#include <linux/bitops.h>

View File

@ -889,6 +889,8 @@ int bch2_fs_start(struct bch_fs *c)
if (bch2_fs_init_fault("fs_start"))
goto err;
set_bit(BCH_FS_STARTED, &c->flags);
if (c->opts.read_only || c->opts.nochanges) {
bch2_fs_read_only(c);
} else {
@ -900,7 +902,6 @@ int bch2_fs_start(struct bch_fs *c)
goto err;
}
set_bit(BCH_FS_STARTED, &c->flags);
print_mount_opts(c);
ret = 0;
out:

View File

@ -15,7 +15,7 @@
#endif
#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
#define six_release(l) lock_release(l, 0, _RET_IP_)
#define six_release(l) lock_release(l, _RET_IP_)
struct six_lock_vals {
/* Value we add to the lock in order to take the lock: */