Update bcachefs sources to 43a464c9dd bcachefs: Don't BUG_ON() on bucket sector count overflow

Kent Overstreet 2019-03-07 16:39:43 -05:00
parent 70bb5ab7a8
commit fd67296247
18 changed files with 568 additions and 169 deletions

View File

@ -1 +1 @@
a5e71b82006fdf563190c41955c2b462854af610
43a464c9dd38b50c1a89845366f838fe70fbb743

View File

@ -959,6 +959,7 @@ retry:
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE|
BTREE_INSERT_JOURNAL_RESERVED|
flags,
BTREE_INSERT_ENTRY(iter, &a->k_i));
if (ret == -EINTR)

View File

@ -1610,7 +1610,7 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
prefetch(c->btree_roots[btree_id].b);
}
void bch2_btree_iter_unlink(struct btree_iter *iter)
static void bch2_btree_iter_unlink(struct btree_iter *iter)
{
struct btree_iter *linked;
@ -1629,7 +1629,7 @@ void bch2_btree_iter_unlink(struct btree_iter *iter)
BUG();
}
void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
static void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
{
BUG_ON(btree_iter_linked(new));

View File

@ -105,6 +105,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
unsigned, unsigned);
int bch2_btree_iter_unlock(struct btree_iter *);
bool bch2_btree_iter_relock(struct btree_iter *);
bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned);
bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned);
@ -164,8 +165,6 @@ static inline void bch2_btree_iter_init(struct btree_iter *iter,
? BTREE_ITER_IS_EXTENTS : 0)|flags);
}
void bch2_btree_iter_link(struct btree_iter *, struct btree_iter *);
void bch2_btree_iter_unlink(struct btree_iter *);
void bch2_btree_iter_copy(struct btree_iter *, struct btree_iter *);
static inline struct bpos btree_type_successor(enum btree_id id,

View File

@ -203,8 +203,6 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter,
__bch2_btree_node_relock(iter, level);
}
bool bch2_btree_iter_relock(struct btree_iter *);
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);

View File

@ -245,10 +245,11 @@ struct btree_iter {
#define BTREE_ITER_MAX 8
struct deferred_update {
struct journal_preres res;
struct journal_entry_pin journal;
spinlock_t lock;
unsigned gen;
unsigned dirty:1;
u8 allocated_u64s;
enum btree_id btree_id;

View File

@ -26,6 +26,7 @@ struct btree_insert {
struct bch_fs *c;
struct disk_reservation *disk_res;
struct journal_res journal_res;
struct journal_preres journal_preres;
u64 *journal_seq;
unsigned flags;
bool did_work;
@ -81,6 +82,7 @@ enum {
__BTREE_INSERT_USE_RESERVE,
__BTREE_INSERT_USE_ALLOC_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
__BTREE_INSERT_JOURNAL_RESERVED,
__BTREE_INSERT_NOMARK,
__BTREE_INSERT_NOWAIT,
__BTREE_INSERT_GC_LOCK_HELD,
@ -111,6 +113,8 @@ enum {
/* Insert is for journal replay - don't get journal reservations: */
#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY)
#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED)
/* Don't call bch2_mark_key: */
#define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK)

View File

@ -17,6 +17,9 @@
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
static bool btree_trans_relock(struct btree_insert *);
static void btree_trans_unlock(struct btree_insert *);
/* Inserting into a given leaf node (last stage of insert): */
/* Handle overwrites and do insert, for non extents: */
@ -239,15 +242,15 @@ btree_insert_key_leaf(struct btree_insert *trans,
/* Deferred btree updates: */
static void deferred_update_flush(struct journal *j,
struct journal_entry_pin *pin,
u64 seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct deferred_update *d =
container_of(pin, struct deferred_update, journal);
struct journal_preres res = { 0 };
u64 tmp[32];
struct bkey_i *k = (void *) tmp;
unsigned gen;
int ret;
if (d->allocated_u64s > ARRAY_SIZE(tmp)) {
@ -257,26 +260,32 @@ static void deferred_update_flush(struct journal *j,
}
spin_lock(&d->lock);
gen = d->gen;
if (d->dirty) {
BUG_ON(jset_u64s(d->k.k.u64s) > d->res.u64s);
swap(res, d->res);
if (journal_pin_active(&d->journal)) {
BUG_ON(d->k.k.u64s > d->allocated_u64s);
bkey_copy(k, &d->k);
d->dirty = false;
spin_unlock(&d->lock);
ret = bch2_btree_insert(c, d->btree_id, k, NULL, NULL,
BTREE_INSERT_NOFAIL);
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_JOURNAL_RESERVED);
bch2_fs_fatal_err_on(ret && !bch2_journal_error(j),
c, "error flushing deferred btree update: %i", ret);
c, "error flushing deferred btree update: %i", ret);
spin_lock(&d->lock);
}
if (gen == d->gen)
if (!d->dirty)
bch2_journal_pin_drop(j, &d->journal);
spin_unlock(&d->lock);
bch2_journal_preres_put(j, &res);
if (k != (void *) tmp)
kfree(k);
}
@ -288,6 +297,7 @@ btree_insert_key_deferred(struct btree_insert *trans,
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct deferred_update *d = insert->d;
int difference;
BUG_ON(trans->flags & BTREE_INSERT_JOURNAL_REPLAY);
BUG_ON(insert->k->u64s > d->allocated_u64s);
@ -295,12 +305,21 @@ btree_insert_key_deferred(struct btree_insert *trans,
__btree_journal_key(trans, d->btree_id, insert->k);
spin_lock(&d->lock);
d->gen++;
BUG_ON(jset_u64s(insert->k->u64s) >
trans->journal_preres.u64s);
difference = jset_u64s(insert->k->u64s) - d->res.u64s;
if (difference > 0) {
trans->journal_preres.u64s -= difference;
d->res.u64s += difference;
}
bkey_copy(&d->k, insert->k);
spin_unlock(&d->lock);
d->dirty = true;
bch2_journal_pin_update(j, trans->journal_res.seq, &d->journal,
deferred_update_flush);
spin_unlock(&d->lock);
return BTREE_INSERT_OK;
}
@ -519,13 +538,16 @@ retry:
}
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
unsigned flags = (trans->flags & BTREE_INSERT_JOURNAL_RESERVED)
? JOURNAL_RES_GET_RESERVED : 0;
u64s = 0;
trans_for_each_entry(trans, i)
u64s += jset_u64s(i->k->k.u64s);
ret = bch2_journal_res_get(&c->journal,
&trans->journal_res, u64s,
JOURNAL_RES_GET_NONBLOCK);
flags|JOURNAL_RES_GET_NONBLOCK);
if (likely(!ret))
goto got_journal_res;
if (ret != -EAGAIN)
@ -536,7 +558,7 @@ retry:
ret = bch2_journal_res_get(&c->journal,
&trans->journal_res, u64s,
JOURNAL_RES_GET_CHECK);
flags|JOURNAL_RES_GET_CHECK);
if (ret)
return ret;
@ -586,6 +608,10 @@ got_journal_res:
}
}
out:
BUG_ON(ret &&
(trans->flags & BTREE_INSERT_JOURNAL_RESERVED) &&
trans->journal_res.ref);
multi_unlock_write(trans);
bch2_journal_res_put(&c->journal, &trans->journal_res);
@ -627,7 +653,7 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
struct btree_iter *linked;
unsigned flags;
unsigned flags, u64s = 0;
int ret;
BUG_ON(!trans->nr);
@ -638,11 +664,39 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
lockdep_assert_held(&c->gc_lock);
memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
trans_for_each_entry(trans, i)
btree_insert_entry_checks(c, i);
trans_for_each_entry(trans, i)
if (i->deferred)
u64s += jset_u64s(i->k->k.u64s);
if (u64s) {
ret = bch2_journal_preres_get(&c->journal,
&trans->journal_preres, u64s,
JOURNAL_RES_GET_NONBLOCK);
if (!ret)
goto got_journal_preres;
if (ret != -EAGAIN)
return ret;
btree_trans_unlock(trans);
ret = bch2_journal_preres_get(&c->journal,
&trans->journal_preres, u64s, 0);
if (ret)
return ret;
if (!btree_trans_relock(trans)) {
trans_restart(" (iter relock after journal preres get blocked)");
bch2_journal_preres_put(&c->journal, &trans->journal_preres);
return -EINTR;
}
}
got_journal_preres:
if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
!percpu_ref_tryget(&c->writes)))
return -EROFS;
@ -674,6 +728,8 @@ retry:
trans_for_each_iter(trans, i)
bch2_btree_iter_downgrade(i->iter);
out:
bch2_journal_preres_put(&c->journal, &trans->journal_preres);
if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
percpu_ref_put(&c->writes);

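Aside (editor's sketch, not part of the commit): the journal pre-reservation taken in __bch2_btree_insert_at() above follows a common two-step shape -- try a nonblocking get first, and only if that returns -EAGAIN drop the btree node locks, take the reservation blocking, then relock, returning -EINTR to restart the transaction when the relock fails. A toy model of that control flow; every name below (reserve_nonblock, reserve_blocking, trans_unlock, trans_relock, the integer budget) is a stand-in, not a bcachefs function:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int budget = 10;			/* journal u64s still available */

static int reserve_nonblock(int want)
{
	if (want > budget)
		return -EAGAIN;
	budget -= want;
	return 0;
}

static int reserve_blocking(int want)
{
	/* pretend journal reclaim freed space while we slept */
	while (budget < want)
		budget += 2;
	budget -= want;
	return 0;
}

static void unreserve(int got)	{ budget += got; }
static void trans_unlock(void)	{ /* drop btree node locks */ }
static bool trans_relock(void)	{ return true; /* may fail for real */ }

static int get_preres(int want)
{
	int ret = reserve_nonblock(want);

	if (ret != -EAGAIN)
		return ret;		/* fast path succeeded, or hard error */

	trans_unlock();			/* must not block while holding node locks */
	ret = reserve_blocking(want);
	if (ret)
		return ret;

	if (!trans_relock()) {
		unreserve(want);	/* give it back and restart the transaction */
		return -EINTR;
	}
	return 0;
}

int main(void)
{
	int ret = get_preres(4);
	printf("first get:  %d, budget now %d\n", ret, budget);
	ret = get_preres(8);
	printf("second get: %d, budget now %d\n", ret, budget);
	return 0;
}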
View File

@ -536,11 +536,14 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
}
#define checked_add(a, b) \
do { \
({ \
unsigned _res = (unsigned) (a) + (b); \
bool overflow = _res > U16_MAX; \
if (overflow) \
_res = U16_MAX; \
(a) = _res; \
BUG_ON((a) != _res); \
} while (0)
overflow; \
})
static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, enum bch_data_type type,
@ -548,17 +551,25 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
{
struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark new;
struct bucket_mark old, new;
bool overflow;
BUG_ON(type != BCH_DATA_SB &&
type != BCH_DATA_JOURNAL);
bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
old = bucket_cmpxchg(g, new, ({
new.dirty = true;
new.data_type = type;
checked_add(new.dirty_sectors, sectors);
overflow = checked_add(new.dirty_sectors, sectors);
}));
bch2_fs_inconsistent_on(overflow, c,
"bucket sector count overflow: %u + %u > U16_MAX",
old.dirty_sectors, sectors);
if (c)
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
return 0;
}
@ -574,19 +585,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags,
ca, b, type, sectors);
} else {
struct bucket *g;
struct bucket_mark new;
rcu_read_lock();
g = bucket(ca, b);
bucket_cmpxchg(g, new, ({
new.dirty = true;
new.data_type = type;
checked_add(new.dirty_sectors, sectors);
}));
rcu_read_unlock();
__bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0);
}
}
@ -627,6 +626,7 @@ static bool bch2_mark_pointer(struct bch_fs *c,
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
size_t b = PTR_BUCKET_NR(ca, &p.ptr);
struct bucket *g = __bucket(ca, b, gc);
bool overflow;
u64 v;
v = atomic64_read(&g->_mark.v);
@ -648,9 +648,9 @@ static bool bch2_mark_pointer(struct bch_fs *c,
}
if (!p.ptr.cached)
checked_add(new.dirty_sectors, sectors);
overflow = checked_add(new.dirty_sectors, sectors);
else
checked_add(new.cached_sectors, sectors);
overflow = checked_add(new.cached_sectors, sectors);
if (!new.dirty_sectors &&
!new.cached_sectors) {
@ -672,6 +672,12 @@ static bool bch2_mark_pointer(struct bch_fs *c,
old.v.counter,
new.v.counter)) != old.v.counter);
bch2_fs_inconsistent_on(overflow, c,
"bucket sector count overflow: %u + %lli > U16_MAX",
!p.ptr.cached
? old.dirty_sectors
: old.cached_sectors, sectors);
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
BUG_ON(!gc && bucket_became_unavailable(old, new));

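Aside (editor's sketch, not part of the commit): the checked_add() change above is the point of this update -- the macro becomes a statement expression that saturates at U16_MAX and reports the overflow instead of BUG_ON()ing, so callers can raise a filesystem-inconsistency error rather than crash the machine. A minimal standalone approximation of the new behaviour as a plain function (checked_add_u16 is a hypothetical name, not the kernel macro):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define U16_MAX 0xffffU

static bool checked_add_u16(uint16_t *dst, unsigned add)
{
	unsigned res = (unsigned) *dst + add;
	bool overflow = res > U16_MAX;

	if (overflow)
		res = U16_MAX;		/* saturate instead of wrapping */
	*dst = (uint16_t) res;
	return overflow;
}

int main(void)
{
	uint16_t dirty_sectors = 65000;
	bool overflow = checked_add_u16(&dirty_sectors, 1000);

	/* prints "dirty_sectors=65535 overflow=1" */
	printf("dirty_sectors=%u overflow=%d\n",
	       (unsigned) dirty_sectors, overflow);
	return 0;
}

The two callers shown in the hunk, __bch2_mark_metadata_bucket() and bch2_mark_pointer(), feed the returned flag into bch2_fs_inconsistent_on().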
View File

@ -229,20 +229,19 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
/* normal i_size/i_sectors update machinery: */
static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter,
bool *allocating)
static int sum_sector_overwrites(struct btree_trans *trans,
struct btree_iter *extent_iter,
struct bkey_i *new, bool *allocating,
s64 *i_sectors_delta)
{
struct btree_iter iter;
struct btree_iter *iter = bch2_trans_copy_iter(trans, extent_iter);
struct bkey_s_c old;
s64 delta = 0;
bch2_btree_iter_init(&iter, _iter->c, BTREE_ID_EXTENTS, POS_MIN,
BTREE_ITER_SLOTS);
if (IS_ERR(iter))
return PTR_ERR(iter);
bch2_btree_iter_link(_iter, &iter);
bch2_btree_iter_copy(&iter, _iter);
old = bch2_btree_iter_peek_slot(&iter);
old = bch2_btree_iter_peek_slot(iter);
while (1) {
/*
@ -268,12 +267,13 @@ static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter,
if (bkey_cmp(old.k->p, new->k.p) >= 0)
break;
old = bch2_btree_iter_next_slot(&iter);
old = bch2_btree_iter_next_slot(iter);
}
bch2_btree_iter_unlink(&iter);
bch2_trans_iter_free(trans, iter);
return delta;
*i_sectors_delta = delta;
return 0;
}
static int bch2_extent_update(struct btree_trans *trans,
@ -287,11 +287,11 @@ static int bch2_extent_update(struct btree_trans *trans,
bool direct,
s64 *total_delta)
{
struct btree_iter *inode_iter = NULL;
struct bch_inode_unpacked inode_u;
struct bkey_inode_buf inode_p;
bool allocating = false;
bool extended = false;
bool inode_locked = false;
s64 i_sectors_delta;
int ret;
@ -303,7 +303,12 @@ static int bch2_extent_update(struct btree_trans *trans,
bch2_extent_trim_atomic(k, extent_iter);
i_sectors_delta = sum_sector_overwrites(k, extent_iter, &allocating);
ret = sum_sector_overwrites(trans, extent_iter,
k, &allocating,
&i_sectors_delta);
if (ret)
return ret;
if (!may_allocate && allocating)
return -ENOSPC;
@ -314,16 +319,20 @@ static int bch2_extent_update(struct btree_trans *trans,
/* XXX: inode->i_size locking */
if (i_sectors_delta ||
new_i_size > inode->ei_inode.bi_size) {
inode_iter = bch2_trans_get_iter(trans,
BTREE_ID_INODES,
POS(k->k.p.inode, 0),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
if (IS_ERR(inode_iter))
return PTR_ERR(inode_iter);
bch2_btree_iter_unlock(extent_iter);
mutex_lock(&inode->ei_update_lock);
ret = bch2_btree_iter_traverse(inode_iter);
if (ret)
goto err;
if (!bch2_btree_iter_relock(extent_iter)) {
mutex_unlock(&inode->ei_update_lock);
return -EINTR;
}
inode_locked = true;
if (!inode->ei_inode_update)
inode->ei_inode_update =
bch2_deferred_update_alloc(trans->c,
BTREE_ID_INODES, 64);
inode_u = inode->ei_inode;
inode_u.bi_sectors += i_sectors_delta;
@ -337,7 +346,8 @@ static int bch2_extent_update(struct btree_trans *trans,
bch2_inode_pack(&inode_p, &inode_u);
bch2_trans_update(trans,
BTREE_INSERT_ENTRY(inode_iter, &inode_p.inode.k_i));
BTREE_INSERT_DEFERRED(inode->ei_inode_update,
&inode_p.inode.k_i));
}
ret = bch2_trans_commit(trans, disk_res,
@ -371,13 +381,15 @@ static int bch2_extent_update(struct btree_trans *trans,
if (total_delta)
*total_delta += i_sectors_delta;
err:
if (!IS_ERR_OR_NULL(inode_iter))
bch2_trans_iter_put(trans, inode_iter);
if (inode_locked)
mutex_unlock(&inode->ei_update_lock);
return ret;
}
static int bchfs_write_index_update(struct bch_write_op *wop)
{
struct bch_fs *c = wop->c;
struct bchfs_write_op *op = container_of(wop,
struct bchfs_write_op, op);
struct quota_res *quota_res = op->is_dio
@ -392,7 +404,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
BUG_ON(k->k.p.inode != inode->v.i_ino);
bch2_trans_init(&trans, wop->c);
bch2_trans_init(&trans, c);
bch2_trans_preload_iters(&trans);
iter = bch2_trans_get_iter(&trans,

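Aside (editor's sketch, not part of the commit): sum_sector_overwrites(), reworked above to take a btree_trans and copy the extent iterator, is computing how many sectors inserting the new extent adds to i_sectors once the allocated extents it overwrites are subtracted. A toy version over plain integer ranges (hypothetical struct extent, no btree iterators) just to show the arithmetic:

#include <stdio.h>

struct extent { long start, end; int allocated; };	/* [start, end) in sectors */

static long overlap(const struct extent *a, const struct extent *b)
{
	long lo = a->start > b->start ? a->start : b->start;
	long hi = a->end   < b->end   ? a->end   : b->end;
	return hi > lo ? hi - lo : 0;
}

/* delta = sectors the new extent writes, minus allocated sectors it overwrites */
static long sum_sector_overwrites(const struct extent *new,
				  const struct extent *old, int nr_old)
{
	long delta = new->end - new->start;
	int i;

	for (i = 0; i < nr_old; i++)
		if (old[i].allocated)
			delta -= overlap(new, &old[i]);
	return delta;
}

int main(void)
{
	struct extent old[] = {
		{ 0,  8, 1 },	/* 8 allocated sectors */
		{ 8, 16, 0 },	/* a hole */
	};
	struct extent new = { 4, 12, 1 };

	/* overwrites 4 allocated sectors, writes 8: delta = +4 */
	printf("i_sectors delta: %ld\n", sum_sector_overwrites(&new, old, 2));
	return 0;
}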
View File

@ -105,12 +105,18 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans,
inode_set_fn set,
void *p)
{
struct bch_fs *c = trans->c;
struct btree_iter *iter;
struct bkey_inode_buf *inode_p;
int ret;
lockdep_assert_held(&inode->ei_update_lock);
/* XXX: Don't do this with btree locks held */
if (!inode->ei_inode_update)
inode->ei_inode_update =
bch2_deferred_update_alloc(c, BTREE_ID_INODES, 64);
#if 0
iter = bch2_trans_get_iter(trans, BTREE_ID_INODES,
POS(inode->v.i_ino, 0),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
@ -121,7 +127,7 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans,
ret = bch2_btree_iter_traverse(iter);
if (ret)
return ret;
#endif
*inode_u = inode->ei_inode;
if (set) {
@ -135,7 +141,15 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans,
return PTR_ERR(inode_p);
bch2_inode_pack(inode_p, inode_u);
bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i));
if (!inode->ei_inode_update)
bch2_trans_update(trans,
BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i));
else
bch2_trans_update(trans,
BTREE_INSERT_DEFERRED(inode->ei_inode_update,
&inode_p->inode.k_i));
return 0;
}
@ -1346,6 +1360,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
inode_init_once(&inode->v);
mutex_init(&inode->ei_update_lock);
mutex_init(&inode->ei_quota_lock);
inode->ei_inode_update = NULL;
inode->ei_journal_seq = 0;
return &inode->v;
@ -1409,6 +1424,10 @@ static void bch2_evict_inode(struct inode *vinode)
BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
if (inode->ei_inode_update)
bch2_deferred_update_free(c, inode->ei_inode_update);
inode->ei_inode_update = NULL;
if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
KEY_TYPE_QUOTA_WARN);

View File

@ -13,6 +13,7 @@ struct bch_inode_info {
struct inode v;
struct mutex ei_update_lock;
struct deferred_update *ei_inode_update;
u64 ei_journal_seq;
u64 ei_quota_reserved;
unsigned long ei_last_dirtied;

View File

@ -322,6 +322,7 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf;
bool can_discard;
int ret;
retry:
if (journal_res_get_fast(j, res, flags))
@ -342,6 +343,16 @@ retry:
return 0;
}
if (!(flags & JOURNAL_RES_GET_RESERVED) &&
!test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
/*
* Don't want to close current journal entry, just need to
* invoke reclaim:
*/
ret = -ENOSPC;
goto unlock;
}
/*
* If we couldn't get a reservation because the current buf filled up,
* and we had room for a bigger entry on disk, signal that we want to
@ -365,23 +376,38 @@ retry:
} else {
ret = journal_entry_open(j);
}
unlock:
if ((ret == -EAGAIN || ret == -ENOSPC) &&
!j->res_get_blocked_start)
j->res_get_blocked_start = local_clock() ?: 1;
can_discard = j->can_discard;
spin_unlock(&j->lock);
if (!ret)
goto retry;
if (ret == -ENOSPC) {
BUG_ON(!can_discard && (flags & JOURNAL_RES_GET_RESERVED));
/*
* Journal is full - can't rely on reclaim from work item due to
* freezing:
*/
trace_journal_full(c);
if (!(flags & JOURNAL_RES_GET_NONBLOCK))
bch2_journal_reclaim_work(&j->reclaim_work.work);
if (!(flags & JOURNAL_RES_GET_NONBLOCK)) {
if (can_discard) {
bch2_journal_do_discards(j);
goto retry;
}
if (mutex_trylock(&j->reclaim_lock)) {
bch2_journal_reclaim(j);
mutex_unlock(&j->reclaim_lock);
}
}
ret = -EAGAIN;
}
@ -409,6 +435,32 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
return ret;
}
/* journal_preres: */
static bool journal_preres_available(struct journal *j,
struct journal_preres *res,
unsigned new_u64s)
{
bool ret = bch2_journal_preres_get_fast(j, res, new_u64s);
if (!ret)
bch2_journal_reclaim_work(&j->reclaim_work.work);
return ret;
}
int __bch2_journal_preres_get(struct journal *j,
struct journal_preres *res,
unsigned new_u64s)
{
int ret;
closure_wait_event(&j->preres_wait,
(ret = bch2_journal_error(j)) ||
journal_preres_available(j, res, new_u64s));
return ret;
}
/* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *j,
@ -760,6 +812,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
while (ja->nr < nr) {
struct open_bucket *ob = NULL;
unsigned pos;
long bucket;
if (new_fs) {
@ -786,21 +839,25 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
preempt_disable();
}
__array_insert_item(ja->buckets, ja->nr, ja->last_idx);
__array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx);
__array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx);
ja->buckets[ja->last_idx] = bucket;
ja->bucket_seq[ja->last_idx] = 0;
journal_buckets->buckets[ja->last_idx] = cpu_to_le64(bucket);
if (ja->last_idx < ja->nr) {
if (ja->cur_idx >= ja->last_idx)
ja->cur_idx++;
ja->last_idx++;
}
pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0;
__array_insert_item(ja->buckets, ja->nr, pos);
__array_insert_item(ja->bucket_seq, ja->nr, pos);
__array_insert_item(journal_buckets->buckets, ja->nr, pos);
ja->nr++;
ja->buckets[pos] = bucket;
ja->bucket_seq[pos] = 0;
journal_buckets->buckets[pos] = cpu_to_le64(bucket);
if (pos <= ja->discard_idx)
ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
if (pos <= ja->dirty_idx_ondisk)
ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
if (pos <= ja->dirty_idx)
ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
if (pos <= ja->cur_idx)
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB),
@ -1039,6 +1096,7 @@ int bch2_fs_journal_init(struct journal *j)
mutex_init(&j->blacklist_lock);
INIT_LIST_HEAD(&j->seq_blacklist);
mutex_init(&j->reclaim_lock);
mutex_init(&j->discard_lock);
lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
@ -1087,11 +1145,16 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
"seq:\t\t\t%llu\n"
"last_seq:\t\t%llu\n"
"last_seq_ondisk:\t%llu\n"
"prereserved:\t\t%u/%u\n"
"current entry sectors:\t%u\n"
"current entry:\t\t",
fifo_used(&j->pin),
journal_cur_seq(j),
journal_last_seq(j),
j->last_seq_ondisk);
j->last_seq_ondisk,
j->prereserved.reserved,
j->prereserved.remaining,
j->cur_entry_sectors);
switch (s.cur_entry_offset) {
case JOURNAL_ENTRY_ERROR_VAL:
@ -1113,8 +1176,9 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
journal_state_count(s, s.idx));
if (s.prev_buf_unwritten)
pr_buf(&out, "yes, ref %u\n",
journal_state_count(s, !s.idx));
pr_buf(&out, "yes, ref %u sectors %u\n",
journal_state_count(s, !s.idx),
journal_prev_buf(j)->sectors);
else
pr_buf(&out, "no\n");
@ -1135,13 +1199,17 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
"dev %u:\n"
"\tnr\t\t%u\n"
"\tavailable\t%u:%u\n"
"\tcur_idx\t\t%u (seq %llu)\n"
"\tlast_idx\t%u (seq %llu)\n",
"\tdiscard_idx\t\t%u\n"
"\tdirty_idx_ondisk\t%u (seq %llu)\n"
"\tdirty_idx\t\t%u (seq %llu)\n"
"\tcur_idx\t\t%u (seq %llu)\n",
iter, ja->nr,
bch2_journal_dev_buckets_available(j, ja),
bch2_journal_dev_buckets_available(j, ja, journal_space_discarded),
ja->sectors_free,
ja->cur_idx, ja->bucket_seq[ja->cur_idx],
ja->last_idx, ja->bucket_seq[ja->last_idx]);
ja->discard_idx,
ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk],
ja->dirty_idx, ja->bucket_seq[ja->dirty_idx],
ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
}
spin_unlock(&j->lock);

View File

@ -118,6 +118,7 @@ static inline void journal_wake(struct journal *j)
{
wake_up(&j->wait);
closure_wake_up(&j->async_wait);
closure_wake_up(&j->preres_wait);
}
static inline struct journal_buf *journal_cur_buf(struct journal *j)
@ -271,6 +272,7 @@ int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
#define JOURNAL_RES_GET_NONBLOCK (1 << 0)
#define JOURNAL_RES_GET_CHECK (1 << 1)
#define JOURNAL_RES_GET_RESERVED (1 << 2)
static inline int journal_res_get_fast(struct journal *j,
struct journal_res *res,
@ -291,6 +293,10 @@ static inline int journal_res_get_fast(struct journal *j,
EBUG_ON(!journal_state_count(new, new.idx));
if (!(flags & JOURNAL_RES_GET_RESERVED) &&
!test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags))
return 0;
if (flags & JOURNAL_RES_GET_CHECK)
return 1;
@ -330,6 +336,89 @@ out:
return 0;
}
/* journal_preres: */
static inline bool journal_check_may_get_unreserved(struct journal *j)
{
union journal_preres_state s = READ_ONCE(j->prereserved);
bool ret = s.reserved <= s.remaining &&
fifo_free(&j->pin) > 8;
lockdep_assert_held(&j->lock);
if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
if (ret) {
set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags);
journal_wake(j);
} else {
clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags);
}
}
return ret;
}
static inline void bch2_journal_preres_put(struct journal *j,
struct journal_preres *res)
{
union journal_preres_state s = { .reserved = res->u64s };
if (!res->u64s)
return;
s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
res->u64s = 0;
closure_wake_up(&j->preres_wait);
if (s.reserved <= s.remaining &&
!test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
spin_lock(&j->lock);
journal_check_may_get_unreserved(j);
spin_unlock(&j->lock);
}
}
int __bch2_journal_preres_get(struct journal *,
struct journal_preres *, unsigned);
static inline int bch2_journal_preres_get_fast(struct journal *j,
struct journal_preres *res,
unsigned new_u64s)
{
int d = new_u64s - res->u64s;
union journal_preres_state old, new;
u64 v = atomic64_read(&j->prereserved.counter);
do {
old.v = new.v = v;
new.reserved += d;
if (new.reserved > new.remaining)
return 0;
} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
old.v, new.v)) != old.v);
res->u64s += d;
return 1;
}
static inline int bch2_journal_preres_get(struct journal *j,
struct journal_preres *res,
unsigned new_u64s,
unsigned flags)
{
if (new_u64s <= res->u64s)
return 0;
if (bch2_journal_preres_get_fast(j, res, new_u64s))
return 0;
if (flags & JOURNAL_RES_GET_NONBLOCK)
return -EAGAIN;
return __bch2_journal_preres_get(j, res, new_u64s);
}
/* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *,

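Aside (editor's sketch, not part of the commit): bch2_journal_preres_get_fast() above packs the reserved and remaining counts into a single 64-bit word (union journal_preres_state) and claims space with a compare-and-exchange loop, bailing out when the claim would push reserved past remaining so the caller can fall back to the blocking path. A userspace approximation, with C11 atomics standing in for the kernel's atomic64_* helpers and an illustrative field layout:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

union preres_state {
	uint64_t v;
	struct {
		uint32_t reserved;	/* u64s currently pre-reserved */
		uint32_t remaining;	/* u64s the journal can promise */
	};
};

static _Atomic uint64_t prereserved;

static bool preres_get_fast(uint32_t *res_u64s, uint32_t new_u64s)
{
	int32_t d = (int32_t) new_u64s - (int32_t) *res_u64s;
	union preres_state old, new;

	old.v = atomic_load(&prereserved);
	do {
		new.v = old.v;
		new.reserved += d;
		if (new.reserved > new.remaining)
			return false;	/* would overcommit: caller takes slow path */
	} while (!atomic_compare_exchange_weak(&prereserved, &old.v, new.v));

	*res_u64s += d;
	return true;
}

int main(void)
{
	union preres_state s = { 0 };
	uint32_t res = 0;

	s.remaining = 100;
	atomic_store(&prereserved, s.v);

	printf("get  60: %d\n", preres_get_fast(&res, 60));	/* 1 */
	printf("get  90: %d\n", preres_get_fast(&res, 90));	/* 1 (grows by 30) */
	printf("get 200: %d\n", preres_get_fast(&res, 200));	/* 0 (slow path)   */
	return 0;
}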
View File

@ -625,11 +625,12 @@ static void bch2_journal_read_device(struct closure *cl)
ja->sectors_free = 0;
/*
* Set last_idx to indicate the entire journal is full and needs to be
* Set dirty_idx to indicate the entire journal is full and needs to be
* reclaimed - journal reclaim will immediately reclaim whatever isn't
* pinned when it first runs:
*/
ja->last_idx = (ja->cur_idx + 1) % ja->nr;
ja->discard_idx = ja->dirty_idx_ondisk =
ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
kvpfree(buf.data, buf.size);
percpu_ref_put(&ca->io_ref);
@ -969,9 +970,16 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
if (sectors > ja->sectors_free &&
sectors <= ca->mi.bucket_size &&
bch2_journal_dev_buckets_available(j, ja)) {
bch2_journal_dev_buckets_available(j, ja,
journal_space_discarded)) {
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
ja->sectors_free = ca->mi.bucket_size;
/*
* ja->bucket_seq[ja->cur_idx] must always have
* something sensible:
*/
ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
}
}
@ -1069,12 +1077,13 @@ static void journal_write_done(struct closure *cl)
goto err;
spin_lock(&j->lock);
j->seq_ondisk = seq;
j->last_seq_ondisk = last_seq;
if (seq >= j->pin.front)
journal_seq_pin(j, seq)->devs = devs;
j->seq_ondisk = seq;
j->last_seq_ondisk = last_seq;
bch2_journal_space_available(j);
/*
* Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
* more buckets:

View File

@ -8,47 +8,72 @@
/* Free space calculations: */
static unsigned journal_space_from(struct journal_device *ja,
enum journal_space_from from)
{
switch (from) {
case journal_space_discarded:
return ja->discard_idx;
case journal_space_clean_ondisk:
return ja->dirty_idx_ondisk;
case journal_space_clean:
return ja->dirty_idx;
default:
BUG();
}
}
unsigned bch2_journal_dev_buckets_available(struct journal *j,
struct journal_device *ja)
struct journal_device *ja,
enum journal_space_from from)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
unsigned next = (ja->cur_idx + 1) % ja->nr;
unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
unsigned available = (journal_space_from(ja, from) -
ja->cur_idx - 1 + ja->nr) % ja->nr;
/*
* Allocator startup needs some journal space before we can do journal
* replay:
*/
if (available &&
test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
available--;
if (available && test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
--available;
/*
* Don't use the last bucket unless writing the new last_seq
* will make another bucket available:
*/
if (available &&
journal_last_seq(j) <= ja->bucket_seq[ja->last_idx])
if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
--available;
return available;
}
void bch2_journal_space_available(struct journal *j)
static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
{
union journal_preres_state old, new;
u64 v = atomic64_read(&j->prereserved.counter);
do {
old.v = new.v = v;
new.remaining = u64s_remaining;
} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
old.v, new.v)) != old.v);
}
static struct journal_space {
unsigned next_entry;
unsigned remaining;
} __journal_space_available(struct journal *j, unsigned nr_devs_want,
enum journal_space_from from)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
unsigned sectors_next_entry = UINT_MAX;
unsigned sectors_total = UINT_MAX;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9);
unsigned i, nr_online = 0, nr_devs = 0;
unsigned i, nr_devs = 0;
unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
? journal_prev_buf(j)->sectors
: 0;
int ret = 0;
lockdep_assert_held(&j->lock);
rcu_read_lock();
for_each_member_device_rcu(ca, c, i,
@ -59,9 +84,7 @@ void bch2_journal_space_available(struct journal *j)
if (!ja->nr)
continue;
nr_online++;
buckets_this_device = bch2_journal_dev_buckets_available(j, ja);
buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from);
sectors_this_device = ja->sectors_free;
/*
@ -94,28 +117,88 @@ void bch2_journal_space_available(struct journal *j)
buckets_this_device * ca->mi.bucket_size +
sectors_this_device);
max_entry_size = min_t(unsigned, max_entry_size,
ca->mi.bucket_size);
nr_devs++;
}
rcu_read_unlock();
if (nr_devs < nr_devs_want)
return (struct journal_space) { 0, 0 };
return (struct journal_space) {
.next_entry = sectors_next_entry,
.remaining = max_t(int, 0, sectors_total - sectors_next_entry),
};
}
void bch2_journal_space_available(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_space discarded, clean_ondisk, clean;
unsigned overhead, u64s_remaining = 0;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9);
unsigned i, nr_online = 0, nr_devs_want;
bool can_discard = false;
int ret = 0;
lockdep_assert_held(&j->lock);
rcu_read_lock();
for_each_member_device_rcu(ca, c, i,
&c->rw_devs[BCH_DATA_JOURNAL]) {
struct journal_device *ja = &ca->journal;
if (!ja->nr)
continue;
while (ja->dirty_idx != ja->cur_idx &&
ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
while (ja->dirty_idx_ondisk != ja->dirty_idx &&
ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
if (ja->discard_idx != ja->dirty_idx_ondisk)
can_discard = true;
max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
nr_online++;
}
rcu_read_unlock();
j->can_discard = can_discard;
if (nr_online < c->opts.metadata_replicas_required) {
ret = -EROFS;
sectors_next_entry = 0;
} else if (!sectors_next_entry ||
nr_devs < min_t(unsigned, nr_online,
c->opts.metadata_replicas)) {
ret = -ENOSPC;
sectors_next_entry = 0;
} else if (!fifo_free(&j->pin)) {
ret = -ENOSPC;
sectors_next_entry = 0;
goto out;
}
j->cur_entry_sectors = sectors_next_entry;
if (!fifo_free(&j->pin)) {
ret = -ENOSPC;
goto out;
}
nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
discarded = __journal_space_available(j, nr_devs_want, journal_space_discarded);
clean_ondisk = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk);
clean = __journal_space_available(j, nr_devs_want, journal_space_clean);
if (!discarded.next_entry)
ret = -ENOSPC;
overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
journal_entry_overhead(j);
u64s_remaining = clean.remaining << 6;
u64s_remaining = max_t(int, 0, u64s_remaining - overhead);
u64s_remaining /= 4;
out:
j->cur_entry_sectors = !ret ? discarded.next_entry : 0;
j->cur_entry_error = ret;
journal_set_remaining(j, u64s_remaining);
journal_check_may_get_unreserved(j);
if (!ret)
journal_wake(j);
@ -128,25 +211,23 @@ static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
bool ret;
spin_lock(&j->lock);
ret = ja->nr &&
ja->last_idx != ja->cur_idx &&
ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk;
ret = ja->discard_idx != ja->dirty_idx_ondisk;
spin_unlock(&j->lock);
return ret;
}
/*
* Advance ja->last_idx as long as it points to buckets that are no longer
* Advance ja->discard_idx as long as it points to buckets that are no longer
* dirty, issuing discards if necessary:
*/
static void journal_do_discards(struct journal *j)
void bch2_journal_do_discards(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
unsigned iter;
mutex_lock(&j->reclaim_lock);
mutex_lock(&j->discard_lock);
for_each_rw_member(ca, c, iter) {
struct journal_device *ja = &ca->journal;
@ -156,18 +237,18 @@ static void journal_do_discards(struct journal *j)
blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
blkdev_issue_discard(ca->disk_sb.bdev,
bucket_to_sector(ca,
ja->buckets[ja->last_idx]),
ja->buckets[ja->discard_idx]),
ca->mi.bucket_size, GFP_NOIO, 0);
spin_lock(&j->lock);
ja->last_idx = (ja->last_idx + 1) % ja->nr;
ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
bch2_journal_space_available(j);
spin_unlock(&j->lock);
}
}
mutex_unlock(&j->reclaim_lock);
mutex_unlock(&j->discard_lock);
}
/*
@ -372,7 +453,7 @@ static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
}
/**
* bch2_journal_reclaim_work - free up journal buckets
* bch2_journal_reclaim - free up journal buckets
*
* Background journal reclaim writes out btree nodes. It should be run
* early enough so that we never completely run out of journal buckets.
@ -389,29 +470,37 @@ static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
* 512 journal entries or 25% of all journal buckets, then
* journal_next_bucket() should not stall.
*/
void bch2_journal_reclaim_work(struct work_struct *work)
void bch2_journal_reclaim(struct journal *j)
{
struct bch_fs *c = container_of(to_delayed_work(work),
struct bch_fs, journal.reclaim_work);
struct journal *j = &c->journal;
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
unsigned iter, bucket_to_flush, min_nr = 0;
unsigned iter, min_nr = 0;
u64 seq_to_flush = 0;
journal_do_discards(j);
lockdep_assert_held(&j->reclaim_lock);
bch2_journal_do_discards(j);
mutex_lock(&j->reclaim_lock);
spin_lock(&j->lock);
for_each_rw_member(ca, c, iter) {
struct journal_device *ja = &ca->journal;
unsigned nr_buckets, bucket_to_flush;
if (!ja->nr)
continue;
/* Try to keep the journal at most half full: */
bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
nr_buckets = ja->nr / 2;
/* And include pre-reservations: */
nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
(ca->mi.bucket_size << 6) -
journal_entry_overhead(j));
nr_buckets = min(nr_buckets, ja->nr);
bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
seq_to_flush = max_t(u64, seq_to_flush,
ja->bucket_seq[bucket_to_flush]);
}
@ -430,15 +519,26 @@ void bch2_journal_reclaim_work(struct work_struct *work)
msecs_to_jiffies(j->reclaim_delay_ms)))
min_nr = 1;
journal_flush_pins(j, seq_to_flush, min_nr);
if (j->prereserved.reserved * 2 > j->prereserved.remaining)
min_nr = 1;
mutex_unlock(&j->reclaim_lock);
journal_flush_pins(j, seq_to_flush, min_nr);
if (!test_bit(BCH_FS_RO, &c->flags))
queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
msecs_to_jiffies(j->reclaim_delay_ms));
}
void bch2_journal_reclaim_work(struct work_struct *work)
{
struct journal *j = container_of(to_delayed_work(work),
struct journal, reclaim_work);
mutex_lock(&j->reclaim_lock);
bch2_journal_reclaim(j);
mutex_unlock(&j->reclaim_lock);
}
static int journal_flush_done(struct journal *j, u64 seq_to_flush)
{
int ret;

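Aside (editor's sketch, not part of the commit): with the old last_idx split into discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx (in ring order), bch2_journal_dev_buckets_available() above reduces to the modular distance from the bucket after cur_idx back around to whichever index the caller asks about. The helper below is a stand-in for that arithmetic, not the kernel function:

#include <stdio.h>

/* buckets strictly between cur_idx and from_idx on a ring of nr buckets */
static unsigned buckets_available(unsigned from_idx, unsigned cur_idx, unsigned nr)
{
	return (from_idx + nr - cur_idx - 1) % nr;
}

int main(void)
{
	/* 8-bucket ring: writing bucket 5, discards caught up to bucket 2,
	 * buckets 2-3 clean but not yet discarded, bucket 4 still dirty */
	unsigned nr = 8, cur_idx = 5, discard_idx = 2, dirty_idx = 4;

	printf("discarded (ready to use):   %u buckets\n",
	       buckets_available(discard_idx, cur_idx, nr));	/* 4: buckets 6,7,0,1 */
	printf("clean (incl. undiscarded):  %u buckets\n",
	       buckets_available(dirty_idx, cur_idx, nr));	/* 6: adds buckets 2,3 */
	return 0;
}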
View File

@ -3,8 +3,15 @@
#define JOURNAL_PIN (32 * 1024)
enum journal_space_from {
journal_space_discarded,
journal_space_clean_ondisk,
journal_space_clean,
};
unsigned bch2_journal_dev_buckets_available(struct journal *,
struct journal_device *);
struct journal_device *,
enum journal_space_from);
void bch2_journal_space_available(struct journal *);
static inline bool journal_pin_active(struct journal_entry_pin *pin)
@ -33,6 +40,8 @@ void bch2_journal_pin_add_if_older(struct journal *,
journal_pin_flush_fn);
void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
void bch2_journal_do_discards(struct journal *);
void bch2_journal_reclaim(struct journal *);
void bch2_journal_reclaim_work(struct work_struct *);
void bch2_journal_flush_pins(struct journal *, u64);

View File

@ -79,6 +79,14 @@ struct journal_res {
u64 seq;
};
/*
* For reserving space in the journal prior to getting a reservation on a
* particular journal entry:
*/
struct journal_preres {
unsigned u64s;
};
union journal_res_state {
struct {
atomic64_t counter;
@ -97,6 +105,21 @@ union journal_res_state {
};
};
union journal_preres_state {
struct {
atomic64_t counter;
};
struct {
u64 v;
};
struct {
u32 reserved;
u32 remaining;
};
};
/* bytes: */
#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */
@ -121,6 +144,7 @@ enum {
JOURNAL_STARTED,
JOURNAL_NEED_WRITE,
JOURNAL_NOT_EMPTY,
JOURNAL_MAY_GET_UNRESERVED,
};
/* Embedded in struct bch_fs */
@ -141,6 +165,8 @@ struct journal {
*/
int cur_entry_error;
union journal_preres_state prereserved;
/* Reserved space in journal entry to be used just prior to write */
unsigned entry_u64s_reserved;
@ -160,6 +186,7 @@ struct journal {
/* Used when waiting because the journal was full */
wait_queue_head_t wait;
struct closure_waitlist async_wait;
struct closure_waitlist preres_wait;
struct closure io;
struct delayed_work write_work;
@ -192,9 +219,6 @@ struct journal {
struct journal_entry_pin_list *data;
} pin;
struct journal_entry_pin *flush_in_progress;
wait_queue_head_t pin_flush_wait;
u64 replay_journal_seq;
struct mutex blacklist_lock;
@ -205,10 +229,15 @@ struct journal {
spinlock_t err_lock;
struct delayed_work reclaim_work;
unsigned long last_flushed;
/* protects advancing ja->last_idx: */
struct mutex reclaim_lock;
unsigned long last_flushed;
struct journal_entry_pin *flush_in_progress;
wait_queue_head_t pin_flush_wait;
/* protects advancing ja->discard_idx: */
struct mutex discard_lock;
bool can_discard;
unsigned write_delay_ms;
unsigned reclaim_delay_ms;
@ -239,17 +268,15 @@ struct journal_device {
unsigned sectors_free;
/* Journal bucket we're currently writing to */
unsigned cur_idx;
/* Last journal bucket that still contains an open journal entry */
/*
* j->lock and j->reclaim_lock must both be held to modify, j->lock
* sufficient to read:
* discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx:
*/
unsigned last_idx;
unsigned discard_idx; /* Next bucket to discard */
unsigned dirty_idx_ondisk;
unsigned dirty_idx;
unsigned cur_idx; /* Journal bucket we're currently writing to */
unsigned nr;
u64 *buckets;
/* Bio for journal reads/writes to this device */