From fd67296247e3404a9843cc5f4226632f0fdd55ad Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Mar 2019 16:39:43 -0500 Subject: [PATCH] Update bcachefs sources to 43a464c9dd bcachefs: Don't BUG_ON() on bucket sector count overflow --- .bcachefs_revision | 2 +- libbcachefs/alloc_background.c | 1 + libbcachefs/btree_iter.c | 4 +- libbcachefs/btree_iter.h | 3 +- libbcachefs/btree_locking.h | 2 - libbcachefs/btree_types.h | 3 +- libbcachefs/btree_update.h | 4 + libbcachefs/btree_update_leaf.c | 84 ++++++++++--- libbcachefs/buckets.c | 48 ++++---- libbcachefs/fs-io.c | 66 ++++++----- libbcachefs/fs.c | 23 +++- libbcachefs/fs.h | 1 + libbcachefs/journal.c | 116 ++++++++++++++---- libbcachefs/journal.h | 89 ++++++++++++++ libbcachefs/journal_io.c | 21 +++- libbcachefs/journal_reclaim.c | 204 ++++++++++++++++++++++++-------- libbcachefs/journal_reclaim.h | 11 +- libbcachefs/journal_types.h | 55 ++++++--- 18 files changed, 568 insertions(+), 169 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index 4f4bf212..b5f30cd3 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -a5e71b82006fdf563190c41955c2b462854af610 +43a464c9dd38b50c1a89845366f838fe70fbb743 diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index b080f30e..1297638b 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -959,6 +959,7 @@ retry: BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_JOURNAL_RESERVED| flags, BTREE_INSERT_ENTRY(iter, &a->k_i)); if (ret == -EINTR) diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 73e2c5ef..263cd18f 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -1610,7 +1610,7 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c, prefetch(c->btree_roots[btree_id].b); } -void bch2_btree_iter_unlink(struct btree_iter *iter) +static void bch2_btree_iter_unlink(struct btree_iter *iter) { struct btree_iter *linked; @@ -1629,7 +1629,7 @@ void bch2_btree_iter_unlink(struct btree_iter *iter) BUG(); } -void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new) +static void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new) { BUG_ON(btree_iter_linked(new)); diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 873332f7..7c49a661 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -105,6 +105,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, unsigned, unsigned); int bch2_btree_iter_unlock(struct btree_iter *); +bool bch2_btree_iter_relock(struct btree_iter *); bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); @@ -164,8 +165,6 @@ static inline void bch2_btree_iter_init(struct btree_iter *iter, ? 
BTREE_ITER_IS_EXTENTS : 0)|flags); } -void bch2_btree_iter_link(struct btree_iter *, struct btree_iter *); -void bch2_btree_iter_unlink(struct btree_iter *); void bch2_btree_iter_copy(struct btree_iter *, struct btree_iter *); static inline struct bpos btree_type_successor(enum btree_id id, diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 9054de0d..f565fa36 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -203,8 +203,6 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter, __bch2_btree_node_relock(iter, level); } -bool bch2_btree_iter_relock(struct btree_iter *); - void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index b38722da..15a4d382 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -245,10 +245,11 @@ struct btree_iter { #define BTREE_ITER_MAX 8 struct deferred_update { + struct journal_preres res; struct journal_entry_pin journal; spinlock_t lock; - unsigned gen; + unsigned dirty:1; u8 allocated_u64s; enum btree_id btree_id; diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index faacde9a..7545f4f1 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -26,6 +26,7 @@ struct btree_insert { struct bch_fs *c; struct disk_reservation *disk_res; struct journal_res journal_res; + struct journal_preres journal_preres; u64 *journal_seq; unsigned flags; bool did_work; @@ -81,6 +82,7 @@ enum { __BTREE_INSERT_USE_RESERVE, __BTREE_INSERT_USE_ALLOC_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, + __BTREE_INSERT_JOURNAL_RESERVED, __BTREE_INSERT_NOMARK, __BTREE_INSERT_NOWAIT, __BTREE_INSERT_GC_LOCK_HELD, @@ -111,6 +113,8 @@ enum { /* Insert is for journal replay - don't get journal reservations: */ #define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) +#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) + /* Don't call bch2_mark_key: */ #define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK) diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index da8c6987..4a4904e7 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -17,6 +17,9 @@ #include #include +static bool btree_trans_relock(struct btree_insert *); +static void btree_trans_unlock(struct btree_insert *); + /* Inserting into a given leaf node (last stage of insert): */ /* Handle overwrites and do insert, for non extents: */ @@ -239,15 +242,15 @@ btree_insert_key_leaf(struct btree_insert *trans, /* Deferred btree updates: */ static void deferred_update_flush(struct journal *j, - struct journal_entry_pin *pin, - u64 seq) + struct journal_entry_pin *pin, + u64 seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct deferred_update *d = container_of(pin, struct deferred_update, journal); + struct journal_preres res = { 0 }; u64 tmp[32]; struct bkey_i *k = (void *) tmp; - unsigned gen; int ret; if (d->allocated_u64s > ARRAY_SIZE(tmp)) { @@ -257,26 +260,32 @@ static void deferred_update_flush(struct journal *j, } spin_lock(&d->lock); - gen = d->gen; + if (d->dirty) { + BUG_ON(jset_u64s(d->k.k.u64s) > d->res.u64s); + + swap(res, d->res); - if (journal_pin_active(&d->journal)) { BUG_ON(d->k.k.u64s > d->allocated_u64s); - bkey_copy(k, &d->k); + bkey_copy(k, &d->k); + d->dirty = false; spin_unlock(&d->lock); ret = bch2_btree_insert(c, d->btree_id, 
k, NULL, NULL, - BTREE_INSERT_NOFAIL); + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_JOURNAL_RESERVED); bch2_fs_fatal_err_on(ret && !bch2_journal_error(j), - c, "error flushing deferred btree update: %i", ret); + c, "error flushing deferred btree update: %i", ret); spin_lock(&d->lock); } - if (gen == d->gen) + if (!d->dirty) bch2_journal_pin_drop(j, &d->journal); spin_unlock(&d->lock); + bch2_journal_preres_put(j, &res); if (k != (void *) tmp) kfree(k); } @@ -288,6 +297,7 @@ btree_insert_key_deferred(struct btree_insert *trans, struct bch_fs *c = trans->c; struct journal *j = &c->journal; struct deferred_update *d = insert->d; + int difference; BUG_ON(trans->flags & BTREE_INSERT_JOURNAL_REPLAY); BUG_ON(insert->k->u64s > d->allocated_u64s); @@ -295,12 +305,21 @@ btree_insert_key_deferred(struct btree_insert *trans, __btree_journal_key(trans, d->btree_id, insert->k); spin_lock(&d->lock); - d->gen++; + BUG_ON(jset_u64s(insert->k->u64s) > + trans->journal_preres.u64s); + + difference = jset_u64s(insert->k->u64s) - d->res.u64s; + if (difference > 0) { + trans->journal_preres.u64s -= difference; + d->res.u64s += difference; + } + bkey_copy(&d->k, insert->k); - spin_unlock(&d->lock); + d->dirty = true; bch2_journal_pin_update(j, trans->journal_res.seq, &d->journal, deferred_update_flush); + spin_unlock(&d->lock); return BTREE_INSERT_OK; } @@ -519,13 +538,16 @@ retry: } if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + unsigned flags = (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) + ? JOURNAL_RES_GET_RESERVED : 0; + u64s = 0; trans_for_each_entry(trans, i) u64s += jset_u64s(i->k->k.u64s); ret = bch2_journal_res_get(&c->journal, &trans->journal_res, u64s, - JOURNAL_RES_GET_NONBLOCK); + flags|JOURNAL_RES_GET_NONBLOCK); if (likely(!ret)) goto got_journal_res; if (ret != -EAGAIN) @@ -536,7 +558,7 @@ retry: ret = bch2_journal_res_get(&c->journal, &trans->journal_res, u64s, - JOURNAL_RES_GET_CHECK); + flags|JOURNAL_RES_GET_CHECK); if (ret) return ret; @@ -586,6 +608,10 @@ got_journal_res: } } out: + BUG_ON(ret && + (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) && + trans->journal_res.ref); + multi_unlock_write(trans); bch2_journal_res_put(&c->journal, &trans->journal_res); @@ -627,7 +653,7 @@ int __bch2_btree_insert_at(struct btree_insert *trans) struct bch_fs *c = trans->c; struct btree_insert_entry *i; struct btree_iter *linked; - unsigned flags; + unsigned flags, u64s = 0; int ret; BUG_ON(!trans->nr); @@ -638,11 +664,39 @@ int __bch2_btree_insert_at(struct btree_insert *trans) if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&c->gc_lock); + memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); + bubble_sort(trans->entries, trans->nr, btree_trans_cmp); trans_for_each_entry(trans, i) btree_insert_entry_checks(c, i); + trans_for_each_entry(trans, i) + if (i->deferred) + u64s += jset_u64s(i->k->k.u64s); + + if (u64s) { + ret = bch2_journal_preres_get(&c->journal, + &trans->journal_preres, u64s, + JOURNAL_RES_GET_NONBLOCK); + if (!ret) + goto got_journal_preres; + if (ret != -EAGAIN) + return ret; + + btree_trans_unlock(trans); + ret = bch2_journal_preres_get(&c->journal, + &trans->journal_preres, u64s, 0); + if (ret) + return ret; + + if (!btree_trans_relock(trans)) { + trans_restart(" (iter relock after journal preres get blocked)"); + bch2_journal_preres_put(&c->journal, &trans->journal_preres); + return -EINTR; + } + } +got_journal_preres: if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) && !percpu_ref_tryget(&c->writes))) return 
-EROFS; @@ -674,6 +728,8 @@ retry: trans_for_each_iter(trans, i) bch2_btree_iter_downgrade(i->iter); out: + bch2_journal_preres_put(&c->journal, &trans->journal_preres); + if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) percpu_ref_put(&c->writes); diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 377a8b0f..072d22ae 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -536,11 +536,14 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, } #define checked_add(a, b) \ -do { \ +({ \ unsigned _res = (unsigned) (a) + (b); \ + bool overflow = _res > U16_MAX; \ + if (overflow) \ + _res = U16_MAX; \ (a) = _res; \ - BUG_ON((a) != _res); \ -} while (0) + overflow; \ +}) static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, enum bch_data_type type, @@ -548,17 +551,25 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, { struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); struct bucket *g = __bucket(ca, b, gc); - struct bucket_mark new; + struct bucket_mark old, new; + bool overflow; BUG_ON(type != BCH_DATA_SB && type != BCH_DATA_JOURNAL); - bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ + old = bucket_cmpxchg(g, new, ({ new.dirty = true; new.data_type = type; - checked_add(new.dirty_sectors, sectors); + overflow = checked_add(new.dirty_sectors, sectors); })); + bch2_fs_inconsistent_on(overflow, c, + "bucket sector count overflow: %u + %u > U16_MAX", + old.dirty_sectors, sectors); + + if (c) + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + return 0; } @@ -574,19 +585,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, ca, b, type, sectors); } else { - struct bucket *g; - struct bucket_mark new; - - rcu_read_lock(); - - g = bucket(ca, b); - bucket_cmpxchg(g, new, ({ - new.dirty = true; - new.data_type = type; - checked_add(new.dirty_sectors, sectors); - })); - - rcu_read_unlock(); + __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0); } } @@ -627,6 +626,7 @@ static bool bch2_mark_pointer(struct bch_fs *c, struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); size_t b = PTR_BUCKET_NR(ca, &p.ptr); struct bucket *g = __bucket(ca, b, gc); + bool overflow; u64 v; v = atomic64_read(&g->_mark.v); @@ -648,9 +648,9 @@ static bool bch2_mark_pointer(struct bch_fs *c, } if (!p.ptr.cached) - checked_add(new.dirty_sectors, sectors); + overflow = checked_add(new.dirty_sectors, sectors); else - checked_add(new.cached_sectors, sectors); + overflow = checked_add(new.cached_sectors, sectors); if (!new.dirty_sectors && !new.cached_sectors) { @@ -672,6 +672,12 @@ static bool bch2_mark_pointer(struct bch_fs *c, old.v.counter, new.v.counter)) != old.v.counter); + bch2_fs_inconsistent_on(overflow, c, + "bucket sector count overflow: %u + %lli > U16_MAX", + !p.ptr.cached + ? 
old.dirty_sectors + : old.cached_sectors, sectors); + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); BUG_ON(!gc && bucket_became_unavailable(old, new)); diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 2cfc2d9e..ce46d0ef 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -229,20 +229,19 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, /* normal i_size/i_sectors update machinery: */ -static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter, - bool *allocating) +static int sum_sector_overwrites(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *new, bool *allocating, + s64 *i_sectors_delta) { - struct btree_iter iter; + struct btree_iter *iter = bch2_trans_copy_iter(trans, extent_iter); struct bkey_s_c old; s64 delta = 0; - bch2_btree_iter_init(&iter, _iter->c, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_SLOTS); + if (IS_ERR(iter)) + return PTR_ERR(iter); - bch2_btree_iter_link(_iter, &iter); - bch2_btree_iter_copy(&iter, _iter); - - old = bch2_btree_iter_peek_slot(&iter); + old = bch2_btree_iter_peek_slot(iter); while (1) { /* @@ -268,12 +267,13 @@ static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter, if (bkey_cmp(old.k->p, new->k.p) >= 0) break; - old = bch2_btree_iter_next_slot(&iter); + old = bch2_btree_iter_next_slot(iter); } - bch2_btree_iter_unlink(&iter); + bch2_trans_iter_free(trans, iter); - return delta; + *i_sectors_delta = delta; + return 0; } static int bch2_extent_update(struct btree_trans *trans, @@ -287,11 +287,11 @@ static int bch2_extent_update(struct btree_trans *trans, bool direct, s64 *total_delta) { - struct btree_iter *inode_iter = NULL; struct bch_inode_unpacked inode_u; struct bkey_inode_buf inode_p; bool allocating = false; bool extended = false; + bool inode_locked = false; s64 i_sectors_delta; int ret; @@ -303,7 +303,12 @@ static int bch2_extent_update(struct btree_trans *trans, bch2_extent_trim_atomic(k, extent_iter); - i_sectors_delta = sum_sector_overwrites(k, extent_iter, &allocating); + ret = sum_sector_overwrites(trans, extent_iter, + k, &allocating, + &i_sectors_delta); + if (ret) + return ret; + if (!may_allocate && allocating) return -ENOSPC; @@ -314,16 +319,20 @@ static int bch2_extent_update(struct btree_trans *trans, /* XXX: inode->i_size locking */ if (i_sectors_delta || new_i_size > inode->ei_inode.bi_size) { - inode_iter = bch2_trans_get_iter(trans, - BTREE_ID_INODES, - POS(k->k.p.inode, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if (IS_ERR(inode_iter)) - return PTR_ERR(inode_iter); + bch2_btree_iter_unlock(extent_iter); + mutex_lock(&inode->ei_update_lock); - ret = bch2_btree_iter_traverse(inode_iter); - if (ret) - goto err; + if (!bch2_btree_iter_relock(extent_iter)) { + mutex_unlock(&inode->ei_update_lock); + return -EINTR; + } + + inode_locked = true; + + if (!inode->ei_inode_update) + inode->ei_inode_update = + bch2_deferred_update_alloc(trans->c, + BTREE_ID_INODES, 64); inode_u = inode->ei_inode; inode_u.bi_sectors += i_sectors_delta; @@ -337,7 +346,8 @@ static int bch2_extent_update(struct btree_trans *trans, bch2_inode_pack(&inode_p, &inode_u); bch2_trans_update(trans, - BTREE_INSERT_ENTRY(inode_iter, &inode_p.inode.k_i)); + BTREE_INSERT_DEFERRED(inode->ei_inode_update, + &inode_p.inode.k_i)); } ret = bch2_trans_commit(trans, disk_res, @@ -371,13 +381,15 @@ static int bch2_extent_update(struct btree_trans *trans, if (total_delta) *total_delta += i_sectors_delta; err: - if (!IS_ERR_OR_NULL(inode_iter)) - 
bch2_trans_iter_put(trans, inode_iter); + if (inode_locked) + mutex_unlock(&inode->ei_update_lock); + return ret; } static int bchfs_write_index_update(struct bch_write_op *wop) { + struct bch_fs *c = wop->c; struct bchfs_write_op *op = container_of(wop, struct bchfs_write_op, op); struct quota_res *quota_res = op->is_dio @@ -392,7 +404,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop) BUG_ON(k->k.p.inode != inode->v.i_ino); - bch2_trans_init(&trans, wop->c); + bch2_trans_init(&trans, c); bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index c7797338..8c9fdc84 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -105,12 +105,18 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, inode_set_fn set, void *p) { + struct bch_fs *c = trans->c; struct btree_iter *iter; struct bkey_inode_buf *inode_p; int ret; lockdep_assert_held(&inode->ei_update_lock); + /* XXX: Don't do this with btree locks held */ + if (!inode->ei_inode_update) + inode->ei_inode_update = + bch2_deferred_update_alloc(c, BTREE_ID_INODES, 64); +#if 0 iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inode->v.i_ino, 0), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); @@ -121,7 +127,7 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, ret = bch2_btree_iter_traverse(iter); if (ret) return ret; - +#endif *inode_u = inode->ei_inode; if (set) { @@ -135,7 +141,15 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, return PTR_ERR(inode_p); bch2_inode_pack(inode_p, inode_u); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i)); + + if (!inode->ei_inode_update) + bch2_trans_update(trans, + BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i)); + else + bch2_trans_update(trans, + BTREE_INSERT_DEFERRED(inode->ei_inode_update, + &inode_p->inode.k_i)); + return 0; } @@ -1346,6 +1360,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) inode_init_once(&inode->v); mutex_init(&inode->ei_update_lock); mutex_init(&inode->ei_quota_lock); + inode->ei_inode_update = NULL; inode->ei_journal_seq = 0; return &inode->v; @@ -1409,6 +1424,10 @@ static void bch2_evict_inode(struct inode *vinode) BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); + if (inode->ei_inode_update) + bch2_deferred_update_free(c, inode->ei_inode_update); + inode->ei_inode_update = NULL; + if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), KEY_TYPE_QUOTA_WARN); diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index 9ac6fc87..7080de1e 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -13,6 +13,7 @@ struct bch_inode_info { struct inode v; struct mutex ei_update_lock; + struct deferred_update *ei_inode_update; u64 ei_journal_seq; u64 ei_quota_reserved; unsigned long ei_last_dirtied; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index f108a282..e2386bb7 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -322,6 +322,7 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf; + bool can_discard; int ret; retry: if (journal_res_get_fast(j, res, flags)) @@ -342,6 +343,16 @@ retry: return 0; } + if (!(flags & JOURNAL_RES_GET_RESERVED) && + !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { + /* + * Don't want to close current journal entry, just need to + * invoke reclaim: + */ + ret = -ENOSPC; + goto unlock; 
+ } + /* * If we couldn't get a reservation because the current buf filled up, * and we had room for a bigger entry on disk, signal that we want to @@ -365,23 +376,38 @@ retry: } else { ret = journal_entry_open(j); } - +unlock: if ((ret == -EAGAIN || ret == -ENOSPC) && !j->res_get_blocked_start) j->res_get_blocked_start = local_clock() ?: 1; + can_discard = j->can_discard; spin_unlock(&j->lock); if (!ret) goto retry; + if (ret == -ENOSPC) { + BUG_ON(!can_discard && (flags & JOURNAL_RES_GET_RESERVED)); + /* * Journal is full - can't rely on reclaim from work item due to * freezing: */ trace_journal_full(c); - if (!(flags & JOURNAL_RES_GET_NONBLOCK)) - bch2_journal_reclaim_work(&j->reclaim_work.work); + + if (!(flags & JOURNAL_RES_GET_NONBLOCK)) { + if (can_discard) { + bch2_journal_do_discards(j); + goto retry; + } + + if (mutex_trylock(&j->reclaim_lock)) { + bch2_journal_reclaim(j); + mutex_unlock(&j->reclaim_lock); + } + } + ret = -EAGAIN; } @@ -409,6 +435,32 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, return ret; } +/* journal_preres: */ + +static bool journal_preres_available(struct journal *j, + struct journal_preres *res, + unsigned new_u64s) +{ + bool ret = bch2_journal_preres_get_fast(j, res, new_u64s); + + if (!ret) + bch2_journal_reclaim_work(&j->reclaim_work.work); + + return ret; +} + +int __bch2_journal_preres_get(struct journal *j, + struct journal_preres *res, + unsigned new_u64s) +{ + int ret; + + closure_wait_event(&j->preres_wait, + (ret = bch2_journal_error(j)) || + journal_preres_available(j, res, new_u64s)); + return ret; +} + /* journal_entry_res: */ void bch2_journal_entry_res_resize(struct journal *j, @@ -760,6 +812,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, while (ja->nr < nr) { struct open_bucket *ob = NULL; + unsigned pos; long bucket; if (new_fs) { @@ -786,21 +839,25 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, preempt_disable(); } - __array_insert_item(ja->buckets, ja->nr, ja->last_idx); - __array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx); - __array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx); - - ja->buckets[ja->last_idx] = bucket; - ja->bucket_seq[ja->last_idx] = 0; - journal_buckets->buckets[ja->last_idx] = cpu_to_le64(bucket); - - if (ja->last_idx < ja->nr) { - if (ja->cur_idx >= ja->last_idx) - ja->cur_idx++; - ja->last_idx++; - } + pos = ja->nr ? 
(ja->cur_idx + 1) % ja->nr : 0; + __array_insert_item(ja->buckets, ja->nr, pos); + __array_insert_item(ja->bucket_seq, ja->nr, pos); + __array_insert_item(journal_buckets->buckets, ja->nr, pos); ja->nr++; + ja->buckets[pos] = bucket; + ja->bucket_seq[pos] = 0; + journal_buckets->buckets[pos] = cpu_to_le64(bucket); + + if (pos <= ja->discard_idx) + ja->discard_idx = (ja->discard_idx + 1) % ja->nr; + if (pos <= ja->dirty_idx_ondisk) + ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; + if (pos <= ja->dirty_idx) + ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; + if (pos <= ja->cur_idx) + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL, ca->mi.bucket_size, gc_phase(GC_PHASE_SB), @@ -1039,6 +1096,7 @@ int bch2_fs_journal_init(struct journal *j) mutex_init(&j->blacklist_lock); INIT_LIST_HEAD(&j->seq_blacklist); mutex_init(&j->reclaim_lock); + mutex_init(&j->discard_lock); lockdep_init_map(&j->res_map, "journal res", &res_key, 0); @@ -1087,11 +1145,16 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) "seq:\t\t\t%llu\n" "last_seq:\t\t%llu\n" "last_seq_ondisk:\t%llu\n" + "prereserved:\t\t%u/%u\n" + "current entry sectors:\t%u\n" "current entry:\t\t", fifo_used(&j->pin), journal_cur_seq(j), journal_last_seq(j), - j->last_seq_ondisk); + j->last_seq_ondisk, + j->prereserved.reserved, + j->prereserved.remaining, + j->cur_entry_sectors); switch (s.cur_entry_offset) { case JOURNAL_ENTRY_ERROR_VAL: @@ -1113,8 +1176,9 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) journal_state_count(s, s.idx)); if (s.prev_buf_unwritten) - pr_buf(&out, "yes, ref %u\n", - journal_state_count(s, !s.idx)); + pr_buf(&out, "yes, ref %u sectors %u\n", + journal_state_count(s, !s.idx), + journal_prev_buf(j)->sectors); else pr_buf(&out, "no\n"); @@ -1135,13 +1199,17 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) "dev %u:\n" "\tnr\t\t%u\n" "\tavailable\t%u:%u\n" - "\tcur_idx\t\t%u (seq %llu)\n" - "\tlast_idx\t%u (seq %llu)\n", + "\tdiscard_idx\t\t%u\n" + "\tdirty_idx_ondisk\t%u (seq %llu)\n" + "\tdirty_idx\t\t%u (seq %llu)\n" + "\tcur_idx\t\t%u (seq %llu)\n", iter, ja->nr, - bch2_journal_dev_buckets_available(j, ja), + bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free, - ja->cur_idx, ja->bucket_seq[ja->cur_idx], - ja->last_idx, ja->bucket_seq[ja->last_idx]); + ja->discard_idx, + ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk], + ja->dirty_idx, ja->bucket_seq[ja->dirty_idx], + ja->cur_idx, ja->bucket_seq[ja->cur_idx]); } spin_unlock(&j->lock); diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 71929bd6..83b70d51 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -118,6 +118,7 @@ static inline void journal_wake(struct journal *j) { wake_up(&j->wait); closure_wake_up(&j->async_wait); + closure_wake_up(&j->preres_wait); } static inline struct journal_buf *journal_cur_buf(struct journal *j) @@ -271,6 +272,7 @@ int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, #define JOURNAL_RES_GET_NONBLOCK (1 << 0) #define JOURNAL_RES_GET_CHECK (1 << 1) +#define JOURNAL_RES_GET_RESERVED (1 << 2) static inline int journal_res_get_fast(struct journal *j, struct journal_res *res, @@ -291,6 +293,10 @@ static inline int journal_res_get_fast(struct journal *j, EBUG_ON(!journal_state_count(new, new.idx)); + if (!(flags & JOURNAL_RES_GET_RESERVED) && + !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) + return 0; + if (flags & 
JOURNAL_RES_GET_CHECK) return 1; @@ -330,6 +336,89 @@ out: return 0; } +/* journal_preres: */ + +static inline bool journal_check_may_get_unreserved(struct journal *j) +{ + union journal_preres_state s = READ_ONCE(j->prereserved); + bool ret = s.reserved <= s.remaining && + fifo_free(&j->pin) > 8; + + lockdep_assert_held(&j->lock); + + if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { + if (ret) { + set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); + journal_wake(j); + } else { + clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); + } + } + return ret; +} + +static inline void bch2_journal_preres_put(struct journal *j, + struct journal_preres *res) +{ + union journal_preres_state s = { .reserved = res->u64s }; + + if (!res->u64s) + return; + + s.v = atomic64_sub_return(s.v, &j->prereserved.counter); + res->u64s = 0; + closure_wake_up(&j->preres_wait); + + if (s.reserved <= s.remaining && + !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { + spin_lock(&j->lock); + journal_check_may_get_unreserved(j); + spin_unlock(&j->lock); + } +} + +int __bch2_journal_preres_get(struct journal *, + struct journal_preres *, unsigned); + +static inline int bch2_journal_preres_get_fast(struct journal *j, + struct journal_preres *res, + unsigned new_u64s) +{ + int d = new_u64s - res->u64s; + union journal_preres_state old, new; + u64 v = atomic64_read(&j->prereserved.counter); + + do { + old.v = new.v = v; + + new.reserved += d; + + if (new.reserved > new.remaining) + return 0; + } while ((v = atomic64_cmpxchg(&j->prereserved.counter, + old.v, new.v)) != old.v); + + res->u64s += d; + return 1; +} + +static inline int bch2_journal_preres_get(struct journal *j, + struct journal_preres *res, + unsigned new_u64s, + unsigned flags) +{ + if (new_u64s <= res->u64s) + return 0; + + if (bch2_journal_preres_get_fast(j, res, new_u64s)) + return 0; + + if (flags & JOURNAL_RES_GET_NONBLOCK) + return -EAGAIN; + + return __bch2_journal_preres_get(j, res, new_u64s); +} + /* journal_entry_res: */ void bch2_journal_entry_res_resize(struct journal *, diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 16cb6be8..b7c52b64 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -625,11 +625,12 @@ static void bch2_journal_read_device(struct closure *cl) ja->sectors_free = 0; /* - * Set last_idx to indicate the entire journal is full and needs to be + * Set dirty_idx to indicate the entire journal is full and needs to be * reclaimed - journal reclaim will immediately reclaim whatever isn't * pinned when it first runs: */ - ja->last_idx = (ja->cur_idx + 1) % ja->nr; + ja->discard_idx = ja->dirty_idx_ondisk = + ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; out: kvpfree(buf.data, buf.size); percpu_ref_put(&ca->io_ref); @@ -969,9 +970,16 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, if (sectors > ja->sectors_free && sectors <= ca->mi.bucket_size && - bch2_journal_dev_buckets_available(j, ja)) { + bch2_journal_dev_buckets_available(j, ja, + journal_space_discarded)) { ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ja->sectors_free = ca->mi.bucket_size; + + /* + * ja->bucket_seq[ja->cur_idx] must always have + * something sensible: + */ + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); } } @@ -1069,12 +1077,13 @@ static void journal_write_done(struct closure *cl) goto err; spin_lock(&j->lock); - j->seq_ondisk = seq; - j->last_seq_ondisk = last_seq; - if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = devs; + j->seq_ondisk = seq; + j->last_seq_ondisk = 
last_seq; + bch2_journal_space_available(j); + /* * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard * more buckets: diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index b8603a1f..eb989462 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -8,47 +8,72 @@ /* Free space calculations: */ +static unsigned journal_space_from(struct journal_device *ja, + enum journal_space_from from) +{ + switch (from) { + case journal_space_discarded: + return ja->discard_idx; + case journal_space_clean_ondisk: + return ja->dirty_idx_ondisk; + case journal_space_clean: + return ja->dirty_idx; + default: + BUG(); + } +} + unsigned bch2_journal_dev_buckets_available(struct journal *j, - struct journal_device *ja) + struct journal_device *ja, + enum journal_space_from from) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned next = (ja->cur_idx + 1) % ja->nr; - unsigned available = (ja->last_idx + ja->nr - next) % ja->nr; + unsigned available = (journal_space_from(ja, from) - + ja->cur_idx - 1 + ja->nr) % ja->nr; /* * Allocator startup needs some journal space before we can do journal * replay: */ - if (available && - test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) - available--; + if (available && test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) + --available; /* * Don't use the last bucket unless writing the new last_seq * will make another bucket available: */ - if (available && - journal_last_seq(j) <= ja->bucket_seq[ja->last_idx]) + if (available && ja->dirty_idx_ondisk == ja->dirty_idx) --available; return available; } -void bch2_journal_space_available(struct journal *j) +static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) +{ + union journal_preres_state old, new; + u64 v = atomic64_read(&j->prereserved.counter); + + do { + old.v = new.v = v; + new.remaining = u64s_remaining; + } while ((v = atomic64_cmpxchg(&j->prereserved.counter, + old.v, new.v)) != old.v); +} + +static struct journal_space { + unsigned next_entry; + unsigned remaining; +} __journal_space_available(struct journal *j, unsigned nr_devs_want, + enum journal_space_from from) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; unsigned sectors_next_entry = UINT_MAX; unsigned sectors_total = UINT_MAX; - unsigned max_entry_size = min(j->buf[0].buf_size >> 9, - j->buf[1].buf_size >> 9); - unsigned i, nr_online = 0, nr_devs = 0; + unsigned i, nr_devs = 0; unsigned unwritten_sectors = j->reservations.prev_buf_unwritten ? 
journal_prev_buf(j)->sectors : 0; - int ret = 0; - - lockdep_assert_held(&j->lock); rcu_read_lock(); for_each_member_device_rcu(ca, c, i, @@ -59,9 +84,7 @@ void bch2_journal_space_available(struct journal *j) if (!ja->nr) continue; - nr_online++; - - buckets_this_device = bch2_journal_dev_buckets_available(j, ja); + buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from); sectors_this_device = ja->sectors_free; /* @@ -94,28 +117,88 @@ void bch2_journal_space_available(struct journal *j) buckets_this_device * ca->mi.bucket_size + sectors_this_device); - max_entry_size = min_t(unsigned, max_entry_size, - ca->mi.bucket_size); - nr_devs++; } rcu_read_unlock(); + if (nr_devs < nr_devs_want) + return (struct journal_space) { 0, 0 }; + + return (struct journal_space) { + .next_entry = sectors_next_entry, + .remaining = max_t(int, 0, sectors_total - sectors_next_entry), + }; +} + +void bch2_journal_space_available(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct journal_space discarded, clean_ondisk, clean; + unsigned overhead, u64s_remaining = 0; + unsigned max_entry_size = min(j->buf[0].buf_size >> 9, + j->buf[1].buf_size >> 9); + unsigned i, nr_online = 0, nr_devs_want; + bool can_discard = false; + int ret = 0; + + lockdep_assert_held(&j->lock); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_JOURNAL]) { + struct journal_device *ja = &ca->journal; + + if (!ja->nr) + continue; + + while (ja->dirty_idx != ja->cur_idx && + ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) + ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; + + while (ja->dirty_idx_ondisk != ja->dirty_idx && + ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) + ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; + + if (ja->discard_idx != ja->dirty_idx_ondisk) + can_discard = true; + + max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); + nr_online++; + } + rcu_read_unlock(); + + j->can_discard = can_discard; + if (nr_online < c->opts.metadata_replicas_required) { ret = -EROFS; - sectors_next_entry = 0; - } else if (!sectors_next_entry || - nr_devs < min_t(unsigned, nr_online, - c->opts.metadata_replicas)) { - ret = -ENOSPC; - sectors_next_entry = 0; - } else if (!fifo_free(&j->pin)) { - ret = -ENOSPC; - sectors_next_entry = 0; + goto out; } - j->cur_entry_sectors = sectors_next_entry; + if (!fifo_free(&j->pin)) { + ret = -ENOSPC; + goto out; + } + + nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); + + discarded = __journal_space_available(j, nr_devs_want, journal_space_discarded); + clean_ondisk = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk); + clean = __journal_space_available(j, nr_devs_want, journal_space_clean); + + if (!discarded.next_entry) + ret = -ENOSPC; + + overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) * + journal_entry_overhead(j); + u64s_remaining = clean.remaining << 6; + u64s_remaining = max_t(int, 0, u64s_remaining - overhead); + u64s_remaining /= 4; +out: + j->cur_entry_sectors = !ret ? 
discarded.next_entry : 0; j->cur_entry_error = ret; + journal_set_remaining(j, u64s_remaining); + journal_check_may_get_unreserved(j); if (!ret) journal_wake(j); @@ -128,25 +211,23 @@ static bool should_discard_bucket(struct journal *j, struct journal_device *ja) bool ret; spin_lock(&j->lock); - ret = ja->nr && - ja->last_idx != ja->cur_idx && - ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk; + ret = ja->discard_idx != ja->dirty_idx_ondisk; spin_unlock(&j->lock); return ret; } /* - * Advance ja->last_idx as long as it points to buckets that are no longer + * Advance ja->discard_idx as long as it points to buckets that are no longer * dirty, issuing discards if necessary: */ -static void journal_do_discards(struct journal *j) +void bch2_journal_do_discards(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; unsigned iter; - mutex_lock(&j->reclaim_lock); + mutex_lock(&j->discard_lock); for_each_rw_member(ca, c, iter) { struct journal_device *ja = &ca->journal; @@ -156,18 +237,18 @@ static void journal_do_discards(struct journal *j) blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, - ja->buckets[ja->last_idx]), + ja->buckets[ja->discard_idx]), ca->mi.bucket_size, GFP_NOIO, 0); spin_lock(&j->lock); - ja->last_idx = (ja->last_idx + 1) % ja->nr; + ja->discard_idx = (ja->discard_idx + 1) % ja->nr; bch2_journal_space_available(j); spin_unlock(&j->lock); } } - mutex_unlock(&j->reclaim_lock); + mutex_unlock(&j->discard_lock); } /* @@ -372,7 +453,7 @@ static void journal_flush_pins(struct journal *j, u64 seq_to_flush, } /** - * bch2_journal_reclaim_work - free up journal buckets + * bch2_journal_reclaim - free up journal buckets * * Background journal reclaim writes out btree nodes. It should be run * early enough so that we never completely run out of journal buckets. @@ -389,29 +470,37 @@ static void journal_flush_pins(struct journal *j, u64 seq_to_flush, * 512 journal entries or 25% of all journal buckets, then * journal_next_bucket() should not stall. 
*/ -void bch2_journal_reclaim_work(struct work_struct *work) +void bch2_journal_reclaim(struct journal *j) { - struct bch_fs *c = container_of(to_delayed_work(work), - struct bch_fs, journal.reclaim_work); - struct journal *j = &c->journal; + struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; - unsigned iter, bucket_to_flush, min_nr = 0; + unsigned iter, min_nr = 0; u64 seq_to_flush = 0; - journal_do_discards(j); + lockdep_assert_held(&j->reclaim_lock); + + bch2_journal_do_discards(j); - mutex_lock(&j->reclaim_lock); spin_lock(&j->lock); for_each_rw_member(ca, c, iter) { struct journal_device *ja = &ca->journal; + unsigned nr_buckets, bucket_to_flush; if (!ja->nr) continue; - /* Try to keep the journal at most half full: */ - bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr; + nr_buckets = ja->nr / 2; + + /* And include pre-reservations: */ + nr_buckets += DIV_ROUND_UP(j->prereserved.reserved, + (ca->mi.bucket_size << 6) - + journal_entry_overhead(j)); + + nr_buckets = min(nr_buckets, ja->nr); + + bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; seq_to_flush = max_t(u64, seq_to_flush, ja->bucket_seq[bucket_to_flush]); } @@ -430,15 +519,26 @@ void bch2_journal_reclaim_work(struct work_struct *work) msecs_to_jiffies(j->reclaim_delay_ms))) min_nr = 1; - journal_flush_pins(j, seq_to_flush, min_nr); + if (j->prereserved.reserved * 2 > j->prereserved.remaining) + min_nr = 1; - mutex_unlock(&j->reclaim_lock); + journal_flush_pins(j, seq_to_flush, min_nr); if (!test_bit(BCH_FS_RO, &c->flags)) queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, msecs_to_jiffies(j->reclaim_delay_ms)); } +void bch2_journal_reclaim_work(struct work_struct *work) +{ + struct journal *j = container_of(to_delayed_work(work), + struct journal, reclaim_work); + + mutex_lock(&j->reclaim_lock); + bch2_journal_reclaim(j); + mutex_unlock(&j->reclaim_lock); +} + static int journal_flush_done(struct journal *j, u64 seq_to_flush) { int ret; diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h index 1d688d6f..7ecfc814 100644 --- a/libbcachefs/journal_reclaim.h +++ b/libbcachefs/journal_reclaim.h @@ -3,8 +3,15 @@ #define JOURNAL_PIN (32 * 1024) +enum journal_space_from { + journal_space_discarded, + journal_space_clean_ondisk, + journal_space_clean, +}; + unsigned bch2_journal_dev_buckets_available(struct journal *, - struct journal_device *); + struct journal_device *, + enum journal_space_from); void bch2_journal_space_available(struct journal *); static inline bool journal_pin_active(struct journal_entry_pin *pin) @@ -33,6 +40,8 @@ void bch2_journal_pin_add_if_older(struct journal *, journal_pin_flush_fn); void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); +void bch2_journal_do_discards(struct journal *); +void bch2_journal_reclaim(struct journal *); void bch2_journal_reclaim_work(struct work_struct *); void bch2_journal_flush_pins(struct journal *, u64); diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 8772e53f..4685cf67 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -79,6 +79,14 @@ struct journal_res { u64 seq; }; +/* + * For reserving space in the journal prior to getting a reservation on a + * particular journal entry: + */ +struct journal_preres { + unsigned u64s; +}; + union journal_res_state { struct { atomic64_t counter; @@ -97,6 +105,21 @@ union journal_res_state { }; }; +union journal_preres_state { + struct { + atomic64_t counter; + }; + + struct { + u64 v; + }; + 
+ struct { + u32 reserved; + u32 remaining; + }; +}; + /* bytes: */ #define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ #define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ @@ -121,6 +144,7 @@ enum { JOURNAL_STARTED, JOURNAL_NEED_WRITE, JOURNAL_NOT_EMPTY, + JOURNAL_MAY_GET_UNRESERVED, }; /* Embedded in struct bch_fs */ @@ -141,6 +165,8 @@ struct journal { */ int cur_entry_error; + union journal_preres_state prereserved; + /* Reserved space in journal entry to be used just prior to write */ unsigned entry_u64s_reserved; @@ -160,6 +186,7 @@ struct journal { /* Used when waiting because the journal was full */ wait_queue_head_t wait; struct closure_waitlist async_wait; + struct closure_waitlist preres_wait; struct closure io; struct delayed_work write_work; @@ -192,9 +219,6 @@ struct journal { struct journal_entry_pin_list *data; } pin; - struct journal_entry_pin *flush_in_progress; - wait_queue_head_t pin_flush_wait; - u64 replay_journal_seq; struct mutex blacklist_lock; @@ -205,10 +229,15 @@ struct journal { spinlock_t err_lock; struct delayed_work reclaim_work; - unsigned long last_flushed; - - /* protects advancing ja->last_idx: */ struct mutex reclaim_lock; + unsigned long last_flushed; + struct journal_entry_pin *flush_in_progress; + wait_queue_head_t pin_flush_wait; + + /* protects advancing ja->discard_idx: */ + struct mutex discard_lock; + bool can_discard; + unsigned write_delay_ms; unsigned reclaim_delay_ms; @@ -239,17 +268,15 @@ struct journal_device { unsigned sectors_free; - /* Journal bucket we're currently writing to */ - unsigned cur_idx; - - /* Last journal bucket that still contains an open journal entry */ - /* - * j->lock and j->reclaim_lock must both be held to modify, j->lock - * sufficient to read: + * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx: */ - unsigned last_idx; + unsigned discard_idx; /* Next bucket to discard */ + unsigned dirty_idx_ondisk; + unsigned dirty_idx; + unsigned cur_idx; /* Journal bucket we're currently writing to */ unsigned nr; + u64 *buckets; /* Bio for journal reads/writes to this device */
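
---

Note (not part of the patch): the headline change above turns checked_add() in libbcachefs/buckets.c from a BUG_ON() into a saturating add that reports overflow back to the caller, which then raises a filesystem-inconsistency error instead of crashing. A minimal standalone sketch of that saturating-add pattern, using hypothetical local names rather than the in-tree macro:

```c
/*
 * Illustrative sketch only: a saturating 16-bit add that reports
 * overflow instead of crashing, mirroring the new checked_add()
 * semantics introduced by this patch.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper: clamp the sum to UINT16_MAX, return whether it overflowed. */
static bool saturating_add_u16(uint16_t *dst, unsigned amount)
{
	unsigned res = (unsigned) *dst + amount;
	bool overflow = res > UINT16_MAX;

	if (overflow)
		res = UINT16_MAX;
	*dst = (uint16_t) res;
	return overflow;
}

int main(void)
{
	uint16_t dirty_sectors = 65000;

	/* The caller decides how to report overflow; the patch uses
	 * bch2_fs_inconsistent_on() rather than BUG_ON(). */
	if (saturating_add_u16(&dirty_sectors, 2048))
		fprintf(stderr, "bucket sector count overflow: clamped to %u\n",
			dirty_sectors);
	return 0;
}
```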
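Note (not part of the patch): the new journal_preres machinery reserves journal space up front by racing a cmpxchg loop over a packed {reserved, remaining} counter, falling back to a slow path (and kicking journal reclaim) when reserved would exceed remaining. A rough userspace sketch of that reserve/release pattern, reduced to C11 atomics — the field layout and helper names here are assumptions for illustration, not the in-tree API:

```c
/*
 * Illustrative sketch only: the pre-reservation pattern behind
 * bch2_journal_preres_get_fast()/bch2_journal_preres_put(),
 * simplified to a single packed 64-bit counter.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* Assumed layout: low 32 bits = reserved u64s, high 32 bits = remaining u64s. */
static _Atomic uint64_t prereserved;

static bool preres_get_fast(unsigned u64s)
{
	uint64_t old = atomic_load(&prereserved), new;

	do {
		uint32_t reserved  = (uint32_t) old + u64s;
		uint32_t remaining = (uint32_t) (old >> 32);

		if (reserved > remaining)
			return false;	/* caller falls back to the blocking path */

		new = ((uint64_t) remaining << 32) | reserved;
	} while (!atomic_compare_exchange_weak(&prereserved, &old, new));

	return true;
}

static void preres_put(unsigned u64s)
{
	/* Releasing only touches the low (reserved) field. */
	atomic_fetch_sub(&prereserved, (uint64_t) u64s);
}
```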