Mirror of https://github.com/koverstreet/bcachefs-tools.git (synced 2025-02-22 00:00:03 +03:00)
Update bcachefs sources to 43a464c9dd bcachefs: Don't BUG_ON() on bucket sector count overflow
parent 70bb5ab7a8
commit fd67296247
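
The headline change is in the bucket-marking hunks below: the checked_add() macro that BUG_ON()'d when a bucket's sector count exceeded U16_MAX becomes an expression that saturates and hands the overflow back to its callers, which report it as a filesystem inconsistency instead of crashing. A minimal standalone sketch of that pattern (the helper name and the program around it are illustrative, not the bcachefs code):

/*
 * Sketch of the saturating-add pattern adopted below: clamp the bucket
 * sector count at U16_MAX and return whether it overflowed, rather than
 * BUG_ON()ing.  checked_add_u16() is an illustrative name only.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool checked_add_u16(uint16_t *dst, unsigned amount)
{
	unsigned res = (unsigned) *dst + amount;
	bool overflow = res > UINT16_MAX;

	if (overflow)
		res = UINT16_MAX;
	*dst = (uint16_t) res;
	return overflow;
}

int main(void)
{
	uint16_t dirty_sectors = 65000;

	if (checked_add_u16(&dirty_sectors, 1000))
		fprintf(stderr, "bucket sector count overflow (clamped to %u)\n",
			(unsigned) dirty_sectors);
	return 0;
}
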
@@ -1 +1 @@
a5e71b82006fdf563190c41955c2b462854af610
43a464c9dd38b50c1a89845366f838fe70fbb743
@@ -959,6 +959,7 @@ retry:
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE|
BTREE_INSERT_JOURNAL_RESERVED|
flags,
BTREE_INSERT_ENTRY(iter, &a->k_i));
if (ret == -EINTR)
@@ -1610,7 +1610,7 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
prefetch(c->btree_roots[btree_id].b);
}
void bch2_btree_iter_unlink(struct btree_iter *iter)
static void bch2_btree_iter_unlink(struct btree_iter *iter)
{
struct btree_iter *linked;
@@ -1629,7 +1629,7 @@ void bch2_btree_iter_unlink(struct btree_iter *iter)
BUG();
}
void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
static void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
{
BUG_ON(btree_iter_linked(new));
@@ -105,6 +105,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
unsigned, unsigned);
int bch2_btree_iter_unlock(struct btree_iter *);
bool bch2_btree_iter_relock(struct btree_iter *);
bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned);
bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned);
@@ -164,8 +165,6 @@ static inline void bch2_btree_iter_init(struct btree_iter *iter,
? BTREE_ITER_IS_EXTENTS : 0)|flags);
}
void bch2_btree_iter_link(struct btree_iter *, struct btree_iter *);
void bch2_btree_iter_unlink(struct btree_iter *);
void bch2_btree_iter_copy(struct btree_iter *, struct btree_iter *);
static inline struct bpos btree_type_successor(enum btree_id id,
@@ -203,8 +203,6 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter,
__bch2_btree_node_relock(iter, level);
}
bool bch2_btree_iter_relock(struct btree_iter *);
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
@@ -245,10 +245,11 @@ struct btree_iter {
#define BTREE_ITER_MAX 8
struct deferred_update {
struct journal_preres res;
struct journal_entry_pin journal;
spinlock_t lock;
unsigned gen;
unsigned dirty:1;
u8 allocated_u64s;
enum btree_id btree_id;
@@ -26,6 +26,7 @@ struct btree_insert {
struct bch_fs *c;
struct disk_reservation *disk_res;
struct journal_res journal_res;
struct journal_preres journal_preres;
u64 *journal_seq;
unsigned flags;
bool did_work;
@@ -81,6 +82,7 @@ enum {
__BTREE_INSERT_USE_RESERVE,
__BTREE_INSERT_USE_ALLOC_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
__BTREE_INSERT_JOURNAL_RESERVED,
__BTREE_INSERT_NOMARK,
__BTREE_INSERT_NOWAIT,
__BTREE_INSERT_GC_LOCK_HELD,
@@ -111,6 +113,8 @@ enum {
/* Insert is for journal replay - don't get journal reservations: */
#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY)
#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED)
/* Don't call bch2_mark_key: */
#define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK)
@@ -17,6 +17,9 @@
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
static bool btree_trans_relock(struct btree_insert *);
static void btree_trans_unlock(struct btree_insert *);
/* Inserting into a given leaf node (last stage of insert): */
/* Handle overwrites and do insert, for non extents: */
@@ -239,15 +242,15 @@ btree_insert_key_leaf(struct btree_insert *trans,
/* Deferred btree updates: */
static void deferred_update_flush(struct journal *j,
struct journal_entry_pin *pin,
u64 seq)
struct journal_entry_pin *pin,
u64 seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct deferred_update *d =
container_of(pin, struct deferred_update, journal);
struct journal_preres res = { 0 };
u64 tmp[32];
struct bkey_i *k = (void *) tmp;
unsigned gen;
int ret;
if (d->allocated_u64s > ARRAY_SIZE(tmp)) {
@@ -257,26 +260,32 @@ static void deferred_update_flush(struct journal *j,
}
spin_lock(&d->lock);
gen = d->gen;
if (d->dirty) {
BUG_ON(jset_u64s(d->k.k.u64s) > d->res.u64s);
swap(res, d->res);
if (journal_pin_active(&d->journal)) {
BUG_ON(d->k.k.u64s > d->allocated_u64s);
bkey_copy(k, &d->k);
bkey_copy(k, &d->k);
d->dirty = false;
spin_unlock(&d->lock);
ret = bch2_btree_insert(c, d->btree_id, k, NULL, NULL,
BTREE_INSERT_NOFAIL);
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_JOURNAL_RESERVED);
bch2_fs_fatal_err_on(ret && !bch2_journal_error(j),
c, "error flushing deferred btree update: %i", ret);
c, "error flushing deferred btree update: %i", ret);
spin_lock(&d->lock);
}
if (gen == d->gen)
if (!d->dirty)
bch2_journal_pin_drop(j, &d->journal);
spin_unlock(&d->lock);
bch2_journal_preres_put(j, &res);
if (k != (void *) tmp)
kfree(k);
}
@@ -288,6 +297,7 @@ btree_insert_key_deferred(struct btree_insert *trans,
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct deferred_update *d = insert->d;
int difference;
BUG_ON(trans->flags & BTREE_INSERT_JOURNAL_REPLAY);
BUG_ON(insert->k->u64s > d->allocated_u64s);
@@ -295,12 +305,21 @@ btree_insert_key_deferred(struct btree_insert *trans,
__btree_journal_key(trans, d->btree_id, insert->k);
spin_lock(&d->lock);
d->gen++;
BUG_ON(jset_u64s(insert->k->u64s) >
trans->journal_preres.u64s);
difference = jset_u64s(insert->k->u64s) - d->res.u64s;
if (difference > 0) {
trans->journal_preres.u64s -= difference;
d->res.u64s += difference;
}
bkey_copy(&d->k, insert->k);
spin_unlock(&d->lock);
d->dirty = true;
bch2_journal_pin_update(j, trans->journal_res.seq, &d->journal,
deferred_update_flush);
spin_unlock(&d->lock);
return BTREE_INSERT_OK;
}
@@ -519,13 +538,16 @@ retry:
}
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
unsigned flags = (trans->flags & BTREE_INSERT_JOURNAL_RESERVED)
? JOURNAL_RES_GET_RESERVED : 0;
u64s = 0;
trans_for_each_entry(trans, i)
u64s += jset_u64s(i->k->k.u64s);
ret = bch2_journal_res_get(&c->journal,
&trans->journal_res, u64s,
JOURNAL_RES_GET_NONBLOCK);
flags|JOURNAL_RES_GET_NONBLOCK);
if (likely(!ret))
goto got_journal_res;
if (ret != -EAGAIN)
@@ -536,7 +558,7 @@ retry:
ret = bch2_journal_res_get(&c->journal,
&trans->journal_res, u64s,
JOURNAL_RES_GET_CHECK);
flags|JOURNAL_RES_GET_CHECK);
if (ret)
return ret;
@@ -586,6 +608,10 @@ got_journal_res:
}
}
out:
BUG_ON(ret &&
(trans->flags & BTREE_INSERT_JOURNAL_RESERVED) &&
trans->journal_res.ref);
multi_unlock_write(trans);
bch2_journal_res_put(&c->journal, &trans->journal_res);
@@ -627,7 +653,7 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
struct btree_iter *linked;
unsigned flags;
unsigned flags, u64s = 0;
int ret;
BUG_ON(!trans->nr);
@@ -638,11 +664,39 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
lockdep_assert_held(&c->gc_lock);
memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
trans_for_each_entry(trans, i)
btree_insert_entry_checks(c, i);
trans_for_each_entry(trans, i)
if (i->deferred)
u64s += jset_u64s(i->k->k.u64s);
if (u64s) {
ret = bch2_journal_preres_get(&c->journal,
&trans->journal_preres, u64s,
JOURNAL_RES_GET_NONBLOCK);
if (!ret)
goto got_journal_preres;
if (ret != -EAGAIN)
return ret;
btree_trans_unlock(trans);
ret = bch2_journal_preres_get(&c->journal,
&trans->journal_preres, u64s, 0);
if (ret)
return ret;
if (!btree_trans_relock(trans)) {
trans_restart(" (iter relock after journal preres get blocked)");
bch2_journal_preres_put(&c->journal, &trans->journal_preres);
return -EINTR;
}
}
got_journal_preres:
if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
!percpu_ref_tryget(&c->writes)))
return -EROFS;
@@ -674,6 +728,8 @@ retry:
trans_for_each_iter(trans, i)
bch2_btree_iter_downgrade(i->iter);
out:
bch2_journal_preres_put(&c->journal, &trans->journal_preres);
if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
percpu_ref_put(&c->writes);
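
The deferred-update hunks above add a path where committing a key only journals it and caches it in memory; a journal pin's flush callback later writes the newest cached copy to the btree, using journal space that was pre-reserved at commit time. A much-simplified sketch of that flow (struct deferred and the helpers below are stand-ins, not the bcachefs structures):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct deferred {
	pthread_mutex_t lock;
	bool dirty;
	char key[64];			/* stand-in for the cached bkey */
};

/* Commit path: journal the key (not shown), cache it, mark it dirty. */
static void deferred_commit(struct deferred *d, const char *key)
{
	pthread_mutex_lock(&d->lock);
	snprintf(d->key, sizeof(d->key), "%s", key);
	d->dirty = true;
	pthread_mutex_unlock(&d->lock);
	/* ...then the journal pin is updated so the flush runs eventually */
}

/* Journal-pin flush callback: write the latest cached key to the btree. */
static void deferred_flush(struct deferred *d, void (*btree_insert)(const char *))
{
	char copy[64];
	bool dirty;

	pthread_mutex_lock(&d->lock);
	dirty = d->dirty;
	if (dirty) {
		memcpy(copy, d->key, sizeof(copy));
		d->dirty = false;
	}
	pthread_mutex_unlock(&d->lock);

	if (dirty)
		btree_insert(copy);	/* consumes the pre-reserved space */
	/* once nothing is dirty, the journal pin is dropped */
}

static void print_insert(const char *key)
{
	printf("btree insert: %s\n", key);
}

int main(void)
{
	struct deferred d = { .lock = PTHREAD_MUTEX_INITIALIZER };

	deferred_commit(&d, "inode 4096: i_sectors += 8");
	deferred_flush(&d, print_insert);
	return 0;
}
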
@@ -536,11 +536,14 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
}
#define checked_add(a, b) \
do { \
({ \
unsigned _res = (unsigned) (a) + (b); \
bool overflow = _res > U16_MAX; \
if (overflow) \
_res = U16_MAX; \
(a) = _res; \
BUG_ON((a) != _res); \
} while (0)
overflow; \
})
static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, enum bch_data_type type,
@@ -548,17 +551,25 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
{
struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark new;
struct bucket_mark old, new;
bool overflow;
BUG_ON(type != BCH_DATA_SB &&
type != BCH_DATA_JOURNAL);
bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
old = bucket_cmpxchg(g, new, ({
new.dirty = true;
new.data_type = type;
checked_add(new.dirty_sectors, sectors);
overflow = checked_add(new.dirty_sectors, sectors);
}));
bch2_fs_inconsistent_on(overflow, c,
"bucket sector count overflow: %u + %u > U16_MAX",
old.dirty_sectors, sectors);
if (c)
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
return 0;
}
@@ -574,19 +585,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags,
ca, b, type, sectors);
} else {
struct bucket *g;
struct bucket_mark new;
rcu_read_lock();
g = bucket(ca, b);
bucket_cmpxchg(g, new, ({
new.dirty = true;
new.data_type = type;
checked_add(new.dirty_sectors, sectors);
}));
rcu_read_unlock();
__bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0);
}
}
@@ -627,6 +626,7 @@ static bool bch2_mark_pointer(struct bch_fs *c,
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
size_t b = PTR_BUCKET_NR(ca, &p.ptr);
struct bucket *g = __bucket(ca, b, gc);
bool overflow;
u64 v;
v = atomic64_read(&g->_mark.v);
@@ -648,9 +648,9 @@ static bool bch2_mark_pointer(struct bch_fs *c,
}
if (!p.ptr.cached)
checked_add(new.dirty_sectors, sectors);
overflow = checked_add(new.dirty_sectors, sectors);
else
checked_add(new.cached_sectors, sectors);
overflow = checked_add(new.cached_sectors, sectors);
if (!new.dirty_sectors &&
!new.cached_sectors) {
@@ -672,6 +672,12 @@ static bool bch2_mark_pointer(struct bch_fs *c,
old.v.counter,
new.v.counter)) != old.v.counter);
bch2_fs_inconsistent_on(overflow, c,
"bucket sector count overflow: %u + %lli > U16_MAX",
!p.ptr.cached
? old.dirty_sectors
: old.cached_sectors, sectors);
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
BUG_ON(!gc && bucket_became_unavailable(old, new));
@@ -229,20 +229,19 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
/* normal i_size/i_sectors update machinery: */
static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter,
bool *allocating)
static int sum_sector_overwrites(struct btree_trans *trans,
struct btree_iter *extent_iter,
struct bkey_i *new, bool *allocating,
s64 *i_sectors_delta)
{
struct btree_iter iter;
struct btree_iter *iter = bch2_trans_copy_iter(trans, extent_iter);
struct bkey_s_c old;
s64 delta = 0;
bch2_btree_iter_init(&iter, _iter->c, BTREE_ID_EXTENTS, POS_MIN,
BTREE_ITER_SLOTS);
if (IS_ERR(iter))
return PTR_ERR(iter);
bch2_btree_iter_link(_iter, &iter);
bch2_btree_iter_copy(&iter, _iter);
old = bch2_btree_iter_peek_slot(&iter);
old = bch2_btree_iter_peek_slot(iter);
while (1) {
/*
@@ -268,12 +267,13 @@ static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter,
if (bkey_cmp(old.k->p, new->k.p) >= 0)
break;
old = bch2_btree_iter_next_slot(&iter);
old = bch2_btree_iter_next_slot(iter);
}
bch2_btree_iter_unlink(&iter);
bch2_trans_iter_free(trans, iter);
return delta;
*i_sectors_delta = delta;
return 0;
}
static int bch2_extent_update(struct btree_trans *trans,
@@ -287,11 +287,11 @@ static int bch2_extent_update(struct btree_trans *trans,
bool direct,
s64 *total_delta)
{
struct btree_iter *inode_iter = NULL;
struct bch_inode_unpacked inode_u;
struct bkey_inode_buf inode_p;
bool allocating = false;
bool extended = false;
bool inode_locked = false;
s64 i_sectors_delta;
int ret;
@@ -303,7 +303,12 @@ static int bch2_extent_update(struct btree_trans *trans,
bch2_extent_trim_atomic(k, extent_iter);
i_sectors_delta = sum_sector_overwrites(k, extent_iter, &allocating);
ret = sum_sector_overwrites(trans, extent_iter,
k, &allocating,
&i_sectors_delta);
if (ret)
return ret;
if (!may_allocate && allocating)
return -ENOSPC;
@@ -314,16 +319,20 @@ static int bch2_extent_update(struct btree_trans *trans,
/* XXX: inode->i_size locking */
if (i_sectors_delta ||
new_i_size > inode->ei_inode.bi_size) {
inode_iter = bch2_trans_get_iter(trans,
BTREE_ID_INODES,
POS(k->k.p.inode, 0),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
if (IS_ERR(inode_iter))
return PTR_ERR(inode_iter);
bch2_btree_iter_unlock(extent_iter);
mutex_lock(&inode->ei_update_lock);
ret = bch2_btree_iter_traverse(inode_iter);
if (ret)
goto err;
if (!bch2_btree_iter_relock(extent_iter)) {
mutex_unlock(&inode->ei_update_lock);
return -EINTR;
}
inode_locked = true;
if (!inode->ei_inode_update)
inode->ei_inode_update =
bch2_deferred_update_alloc(trans->c,
BTREE_ID_INODES, 64);
inode_u = inode->ei_inode;
inode_u.bi_sectors += i_sectors_delta;
@@ -337,7 +346,8 @@ static int bch2_extent_update(struct btree_trans *trans,
bch2_inode_pack(&inode_p, &inode_u);
bch2_trans_update(trans,
BTREE_INSERT_ENTRY(inode_iter, &inode_p.inode.k_i));
BTREE_INSERT_DEFERRED(inode->ei_inode_update,
&inode_p.inode.k_i));
}
ret = bch2_trans_commit(trans, disk_res,
@@ -371,13 +381,15 @@ static int bch2_extent_update(struct btree_trans *trans,
if (total_delta)
*total_delta += i_sectors_delta;
err:
if (!IS_ERR_OR_NULL(inode_iter))
bch2_trans_iter_put(trans, inode_iter);
if (inode_locked)
mutex_unlock(&inode->ei_update_lock);
return ret;
}
static int bchfs_write_index_update(struct bch_write_op *wop)
{
struct bch_fs *c = wop->c;
struct bchfs_write_op *op = container_of(wop,
struct bchfs_write_op, op);
struct quota_res *quota_res = op->is_dio
@@ -392,7 +404,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
BUG_ON(k->k.p.inode != inode->v.i_ino);
bch2_trans_init(&trans, wop->c);
bch2_trans_init(&trans, c);
bch2_trans_preload_iters(&trans);
iter = bch2_trans_get_iter(&trans,
@@ -105,12 +105,18 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans,
inode_set_fn set,
void *p)
{
struct bch_fs *c = trans->c;
struct btree_iter *iter;
struct bkey_inode_buf *inode_p;
int ret;
lockdep_assert_held(&inode->ei_update_lock);
/* XXX: Don't do this with btree locks held */
if (!inode->ei_inode_update)
inode->ei_inode_update =
bch2_deferred_update_alloc(c, BTREE_ID_INODES, 64);
#if 0
iter = bch2_trans_get_iter(trans, BTREE_ID_INODES,
POS(inode->v.i_ino, 0),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
@@ -121,7 +127,7 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans,
ret = bch2_btree_iter_traverse(iter);
if (ret)
return ret;
#endif
*inode_u = inode->ei_inode;
if (set) {
@@ -135,7 +141,15 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans,
return PTR_ERR(inode_p);
bch2_inode_pack(inode_p, inode_u);
bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i));
if (!inode->ei_inode_update)
bch2_trans_update(trans,
BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i));
else
bch2_trans_update(trans,
BTREE_INSERT_DEFERRED(inode->ei_inode_update,
&inode_p->inode.k_i));
return 0;
}
@@ -1346,6 +1360,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
inode_init_once(&inode->v);
mutex_init(&inode->ei_update_lock);
mutex_init(&inode->ei_quota_lock);
inode->ei_inode_update = NULL;
inode->ei_journal_seq = 0;
return &inode->v;
@@ -1409,6 +1424,10 @@ static void bch2_evict_inode(struct inode *vinode)
BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
if (inode->ei_inode_update)
bch2_deferred_update_free(c, inode->ei_inode_update);
inode->ei_inode_update = NULL;
if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
KEY_TYPE_QUOTA_WARN);
@@ -13,6 +13,7 @@ struct bch_inode_info {
struct inode v;
struct mutex ei_update_lock;
struct deferred_update *ei_inode_update;
u64 ei_journal_seq;
u64 ei_quota_reserved;
unsigned long ei_last_dirtied;
@@ -322,6 +322,7 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf;
bool can_discard;
int ret;
retry:
if (journal_res_get_fast(j, res, flags))
@@ -342,6 +343,16 @@ retry:
return 0;
}
if (!(flags & JOURNAL_RES_GET_RESERVED) &&
!test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
/*
* Don't want to close current journal entry, just need to
* invoke reclaim:
*/
ret = -ENOSPC;
goto unlock;
}
/*
* If we couldn't get a reservation because the current buf filled up,
* and we had room for a bigger entry on disk, signal that we want to
@@ -365,23 +376,38 @@ retry:
} else {
ret = journal_entry_open(j);
}
unlock:
if ((ret == -EAGAIN || ret == -ENOSPC) &&
!j->res_get_blocked_start)
j->res_get_blocked_start = local_clock() ?: 1;
can_discard = j->can_discard;
spin_unlock(&j->lock);
if (!ret)
goto retry;
if (ret == -ENOSPC) {
BUG_ON(!can_discard && (flags & JOURNAL_RES_GET_RESERVED));
/*
* Journal is full - can't rely on reclaim from work item due to
* freezing:
*/
trace_journal_full(c);
if (!(flags & JOURNAL_RES_GET_NONBLOCK))
bch2_journal_reclaim_work(&j->reclaim_work.work);
if (!(flags & JOURNAL_RES_GET_NONBLOCK)) {
if (can_discard) {
bch2_journal_do_discards(j);
goto retry;
}
if (mutex_trylock(&j->reclaim_lock)) {
bch2_journal_reclaim(j);
mutex_unlock(&j->reclaim_lock);
}
}
ret = -EAGAIN;
}
@@ -409,6 +435,32 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
return ret;
}
/* journal_preres: */
static bool journal_preres_available(struct journal *j,
struct journal_preres *res,
unsigned new_u64s)
{
bool ret = bch2_journal_preres_get_fast(j, res, new_u64s);
if (!ret)
bch2_journal_reclaim_work(&j->reclaim_work.work);
return ret;
}
int __bch2_journal_preres_get(struct journal *j,
struct journal_preres *res,
unsigned new_u64s)
{
int ret;
closure_wait_event(&j->preres_wait,
(ret = bch2_journal_error(j)) ||
journal_preres_available(j, res, new_u64s));
return ret;
}
/* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *j,
@@ -760,6 +812,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
while (ja->nr < nr) {
struct open_bucket *ob = NULL;
unsigned pos;
long bucket;
if (new_fs) {
@@ -786,21 +839,25 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
preempt_disable();
}
__array_insert_item(ja->buckets, ja->nr, ja->last_idx);
__array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx);
__array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx);
ja->buckets[ja->last_idx] = bucket;
ja->bucket_seq[ja->last_idx] = 0;
journal_buckets->buckets[ja->last_idx] = cpu_to_le64(bucket);
if (ja->last_idx < ja->nr) {
if (ja->cur_idx >= ja->last_idx)
ja->cur_idx++;
ja->last_idx++;
}
pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0;
__array_insert_item(ja->buckets, ja->nr, pos);
__array_insert_item(ja->bucket_seq, ja->nr, pos);
__array_insert_item(journal_buckets->buckets, ja->nr, pos);
ja->nr++;
ja->buckets[pos] = bucket;
ja->bucket_seq[pos] = 0;
journal_buckets->buckets[pos] = cpu_to_le64(bucket);
if (pos <= ja->discard_idx)
ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
if (pos <= ja->dirty_idx_ondisk)
ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
if (pos <= ja->dirty_idx)
ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
if (pos <= ja->cur_idx)
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB),
@@ -1039,6 +1096,7 @@ int bch2_fs_journal_init(struct journal *j)
mutex_init(&j->blacklist_lock);
INIT_LIST_HEAD(&j->seq_blacklist);
mutex_init(&j->reclaim_lock);
mutex_init(&j->discard_lock);
lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
@@ -1087,11 +1145,16 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
"seq:\t\t\t%llu\n"
"last_seq:\t\t%llu\n"
"last_seq_ondisk:\t%llu\n"
"prereserved:\t\t%u/%u\n"
"current entry sectors:\t%u\n"
"current entry:\t\t",
fifo_used(&j->pin),
journal_cur_seq(j),
journal_last_seq(j),
j->last_seq_ondisk);
j->last_seq_ondisk,
j->prereserved.reserved,
j->prereserved.remaining,
j->cur_entry_sectors);
switch (s.cur_entry_offset) {
case JOURNAL_ENTRY_ERROR_VAL:
@@ -1113,8 +1176,9 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
journal_state_count(s, s.idx));
if (s.prev_buf_unwritten)
pr_buf(&out, "yes, ref %u\n",
journal_state_count(s, !s.idx));
pr_buf(&out, "yes, ref %u sectors %u\n",
journal_state_count(s, !s.idx),
journal_prev_buf(j)->sectors);
else
pr_buf(&out, "no\n");
@@ -1135,13 +1199,17 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
"dev %u:\n"
"\tnr\t\t%u\n"
"\tavailable\t%u:%u\n"
"\tcur_idx\t\t%u (seq %llu)\n"
"\tlast_idx\t%u (seq %llu)\n",
"\tdiscard_idx\t\t%u\n"
"\tdirty_idx_ondisk\t%u (seq %llu)\n"
"\tdirty_idx\t\t%u (seq %llu)\n"
"\tcur_idx\t\t%u (seq %llu)\n",
iter, ja->nr,
bch2_journal_dev_buckets_available(j, ja),
bch2_journal_dev_buckets_available(j, ja, journal_space_discarded),
ja->sectors_free,
ja->cur_idx, ja->bucket_seq[ja->cur_idx],
ja->last_idx, ja->bucket_seq[ja->last_idx]);
ja->discard_idx,
ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk],
ja->dirty_idx, ja->bucket_seq[ja->dirty_idx],
ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
}
spin_unlock(&j->lock);
@@ -118,6 +118,7 @@ static inline void journal_wake(struct journal *j)
{
wake_up(&j->wait);
closure_wake_up(&j->async_wait);
closure_wake_up(&j->preres_wait);
}
static inline struct journal_buf *journal_cur_buf(struct journal *j)
@@ -271,6 +272,7 @@ int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
#define JOURNAL_RES_GET_NONBLOCK (1 << 0)
#define JOURNAL_RES_GET_CHECK (1 << 1)
#define JOURNAL_RES_GET_RESERVED (1 << 2)
static inline int journal_res_get_fast(struct journal *j,
struct journal_res *res,
@@ -291,6 +293,10 @@ static inline int journal_res_get_fast(struct journal *j,
EBUG_ON(!journal_state_count(new, new.idx));
if (!(flags & JOURNAL_RES_GET_RESERVED) &&
!test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags))
return 0;
if (flags & JOURNAL_RES_GET_CHECK)
return 1;
@@ -330,6 +336,89 @@ out:
return 0;
}
/* journal_preres: */
static inline bool journal_check_may_get_unreserved(struct journal *j)
{
union journal_preres_state s = READ_ONCE(j->prereserved);
bool ret = s.reserved <= s.remaining &&
fifo_free(&j->pin) > 8;
lockdep_assert_held(&j->lock);
if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
if (ret) {
set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags);
journal_wake(j);
} else {
clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags);
}
}
return ret;
}
static inline void bch2_journal_preres_put(struct journal *j,
struct journal_preres *res)
{
union journal_preres_state s = { .reserved = res->u64s };
if (!res->u64s)
return;
s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
res->u64s = 0;
closure_wake_up(&j->preres_wait);
if (s.reserved <= s.remaining &&
!test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
spin_lock(&j->lock);
journal_check_may_get_unreserved(j);
spin_unlock(&j->lock);
}
}
int __bch2_journal_preres_get(struct journal *,
struct journal_preres *, unsigned);
static inline int bch2_journal_preres_get_fast(struct journal *j,
struct journal_preres *res,
unsigned new_u64s)
{
int d = new_u64s - res->u64s;
union journal_preres_state old, new;
u64 v = atomic64_read(&j->prereserved.counter);
do {
old.v = new.v = v;
new.reserved += d;
if (new.reserved > new.remaining)
return 0;
} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
old.v, new.v)) != old.v);
res->u64s += d;
return 1;
}
static inline int bch2_journal_preres_get(struct journal *j,
struct journal_preres *res,
unsigned new_u64s,
unsigned flags)
{
if (new_u64s <= res->u64s)
return 0;
if (bch2_journal_preres_get_fast(j, res, new_u64s))
return 0;
if (flags & JOURNAL_RES_GET_NONBLOCK)
return -EAGAIN;
return __bch2_journal_preres_get(j, res, new_u64s);
}
/* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *,
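
The journal pre-reservation helpers added above pack the "reserved" and "remaining" u64s counts into one 64-bit word, so a pre-reservation can be taken or released with a single compare-and-swap. A compact sketch of that scheme using C11 atomics (simplified names and types, not the bcachefs ones):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

union preres_state {
	uint64_t v;			/* whole word, for the cmpxchg */
	struct {
		uint32_t reserved;	/* u64s promised to future commits */
		uint32_t remaining;	/* u64s the journal can still absorb */
	};
};

static bool preres_get_fast(_Atomic uint64_t *state, uint32_t u64s)
{
	union preres_state old, new;

	old.v = atomic_load(state);
	do {
		new.v = old.v;
		new.reserved += u64s;
		if (new.reserved > new.remaining)
			return false;	/* caller must block or fall back */
	} while (!atomic_compare_exchange_weak(state, &old.v, new.v));

	return true;
}

int main(void)
{
	_Atomic uint64_t state;
	union preres_state s;

	s.reserved = 0;
	s.remaining = 512;
	atomic_store(&state, s.v);

	return preres_get_fast(&state, 32) ? 0 : 1;
}
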
@@ -625,11 +625,12 @@ static void bch2_journal_read_device(struct closure *cl)
ja->sectors_free = 0;
/*
* Set last_idx to indicate the entire journal is full and needs to be
* Set dirty_idx to indicate the entire journal is full and needs to be
* reclaimed - journal reclaim will immediately reclaim whatever isn't
* pinned when it first runs:
*/
ja->last_idx = (ja->cur_idx + 1) % ja->nr;
ja->discard_idx = ja->dirty_idx_ondisk =
ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
kvpfree(buf.data, buf.size);
percpu_ref_put(&ca->io_ref);
@@ -969,9 +970,16 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
if (sectors > ja->sectors_free &&
sectors <= ca->mi.bucket_size &&
bch2_journal_dev_buckets_available(j, ja)) {
bch2_journal_dev_buckets_available(j, ja,
journal_space_discarded)) {
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
ja->sectors_free = ca->mi.bucket_size;
/*
* ja->bucket_seq[ja->cur_idx] must always have
* something sensible:
*/
ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
}
}
@@ -1069,12 +1077,13 @@ static void journal_write_done(struct closure *cl)
goto err;
spin_lock(&j->lock);
j->seq_ondisk = seq;
j->last_seq_ondisk = last_seq;
if (seq >= j->pin.front)
journal_seq_pin(j, seq)->devs = devs;
j->seq_ondisk = seq;
j->last_seq_ondisk = last_seq;
bch2_journal_space_available(j);
/*
* Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
* more buckets:
@@ -8,47 +8,72 @@
/* Free space calculations: */
static unsigned journal_space_from(struct journal_device *ja,
enum journal_space_from from)
{
switch (from) {
case journal_space_discarded:
return ja->discard_idx;
case journal_space_clean_ondisk:
return ja->dirty_idx_ondisk;
case journal_space_clean:
return ja->dirty_idx;
default:
BUG();
}
}
unsigned bch2_journal_dev_buckets_available(struct journal *j,
struct journal_device *ja)
struct journal_device *ja,
enum journal_space_from from)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
unsigned next = (ja->cur_idx + 1) % ja->nr;
unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
unsigned available = (journal_space_from(ja, from) -
ja->cur_idx - 1 + ja->nr) % ja->nr;
/*
* Allocator startup needs some journal space before we can do journal
* replay:
*/
if (available &&
test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
available--;
if (available && test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
--available;
/*
* Don't use the last bucket unless writing the new last_seq
* will make another bucket available:
*/
if (available &&
journal_last_seq(j) <= ja->bucket_seq[ja->last_idx])
if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
--available;
return available;
}
void bch2_journal_space_available(struct journal *j)
static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
{
union journal_preres_state old, new;
u64 v = atomic64_read(&j->prereserved.counter);
do {
old.v = new.v = v;
new.remaining = u64s_remaining;
} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
old.v, new.v)) != old.v);
}
static struct journal_space {
unsigned next_entry;
unsigned remaining;
} __journal_space_available(struct journal *j, unsigned nr_devs_want,
enum journal_space_from from)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
unsigned sectors_next_entry = UINT_MAX;
unsigned sectors_total = UINT_MAX;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9);
unsigned i, nr_online = 0, nr_devs = 0;
unsigned i, nr_devs = 0;
unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
? journal_prev_buf(j)->sectors
: 0;
int ret = 0;
lockdep_assert_held(&j->lock);
rcu_read_lock();
for_each_member_device_rcu(ca, c, i,
@@ -59,9 +84,7 @@ void bch2_journal_space_available(struct journal *j)
if (!ja->nr)
continue;
nr_online++;
buckets_this_device = bch2_journal_dev_buckets_available(j, ja);
buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from);
sectors_this_device = ja->sectors_free;
/*
@@ -94,28 +117,88 @@ void bch2_journal_space_available(struct journal *j)
buckets_this_device * ca->mi.bucket_size +
sectors_this_device);
max_entry_size = min_t(unsigned, max_entry_size,
ca->mi.bucket_size);
nr_devs++;
}
rcu_read_unlock();
if (nr_devs < nr_devs_want)
return (struct journal_space) { 0, 0 };
return (struct journal_space) {
.next_entry = sectors_next_entry,
.remaining = max_t(int, 0, sectors_total - sectors_next_entry),
};
}
void bch2_journal_space_available(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_space discarded, clean_ondisk, clean;
unsigned overhead, u64s_remaining = 0;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9);
unsigned i, nr_online = 0, nr_devs_want;
bool can_discard = false;
int ret = 0;
lockdep_assert_held(&j->lock);
rcu_read_lock();
for_each_member_device_rcu(ca, c, i,
&c->rw_devs[BCH_DATA_JOURNAL]) {
struct journal_device *ja = &ca->journal;
if (!ja->nr)
continue;
while (ja->dirty_idx != ja->cur_idx &&
ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
while (ja->dirty_idx_ondisk != ja->dirty_idx &&
ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
if (ja->discard_idx != ja->dirty_idx_ondisk)
can_discard = true;
max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
nr_online++;
}
rcu_read_unlock();
j->can_discard = can_discard;
if (nr_online < c->opts.metadata_replicas_required) {
ret = -EROFS;
sectors_next_entry = 0;
} else if (!sectors_next_entry ||
nr_devs < min_t(unsigned, nr_online,
c->opts.metadata_replicas)) {
ret = -ENOSPC;
sectors_next_entry = 0;
} else if (!fifo_free(&j->pin)) {
ret = -ENOSPC;
sectors_next_entry = 0;
goto out;
}
j->cur_entry_sectors = sectors_next_entry;
if (!fifo_free(&j->pin)) {
ret = -ENOSPC;
goto out;
}
nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
discarded = __journal_space_available(j, nr_devs_want, journal_space_discarded);
clean_ondisk = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk);
clean = __journal_space_available(j, nr_devs_want, journal_space_clean);
if (!discarded.next_entry)
ret = -ENOSPC;
overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
journal_entry_overhead(j);
u64s_remaining = clean.remaining << 6;
u64s_remaining = max_t(int, 0, u64s_remaining - overhead);
u64s_remaining /= 4;
out:
j->cur_entry_sectors = !ret ? discarded.next_entry : 0;
j->cur_entry_error = ret;
journal_set_remaining(j, u64s_remaining);
journal_check_may_get_unreserved(j);
if (!ret)
journal_wake(j);
@@ -128,25 +211,23 @@ static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
bool ret;
spin_lock(&j->lock);
ret = ja->nr &&
ja->last_idx != ja->cur_idx &&
ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk;
ret = ja->discard_idx != ja->dirty_idx_ondisk;
spin_unlock(&j->lock);
return ret;
}
/*
* Advance ja->last_idx as long as it points to buckets that are no longer
* Advance ja->discard_idx as long as it points to buckets that are no longer
* dirty, issuing discards if necessary:
*/
static void journal_do_discards(struct journal *j)
void bch2_journal_do_discards(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
unsigned iter;
mutex_lock(&j->reclaim_lock);
mutex_lock(&j->discard_lock);
for_each_rw_member(ca, c, iter) {
struct journal_device *ja = &ca->journal;
@@ -156,18 +237,18 @@ static void journal_do_discards(struct journal *j)
blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
blkdev_issue_discard(ca->disk_sb.bdev,
bucket_to_sector(ca,
ja->buckets[ja->last_idx]),
ja->buckets[ja->discard_idx]),
ca->mi.bucket_size, GFP_NOIO, 0);
spin_lock(&j->lock);
ja->last_idx = (ja->last_idx + 1) % ja->nr;
ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
bch2_journal_space_available(j);
spin_unlock(&j->lock);
}
}
mutex_unlock(&j->reclaim_lock);
mutex_unlock(&j->discard_lock);
}
/*
@@ -372,7 +453,7 @@ static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
}
/**
* bch2_journal_reclaim_work - free up journal buckets
* bch2_journal_reclaim - free up journal buckets
*
* Background journal reclaim writes out btree nodes. It should be run
* early enough so that we never completely run out of journal buckets.
@@ -389,29 +470,37 @@ static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
* 512 journal entries or 25% of all journal buckets, then
* journal_next_bucket() should not stall.
*/
void bch2_journal_reclaim_work(struct work_struct *work)
void bch2_journal_reclaim(struct journal *j)
{
struct bch_fs *c = container_of(to_delayed_work(work),
struct bch_fs, journal.reclaim_work);
struct journal *j = &c->journal;
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
unsigned iter, bucket_to_flush, min_nr = 0;
unsigned iter, min_nr = 0;
u64 seq_to_flush = 0;
journal_do_discards(j);
lockdep_assert_held(&j->reclaim_lock);
bch2_journal_do_discards(j);
mutex_lock(&j->reclaim_lock);
spin_lock(&j->lock);
for_each_rw_member(ca, c, iter) {
struct journal_device *ja = &ca->journal;
unsigned nr_buckets, bucket_to_flush;
if (!ja->nr)
continue;
/* Try to keep the journal at most half full: */
bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
nr_buckets = ja->nr / 2;
/* And include pre-reservations: */
nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
(ca->mi.bucket_size << 6) -
journal_entry_overhead(j));
nr_buckets = min(nr_buckets, ja->nr);
bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
seq_to_flush = max_t(u64, seq_to_flush,
ja->bucket_seq[bucket_to_flush]);
}
@@ -430,15 +519,26 @@ void bch2_journal_reclaim_work(struct work_struct *work)
msecs_to_jiffies(j->reclaim_delay_ms)))
min_nr = 1;
journal_flush_pins(j, seq_to_flush, min_nr);
if (j->prereserved.reserved * 2 > j->prereserved.remaining)
min_nr = 1;
mutex_unlock(&j->reclaim_lock);
journal_flush_pins(j, seq_to_flush, min_nr);
if (!test_bit(BCH_FS_RO, &c->flags))
queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
msecs_to_jiffies(j->reclaim_delay_ms));
}
void bch2_journal_reclaim_work(struct work_struct *work)
{
struct journal *j = container_of(to_delayed_work(work),
struct journal, reclaim_work);
mutex_lock(&j->reclaim_lock);
bch2_journal_reclaim(j);
mutex_unlock(&j->reclaim_lock);
}
static int journal_flush_done(struct journal *j, u64 seq_to_flush)
{
int ret;
@@ -3,8 +3,15 @@
#define JOURNAL_PIN (32 * 1024)
enum journal_space_from {
journal_space_discarded,
journal_space_clean_ondisk,
journal_space_clean,
};
unsigned bch2_journal_dev_buckets_available(struct journal *,
struct journal_device *);
struct journal_device *,
enum journal_space_from);
void bch2_journal_space_available(struct journal *);
static inline bool journal_pin_active(struct journal_entry_pin *pin)
@@ -33,6 +40,8 @@ void bch2_journal_pin_add_if_older(struct journal *,
journal_pin_flush_fn);
void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
void bch2_journal_do_discards(struct journal *);
void bch2_journal_reclaim(struct journal *);
void bch2_journal_reclaim_work(struct work_struct *);
void bch2_journal_flush_pins(struct journal *, u64);
@@ -79,6 +79,14 @@ struct journal_res {
u64 seq;
};
/*
* For reserving space in the journal prior to getting a reservation on a
* particular journal entry:
*/
struct journal_preres {
unsigned u64s;
};
union journal_res_state {
struct {
atomic64_t counter;
@@ -97,6 +105,21 @@ union journal_res_state {
};
};
union journal_preres_state {
struct {
atomic64_t counter;
};
struct {
u64 v;
};
struct {
u32 reserved;
u32 remaining;
};
};
/* bytes: */
#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */
@@ -121,6 +144,7 @@ enum {
JOURNAL_STARTED,
JOURNAL_NEED_WRITE,
JOURNAL_NOT_EMPTY,
JOURNAL_MAY_GET_UNRESERVED,
};
/* Embedded in struct bch_fs */
@@ -141,6 +165,8 @@ struct journal {
*/
int cur_entry_error;
union journal_preres_state prereserved;
/* Reserved space in journal entry to be used just prior to write */
unsigned entry_u64s_reserved;
@@ -160,6 +186,7 @@ struct journal {
/* Used when waiting because the journal was full */
wait_queue_head_t wait;
struct closure_waitlist async_wait;
struct closure_waitlist preres_wait;
struct closure io;
struct delayed_work write_work;
@@ -192,9 +219,6 @@ struct journal {
struct journal_entry_pin_list *data;
} pin;
struct journal_entry_pin *flush_in_progress;
wait_queue_head_t pin_flush_wait;
u64 replay_journal_seq;
struct mutex blacklist_lock;
@@ -205,10 +229,15 @@ struct journal {
spinlock_t err_lock;
struct delayed_work reclaim_work;
unsigned long last_flushed;
/* protects advancing ja->last_idx: */
struct mutex reclaim_lock;
unsigned long last_flushed;
struct journal_entry_pin *flush_in_progress;
wait_queue_head_t pin_flush_wait;
/* protects advancing ja->discard_idx: */
struct mutex discard_lock;
bool can_discard;
unsigned write_delay_ms;
unsigned reclaim_delay_ms;
@@ -239,17 +268,15 @@ struct journal_device {
unsigned sectors_free;
/* Journal bucket we're currently writing to */
unsigned cur_idx;
/* Last journal bucket that still contains an open journal entry */
/*
* j->lock and j->reclaim_lock must both be held to modify, j->lock
* sufficient to read:
* discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx:
*/
unsigned last_idx;
unsigned discard_idx; /* Next bucket to discard */
unsigned dirty_idx_ondisk;
unsigned dirty_idx;
unsigned cur_idx; /* Journal bucket we're currently writing to */
unsigned nr;
u64 *buckets;
/* Bio for journal reads/writes to this device */
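
The journal_device fields this last hunk introduces describe a ring of buckets ordered discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx, and free space is measured from one of those markers back around to the current write position, as in the reclaim hunks earlier. A small sketch of that index arithmetic (cut-down types, not struct journal_device itself):

enum space_from {
	space_discarded,	/* ready for reuse right now */
	space_clean_ondisk,	/* clean once discards are issued */
	space_clean,		/* clean once the current write lands */
};

struct dev_journal {
	unsigned nr;		/* number of buckets in the ring */
	unsigned discard_idx;
	unsigned dirty_idx_ondisk;
	unsigned dirty_idx;
	unsigned cur_idx;	/* bucket currently being written */
};

static unsigned buckets_available(const struct dev_journal *ja,
				  enum space_from from)
{
	unsigned idx = from == space_discarded	  ? ja->discard_idx :
		       from == space_clean_ondisk ? ja->dirty_idx_ondisk :
						    ja->dirty_idx;

	/* distance from just past cur_idx around to the chosen marker */
	return (idx - ja->cur_idx - 1 + ja->nr) % ja->nr;
}

int main(void)
{
	struct dev_journal ja = {
		.nr = 8, .discard_idx = 5, .dirty_idx_ondisk = 6,
		.dirty_idx = 7, .cur_idx = 1,
	};

	/* buckets 2, 3 and 4 sit between cur_idx and discard_idx: */
	return buckets_available(&ja, space_discarded) == 3 ? 0 : 1;
}
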