Update bcachefs sources to 3ca08ab51ec9 bcachefs: six locks: Simplify optimistic spinning

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2023-11-12 20:53:57 -05:00
parent a613340b26
commit 7fd6c3ffe4
65 changed files with 1040 additions and 979 deletions

View File

@ -1 +1 @@
d464ec667b2b9de097e39d1505b45aafd87a9552
3ca08ab51ec996180c20105489176b8c4327240c

View File

@ -278,4 +278,7 @@ static inline void dump_stack(void) {}
#define unsafe_memcpy(dst, src, bytes, justification) \
memcpy(dst, src, bytes)
#define DECLARE_FLEX_ARRAY(TYPE, NAME) \
__DECLARE_FLEX_ARRAY(TYPE, NAME)
#endif

View File

@ -98,4 +98,15 @@ static inline void hlist_del_init(struct hlist_node *n)
pos; \
pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
static inline size_t list_count_nodes(struct list_head *head)
{
struct list_head *pos;
size_t count = 0;
list_for_each(pos, head)
count++;
return count;
}
#endif /* _LIST_LIST_H */

View File

@ -561,8 +561,8 @@ int bch2_bucket_gens_init(struct bch_fs *c)
if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_lazy_rw,
bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
if (ret)
break;
@ -581,8 +581,8 @@ int bch2_bucket_gens_init(struct bch_fs *c)
if (have_bucket_gens_key && !ret)
ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_lazy_rw,
bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
bch2_trans_put(trans);
@ -1267,7 +1267,7 @@ delete:
ret = bch2_btree_delete_extent_at(trans, iter,
iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW);
BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw);
goto out;
}
@ -1422,8 +1422,8 @@ int bch2_check_alloc_info(struct bch_fs *c)
}
ret = bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW);
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_lazy_rw);
if (ret)
goto bkey_err;
@ -1453,7 +1453,7 @@ bkey_err:
for_each_btree_key_commit(trans, iter,
BTREE_ID_bucket_gens, POS_MIN,
BTREE_ITER_PREFETCH, k,
NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
bch2_check_bucket_gens_key(trans, &iter, k));
err:
bch2_trans_put(trans);
@ -1546,7 +1546,7 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
POS_MIN, BTREE_ITER_PREFETCH, k,
NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
bch2_check_alloc_to_lru_ref(trans, &iter)));
if (ret)
bch_err_fn(c, ret);
@ -1655,7 +1655,7 @@ write:
ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree|
BTREE_INSERT_NOFAIL);
BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto out;
@ -1760,7 +1760,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree|
BTREE_INSERT_NOFAIL);
BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto out;
@ -1884,8 +1884,8 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
ret = bch2_bucket_do_index(trans, k, a, true) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_NOFAIL);
BCH_TRANS_COMMIT_lazy_rw|
BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto bkey_err;
@ -1905,8 +1905,8 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_NOFAIL);
BCH_TRANS_COMMIT_lazy_rw|
BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto bkey_err;

View File

@ -5,6 +5,7 @@
#include "backpointers.h"
#include "btree_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "error.h"
@ -220,18 +221,22 @@ out:
static void backpointer_not_found(struct btree_trans *trans,
struct bpos bp_pos,
struct bch_backpointer bp,
struct bkey_s_c k,
const char *thing_it_points_to)
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
/*
* If we're using the btree write buffer, the backpointer we were
* looking at may have already been deleted - failure to find what it
* pointed to is not an error:
*/
if (likely(!bch2_backpointers_no_use_write_buffer))
return;
prt_printf(&buf, "backpointer doesn't match %s it points to:\n ",
thing_it_points_to);
bp.level ? "btree node" : "extent");
prt_printf(&buf, "bucket: ");
bch2_bpos_to_text(&buf, bucket);
prt_printf(&buf, "\n ");
@ -257,16 +262,15 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
struct bch_backpointer bp,
unsigned iter_flags)
{
if (likely(!bp.level)) {
struct bch_fs *c = trans->c;
struct btree_root *r = bch2_btree_id_root(c, bp.btree_id);
struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
struct bkey_s_c k;
bch2_trans_node_iter_init(trans, iter,
bp.btree_id,
bp.pos,
0,
min(bp.level, r->level),
0, 0,
iter_flags);
k = bch2_btree_iter_peek_slot(iter);
if (bkey_err(k)) {
@ -274,39 +278,21 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
return k;
}
if (bp.level == r->level + 1)
k = bkey_i_to_s_c(&r->key);
if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
return k;
bch2_trans_iter_exit(trans, iter);
backpointer_not_found(trans, bp_pos, bp, k);
return bkey_s_c_null;
} else {
struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
if (unlikely(bch2_backpointers_no_use_write_buffer)) {
if (bp.level) {
struct btree *b;
/*
* If a backpointer for a btree node wasn't found, it may be
* because it was overwritten by a new btree node that hasn't
* been written out yet - backpointer_get_node() checks for
* this:
*/
b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
if (!IS_ERR_OR_NULL(b))
return bkey_i_to_s_c(&b->key);
if (IS_ERR_OR_NULL(b)) {
bch2_trans_iter_exit(trans, iter);
if (IS_ERR(b))
return bkey_s_c_err(PTR_ERR(b));
return bkey_s_c_null;
return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null;
}
backpointer_not_found(trans, bp_pos, bp, k, "extent");
return bkey_i_to_s_c(&b->key);
}
return bkey_s_c_null;
}
struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
@ -327,19 +313,20 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
bp.level - 1,
0);
b = bch2_btree_iter_peek_node(iter);
if (IS_ERR(b))
if (IS_ERR_OR_NULL(b))
goto err;
if (b && extent_matches_bp(c, bp.btree_id, bp.level,
BUG_ON(b->c.level != bp.level - 1);
if (extent_matches_bp(c, bp.btree_id, bp.level,
bkey_i_to_s_c(&b->key),
bucket, bp))
return b;
if (b && btree_node_will_make_reachable(b)) {
if (btree_node_will_make_reachable(b)) {
b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
} else {
backpointer_not_found(trans, bp_pos, bp,
bkey_i_to_s_c(&b->key), "btree node");
backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key));
b = NULL;
}
err:
@ -395,7 +382,7 @@ int bch2_check_btree_backpointers(struct bch_fs *c)
ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_backpointers, POS_MIN, 0, k,
NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
bch2_check_btree_backpointer(trans, &iter, k)));
if (ret)
bch_err_fn(c, ret);
@ -642,8 +629,8 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
do {
ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_NOFAIL,
BCH_TRANS_COMMIT_lazy_rw|
BCH_TRANS_COMMIT_no_enospc,
check_extent_to_backpointers(trans, &iter,
bucket_start, bucket_end,
&last_flushed));
@ -657,8 +644,8 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
break;
ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_NOFAIL,
BCH_TRANS_COMMIT_lazy_rw|
BCH_TRANS_COMMIT_no_enospc,
check_btree_root_to_backpointers(trans, btree_id,
bucket_start, bucket_end,
&last_flushed));
@ -797,7 +784,8 @@ static int check_one_backpointer(struct btree_trans *trans,
if (fsck_err_on(!k.k, c,
backpointer_to_missing_ptr,
"backpointer for missing extent\n %s",
"backpointer for missing %s\n %s",
bp.v->level ? "btree node" : "extent",
(bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p);
goto out;
@ -819,7 +807,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
POS_MIN, BTREE_ITER_PREFETCH, k,
NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_one_backpointer(trans, start, end,
bkey_s_c_to_backpointer(k),
&last_flushed_pos));

View File

@ -401,7 +401,9 @@ BCH_DEBUG_PARAMS_DEBUG()
x(journal_flush_write) \
x(journal_noflush_write) \
x(journal_flush_seq) \
x(blocked_journal) \
x(blocked_journal_low_on_space) \
x(blocked_journal_low_on_pin) \
x(blocked_journal_max_in_flight) \
x(blocked_allocate) \
x(blocked_allocate_open_bucket) \
x(nocow_lock_contended)
@ -617,7 +619,7 @@ struct journal_seq_blacklist_table {
u64 start;
u64 end;
bool dirty;
} entries[0];
} entries[];
};
struct journal_keys {

View File

@ -2256,7 +2256,8 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
enum btree_id_flags {
BTREE_ID_EXTENTS = BIT(0),
BTREE_ID_SNAPSHOTS = BIT(1),
BTREE_ID_DATA = BIT(2),
BTREE_ID_SNAPSHOT_FIELD = BIT(2),
BTREE_ID_DATA = BIT(3),
};
#define BCH_BTREE_IDS() \
@ -2311,12 +2312,12 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_bucket_gens)) \
x(snapshot_trees, 15, 0, \
BIT_ULL(KEY_TYPE_snapshot_tree)) \
x(deleted_inodes, 16, BTREE_ID_SNAPSHOTS, \
x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \
BIT_ULL(KEY_TYPE_set)) \
x(logged_ops, 17, 0, \
BIT_ULL(KEY_TYPE_logged_op_truncate)| \
BIT_ULL(KEY_TYPE_logged_op_finsert)) \
x(rebalance_work, 18, BTREE_ID_SNAPSHOTS, \
x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))
enum btree_id {

View File

@ -186,15 +186,20 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
if (type != BKEY_TYPE_btree) {
enum btree_id btree = type - 1;
bkey_fsck_err_on(!btree_type_has_snapshots(btree) &&
k.k->p.snapshot, c, err,
bkey_snapshot_nonzero,
"nonzero snapshot");
bkey_fsck_err_on(btree_type_has_snapshots(btree) &&
!k.k->p.snapshot, c, err,
if (btree_type_has_snapshots(btree)) {
bkey_fsck_err_on(!k.k->p.snapshot, c, err,
bkey_snapshot_zero,
"snapshot == 0");
} else if (!btree_type_has_snapshot_field(btree)) {
bkey_fsck_err_on(k.k->p.snapshot, c, err,
bkey_snapshot_nonzero,
"nonzero snapshot");
} else {
/*
* btree uses snapshot field but it's not required to be
* nonzero
*/
}
bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err,
bkey_at_pos_max,

View File

@ -93,7 +93,6 @@ static inline int bch2_mark_key(struct btree_trans *trans,
enum btree_update_flags {
__BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END,
__BTREE_UPDATE_NOJOURNAL,
__BTREE_UPDATE_PREJOURNAL,
__BTREE_UPDATE_KEY_CACHE_RECLAIM,
__BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
@ -108,7 +107,6 @@ enum btree_update_flags {
#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL)
#define BTREE_UPDATE_PREJOURNAL (1U << __BTREE_UPDATE_PREJOURNAL)
#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)

View File

@ -1502,7 +1502,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
POS(ca->dev_idx, ca->mi.first_bucket),
BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
NULL, NULL, BTREE_INSERT_LAZY_RW,
NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
bch2_alloc_write_key(trans, &iter, k, metadata_only));
if (ret < 0) {
@ -1659,7 +1659,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
ret = for_each_btree_key_commit(trans, iter,
BTREE_ID_reflink, POS_MIN,
BTREE_ITER_PREFETCH, k,
NULL, NULL, BTREE_INSERT_NOFAIL,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_gc_write_reflink_key(trans, &iter, k, &idx));
c->reflink_gc_nr = 0;
@ -1783,7 +1783,7 @@ static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
ret = for_each_btree_key_commit(trans, iter,
BTREE_ID_stripes, POS_MIN,
BTREE_ITER_PREFETCH, k,
NULL, NULL, BTREE_INSERT_NOFAIL,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_gc_write_stripes_key(trans, &iter, k));
bch2_trans_put(trans);
@ -2019,7 +2019,7 @@ int bch2_gc_gens(struct bch_fs *c)
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
k,
NULL, NULL,
BTREE_INSERT_NOFAIL,
BCH_TRANS_COMMIT_no_enospc,
gc_btree_gens_key(trans, &iter, k));
if (ret && !bch2_err_matches(ret, EROFS))
bch_err_fn(c, ret);
@ -2032,7 +2032,7 @@ int bch2_gc_gens(struct bch_fs *c)
BTREE_ITER_PREFETCH,
k,
NULL, NULL,
BTREE_INSERT_NOFAIL,
BCH_TRANS_COMMIT_no_enospc,
bch2_alloc_write_oldest_gen(trans, &iter, k));
if (ret && !bch2_err_matches(ret, EROFS))
bch_err_fn(c, ret);

View File

@ -1801,9 +1801,9 @@ static void btree_node_write_work(struct work_struct *work)
ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
BCH_WATERMARK_reclaim|
BTREE_INSERT_JOURNAL_RECLAIM|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_NOCHECK_RW,
BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw,
!wbio->wbio.failed.nr));
if (ret)
goto err;

View File

@ -257,7 +257,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
!btree_type_has_snapshots(iter->btree_id));
!btree_type_has_snapshot_field(iter->btree_id));
if (iter->update_path)
bch2_btree_path_verify(trans, iter->update_path);
@ -1214,8 +1214,6 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
struct btree_path *path, struct bpos new_pos,
bool intent, unsigned long ip, int cmp)
{
unsigned level = path->level;
bch2_trans_verify_not_in_restart(trans);
EBUG_ON(!path->ref);
@ -1231,7 +1229,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
goto out;
}
level = btree_path_up_until_good_node(trans, path, cmp);
unsigned level = btree_path_up_until_good_node(trans, path, cmp);
if (btree_path_node(path, level)) {
struct btree_path_level *l = &path->l[level];
@ -2835,8 +2833,9 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
static inline void check_srcu_held_too_long(struct btree_trans *trans)
{
WARN(time_after(jiffies, trans->srcu_lock_time + HZ * 10),
"btree trans held srcu lock (delaying memory reclaim) by more than 10 seconds");
WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
"btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
(jiffies - trans->srcu_lock_time) / HZ);
}
void bch2_trans_srcu_unlock(struct btree_trans *trans)
@ -3088,8 +3087,6 @@ void bch2_trans_put(struct btree_trans *trans)
srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
}
bch2_journal_preres_put(&c->journal, &trans->journal_preres);
kfree(trans->extra_journal_entries.data);
if (trans->fs_usage_deltas) {

View File

@ -416,7 +416,7 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
flags |= BTREE_ITER_IS_EXTENTS;
if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
!btree_type_has_snapshots(btree_id))
!btree_type_has_snapshot_field(btree_id))
flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&

View File

@ -90,10 +90,13 @@ static void bkey_cached_free(struct btree_key_cache *bc,
ck->btree_trans_barrier_seq =
start_poll_synchronize_srcu(&c->btree_trans_barrier);
if (ck->c.lock.readers)
if (ck->c.lock.readers) {
list_move_tail(&ck->list, &bc->freed_pcpu);
else
bc->nr_freed_pcpu++;
} else {
list_move_tail(&ck->list, &bc->freed_nonpcpu);
bc->nr_freed_nonpcpu++;
}
atomic_long_inc(&bc->nr_freed);
kfree(ck->k);
@ -110,6 +113,8 @@ static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
{
struct bkey_cached *pos;
bc->nr_freed_nonpcpu++;
list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
pos->btree_trans_barrier_seq)) {
@ -159,6 +164,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
#else
mutex_lock(&bc->lock);
list_move_tail(&ck->list, &bc->freed_nonpcpu);
bc->nr_freed_nonpcpu++;
mutex_unlock(&bc->lock);
#endif
} else {
@ -218,6 +224,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
f->nr < ARRAY_SIZE(f->objs) / 2) {
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
list_del_init(&ck->list);
bc->nr_freed_nonpcpu--;
f->objs[f->nr++] = ck;
}
@ -230,6 +237,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
if (!list_empty(&bc->freed_nonpcpu)) {
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
list_del_init(&ck->list);
bc->nr_freed_nonpcpu--;
}
mutex_unlock(&bc->lock);
#endif
@ -649,8 +657,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
BTREE_TRIGGER_NORUN) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
(ck->journal.seq == journal_last_seq(j)
? BCH_WATERMARK_reclaim
: 0)|
@ -665,7 +673,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
goto out;
bch2_journal_pin_drop(j, &ck->journal);
bch2_journal_preres_put(j, &ck->res);
BUG_ON(!btree_node_locked(c_iter.path, 0));
@ -728,7 +735,7 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
ret = commit_do(trans, NULL, NULL, 0,
btree_key_cache_flush_pos(trans, key, seq,
BTREE_INSERT_JOURNAL_RECLAIM, false));
BCH_TRANS_COMMIT_journal_reclaim, false));
unlock:
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
@ -763,18 +770,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
BUG_ON(insert->k.u64s > ck->u64s);
if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
int difference;
BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s);
difference = jset_u64s(insert->k.u64s) - ck->res.u64s;
if (difference > 0) {
trans->journal_preres.u64s -= difference;
ck->res.u64s += difference;
}
}
bkey_copy(ck->k, insert);
ck->valid = true;
@ -852,6 +847,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
* Newest freed entries are at the end of the list - once we hit one
* that's too new to be freed, we can bail out:
*/
scanned += bc->nr_freed_nonpcpu;
list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
@ -861,13 +858,15 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
scanned++;
freed++;
bc->nr_freed_nonpcpu--;
}
if (scanned >= nr)
goto out;
scanned += bc->nr_freed_pcpu;
list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
@ -877,8 +876,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
scanned++;
freed++;
bc->nr_freed_pcpu--;
}
if (scanned >= nr)
@ -985,6 +984,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
}
#endif
BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
list_splice(&bc->freed_pcpu, &items);
list_splice(&bc->freed_nonpcpu, &items);
@ -994,7 +996,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
cond_resched();
bch2_journal_pin_drop(&c->journal, &ck->journal);
bch2_journal_preres_put(&c->journal, &ck->res);
list_del(&ck->list);
kfree(ck->k);

View File

@ -0,0 +1,34 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
struct btree_key_cache_freelist {
struct bkey_cached *objs[16];
unsigned nr;
};
struct btree_key_cache {
struct mutex lock;
struct rhashtable table;
bool table_init_done;
struct list_head freed_pcpu;
size_t nr_freed_pcpu;
struct list_head freed_nonpcpu;
size_t nr_freed_nonpcpu;
struct shrinker shrink;
unsigned shrink_iter;
struct btree_key_cache_freelist __percpu *pcpu_freed;
atomic_long_t nr_freed;
atomic_long_t nr_keys;
atomic_long_t nr_dirty;
};
struct bkey_cached_key {
u32 btree_id;
struct bpos pos;
} __packed __aligned(4);
#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */

View File

@ -78,6 +78,53 @@ inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
bch2_btree_init_next(trans, b);
}
static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
{
while (--i >= trans->updates) {
if (same_leaf_as_prev(trans, i))
continue;
bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
}
trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
}
static inline int bch2_trans_lock_write(struct btree_trans *trans)
{
struct btree_insert_entry *i;
EBUG_ON(trans->write_locked);
trans_for_each_update(trans, i) {
if (same_leaf_as_prev(trans, i))
continue;
if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
return trans_lock_write_fail(trans, i);
if (!i->cached)
bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
}
trans->write_locked = true;
return 0;
}
static inline void bch2_trans_unlock_write(struct btree_trans *trans)
{
if (likely(trans->write_locked)) {
struct btree_insert_entry *i;
trans_for_each_update(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_unlock_write_inlined(trans, i->path,
insert_l(i)->b);
trans->write_locked = false;
}
}
/* Inserting into a given leaf node (last stage of insert): */
/* Handle overwrites and do insert, for non extents: */
@ -269,23 +316,13 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
BUG_ON(i->level != i->path->level);
BUG_ON(i->btree_id != i->path->btree_id);
EBUG_ON(!i->level &&
btree_type_has_snapshots(i->btree_id) &&
!(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
i->k->k.p.snapshot &&
bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
}
static noinline int
bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags,
unsigned long trace_ip)
{
return drop_locks_do(trans,
bch2_journal_preres_get(&trans->c->journal,
&trans->journal_preres,
trans->journal_preres_u64s,
(flags & BCH_WATERMARK_MASK)));
}
static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
unsigned flags)
{
@ -320,6 +357,45 @@ static inline int btree_key_can_insert(struct btree_trans *trans,
return 0;
}
noinline static int
btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
struct btree_path *path, unsigned new_u64s)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
struct bkey_cached *ck = (void *) path->l[0].b;
struct bkey_i *new_k;
int ret;
bch2_trans_unlock_write(trans);
bch2_trans_unlock(trans);
new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
if (!new_k) {
bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
bch2_btree_id_str(path->btree_id), new_u64s);
return -BCH_ERR_ENOMEM_btree_key_cache_insert;
}
ret = bch2_trans_relock(trans) ?:
bch2_trans_lock_write(trans);
if (unlikely(ret)) {
kfree(new_k);
return ret;
}
memcpy(new_k, ck->k, ck->u64s * sizeof(u64));
trans_for_each_update(trans, i)
if (i->old_v == &ck->k->v)
i->old_v = &new_k->v;
kfree(ck->k);
ck->u64s = new_u64s;
ck->k = new_k;
return 0;
}
static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
struct btree_path *path, unsigned u64s)
{
@ -333,7 +409,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bch2_btree_key_cache_must_wait(c) &&
!(flags & BTREE_INSERT_JOURNAL_RECLAIM))
!(flags & BCH_TRANS_COMMIT_journal_reclaim))
return -BCH_ERR_btree_insert_need_journal_reclaim;
/*
@ -346,12 +422,9 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
return 0;
new_u64s = roundup_pow_of_two(u64s);
new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS);
if (!new_k) {
bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
bch2_btree_id_str(path->btree_id), new_u64s);
return -BCH_ERR_ENOMEM_btree_key_cache_insert;
}
new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
if (unlikely(!new_k))
return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
trans_for_each_update(trans, i)
if (i->old_v == &ck->k->v)
@ -583,6 +656,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
*stopped_at = i;
return ret;
}
i->k->k.needs_whiteout = false;
}
if (trans->nr_wb_updates &&
@ -593,7 +668,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
* Don't get journal reservation until after we know insert will
* succeed:
*/
if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
ret = bch2_trans_journal_res_get(trans,
(flags & BCH_WATERMARK_MASK)|
JOURNAL_RES_GET_NONBLOCK);
@ -602,8 +677,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
if (unlikely(trans->journal_transaction_names))
journal_transaction_name(trans);
} else {
trans->journal_res.seq = c->journal.replay_journal_seq;
}
/*
@ -612,7 +685,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
*/
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
!(flags & BTREE_INSERT_JOURNAL_REPLAY)) {
!(flags & BCH_TRANS_COMMIT_no_journal_res)) {
if (bch2_journal_seq_verify)
trans_for_each_update(trans, i)
i->k->k.version.lo = trans->journal_res.seq;
@ -626,7 +699,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
return -BCH_ERR_btree_insert_need_mark_replicas;
if (trans->nr_wb_updates) {
EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY);
EBUG_ON(flags & BCH_TRANS_COMMIT_no_journal_res);
ret = bch2_btree_insert_keys_write_buffer(trans);
if (ret)
@ -663,7 +736,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
trans->journal_res.u64s -= trans->extra_journal_entries.nr;
}
if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
struct journal *j = &c->journal;
struct jset_entry *entry;
@ -705,15 +778,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
}
trans_for_each_update(trans, i) {
i->k->k.needs_whiteout = false;
if (!i->cached) {
u64 seq = trans->journal_res.seq;
if (i->flags & BTREE_UPDATE_PREJOURNAL)
seq = i->seq;
bch2_btree_insert_key_leaf(trans, i->path, i->k, seq);
bch2_btree_insert_key_leaf(trans, i->path, i->k, trans->journal_res.seq);
} else if (!i->key_cache_already_flushed)
bch2_btree_insert_key_cached(trans, flags, i);
else {
@ -731,37 +797,6 @@ revert_fs_usage:
return ret;
}
static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
{
while (--i >= trans->updates) {
if (same_leaf_as_prev(trans, i))
continue;
bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
}
trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
}
static inline int trans_lock_write(struct btree_trans *trans)
{
struct btree_insert_entry *i;
trans_for_each_update(trans, i) {
if (same_leaf_as_prev(trans, i))
continue;
if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
return trans_lock_write_fail(trans, i);
if (!i->cached)
bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
}
return 0;
}
static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
{
struct btree_insert_entry *i;
@ -799,6 +834,12 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
return -EINVAL;
}
static int bch2_trans_commit_journal_pin_flush(struct journal *j,
struct journal_entry_pin *_pin, u64 seq)
{
return 0;
}
/*
* Get journal reservation, take write locks, and attempt to do btree update(s):
*/
@ -829,15 +870,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
}
}
ret = bch2_journal_preres_get(&c->journal,
&trans->journal_preres, trans->journal_preres_u64s,
(flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK);
if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked))
ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip);
if (unlikely(ret))
return ret;
ret = trans_lock_write(trans);
ret = bch2_trans_lock_write(trans);
if (unlikely(ret))
return ret;
@ -846,19 +879,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
if (!ret && unlikely(trans->journal_replay_not_finished))
bch2_drop_overwrites_from_journal(trans);
trans_for_each_update(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_unlock_write_inlined(trans, i->path,
insert_l(i)->b);
bch2_trans_unlock_write(trans);
if (!ret && trans->journal_pin)
bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
trans->journal_pin, NULL);
trans->journal_pin,
bch2_trans_commit_journal_pin_flush);
/*
* Drop journal reservation after dropping write locks, since dropping
* the journal reservation may kick off a journal write:
*/
if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
bch2_journal_res_put(&c->journal, &trans->journal_res);
return ret;
@ -896,7 +928,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
* XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
* flag
*/
if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
(flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
break;
@ -931,7 +963,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
if (wb->state.nr > wb->size * 3 / 4) {
bch2_trans_begin(trans);
ret = __bch2_btree_write_buffer_flush(trans,
flags|BTREE_INSERT_NOCHECK_RW, true);
flags|BCH_TRANS_COMMIT_no_check_rw, true);
if (!ret) {
trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
@ -951,8 +983,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
!(flags & BTREE_INSERT_NOWAIT) &&
(flags & BTREE_INSERT_NOFAIL), c,
(flags & BCH_TRANS_COMMIT_no_enospc), c,
"%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
return ret;
@ -964,7 +995,7 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
struct bch_fs *c = trans->c;
int ret;
if (likely(!(flags & BTREE_INSERT_LAZY_RW)) ||
if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) ||
test_bit(BCH_FS_STARTED, &c->flags))
return -BCH_ERR_erofs_trans_commit;
@ -1002,7 +1033,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
struct bch_fs *c = trans->c;
struct btree_insert_entry *i = NULL;
struct btree_write_buffered_key *wb;
unsigned u64s;
int ret = 0;
if (!trans->nr_updates &&
@ -1010,9 +1040,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
!trans->extra_journal_entries.nr)
goto out_reset;
if (flags & BTREE_INSERT_GC_LOCK_HELD)
lockdep_assert_held(&c->gc_lock);
ret = bch2_trans_commit_run_triggers(trans);
if (ret)
goto out_reset;
@ -1021,7 +1048,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
struct printbuf buf = PRINTBUF;
enum bkey_invalid_flags invalid_flags = 0;
if (!(flags & BTREE_INSERT_JOURNAL_REPLAY))
if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
@ -1039,7 +1066,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
goto out_reset;
}
if (!(flags & BTREE_INSERT_NOCHECK_RW) &&
if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
ret = bch2_trans_commit_get_rw_cold(trans, flags);
if (ret)
@ -1052,7 +1079,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
bch2_trans_unlock(trans);
ret = __bch2_btree_write_buffer_flush(trans,
flags|BTREE_INSERT_NOCHECK_RW, true);
flags|BCH_TRANS_COMMIT_no_check_rw, true);
if (!ret) {
trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
@ -1062,13 +1089,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
trans->journal_u64s = trans->extra_journal_entries.nr;
trans->journal_preres_u64s = 0;
trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
if (trans->journal_transaction_names)
trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
@ -1084,16 +1106,11 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
if (i->key_cache_already_flushed)
continue;
/* we're going to journal the key being updated: */
u64s = jset_u64s(i->k->k.u64s);
if (i->cached &&
likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY)))
trans->journal_preres_u64s += u64s;
if (i->flags & BTREE_UPDATE_NOJOURNAL)
continue;
trans->journal_u64s += u64s;
/* we're going to journal the key being updated: */
trans->journal_u64s += jset_u64s(i->k->k.u64s);
/* and we're also going to log the overwrite: */
if (trans->journal_transaction_names)
@ -1106,13 +1123,14 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
if (trans->extra_journal_res) {
ret = bch2_disk_reservation_add(c, trans->disk_res,
trans->extra_journal_res,
(flags & BTREE_INSERT_NOFAIL)
(flags & BCH_TRANS_COMMIT_no_enospc)
? BCH_DISK_RESERVATION_NOFAIL : 0);
if (ret)
goto err;
}
retry:
bch2_trans_verify_not_in_restart(trans);
if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_);
@ -1125,9 +1143,7 @@ retry:
trace_and_count(c, transaction_commit, trans, _RET_IP_);
out:
bch2_journal_preres_put(&c->journal, &trans->journal_preres);
if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw)))
bch2_write_ref_put(c, BCH_WRITE_REF_trans);
out_reset:
if (!ret)
@ -1140,5 +1156,17 @@ err:
if (ret)
goto out;
/*
* We might have done another transaction commit in the error path -
* i.e. btree write buffer flush - which will have made use of
* trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is
* how the journal sequence number to pin is passed in - so we must
* restart:
*/
if (flags & BCH_TRANS_COMMIT_no_journal_res) {
ret = -BCH_ERR_transaction_restart_nested;
goto out;
}
goto retry;
}

View File

@ -5,7 +5,7 @@
#include <linux/list.h>
#include <linux/rhashtable.h>
//#include "bkey_methods.h"
#include "btree_key_cache_types.h"
#include "buckets_types.h"
#include "darray.h"
#include "errcode.h"
@ -322,31 +322,6 @@ struct btree_iter {
#endif
};
struct btree_key_cache_freelist {
struct bkey_cached *objs[16];
unsigned nr;
};
struct btree_key_cache {
struct mutex lock;
struct rhashtable table;
bool table_init_done;
struct list_head freed_pcpu;
struct list_head freed_nonpcpu;
struct shrinker shrink;
unsigned shrink_iter;
struct btree_key_cache_freelist __percpu *pcpu_freed;
atomic_long_t nr_freed;
atomic_long_t nr_keys;
atomic_long_t nr_dirty;
};
struct bkey_cached_key {
u32 btree_id;
struct bpos pos;
} __packed __aligned(4);
#define BKEY_CACHED_ACCESSED 0
#define BKEY_CACHED_DIRTY 1
@ -362,7 +337,6 @@ struct bkey_cached {
struct rhash_head hash;
struct list_head list;
struct journal_preres res;
struct journal_entry_pin journal;
u64 seq;
@ -392,7 +366,6 @@ struct btree_insert_entry {
u8 old_btree_u64s;
struct bkey_i *k;
struct btree_path *path;
u64 seq;
/* key being overwritten: */
struct bkey old_k;
const struct bch_val *old_v;
@ -441,6 +414,7 @@ struct btree_trans {
bool journal_replay_not_finished:1;
bool is_initial_gc:1;
bool notrace_relock_fail:1;
bool write_locked:1;
enum bch_errcode restarted:16;
u32 restart_count;
unsigned long last_begin_ip;
@ -472,11 +446,9 @@ struct btree_trans {
struct journal_entry_pin *journal_pin;
struct journal_res journal_res;
struct journal_preres journal_preres;
u64 *journal_seq;
struct disk_reservation *disk_res;
unsigned journal_u64s;
unsigned journal_preres_u64s;
struct replicas_delta_list *fs_usage_deltas;
};
@ -717,6 +689,17 @@ static inline bool btree_type_has_snapshots(enum btree_id id)
return (1U << id) & mask;
}
static inline bool btree_type_has_snapshot_field(enum btree_id id)
{
const unsigned mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr)
BCH_BTREE_IDS()
#undef x
;
return (1U << id) & mask;
}
static inline bool btree_type_has_ptrs(enum btree_id id)
{
const unsigned mask = 0

View File

@ -380,21 +380,12 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i, n;
u64 seq = 0;
int cmp;
EBUG_ON(!path->should_be_locked);
EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
EBUG_ON(!bpos_eq(k->k.p, path->pos));
/*
* The transaction journal res hasn't been allocated at this point.
* That occurs at commit time. Reuse the seq field to pass in the seq
* of a prejournaled key.
*/
if (flags & BTREE_UPDATE_PREJOURNAL)
seq = trans->journal_res.seq;
n = (struct btree_insert_entry) {
.flags = flags,
.bkey_type = __btree_node_type(path->level, path->btree_id),
@ -403,7 +394,6 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
.cached = path->cached,
.path = path,
.k = k,
.seq = seq,
.ip_allocated = ip,
};
@ -431,7 +421,6 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
i->cached = n.cached;
i->k = n.k;
i->path = n.path;
i->seq = n.seq;
i->ip_allocated = n.ip_allocated;
} else {
array_insert_item(trans->updates, trans->nr_updates,
@ -542,18 +531,6 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_);
}
/*
* Add a transaction update for a key that has already been journaled.
*/
int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq,
struct btree_iter *iter, struct bkey_i *k,
enum btree_update_flags flags)
{
trans->journal_res.seq = seq;
return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL|
BTREE_UPDATE_PREJOURNAL);
}
int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
enum btree_id btree,
struct bkey_i *k)
@ -792,7 +769,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
bch2_trans_commit(trans, &disk_res, journal_seq,
BTREE_INSERT_NOFAIL);
BCH_TRANS_COMMIT_no_enospc);
bch2_disk_reservation_put(trans->c, &disk_res);
err:
/*
@ -897,7 +874,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
} else {
ret = bch2_trans_do(c, NULL, NULL,
BTREE_INSERT_LAZY_RW|commit_flags,
BCH_TRANS_COMMIT_lazy_rw|commit_flags,
__bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args));
}

View File

@ -21,37 +21,28 @@ void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
struct bkey_i *, u64);
enum btree_insert_flags {
#define BCH_TRANS_COMMIT_FLAGS() \
x(no_enospc, "don't check for enospc") \
x(no_check_rw, "don't attempt to take a ref on c->writes") \
x(lazy_rw, "go read-write if we haven't yet - only for use in recovery") \
x(no_journal_res, "don't take a journal reservation, instead " \
"pin journal entry referred to by trans->journal_res.seq") \
x(journal_reclaim, "operation required for journal reclaim; may return error" \
"instead of deadlocking if BCH_WATERMARK_reclaim not specified")\
enum __bch_trans_commit_flags {
/* First bits for bch_watermark: */
__BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS,
__BTREE_INSERT_NOCHECK_RW,
__BTREE_INSERT_LAZY_RW,
__BTREE_INSERT_JOURNAL_REPLAY,
__BTREE_INSERT_JOURNAL_RECLAIM,
__BTREE_INSERT_NOWAIT,
__BTREE_INSERT_GC_LOCK_HELD,
__BCH_HASH_SET_MUST_CREATE,
__BCH_HASH_SET_MUST_REPLACE,
__BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS,
#define x(n, ...) __BCH_TRANS_COMMIT_##n,
BCH_TRANS_COMMIT_FLAGS()
#undef x
};
/* Don't check for -ENOSPC: */
#define BTREE_INSERT_NOFAIL BIT(__BTREE_INSERT_NOFAIL)
#define BTREE_INSERT_NOCHECK_RW BIT(__BTREE_INSERT_NOCHECK_RW)
#define BTREE_INSERT_LAZY_RW BIT(__BTREE_INSERT_LAZY_RW)
/* Insert is for journal replay - don't get journal reservations: */
#define BTREE_INSERT_JOURNAL_REPLAY BIT(__BTREE_INSERT_JOURNAL_REPLAY)
/* Insert is being called from journal reclaim path: */
#define BTREE_INSERT_JOURNAL_RECLAIM BIT(__BTREE_INSERT_JOURNAL_RECLAIM)
/* Don't block on allocation failure (for new btree nodes: */
#define BTREE_INSERT_NOWAIT BIT(__BTREE_INSERT_NOWAIT)
#define BTREE_INSERT_GC_LOCK_HELD BIT(__BTREE_INSERT_GC_LOCK_HELD)
#define BCH_HASH_SET_MUST_CREATE BIT(__BCH_HASH_SET_MUST_CREATE)
#define BCH_HASH_SET_MUST_REPLACE BIT(__BCH_HASH_SET_MUST_REPLACE)
enum bch_trans_commit_flags {
#define x(n, ...) BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n),
BCH_TRANS_COMMIT_FLAGS()
#undef x
};
int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
unsigned, unsigned);

View File

@ -475,9 +475,6 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
/*
* Protects reaping from the btree node cache and using the btree node
* open bucket reserve:
*
* BTREE_INSERT_NOWAIT only applies to btree node allocation, not
* blocking on this lock:
*/
ret = bch2_btree_cache_cannibalize_lock(c, cl);
if (ret)
@ -487,8 +484,7 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
struct prealloc_nodes *p = as->prealloc_nodes + interior;
while (p->nr < nr_nodes[interior]) {
b = __bch2_btree_node_alloc(trans, &as->disk_res,
flags & BTREE_INSERT_NOWAIT ? NULL : cl,
b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
interior, flags);
if (IS_ERR(b)) {
ret = PTR_ERR(b);
@ -513,8 +509,6 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *
up_read(&c->gc_lock);
as->took_gc_lock = false;
bch2_journal_preres_put(&c->journal, &as->journal_preres);
bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_pin_flush(&c->journal, &as->journal);
bch2_disk_reservation_put(c, &as->disk_res);
@ -646,9 +640,9 @@ static void btree_update_nodes_written(struct btree_update *as)
*/
ret = commit_do(trans, &as->disk_res, &journal_seq,
BCH_WATERMARK_reclaim|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_JOURNAL_RECLAIM,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_journal_reclaim,
btree_update_nodes_written_trans(trans, as));
bch2_trans_unlock(trans);
@ -734,8 +728,6 @@ err:
bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_preres_put(&c->journal, &as->journal_preres);
mutex_lock(&c->btree_interior_update_lock);
for (i = 0; i < as->nr_new_nodes; i++) {
b = as->new_nodes[i];
@ -818,6 +810,12 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
mutex_unlock(&c->btree_interior_update_lock);
}
static int bch2_update_reparent_journal_pin_flush(struct journal *j,
struct journal_entry_pin *_pin, u64 seq)
{
return 0;
}
static void btree_update_reparent(struct btree_update *as,
struct btree_update *child)
{
@ -828,7 +826,8 @@ static void btree_update_reparent(struct btree_update *as,
child->b = NULL;
child->mode = BTREE_INTERIOR_UPDATING_AS;
bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
bch2_update_reparent_journal_pin_flush);
}
static void btree_update_updated_root(struct btree_update *as, struct btree *b)
@ -937,6 +936,12 @@ static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct b
b->ob.v[--b->ob.nr];
}
static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
struct journal_entry_pin *_pin, u64 seq)
{
return 0;
}
/*
* @b is being split/rewritten: it may have pointers to not-yet-written btree
* nodes and thus outstanding btree_updates - redirect @b's
@ -988,11 +993,13 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
* when the new nodes are persistent and reachable on disk:
*/
w = btree_current_write(b);
bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
bch2_btree_update_will_free_node_journal_pin_flush);
bch2_journal_pin_drop(&c->journal, &w->journal);
w = btree_prev_write(b);
bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
bch2_btree_update_will_free_node_journal_pin_flush);
bch2_journal_pin_drop(&c->journal, &w->journal);
mutex_unlock(&c->btree_interior_update_lock);
@ -1042,7 +1049,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
struct bch_fs *c = trans->c;
struct btree_update *as;
u64 start_time = local_clock();
int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
? BCH_DISK_RESERVATION_NOFAIL : 0;
unsigned nr_nodes[2] = { 0, 0 };
unsigned update_level = level;
@ -1061,7 +1068,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
flags &= ~BCH_WATERMARK_MASK;
flags |= watermark;
if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
if (flags & BCH_TRANS_COMMIT_journal_reclaim)
journal_flags |= JOURNAL_RES_GET_NONBLOCK;
journal_flags |= watermark;
@ -1087,9 +1094,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
}
if (flags & BTREE_INSERT_GC_LOCK_HELD)
lockdep_assert_held(&c->gc_lock);
else if (!down_read_trylock(&c->gc_lock)) {
if (!down_read_trylock(&c->gc_lock)) {
ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
if (ret) {
up_read(&c->gc_lock);
@ -1103,7 +1108,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
as->c = c;
as->start_time = start_time;
as->mode = BTREE_INTERIOR_NO_UPDATE;
as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
as->took_gc_lock = true;
as->btree_id = path->btree_id;
as->update_level = update_level;
INIT_LIST_HEAD(&as->list);
@ -1129,27 +1134,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
if (ret)
goto err;
ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
BTREE_UPDATE_JOURNAL_RES,
journal_flags|JOURNAL_RES_GET_NONBLOCK);
if (ret) {
if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
goto err;
}
ret = drop_locks_do(trans,
bch2_journal_preres_get(&c->journal, &as->journal_preres,
BTREE_UPDATE_JOURNAL_RES,
journal_flags));
if (ret == -BCH_ERR_journal_preres_get_blocked) {
trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get);
}
if (ret)
goto err;
}
ret = bch2_disk_reservation_get(c, &as->disk_res,
(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
c->opts.metadata_replicas,
@ -1167,7 +1151,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
* flag
*/
if (bch2_err_matches(ret, ENOSPC) &&
(flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark != BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
goto err;
@ -1855,7 +1839,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
parent = btree_node_parent(path, b);
as = bch2_btree_update_start(trans, path, level, false,
BTREE_INSERT_NOFAIL|flags);
BCH_TRANS_COMMIT_no_enospc|flags);
ret = PTR_ERR_OR_ZERO(as);
if (ret)
goto err;
@ -1941,7 +1925,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
struct btree_update *as;
int ret;
flags |= BTREE_INSERT_NOFAIL;
flags |= BCH_TRANS_COMMIT_no_enospc;
parent = btree_node_parent(iter->path, b);
as = bch2_btree_update_start(trans, iter->path, b->c.level,
@ -2418,23 +2402,17 @@ void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry
struct jset_entry *
bch2_btree_roots_to_journal_entries(struct bch_fs *c,
struct jset_entry *start,
struct jset_entry *end)
struct jset_entry *end,
unsigned long skip)
{
struct jset_entry *entry;
unsigned long have = 0;
unsigned i;
for (entry = start; entry < end; entry = vstruct_next(entry))
if (entry->type == BCH_JSET_ENTRY_btree_root)
__set_bit(entry->btree_id, &have);
mutex_lock(&c->btree_root_lock);
for (i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (r->alive && !test_bit(i, &have)) {
if (r->alive && !test_bit(i, &skip)) {
journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
i, r->level, &r->key, r->key.k.u64s);
end = vstruct_next(end);

View File

@ -55,7 +55,6 @@ struct btree_update {
unsigned update_level;
struct disk_reservation disk_res;
struct journal_preres journal_preres;
/*
* BTREE_INTERIOR_UPDATING_NODE:
@ -325,7 +324,7 @@ bool bch2_btree_interior_updates_flush(struct bch_fs *);
void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
struct jset_entry *, struct jset_entry *);
struct jset_entry *, unsigned long);
void bch2_do_pending_node_rewrites(struct bch_fs *);
void bch2_free_pending_node_rewrites(struct bch_fs *);

View File

@ -9,9 +9,11 @@
#include "journal.h"
#include "journal_reclaim.h"
#include <linux/atomic.h>
#include <linux/sort.h>
static int bch2_btree_write_buffer_journal_flush(struct journal *,
struct journal_entry_pin *, u64);
static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
{
const struct btree_write_buffered_key *l = _l;
@ -46,6 +48,13 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
if (ret)
return ret;
/*
* We can't clone a path that has write locks: unshare it now, before
* set_pos and traverse():
*/
if (iter->path->ref > 1)
iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_);
path = iter->path;
if (!*write_locked) {
@ -65,24 +74,18 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
(*fast)++;
if (path->ref > 1) {
/*
* We can't clone a path that has write locks: if the path is
* shared, unlock before set_pos(), traverse():
*/
bch2_btree_node_unlock_write(trans, path, path->l[0].b);
*write_locked = false;
}
return 0;
trans_commit:
return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k,
trans->journal_res.seq = wb->journal_seq;
return bch2_trans_update(trans, iter, &wb->k,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL,
commit_flags|
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_JOURNAL_RECLAIM);
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_journal_res|
BCH_TRANS_COMMIT_journal_reclaim);
}
static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
@ -125,8 +128,10 @@ btree_write_buffered_insert(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
BTREE_ITER_CACHED|BTREE_ITER_INTENT);
trans->journal_res.seq = wb->journal_seq;
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k,
bch2_trans_update(trans, &iter, &wb->k,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
bch2_trans_iter_exit(trans, &iter);
return ret;
@ -151,7 +156,8 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
if (!locked && !mutex_trylock(&wb->flush_lock))
return 0;
bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL);
bch2_journal_pin_copy(j, &pin, &wb->journal_pin,
bch2_btree_write_buffer_journal_flush);
bch2_journal_pin_drop(j, &wb->journal_pin);
s = btree_write_buffer_switch(wb);
@ -169,7 +175,7 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
* However, since we're not flushing in the order they appear in the
* journal we won't be able to drop our journal pin until everything is
* flushed - which means this could deadlock the journal if we weren't
* passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail
* passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail
* if it would block taking a journal reservation.
*
* If that happens, simply skip the key so we can optimistically insert
@ -253,21 +259,14 @@ slowpath:
if (!i->journal_seq)
continue;
if (i->journal_seq > pin.seq) {
struct journal_entry_pin pin2;
memset(&pin2, 0, sizeof(pin2));
bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL);
bch2_journal_pin_drop(j, &pin);
bch2_journal_pin_copy(j, &pin, &pin2, NULL);
bch2_journal_pin_drop(j, &pin2);
}
bch2_journal_pin_update(j, i->journal_seq, &pin,
bch2_btree_write_buffer_journal_flush);
ret = commit_do(trans, NULL, NULL,
commit_flags|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_JOURNAL_RECLAIM,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_journal_res|
BCH_TRANS_COMMIT_journal_reclaim,
btree_write_buffered_insert(trans, i));
if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
break;
@ -297,7 +296,7 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j,
mutex_lock(&wb->flush_lock);
return bch2_trans_run(c,
__bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true));
__bch2_btree_write_buffer_flush(trans, BCH_TRANS_COMMIT_no_check_rw, true));
}
static inline u64 btree_write_buffer_ref(int idx)

21
libbcachefs/darray.c Normal file
View File

@ -0,0 +1,21 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/log2.h>
#include <linux/slab.h>
#include "darray.h"
int __bch2_darray_resize(darray_void *d, size_t element_size, size_t new_size, gfp_t gfp)
{
if (new_size > d->size) {
new_size = roundup_pow_of_two(new_size);
void *data = krealloc_array(d->data, new_size, element_size, gfp);
if (!data)
return -ENOMEM;
d->data = data;
d->size = new_size;
}
return 0;
}

View File

@ -8,7 +8,6 @@
* Inspired by CCAN's darray
*/
#include "util.h"
#include <linux/slab.h>
#define DARRAY(type) \
@ -19,20 +18,25 @@ struct { \
typedef DARRAY(void) darray_void;
int __bch2_darray_resize(darray_void *, size_t, size_t, gfp_t);
static inline int __darray_resize(darray_void *d, size_t element_size,
size_t new_size, gfp_t gfp)
{
return unlikely(new_size > d->size)
? __bch2_darray_resize(d, element_size, new_size, gfp)
: 0;
}
#define darray_resize_gfp(_d, _new_size, _gfp) \
__darray_resize((darray_void *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)
#define darray_resize(_d, _new_size) \
darray_resize_gfp(_d, _new_size, GFP_KERNEL)
static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp)
{
if (d->nr + more > d->size) {
size_t new_size = roundup_pow_of_two(d->nr + more);
void *data = krealloc_array(d->data, new_size, t_size, gfp);
if (!data)
return -ENOMEM;
d->data = data;
d->size = new_size;
}
return 0;
return __darray_resize(d, t_size, d->nr + more, gfp);
}
#define darray_make_room_gfp(_d, _more, _gfp) \
@ -41,6 +45,8 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more,
#define darray_make_room(_d, _more) \
darray_make_room_gfp(_d, _more, GFP_KERNEL)
#define darray_room(_d) ((_d).size - (_d).nr)
#define darray_top(_d) ((_d).data[(_d).nr])
#define darray_push_gfp(_d, _item, _gfp) \

View File

@ -239,6 +239,34 @@ restart_drop_extra_replicas:
next_pos = insert->k.p;
/*
* Check for nonce offset inconsistency:
* This is debug code - we've been seeing this bug rarely, and
* it's been hard to reproduce, so this should give us some more
* information when it does occur:
*/
struct printbuf err = PRINTBUF;
int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err);
printbuf_exit(&err);
if (invalid) {
struct printbuf buf = PRINTBUF;
prt_str(&buf, "about to insert invalid key in data update path");
prt_str(&buf, "\nold: ");
bch2_bkey_val_to_text(&buf, c, old);
prt_str(&buf, "\nk: ");
bch2_bkey_val_to_text(&buf, c, k);
prt_str(&buf, "\nnew: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
bch2_print_string_as_lines(KERN_ERR, buf.buf);
printbuf_exit(&buf);
bch2_fatal_error(c);
goto out;
}
ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
@ -250,8 +278,8 @@ restart_drop_extra_replicas:
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, &op->res,
NULL,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
m->data_opts.btree_insert_flags);
if (!ret) {
bch2_btree_iter_set_pos(&iter, next_pos);

View File

@ -201,7 +201,8 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
u64 *dir_offset, int flags)
u64 *dir_offset,
bch_str_hash_flags_t str_hash_flags)
{
struct bkey_i_dirent *dirent;
int ret;
@ -212,7 +213,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
return ret;
ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
dir, &dirent->k_i, flags);
dir, &dirent->k_i, str_hash_flags);
*dir_offset = dirent->k.p.offset;
return ret;

View File

@ -37,7 +37,8 @@ int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
int bch2_dirent_create(struct btree_trans *, subvol_inum,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *, int);
const struct qstr *, u64, u64 *,
bch_str_hash_flags_t);
static inline unsigned vfs_d_type(unsigned type)
{

View File

@ -555,6 +555,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
case TARGET_DEV: {
struct bch_dev *ca;
out->atomic++;
rcu_read_lock();
ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
@ -570,6 +571,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
}
rcu_read_unlock();
out->atomic--;
break;
}
case TARGET_GROUP:
@ -580,7 +582,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
}
}
void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
{
struct target t = target_decode(v);

View File

@ -150,6 +150,7 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
if (i < nr_data)
prt_printf(out, "#%u", stripe_blockcount_get(s, i));
prt_printf(out, " gen %u", ptr->gen);
if (ptr_stale(ca, ptr))
prt_printf(out, " stale");
}
@ -303,16 +304,21 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
struct bch_csum got = ec_block_checksum(buf, i, offset);
if (bch2_crc_cmp(want, got)) {
struct printbuf buf2 = PRINTBUF;
struct printbuf err = PRINTBUF;
struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);
bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key));
prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n",
want.hi, want.lo,
got.hi, got.lo,
bch2_csum_types[v->csum_type]);
prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
bch_err_ratelimited(ca, "%s", err.buf);
printbuf_exit(&err);
bch_err_ratelimited(c,
"stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
(void *) _RET_IP_, i, j, v->csum_type,
want.lo, got.lo, buf2.buf);
printbuf_exit(&buf2);
clear_bit(i, buf->valid);
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
break;
}
@ -475,14 +481,10 @@ err:
return ret;
}
static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
{
return bch2_trans_run(c, get_stripe_key_trans(trans, idx, stripe));
}
/* recovery read path: */
int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
{
struct bch_fs *c = trans->c;
struct ec_stripe_buf *buf;
struct closure cl;
struct bch_stripe *v;
@ -497,7 +499,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
if (!buf)
return -BCH_ERR_ENOMEM_ec_read_extent;
ret = get_stripe_key(c, rbio->pick.ec.idx, buf);
ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
if (ret) {
bch_err_ratelimited(c,
"error doing reconstruct read: error %i looking up stripe", ret);
@ -801,7 +803,7 @@ static void ec_stripe_delete_work(struct work_struct *work)
if (!idx)
break;
ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
ec_stripe_delete(trans, idx));
if (ret) {
bch_err_fn(c, ret);
@ -981,8 +983,8 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
while (1) {
ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL,
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc,
ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
s, &bp_pos));
if (ret)
@ -1119,8 +1121,8 @@ static void ec_stripe_create(struct ec_stripe_new *s)
}
ret = bch2_trans_do(c, &s->res, NULL,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL,
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc,
ec_stripe_key_update(trans,
bkey_i_to_stripe(&s->new_stripe.key),
!s->have_existing_stripe));
@ -1371,6 +1373,15 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
h->nr_active_devs++;
rcu_read_unlock();
/*
* If we only have redundancy + 1 devices, we're better off with just
* replication:
*/
if (h->nr_active_devs < h->redundancy + 2)
bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
h->nr_active_devs, h->redundancy + 2);
list_add(&h->list, &c->ec_stripe_head_list);
return h;
}
@ -1422,6 +1433,11 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
found:
if (!IS_ERR_OR_NULL(h) &&
h->nr_active_devs < h->redundancy + 2) {
mutex_unlock(&h->lock);
h = NULL;
}
mutex_unlock(&c->ec_stripe_head_lock);
return h;
}
@ -1679,8 +1695,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
int ret;
h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
if (!h)
bch_err(c, "no stripe head");
if (IS_ERR_OR_NULL(h))
return h;

View File

@ -199,7 +199,7 @@ struct ec_stripe_head {
struct ec_stripe_new *s;
};
int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *);
int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *);
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);

View File

@ -73,7 +73,6 @@
x(ENOMEM, ENOMEM_fsck_add_nlink) \
x(ENOMEM, ENOMEM_journal_key_insert) \
x(ENOMEM, ENOMEM_journal_keys_sort) \
x(ENOMEM, ENOMEM_journal_replay) \
x(ENOMEM, ENOMEM_read_superblock_clean) \
x(ENOMEM, ENOMEM_fs_alloc) \
x(ENOMEM, ENOMEM_fs_name_alloc) \

View File

@ -13,7 +13,7 @@
int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
loff_t start, u64 end,
int fgp_flags, gfp_t gfp,
fgf_t fgp_flags, gfp_t gfp,
folios *fs)
{
struct folio *f;

View File

@ -7,7 +7,7 @@
typedef DARRAY(struct folio *) folios;
int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t,
u64, int, gfp_t, folios *);
u64, fgf_t, gfp_t, folios *);
int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t);
/*

View File

@ -93,7 +93,7 @@ retry:
BTREE_ITER_INTENT) ?:
(set ? set(trans, inode, &inode_u, p) : 0) ?:
bch2_inode_write(trans, &iter, &inode_u) ?:
bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
/*
* the btree node lock protects inode->ei_inode, not ei_update_lock;
@ -452,7 +452,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL,
BCH_TRANS_COMMIT_no_enospc,
bch2_unlink_trans(trans,
inode_inum(dir), &dir_u,
&inode_u, &dentry->d_name,
@ -717,7 +717,7 @@ retry:
ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
BCH_TRANS_COMMIT_no_enospc);
btree_err:
bch2_trans_iter_exit(trans, &inode_iter);
@ -1922,10 +1922,7 @@ out:
return dget(sb->s_root);
err_put_super:
sb->s_fs_info = NULL;
c->vfs_sb = NULL;
deactivate_locked_super(sb);
bch2_fs_stop(c);
return ERR_PTR(bch2_err_class(ret));
}
@ -1933,10 +1930,7 @@ static void bch2_kill_sb(struct super_block *sb)
{
struct bch_fs *c = sb->s_fs_info;
if (c)
c->vfs_sb = NULL;
generic_shutdown_super(sb);
if (c)
bch2_fs_free(c);
}

View File

@ -208,8 +208,8 @@ static int fsck_write_inode(struct btree_trans *trans,
u32 snapshot)
{
int ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_lazy_rw,
__write_inode(trans, inode, snapshot));
if (ret)
bch_err_fn(trans->c, ret);
@ -354,8 +354,8 @@ static int reattach_inode(struct btree_trans *trans,
u32 inode_snapshot)
{
int ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_NOFAIL,
BCH_TRANS_COMMIT_lazy_rw|
BCH_TRANS_COMMIT_no_enospc,
__reattach_inode(trans, inode, inode_snapshot));
bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum);
return ret;
@ -757,8 +757,8 @@ static int hash_redo_key(struct btree_trans *trans,
BCH_HASH_SET_MUST_CREATE,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW);
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_lazy_rw);
}
static int hash_check_key(struct btree_trans *trans,
@ -992,7 +992,7 @@ int bch2_check_inodes(struct bch_fs *c)
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_inode(trans, &iter, k, &prev, &s, full));
snapshots_seen_exit(&s);
@ -1226,7 +1226,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
k1, k2) ?:
bch2_trans_commit(trans, &res, NULL,
BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL);
BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc);
bch2_disk_reservation_put(c, &res);
if (ret)
@ -1465,7 +1465,7 @@ int bch2_check_extents(struct bch_fs *c)
POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
&res, NULL,
BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, ({
bch2_disk_reservation_put(c, &res);
check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
check_extent_overbig(trans, &iter, k);
@ -1494,7 +1494,7 @@ int bch2_check_indirect_extents(struct bch_fs *c)
POS_MIN,
BTREE_ITER_PREFETCH, k,
&res, NULL,
BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, ({
bch2_disk_reservation_put(c, &res);
check_extent_overbig(trans, &iter, k);
}));
@ -1854,7 +1854,7 @@ int bch2_check_dirents(struct bch_fs *c)
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
k,
NULL, NULL,
BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s));
bch2_trans_put(trans);
@ -1918,7 +1918,7 @@ int bch2_check_xattrs(struct bch_fs *c)
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
k,
NULL, NULL,
BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_xattr(trans, &iter, k, &hash_info, &inode)));
bch_err_fn(c, ret);
return ret;
@ -1949,8 +1949,8 @@ static int check_root_trans(struct btree_trans *trans)
root_subvol.v.snapshot = cpu_to_le32(snapshot);
root_subvol.v.inode = cpu_to_le64(inum);
ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_lazy_rw,
bch2_btree_insert_trans(trans, BTREE_ID_subvolumes,
&root_subvol.k_i, 0));
bch_err_msg(c, ret, "writing root subvol");
@ -1986,8 +1986,8 @@ int bch2_check_root(struct bch_fs *c)
int ret;
ret = bch2_trans_do(c, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_lazy_rw,
check_root_trans(trans));
bch_err_fn(c, ret);
return ret;
@ -2116,8 +2116,8 @@ static int check_path(struct btree_trans *trans,
return 0;
ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_lazy_rw,
remove_backpointer(trans, inode));
if (ret) {
bch_err(c, "error removing dirent: %i", ret);
@ -2398,7 +2398,7 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
POS(0, range_start),
BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
if (ret < 0) {
bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
@ -2483,7 +2483,7 @@ int bch2_fix_reflink_p(struct bch_fs *c)
BTREE_ID_extents, POS_MIN,
BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS, k,
NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
fix_reflink_p_key(trans, &iter, k)));
bch_err_fn(c, ret);
return ret;

View File

@ -830,7 +830,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
BCH_TRANS_COMMIT_no_enospc);
err:
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
@ -893,7 +893,7 @@ retry:
ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
BCH_TRANS_COMMIT_no_enospc);
err:
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@ -1057,7 +1057,7 @@ retry:
ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
BCH_TRANS_COMMIT_no_enospc);
err:
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@ -1091,7 +1091,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
ret = bch2_inode_unpack(k, &inode);
if (ret)
goto err;
goto out;
if (fsck_err_on(S_ISDIR(inode.bi_mode), c,
deleted_inode_is_dir,
@ -1109,38 +1109,45 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
!fsck_err(c,
deleted_inode_but_clean,
"filesystem marked as clean but have deleted inode %llu:%u",
pos.offset, pos.snapshot))
return 0;
pos.offset, pos.snapshot)) {
ret = 0;
goto out;
}
if (bch2_snapshot_is_internal_node(c, pos.snapshot)) {
struct bpos new_min_pos;
ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos);
if (ret)
goto err;
goto out;
inode.bi_flags &= ~BCH_INODE_unlinked;
ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_lazy_rw);
bch_err_msg(c, ret, "clearing inode unlinked flag");
if (ret)
return ret;
goto out;
/*
* We'll need another write buffer flush to pick up the new
* unlinked inodes in the snapshot leaves:
*/
*need_another_pass = true;
return 0;
goto out;
}
return 1;
err:
ret = 1;
out:
fsck_err:
bch2_trans_iter_exit(trans, &inode_iter);
return ret;
delete:
return bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
goto out;
}
int bch2_delete_dead_inodes(struct bch_fs *c)

View File

@ -256,7 +256,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
u64 new_i_size = le64_to_cpu(op->v.new_i_size);
int ret;
ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
truncate_set_isize(trans, inum, new_i_size));
if (ret)
goto err;
@ -378,7 +378,7 @@ case LOGGED_OP_FINSERT_start:
op->v.state = LOGGED_OP_FINSERT_shift_extents;
if (insert) {
ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
adjust_i_size(trans, inum, src_offset, len) ?:
bch2_logged_op_update(trans, &op->k_i));
if (ret)
@ -390,7 +390,7 @@ case LOGGED_OP_FINSERT_start:
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto err;
ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_logged_op_update(trans, &op->k_i));
}
@ -455,7 +455,7 @@ case LOGGED_OP_FINSERT_shift_extents:
bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
bch2_logged_op_update(trans, &op->k_i) ?:
bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL);
bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc);
btree_err:
bch2_disk_reservation_put(c, &disk_res);
@ -470,12 +470,12 @@ btree_err:
op->v.state = LOGGED_OP_FINSERT_finish;
if (!insert) {
ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
adjust_i_size(trans, inum, src_offset, shift) ?:
bch2_logged_op_update(trans, &op->k_i));
} else {
/* We need an inode update to update bi_journal_seq for fsync: */
ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
adjust_i_size(trans, inum, 0, 0) ?:
bch2_logged_op_update(trans, &op->k_i));
}

View File

@ -526,7 +526,7 @@ out:
static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_rbio_narrow_crcs(trans, rbio));
}
@ -1025,7 +1025,7 @@ get_bio:
trans->notrace_relock_fail = true;
} else {
/* Attempting reconstruct read: */
if (bch2_ec_read_extent(c, rbio)) {
if (bch2_ec_read_extent(trans, rbio)) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}

View File

@ -202,6 +202,17 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
struct btree_iter iter;
struct bkey_i *k;
struct bkey_i_inode_v3 *inode;
/*
* Crazy performance optimization:
* Every extent update needs to also update the inode: the inode trigger
* will set bi->journal_seq to the journal sequence number of this
* transaction - for fsync.
*
* But if that's the only reason we're updating the inode (we're not
* updating bi_size or bi_sectors), then we don't need the inode update
* to be journalled - if we crash, the bi_journal_seq update will be
* lost, but that's fine.
*/
unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
int ret;
@ -305,8 +316,8 @@ int bch2_extent_update(struct btree_trans *trans,
i_sectors_delta) ?:
bch2_trans_update(trans, iter, k, 0) ?:
bch2_trans_commit(trans, disk_res, NULL,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL);
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc);
if (unlikely(ret))
return ret;
@ -1165,7 +1176,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
bkey_start_pos(&orig->k), orig->k.p,
BTREE_ITER_INTENT, k,
NULL, NULL, BTREE_INSERT_NOFAIL, ({
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
}));

View File

@ -361,11 +361,6 @@ static int journal_entry_open(struct journal *j)
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
if (j->res_get_blocked_start)
bch2_time_stats_update(j->blocked_time,
j->res_get_blocked_start);
j->res_get_blocked_start = 0;
mod_delayed_work(c->io_complete_wq,
&j->write_work,
msecs_to_jiffies(c->opts.journal_flush_delay));
@ -465,15 +460,12 @@ retry:
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
ret = journal_entry_open(j);
if (ret == JOURNAL_ERR_max_in_flight)
if (ret == JOURNAL_ERR_max_in_flight) {
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
&j->max_in_flight_start, true);
trace_and_count(c, journal_entry_full, c);
unlock:
if ((ret && ret != JOURNAL_ERR_insufficient_devices) &&
!j->res_get_blocked_start) {
j->res_get_blocked_start = local_clock() ?: 1;
trace_and_count(c, journal_full, c);
}
unlock:
can_discard = j->can_discard;
spin_unlock(&j->lock);
@ -526,36 +518,6 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
return ret;
}
/* journal_preres: */
static bool journal_preres_available(struct journal *j,
struct journal_preres *res,
unsigned new_u64s,
unsigned flags)
{
bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true);
if (!ret && mutex_trylock(&j->reclaim_lock)) {
bch2_journal_reclaim(j);
mutex_unlock(&j->reclaim_lock);
}
return ret;
}
int __bch2_journal_preres_get(struct journal *j,
struct journal_preres *res,
unsigned new_u64s,
unsigned flags)
{
int ret;
closure_wait_event(&j->preres_wait,
(ret = bch2_journal_error(j)) ||
journal_preres_available(j, res, new_u64s, flags));
return ret;
}
/* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *j,
@ -1290,6 +1252,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
union journal_res_state s;
struct bch_dev *ca;
unsigned long now = jiffies;
u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes;
u64 seq;
unsigned i;
@ -1306,11 +1269,13 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining);
prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]);
prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
prt_printf(out, "average write size:\t");
prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0);
prt_newline(out);
prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);

View File

@ -395,104 +395,6 @@ out:
return 0;
}
/* journal_preres: */
static inline void journal_set_watermark(struct journal *j)
{
union journal_preres_state s = READ_ONCE(j->prereserved);
unsigned watermark = BCH_WATERMARK_stripe;
if (fifo_free(&j->pin) < j->pin.size / 4)
watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
if (fifo_free(&j->pin) < j->pin.size / 8)
watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
if (s.reserved > s.remaining)
watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
if (!s.remaining)
watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
if (watermark == j->watermark)
return;
swap(watermark, j->watermark);
if (watermark > j->watermark)
journal_wake(j);
}
static inline void bch2_journal_preres_put(struct journal *j,
struct journal_preres *res)
{
union journal_preres_state s = { .reserved = res->u64s };
if (!res->u64s)
return;
s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
res->u64s = 0;
if (unlikely(s.waiting)) {
clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)),
(unsigned long *) &j->prereserved.v);
closure_wake_up(&j->preres_wait);
}
if (s.reserved <= s.remaining && j->watermark)
journal_set_watermark(j);
}
int __bch2_journal_preres_get(struct journal *,
struct journal_preres *, unsigned, unsigned);
static inline int bch2_journal_preres_get_fast(struct journal *j,
struct journal_preres *res,
unsigned new_u64s,
unsigned flags,
bool set_waiting)
{
int d = new_u64s - res->u64s;
union journal_preres_state old, new;
u64 v = atomic64_read(&j->prereserved.counter);
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
int ret;
do {
old.v = new.v = v;
ret = 0;
if (watermark == BCH_WATERMARK_reclaim ||
new.reserved + d < new.remaining) {
new.reserved += d;
ret = 1;
} else if (set_waiting && !new.waiting)
new.waiting = true;
else
return 0;
} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
old.v, new.v)) != old.v);
if (ret)
res->u64s += d;
return ret;
}
static inline int bch2_journal_preres_get(struct journal *j,
struct journal_preres *res,
unsigned new_u64s,
unsigned flags)
{
if (new_u64s <= res->u64s)
return 0;
if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false))
return 0;
if (flags & JOURNAL_RES_GET_NONBLOCK)
return -BCH_ERR_journal_preres_get_blocked;
return __bch2_journal_preres_get(j, res, new_u64s, flags);
}
/* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *,

View File

@ -1079,6 +1079,12 @@ found:
if (ja->bucket_seq[ja->cur_idx] &&
ja->sectors_free == ca->mi.bucket_size) {
#if 0
/*
* Debug code for ZNS support, where we (probably) want to be
* correlated where we stopped in the journal to the zone write
* points:
*/
bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
for (i = 0; i < 3; i++) {
@ -1086,6 +1092,7 @@ found:
bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
}
#endif
ja->sectors_free = 0;
}
@ -1585,6 +1592,9 @@ static void journal_write_done(struct closure *cl)
bch2_journal_space_available(j);
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
&j->max_in_flight_start, false);
closure_wake_up(&w->wait);
journal_wake(j);
@ -1678,9 +1688,15 @@ static void do_journal_write(struct closure *cl)
continue_at(cl, journal_write_done, c->io_complete_wq);
}
static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset)
static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
{
struct jset_entry *i, *next, *prev = NULL;
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct jset_entry *start, *end, *i, *next, *prev = NULL;
struct jset *jset = w->data;
unsigned sectors, bytes, u64s;
bool validate_before_checksum = false;
unsigned long btree_roots_have = 0;
int ret;
/*
* Simple compaction, dropping empty jset_entries (from journal
@ -1697,8 +1713,20 @@ static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset
if (!u64s)
continue;
if (i->type == BCH_JSET_ENTRY_btree_root)
/*
* New btree roots are set by journalling them; when the journal
* entry gets written we have to propagate them to
* c->btree_roots
*
* But, every journal entry we write has to contain all the
* btree roots (at least for now); so after we copy btree roots
* to c->btree_roots we have to get any missing btree roots and
* add them to this journal entry:
*/
if (i->type == BCH_JSET_ENTRY_btree_root) {
bch2_journal_entry_to_btree_root(c, i);
__set_bit(i->btree_id, &btree_roots_have);
}
/* Can we merge with previous entry? */
if (prev &&
@ -1722,85 +1750,10 @@ static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset
prev = prev ? vstruct_next(prev) : jset->start;
jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
}
void bch2_journal_write(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
struct jset_entry *start, *end;
struct jset *jset;
struct bio *bio;
struct printbuf journal_debug_buf = PRINTBUF;
bool validate_before_checksum = false;
unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
int ret;
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
journal_buf_realloc(j, w);
jset = w->data;
j->write_start_time = local_clock();
spin_lock(&j->lock);
/*
* If the journal is in an error state - we did an emergency shutdown -
* we prefer to continue doing journal writes. We just mark them as
* noflush so they'll never be used, but they'll still be visible by the
* list_journal tool - this helps in debugging.
*
* There's a caveat: the first journal write after marking the
* superblock dirty must always be a flush write, because on startup
* from a clean shutdown we didn't necessarily read the journal and the
* new journal write might overwrite whatever was in the journal
* previously - we can't leave the journal without any flush writes in
* it.
*
* So if we're in an error state, and we're still starting up, we don't
* write anything at all.
*/
if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) &&
(bch2_journal_error(j) ||
w->noflush ||
(!w->must_flush &&
(jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) {
w->noflush = true;
SET_JSET_NO_FLUSH(jset, true);
jset->last_seq = 0;
w->last_seq = 0;
j->nr_noflush_writes++;
} else if (!bch2_journal_error(j)) {
j->last_flush_write = jiffies;
j->nr_flush_writes++;
clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
} else {
spin_unlock(&j->lock);
goto err;
}
spin_unlock(&j->lock);
/*
* New btree roots are set by journalling them; when the journal entry
* gets written we have to propagate them to c->btree_roots
*
* But, every journal entry we write has to contain all the btree roots
* (at least for now); so after we copy btree roots to c->btree_roots we
* have to get any missing btree roots and add them to this journal
* entry:
*/
bch2_journal_entries_postprocess(c, jset);
start = end = vstruct_last(jset);
end = bch2_btree_roots_to_journal_entries(c, jset->start, end);
end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
bch2_journal_super_entries_add_common(c, &end,
le64_to_cpu(jset->seq));
@ -1816,7 +1769,7 @@ void bch2_journal_write(struct closure *cl)
bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
vstruct_bytes(jset), w->sectors << 9,
u64s, w->u64s_reserved, j->entry_u64s_reserved);
goto err;
return -EINVAL;
}
jset->magic = cpu_to_le64(jset_magic(c));
@ -1835,37 +1788,119 @@ void bch2_journal_write(struct closure *cl)
validate_before_checksum = true;
if (validate_before_checksum &&
jset_validate(c, NULL, jset, 0, WRITE))
goto err;
(ret = jset_validate(c, NULL, jset, 0, WRITE)))
return ret;
ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
if (bch2_fs_fatal_err_on(ret, c,
"error decrypting journal entry: %i", ret))
goto err;
return ret;
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
journal_nonce(jset), jset);
if (!validate_before_checksum &&
jset_validate(c, NULL, jset, 0, WRITE))
goto err;
(ret = jset_validate(c, NULL, jset, 0, WRITE)))
return ret;
memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
return 0;
}
retry_alloc:
spin_lock(&j->lock);
ret = journal_write_alloc(j, w);
static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
int error = bch2_journal_error(j);
if (ret && j->can_discard) {
spin_unlock(&j->lock);
bch2_journal_do_discards(j);
goto retry_alloc;
/*
* If the journal is in an error state - we did an emergency shutdown -
* we prefer to continue doing journal writes. We just mark them as
* noflush so they'll never be used, but they'll still be visible by the
* list_journal tool - this helps in debugging.
*
* There's a caveat: the first journal write after marking the
* superblock dirty must always be a flush write, because on startup
* from a clean shutdown we didn't necessarily read the journal and the
* new journal write might overwrite whatever was in the journal
* previously - we can't leave the journal without any flush writes in
* it.
*
* So if we're in an error state, and we're still starting up, we don't
* write anything at all.
*/
if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags))
return -EIO;
if (error ||
w->noflush ||
(!w->must_flush &&
(jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
w->noflush = true;
SET_JSET_NO_FLUSH(w->data, true);
w->data->last_seq = 0;
w->last_seq = 0;
j->nr_noflush_writes++;
} else {
j->last_flush_write = jiffies;
j->nr_flush_writes++;
clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
}
return 0;
}
void bch2_journal_write(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
struct bio *bio;
struct printbuf journal_debug_buf = PRINTBUF;
unsigned i, nr_rw_members = 0;
int ret;
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
j->write_start_time = local_clock();
spin_lock(&j->lock);
ret = bch2_journal_write_pick_flush(j, w);
spin_unlock(&j->lock);
if (ret)
goto err;
journal_buf_realloc(j, w);
ret = bch2_journal_write_prep(j, w);
if (ret)
goto err;
j->entry_bytes_written += vstruct_bytes(w->data);
while (1) {
spin_lock(&j->lock);
ret = journal_write_alloc(j, w);
if (!ret || !j->can_discard)
break;
spin_unlock(&j->lock);
bch2_journal_do_discards(j);
}
if (ret) {
__bch2_journal_debug_to_text(&journal_debug_buf, j);
spin_unlock(&j->lock);
bch_err(c, "Unable to allocate journal write:\n%s",
journal_debug_buf.buf);
printbuf_exit(&journal_debug_buf);
goto err;
}
/*
* write is allocated, no longer need to account for it in
@ -1880,13 +1915,6 @@ retry_alloc:
bch2_journal_space_available(j);
spin_unlock(&j->lock);
if (ret) {
bch_err(c, "Unable to allocate journal write:\n%s",
journal_debug_buf.buf);
printbuf_exit(&journal_debug_buf);
goto err;
}
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
if (c->opts.nochanges)
@ -1908,7 +1936,7 @@ retry_alloc:
if (ret)
goto err;
if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
for_each_rw_member(ca, c, i) {
percpu_ref_get(&ca->io_ref);

View File

@ -50,16 +50,25 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
return available;
}
static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
static inline void journal_set_watermark(struct journal *j)
{
union journal_preres_state old, new;
u64 v = atomic64_read(&j->prereserved.counter);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bool low_on_space = j->space[journal_space_clean].total * 4 <=
j->space[journal_space_total].total;
bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
unsigned watermark = low_on_space || low_on_pin
? BCH_WATERMARK_reclaim
: BCH_WATERMARK_stripe;
do {
old.v = new.v = v;
new.remaining = u64s_remaining;
} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
old.v, new.v)) != old.v);
if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
&j->low_on_space_start, low_on_space) ||
track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
&j->low_on_pin_start, low_on_pin))
trace_and_count(c, journal_full, c);
swap(watermark, j->watermark);
if (watermark > j->watermark)
journal_wake(j);
}
static struct journal_space
@ -162,7 +171,6 @@ void bch2_journal_space_available(struct journal *j)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
unsigned clean, clean_ondisk, total;
s64 u64s_remaining = 0;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9);
unsigned i, nr_online = 0, nr_devs_want;
@ -222,16 +230,10 @@ void bch2_journal_space_available(struct journal *j)
else
clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
u64s_remaining = (u64) clean << 6;
u64s_remaining -= (u64) total << 3;
u64s_remaining = max(0LL, u64s_remaining);
u64s_remaining /= 4;
u64s_remaining = min_t(u64, u64s_remaining, U32_MAX);
journal_set_watermark(j);
out:
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_error = ret;
journal_set_remaining(j, u64s_remaining);
journal_set_watermark(j);
if (!ret)
journal_wake(j);
@ -369,15 +371,36 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
return JOURNAL_PIN_other;
}
void bch2_journal_pin_set(struct journal *j, u64 seq,
static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn,
enum journal_pin_type type)
{
struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
/*
* flush_fn is how we identify journal pins in debugfs, so must always
* exist, even if it doesn't do anything:
*/
BUG_ON(!flush_fn);
atomic_inc(&pin_list->count);
pin->seq = seq;
pin->flush = flush_fn;
list_add(&pin->list, &pin_list->list[type]);
}
void bch2_journal_pin_copy(struct journal *j,
struct journal_entry_pin *dst,
struct journal_entry_pin *src,
journal_pin_flush_fn flush_fn)
{
struct journal_entry_pin_list *pin_list;
bool reclaim;
spin_lock(&j->lock);
u64 seq = READ_ONCE(src->seq);
if (seq < journal_last_seq(j)) {
/*
* bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
@ -389,18 +412,34 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
return;
}
pin_list = journal_seq_pin(j, seq);
reclaim = __journal_pin_drop(j, dst);
bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
spin_unlock(&j->lock);
/*
* If the journal is currently full, we might want to call flush_fn
* immediately:
*/
journal_wake(j);
}
void bch2_journal_pin_set(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
bool reclaim;
spin_lock(&j->lock);
BUG_ON(seq < journal_last_seq(j));
reclaim = __journal_pin_drop(j, pin);
atomic_inc(&pin_list->count);
pin->seq = seq;
pin->flush = flush_fn;
if (flush_fn)
list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]);
else
list_add(&pin->list, &pin_list->flushed);
bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
@ -555,11 +594,6 @@ static u64 journal_seq_to_flush(struct journal *j)
/* Try to keep the journal at most half full: */
nr_buckets = ja->nr / 2;
/* And include pre-reservations: */
nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
(ca->mi.bucket_size << 6) -
journal_entry_overhead(j));
nr_buckets = min(nr_buckets, ja->nr);
bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
@ -638,10 +672,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
msecs_to_jiffies(c->opts.journal_reclaim_delay)))
min_nr = 1;
if (j->prereserved.reserved * 4 > j->prereserved.remaining)
min_nr = 1;
if (fifo_free(&j->pin) <= 32)
if (j->watermark != BCH_WATERMARK_stripe)
min_nr = 1;
if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
@ -652,8 +683,6 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
trace_and_count(c, journal_reclaim_start, c,
direct, kicked,
min_nr, min_key_cache,
j->prereserved.reserved,
j->prereserved.remaining,
atomic_read(&c->btree_cache.dirty),
c->btree_cache.used,
atomic_long_read(&c->btree_key_cache.nr_dirty),
@ -805,6 +834,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
/* time_stats this */
bool did_work = false;
if (!test_bit(JOURNAL_STARTED, &j->flags))

View File

@ -47,17 +47,10 @@ static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
bch2_journal_pin_set(j, seq, pin, flush_fn);
}
static inline void bch2_journal_pin_copy(struct journal *j,
struct journal_entry_pin *dst,
struct journal_entry_pin *src,
journal_pin_flush_fn flush_fn)
{
/* Guard against racing with journal_pin_drop(src): */
u64 seq = READ_ONCE(src->seq);
if (seq)
bch2_journal_pin_add(j, seq, dst, flush_fn);
}
void bch2_journal_pin_copy(struct journal *,
struct journal_entry_pin *,
struct journal_entry_pin *,
journal_pin_flush_fn);
static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
struct journal_entry_pin *pin,

View File

@ -76,14 +76,6 @@ struct journal_res {
u64 seq;
};
/*
* For reserving space in the journal prior to getting a reservation on a
* particular journal entry:
*/
struct journal_preres {
unsigned u64s;
};
union journal_res_state {
struct {
atomic64_t counter;
@ -104,22 +96,6 @@ union journal_res_state {
};
};
union journal_preres_state {
struct {
atomic64_t counter;
};
struct {
u64 v;
};
struct {
u64 waiting:1,
reserved:31,
remaining:32;
};
};
/* bytes: */
#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */
@ -180,8 +156,6 @@ struct journal {
union journal_res_state reservations;
enum bch_watermark watermark;
union journal_preres_state prereserved;
} __aligned(SMP_CACHE_BYTES);
unsigned long flags;
@ -288,15 +262,18 @@ struct journal {
unsigned long last_flush_write;
u64 res_get_blocked_start;
u64 write_start_time;
u64 nr_flush_writes;
u64 nr_noflush_writes;
u64 entry_bytes_written;
u64 low_on_space_start;
u64 low_on_pin_start;
u64 max_in_flight_start;
struct bch2_time_stats *flush_write_time;
struct bch2_time_stats *noflush_write_time;
struct bch2_time_stats *blocked_time;
struct bch2_time_stats *flush_seq_time;
#ifdef CONFIG_DEBUG_LOCK_ALLOC

View File

@ -85,13 +85,13 @@ static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
{
return commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_logged_op_start(trans, k));
}
void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
{
int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
/*
* This needs to be a fatal error because we've left an unfinished

View File

@ -155,7 +155,7 @@ int bch2_check_lrus(struct bch_fs *c)
ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
bch2_check_lru_key(trans, &iter, k, &last_flushed_pos)));
if (ret)
bch_err_fn(c, ret);

View File

@ -90,7 +90,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
NULL, NULL, BTREE_INSERT_NOFAIL,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
if (ret)
break;

View File

@ -263,7 +263,7 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans,
return bch2_trans_relock(trans) ?:
bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
int bch2_move_extent(struct moving_context *ctxt,

View File

@ -370,6 +370,7 @@ static int bch2_copygc_thread(void *arg)
if (min_member_capacity == U64_MAX)
min_member_capacity = 128 * 2048;
bch2_trans_unlock_long(ctxt.trans);
bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
MAX_SCHEDULE_TIMEOUT);
}

View File

@ -69,7 +69,7 @@ err:
int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
__bch2_set_rebalance_needs_scan(trans, inum));
rebalance_wakeup(c);
return ret;
@ -125,7 +125,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
extent_entry_drop(bkey_i_to_s(n),
(void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
@ -273,7 +273,7 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
r->state = BCH_REBALANCE_scanning;
ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_clear_rebalance_needs_scan(trans, inum, cookie));
bch2_move_stats_exit(&r->scan_stats, trans->c);

View File

@ -98,6 +98,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
unsigned update_flags = BTREE_TRIGGER_NORUN;
int ret;
if (k->overwritten)
return 0;
trans->journal_res.seq = k->journal_seq;
/*
* BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to
* keep the key cache coherent with the underlying btree. Nothing
@ -139,27 +144,14 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
static int bch2_journal_replay(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
struct journal_key **keys_sorted, *k;
DARRAY(struct journal_key *) keys_sorted = { 0 };
struct journal_key **kp;
struct journal *j = &c->journal;
u64 start_seq = c->journal_replay_seq_start;
u64 end_seq = c->journal_replay_seq_start;
size_t i;
struct btree_trans *trans = bch2_trans_get(c);
int ret;
move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
keys->gap = keys->nr;
keys_sorted = kvmalloc_array(keys->nr, sizeof(*keys_sorted), GFP_KERNEL);
if (!keys_sorted)
return -BCH_ERR_ENOMEM_journal_replay;
for (i = 0; i < keys->nr; i++)
keys_sorted[i] = &keys->d[i];
sort(keys_sorted, keys->nr,
sizeof(keys_sorted[0]),
journal_sort_seq_cmp, NULL);
if (keys->nr) {
ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
keys->nr, start_seq, end_seq);
@ -167,27 +159,61 @@ static int bch2_journal_replay(struct bch_fs *c)
goto err;
}
for (i = 0; i < keys->nr; i++) {
k = keys_sorted[i];
/*
* First, attempt to replay keys in sorted order. This is more
* efficient, but some might fail if that would cause a journal
* deadlock.
*/
for (size_t i = 0; i < keys->nr; i++) {
cond_resched();
replay_now_at(j, k->journal_seq);
struct journal_key *k = keys->d + i;
ret = bch2_trans_do(c, NULL, NULL,
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_NOFAIL|
(!k->allocated
? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim
: 0),
ret = commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_journal_reclaim|
(!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
bch2_journal_replay_key(trans, k));
BUG_ON(!ret && !k->overwritten);
if (ret) {
bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s",
bch2_btree_id_str(k->btree_id), k->level, bch2_err_str(ret));
ret = darray_push(&keys_sorted, k);
if (ret)
goto err;
}
}
/*
* Now, replay any remaining keys in the order in which they appear in
* the journal, unpinning those journal entries as we go:
*/
sort(keys_sorted.data, keys_sorted.nr,
sizeof(keys_sorted.data[0]),
journal_sort_seq_cmp, NULL);
darray_for_each(keys_sorted, kp) {
cond_resched();
struct journal_key *k = *kp;
replay_now_at(j, k->journal_seq);
ret = commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
(!k->allocated
? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
: 0),
bch2_journal_replay_key(trans, k));
bch_err_msg(c, ret, "while replaying key at btree %s level %u:",
bch2_btree_id_str(k->btree_id), k->level);
if (ret)
goto err;
BUG_ON(!k->overwritten);
}
bch2_trans_put(trans);
trans = NULL;
replay_now_at(j, j->replay_journal_seq_end);
j->replay_journal_seq = 0;
@ -198,9 +224,9 @@ static int bch2_journal_replay(struct bch_fs *c)
if (keys->nr && !ret)
bch2_journal_log_msg(c, "journal replay finished");
err:
kvfree(keys_sorted);
if (ret)
if (trans)
bch2_trans_put(trans);
darray_exit(&keys_sorted);
bch_err_fn(c, ret);
return ret;
}
@ -468,7 +494,7 @@ err:
noinline_for_stack
static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
{
int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
__bch2_fs_upgrade_for_subvolumes(trans));
if (ret)
bch_err_fn(c, ret);
@ -489,7 +515,19 @@ static int bch2_check_allocations(struct bch_fs *c)
static int bch2_set_may_go_rw(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
/*
* After we go RW, the journal keys buffer can't be modified (except for
* setting journal_key->overwritten: it will be accessed by multiple
* threads
*/
move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
keys->gap = keys->nr;
set_bit(BCH_FS_MAY_GO_RW, &c->flags);
if (keys->nr)
return bch2_fs_read_write_early(c);
return 0;
}

View File

@ -390,7 +390,7 @@ s64 bch2_remap_range(struct bch_fs *c,
inode_u.bi_size = new_i_size;
ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
BCH_TRANS_COMMIT_no_enospc);
}
bch2_trans_iter_exit(trans, &inode_iter);

View File

@ -376,7 +376,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
entry = sb_clean->start;
bch2_journal_super_entries_add_common(c, &entry, 0);
entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
entry = bch2_btree_roots_to_journal_entries(c, entry, 0);
BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
memset(entry, 0,

View File

@ -70,7 +70,7 @@ static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb,
prt_tab(out);
prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i]));
prt_tab(out);
bch2_prt_date_seconds(out, le64_to_cpu(e->entries[i].last_error_time));
bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time));
prt_newline(out);
}
}

View File

@ -230,7 +230,7 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "Last mount:");
prt_tab(out);
if (m.last_mount)
bch2_prt_date_seconds(out, le64_to_cpu(m.last_mount));
bch2_prt_datetime(out, le64_to_cpu(m.last_mount));
else
prt_printf(out, "(never)");
prt_newline(out);

View File

@ -590,7 +590,7 @@ int bch2_check_snapshot_trees(struct bch_fs *c)
for_each_btree_key_commit(trans, iter,
BTREE_ID_snapshot_trees, POS_MIN,
BTREE_ITER_PREFETCH, k,
NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_snapshot_tree(trans, &iter, k)));
if (ret)
@ -868,7 +868,7 @@ int bch2_check_snapshots(struct bch_fs *c)
for_each_btree_key_reverse_commit(trans, iter,
BTREE_ID_snapshots, POS_MAX,
BTREE_ITER_PREFETCH, k,
NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_snapshot(trans, &iter, k)));
if (ret)
bch_err_fn(c, ret);
@ -959,7 +959,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
parent_id, id))
goto err;
parent->v.children[i] = le32_to_cpu(child_id);
parent->v.children[i] = cpu_to_le32(child_id);
normalize_snapshot_child_pointers(&parent->v);
}
@ -1449,12 +1449,12 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
ret = for_each_btree_key_commit(trans, iter,
id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
&res, NULL, BTREE_INSERT_NOFAIL,
&res, NULL, BCH_TRANS_COMMIT_no_enospc,
snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
for_each_btree_key_commit(trans, iter,
id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
&res, NULL, BTREE_INSERT_NOFAIL,
&res, NULL, BCH_TRANS_COMMIT_no_enospc,
move_key_to_correct_snapshot(trans, &iter, k));
bch2_disk_reservation_put(c, &res);
@ -1489,7 +1489,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
*/
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
BTREE_ITER_INTENT, k,
NULL, NULL, BTREE_INSERT_NOFAIL,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
if (ret)
goto err_create_lock;

View File

@ -15,6 +15,16 @@
#include <crypto/hash.h>
#include <crypto/sha2.h>
typedef unsigned __bitwise bch_str_hash_flags_t;
enum bch_str_hash_flags {
__BCH_HASH_SET_MUST_CREATE,
__BCH_HASH_SET_MUST_REPLACE,
};
#define BCH_HASH_SET_MUST_CREATE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_CREATE)
#define BCH_HASH_SET_MUST_REPLACE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_REPLACE)
static inline enum bch_str_hash_type
bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
{
@ -246,7 +256,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans,
const struct bch_hash_info *info,
subvol_inum inum, u32 snapshot,
struct bkey_i *insert,
int flags,
bch_str_hash_flags_t str_hash_flags,
int update_flags)
{
struct btree_iter iter, slot = { NULL };
@ -269,7 +279,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans,
}
if (!slot.path &&
!(flags & BCH_HASH_SET_MUST_REPLACE))
!(str_hash_flags & BCH_HASH_SET_MUST_REPLACE))
bch2_trans_copy_iter(&slot, &iter);
if (k.k->type != KEY_TYPE_hash_whiteout)
@ -287,16 +297,16 @@ found:
found = true;
not_found:
if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) {
if (!found && (str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) {
ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
} else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) {
} else if (found && (str_hash_flags & BCH_HASH_SET_MUST_CREATE)) {
ret = -EEXIST;
} else {
if (!found && slot.path)
swap(iter, slot);
insert->k.p = iter.pos;
ret = bch2_trans_update(trans, &iter, insert, 0);
ret = bch2_trans_update(trans, &iter, insert, update_flags);
}
goto out;
@ -307,7 +317,8 @@ int bch2_hash_set(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum,
struct bkey_i *insert, int flags)
struct bkey_i *insert,
bch_str_hash_flags_t str_hash_flags)
{
u32 snapshot;
int ret;
@ -319,7 +330,7 @@ int bch2_hash_set(struct btree_trans *trans,
insert->k.p.inode = inum.inum;
return bch2_hash_set_snapshot(trans, desc, info, inum,
snapshot, insert, flags, 0);
snapshot, insert, str_hash_flags, 0);
}
static __always_inline

View File

@ -89,7 +89,7 @@ int bch2_check_subvols(struct bch_fs *c)
ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_subvol(trans, &iter, k)));
if (ret)
bch_err_fn(c, ret);
@ -219,7 +219,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
BTREE_ITER_CACHED, &s)) ?:
for_each_btree_key_commit(trans, iter,
BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
NULL, NULL, BTREE_INSERT_NOFAIL,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_subvolume_reparent(trans, &iter, k,
subvolid_to_delete, le32_to_cpu(s.parent)));
}
@ -256,7 +256,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
{
return bch2_subvolumes_reparent(trans, subvolid) ?:
commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_subvolume_delete(trans, subvolid));
}

View File

@ -20,7 +20,7 @@ struct snapshot_t {
};
struct snapshot_table {
struct snapshot_t s[0];
DECLARE_FLEX_ARRAY(struct snapshot_t, s);
};
typedef struct {

View File

@ -1183,7 +1183,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
prt_printf(out, "Created:");
prt_tab(out);
if (sb->time_base_lo)
bch2_prt_date_seconds(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
else
prt_printf(out, "(not set)");
prt_newline(out);

View File

@ -641,7 +641,9 @@ static int bch2_fs_online(struct bch_fs *c)
ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
kobject_add(&c->internal, &c->kobj, "internal") ?:
kobject_add(&c->opts_dir, &c->kobj, "options") ?:
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
#endif
kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
bch2_opts_create_sysfs_files(&c->opts_dir);
if (ret) {
@ -750,7 +752,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];
c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal];
c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
bch2_fs_btree_cache_init_early(&c->btree_cache);

View File

@ -196,10 +196,9 @@ DEFINE_EVENT(bio, journal_write,
TRACE_EVENT(journal_reclaim_start,
TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
u64 min_nr, u64 min_key_cache,
u64 prereserved, u64 prereserved_total,
u64 btree_cache_dirty, u64 btree_cache_total,
u64 btree_key_cache_dirty, u64 btree_key_cache_total),
TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total,
TP_ARGS(c, direct, kicked, min_nr, min_key_cache,
btree_cache_dirty, btree_cache_total,
btree_key_cache_dirty, btree_key_cache_total),
@ -209,8 +208,6 @@ TRACE_EVENT(journal_reclaim_start,
__field(bool, kicked )
__field(u64, min_nr )
__field(u64, min_key_cache )
__field(u64, prereserved )
__field(u64, prereserved_total )
__field(u64, btree_cache_dirty )
__field(u64, btree_cache_total )
__field(u64, btree_key_cache_dirty )
@ -223,22 +220,18 @@ TRACE_EVENT(journal_reclaim_start,
__entry->kicked = kicked;
__entry->min_nr = min_nr;
__entry->min_key_cache = min_key_cache;
__entry->prereserved = prereserved;
__entry->prereserved_total = prereserved_total;
__entry->btree_cache_dirty = btree_cache_dirty;
__entry->btree_cache_total = btree_cache_total;
__entry->btree_key_cache_dirty = btree_key_cache_dirty;
__entry->btree_key_cache_total = btree_key_cache_total;
),
TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->direct,
__entry->kicked,
__entry->min_nr,
__entry->min_key_cache,
__entry->prereserved,
__entry->prereserved_total,
__entry->btree_cache_dirty,
__entry->btree_cache_total,
__entry->btree_key_cache_dirty,

View File

@ -315,6 +315,57 @@ int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task)
return ret;
}
#ifndef __KERNEL__
#include <time.h>
void bch2_prt_datetime(struct printbuf *out, time64_t sec)
{
time_t t = sec;
char buf[64];
ctime_r(&t, buf);
prt_str(out, buf);
}
#else
void bch2_prt_datetime(struct printbuf *out, time64_t sec)
{
char buf[64];
snprintf(buf, sizeof(buf), "%ptT", &sec);
prt_u64(out, sec);
}
#endif
static const struct time_unit {
const char *name;
u64 nsecs;
} time_units[] = {
{ "ns", 1 },
{ "us", NSEC_PER_USEC },
{ "ms", NSEC_PER_MSEC },
{ "s", NSEC_PER_SEC },
{ "m", (u64) NSEC_PER_SEC * 60},
{ "h", (u64) NSEC_PER_SEC * 3600},
{ "eon", U64_MAX },
};
static const struct time_unit *pick_time_units(u64 ns)
{
const struct time_unit *u;
for (u = time_units;
u + 1 < time_units + ARRAY_SIZE(time_units) &&
ns >= u[1].nsecs << 1;
u++)
;
return u;
}
void bch2_pr_time_units(struct printbuf *out, u64 ns)
{
const struct time_unit *u = pick_time_units(ns);
prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
}
/* time stats: */
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
@ -359,6 +410,7 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
stats->max_duration = max(stats->max_duration, duration);
stats->min_duration = min(stats->min_duration, duration);
stats->total_duration += duration;
bch2_quantiles_update(&stats->quantiles, duration);
}
@ -372,22 +424,26 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
}
}
static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
struct bch2_time_stat_buffer *b)
{
struct bch2_time_stat_buffer_entry *i;
unsigned long flags;
spin_lock_irqsave(&stats->lock, flags);
for (i = b->entries;
for (struct bch2_time_stat_buffer_entry *i = b->entries;
i < b->entries + ARRAY_SIZE(b->entries);
i++)
bch2_time_stats_update_one(stats, i->start, i->end);
spin_unlock_irqrestore(&stats->lock, flags);
b->nr = 0;
}
static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
struct bch2_time_stat_buffer *b)
{
unsigned long flags;
spin_lock_irqsave(&stats->lock, flags);
__bch2_time_stats_clear_buffer(stats, b);
spin_unlock_irqrestore(&stats->lock, flags);
}
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
{
unsigned long flags;
@ -423,40 +479,6 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
preempt_enable();
}
}
#endif
static const struct time_unit {
const char *name;
u64 nsecs;
} time_units[] = {
{ "ns", 1 },
{ "us", NSEC_PER_USEC },
{ "ms", NSEC_PER_MSEC },
{ "s", NSEC_PER_SEC },
{ "m", (u64) NSEC_PER_SEC * 60},
{ "h", (u64) NSEC_PER_SEC * 3600},
{ "eon", U64_MAX },
};
static const struct time_unit *pick_time_units(u64 ns)
{
const struct time_unit *u;
for (u = time_units;
u + 1 < time_units + ARRAY_SIZE(time_units) &&
ns >= u[1].nsecs << 1;
u++)
;
return u;
}
void bch2_pr_time_units(struct printbuf *out, u64 ns)
{
const struct time_unit *u = pick_time_units(ns);
prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
}
static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
{
@ -467,26 +489,6 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
prt_printf(out, "%s", u->name);
}
#ifndef __KERNEL__
#include <time.h>
void bch2_prt_date_seconds(struct printbuf *out, time64_t sec)
{
time_t t = sec;
char buf[64];
ctime_r(&t, buf);
prt_str(out, buf);
}
#else
void bch2_prt_date_seconds(struct printbuf *out, time64_t sec)
{
char buf[64];
snprintf(buf, sizeof(buf), "%ptT", &sec);
prt_u64(out, sec);
}
#endif
#define TABSTOP_SIZE 12
static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
{
prt_str(out, name);
@ -495,12 +497,24 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
prt_newline(out);
}
#define TABSTOP_SIZE 12
void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
{
const struct time_unit *u;
s64 f_mean = 0, d_mean = 0;
u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
int i;
if (stats->buffer) {
int cpu;
spin_lock_irq(&stats->lock);
for_each_possible_cpu(cpu)
__bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
spin_unlock_irq(&stats->lock);
}
/*
* avoid divide by zero
*/
@ -546,6 +560,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
pr_name_and_units(out, "min:", stats->min_duration);
pr_name_and_units(out, "max:", stats->max_duration);
pr_name_and_units(out, "total:", stats->total_duration);
prt_printf(out, "mean:");
prt_tab(out);
@ -603,6 +618,9 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
last_q = q;
}
}
#else
void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {}
#endif
void bch2_time_stats_exit(struct bch2_time_stats *stats)
{

View File

@ -244,7 +244,7 @@ do { \
#define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__)
void bch2_pr_time_units(struct printbuf *, u64);
void bch2_prt_date_seconds(struct printbuf *, time64_t);
void bch2_prt_datetime(struct printbuf *, time64_t);
#ifdef __KERNEL__
static inline void uuid_unparse_lower(u8 *uuid, char *out)
@ -372,8 +372,9 @@ struct bch2_time_stat_buffer {
struct bch2_time_stats {
spinlock_t lock;
/* all fields are in nanoseconds */
u64 max_duration;
u64 min_duration;
u64 max_duration;
u64 total_duration;
u64 max_freq;
u64 min_freq;
u64 last_event;
@ -388,15 +389,39 @@ struct bch2_time_stats {
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
#else
static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
#endif
static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
{
__bch2_time_stats_update(stats, start, local_clock());
}
static inline bool track_event_change(struct bch2_time_stats *stats,
u64 *start, bool v)
{
if (v != !!*start) {
if (!v) {
bch2_time_stats_update(stats, *start);
*start = 0;
} else {
*start = local_clock() ?: 1;
return true;
}
}
return false;
}
#else
static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {}
static inline bool track_event_change(struct bch2_time_stats *stats,
u64 *start, bool v)
{
bool ret = v && !*start;
*start = v;
return ret;
}
#endif
void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
void bch2_time_stats_exit(struct bch2_time_stats *);