Update bcachefs sources to 3ca08ab51ec9 bcachefs: six locks: Simplify optimistic spinning

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet 2023-11-12 20:53:57 -05:00
parent a613340b26
commit 7fd6c3ffe4
65 changed files with 1040 additions and 979 deletions

@@ -1 +1 @@
-d464ec667b2b9de097e39d1505b45aafd87a9552
+3ca08ab51ec996180c20105489176b8c4327240c

@@ -278,4 +278,7 @@ static inline void dump_stack(void) {}
 #define unsafe_memcpy(dst, src, bytes, justification)	\
 	memcpy(dst, src, bytes)
 
+#define DECLARE_FLEX_ARRAY(TYPE, NAME) \
+	__DECLARE_FLEX_ARRAY(TYPE, NAME)
+
 #endif
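The shim above maps the kernel's DECLARE_FLEX_ARRAY() onto __DECLARE_FLEX_ARRAY() so the userspace build compiles the same declarations. A minimal sketch of the kind of declaration this enables (the struct is illustrative, not from the tree):

	struct msg {
		u32 type;
		union {
			/* a flexible array member inside a union needs the wrapper: */
			DECLARE_FLEX_ARRAY(u8, bytes);
			DECLARE_FLEX_ARRAY(u32, words);
		};
	};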

@@ -98,4 +98,15 @@ static inline void hlist_del_init(struct hlist_node *n)
 	     pos;						\
 	     pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
 
+static inline size_t list_count_nodes(struct list_head *head)
+{
+	struct list_head *pos;
+	size_t count = 0;
+
+	list_for_each(pos, head)
+		count++;
+
+	return count;
+}
+
 #endif /* _LIST_LIST_H */
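list_count_nodes() is pulled in because the key cache teardown later in this commit asserts that the new freelist counters match the real list lengths. A quick sketch of its behaviour (illustrative):

	struct list_head head;

	INIT_LIST_HEAD(&head);
	size_t n = list_count_nodes(&head);	/* n == 0 for an empty list */
	/* each list_add() bumps the result by one; the walk is O(n) and the
	 * caller supplies whatever locking the list needs */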

@@ -561,8 +561,8 @@ int bch2_bucket_gens_init(struct bch_fs *c)
 		if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
 			ret = commit_do(trans, NULL, NULL,
-					BTREE_INSERT_NOFAIL|
-					BTREE_INSERT_LAZY_RW,
+					BCH_TRANS_COMMIT_no_enospc|
+					BCH_TRANS_COMMIT_lazy_rw,
 				bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
 			if (ret)
 				break;
@@ -581,8 +581,8 @@ int bch2_bucket_gens_init(struct bch_fs *c)
 	if (have_bucket_gens_key && !ret)
 		ret = commit_do(trans, NULL, NULL,
-				BTREE_INSERT_NOFAIL|
-				BTREE_INSERT_LAZY_RW,
+				BCH_TRANS_COMMIT_no_enospc|
+				BCH_TRANS_COMMIT_lazy_rw,
 			bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
 
 	bch2_trans_put(trans);
@@ -1267,7 +1267,7 @@ delete:
 		ret =   bch2_btree_delete_extent_at(trans, iter,
 				iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
 			bch2_trans_commit(trans, NULL, NULL,
-				BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW);
+				BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw);
 		goto out;
 	}
@@ -1422,8 +1422,8 @@ int bch2_check_alloc_info(struct bch_fs *c)
 		}
 
 		ret = bch2_trans_commit(trans, NULL, NULL,
-					BTREE_INSERT_NOFAIL|
-					BTREE_INSERT_LAZY_RW);
+					BCH_TRANS_COMMIT_no_enospc|
+					BCH_TRANS_COMMIT_lazy_rw);
 		if (ret)
 			goto bkey_err;
@@ -1453,7 +1453,7 @@ bkey_err:
 	for_each_btree_key_commit(trans, iter,
 			BTREE_ID_bucket_gens, POS_MIN,
 			BTREE_ITER_PREFETCH, k,
-			NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+			NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
 		bch2_check_bucket_gens_key(trans, &iter, k));
 err:
 	bch2_trans_put(trans);
@@ -1546,7 +1546,7 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
 	ret = bch2_trans_run(c,
 		for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
 				POS_MIN, BTREE_ITER_PREFETCH, k,
-				NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+				NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
 			bch2_check_alloc_to_lru_ref(trans, &iter)));
 	if (ret)
 		bch_err_fn(c, ret);
@@ -1655,7 +1655,7 @@ write:
 	ret =   bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
 		bch2_trans_commit(trans, NULL, NULL,
 				  BCH_WATERMARK_btree|
-				  BTREE_INSERT_NOFAIL);
+				  BCH_TRANS_COMMIT_no_enospc);
 	if (ret)
 		goto out;
@@ -1760,7 +1760,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
 				BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
 		bch2_trans_commit(trans, NULL, NULL,
 				  BCH_WATERMARK_btree|
-				  BTREE_INSERT_NOFAIL);
+				  BCH_TRANS_COMMIT_no_enospc);
 	if (ret)
 		goto out;
@@ -1884,8 +1884,8 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
 			ret =   bch2_bucket_do_index(trans, k, a, true) ?:
 				bch2_trans_commit(trans, NULL, NULL,
-						  BTREE_INSERT_LAZY_RW|
-						  BTREE_INSERT_NOFAIL);
+						  BCH_TRANS_COMMIT_lazy_rw|
+						  BCH_TRANS_COMMIT_no_enospc);
 			if (ret)
 				goto bkey_err;
@@ -1905,8 +1905,8 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
 			ret =   bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
 				bch2_trans_commit(trans, NULL, NULL,
-						  BTREE_INSERT_LAZY_RW|
-						  BTREE_INSERT_NOFAIL);
+						  BCH_TRANS_COMMIT_lazy_rw|
+						  BCH_TRANS_COMMIT_no_enospc);
 			if (ret)
 				goto bkey_err;

@@ -5,6 +5,7 @@
 #include "backpointers.h"
 #include "btree_cache.h"
 #include "btree_update.h"
+#include "btree_update_interior.h"
 #include "btree_write_buffer.h"
 #include "error.h"
@@ -220,18 +221,22 @@ out:
 static void backpointer_not_found(struct btree_trans *trans,
 				  struct bpos bp_pos,
 				  struct bch_backpointer bp,
-				  struct bkey_s_c k,
-				  const char *thing_it_points_to)
+				  struct bkey_s_c k)
 {
 	struct bch_fs *c = trans->c;
 	struct printbuf buf = PRINTBUF;
 	struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
 
+	/*
+	 * If we're using the btree write buffer, the backpointer we were
+	 * looking at may have already been deleted - failure to find what it
+	 * pointed to is not an error:
+	 */
 	if (likely(!bch2_backpointers_no_use_write_buffer))
 		return;
 
 	prt_printf(&buf, "backpointer doesn't match %s it points to:\n  ",
-		   thing_it_points_to);
+		   bp.level ? "btree node" : "extent");
 	prt_printf(&buf, "bucket: ");
 	bch2_bpos_to_text(&buf, bucket);
 	prt_printf(&buf, "\n  ");
@@ -257,56 +262,37 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
 					 struct bch_backpointer bp,
 					 unsigned iter_flags)
 {
-	struct bch_fs *c = trans->c;
-	struct btree_root *r = bch2_btree_id_root(c, bp.btree_id);
-	struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
-	struct bkey_s_c k;
-
-	bch2_trans_node_iter_init(trans, iter,
-				  bp.btree_id,
-				  bp.pos,
-				  0,
-				  min(bp.level, r->level),
-				  iter_flags);
-	k = bch2_btree_iter_peek_slot(iter);
-	if (bkey_err(k)) {
-		bch2_trans_iter_exit(trans, iter);
-		return k;
-	}
-
-	if (bp.level == r->level + 1)
-		k = bkey_i_to_s_c(&r->key);
-
-	if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
-		return k;
-
-	bch2_trans_iter_exit(trans, iter);
-
-	if (unlikely(bch2_backpointers_no_use_write_buffer)) {
-		if (bp.level) {
-			struct btree *b;
-
-			/*
-			 * If a backpointer for a btree node wasn't found, it may be
-			 * because it was overwritten by a new btree node that hasn't
-			 * been written out yet - backpointer_get_node() checks for
-			 * this:
-			 */
-			b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
-			if (!IS_ERR_OR_NULL(b))
-				return bkey_i_to_s_c(&b->key);
-
-			bch2_trans_iter_exit(trans, iter);
-
-			if (IS_ERR(b))
-				return bkey_s_c_err(PTR_ERR(b));
-			return bkey_s_c_null;
-		}
-
-		backpointer_not_found(trans, bp_pos, bp, k, "extent");
-	}
-
-	return bkey_s_c_null;
+	if (likely(!bp.level)) {
+		struct bch_fs *c = trans->c;
+		struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+		struct bkey_s_c k;
+
+		bch2_trans_node_iter_init(trans, iter,
+					  bp.btree_id,
+					  bp.pos,
+					  0, 0,
+					  iter_flags);
+		k = bch2_btree_iter_peek_slot(iter);
+		if (bkey_err(k)) {
+			bch2_trans_iter_exit(trans, iter);
+			return k;
+		}
+
+		if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
+			return k;
+
+		bch2_trans_iter_exit(trans, iter);
+		backpointer_not_found(trans, bp_pos, bp, k);
+		return bkey_s_c_null;
+	} else {
+		struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
+
+		if (IS_ERR_OR_NULL(b)) {
+			bch2_trans_iter_exit(trans, iter);
+			return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null;
+		}
+		return bkey_i_to_s_c(&b->key);
+	}
 }
 
 struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
@@ -327,19 +313,20 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
 				  bp.level - 1,
 				  0);
 	b = bch2_btree_iter_peek_node(iter);
-	if (IS_ERR(b))
+	if (IS_ERR_OR_NULL(b))
 		goto err;
 
-	if (b && extent_matches_bp(c, bp.btree_id, bp.level,
-				   bkey_i_to_s_c(&b->key),
-				   bucket, bp))
+	BUG_ON(b->c.level != bp.level - 1);
+
+	if (extent_matches_bp(c, bp.btree_id, bp.level,
+			      bkey_i_to_s_c(&b->key),
+			      bucket, bp))
 		return b;
 
-	if (b && btree_node_will_make_reachable(b)) {
+	if (btree_node_will_make_reachable(b)) {
 		b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
 	} else {
-		backpointer_not_found(trans, bp_pos, bp,
-				      bkey_i_to_s_c(&b->key), "btree node");
+		backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key));
 		b = NULL;
 	}
 err:
@@ -395,7 +382,7 @@ int bch2_check_btree_backpointers(struct bch_fs *c)
 	ret = bch2_trans_run(c,
 		for_each_btree_key_commit(trans, iter,
 			BTREE_ID_backpointers, POS_MIN, 0, k,
-			NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+			NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
 		  bch2_check_btree_backpointer(trans, &iter, k)));
 	if (ret)
 		bch_err_fn(c, ret);
@@ -642,8 +629,8 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
 	do {
 		ret = commit_do(trans, NULL, NULL,
-				BTREE_INSERT_LAZY_RW|
-				BTREE_INSERT_NOFAIL,
+				BCH_TRANS_COMMIT_lazy_rw|
+				BCH_TRANS_COMMIT_no_enospc,
 				check_extent_to_backpointers(trans, &iter,
 							bucket_start, bucket_end,
 							&last_flushed));
@@ -657,8 +644,8 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
 			break;
 
 		ret = commit_do(trans, NULL, NULL,
-				BTREE_INSERT_LAZY_RW|
-				BTREE_INSERT_NOFAIL,
+				BCH_TRANS_COMMIT_lazy_rw|
+				BCH_TRANS_COMMIT_no_enospc,
 				check_btree_root_to_backpointers(trans, btree_id,
 							bucket_start, bucket_end,
 							&last_flushed));
@@ -797,7 +784,8 @@ static int check_one_backpointer(struct btree_trans *trans,
 	if (fsck_err_on(!k.k, c,
 			backpointer_to_missing_ptr,
-			"backpointer for missing extent\n  %s",
+			"backpointer for missing %s\n  %s",
+			bp.v->level ? "btree node" : "extent",
 			(bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
 		ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p);
 		goto out;
@@ -819,7 +807,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
 	return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
 				  POS_MIN, BTREE_ITER_PREFETCH, k,
-				  NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+				  NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
 		check_one_backpointer(trans, start, end,
 				      bkey_s_c_to_backpointer(k),
 				      &last_flushed_pos));
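The refactored bch2_backpointer_get_key() dispatches on bp.level: level-0 backpointers point at extents and are looked up with a btree iterator, nonzero levels point at btree nodes and go through bch2_backpointer_get_node(). A hedged sketch of the resulting caller contract (the shape only, not code from the tree):

	struct btree_iter iter;
	struct bkey_s_c k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
	int ret = bkey_err(k);

	if (ret)
		return ret;	/* on error the iterator has already been exited */
	if (!k.k)
		return 0;	/* target already deleted (write buffer race) */

	/* use k, then drop the iterator that was handed back live: */
	bch2_trans_iter_exit(trans, &iter);
	return 0;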

@@ -401,7 +401,9 @@ BCH_DEBUG_PARAMS_DEBUG()
 	x(journal_flush_write)			\
 	x(journal_noflush_write)		\
 	x(journal_flush_seq)			\
-	x(blocked_journal)			\
+	x(blocked_journal_low_on_space)		\
+	x(blocked_journal_low_on_pin)		\
+	x(blocked_journal_max_in_flight)	\
 	x(blocked_allocate)			\
 	x(blocked_allocate_open_bucket)		\
 	x(nocow_lock_contended)
@@ -617,7 +619,7 @@ struct journal_seq_blacklist_table {
 		u64		start;
 		u64		end;
 		bool		dirty;
-	}			entries[0];
+	}			entries[];
 };
 
 struct journal_keys {
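entries[0] was the old GNU zero-length-array idiom; entries[] is the standard C99 flexible array member that fortified string ops and bounds checking understand. Allocating such a struct typically sizes the header plus the array in one go, sketched here assuming the usual kernel struct_size() helper and an entry count nr (illustrative, not code from this commit):

	struct journal_seq_blacklist_table *t;

	t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL);
	if (!t)
		return -ENOMEM;
	/* t->entries[0..nr-1] are now valid to index */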

@@ -2256,7 +2256,8 @@ LE32_BITMASK(JSET_NO_FLUSH,	struct jset, flags, 5, 6);
 enum btree_id_flags {
 	BTREE_ID_EXTENTS	= BIT(0),
 	BTREE_ID_SNAPSHOTS	= BIT(1),
-	BTREE_ID_DATA		= BIT(2),
+	BTREE_ID_SNAPSHOT_FIELD	= BIT(2),
+	BTREE_ID_DATA		= BIT(3),
 };
 
 #define BCH_BTREE_IDS()						\
@@ -2311,12 +2312,12 @@ enum btree_id_flags {
 	  BIT_ULL(KEY_TYPE_bucket_gens))			\
 	x(snapshot_trees,	15,	0,			\
 	  BIT_ULL(KEY_TYPE_snapshot_tree))			\
-	x(deleted_inodes,	16,	BTREE_ID_SNAPSHOTS,	\
+	x(deleted_inodes,	16,	BTREE_ID_SNAPSHOT_FIELD,\
 	  BIT_ULL(KEY_TYPE_set))				\
 	x(logged_ops,		17,	0,			\
 	  BIT_ULL(KEY_TYPE_logged_op_truncate)|			\
 	  BIT_ULL(KEY_TYPE_logged_op_finsert))			\
-	x(rebalance_work,	18,	BTREE_ID_SNAPSHOTS,	\
+	x(rebalance_work,	18,	BTREE_ID_SNAPSHOT_FIELD,\
 	  BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))
 
 enum btree_id {

@@ -186,15 +186,20 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
 	if (type != BKEY_TYPE_btree) {
 		enum btree_id btree = type - 1;
 
-		bkey_fsck_err_on(!btree_type_has_snapshots(btree) &&
-				 k.k->p.snapshot, c, err,
-				 bkey_snapshot_nonzero,
-				 "nonzero snapshot");
-
-		bkey_fsck_err_on(btree_type_has_snapshots(btree) &&
-				 !k.k->p.snapshot, c, err,
-				 bkey_snapshot_zero,
-				 "snapshot == 0");
+		if (btree_type_has_snapshots(btree)) {
+			bkey_fsck_err_on(!k.k->p.snapshot, c, err,
+					 bkey_snapshot_zero,
+					 "snapshot == 0");
+		} else if (!btree_type_has_snapshot_field(btree)) {
+			bkey_fsck_err_on(k.k->p.snapshot, c, err,
+					 bkey_snapshot_nonzero,
+					 "nonzero snapshot");
+		} else {
+			/*
+			 * btree uses snapshot field but it's not required to be
+			 * nonzero
+			 */
+		}
 
 		bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err,
 				 bkey_at_pos_max,

@@ -93,7 +93,6 @@ static inline int bch2_mark_key(struct btree_trans *trans,
 enum btree_update_flags {
 	__BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END,
 	__BTREE_UPDATE_NOJOURNAL,
-	__BTREE_UPDATE_PREJOURNAL,
 	__BTREE_UPDATE_KEY_CACHE_RECLAIM,
 
 	__BTREE_TRIGGER_NORUN,		/* Don't run triggers at all */
@@ -108,7 +107,6 @@ enum btree_update_flags {
 #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
 #define BTREE_UPDATE_NOJOURNAL		(1U << __BTREE_UPDATE_NOJOURNAL)
-#define BTREE_UPDATE_PREJOURNAL		(1U << __BTREE_UPDATE_PREJOURNAL)
 #define BTREE_UPDATE_KEY_CACHE_RECLAIM	(1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
 
 #define BTREE_TRIGGER_NORUN		(1U << __BTREE_TRIGGER_NORUN)

@@ -1502,7 +1502,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
 		ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
 				POS(ca->dev_idx, ca->mi.first_bucket),
 				BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
-				NULL, NULL, BTREE_INSERT_LAZY_RW,
+				NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
 			bch2_alloc_write_key(trans, &iter, k, metadata_only));
 
 		if (ret < 0) {
@@ -1659,7 +1659,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
 	ret = for_each_btree_key_commit(trans, iter,
 			BTREE_ID_reflink, POS_MIN,
 			BTREE_ITER_PREFETCH, k,
-			NULL, NULL, BTREE_INSERT_NOFAIL,
+			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
 		bch2_gc_write_reflink_key(trans, &iter, k, &idx));
 
 	c->reflink_gc_nr = 0;
@@ -1783,7 +1783,7 @@ static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
 	ret = for_each_btree_key_commit(trans, iter,
 			BTREE_ID_stripes, POS_MIN,
 			BTREE_ITER_PREFETCH, k,
-			NULL, NULL, BTREE_INSERT_NOFAIL,
+			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
 		bch2_gc_write_stripes_key(trans, &iter, k));
 
 	bch2_trans_put(trans);
@@ -2019,7 +2019,7 @@ int bch2_gc_gens(struct bch_fs *c)
 				BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
 				k,
 				NULL, NULL,
-				BTREE_INSERT_NOFAIL,
+				BCH_TRANS_COMMIT_no_enospc,
 			gc_btree_gens_key(trans, &iter, k));
 		if (ret && !bch2_err_matches(ret, EROFS))
 			bch_err_fn(c, ret);
@@ -2032,7 +2032,7 @@ int bch2_gc_gens(struct bch_fs *c)
 			BTREE_ITER_PREFETCH,
 			k,
 			NULL, NULL,
-			BTREE_INSERT_NOFAIL,
+			BCH_TRANS_COMMIT_no_enospc,
 		bch2_alloc_write_oldest_gen(trans, &iter, k));
 	if (ret && !bch2_err_matches(ret, EROFS))
 		bch_err_fn(c, ret);

@@ -1801,9 +1801,9 @@ static void btree_node_write_work(struct work_struct *work)
 		ret = bch2_trans_do(c, NULL, NULL, 0,
 			bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
 					BCH_WATERMARK_reclaim|
-					BTREE_INSERT_JOURNAL_RECLAIM|
-					BTREE_INSERT_NOFAIL|
-					BTREE_INSERT_NOCHECK_RW,
+					BCH_TRANS_COMMIT_journal_reclaim|
+					BCH_TRANS_COMMIT_no_enospc|
+					BCH_TRANS_COMMIT_no_check_rw,
 					!wbio->wbio.failed.nr));
 		if (ret)
 			goto err;

@@ -257,7 +257,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
 
 	BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
 	       (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
-	       !btree_type_has_snapshots(iter->btree_id));
+	       !btree_type_has_snapshot_field(iter->btree_id));
 
 	if (iter->update_path)
 		bch2_btree_path_verify(trans, iter->update_path);
@@ -1214,8 +1214,6 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
 		   struct btree_path *path, struct bpos new_pos,
 		   bool intent, unsigned long ip, int cmp)
 {
-	unsigned level = path->level;
-
 	bch2_trans_verify_not_in_restart(trans);
 	EBUG_ON(!path->ref);
@@ -1231,7 +1229,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
 		goto out;
 	}
 
-	level = btree_path_up_until_good_node(trans, path, cmp);
+	unsigned level = btree_path_up_until_good_node(trans, path, cmp);
 
 	if (btree_path_node(path, level)) {
 		struct btree_path_level *l = &path->l[level];
@@ -2835,8 +2833,9 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
 
 static inline void check_srcu_held_too_long(struct btree_trans *trans)
 {
-	WARN(time_after(jiffies, trans->srcu_lock_time + HZ * 10),
-	     "btree trans held srcu lock (delaying memory reclaim) by more than 10 seconds");
+	WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
+	     "btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
+	     (jiffies - trans->srcu_lock_time) / HZ);
 }
 
 void bch2_trans_srcu_unlock(struct btree_trans *trans)
@@ -3088,8 +3087,6 @@ void bch2_trans_put(struct btree_trans *trans)
 		srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
 	}
 
-	bch2_journal_preres_put(&c->journal, &trans->journal_preres);
-
 	kfree(trans->extra_journal_entries.data);
 
 	if (trans->fs_usage_deltas) {

@@ -416,7 +416,7 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
 		flags |= BTREE_ITER_IS_EXTENTS;
 
 	if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
-	    !btree_type_has_snapshots(btree_id))
+	    !btree_type_has_snapshot_field(btree_id))
 		flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
 
 	if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&

@@ -90,10 +90,13 @@ static void bkey_cached_free(struct btree_key_cache *bc,
 	ck->btree_trans_barrier_seq =
 		start_poll_synchronize_srcu(&c->btree_trans_barrier);
 
-	if (ck->c.lock.readers)
+	if (ck->c.lock.readers) {
 		list_move_tail(&ck->list, &bc->freed_pcpu);
-	else
+		bc->nr_freed_pcpu++;
+	} else {
 		list_move_tail(&ck->list, &bc->freed_nonpcpu);
+		bc->nr_freed_nonpcpu++;
+	}
 	atomic_long_inc(&bc->nr_freed);
 
 	kfree(ck->k);
@@ -110,6 +113,8 @@ static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
 {
 	struct bkey_cached *pos;
 
+	bc->nr_freed_nonpcpu++;
+
 	list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
 		if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
 				 pos->btree_trans_barrier_seq)) {
@@ -159,6 +164,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
 #else
 		mutex_lock(&bc->lock);
 		list_move_tail(&ck->list, &bc->freed_nonpcpu);
+		bc->nr_freed_nonpcpu++;
 		mutex_unlock(&bc->lock);
 #endif
 	} else {
@@ -218,6 +224,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
 		    f->nr < ARRAY_SIZE(f->objs) / 2) {
 			ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
 			list_del_init(&ck->list);
+			bc->nr_freed_nonpcpu--;
 			f->objs[f->nr++] = ck;
 		}
@@ -230,6 +237,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
 		if (!list_empty(&bc->freed_nonpcpu)) {
 			ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
 			list_del_init(&ck->list);
+			bc->nr_freed_nonpcpu--;
 		}
 		mutex_unlock(&bc->lock);
 #endif
@@ -649,8 +657,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
 				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
 				  BTREE_TRIGGER_NORUN) ?:
 		bch2_trans_commit(trans, NULL, NULL,
-				  BTREE_INSERT_NOCHECK_RW|
-				  BTREE_INSERT_NOFAIL|
+				  BCH_TRANS_COMMIT_no_check_rw|
+				  BCH_TRANS_COMMIT_no_enospc|
 				  (ck->journal.seq == journal_last_seq(j)
 				   ? BCH_WATERMARK_reclaim
 				   : 0)|
@@ -665,7 +673,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
 		goto out;
 
 	bch2_journal_pin_drop(j, &ck->journal);
-	bch2_journal_preres_put(j, &ck->res);
 
 	BUG_ON(!btree_node_locked(c_iter.path, 0));
@@ -728,7 +735,7 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
 		ret = commit_do(trans, NULL, NULL, 0,
 			btree_key_cache_flush_pos(trans, key, seq,
-				BTREE_INSERT_JOURNAL_RECLAIM, false));
+				BCH_TRANS_COMMIT_journal_reclaim, false));
 unlock:
 	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
@@ -763,18 +770,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
 
 	BUG_ON(insert->k.u64s > ck->u64s);
 
-	if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
-		int difference;
-
-		BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s);
-
-		difference = jset_u64s(insert->k.u64s) - ck->res.u64s;
-		if (difference > 0) {
-			trans->journal_preres.u64s	-= difference;
-			ck->res.u64s			+= difference;
-		}
-	}
-
 	bkey_copy(ck->k, insert);
 	ck->valid = true;
@@ -852,6 +847,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
 	 * Newest freed entries are at the end of the list - once we hit one
 	 * that's too new to be freed, we can bail out:
 	 */
+	scanned += bc->nr_freed_nonpcpu;
+
 	list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
 		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
 						 ck->btree_trans_barrier_seq))
@@ -861,13 +858,15 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
 		six_lock_exit(&ck->c.lock);
 		kmem_cache_free(bch2_key_cache, ck);
 		atomic_long_dec(&bc->nr_freed);
-		scanned++;
 		freed++;
+		bc->nr_freed_nonpcpu--;
 	}
 
 	if (scanned >= nr)
 		goto out;
 
+	scanned += bc->nr_freed_pcpu;
+
 	list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
 		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
 						 ck->btree_trans_barrier_seq))
@@ -877,8 +876,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
 		six_lock_exit(&ck->c.lock);
 		kmem_cache_free(bch2_key_cache, ck);
 		atomic_long_dec(&bc->nr_freed);
-		scanned++;
 		freed++;
+		bc->nr_freed_pcpu--;
 	}
 
 	if (scanned >= nr)
@@ -985,6 +984,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
 	}
 #endif
 
+	BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
+	BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
+
 	list_splice(&bc->freed_pcpu,	&items);
 	list_splice(&bc->freed_nonpcpu,	&items);
@@ -994,7 +996,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
 		cond_resched();
 
 		bch2_journal_pin_drop(&c->journal, &ck->journal);
-		bch2_journal_preres_put(&c->journal, &ck->res);
 
 		list_del(&ck->list);
 		kfree(ck->k);

@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
+#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
+
+struct btree_key_cache_freelist {
+	struct bkey_cached	*objs[16];
+	unsigned		nr;
+};
+
+struct btree_key_cache {
+	struct mutex		lock;
+	struct rhashtable	table;
+	bool			table_init_done;
+
+	struct list_head	freed_pcpu;
+	size_t			nr_freed_pcpu;
+	struct list_head	freed_nonpcpu;
+	size_t			nr_freed_nonpcpu;
+
+	struct shrinker		shrink;
+	unsigned		shrink_iter;
+	struct btree_key_cache_freelist __percpu *pcpu_freed;
+
+	atomic_long_t		nr_freed;
+	atomic_long_t		nr_keys;
+	atomic_long_t		nr_dirty;
+};
+
+struct bkey_cached_key {
+	u32			btree_id;
+	struct bpos		pos;
+} __packed __aligned(4);
+
+#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */

@@ -78,6 +78,53 @@ inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
 		bch2_btree_init_next(trans, b);
 }
 
+static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
+{
+	while (--i >= trans->updates) {
+		if (same_leaf_as_prev(trans, i))
+			continue;
+
+		bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
+	}
+
+	trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
+	return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
+}
+
+static inline int bch2_trans_lock_write(struct btree_trans *trans)
+{
+	struct btree_insert_entry *i;
+
+	EBUG_ON(trans->write_locked);
+
+	trans_for_each_update(trans, i) {
+		if (same_leaf_as_prev(trans, i))
+			continue;
+
+		if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
+			return trans_lock_write_fail(trans, i);
+
+		if (!i->cached)
+			bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
+	}
+
+	trans->write_locked = true;
+	return 0;
+}
+
+static inline void bch2_trans_unlock_write(struct btree_trans *trans)
+{
+	if (likely(trans->write_locked)) {
+		struct btree_insert_entry *i;
+
+		trans_for_each_update(trans, i)
+			if (!same_leaf_as_prev(trans, i))
+				bch2_btree_node_unlock_write_inlined(trans, i->path,
+								     insert_l(i)->b);
+		trans->write_locked = false;
+	}
+}
+
 /* Inserting into a given leaf node (last stage of insert): */
 
 /* Handle overwrites and do insert, for non extents: */
@@ -269,23 +316,13 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
 	BUG_ON(i->level		!= i->path->level);
 	BUG_ON(i->btree_id	!= i->path->btree_id);
 	EBUG_ON(!i->level &&
+		btree_type_has_snapshots(i->btree_id) &&
 		!(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
 		test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
 		i->k->k.p.snapshot &&
 		bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
 }
 
-static noinline int
-bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags,
-				   unsigned long trace_ip)
-{
-	return drop_locks_do(trans,
-		bch2_journal_preres_get(&trans->c->journal,
-			&trans->journal_preres,
-			trans->journal_preres_u64s,
-			(flags & BCH_WATERMARK_MASK)));
-}
-
 static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
 						      unsigned flags)
 {
@@ -320,6 +357,45 @@ static inline int btree_key_can_insert(struct btree_trans *trans,
 	return 0;
 }
 
+noinline static int
+btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
+				     struct btree_path *path, unsigned new_u64s)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_insert_entry *i;
+	struct bkey_cached *ck = (void *) path->l[0].b;
+	struct bkey_i *new_k;
+	int ret;
+
+	bch2_trans_unlock_write(trans);
+	bch2_trans_unlock(trans);
+
+	new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
+	if (!new_k) {
+		bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
+			bch2_btree_id_str(path->btree_id), new_u64s);
+		return -BCH_ERR_ENOMEM_btree_key_cache_insert;
+	}
+
+	ret =   bch2_trans_relock(trans) ?:
+		bch2_trans_lock_write(trans);
+	if (unlikely(ret)) {
+		kfree(new_k);
+		return ret;
+	}
+
+	memcpy(new_k, ck->k, ck->u64s * sizeof(u64));
+
+	trans_for_each_update(trans, i)
+		if (i->old_v == &ck->k->v)
+			i->old_v = &new_k->v;
+
+	kfree(ck->k);
+	ck->u64s	= new_u64s;
+	ck->k		= new_k;
+	return 0;
+}
+
 static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
 				       struct btree_path *path, unsigned u64s)
 {
@@ -333,7 +409,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
 
 	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
 	    bch2_btree_key_cache_must_wait(c) &&
-	    !(flags & BTREE_INSERT_JOURNAL_RECLAIM))
+	    !(flags & BCH_TRANS_COMMIT_journal_reclaim))
 		return -BCH_ERR_btree_insert_need_journal_reclaim;
 
 	/*
@@ -346,12 +422,9 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
 		return 0;
 
 	new_u64s	= roundup_pow_of_two(u64s);
-	new_k		= krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS);
-	if (!new_k) {
-		bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
-			bch2_btree_id_str(path->btree_id), new_u64s);
-		return -BCH_ERR_ENOMEM_btree_key_cache_insert;
-	}
+	new_k		= krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
+	if (unlikely(!new_k))
+		return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
 
 	trans_for_each_update(trans, i)
 		if (i->old_v == &ck->k->v)
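btree_key_can_insert_cached() now attempts the reallocation with GFP_NOWAIT while btree node write locks are held; only the slowpath added above drops every lock before retrying with GFP_KERNEL. The pattern reduced to its essentials, using the helpers this commit introduces (a sketch, not the verbatim code):

	new_k = krealloc(ck->k, new_bytes, GFP_NOWAIT);	/* must not recurse into reclaim */
	if (!new_k) {
		bch2_trans_unlock_write(trans);		/* drop leaf write locks */
		bch2_trans_unlock(trans);		/* drop all btree locks */

		new_k = kmalloc(new_bytes, GFP_KERNEL);	/* blocking allocation is now safe */

		ret = bch2_trans_relock(trans) ?:	/* retake locks - may restart */
		      bch2_trans_lock_write(trans);
	}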
@@ -583,6 +656,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
 			*stopped_at = i;
 			return ret;
 		}
+
+		i->k->k.needs_whiteout = false;
 	}
 
 	if (trans->nr_wb_updates &&
@@ -593,7 +668,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
 	 * Don't get journal reservation until after we know insert will
 	 * succeed:
 	 */
-	if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+	if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
 		ret = bch2_trans_journal_res_get(trans,
 				(flags & BCH_WATERMARK_MASK)|
 				JOURNAL_RES_GET_NONBLOCK);
@@ -602,8 +677,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
 
 		if (unlikely(trans->journal_transaction_names))
 			journal_transaction_name(trans);
-	} else {
-		trans->journal_res.seq = c->journal.replay_journal_seq;
 	}
 
 	/*
@@ -612,7 +685,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
 	 */
 
 	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
-	    !(flags & BTREE_INSERT_JOURNAL_REPLAY)) {
+	    !(flags & BCH_TRANS_COMMIT_no_journal_res)) {
 		if (bch2_journal_seq_verify)
 			trans_for_each_update(trans, i)
 				i->k->k.version.lo = trans->journal_res.seq;
@@ -626,7 +699,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
 		return -BCH_ERR_btree_insert_need_mark_replicas;
 
 	if (trans->nr_wb_updates) {
-		EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY);
+		EBUG_ON(flags & BCH_TRANS_COMMIT_no_journal_res);
 
 		ret = bch2_btree_insert_keys_write_buffer(trans);
 		if (ret)
@@ -663,7 +736,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
 		trans->journal_res.u64s		-= trans->extra_journal_entries.nr;
 	}
 
-	if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+	if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
 		struct journal *j = &c->journal;
 		struct jset_entry *entry;
@@ -705,15 +778,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
 	}
 
 	trans_for_each_update(trans, i) {
-		i->k->k.needs_whiteout = false;
-
 		if (!i->cached) {
-			u64 seq = trans->journal_res.seq;
-
-			if (i->flags & BTREE_UPDATE_PREJOURNAL)
-				seq = i->seq;
-
-			bch2_btree_insert_key_leaf(trans, i->path, i->k, seq);
+			bch2_btree_insert_key_leaf(trans, i->path, i->k, trans->journal_res.seq);
 		} else if (!i->key_cache_already_flushed)
 			bch2_btree_insert_key_cached(trans, flags, i);
 		else {
@@ -731,37 +797,6 @@ revert_fs_usage:
 	return ret;
 }
 
-static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
-{
-	while (--i >= trans->updates) {
-		if (same_leaf_as_prev(trans, i))
-			continue;
-
-		bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
-	}
-
-	trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
-	return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
-}
-
-static inline int trans_lock_write(struct btree_trans *trans)
-{
-	struct btree_insert_entry *i;
-
-	trans_for_each_update(trans, i) {
-		if (same_leaf_as_prev(trans, i))
-			continue;
-
-		if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
-			return trans_lock_write_fail(trans, i);
-
-		if (!i->cached)
-			bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
-	}
-
-	return 0;
-}
-
 static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
 {
 	struct btree_insert_entry *i;
@@ -799,6 +834,12 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
 	return -EINVAL;
 }
 
+static int bch2_trans_commit_journal_pin_flush(struct journal *j,
+				struct journal_entry_pin *_pin, u64 seq)
+{
+	return 0;
+}
+
 /*
  * Get journal reservation, take write locks, and attempt to do btree update(s):
  */
@@ -829,15 +870,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
 		}
 	}
 
-	ret = bch2_journal_preres_get(&c->journal,
-			&trans->journal_preres, trans->journal_preres_u64s,
-			(flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK);
-	if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked))
-		ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip);
-	if (unlikely(ret))
-		return ret;
-
-	ret = trans_lock_write(trans);
+	ret = bch2_trans_lock_write(trans);
 	if (unlikely(ret))
 		return ret;
@@ -846,20 +879,19 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
 	if (!ret && unlikely(trans->journal_replay_not_finished))
 		bch2_drop_overwrites_from_journal(trans);
 
-	trans_for_each_update(trans, i)
-		if (!same_leaf_as_prev(trans, i))
-			bch2_btree_node_unlock_write_inlined(trans, i->path,
-							insert_l(i)->b);
+	bch2_trans_unlock_write(trans);
 
 	if (!ret && trans->journal_pin)
 		bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
-				     trans->journal_pin, NULL);
+				     trans->journal_pin,
+				     bch2_trans_commit_journal_pin_flush);
 
 	/*
 	 * Drop journal reservation after dropping write locks, since dropping
 	 * the journal reservation may kick off a journal write:
 	 */
-	bch2_journal_res_put(&c->journal, &trans->journal_res);
+	if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
+		bch2_journal_res_put(&c->journal, &trans->journal_res);
 
 	return ret;
 }
@@ -896,7 +928,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
 		 * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
 		 * flag
 		 */
-		if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+		if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
 		    (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
 			ret = -BCH_ERR_journal_reclaim_would_deadlock;
 			break;
@@ -931,7 +963,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
 		if (wb->state.nr > wb->size * 3 / 4) {
 			bch2_trans_begin(trans);
 			ret = __bch2_btree_write_buffer_flush(trans,
-					flags|BTREE_INSERT_NOCHECK_RW, true);
+					flags|BCH_TRANS_COMMIT_no_check_rw, true);
 			if (!ret) {
 				trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
 				ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
@@ -951,8 +983,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
 	BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
 
 	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
-				!(flags & BTREE_INSERT_NOWAIT) &&
-				(flags & BTREE_INSERT_NOFAIL), c,
+				(flags & BCH_TRANS_COMMIT_no_enospc), c,
 		"%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
 
 	return ret;
@@ -964,7 +995,7 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
 	struct bch_fs *c = trans->c;
 	int ret;
 
-	if (likely(!(flags & BTREE_INSERT_LAZY_RW)) ||
+	if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) ||
 	    test_bit(BCH_FS_STARTED, &c->flags))
 		return -BCH_ERR_erofs_trans_commit;
@@ -1002,7 +1033,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 	struct bch_fs *c = trans->c;
 	struct btree_insert_entry *i = NULL;
 	struct btree_write_buffered_key *wb;
-	unsigned u64s;
 	int ret = 0;
 
 	if (!trans->nr_updates &&
@@ -1010,9 +1040,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 	    !trans->extra_journal_entries.nr)
 		goto out_reset;
 
-	if (flags & BTREE_INSERT_GC_LOCK_HELD)
-		lockdep_assert_held(&c->gc_lock);
-
 	ret = bch2_trans_commit_run_triggers(trans);
 	if (ret)
 		goto out_reset;
@@ -1021,7 +1048,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 		struct printbuf buf = PRINTBUF;
 		enum bkey_invalid_flags invalid_flags = 0;
 
-		if (!(flags & BTREE_INSERT_JOURNAL_REPLAY))
+		if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
 			invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
 
 		if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
@@ -1039,7 +1066,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 		goto out_reset;
 	}
 
-	if (!(flags & BTREE_INSERT_NOCHECK_RW) &&
+	if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
 	    unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
 		ret = bch2_trans_commit_get_rw_cold(trans, flags);
 		if (ret)
@@ -1052,7 +1079,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 		bch2_trans_unlock(trans);
 
 		ret = __bch2_btree_write_buffer_flush(trans,
-				flags|BTREE_INSERT_NOCHECK_RW, true);
+				flags|BCH_TRANS_COMMIT_no_check_rw, true);
 		if (!ret) {
 			trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
 			ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
@@ -1062,13 +1089,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 
 	EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
 
-	memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
-
 	trans->journal_u64s		= trans->extra_journal_entries.nr;
-	trans->journal_preres_u64s	= 0;
-
 	trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
-
 	if (trans->journal_transaction_names)
 		trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
@@ -1084,16 +1106,11 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 		if (i->key_cache_already_flushed)
 			continue;
 
-		/* we're going to journal the key being updated: */
-		u64s = jset_u64s(i->k->k.u64s);
-		if (i->cached &&
-		    likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY)))
-			trans->journal_preres_u64s += u64s;
-
 		if (i->flags & BTREE_UPDATE_NOJOURNAL)
 			continue;
 
-		trans->journal_u64s += u64s;
+		/* we're going to journal the key being updated: */
+		trans->journal_u64s += jset_u64s(i->k->k.u64s);
 
 		/* and we're also going to log the overwrite: */
 		if (trans->journal_transaction_names)
@@ -1106,14 +1123,15 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 	if (trans->extra_journal_res) {
 		ret = bch2_disk_reservation_add(c, trans->disk_res,
 				trans->extra_journal_res,
-				(flags & BTREE_INSERT_NOFAIL)
+				(flags & BCH_TRANS_COMMIT_no_enospc)
 				? BCH_DISK_RESERVATION_NOFAIL : 0);
 		if (ret)
 			goto err;
 	}
 retry:
 	bch2_trans_verify_not_in_restart(trans);
-	memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+	if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
+		memset(&trans->journal_res, 0, sizeof(trans->journal_res));
 
 	ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_);
@@ -1125,9 +1143,7 @@ retry:
 		trace_and_count(c, transaction_commit, trans, _RET_IP_);
 out:
-	bch2_journal_preres_put(&c->journal, &trans->journal_preres);
-
-	if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
+	if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw)))
 		bch2_write_ref_put(c, BCH_WRITE_REF_trans);
 out_reset:
 	if (!ret)
@@ -1140,5 +1156,17 @@ err:
 	if (ret)
 		goto out;
 
+	/*
+	 * We might have done another transaction commit in the error path -
+	 * i.e. btree write buffer flush - which will have made use of
+	 * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is
+	 * how the journal sequence number to pin is passed in - so we must
+	 * restart:
+	 */
+	if (flags & BCH_TRANS_COMMIT_no_journal_res) {
+		ret = -BCH_ERR_transaction_restart_nested;
+		goto out;
+	}
+
 	goto retry;
 }
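With BTREE_INSERT_JOURNAL_REPLAY gone, a replay-style caller no longer gets an implicit sequence number; BCH_TRANS_COMMIT_no_journal_res means the commit takes no journal reservation and instead pins the sequence number the caller stored in trans->journal_res.seq. A hedged sketch of that calling convention (journal_seq is an assumed local, not code from the tree):

	/* sketch: committing a key while replaying journal entry journal_seq */
	trans->journal_res.seq = journal_seq;
	ret = bch2_trans_commit(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc|
				BCH_TRANS_COMMIT_no_journal_res);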

@@ -5,7 +5,7 @@
 #include <linux/list.h>
 #include <linux/rhashtable.h>
 
-//#include "bkey_methods.h"
+#include "btree_key_cache_types.h"
 #include "buckets_types.h"
 #include "darray.h"
 #include "errcode.h"
@@ -322,31 +322,6 @@ struct btree_iter {
 #endif
 };
 
-struct btree_key_cache_freelist {
-	struct bkey_cached	*objs[16];
-	unsigned		nr;
-};
-
-struct btree_key_cache {
-	struct mutex		lock;
-	struct rhashtable	table;
-	bool			table_init_done;
-
-	struct list_head	freed_pcpu;
-	struct list_head	freed_nonpcpu;
-
-	struct shrinker		shrink;
-	unsigned		shrink_iter;
-	struct btree_key_cache_freelist __percpu *pcpu_freed;
-
-	atomic_long_t		nr_freed;
-	atomic_long_t		nr_keys;
-	atomic_long_t		nr_dirty;
-};
-
-struct bkey_cached_key {
-	u32			btree_id;
-	struct bpos		pos;
-} __packed __aligned(4);
-
 #define BKEY_CACHED_ACCESSED		0
 #define BKEY_CACHED_DIRTY		1
@@ -362,7 +337,6 @@ struct bkey_cached {
 	struct rhash_head	hash;
 	struct list_head	list;
 
-	struct journal_preres	res;
 	struct journal_entry_pin journal;
 	u64			seq;
@@ -392,7 +366,6 @@ struct btree_insert_entry {
 	u8			old_btree_u64s;
 	struct bkey_i		*k;
 	struct btree_path	*path;
-	u64			seq;
 	/* key being overwritten: */
 	struct bkey		old_k;
 	const struct bch_val	*old_v;
@@ -441,6 +414,7 @@ struct btree_trans {
 	bool			journal_replay_not_finished:1;
 	bool			is_initial_gc:1;
 	bool			notrace_relock_fail:1;
+	bool			write_locked:1;
 	enum bch_errcode	restarted:16;
 	u32			restart_count;
 	unsigned long		last_begin_ip;
@@ -472,11 +446,9 @@ struct btree_trans {
 	struct journal_entry_pin *journal_pin;
 
 	struct journal_res	journal_res;
-	struct journal_preres	journal_preres;
 	u64			*journal_seq;
 	struct disk_reservation *disk_res;
 	unsigned		journal_u64s;
-	unsigned		journal_preres_u64s;
 	struct replicas_delta_list *fs_usage_deltas;
 };
@@ -717,6 +689,17 @@ static inline bool btree_type_has_snapshots(enum btree_id id)
 	return (1U << id) & mask;
 }
 
+static inline bool btree_type_has_snapshot_field(enum btree_id id)
+{
+	const unsigned mask = 0
+#define x(name, nr, flags, ...)	|((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr)
+	BCH_BTREE_IDS()
+#undef x
+	;
+
+	return (1U << id) & mask;
+}
+
 static inline bool btree_type_has_ptrs(enum btree_id id)
 {
 	const unsigned mask = 0
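btree_type_has_snapshot_field() builds its bitmask the same way the neighbouring btree_type_has_snapshots() and btree_type_has_ptrs() do: x() is redefined so that expanding BCH_BTREE_IDS() ORs in one bit per btree whose flags match, and the whole expression constant-folds at compile time. Spelled out for two table entries (illustrative):

	const unsigned mask = 0
		/* x(snapshot_trees, 15, 0, ...): flags don't match, contributes nothing */
		| ((!!((0) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << 15)
		/* x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, ...): sets bit 16 */
		| ((!!((BTREE_ID_SNAPSHOT_FIELD) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << 16)
		;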

@@ -380,21 +380,12 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
 {
 	struct bch_fs *c = trans->c;
 	struct btree_insert_entry *i, n;
-	u64 seq = 0;
 	int cmp;
 
 	EBUG_ON(!path->should_be_locked);
 	EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
 	EBUG_ON(!bpos_eq(k->k.p, path->pos));
 
-	/*
-	 * The transaction journal res hasn't been allocated at this point.
-	 * That occurs at commit time. Reuse the seq field to pass in the seq
-	 * of a prejournaled key.
-	 */
-	if (flags & BTREE_UPDATE_PREJOURNAL)
-		seq = trans->journal_res.seq;
-
 	n = (struct btree_insert_entry) {
 		.flags		= flags,
 		.bkey_type	= __btree_node_type(path->level, path->btree_id),
@@ -403,7 +394,6 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
 		.cached		= path->cached,
 		.path		= path,
 		.k		= k,
-		.seq		= seq,
 		.ip_allocated	= ip,
 	};
@@ -431,7 +421,6 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
 		i->cached	= n.cached;
 		i->k		= n.k;
 		i->path		= n.path;
-		i->seq		= n.seq;
 		i->ip_allocated	= n.ip_allocated;
 	} else {
 		array_insert_item(trans->updates, trans->nr_updates,
@@ -542,18 +531,6 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
 	return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_);
 }
 
-/*
- * Add a transaction update for a key that has already been journaled.
- */
-int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq,
-				       struct btree_iter *iter, struct bkey_i *k,
-				       enum btree_update_flags flags)
-{
-	trans->journal_res.seq = seq;
-	return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL|
-						 BTREE_UPDATE_PREJOURNAL);
-}
-
 int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
 					    enum btree_id btree,
 					    struct bkey_i *k)
@@ -792,7 +769,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
 
 		ret   = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
 			bch2_trans_commit(trans, &disk_res, journal_seq,
-					  BTREE_INSERT_NOFAIL);
+					  BCH_TRANS_COMMIT_no_enospc);
 		bch2_disk_reservation_put(trans->c, &disk_res);
 err:
 		/*
@@ -897,7 +874,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
 		ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
 	} else {
 		ret = bch2_trans_do(c, NULL, NULL,
-				    BTREE_INSERT_LAZY_RW|commit_flags,
+				    BCH_TRANS_COMMIT_lazy_rw|commit_flags,
 			__bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args));
 	}

View File

@ -21,37 +21,28 @@ void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
struct bkey_i *, u64); struct bkey_i *, u64);
enum btree_insert_flags { #define BCH_TRANS_COMMIT_FLAGS() \
x(no_enospc, "don't check for enospc") \
x(no_check_rw, "don't attempt to take a ref on c->writes") \
x(lazy_rw, "go read-write if we haven't yet - only for use in recovery") \
x(no_journal_res, "don't take a journal reservation, instead " \
"pin journal entry referred to by trans->journal_res.seq") \
x(journal_reclaim, "operation required for journal reclaim; may return error" \
"instead of deadlocking if BCH_WATERMARK_reclaim not specified")\
enum __bch_trans_commit_flags {
/* First bits for bch_watermark: */ /* First bits for bch_watermark: */
__BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS, __BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS,
__BTREE_INSERT_NOCHECK_RW, #define x(n, ...) __BCH_TRANS_COMMIT_##n,
__BTREE_INSERT_LAZY_RW, BCH_TRANS_COMMIT_FLAGS()
__BTREE_INSERT_JOURNAL_REPLAY, #undef x
__BTREE_INSERT_JOURNAL_RECLAIM,
__BTREE_INSERT_NOWAIT,
__BTREE_INSERT_GC_LOCK_HELD,
__BCH_HASH_SET_MUST_CREATE,
__BCH_HASH_SET_MUST_REPLACE,
}; };
/* Don't check for -ENOSPC: */ enum bch_trans_commit_flags {
#define BTREE_INSERT_NOFAIL BIT(__BTREE_INSERT_NOFAIL) #define x(n, ...) BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n),
BCH_TRANS_COMMIT_FLAGS()
#define BTREE_INSERT_NOCHECK_RW BIT(__BTREE_INSERT_NOCHECK_RW) #undef x
#define BTREE_INSERT_LAZY_RW BIT(__BTREE_INSERT_LAZY_RW) };
/* Insert is for journal replay - don't get journal reservations: */
#define BTREE_INSERT_JOURNAL_REPLAY BIT(__BTREE_INSERT_JOURNAL_REPLAY)
/* Insert is being called from journal reclaim path: */
#define BTREE_INSERT_JOURNAL_RECLAIM BIT(__BTREE_INSERT_JOURNAL_RECLAIM)
/* Don't block on allocation failure (for new btree nodes): */
#define BTREE_INSERT_NOWAIT BIT(__BTREE_INSERT_NOWAIT)
#define BTREE_INSERT_GC_LOCK_HELD BIT(__BTREE_INSERT_GC_LOCK_HELD)
#define BCH_HASH_SET_MUST_CREATE BIT(__BCH_HASH_SET_MUST_CREATE)
#define BCH_HASH_SET_MUST_REPLACE BIT(__BCH_HASH_SET_MUST_REPLACE)
int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
unsigned, unsigned); unsigned, unsigned);
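The x-macro above is worth pausing on: one flag list now generates both the bit-position enum and the flag-value enum (and can generate help-string tables), so the names can never drift apart the way the old hand-maintained enum + #define pairs could. A minimal standalone sketch of the technique with toy flag names; note the real enum offsets its first bit past BCH_WATERMARK_BITS, which this sketch omits:

#include <stdio.h>

#define BIT(n) (1UL << (n))

/* One list, expanded several times with different definitions of x(): */
#define MY_FLAGS()                                      \
        x(no_enospc,    "don't check for enospc")       \
        x(lazy_rw,      "go read-write lazily")

enum __my_flags {
#define x(n, ...) __MY_FLAG_##n,
        MY_FLAGS()
#undef x
};

enum my_flags {
#define x(n, ...) MY_FLAG_##n = BIT(__MY_FLAG_##n),
        MY_FLAGS()
#undef x
};

/* The same list can also generate a name table for debug output: */
static const char * const my_flag_names[] = {
#define x(n, desc) [__MY_FLAG_##n] = #n,
        MY_FLAGS()
#undef x
};

int main(void)
{
        unsigned flags = MY_FLAG_no_enospc | MY_FLAG_lazy_rw;

        for (unsigned i = 0; i < sizeof(my_flag_names) / sizeof(my_flag_names[0]); i++)
                if (flags & BIT(i))
                        printf("%s\n", my_flag_names[i]);
        return 0;
}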

View File

@ -475,9 +475,6 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
/* /*
* Protects reaping from the btree node cache and using the btree node * Protects reaping from the btree node cache and using the btree node
* open bucket reserve: * open bucket reserve:
*
* BTREE_INSERT_NOWAIT only applies to btree node allocation, not
* blocking on this lock:
*/ */
ret = bch2_btree_cache_cannibalize_lock(c, cl); ret = bch2_btree_cache_cannibalize_lock(c, cl);
if (ret) if (ret)
@ -487,9 +484,8 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
struct prealloc_nodes *p = as->prealloc_nodes + interior; struct prealloc_nodes *p = as->prealloc_nodes + interior;
while (p->nr < nr_nodes[interior]) { while (p->nr < nr_nodes[interior]) {
b = __bch2_btree_node_alloc(trans, &as->disk_res, b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
flags & BTREE_INSERT_NOWAIT ? NULL : cl, interior, flags);
interior, flags);
if (IS_ERR(b)) { if (IS_ERR(b)) {
ret = PTR_ERR(b); ret = PTR_ERR(b);
goto err; goto err;
@ -513,8 +509,6 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *
up_read(&c->gc_lock); up_read(&c->gc_lock);
as->took_gc_lock = false; as->took_gc_lock = false;
bch2_journal_preres_put(&c->journal, &as->journal_preres);
bch2_journal_pin_drop(&c->journal, &as->journal); bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_pin_flush(&c->journal, &as->journal); bch2_journal_pin_flush(&c->journal, &as->journal);
bch2_disk_reservation_put(c, &as->disk_res); bch2_disk_reservation_put(c, &as->disk_res);
@ -646,9 +640,9 @@ static void btree_update_nodes_written(struct btree_update *as)
*/ */
ret = commit_do(trans, &as->disk_res, &journal_seq, ret = commit_do(trans, &as->disk_res, &journal_seq,
BCH_WATERMARK_reclaim| BCH_WATERMARK_reclaim|
BTREE_INSERT_NOFAIL| BCH_TRANS_COMMIT_no_enospc|
BTREE_INSERT_NOCHECK_RW| BCH_TRANS_COMMIT_no_check_rw|
BTREE_INSERT_JOURNAL_RECLAIM, BCH_TRANS_COMMIT_journal_reclaim,
btree_update_nodes_written_trans(trans, as)); btree_update_nodes_written_trans(trans, as));
bch2_trans_unlock(trans); bch2_trans_unlock(trans);
@ -734,8 +728,6 @@ err:
bch2_journal_pin_drop(&c->journal, &as->journal); bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_preres_put(&c->journal, &as->journal_preres);
mutex_lock(&c->btree_interior_update_lock); mutex_lock(&c->btree_interior_update_lock);
for (i = 0; i < as->nr_new_nodes; i++) { for (i = 0; i < as->nr_new_nodes; i++) {
b = as->new_nodes[i]; b = as->new_nodes[i];
@ -818,6 +810,12 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
mutex_unlock(&c->btree_interior_update_lock); mutex_unlock(&c->btree_interior_update_lock);
} }
static int bch2_update_reparent_journal_pin_flush(struct journal *j,
struct journal_entry_pin *_pin, u64 seq)
{
return 0;
}
static void btree_update_reparent(struct btree_update *as, static void btree_update_reparent(struct btree_update *as,
struct btree_update *child) struct btree_update *child)
{ {
@ -828,7 +826,8 @@ static void btree_update_reparent(struct btree_update *as,
child->b = NULL; child->b = NULL;
child->mode = BTREE_INTERIOR_UPDATING_AS; child->mode = BTREE_INTERIOR_UPDATING_AS;
bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
bch2_update_reparent_journal_pin_flush);
} }
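bch2_journal_pin_copy() callers now pass an explicit flush callback, even a no-op one, where they previously passed NULL. A toy sketch of the pattern under assumed names (not the bcachefs journal API): the pin always carries a callable flush function, so reclaim code can invoke it unconditionally instead of special-casing NULL:

#include <stdio.h>
#include <stdint.h>

/* Toy model: a pin holds the sequence it protects plus a flush callback. */
struct pin {
        uint64_t        seq;
        int             (*flush)(struct pin *, uint64_t);
};

/* A no-op flush, mirroring the bch2_update_reparent_journal_pin_flush idea: */
static int noop_flush(struct pin *pin, uint64_t seq)
{
        (void) pin;
        (void) seq;
        return 0;
}

/* Copying a pin always takes an explicit callback; NULL is not allowed. */
static void pin_copy(struct pin *dst, struct pin *src,
                     int (*flush)(struct pin *, uint64_t))
{
        dst->seq   = src->seq;
        dst->flush = flush;
}

int main(void)
{
        struct pin parent = { .seq = 42, .flush = noop_flush };
        struct pin child;

        pin_copy(&child, &parent, noop_flush);

        /* Reclaim can now call child.flush() unconditionally: */
        printf("flush returned %d for seq %llu\n",
               child.flush(&child, child.seq),
               (unsigned long long) child.seq);
        return 0;
}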
static void btree_update_updated_root(struct btree_update *as, struct btree *b) static void btree_update_updated_root(struct btree_update *as, struct btree *b)
@ -937,6 +936,12 @@ static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct b
b->ob.v[--b->ob.nr]; b->ob.v[--b->ob.nr];
} }
static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
struct journal_entry_pin *_pin, u64 seq)
{
return 0;
}
/* /*
* @b is being split/rewritten: it may have pointers to not-yet-written btree * @b is being split/rewritten: it may have pointers to not-yet-written btree
* nodes and thus outstanding btree_updates - redirect @b's * nodes and thus outstanding btree_updates - redirect @b's
@ -988,11 +993,13 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
* when the new nodes are persistent and reachable on disk: * when the new nodes are persistent and reachable on disk:
*/ */
w = btree_current_write(b); w = btree_current_write(b);
bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
bch2_btree_update_will_free_node_journal_pin_flush);
bch2_journal_pin_drop(&c->journal, &w->journal); bch2_journal_pin_drop(&c->journal, &w->journal);
w = btree_prev_write(b); w = btree_prev_write(b);
bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
bch2_btree_update_will_free_node_journal_pin_flush);
bch2_journal_pin_drop(&c->journal, &w->journal); bch2_journal_pin_drop(&c->journal, &w->journal);
mutex_unlock(&c->btree_interior_update_lock); mutex_unlock(&c->btree_interior_update_lock);
@ -1042,7 +1049,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct btree_update *as; struct btree_update *as;
u64 start_time = local_clock(); u64 start_time = local_clock();
int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
? BCH_DISK_RESERVATION_NOFAIL : 0; ? BCH_DISK_RESERVATION_NOFAIL : 0;
unsigned nr_nodes[2] = { 0, 0 }; unsigned nr_nodes[2] = { 0, 0 };
unsigned update_level = level; unsigned update_level = level;
@ -1061,7 +1068,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
flags &= ~BCH_WATERMARK_MASK; flags &= ~BCH_WATERMARK_MASK;
flags |= watermark; flags |= watermark;
if (flags & BTREE_INSERT_JOURNAL_RECLAIM) if (flags & BCH_TRANS_COMMIT_journal_reclaim)
journal_flags |= JOURNAL_RES_GET_NONBLOCK; journal_flags |= JOURNAL_RES_GET_NONBLOCK;
journal_flags |= watermark; journal_flags |= watermark;
@ -1087,9 +1094,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
} }
if (flags & BTREE_INSERT_GC_LOCK_HELD) if (!down_read_trylock(&c->gc_lock)) {
lockdep_assert_held(&c->gc_lock);
else if (!down_read_trylock(&c->gc_lock)) {
ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0)); ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
if (ret) { if (ret) {
up_read(&c->gc_lock); up_read(&c->gc_lock);
@ -1103,7 +1108,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
as->c = c; as->c = c;
as->start_time = start_time; as->start_time = start_time;
as->mode = BTREE_INTERIOR_NO_UPDATE; as->mode = BTREE_INTERIOR_NO_UPDATE;
as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); as->took_gc_lock = true;
as->btree_id = path->btree_id; as->btree_id = path->btree_id;
as->update_level = update_level; as->update_level = update_level;
INIT_LIST_HEAD(&as->list); INIT_LIST_HEAD(&as->list);
@ -1129,27 +1134,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
if (ret) if (ret)
goto err; goto err;
ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
BTREE_UPDATE_JOURNAL_RES,
journal_flags|JOURNAL_RES_GET_NONBLOCK);
if (ret) {
if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
goto err;
}
ret = drop_locks_do(trans,
bch2_journal_preres_get(&c->journal, &as->journal_preres,
BTREE_UPDATE_JOURNAL_RES,
journal_flags));
if (ret == -BCH_ERR_journal_preres_get_blocked) {
trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get);
}
if (ret)
goto err;
}
ret = bch2_disk_reservation_get(c, &as->disk_res, ret = bch2_disk_reservation_get(c, &as->disk_res,
(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
c->opts.metadata_replicas, c->opts.metadata_replicas,
@ -1167,7 +1151,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
* flag * flag
*/ */
if (bch2_err_matches(ret, ENOSPC) && if (bch2_err_matches(ret, ENOSPC) &&
(flags & BTREE_INSERT_JOURNAL_RECLAIM) && (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark != BCH_WATERMARK_reclaim) { watermark != BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock; ret = -BCH_ERR_journal_reclaim_would_deadlock;
goto err; goto err;
@ -1855,7 +1839,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
parent = btree_node_parent(path, b); parent = btree_node_parent(path, b);
as = bch2_btree_update_start(trans, path, level, false, as = bch2_btree_update_start(trans, path, level, false,
BTREE_INSERT_NOFAIL|flags); BCH_TRANS_COMMIT_no_enospc|flags);
ret = PTR_ERR_OR_ZERO(as); ret = PTR_ERR_OR_ZERO(as);
if (ret) if (ret)
goto err; goto err;
@ -1941,7 +1925,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
struct btree_update *as; struct btree_update *as;
int ret; int ret;
flags |= BTREE_INSERT_NOFAIL; flags |= BCH_TRANS_COMMIT_no_enospc;
parent = btree_node_parent(iter->path, b); parent = btree_node_parent(iter->path, b);
as = bch2_btree_update_start(trans, iter->path, b->c.level, as = bch2_btree_update_start(trans, iter->path, b->c.level,
@ -2418,23 +2402,17 @@ void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry
struct jset_entry * struct jset_entry *
bch2_btree_roots_to_journal_entries(struct bch_fs *c, bch2_btree_roots_to_journal_entries(struct bch_fs *c,
struct jset_entry *start, struct jset_entry *end,
struct jset_entry *end) unsigned long skip)
{ {
struct jset_entry *entry;
unsigned long have = 0;
unsigned i; unsigned i;
for (entry = start; entry < end; entry = vstruct_next(entry))
if (entry->type == BCH_JSET_ENTRY_btree_root)
__set_bit(entry->btree_id, &have);
mutex_lock(&c->btree_root_lock); mutex_lock(&c->btree_root_lock);
for (i = 0; i < btree_id_nr_alive(c); i++) { for (i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i); struct btree_root *r = bch2_btree_id_root(c, i);
if (r->alive && !test_bit(i, &have)) { if (r->alive && !test_bit(i, &skip)) {
journal_entry_set(end, BCH_JSET_ENTRY_btree_root, journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
i, r->level, &r->key, r->key.k.u64s); i, r->level, &r->key, r->key.k.u64s);
end = vstruct_next(end); end = vstruct_next(end);
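The reworked bch2_btree_roots_to_journal_entries() no longer rescans the jset entries to discover which roots are already present; the caller hands it that information as an unsigned long bitmask. A standalone sketch of the skip-bitmask idiom (toy IDs; the kernel code uses __set_bit()/test_bit() on the same unsigned long):

#include <stdio.h>

int main(void)
{
        /* Toy stand-ins for btree IDs 0..5, with IDs 1 and 3 already emitted: */
        unsigned long skip = 0;

        skip |= 1UL << 1;
        skip |= 1UL << 3;

        for (unsigned i = 0; i < 6; i++)
                if (!(skip & (1UL << i)))
                        printf("emit root for btree %u\n", i);
        return 0;
}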

View File

@ -55,7 +55,6 @@ struct btree_update {
unsigned update_level; unsigned update_level;
struct disk_reservation disk_res; struct disk_reservation disk_res;
struct journal_preres journal_preres;
/* /*
* BTREE_INTERIOR_UPDATING_NODE: * BTREE_INTERIOR_UPDATING_NODE:
@ -325,7 +324,7 @@ bool bch2_btree_interior_updates_flush(struct bch_fs *);
void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *); void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
struct jset_entry *, struct jset_entry *); struct jset_entry *, unsigned long);
void bch2_do_pending_node_rewrites(struct bch_fs *); void bch2_do_pending_node_rewrites(struct bch_fs *);
void bch2_free_pending_node_rewrites(struct bch_fs *); void bch2_free_pending_node_rewrites(struct bch_fs *);

View File

@ -9,9 +9,11 @@
#include "journal.h" #include "journal.h"
#include "journal_reclaim.h" #include "journal_reclaim.h"
#include <linux/atomic.h>
#include <linux/sort.h> #include <linux/sort.h>
static int bch2_btree_write_buffer_journal_flush(struct journal *,
struct journal_entry_pin *, u64);
static int btree_write_buffered_key_cmp(const void *_l, const void *_r) static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
{ {
const struct btree_write_buffered_key *l = _l; const struct btree_write_buffered_key *l = _l;
@ -46,6 +48,13 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
if (ret) if (ret)
return ret; return ret;
/*
* We can't clone a path that has write locks: unshare it now, before
* set_pos and traverse():
*/
if (iter->path->ref > 1)
iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_);
path = iter->path; path = iter->path;
if (!*write_locked) { if (!*write_locked) {
@ -65,24 +74,18 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
(*fast)++; (*fast)++;
if (path->ref > 1) {
/*
* We can't clone a path that has write locks: if the path is
* shared, unlock before set_pos(), traverse():
*/
bch2_btree_node_unlock_write(trans, path, path->l[0].b);
*write_locked = false;
}
return 0; return 0;
trans_commit: trans_commit:
return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k, trans->journal_res.seq = wb->journal_seq;
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
return bch2_trans_update(trans, iter, &wb->k,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL, bch2_trans_commit(trans, NULL, NULL,
commit_flags| commit_flags|
BTREE_INSERT_NOCHECK_RW| BCH_TRANS_COMMIT_no_check_rw|
BTREE_INSERT_NOFAIL| BCH_TRANS_COMMIT_no_enospc|
BTREE_INSERT_JOURNAL_RECLAIM); BCH_TRANS_COMMIT_no_journal_res|
BCH_TRANS_COMMIT_journal_reclaim);
} }
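The hunk above moves the shared-path handling earlier: a path with ref > 1 is unshared before it is repositioned and write-locked, rather than dropping the write lock afterwards once sharing is noticed. A toy copy-on-write sketch of unshare-before-mutate, under assumed names (not the btree_path API):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct path {
        int     ref;    /* how many iterators point at this path */
        int     pos;
};

/* If the path is shared, mutate a private copy instead: */
static struct path *path_make_mut(struct path *p)
{
        if (p->ref > 1) {
                struct path *copy = malloc(sizeof(*copy));

                if (!copy)
                        return NULL;
                memcpy(copy, p, sizeof(*copy));
                copy->ref = 1;
                p->ref--;
                return copy;
        }
        return p;
}

int main(void)
{
        struct path shared = { .ref = 2, .pos = 10 };
        struct path *mine = path_make_mut(&shared);

        if (!mine)
                return 1;
        mine->pos = 20; /* safe: no other user sees this */
        printf("shared.pos=%d mine->pos=%d\n", shared.pos, mine->pos);
        if (mine != &shared)
                free(mine);
        return 0;
}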
static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb) static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
@ -125,9 +128,11 @@ btree_write_buffered_insert(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k), bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
BTREE_ITER_CACHED|BTREE_ITER_INTENT); BTREE_ITER_CACHED|BTREE_ITER_INTENT);
trans->journal_res.seq = wb->journal_seq;
ret = bch2_btree_iter_traverse(&iter) ?: ret = bch2_btree_iter_traverse(&iter) ?:
bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k, bch2_trans_update(trans, &iter, &wb->k,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
bch2_trans_iter_exit(trans, &iter); bch2_trans_iter_exit(trans, &iter);
return ret; return ret;
} }
@ -151,7 +156,8 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
if (!locked && !mutex_trylock(&wb->flush_lock)) if (!locked && !mutex_trylock(&wb->flush_lock))
return 0; return 0;
bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL); bch2_journal_pin_copy(j, &pin, &wb->journal_pin,
bch2_btree_write_buffer_journal_flush);
bch2_journal_pin_drop(j, &wb->journal_pin); bch2_journal_pin_drop(j, &wb->journal_pin);
s = btree_write_buffer_switch(wb); s = btree_write_buffer_switch(wb);
@ -169,7 +175,7 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
* However, since we're not flushing in the order they appear in the * However, since we're not flushing in the order they appear in the
* journal we won't be able to drop our journal pin until everything is * journal we won't be able to drop our journal pin until everything is
* flushed - which means this could deadlock the journal if we weren't * flushed - which means this could deadlock the journal if we weren't
* passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail
* if it would block taking a journal reservation. * if it would block taking a journal reservation.
* *
* If that happens, simply skip the key so we can optimistically insert * If that happens, simply skip the key so we can optimistically insert
@ -253,21 +259,14 @@ slowpath:
if (!i->journal_seq) if (!i->journal_seq)
continue; continue;
if (i->journal_seq > pin.seq) { bch2_journal_pin_update(j, i->journal_seq, &pin,
struct journal_entry_pin pin2; bch2_btree_write_buffer_journal_flush);
memset(&pin2, 0, sizeof(pin2));
bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL);
bch2_journal_pin_drop(j, &pin);
bch2_journal_pin_copy(j, &pin, &pin2, NULL);
bch2_journal_pin_drop(j, &pin2);
}
ret = commit_do(trans, NULL, NULL, ret = commit_do(trans, NULL, NULL,
commit_flags| commit_flags|
BTREE_INSERT_NOFAIL| BCH_TRANS_COMMIT_no_enospc|
BTREE_INSERT_JOURNAL_RECLAIM, BCH_TRANS_COMMIT_no_journal_res|
BCH_TRANS_COMMIT_journal_reclaim,
btree_write_buffered_insert(trans, i)); btree_write_buffered_insert(trans, i));
if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret))) if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
break; break;
@ -297,7 +296,7 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j,
mutex_lock(&wb->flush_lock); mutex_lock(&wb->flush_lock);
return bch2_trans_run(c, return bch2_trans_run(c,
__bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true)); __bch2_btree_write_buffer_flush(trans, BCH_TRANS_COMMIT_no_check_rw, true));
} }
static inline u64 btree_write_buffer_ref(int idx) static inline u64 btree_write_buffer_ref(int idx)

libbcachefs/darray.c (new file, 21 lines)
View File

@ -0,0 +1,21 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/log2.h>
#include <linux/slab.h>
#include "darray.h"
int __bch2_darray_resize(darray_void *d, size_t element_size, size_t new_size, gfp_t gfp)
{
if (new_size > d->size) {
new_size = roundup_pow_of_two(new_size);
void *data = krealloc_array(d->data, new_size, element_size, gfp);
if (!data)
return -ENOMEM;
d->data = data;
d->size = new_size;
}
return 0;
}
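Two details of the new resize path: growth rounds the requested size up to a power of two, giving amortized O(1) pushes, and the inline wrapper (in darray.h below) keeps the common already-big-enough case out of line. A userspace sketch of the same policy, using plain realloc() in place of krealloc_array(), which additionally checks the size multiplication for overflow:

#include <stdio.h>
#include <stdlib.h>

struct darray_void {
        void    *data;
        size_t  nr, size;       /* elements used / allocated */
};

static size_t roundup_pow_of_two(size_t n)
{
        size_t r = 1;

        while (r < n)
                r <<= 1;
        return r;
}

/* Slow path: only called when new_size exceeds the current allocation. */
static int darray_resize_slow(struct darray_void *d, size_t elem, size_t new_size)
{
        void *data;

        new_size = roundup_pow_of_two(new_size);
        data = realloc(d->data, new_size * elem);       /* overflow check elided */
        if (!data)
                return -1;
        d->data = data;
        d->size = new_size;
        return 0;
}

static int darray_make_room(struct darray_void *d, size_t elem, size_t more)
{
        return d->nr + more > d->size
                ? darray_resize_slow(d, elem, d->nr + more)
                : 0;    /* fast path: nothing to do */
}

int main(void)
{
        struct darray_void d = { 0 };

        for (int i = 0; i < 100; i++) {
                if (darray_make_room(&d, sizeof(int), 1))
                        return 1;
                ((int *) d.data)[d.nr++] = i;
        }
        printf("nr=%zu size=%zu\n", d.nr, d.size);      /* size=128 */
        free(d.data);
        return 0;
}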

View File

@ -8,7 +8,6 @@
* Inspired by CCAN's darray * Inspired by CCAN's darray
*/ */
#include "util.h"
#include <linux/slab.h> #include <linux/slab.h>
#define DARRAY(type) \ #define DARRAY(type) \
@ -19,20 +18,25 @@ struct { \
typedef DARRAY(void) darray_void; typedef DARRAY(void) darray_void;
int __bch2_darray_resize(darray_void *, size_t, size_t, gfp_t);
static inline int __darray_resize(darray_void *d, size_t element_size,
size_t new_size, gfp_t gfp)
{
return unlikely(new_size > d->size)
? __bch2_darray_resize(d, element_size, new_size, gfp)
: 0;
}
#define darray_resize_gfp(_d, _new_size, _gfp) \
__darray_resize((darray_void *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)
#define darray_resize(_d, _new_size) \
darray_resize_gfp(_d, _new_size, GFP_KERNEL)
static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp) static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp)
{ {
if (d->nr + more > d->size) { return __darray_resize(d, t_size, d->nr + more, gfp);
size_t new_size = roundup_pow_of_two(d->nr + more);
void *data = krealloc_array(d->data, new_size, t_size, gfp);
if (!data)
return -ENOMEM;
d->data = data;
d->size = new_size;
}
return 0;
} }
#define darray_make_room_gfp(_d, _more, _gfp) \ #define darray_make_room_gfp(_d, _more, _gfp) \
@ -41,6 +45,8 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more,
#define darray_make_room(_d, _more) \ #define darray_make_room(_d, _more) \
darray_make_room_gfp(_d, _more, GFP_KERNEL) darray_make_room_gfp(_d, _more, GFP_KERNEL)
#define darray_room(_d) ((_d).size - (_d).nr)
#define darray_top(_d) ((_d).data[(_d).nr]) #define darray_top(_d) ((_d).data[(_d).nr])
#define darray_push_gfp(_d, _item, _gfp) \ #define darray_push_gfp(_d, _item, _gfp) \

View File

@ -239,6 +239,34 @@ restart_drop_extra_replicas:
next_pos = insert->k.p; next_pos = insert->k.p;
/*
* Check for nonce offset inconsistency:
* This is debug code - we've been seeing this bug rarely, and
* it's been hard to reproduce, so this should give us some more
* information when it does occur:
*/
struct printbuf err = PRINTBUF;
int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err);
printbuf_exit(&err);
if (invalid) {
struct printbuf buf = PRINTBUF;
prt_str(&buf, "about to insert invalid key in data update path");
prt_str(&buf, "\nold: ");
bch2_bkey_val_to_text(&buf, c, old);
prt_str(&buf, "\nk: ");
bch2_bkey_val_to_text(&buf, c, k);
prt_str(&buf, "\nnew: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
bch2_print_string_as_lines(KERN_ERR, buf.buf);
printbuf_exit(&buf);
bch2_fatal_error(c);
goto out;
}
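The debug block above is a validate-before-commit guard: rather than writing a key that fails bch2_bkey_invalid(), it dumps the old and new keys and goes fatal via bch2_fatal_error(). A toy sketch of the shape of that guard, with an invented validity rule standing in for the real checks:

#include <stdio.h>
#include <stdlib.h>

struct key { long start, end; };

/* Toy validity rule standing in for bch2_bkey_invalid(): */
static const char *key_invalid(const struct key *k)
{
        if (k->start >= k->end)
                return "zero or negative length";
        return NULL;
}

static int insert_key(const struct key *k)
{
        const char *err = key_invalid(k);

        if (err) {
                /* Debug path: report everything we know, then bail hard. */
                fprintf(stderr,
                        "about to insert invalid key [%ld,%ld): %s\n",
                        k->start, k->end, err);
                abort();
        }
        printf("inserted [%ld,%ld)\n", k->start, k->end);
        return 0;
}

int main(void)
{
        struct key good = { 0, 8 };

        return insert_key(&good);
}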
ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?: k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id, bch2_insert_snapshot_whiteouts(trans, m->btree_id,
@ -250,8 +278,8 @@ restart_drop_extra_replicas:
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, &op->res, bch2_trans_commit(trans, &op->res,
NULL, NULL,
BTREE_INSERT_NOCHECK_RW| BCH_TRANS_COMMIT_no_check_rw|
BTREE_INSERT_NOFAIL| BCH_TRANS_COMMIT_no_enospc|
m->data_opts.btree_insert_flags); m->data_opts.btree_insert_flags);
if (!ret) { if (!ret) {
bch2_btree_iter_set_pos(&iter, next_pos); bch2_btree_iter_set_pos(&iter, next_pos);

View File

@ -201,7 +201,8 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
const struct bch_hash_info *hash_info, const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum, u8 type, const struct qstr *name, u64 dst_inum,
u64 *dir_offset, int flags) u64 *dir_offset,
bch_str_hash_flags_t str_hash_flags)
{ {
struct bkey_i_dirent *dirent; struct bkey_i_dirent *dirent;
int ret; int ret;
@ -212,7 +213,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
return ret; return ret;
ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
dir, &dirent->k_i, flags); dir, &dirent->k_i, str_hash_flags);
*dir_offset = dirent->k.p.offset; *dir_offset = dirent->k.p.offset;
return ret; return ret;

View File

@ -37,7 +37,8 @@ int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
int bch2_dirent_create(struct btree_trans *, subvol_inum, int bch2_dirent_create(struct btree_trans *, subvol_inum,
const struct bch_hash_info *, u8, const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *, int); const struct qstr *, u64, u64 *,
bch_str_hash_flags_t);
static inline unsigned vfs_d_type(unsigned type) static inline unsigned vfs_d_type(unsigned type)
{ {

View File

@ -555,6 +555,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
case TARGET_DEV: { case TARGET_DEV: {
struct bch_dev *ca; struct bch_dev *ca;
out->atomic++;
rcu_read_lock(); rcu_read_lock();
ca = t.dev < c->sb.nr_devices ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev]) ? rcu_dereference(c->devs[t.dev])
@ -570,6 +571,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
} }
rcu_read_unlock(); rcu_read_unlock();
out->atomic--;
break; break;
} }
case TARGET_GROUP: case TARGET_GROUP:
@ -580,7 +582,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
} }
} }
void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
{ {
struct target t = target_decode(v); struct target t = target_decode(v);

View File

@ -150,6 +150,7 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset); prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
if (i < nr_data) if (i < nr_data)
prt_printf(out, "#%u", stripe_blockcount_get(s, i)); prt_printf(out, "#%u", stripe_blockcount_get(s, i));
prt_printf(out, " gen %u", ptr->gen);
if (ptr_stale(ca, ptr)) if (ptr_stale(ca, ptr))
prt_printf(out, " stale"); prt_printf(out, " stale");
} }
@ -303,16 +304,21 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
struct bch_csum got = ec_block_checksum(buf, i, offset); struct bch_csum got = ec_block_checksum(buf, i, offset);
if (bch2_crc_cmp(want, got)) { if (bch2_crc_cmp(want, got)) {
struct printbuf buf2 = PRINTBUF; struct printbuf err = PRINTBUF;
struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);
bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key)); prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n",
want.hi, want.lo,
got.hi, got.lo,
bch2_csum_types[v->csum_type]);
prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
bch_err_ratelimited(ca, "%s", err.buf);
printbuf_exit(&err);
bch_err_ratelimited(c,
"stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
(void *) _RET_IP_, i, j, v->csum_type,
want.lo, got.lo, buf2.buf);
printbuf_exit(&buf2);
clear_bit(i, buf->valid); clear_bit(i, buf->valid);
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
break; break;
} }
@ -475,14 +481,10 @@ err:
return ret; return ret;
} }
static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
{
return bch2_trans_run(c, get_stripe_key_trans(trans, idx, stripe));
}
/* recovery read path: */ /* recovery read path: */
int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
{ {
struct bch_fs *c = trans->c;
struct ec_stripe_buf *buf; struct ec_stripe_buf *buf;
struct closure cl; struct closure cl;
struct bch_stripe *v; struct bch_stripe *v;
@ -497,7 +499,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
if (!buf) if (!buf)
return -BCH_ERR_ENOMEM_ec_read_extent; return -BCH_ERR_ENOMEM_ec_read_extent;
ret = get_stripe_key(c, rbio->pick.ec.idx, buf); ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
if (ret) { if (ret) {
bch_err_ratelimited(c, bch_err_ratelimited(c,
"error doing reconstruct read: error %i looking up stripe", ret); "error doing reconstruct read: error %i looking up stripe", ret);
@ -801,7 +803,7 @@ static void ec_stripe_delete_work(struct work_struct *work)
if (!idx) if (!idx)
break; break;
ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
ec_stripe_delete(trans, idx)); ec_stripe_delete(trans, idx));
if (ret) { if (ret) {
bch_err_fn(c, ret); bch_err_fn(c, ret);
@ -981,8 +983,8 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
while (1) { while (1) {
ret = commit_do(trans, NULL, NULL, ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_NOCHECK_RW| BCH_TRANS_COMMIT_no_check_rw|
BTREE_INSERT_NOFAIL, BCH_TRANS_COMMIT_no_enospc,
ec_stripe_update_extent(trans, bucket_pos, bucket.gen, ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
s, &bp_pos)); s, &bp_pos));
if (ret) if (ret)
@ -1119,8 +1121,8 @@ static void ec_stripe_create(struct ec_stripe_new *s)
} }
ret = bch2_trans_do(c, &s->res, NULL, ret = bch2_trans_do(c, &s->res, NULL,
BTREE_INSERT_NOCHECK_RW| BCH_TRANS_COMMIT_no_check_rw|
BTREE_INSERT_NOFAIL, BCH_TRANS_COMMIT_no_enospc,
ec_stripe_key_update(trans, ec_stripe_key_update(trans,
bkey_i_to_stripe(&s->new_stripe.key), bkey_i_to_stripe(&s->new_stripe.key),
!s->have_existing_stripe)); !s->have_existing_stripe));
@ -1371,6 +1373,15 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
h->nr_active_devs++; h->nr_active_devs++;
rcu_read_unlock(); rcu_read_unlock();
/*
* If we only have redundancy + 1 devices, we're better off with just
* replication:
*/
if (h->nr_active_devs < h->redundancy + 2)
bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
h->nr_active_devs, h->redundancy + 2);
list_add(&h->list, &c->ec_stripe_head_list); list_add(&h->list, &c->ec_stripe_head_list);
return h; return h;
} }
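The r + 2 bound follows from arithmetic: with redundancy r and only r + 1 devices, each stripe holds one data block plus r parity blocks, costing (r + 1)x space, which is exactly what (r + 1)-way replication costs without any of the stripe machinery. A quick standalone check of how the overhead falls as the device count grows:

#include <stdio.h>

int main(void)
{
        unsigned redundancy = 1;        /* parity blocks per stripe */

        for (unsigned devs = redundancy + 1; devs <= 6; devs++) {
                unsigned data = devs - redundancy;

                /* blocks written per user block; devs == r + 1 gives 2.00x */
                printf("devs=%u data=%u overhead=%u.%02ux\n",
                       devs, data,
                       (data + redundancy) / data,
                       ((data + redundancy) % data) * 100 / data);
        }
        return 0;
}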
@ -1422,6 +1433,11 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark); h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
found: found:
if (!IS_ERR_OR_NULL(h) &&
h->nr_active_devs < h->redundancy + 2) {
mutex_unlock(&h->lock);
h = NULL;
}
mutex_unlock(&c->ec_stripe_head_lock); mutex_unlock(&c->ec_stripe_head_lock);
return h; return h;
} }
@ -1679,8 +1695,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
int ret; int ret;
h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark); h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
if (!h)
bch_err(c, "no stripe head");
if (IS_ERR_OR_NULL(h)) if (IS_ERR_OR_NULL(h))
return h; return h;

View File

@ -199,7 +199,7 @@ struct ec_stripe_head {
struct ec_stripe_new *s; struct ec_stripe_new *s;
}; };
int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *);
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);

View File

@ -73,7 +73,6 @@
x(ENOMEM, ENOMEM_fsck_add_nlink) \ x(ENOMEM, ENOMEM_fsck_add_nlink) \
x(ENOMEM, ENOMEM_journal_key_insert) \ x(ENOMEM, ENOMEM_journal_key_insert) \
x(ENOMEM, ENOMEM_journal_keys_sort) \ x(ENOMEM, ENOMEM_journal_keys_sort) \
x(ENOMEM, ENOMEM_journal_replay) \
x(ENOMEM, ENOMEM_read_superblock_clean) \ x(ENOMEM, ENOMEM_read_superblock_clean) \
x(ENOMEM, ENOMEM_fs_alloc) \ x(ENOMEM, ENOMEM_fs_alloc) \
x(ENOMEM, ENOMEM_fs_name_alloc) \ x(ENOMEM, ENOMEM_fs_name_alloc) \

View File

@ -13,7 +13,7 @@
int bch2_filemap_get_contig_folios_d(struct address_space *mapping, int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
loff_t start, u64 end, loff_t start, u64 end,
int fgp_flags, gfp_t gfp, fgf_t fgp_flags, gfp_t gfp,
folios *fs) folios *fs)
{ {
struct folio *f; struct folio *f;

View File

@ -7,7 +7,7 @@
typedef DARRAY(struct folio *) folios; typedef DARRAY(struct folio *) folios;
int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t, int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t,
u64, int, gfp_t, folios *); u64, fgf_t, gfp_t, folios *);
int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t); int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t);
/* /*

View File

@ -93,7 +93,7 @@ retry:
BTREE_ITER_INTENT) ?: BTREE_ITER_INTENT) ?:
(set ? set(trans, inode, &inode_u, p) : 0) ?: (set ? set(trans, inode, &inode_u, p) : 0) ?:
bch2_inode_write(trans, &iter, &inode_u) ?: bch2_inode_write(trans, &iter, &inode_u) ?:
bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
/* /*
* the btree node lock protects inode->ei_inode, not ei_update_lock; * the btree node lock protects inode->ei_inode, not ei_update_lock;
@ -452,7 +452,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
ret = commit_do(trans, NULL, NULL, ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL, BCH_TRANS_COMMIT_no_enospc,
bch2_unlink_trans(trans, bch2_unlink_trans(trans,
inode_inum(dir), &dir_u, inode_inum(dir), &dir_u,
&inode_u, &dentry->d_name, &inode_u, &dentry->d_name,
@ -717,7 +717,7 @@ retry:
ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
bch2_trans_commit(trans, NULL, NULL, bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL); BCH_TRANS_COMMIT_no_enospc);
btree_err: btree_err:
bch2_trans_iter_exit(trans, &inode_iter); bch2_trans_iter_exit(trans, &inode_iter);
@ -1922,10 +1922,7 @@ out:
return dget(sb->s_root); return dget(sb->s_root);
err_put_super: err_put_super:
sb->s_fs_info = NULL;
c->vfs_sb = NULL;
deactivate_locked_super(sb); deactivate_locked_super(sb);
bch2_fs_stop(c);
return ERR_PTR(bch2_err_class(ret)); return ERR_PTR(bch2_err_class(ret));
} }
@ -1933,11 +1930,8 @@ static void bch2_kill_sb(struct super_block *sb)
{ {
struct bch_fs *c = sb->s_fs_info; struct bch_fs *c = sb->s_fs_info;
if (c)
c->vfs_sb = NULL;
generic_shutdown_super(sb); generic_shutdown_super(sb);
if (c) bch2_fs_free(c);
bch2_fs_free(c);
} }
static struct file_system_type bcache_fs_type = { static struct file_system_type bcache_fs_type = {

View File

@ -208,8 +208,8 @@ static int fsck_write_inode(struct btree_trans *trans,
u32 snapshot) u32 snapshot)
{ {
int ret = commit_do(trans, NULL, NULL, int ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL| BCH_TRANS_COMMIT_no_enospc|
BTREE_INSERT_LAZY_RW, BCH_TRANS_COMMIT_lazy_rw,
__write_inode(trans, inode, snapshot)); __write_inode(trans, inode, snapshot));
if (ret) if (ret)
bch_err_fn(trans->c, ret); bch_err_fn(trans->c, ret);
@ -354,8 +354,8 @@ static int reattach_inode(struct btree_trans *trans,
u32 inode_snapshot) u32 inode_snapshot)
{ {
int ret = commit_do(trans, NULL, NULL, int ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_LAZY_RW| BCH_TRANS_COMMIT_lazy_rw|
BTREE_INSERT_NOFAIL, BCH_TRANS_COMMIT_no_enospc,
__reattach_inode(trans, inode, inode_snapshot)); __reattach_inode(trans, inode, inode_snapshot));
bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum); bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum);
return ret; return ret;
@ -757,8 +757,8 @@ static int hash_redo_key(struct btree_trans *trans,
BCH_HASH_SET_MUST_CREATE, BCH_HASH_SET_MUST_CREATE,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL, bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL| BCH_TRANS_COMMIT_no_enospc|
BTREE_INSERT_LAZY_RW); BCH_TRANS_COMMIT_lazy_rw);
} }
static int hash_check_key(struct btree_trans *trans, static int hash_check_key(struct btree_trans *trans,
@ -992,7 +992,7 @@ int bch2_check_inodes(struct bch_fs *c)
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
POS_MIN, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_inode(trans, &iter, k, &prev, &s, full)); check_inode(trans, &iter, k, &prev, &s, full));
snapshots_seen_exit(&s); snapshots_seen_exit(&s);
@ -1226,7 +1226,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
k1, k2) ?: k1, k2) ?:
bch2_trans_commit(trans, &res, NULL, bch2_trans_commit(trans, &res, NULL,
BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL); BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc);
bch2_disk_reservation_put(c, &res); bch2_disk_reservation_put(c, &res);
if (ret) if (ret)
@ -1465,7 +1465,7 @@ int bch2_check_extents(struct bch_fs *c)
POS(BCACHEFS_ROOT_INO, 0), POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
&res, NULL, &res, NULL,
BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, ({
bch2_disk_reservation_put(c, &res); bch2_disk_reservation_put(c, &res);
check_extent(trans, &iter, k, &w, &s, &extent_ends) ?: check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
check_extent_overbig(trans, &iter, k); check_extent_overbig(trans, &iter, k);
@ -1494,7 +1494,7 @@ int bch2_check_indirect_extents(struct bch_fs *c)
POS_MIN, POS_MIN,
BTREE_ITER_PREFETCH, k, BTREE_ITER_PREFETCH, k,
&res, NULL, &res, NULL,
BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, ({
bch2_disk_reservation_put(c, &res); bch2_disk_reservation_put(c, &res);
check_extent_overbig(trans, &iter, k); check_extent_overbig(trans, &iter, k);
})); }));
@ -1854,7 +1854,7 @@ int bch2_check_dirents(struct bch_fs *c)
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
k, k,
NULL, NULL, NULL, NULL,
BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)); check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s));
bch2_trans_put(trans); bch2_trans_put(trans);
@ -1918,7 +1918,7 @@ int bch2_check_xattrs(struct bch_fs *c)
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
k, k,
NULL, NULL, NULL, NULL,
BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_xattr(trans, &iter, k, &hash_info, &inode))); check_xattr(trans, &iter, k, &hash_info, &inode)));
bch_err_fn(c, ret); bch_err_fn(c, ret);
return ret; return ret;
@ -1949,8 +1949,8 @@ static int check_root_trans(struct btree_trans *trans)
root_subvol.v.snapshot = cpu_to_le32(snapshot); root_subvol.v.snapshot = cpu_to_le32(snapshot);
root_subvol.v.inode = cpu_to_le64(inum); root_subvol.v.inode = cpu_to_le64(inum);
ret = commit_do(trans, NULL, NULL, ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL| BCH_TRANS_COMMIT_no_enospc|
BTREE_INSERT_LAZY_RW, BCH_TRANS_COMMIT_lazy_rw,
bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, bch2_btree_insert_trans(trans, BTREE_ID_subvolumes,
&root_subvol.k_i, 0)); &root_subvol.k_i, 0));
bch_err_msg(c, ret, "writing root subvol"); bch_err_msg(c, ret, "writing root subvol");
@ -1986,8 +1986,8 @@ int bch2_check_root(struct bch_fs *c)
int ret; int ret;
ret = bch2_trans_do(c, NULL, NULL, ret = bch2_trans_do(c, NULL, NULL,
BTREE_INSERT_NOFAIL| BCH_TRANS_COMMIT_no_enospc|
BTREE_INSERT_LAZY_RW, BCH_TRANS_COMMIT_lazy_rw,
check_root_trans(trans)); check_root_trans(trans));
bch_err_fn(c, ret); bch_err_fn(c, ret);
return ret; return ret;
@ -2116,8 +2116,8 @@ static int check_path(struct btree_trans *trans,
return 0; return 0;
ret = commit_do(trans, NULL, NULL, ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL| BCH_TRANS_COMMIT_no_enospc|
BTREE_INSERT_LAZY_RW, BCH_TRANS_COMMIT_lazy_rw,
remove_backpointer(trans, inode)); remove_backpointer(trans, inode));
if (ret) { if (ret) {
bch_err(c, "error removing dirent: %i", ret); bch_err(c, "error removing dirent: %i", ret);
@ -2398,7 +2398,7 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
POS(0, range_start), POS(0, range_start),
BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
if (ret < 0) { if (ret < 0) {
bch_err(c, "error in fsck: btree error %i while walking inodes", ret); bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
@ -2483,7 +2483,7 @@ int bch2_fix_reflink_p(struct bch_fs *c)
BTREE_ID_extents, POS_MIN, BTREE_ID_extents, POS_MIN,
BTREE_ITER_INTENT|BTREE_ITER_PREFETCH| BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS, k, BTREE_ITER_ALL_SNAPSHOTS, k,
NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
fix_reflink_p_key(trans, &iter, k))); fix_reflink_p_key(trans, &iter, k)));
bch_err_fn(c, ret); bch_err_fn(c, ret);
return ret; return ret;

View File

@ -830,7 +830,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
ret = bch2_trans_update(trans, &iter, &delete, 0) ?: ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
bch2_trans_commit(trans, NULL, NULL, bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL); BCH_TRANS_COMMIT_no_enospc);
err: err:
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
break; break;
@ -893,7 +893,7 @@ retry:
ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL, bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL); BCH_TRANS_COMMIT_no_enospc);
err: err:
bch2_trans_iter_exit(trans, &iter); bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@ -1057,7 +1057,7 @@ retry:
ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL, bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL); BCH_TRANS_COMMIT_no_enospc);
err: err:
bch2_trans_iter_exit(trans, &iter); bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@ -1091,7 +1091,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
ret = bch2_inode_unpack(k, &inode); ret = bch2_inode_unpack(k, &inode);
if (ret) if (ret)
goto err; goto out;
if (fsck_err_on(S_ISDIR(inode.bi_mode), c, if (fsck_err_on(S_ISDIR(inode.bi_mode), c,
deleted_inode_is_dir, deleted_inode_is_dir,
@ -1109,38 +1109,45 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
!fsck_err(c, !fsck_err(c,
deleted_inode_but_clean, deleted_inode_but_clean,
"filesystem marked as clean but have deleted inode %llu:%u", "filesystem marked as clean but have deleted inode %llu:%u",
pos.offset, pos.snapshot)) pos.offset, pos.snapshot)) {
return 0; ret = 0;
goto out;
}
if (bch2_snapshot_is_internal_node(c, pos.snapshot)) { if (bch2_snapshot_is_internal_node(c, pos.snapshot)) {
struct bpos new_min_pos; struct bpos new_min_pos;
ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos); ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos);
if (ret) if (ret)
goto err; goto out;
inode.bi_flags &= ~BCH_INODE_unlinked; inode.bi_flags &= ~BCH_INODE_unlinked;
ret = bch2_inode_write_flags(trans, &inode_iter, &inode, ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_lazy_rw);
bch_err_msg(c, ret, "clearing inode unlinked flag"); bch_err_msg(c, ret, "clearing inode unlinked flag");
if (ret) if (ret)
return ret; goto out;
/* /*
* We'll need another write buffer flush to pick up the new * We'll need another write buffer flush to pick up the new
* unlinked inodes in the snapshot leaves: * unlinked inodes in the snapshot leaves:
*/ */
*need_another_pass = true; *need_another_pass = true;
return 0; goto out;
} }
return 1; ret = 1;
err: out:
fsck_err: fsck_err:
bch2_trans_iter_exit(trans, &inode_iter);
return ret; return ret;
delete: delete:
return bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false); ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
goto out;
} }
int bch2_delete_dead_inodes(struct bch_fs *c) int bch2_delete_dead_inodes(struct bch_fs *c)

View File

@ -256,7 +256,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
u64 new_i_size = le64_to_cpu(op->v.new_i_size); u64 new_i_size = le64_to_cpu(op->v.new_i_size);
int ret; int ret;
ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
truncate_set_isize(trans, inum, new_i_size)); truncate_set_isize(trans, inum, new_i_size));
if (ret) if (ret)
goto err; goto err;
@ -378,7 +378,7 @@ case LOGGED_OP_FINSERT_start:
op->v.state = LOGGED_OP_FINSERT_shift_extents; op->v.state = LOGGED_OP_FINSERT_shift_extents;
if (insert) { if (insert) {
ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
adjust_i_size(trans, inum, src_offset, len) ?: adjust_i_size(trans, inum, src_offset, len) ?:
bch2_logged_op_update(trans, &op->k_i)); bch2_logged_op_update(trans, &op->k_i));
if (ret) if (ret)
@ -390,7 +390,7 @@ case LOGGED_OP_FINSERT_start:
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto err; goto err;
ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_logged_op_update(trans, &op->k_i)); bch2_logged_op_update(trans, &op->k_i));
} }
@ -455,7 +455,7 @@ case LOGGED_OP_FINSERT_shift_extents:
bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
bch2_logged_op_update(trans, &op->k_i) ?: bch2_logged_op_update(trans, &op->k_i) ?:
bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL); bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc);
btree_err: btree_err:
bch2_disk_reservation_put(c, &disk_res); bch2_disk_reservation_put(c, &disk_res);
@ -470,12 +470,12 @@ btree_err:
op->v.state = LOGGED_OP_FINSERT_finish; op->v.state = LOGGED_OP_FINSERT_finish;
if (!insert) { if (!insert) {
ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
adjust_i_size(trans, inum, src_offset, shift) ?: adjust_i_size(trans, inum, src_offset, shift) ?:
bch2_logged_op_update(trans, &op->k_i)); bch2_logged_op_update(trans, &op->k_i));
} else { } else {
/* We need an inode update to update bi_journal_seq for fsync: */ /* We need an inode update to update bi_journal_seq for fsync: */
ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
adjust_i_size(trans, inum, 0, 0) ?: adjust_i_size(trans, inum, 0, 0) ?:
bch2_logged_op_update(trans, &op->k_i)); bch2_logged_op_update(trans, &op->k_i));
} }

View File

@ -526,7 +526,7 @@ out:
static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{ {
bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_rbio_narrow_crcs(trans, rbio)); __bch2_rbio_narrow_crcs(trans, rbio));
} }
@ -1025,7 +1025,7 @@ get_bio:
trans->notrace_relock_fail = true; trans->notrace_relock_fail = true;
} else { } else {
/* Attempting reconstruct read: */ /* Attempting reconstruct read: */
if (bch2_ec_read_extent(c, rbio)) { if (bch2_ec_read_extent(trans, rbio)) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out; goto out;
} }

View File

@ -202,6 +202,17 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
struct btree_iter iter; struct btree_iter iter;
struct bkey_i *k; struct bkey_i *k;
struct bkey_i_inode_v3 *inode; struct bkey_i_inode_v3 *inode;
/*
* Crazy performance optimization:
* Every extent update needs to also update the inode: the inode trigger
* will set bi->journal_seq to the journal sequence number of this
* transaction - for fsync.
*
* But if that's the only reason we're updating the inode (we're not
* updating bi_size or bi_sectors), then we don't need the inode update
* to be journalled - if we crash, the bi_journal_seq update will be
* lost, but that's fine.
*/
unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
int ret; int ret;
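The comment's logic in miniature: start from the cheap don't-journal default and clear it only when the update touches state that must survive a crash. A toy sketch under assumed names (not the btree update API):

#include <stdio.h>

#define UPDATE_NOJOURNAL        (1U << 0)

struct inode_delta {
        long    i_size_delta;
        long    i_sectors_delta;
};

static unsigned inode_update_flags(const struct inode_delta *d)
{
        unsigned flags = UPDATE_NOJOURNAL;

        /*
         * Losing bi_journal_seq across a crash is harmless; losing a
         * size/sector change is not, so those force the update into
         * the journal:
         */
        if (d->i_size_delta || d->i_sectors_delta)
                flags &= ~UPDATE_NOJOURNAL;
        return flags;
}

int main(void)
{
        struct inode_delta seq_only = { 0, 0 };
        struct inode_delta grew = { 4096, 8 };

        printf("seq-only update journalled: %s\n",
               inode_update_flags(&seq_only) & UPDATE_NOJOURNAL ? "no" : "yes");
        printf("size-changing update journalled: %s\n",
               inode_update_flags(&grew) & UPDATE_NOJOURNAL ? "no" : "yes");
        return 0;
}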
@ -305,8 +316,8 @@ int bch2_extent_update(struct btree_trans *trans,
i_sectors_delta) ?: i_sectors_delta) ?:
bch2_trans_update(trans, iter, k, 0) ?: bch2_trans_update(trans, iter, k, 0) ?:
bch2_trans_commit(trans, disk_res, NULL, bch2_trans_commit(trans, disk_res, NULL,
BTREE_INSERT_NOCHECK_RW| BCH_TRANS_COMMIT_no_check_rw|
BTREE_INSERT_NOFAIL); BCH_TRANS_COMMIT_no_enospc);
if (unlikely(ret)) if (unlikely(ret))
return ret; return ret;
@ -1165,7 +1176,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents, ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
bkey_start_pos(&orig->k), orig->k.p, bkey_start_pos(&orig->k), orig->k.p,
BTREE_ITER_INTENT, k, BTREE_ITER_INTENT, k,
NULL, NULL, BTREE_INSERT_NOFAIL, ({ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
})); }));

View File

@ -361,11 +361,6 @@ static int journal_entry_open(struct journal *j)
} while ((v = atomic64_cmpxchg(&j->reservations.counter, } while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v); old.v, new.v)) != old.v);
if (j->res_get_blocked_start)
bch2_time_stats_update(j->blocked_time,
j->res_get_blocked_start);
j->res_get_blocked_start = 0;
mod_delayed_work(c->io_complete_wq, mod_delayed_work(c->io_complete_wq,
&j->write_work, &j->write_work,
msecs_to_jiffies(c->opts.journal_flush_delay)); msecs_to_jiffies(c->opts.journal_flush_delay));
@ -465,15 +460,12 @@ retry:
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
ret = journal_entry_open(j); ret = journal_entry_open(j);
if (ret == JOURNAL_ERR_max_in_flight) if (ret == JOURNAL_ERR_max_in_flight) {
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
&j->max_in_flight_start, true);
trace_and_count(c, journal_entry_full, c); trace_and_count(c, journal_entry_full, c);
unlock:
if ((ret && ret != JOURNAL_ERR_insufficient_devices) &&
!j->res_get_blocked_start) {
j->res_get_blocked_start = local_clock() ?: 1;
trace_and_count(c, journal_full, c);
} }
unlock:
can_discard = j->can_discard; can_discard = j->can_discard;
spin_unlock(&j->lock); spin_unlock(&j->lock);
@ -526,36 +518,6 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
return ret; return ret;
} }
/* journal_preres: */
static bool journal_preres_available(struct journal *j,
struct journal_preres *res,
unsigned new_u64s,
unsigned flags)
{
bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true);
if (!ret && mutex_trylock(&j->reclaim_lock)) {
bch2_journal_reclaim(j);
mutex_unlock(&j->reclaim_lock);
}
return ret;
}
int __bch2_journal_preres_get(struct journal *j,
struct journal_preres *res,
unsigned new_u64s,
unsigned flags)
{
int ret;
closure_wait_event(&j->preres_wait,
(ret = bch2_journal_error(j)) ||
journal_preres_available(j, res, new_u64s, flags));
return ret;
}
/* journal_entry_res: */ /* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *j, void bch2_journal_entry_res_resize(struct journal *j,
@ -1290,6 +1252,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
union journal_res_state s; union journal_res_state s;
struct bch_dev *ca; struct bch_dev *ca;
unsigned long now = jiffies; unsigned long now = jiffies;
u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes;
u64 seq; u64 seq;
unsigned i; unsigned i;
@ -1303,21 +1266,23 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size);
prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j));
prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk);
prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]);
prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); prt_printf(out, "average write size:\t");
prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0);
prt_newline(out);
prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);
prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
? jiffies_to_msecs(j->next_reclaim - jiffies) : 0); ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
prt_printf(out, "current entry:\t\t"); prt_printf(out, "current entry:\t\t");
switch (s.cur_entry_offset) { switch (s.cur_entry_offset) {

View File

@ -395,104 +395,6 @@ out:
return 0; return 0;
} }
/* journal_preres: */
static inline void journal_set_watermark(struct journal *j)
{
union journal_preres_state s = READ_ONCE(j->prereserved);
unsigned watermark = BCH_WATERMARK_stripe;
if (fifo_free(&j->pin) < j->pin.size / 4)
watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
if (fifo_free(&j->pin) < j->pin.size / 8)
watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
if (s.reserved > s.remaining)
watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
if (!s.remaining)
watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
if (watermark == j->watermark)
return;
swap(watermark, j->watermark);
if (watermark > j->watermark)
journal_wake(j);
}
static inline void bch2_journal_preres_put(struct journal *j,
struct journal_preres *res)
{
union journal_preres_state s = { .reserved = res->u64s };
if (!res->u64s)
return;
s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
res->u64s = 0;
if (unlikely(s.waiting)) {
clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)),
(unsigned long *) &j->prereserved.v);
closure_wake_up(&j->preres_wait);
}
if (s.reserved <= s.remaining && j->watermark)
journal_set_watermark(j);
}
int __bch2_journal_preres_get(struct journal *,
struct journal_preres *, unsigned, unsigned);
static inline int bch2_journal_preres_get_fast(struct journal *j,
struct journal_preres *res,
unsigned new_u64s,
unsigned flags,
bool set_waiting)
{
int d = new_u64s - res->u64s;
union journal_preres_state old, new;
u64 v = atomic64_read(&j->prereserved.counter);
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
int ret;
do {
old.v = new.v = v;
ret = 0;
if (watermark == BCH_WATERMARK_reclaim ||
new.reserved + d < new.remaining) {
new.reserved += d;
ret = 1;
} else if (set_waiting && !new.waiting)
new.waiting = true;
else
return 0;
} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
old.v, new.v)) != old.v);
if (ret)
res->u64s += d;
return ret;
}
static inline int bch2_journal_preres_get(struct journal *j,
struct journal_preres *res,
unsigned new_u64s,
unsigned flags)
{
if (new_u64s <= res->u64s)
return 0;
if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false))
return 0;
if (flags & JOURNAL_RES_GET_NONBLOCK)
return -BCH_ERR_journal_preres_get_blocked;
return __bch2_journal_preres_get(j, res, new_u64s, flags);
}
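The removed fast path above is worth reading as a pattern: two counters packed into one 64-bit word, updated by retrying a compare-and-exchange until it lands. A minimal standalone sketch of that pattern, using C11 atomics and a made-up packed counter rather than the bcachefs types:

/* Lock-free "reserve from a packed counter" sketch; hypothetical types. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

union packed {
	uint64_t v;
	struct {
		uint32_t reserved;
		uint32_t remaining;
	};
};

static bool try_reserve(_Atomic uint64_t *state, uint32_t n)
{
	union packed old, new;

	old.v = atomic_load(state);
	do {
		new.v = old.v;
		if (new.reserved + n > new.remaining)
			return false;	/* full: caller falls back to a slow path */
		new.reserved += n;
		/* on cmpxchg failure old.v is refreshed, so new is recomputed */
	} while (!atomic_compare_exchange_weak(state, &old.v, new.v));

	return true;
}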
/* journal_entry_res: */ /* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *, void bch2_journal_entry_res_resize(struct journal *,


@ -1079,6 +1079,12 @@ found:
if (ja->bucket_seq[ja->cur_idx] && if (ja->bucket_seq[ja->cur_idx] &&
ja->sectors_free == ca->mi.bucket_size) { ja->sectors_free == ca->mi.bucket_size) {
#if 0
/*
* Debug code for ZNS support, where we (probably) want to
* correlate where we stopped in the journal with the zone write
* points:
*/
bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
for (i = 0; i < 3; i++) { for (i = 0; i < 3; i++) {
@ -1086,6 +1092,7 @@ found:
bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
} }
#endif
ja->sectors_free = 0; ja->sectors_free = 0;
} }
@ -1585,6 +1592,9 @@ static void journal_write_done(struct closure *cl)
bch2_journal_space_available(j); bch2_journal_space_available(j);
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
&j->max_in_flight_start, false);
closure_wake_up(&w->wait); closure_wake_up(&w->wait);
journal_wake(j); journal_wake(j);
@ -1678,9 +1688,15 @@ static void do_journal_write(struct closure *cl)
continue_at(cl, journal_write_done, c->io_complete_wq); continue_at(cl, journal_write_done, c->io_complete_wq);
} }
static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset) static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
{ {
struct jset_entry *i, *next, *prev = NULL; struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct jset_entry *start, *end, *i, *next, *prev = NULL;
struct jset *jset = w->data;
unsigned sectors, bytes, u64s;
bool validate_before_checksum = false;
unsigned long btree_roots_have = 0;
int ret;
/* /*
* Simple compaction, dropping empty jset_entries (from journal * Simple compaction, dropping empty jset_entries (from journal
@ -1697,8 +1713,20 @@ static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset
if (!u64s) if (!u64s)
continue; continue;
if (i->type == BCH_JSET_ENTRY_btree_root) /*
* New btree roots are set by journalling them; when the journal
* entry gets written we have to propagate them to
* c->btree_roots
*
* But, every journal entry we write has to contain all the
* btree roots (at least for now); so after we copy btree roots
* to c->btree_roots we have to get any missing btree roots and
* add them to this journal entry:
*/
if (i->type == BCH_JSET_ENTRY_btree_root) {
bch2_journal_entry_to_btree_root(c, i); bch2_journal_entry_to_btree_root(c, i);
__set_bit(i->btree_id, &btree_roots_have);
}
/* Can we merge with previous entry? */ /* Can we merge with previous entry? */
if (prev && if (prev &&
@ -1722,85 +1750,10 @@ static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset
prev = prev ? vstruct_next(prev) : jset->start; prev = prev ? vstruct_next(prev) : jset->start;
jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
}
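btree_roots_have is an ordinary bitmap of btree IDs: roots already journalled get their bit set while compacting, and bch2_btree_roots_to_journal_entries() later appends only the missing ones. A small illustrative sketch of that bookkeeping (the loop and helper below are hypothetical, not the bcachefs API):

/* Roots already present in the entry get a bit; append the rest. */
unsigned long have = 0;

__set_bit(BTREE_ID_extents, &have);		/* seen while walking the entry */

for (unsigned id = 0; id < BTREE_ID_NR; id++)
	if (!test_bit(id, &have))
		append_missing_root(id);	/* hypothetical helper */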
void bch2_journal_write(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
struct jset_entry *start, *end;
struct jset *jset;
struct bio *bio;
struct printbuf journal_debug_buf = PRINTBUF;
bool validate_before_checksum = false;
unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
int ret;
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
journal_buf_realloc(j, w);
jset = w->data;
j->write_start_time = local_clock();
spin_lock(&j->lock);
/*
* If the journal is in an error state - we did an emergency shutdown -
* we prefer to continue doing journal writes. We just mark them as
* noflush so they'll never be used, but they'll still be visible to the
* list_journal tool - this helps in debugging.
*
* There's a caveat: the first journal write after marking the
* superblock dirty must always be a flush write, because on startup
* from a clean shutdown we didn't necessarily read the journal and the
* new journal write might overwrite whatever was in the journal
* previously - we can't leave the journal without any flush writes in
* it.
*
* So if we're in an error state, and we're still starting up, we don't
* write anything at all.
*/
if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) &&
(bch2_journal_error(j) ||
w->noflush ||
(!w->must_flush &&
(jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) {
w->noflush = true;
SET_JSET_NO_FLUSH(jset, true);
jset->last_seq = 0;
w->last_seq = 0;
j->nr_noflush_writes++;
} else if (!bch2_journal_error(j)) {
j->last_flush_write = jiffies;
j->nr_flush_writes++;
clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
} else {
spin_unlock(&j->lock);
goto err;
}
spin_unlock(&j->lock);
/*
* New btree roots are set by journalling them; when the journal entry
* gets written we have to propagate them to c->btree_roots
*
* But, every journal entry we write has to contain all the btree roots
* (at least for now); so after we copy btree roots to c->btree_roots we
* have to get any missing btree roots and add them to this journal
* entry:
*/
bch2_journal_entries_postprocess(c, jset);
start = end = vstruct_last(jset); start = end = vstruct_last(jset);
end = bch2_btree_roots_to_journal_entries(c, jset->start, end); end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
bch2_journal_super_entries_add_common(c, &end, bch2_journal_super_entries_add_common(c, &end,
le64_to_cpu(jset->seq)); le64_to_cpu(jset->seq));
@ -1816,7 +1769,7 @@ void bch2_journal_write(struct closure *cl)
bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)", bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
vstruct_bytes(jset), w->sectors << 9, vstruct_bytes(jset), w->sectors << 9,
u64s, w->u64s_reserved, j->entry_u64s_reserved); u64s, w->u64s_reserved, j->entry_u64s_reserved);
goto err; return -EINVAL;
} }
jset->magic = cpu_to_le64(jset_magic(c)); jset->magic = cpu_to_le64(jset_magic(c));
@ -1835,37 +1788,119 @@ void bch2_journal_write(struct closure *cl)
validate_before_checksum = true; validate_before_checksum = true;
if (validate_before_checksum && if (validate_before_checksum &&
jset_validate(c, NULL, jset, 0, WRITE)) (ret = jset_validate(c, NULL, jset, 0, WRITE)))
goto err; return ret;
ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start, jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start); vstruct_end(jset) - (void *) jset->encrypted_start);
if (bch2_fs_fatal_err_on(ret, c, if (bch2_fs_fatal_err_on(ret, c,
"error decrypting journal entry: %i", ret)) "error decrypting journal entry: %i", ret))
goto err; return ret;
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
journal_nonce(jset), jset); journal_nonce(jset), jset);
if (!validate_before_checksum && if (!validate_before_checksum &&
jset_validate(c, NULL, jset, 0, WRITE)) (ret = jset_validate(c, NULL, jset, 0, WRITE)))
goto err; return ret;
memset((void *) jset + bytes, 0, (sectors << 9) - bytes); memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
return 0;
}
retry_alloc: static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
spin_lock(&j->lock); {
ret = journal_write_alloc(j, w); struct bch_fs *c = container_of(j, struct bch_fs, journal);
int error = bch2_journal_error(j);
if (ret && j->can_discard) { /*
spin_unlock(&j->lock); * If the journal is in an error state - we did an emergency shutdown -
bch2_journal_do_discards(j); * we prefer to continue doing journal writes. We just mark them as
goto retry_alloc; * noflush so they'll never be used, but they'll still be visible to the
* list_journal tool - this helps in debugging.
*
* There's a caveat: the first journal write after marking the
* superblock dirty must always be a flush write, because on startup
* from a clean shutdown we didn't necessarily read the journal and the
* new journal write might overwrite whatever was in the journal
* previously - we can't leave the journal without any flush writes in
* it.
*
* So if we're in an error state, and we're still starting up, we don't
* write anything at all.
*/
if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags))
return -EIO;
if (error ||
w->noflush ||
(!w->must_flush &&
(jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
w->noflush = true;
SET_JSET_NO_FLUSH(w->data, true);
w->data->last_seq = 0;
w->last_seq = 0;
j->nr_noflush_writes++;
} else {
j->last_flush_write = jiffies;
j->nr_flush_writes++;
clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
} }
return 0;
}
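The flush-delay check above is the usual jiffies rate-limit idiom; spelled with time_after() it reads as follows (a fragment, with a hypothetical 1-second interval):

/* True once at least a second has passed since last_flush (sketch): */
static bool flush_due(unsigned long last_flush)
{
	return time_after(jiffies, last_flush + msecs_to_jiffies(1000));
}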
void bch2_journal_write(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
struct bio *bio;
struct printbuf journal_debug_buf = PRINTBUF;
unsigned i, nr_rw_members = 0;
int ret;
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
j->write_start_time = local_clock();
spin_lock(&j->lock);
ret = bch2_journal_write_pick_flush(j, w);
spin_unlock(&j->lock);
if (ret) if (ret)
goto err;
journal_buf_realloc(j, w);
ret = bch2_journal_write_prep(j, w);
if (ret)
goto err;
j->entry_bytes_written += vstruct_bytes(w->data);
while (1) {
spin_lock(&j->lock);
ret = journal_write_alloc(j, w);
if (!ret || !j->can_discard)
break;
spin_unlock(&j->lock);
bch2_journal_do_discards(j);
}
if (ret) {
__bch2_journal_debug_to_text(&journal_debug_buf, j); __bch2_journal_debug_to_text(&journal_debug_buf, j);
spin_unlock(&j->lock);
bch_err(c, "Unable to allocate journal write:\n%s",
journal_debug_buf.buf);
printbuf_exit(&journal_debug_buf);
goto err;
}
/* /*
* write is allocated, no longer need to account for it in * write is allocated, no longer need to account for it in
@ -1880,13 +1915,6 @@ retry_alloc:
bch2_journal_space_available(j); bch2_journal_space_available(j);
spin_unlock(&j->lock); spin_unlock(&j->lock);
if (ret) {
bch_err(c, "Unable to allocate journal write:\n%s",
journal_debug_buf.buf);
printbuf_exit(&journal_debug_buf);
goto err;
}
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
if (c->opts.nochanges) if (c->opts.nochanges)
@ -1908,7 +1936,7 @@ retry_alloc:
if (ret) if (ret)
goto err; goto err;
if (!JSET_NO_FLUSH(jset) && w->separate_flush) { if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
for_each_rw_member(ca, c, i) { for_each_rw_member(ca, c, i) {
percpu_ref_get(&ca->io_ref); percpu_ref_get(&ca->io_ref);


@ -50,16 +50,25 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
return available; return available;
} }
static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) static inline void journal_set_watermark(struct journal *j)
{ {
union journal_preres_state old, new; struct bch_fs *c = container_of(j, struct bch_fs, journal);
u64 v = atomic64_read(&j->prereserved.counter); bool low_on_space = j->space[journal_space_clean].total * 4 <=
j->space[journal_space_total].total;
bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
unsigned watermark = low_on_space || low_on_pin
? BCH_WATERMARK_reclaim
: BCH_WATERMARK_stripe;
do { if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
old.v = new.v = v; &j->low_on_space_start, low_on_space) ||
new.remaining = u64s_remaining; track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
} while ((v = atomic64_cmpxchg(&j->prereserved.counter, &j->low_on_pin_start, low_on_pin))
old.v, new.v)) != old.v); trace_and_count(c, journal_full, c);
swap(watermark, j->watermark);
if (watermark > j->watermark)
journal_wake(j);
} }
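low_on_space fires once clean journal space falls to a quarter of the total, i.e. clean * 4 <= total; with made-up bucket counts:

bool a = 256 * 4 <= 1024;	/* true:  25% clean -> BCH_WATERMARK_reclaim */
bool b = 400 * 4 <= 1024;	/* false: 39% clean -> BCH_WATERMARK_stripe  */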
static struct journal_space static struct journal_space
@ -162,7 +171,6 @@ void bch2_journal_space_available(struct journal *j)
struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca; struct bch_dev *ca;
unsigned clean, clean_ondisk, total; unsigned clean, clean_ondisk, total;
s64 u64s_remaining = 0;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9, unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9); j->buf[1].buf_size >> 9);
unsigned i, nr_online = 0, nr_devs_want; unsigned i, nr_online = 0, nr_devs_want;
@ -222,16 +230,10 @@ void bch2_journal_space_available(struct journal *j)
else else
clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
u64s_remaining = (u64) clean << 6; journal_set_watermark(j);
u64s_remaining -= (u64) total << 3;
u64s_remaining = max(0LL, u64s_remaining);
u64s_remaining /= 4;
u64s_remaining = min_t(u64, u64s_remaining, U32_MAX);
out: out:
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_error = ret; j->cur_entry_error = ret;
journal_set_remaining(j, u64s_remaining);
journal_set_watermark(j);
if (!ret) if (!ret)
journal_wake(j); journal_wake(j);
@ -369,15 +371,36 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
return JOURNAL_PIN_other; return JOURNAL_PIN_other;
} }
void bch2_journal_pin_set(struct journal *j, u64 seq, static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
struct journal_entry_pin *pin, struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn) journal_pin_flush_fn flush_fn,
enum journal_pin_type type)
{
struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
/*
* flush_fn is how we identify journal pins in debugfs, so must always
* exist, even if it doesn't do anything:
*/
BUG_ON(!flush_fn);
atomic_inc(&pin_list->count);
pin->seq = seq;
pin->flush = flush_fn;
list_add(&pin->list, &pin_list->list[type]);
}
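Because flush_fn doubles as the debugfs identity of a pin, callers that have nothing to write back still pass a function; a hedged sketch (the no-op and its caller are hypothetical):

static int noop_flush(struct journal *j,
		      struct journal_entry_pin *pin, u64 seq)
{
	return 0;	/* nothing to flush; pin exists for identity only */
}

/* ... */
bch2_journal_pin_add(j, seq, &my_pin, noop_flush);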
void bch2_journal_pin_copy(struct journal *j,
struct journal_entry_pin *dst,
struct journal_entry_pin *src,
journal_pin_flush_fn flush_fn)
{ {
struct journal_entry_pin_list *pin_list;
bool reclaim; bool reclaim;
spin_lock(&j->lock); spin_lock(&j->lock);
u64 seq = READ_ONCE(src->seq);
if (seq < journal_last_seq(j)) { if (seq < journal_last_seq(j)) {
/* /*
* bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
@ -389,18 +412,34 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
return; return;
} }
pin_list = journal_seq_pin(j, seq); reclaim = __journal_pin_drop(j, dst);
bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
spin_unlock(&j->lock);
/*
* If the journal is currently full, we might want to call flush_fn
* immediately:
*/
journal_wake(j);
}
void bch2_journal_pin_set(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
bool reclaim;
spin_lock(&j->lock);
BUG_ON(seq < journal_last_seq(j));
reclaim = __journal_pin_drop(j, pin); reclaim = __journal_pin_drop(j, pin);
atomic_inc(&pin_list->count); bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
pin->seq = seq;
pin->flush = flush_fn;
if (flush_fn)
list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]);
else
list_add(&pin->list, &pin_list->flushed);
if (reclaim) if (reclaim)
bch2_journal_reclaim_fast(j); bch2_journal_reclaim_fast(j);
@ -555,11 +594,6 @@ static u64 journal_seq_to_flush(struct journal *j)
/* Try to keep the journal at most half full: */ /* Try to keep the journal at most half full: */
nr_buckets = ja->nr / 2; nr_buckets = ja->nr / 2;
/* And include pre-reservations: */
nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
(ca->mi.bucket_size << 6) -
journal_entry_overhead(j));
nr_buckets = min(nr_buckets, ja->nr); nr_buckets = min(nr_buckets, ja->nr);
bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
@ -638,10 +672,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
msecs_to_jiffies(c->opts.journal_reclaim_delay))) msecs_to_jiffies(c->opts.journal_reclaim_delay)))
min_nr = 1; min_nr = 1;
if (j->prereserved.reserved * 4 > j->prereserved.remaining) if (j->watermark != BCH_WATERMARK_stripe)
min_nr = 1;
if (fifo_free(&j->pin) <= 32)
min_nr = 1; min_nr = 1;
if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
@ -652,8 +683,6 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
trace_and_count(c, journal_reclaim_start, c, trace_and_count(c, journal_reclaim_start, c,
direct, kicked, direct, kicked,
min_nr, min_key_cache, min_nr, min_key_cache,
j->prereserved.reserved,
j->prereserved.remaining,
atomic_read(&c->btree_cache.dirty), atomic_read(&c->btree_cache.dirty),
c->btree_cache.used, c->btree_cache.used,
atomic_long_read(&c->btree_key_cache.nr_dirty), atomic_long_read(&c->btree_key_cache.nr_dirty),
@ -805,6 +834,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{ {
/* TODO: track this with time_stats */
bool did_work = false; bool did_work = false;
if (!test_bit(JOURNAL_STARTED, &j->flags)) if (!test_bit(JOURNAL_STARTED, &j->flags))


@ -47,17 +47,10 @@ static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
bch2_journal_pin_set(j, seq, pin, flush_fn); bch2_journal_pin_set(j, seq, pin, flush_fn);
} }
static inline void bch2_journal_pin_copy(struct journal *j, void bch2_journal_pin_copy(struct journal *,
struct journal_entry_pin *dst, struct journal_entry_pin *,
struct journal_entry_pin *src, struct journal_entry_pin *,
journal_pin_flush_fn flush_fn) journal_pin_flush_fn);
{
/* Guard against racing with journal_pin_drop(src): */
u64 seq = READ_ONCE(src->seq);
if (seq)
bch2_journal_pin_add(j, seq, dst, flush_fn);
}
static inline void bch2_journal_pin_update(struct journal *j, u64 seq, static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
struct journal_entry_pin *pin, struct journal_entry_pin *pin,


@ -76,14 +76,6 @@ struct journal_res {
u64 seq; u64 seq;
}; };
/*
* For reserving space in the journal prior to getting a reservation on a
* particular journal entry:
*/
struct journal_preres {
unsigned u64s;
};
union journal_res_state { union journal_res_state {
struct { struct {
atomic64_t counter; atomic64_t counter;
@ -104,22 +96,6 @@ union journal_res_state {
}; };
}; };
union journal_preres_state {
struct {
atomic64_t counter;
};
struct {
u64 v;
};
struct {
u64 waiting:1,
reserved:31,
remaining:32;
};
};
/* bytes: */ /* bytes: */
#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ #define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ #define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */
@ -180,8 +156,6 @@ struct journal {
union journal_res_state reservations; union journal_res_state reservations;
enum bch_watermark watermark; enum bch_watermark watermark;
union journal_preres_state prereserved;
} __aligned(SMP_CACHE_BYTES); } __aligned(SMP_CACHE_BYTES);
unsigned long flags; unsigned long flags;
@ -288,15 +262,18 @@ struct journal {
unsigned long last_flush_write; unsigned long last_flush_write;
u64 res_get_blocked_start;
u64 write_start_time; u64 write_start_time;
u64 nr_flush_writes; u64 nr_flush_writes;
u64 nr_noflush_writes; u64 nr_noflush_writes;
u64 entry_bytes_written;
u64 low_on_space_start;
u64 low_on_pin_start;
u64 max_in_flight_start;
struct bch2_time_stats *flush_write_time; struct bch2_time_stats *flush_write_time;
struct bch2_time_stats *noflush_write_time; struct bch2_time_stats *noflush_write_time;
struct bch2_time_stats *blocked_time;
struct bch2_time_stats *flush_seq_time; struct bch2_time_stats *flush_seq_time;
#ifdef CONFIG_DEBUG_LOCK_ALLOC #ifdef CONFIG_DEBUG_LOCK_ALLOC


@ -85,13 +85,13 @@ static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
{ {
return commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_logged_op_start(trans, k)); __bch2_logged_op_start(trans, k));
} }
void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k) void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
{ {
int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0)); bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
/* /*
* This needs to be a fatal error because we've left an unfinished * This needs to be a fatal error because we've left an unfinished


@ -155,7 +155,7 @@ int bch2_check_lrus(struct bch_fs *c)
ret = bch2_trans_run(c, ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, for_each_btree_key_commit(trans, iter,
BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
bch2_check_lru_key(trans, &iter, k, &last_flushed_pos))); bch2_check_lru_key(trans, &iter, k, &last_flushed_pos)));
if (ret) if (ret)
bch_err_fn(c, ret); bch_err_fn(c, ret);


@ -90,7 +90,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
NULL, NULL, BTREE_INSERT_NOFAIL, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags)); bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
if (ret) if (ret)
break; break;


@ -263,7 +263,7 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans,
return bch2_trans_relock(trans) ?: return bch2_trans_relock(trans) ?:
bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
} }
int bch2_move_extent(struct moving_context *ctxt, int bch2_move_extent(struct moving_context *ctxt,


@ -370,6 +370,7 @@ static int bch2_copygc_thread(void *arg)
if (min_member_capacity == U64_MAX) if (min_member_capacity == U64_MAX)
min_member_capacity = 128 * 2048; min_member_capacity = 128 * 2048;
bch2_trans_unlock_long(ctxt.trans);
bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
MAX_SCHEDULE_TIMEOUT); MAX_SCHEDULE_TIMEOUT);
} }


@ -69,7 +69,7 @@ err:
int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{ {
int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
__bch2_set_rebalance_needs_scan(trans, inum)); __bch2_set_rebalance_needs_scan(trans, inum));
rebalance_wakeup(c); rebalance_wakeup(c);
return ret; return ret;
@ -125,7 +125,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
extent_entry_drop(bkey_i_to_s(n), extent_entry_drop(bkey_i_to_s(n),
(void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n))); (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
} }
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
@ -273,7 +273,7 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
r->state = BCH_REBALANCE_scanning; r->state = BCH_REBALANCE_scanning;
ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?: ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_clear_rebalance_needs_scan(trans, inum, cookie)); bch2_clear_rebalance_needs_scan(trans, inum, cookie));
bch2_move_stats_exit(&r->scan_stats, trans->c); bch2_move_stats_exit(&r->scan_stats, trans->c);


@ -98,6 +98,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
unsigned update_flags = BTREE_TRIGGER_NORUN; unsigned update_flags = BTREE_TRIGGER_NORUN;
int ret; int ret;
if (k->overwritten)
return 0;
trans->journal_res.seq = k->journal_seq;
/* /*
* BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to
* keep the key cache coherent with the underlying btree. Nothing * keep the key cache coherent with the underlying btree. Nothing
@ -139,27 +144,14 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
static int bch2_journal_replay(struct bch_fs *c) static int bch2_journal_replay(struct bch_fs *c)
{ {
struct journal_keys *keys = &c->journal_keys; struct journal_keys *keys = &c->journal_keys;
struct journal_key **keys_sorted, *k; DARRAY(struct journal_key *) keys_sorted = { 0 };
struct journal_key **kp;
struct journal *j = &c->journal; struct journal *j = &c->journal;
u64 start_seq = c->journal_replay_seq_start; u64 start_seq = c->journal_replay_seq_start;
u64 end_seq = c->journal_replay_seq_start; u64 end_seq = c->journal_replay_seq_start;
size_t i; struct btree_trans *trans = bch2_trans_get(c);
int ret; int ret;
move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
keys->gap = keys->nr;
keys_sorted = kvmalloc_array(keys->nr, sizeof(*keys_sorted), GFP_KERNEL);
if (!keys_sorted)
return -BCH_ERR_ENOMEM_journal_replay;
for (i = 0; i < keys->nr; i++)
keys_sorted[i] = &keys->d[i];
sort(keys_sorted, keys->nr,
sizeof(keys_sorted[0]),
journal_sort_seq_cmp, NULL);
if (keys->nr) { if (keys->nr) {
ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
keys->nr, start_seq, end_seq); keys->nr, start_seq, end_seq);
@ -167,27 +159,61 @@ static int bch2_journal_replay(struct bch_fs *c)
goto err; goto err;
} }
for (i = 0; i < keys->nr; i++) { /*
k = keys_sorted[i]; * First, attempt to replay keys in sorted order. This is more
* efficient, but some might fail if that would cause a journal
* deadlock.
*/
for (size_t i = 0; i < keys->nr; i++) {
cond_resched(); cond_resched();
struct journal_key *k = keys->d + i;
ret = commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_journal_reclaim|
(!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
bch2_journal_replay_key(trans, k));
BUG_ON(!ret && !k->overwritten);
if (ret) {
ret = darray_push(&keys_sorted, k);
if (ret)
goto err;
}
}
/*
* Now, replay any remaining keys in the order in which they appear in
* the journal, unpinning those journal entries as we go:
*/
sort(keys_sorted.data, keys_sorted.nr,
sizeof(keys_sorted.data[0]),
journal_sort_seq_cmp, NULL);
darray_for_each(keys_sorted, kp) {
cond_resched();
struct journal_key *k = *kp;
replay_now_at(j, k->journal_seq); replay_now_at(j, k->journal_seq);
ret = bch2_trans_do(c, NULL, NULL, ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_LAZY_RW| BCH_TRANS_COMMIT_no_enospc|
BTREE_INSERT_NOFAIL| (!k->allocated
(!k->allocated ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim : 0),
: 0),
bch2_journal_replay_key(trans, k)); bch2_journal_replay_key(trans, k));
if (ret) { bch_err_msg(c, ret, "while replaying key at btree %s level %u:",
bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s", bch2_btree_id_str(k->btree_id), k->level);
bch2_btree_id_str(k->btree_id), k->level, bch2_err_str(ret)); if (ret)
goto err; goto err;
}
BUG_ON(!k->overwritten);
} }
bch2_trans_put(trans);
trans = NULL;
replay_now_at(j, j->replay_journal_seq_end); replay_now_at(j, j->replay_journal_seq_end);
j->replay_journal_seq = 0; j->replay_journal_seq = 0;
@ -198,10 +224,10 @@ static int bch2_journal_replay(struct bch_fs *c)
if (keys->nr && !ret) if (keys->nr && !ret)
bch2_journal_log_msg(c, "journal replay finished"); bch2_journal_log_msg(c, "journal replay finished");
err: err:
kvfree(keys_sorted); if (trans)
bch2_trans_put(trans);
if (ret) darray_exit(&keys_sorted);
bch_err_fn(c, ret); bch_err_fn(c, ret);
return ret; return ret;
} }
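The shape of the replay strategy, reduced to its essentials: apply items in the cheap order, queue the failures, then retry them in the mandatory order. A generic sketch with hypothetical item and comparator types, not the bcachefs API:

#include <errno.h>
#include <stdlib.h>

struct item;
int apply(struct item *);				/* hypothetical */
int cmp_mandatory_order(const void *, const void *);	/* hypothetical */

static int apply_all(struct item **items, size_t nr)
{
	struct item **retry = malloc(nr * sizeof(*retry));
	size_t nr_retry = 0;
	int ret = 0;

	if (!retry)
		return -ENOMEM;

	/* pass 1: opportunistic, cheapest order */
	for (size_t i = 0; i < nr; i++)
		if (apply(items[i]))
			retry[nr_retry++] = items[i];

	/* pass 2: strict order for whatever failed */
	qsort(retry, nr_retry, sizeof(*retry), cmp_mandatory_order);
	for (size_t i = 0; i < nr_retry && !ret; i++)
		ret = apply(retry[i]);

	free(retry);
	return ret;
}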
@ -468,7 +494,7 @@ err:
noinline_for_stack noinline_for_stack
static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
{ {
int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
__bch2_fs_upgrade_for_subvolumes(trans)); __bch2_fs_upgrade_for_subvolumes(trans));
if (ret) if (ret)
bch_err_fn(c, ret); bch_err_fn(c, ret);
@ -489,7 +515,19 @@ static int bch2_check_allocations(struct bch_fs *c)
static int bch2_set_may_go_rw(struct bch_fs *c) static int bch2_set_may_go_rw(struct bch_fs *c)
{ {
struct journal_keys *keys = &c->journal_keys;
/*
* After we go RW, the journal keys buffer can't be modified (except for
* setting journal_key->overwritten): it will be accessed by multiple
* threads
*/
move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
keys->gap = keys->nr;
set_bit(BCH_FS_MAY_GO_RW, &c->flags); set_bit(BCH_FS_MAY_GO_RW, &c->flags);
if (keys->nr)
return bch2_fs_read_write_early(c);
return 0; return 0;
} }
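move_gap() here pushes the gap of the journal-keys gap buffer to the end, so concurrent readers see one contiguous array. The core of such a compaction is a single memmove; a sketch under the same (d, nr, size, gap) layout, not the real helper:

#include <string.h>

/* live keys: d[0..gap) and d[gap + (size - nr) .. size); gap width = size - nr */
static void gap_to_end(struct journal_key *d, size_t nr, size_t size, size_t gap)
{
	memmove(&d[gap],
		&d[gap + (size - nr)],
		(nr - gap) * sizeof(*d));
	/* all nr keys are now contiguous in d[0..nr) */
}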


@ -390,7 +390,7 @@ s64 bch2_remap_range(struct bch_fs *c,
inode_u.bi_size = new_i_size; inode_u.bi_size = new_i_size;
ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?: ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
bch2_trans_commit(trans, NULL, NULL, bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL); BCH_TRANS_COMMIT_no_enospc);
} }
bch2_trans_iter_exit(trans, &inode_iter); bch2_trans_iter_exit(trans, &inode_iter);


@ -376,7 +376,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
entry = sb_clean->start; entry = sb_clean->start;
bch2_journal_super_entries_add_common(c, &entry, 0); bch2_journal_super_entries_add_common(c, &entry, 0);
entry = bch2_btree_roots_to_journal_entries(c, entry, entry); entry = bch2_btree_roots_to_journal_entries(c, entry, 0);
BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
memset(entry, 0, memset(entry, 0,


@ -70,7 +70,7 @@ static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb,
prt_tab(out); prt_tab(out);
prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i])); prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i]));
prt_tab(out); prt_tab(out);
bch2_prt_date_seconds(out, le64_to_cpu(e->entries[i].last_error_time)); bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time));
prt_newline(out); prt_newline(out);
} }
} }


@ -230,7 +230,7 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "Last mount:"); prt_printf(out, "Last mount:");
prt_tab(out); prt_tab(out);
if (m.last_mount) if (m.last_mount)
bch2_prt_date_seconds(out, le64_to_cpu(m.last_mount)); bch2_prt_datetime(out, le64_to_cpu(m.last_mount));
else else
prt_printf(out, "(never)"); prt_printf(out, "(never)");
prt_newline(out); prt_newline(out);


@ -590,7 +590,7 @@ int bch2_check_snapshot_trees(struct bch_fs *c)
for_each_btree_key_commit(trans, iter, for_each_btree_key_commit(trans, iter,
BTREE_ID_snapshot_trees, POS_MIN, BTREE_ID_snapshot_trees, POS_MIN,
BTREE_ITER_PREFETCH, k, BTREE_ITER_PREFETCH, k,
NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_snapshot_tree(trans, &iter, k))); check_snapshot_tree(trans, &iter, k)));
if (ret) if (ret)
@ -868,7 +868,7 @@ int bch2_check_snapshots(struct bch_fs *c)
for_each_btree_key_reverse_commit(trans, iter, for_each_btree_key_reverse_commit(trans, iter,
BTREE_ID_snapshots, POS_MAX, BTREE_ID_snapshots, POS_MAX,
BTREE_ITER_PREFETCH, k, BTREE_ITER_PREFETCH, k,
NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_snapshot(trans, &iter, k))); check_snapshot(trans, &iter, k)));
if (ret) if (ret)
bch_err_fn(c, ret); bch_err_fn(c, ret);
@ -959,7 +959,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
parent_id, id)) parent_id, id))
goto err; goto err;
parent->v.children[i] = le32_to_cpu(child_id); parent->v.children[i] = cpu_to_le32(child_id);
normalize_snapshot_child_pointers(&parent->v); normalize_snapshot_child_pointers(&parent->v);
} }
@ -1449,12 +1449,12 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
ret = for_each_btree_key_commit(trans, iter, ret = for_each_btree_key_commit(trans, iter,
id, POS_MIN, id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
&res, NULL, BTREE_INSERT_NOFAIL, &res, NULL, BCH_TRANS_COMMIT_no_enospc,
snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
for_each_btree_key_commit(trans, iter, for_each_btree_key_commit(trans, iter,
id, POS_MIN, id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
&res, NULL, BTREE_INSERT_NOFAIL, &res, NULL, BCH_TRANS_COMMIT_no_enospc,
move_key_to_correct_snapshot(trans, &iter, k)); move_key_to_correct_snapshot(trans, &iter, k));
bch2_disk_reservation_put(c, &res); bch2_disk_reservation_put(c, &res);
@ -1489,7 +1489,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
*/ */
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
BTREE_ITER_INTENT, k, BTREE_ITER_INTENT, k,
NULL, NULL, BTREE_INSERT_NOFAIL, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior)); bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
if (ret) if (ret)
goto err_create_lock; goto err_create_lock;


@ -15,6 +15,16 @@
#include <crypto/hash.h> #include <crypto/hash.h>
#include <crypto/sha2.h> #include <crypto/sha2.h>
typedef unsigned __bitwise bch_str_hash_flags_t;
enum bch_str_hash_flags {
__BCH_HASH_SET_MUST_CREATE,
__BCH_HASH_SET_MUST_REPLACE,
};
#define BCH_HASH_SET_MUST_CREATE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_CREATE)
#define BCH_HASH_SET_MUST_REPLACE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_REPLACE)
static inline enum bch_str_hash_type static inline enum bch_str_hash_type
bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
{ {
@ -246,7 +256,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans,
const struct bch_hash_info *info, const struct bch_hash_info *info,
subvol_inum inum, u32 snapshot, subvol_inum inum, u32 snapshot,
struct bkey_i *insert, struct bkey_i *insert,
int flags, bch_str_hash_flags_t str_hash_flags,
int update_flags) int update_flags)
{ {
struct btree_iter iter, slot = { NULL }; struct btree_iter iter, slot = { NULL };
@ -269,7 +279,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans,
} }
if (!slot.path && if (!slot.path &&
!(flags & BCH_HASH_SET_MUST_REPLACE)) !(str_hash_flags & BCH_HASH_SET_MUST_REPLACE))
bch2_trans_copy_iter(&slot, &iter); bch2_trans_copy_iter(&slot, &iter);
if (k.k->type != KEY_TYPE_hash_whiteout) if (k.k->type != KEY_TYPE_hash_whiteout)
@ -287,16 +297,16 @@ found:
found = true; found = true;
not_found: not_found:
if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { if (!found && (str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) {
ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
} else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { } else if (found && (str_hash_flags & BCH_HASH_SET_MUST_CREATE)) {
ret = -EEXIST; ret = -EEXIST;
} else { } else {
if (!found && slot.path) if (!found && slot.path)
swap(iter, slot); swap(iter, slot);
insert->k.p = iter.pos; insert->k.p = iter.pos;
ret = bch2_trans_update(trans, &iter, insert, 0); ret = bch2_trans_update(trans, &iter, insert, update_flags);
} }
goto out; goto out;
@ -307,7 +317,8 @@ int bch2_hash_set(struct btree_trans *trans,
const struct bch_hash_desc desc, const struct bch_hash_desc desc,
const struct bch_hash_info *info, const struct bch_hash_info *info,
subvol_inum inum, subvol_inum inum,
struct bkey_i *insert, int flags) struct bkey_i *insert,
bch_str_hash_flags_t str_hash_flags)
{ {
u32 snapshot; u32 snapshot;
int ret; int ret;
@ -319,7 +330,7 @@ int bch2_hash_set(struct btree_trans *trans,
insert->k.p.inode = inum.inum; insert->k.p.inode = inum.inum;
return bch2_hash_set_snapshot(trans, desc, info, inum, return bch2_hash_set_snapshot(trans, desc, info, inum,
snapshot, insert, flags, 0); snapshot, insert, str_hash_flags, 0);
} }
static __always_inline static __always_inline


@ -89,7 +89,7 @@ int bch2_check_subvols(struct bch_fs *c)
ret = bch2_trans_run(c, ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, for_each_btree_key_commit(trans, iter,
BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_subvol(trans, &iter, k))); check_subvol(trans, &iter, k)));
if (ret) if (ret)
bch_err_fn(c, ret); bch_err_fn(c, ret);
@ -219,7 +219,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
BTREE_ITER_CACHED, &s)) ?: BTREE_ITER_CACHED, &s)) ?:
for_each_btree_key_commit(trans, iter, for_each_btree_key_commit(trans, iter,
BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
NULL, NULL, BTREE_INSERT_NOFAIL, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_subvolume_reparent(trans, &iter, k, bch2_subvolume_reparent(trans, &iter, k,
subvolid_to_delete, le32_to_cpu(s.parent))); subvolid_to_delete, le32_to_cpu(s.parent)));
} }
@ -256,7 +256,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
{ {
return bch2_subvolumes_reparent(trans, subvolid) ?: return bch2_subvolumes_reparent(trans, subvolid) ?:
commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_subvolume_delete(trans, subvolid)); __bch2_subvolume_delete(trans, subvolid));
} }


@ -20,7 +20,7 @@ struct snapshot_t {
}; };
struct snapshot_table { struct snapshot_table {
struct snapshot_t s[0]; DECLARE_FLEX_ARRAY(struct snapshot_t, s);
}; };
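DECLARE_FLEX_ARRAY() replaces the old GNU zero-length s[0] member; allocations are then sized with struct_size(), which checks for overflow. A sketch of the allocation side (nr and the error handling are illustrative):

struct snapshot_table *t = kvzalloc(struct_size(t, s, nr), GFP_KERNEL);
if (!t)
	return -ENOMEM;
/* t->s[0] .. t->s[nr - 1] are now valid, properly typed entries */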
typedef struct { typedef struct {


@ -1183,7 +1183,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
prt_printf(out, "Created:"); prt_printf(out, "Created:");
prt_tab(out); prt_tab(out);
if (sb->time_base_lo) if (sb->time_base_lo)
bch2_prt_date_seconds(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
else else
prt_printf(out, "(not set)"); prt_printf(out, "(not set)");
prt_newline(out); prt_newline(out);


@ -641,7 +641,9 @@ static int bch2_fs_online(struct bch_fs *c)
ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
kobject_add(&c->internal, &c->kobj, "internal") ?: kobject_add(&c->internal, &c->kobj, "internal") ?:
kobject_add(&c->opts_dir, &c->kobj, "options") ?: kobject_add(&c->opts_dir, &c->kobj, "options") ?:
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
#endif
kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
bch2_opts_create_sysfs_files(&c->opts_dir); bch2_opts_create_sysfs_files(&c->opts_dir);
if (ret) { if (ret) {
@ -750,7 +752,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];
c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal];
c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
bch2_fs_btree_cache_init_early(&c->btree_cache); bch2_fs_btree_cache_init_early(&c->btree_cache);


@ -196,10 +196,9 @@ DEFINE_EVENT(bio, journal_write,
TRACE_EVENT(journal_reclaim_start, TRACE_EVENT(journal_reclaim_start,
TP_PROTO(struct bch_fs *c, bool direct, bool kicked, TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
u64 min_nr, u64 min_key_cache, u64 min_nr, u64 min_key_cache,
u64 prereserved, u64 prereserved_total,
u64 btree_cache_dirty, u64 btree_cache_total, u64 btree_cache_dirty, u64 btree_cache_total,
u64 btree_key_cache_dirty, u64 btree_key_cache_total), u64 btree_key_cache_dirty, u64 btree_key_cache_total),
TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total, TP_ARGS(c, direct, kicked, min_nr, min_key_cache,
btree_cache_dirty, btree_cache_total, btree_cache_dirty, btree_cache_total,
btree_key_cache_dirty, btree_key_cache_total), btree_key_cache_dirty, btree_key_cache_total),
@ -209,8 +208,6 @@ TRACE_EVENT(journal_reclaim_start,
__field(bool, kicked ) __field(bool, kicked )
__field(u64, min_nr ) __field(u64, min_nr )
__field(u64, min_key_cache ) __field(u64, min_key_cache )
__field(u64, prereserved )
__field(u64, prereserved_total )
__field(u64, btree_cache_dirty ) __field(u64, btree_cache_dirty )
__field(u64, btree_cache_total ) __field(u64, btree_cache_total )
__field(u64, btree_key_cache_dirty ) __field(u64, btree_key_cache_dirty )
@ -223,22 +220,18 @@ TRACE_EVENT(journal_reclaim_start,
__entry->kicked = kicked; __entry->kicked = kicked;
__entry->min_nr = min_nr; __entry->min_nr = min_nr;
__entry->min_key_cache = min_key_cache; __entry->min_key_cache = min_key_cache;
__entry->prereserved = prereserved;
__entry->prereserved_total = prereserved_total;
__entry->btree_cache_dirty = btree_cache_dirty; __entry->btree_cache_dirty = btree_cache_dirty;
__entry->btree_cache_total = btree_cache_total; __entry->btree_cache_total = btree_cache_total;
__entry->btree_key_cache_dirty = btree_key_cache_dirty; __entry->btree_key_cache_dirty = btree_key_cache_dirty;
__entry->btree_key_cache_total = btree_key_cache_total; __entry->btree_key_cache_total = btree_key_cache_total;
), ),
TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu",
MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->direct, __entry->direct,
__entry->kicked, __entry->kicked,
__entry->min_nr, __entry->min_nr,
__entry->min_key_cache, __entry->min_key_cache,
__entry->prereserved,
__entry->prereserved_total,
__entry->btree_cache_dirty, __entry->btree_cache_dirty,
__entry->btree_cache_total, __entry->btree_cache_total,
__entry->btree_key_cache_dirty, __entry->btree_key_cache_dirty,


@ -315,6 +315,57 @@ int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task)
return ret; return ret;
} }
#ifndef __KERNEL__
#include <time.h>
void bch2_prt_datetime(struct printbuf *out, time64_t sec)
{
time_t t = sec;
char buf[64];
ctime_r(&t, buf);
prt_str(out, buf);
}
#else
void bch2_prt_datetime(struct printbuf *out, time64_t sec)
{
char buf[64];
snprintf(buf, sizeof(buf), "%ptT", &sec);
prt_str(out, buf);	/* print the formatted datetime, not the raw integer */
}
#endif
static const struct time_unit {
const char *name;
u64 nsecs;
} time_units[] = {
{ "ns", 1 },
{ "us", NSEC_PER_USEC },
{ "ms", NSEC_PER_MSEC },
{ "s", NSEC_PER_SEC },
{ "m", (u64) NSEC_PER_SEC * 60},
{ "h", (u64) NSEC_PER_SEC * 3600},
{ "eon", U64_MAX },
};
static const struct time_unit *pick_time_units(u64 ns)
{
const struct time_unit *u;
for (u = time_units;
u + 1 < time_units + ARRAY_SIZE(time_units) &&
ns >= u[1].nsecs << 1;
u++)
;
return u;
}
void bch2_pr_time_units(struct printbuf *out, u64 ns)
{
const struct time_unit *u = pick_time_units(ns);
prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
}
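pick_time_units() only moves up to the next unit once the value is at least twice that unit, so small multiples stay readable. Worked examples (buf is any printbuf):

struct printbuf buf = PRINTBUF;

bch2_pr_time_units(&buf, 3000);			/* "3 us": 3000ns >= 2 * 1000ns */
bch2_pr_time_units(&buf, 90ULL * NSEC_PER_SEC);	/* "90 s": 90s < 2 minutes */
bch2_pr_time_units(&buf, 150ULL * NSEC_PER_SEC);	/* "2 m":  150s >= 2 minutes */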
/* time stats: */ /* time stats: */
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
@ -359,6 +410,7 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
stats->max_duration = max(stats->max_duration, duration); stats->max_duration = max(stats->max_duration, duration);
stats->min_duration = min(stats->min_duration, duration); stats->min_duration = min(stats->min_duration, duration);
stats->total_duration += duration;
bch2_quantiles_update(&stats->quantiles, duration); bch2_quantiles_update(&stats->quantiles, duration);
} }
@ -372,22 +424,26 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
} }
} }
static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
struct bch2_time_stat_buffer *b) struct bch2_time_stat_buffer *b)
{ {
struct bch2_time_stat_buffer_entry *i; for (struct bch2_time_stat_buffer_entry *i = b->entries;
unsigned long flags;
spin_lock_irqsave(&stats->lock, flags);
for (i = b->entries;
i < b->entries + ARRAY_SIZE(b->entries); i < b->entries + ARRAY_SIZE(b->entries);
i++) i++)
bch2_time_stats_update_one(stats, i->start, i->end); bch2_time_stats_update_one(stats, i->start, i->end);
spin_unlock_irqrestore(&stats->lock, flags);
b->nr = 0; b->nr = 0;
} }
static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
struct bch2_time_stat_buffer *b)
{
unsigned long flags;
spin_lock_irqsave(&stats->lock, flags);
__bch2_time_stats_clear_buffer(stats, b);
spin_unlock_irqrestore(&stats->lock, flags);
}
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
{ {
unsigned long flags; unsigned long flags;
@ -423,40 +479,6 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
preempt_enable(); preempt_enable();
} }
} }
#endif
static const struct time_unit {
const char *name;
u64 nsecs;
} time_units[] = {
{ "ns", 1 },
{ "us", NSEC_PER_USEC },
{ "ms", NSEC_PER_MSEC },
{ "s", NSEC_PER_SEC },
{ "m", (u64) NSEC_PER_SEC * 60},
{ "h", (u64) NSEC_PER_SEC * 3600},
{ "eon", U64_MAX },
};
static const struct time_unit *pick_time_units(u64 ns)
{
const struct time_unit *u;
for (u = time_units;
u + 1 < time_units + ARRAY_SIZE(time_units) &&
ns >= u[1].nsecs << 1;
u++)
;
return u;
}
void bch2_pr_time_units(struct printbuf *out, u64 ns)
{
const struct time_unit *u = pick_time_units(ns);
prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
}
static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
{ {
@ -467,26 +489,6 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
prt_printf(out, "%s", u->name); prt_printf(out, "%s", u->name);
} }
#ifndef __KERNEL__
#include <time.h>
void bch2_prt_date_seconds(struct printbuf *out, time64_t sec)
{
time_t t = sec;
char buf[64];
ctime_r(&t, buf);
prt_str(out, buf);
}
#else
void bch2_prt_date_seconds(struct printbuf *out, time64_t sec)
{
char buf[64];
snprintf(buf, sizeof(buf), "%ptT", &sec);
prt_u64(out, sec);
}
#endif
#define TABSTOP_SIZE 12
static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
{ {
prt_str(out, name); prt_str(out, name);
@ -495,12 +497,24 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
prt_newline(out); prt_newline(out);
} }
#define TABSTOP_SIZE 12
void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
{ {
const struct time_unit *u; const struct time_unit *u;
s64 f_mean = 0, d_mean = 0; s64 f_mean = 0, d_mean = 0;
u64 q, last_q = 0, f_stddev = 0, d_stddev = 0; u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
int i; int i;
if (stats->buffer) {
int cpu;
spin_lock_irq(&stats->lock);
for_each_possible_cpu(cpu)
__bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
spin_unlock_irq(&stats->lock);
}
/* /*
* avoid divide by zero * avoid divide by zero
*/ */
@ -546,6 +560,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
pr_name_and_units(out, "min:", stats->min_duration); pr_name_and_units(out, "min:", stats->min_duration);
pr_name_and_units(out, "max:", stats->max_duration); pr_name_and_units(out, "max:", stats->max_duration);
pr_name_and_units(out, "total:", stats->total_duration);
prt_printf(out, "mean:"); prt_printf(out, "mean:");
prt_tab(out); prt_tab(out);
@ -603,6 +618,9 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
last_q = q; last_q = q;
} }
} }
#else
void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {}
#endif
void bch2_time_stats_exit(struct bch2_time_stats *stats) void bch2_time_stats_exit(struct bch2_time_stats *stats)
{ {


@ -244,7 +244,7 @@ do { \
#define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__) #define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__)
void bch2_pr_time_units(struct printbuf *, u64); void bch2_pr_time_units(struct printbuf *, u64);
void bch2_prt_date_seconds(struct printbuf *, time64_t); void bch2_prt_datetime(struct printbuf *, time64_t);
#ifdef __KERNEL__ #ifdef __KERNEL__
static inline void uuid_unparse_lower(u8 *uuid, char *out) static inline void uuid_unparse_lower(u8 *uuid, char *out)
@ -372,8 +372,9 @@ struct bch2_time_stat_buffer {
struct bch2_time_stats { struct bch2_time_stats {
spinlock_t lock; spinlock_t lock;
/* all fields are in nanoseconds */ /* all fields are in nanoseconds */
u64 max_duration;
u64 min_duration; u64 min_duration;
u64 max_duration;
u64 total_duration;
u64 max_freq; u64 max_freq;
u64 min_freq; u64 min_freq;
u64 last_event; u64 last_event;
@ -388,15 +389,39 @@ struct bch2_time_stats {
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
#else
static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
#endif
static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
{ {
__bch2_time_stats_update(stats, start, local_clock()); __bch2_time_stats_update(stats, start, local_clock());
} }
static inline bool track_event_change(struct bch2_time_stats *stats,
u64 *start, bool v)
{
if (v != !!*start) {
if (!v) {
bch2_time_stats_update(stats, *start);
*start = 0;
} else {
*start = local_clock() ?: 1;
return true;
}
}
return false;
}
#else
static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {}
static inline bool track_event_change(struct bch2_time_stats *stats,
u64 *start, bool v)
{
bool ret = v && !*start;
*start = v;
return ret;
}
#endif
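Usage sketch for track_event_change(): the caller owns a u64 start timestamp per condition and polls it; the return value is true only on the false-to-true edge, and the elapsed interval is folded into the stats on the true-to-false edge. Names below are hypothetical:

static u64 low_start;

static void poll_low_condition(struct bch2_time_stats *stats, bool low)
{
	if (track_event_change(stats, &low_start, low))
		pr_debug("entered low state\n");	/* fires on the edge only */
}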
void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
void bch2_time_stats_exit(struct bch2_time_stats *); void bch2_time_stats_exit(struct bch2_time_stats *);