From 8b06995115a8ade5e0b154311b771279ff97317f Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Fri, 5 Jul 2024 09:14:14 -0400
Subject: [PATCH] Update bcachefs sources to 2be6fc9b111c bcachefs: bch2_gc_btree() should not use btree_root_lock

Signed-off-by: Kent Overstreet
---
 .bcachefs_revision | 2 +-
 include/linux/closure.h | 7 ++++
 include/linux/lockdep.h | 2 +
 libbcachefs/acl.c | 4 +-
 libbcachefs/alloc_background.c | 48 ++++++++++-----------
 libbcachefs/alloc_foreground.c | 10 ++++-
 libbcachefs/backpointers.c | 70 ++++++++++++-------------
 libbcachefs/bcachefs.h | 2 +-
 libbcachefs/bkey.c | 5 ++-
 libbcachefs/bkey.h | 7 ++++
 libbcachefs/btree_gc.c | 54 ++++++++++++++----------
 libbcachefs/btree_iter.c | 33 +++++++++++----
 libbcachefs/btree_locking.c | 11 ++---
 libbcachefs/btree_locking.h | 26 ++++++++++++
 libbcachefs/btree_types.h | 4 ++
 libbcachefs/btree_write_buffer.c | 37 ++++++++++++++++
 libbcachefs/btree_write_buffer.h | 3 ++
 libbcachefs/clock.c | 72 ++++++++++++++------------------
 libbcachefs/clock.h | 9 ++--
 libbcachefs/clock_types.h | 3 +-
 libbcachefs/data_update.c | 44 +++++++++++++++++++
 libbcachefs/data_update.h | 5 +++
 libbcachefs/debug.c | 12 +++---
 libbcachefs/extents.c | 8 ++--
 libbcachefs/extents.h | 2 +
 libbcachefs/eytzinger.h | 6 ++-
 libbcachefs/fs-io.c | 8 ++--
 libbcachefs/fs.c | 22 +++++++---
 libbcachefs/io_read.c | 72 ++++++++++++++++++++++----------
 libbcachefs/journal.c | 18 ++++----
 libbcachefs/journal.h | 2 +-
 libbcachefs/journal_io.c | 12 ++++--
 libbcachefs/lru.c | 39 +++++++++++++++++
 libbcachefs/lru.h | 3 ++
 libbcachefs/move.c | 25 -----------
 libbcachefs/movinggc.c | 36 +++++++++-------
 libbcachefs/sb-errors_format.h | 3 +-
 libbcachefs/super.c | 6 +--
 libbcachefs/sysfs.c | 5 +++
 linux/closure.c | 3 ++
 40 files changed, 477 insertions(+), 263 deletions(-)

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 50da14dd..2a12d1f7 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-9404a01d3dc5553b106fa590602f4771b8e0b8ae
+2be6fc9b111cad37da8838e39c66244767bc7d0a
diff --git a/include/linux/closure.h b/include/linux/closure.h
index 59b8c06b..2af44427 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -159,6 +159,7 @@ struct closure {
 #ifdef CONFIG_DEBUG_CLOSURES
 #define CLOSURE_MAGIC_DEAD 0xc054dead
 #define CLOSURE_MAGIC_ALIVE 0xc054a11e
+#define CLOSURE_MAGIC_STACK 0xc05451cc
 
 	unsigned int magic;
 	struct list_head all;
@@ -323,12 +324,18 @@ static inline void closure_init_stack(struct closure *cl)
 {
 	memset(cl, 0, sizeof(struct closure));
 	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+#ifdef CONFIG_DEBUG_CLOSURES
+	cl->magic = CLOSURE_MAGIC_STACK;
+#endif
 }
 
 static inline void closure_init_stack_release(struct closure *cl)
 {
 	memset(cl, 0, sizeof(struct closure));
 	atomic_set_release(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+#ifdef CONFIG_DEBUG_CLOSURES
+	cl->magic = CLOSURE_MAGIC_STACK;
+#endif
 }
 
 /**
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 3831ef2d..27bf6915 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -5,6 +5,8 @@
 struct lock_class_key {};
 struct task_struct;
 # define lock_acquire(l, s, t, r, c, n, i) do { } while (0)
+# define lock_acquire_exclusive(...) do { } while (0)
+# define lockdep_set_notrack_class(...)
do { } while (0) # define lock_release(l, i) do { } while (0) # define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c index 250d6c6d..a7b425d3 100644 --- a/libbcachefs/acl.c +++ b/libbcachefs/acl.c @@ -346,7 +346,6 @@ int bch2_set_acl(struct mnt_idmap *idmap, { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans = bch2_trans_get(c); struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct posix_acl *acl; @@ -354,6 +353,7 @@ int bch2_set_acl(struct mnt_idmap *idmap, int ret; mutex_lock(&inode->ei_update_lock); + struct btree_trans *trans = bch2_trans_get(c); retry: bch2_trans_begin(trans); acl = _acl; @@ -394,8 +394,8 @@ btree_err: set_cached_acl(&inode->v, type, acl); err: - mutex_unlock(&inode->ei_update_lock); bch2_trans_put(trans); + mutex_unlock(&inode->ei_update_lock); return ret; } diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 8e8aed2a..9ff822e4 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -3,6 +3,7 @@ #include "alloc_background.h" #include "alloc_foreground.h" #include "backpointers.h" +#include "bkey_buf.h" #include "btree_cache.h" #include "btree_io.h" #include "btree_key_cache.h" @@ -1600,13 +1601,13 @@ err: } static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, - struct btree_iter *alloc_iter) + struct btree_iter *alloc_iter, + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; - struct btree_iter lru_iter; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; - struct bkey_s_c alloc_k, lru_k; + struct bkey_s_c alloc_k; struct printbuf buf = PRINTBUF; int ret; @@ -1620,6 +1621,14 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, a = bch2_alloc_to_v4(alloc_k, &a_convert); + if (a->fragmentation_lru) { + ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START, + a->fragmentation_lru, + alloc_k, last_flushed); + if (ret) + return ret; + } + if (a->data_type != BCH_DATA_cached) return 0; @@ -1644,41 +1653,30 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, a = &a_mut->v; } - lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, - lru_pos(alloc_k.k->p.inode, - bucket_to_u64(alloc_k.k->p), - a->io_time[READ]), 0); - ret = bkey_err(lru_k); + ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ], + alloc_k, last_flushed); if (ret) - return ret; - - if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, - trans, alloc_key_to_missing_lru_entry, - "missing lru entry\n" - " %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - ret = bch2_lru_set(trans, - alloc_k.k->p.inode, - bucket_to_u64(alloc_k.k->p), - a->io_time[READ]); - if (ret) - goto err; - } + goto err; err: fsck_err: - bch2_trans_iter_exit(trans, &lru_iter); printbuf_exit(&buf); return ret; } int bch2_check_alloc_to_lru_refs(struct bch_fs *c) { + struct bkey_buf last_flushed; + + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_alloc_to_lru_ref(trans, &iter))); + bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))); + + bch2_bkey_buf_exit(&last_flushed, c); bch_err_fn(c, ret); return ret; } diff --git a/libbcachefs/alloc_foreground.c 
b/libbcachefs/alloc_foreground.c index 73228b25..49a5789d 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -626,8 +626,14 @@ again: if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) bch2_gc_gens_async(c); - if (should_invalidate_buckets(ca, *usage)) + if (should_invalidate_buckets(ca, *usage)) { bch2_dev_do_invalidates(ca); + rcu_read_lock(); + struct task_struct *t = rcu_dereference(c->copygc_thread); + if (t) + wake_up_process(t); + rcu_read_unlock(); + } if (!avail) { if (cl && !waiting) { @@ -1703,6 +1709,7 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) nr[c->open_buckets[i].data_type]++; + printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 24); prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden)); @@ -1734,6 +1741,7 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) nr[c->open_buckets[i].data_type]++; + printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 12); printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 16); diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index ca16fa5d..96217e8d 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -434,13 +434,6 @@ int bch2_check_btree_backpointers(struct bch_fs *c) return ret; } -static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) -{ - return bpos_eq(l.k->p, r.k->p) && - bkey_bytes(l.k) == bkey_bytes(r.k) && - !memcmp(l.v, r.v, bkey_val_bytes(l.k)); -} - struct extents_to_bp_state { struct bpos bucket_start; struct bpos bucket_end; @@ -536,11 +529,8 @@ static int check_bp_exists(struct btree_trans *trans, struct btree_iter other_extent_iter = {}; struct printbuf buf = PRINTBUF; struct bkey_s_c bp_k; - struct bkey_buf tmp; int ret = 0; - bch2_bkey_buf_init(&tmp); - struct bch_dev *ca = bch2_dev_bucket_tryget(c, bucket); if (!ca) { prt_str(&buf, "extent for nonexistent device:bucket "); @@ -565,22 +555,9 @@ static int check_bp_exists(struct btree_trans *trans, if (bp_k.k->type != KEY_TYPE_backpointer || memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { - bch2_bkey_buf_reassemble(&tmp, c, orig_k); - - if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) { - if (bp.level) { - bch2_trans_unlock(trans); - bch2_btree_interior_updates_flush(c); - } - - ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; - - bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k); - ret = -BCH_ERR_transaction_restart_write_buffer_flush; - goto out; - } + ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed); + if (ret) + goto err; goto check_existing_bp; } @@ -589,7 +566,6 @@ err: fsck_err: bch2_trans_iter_exit(trans, &other_extent_iter); bch2_trans_iter_exit(trans, &bp_iter); - bch2_bkey_buf_exit(&tmp, c); bch2_dev_put(ca); printbuf_exit(&buf); return ret; @@ -794,6 +770,8 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, !((1U << btree) & btree_interior_mask)) continue; + bch2_trans_begin(trans); + __for_each_btree_node(trans, iter, btree, btree == start.btree ? 
start.pos : POS_MIN, 0, depth, BTREE_ITER_prefetch, b, ret) { @@ -905,7 +883,7 @@ static int check_one_backpointer(struct btree_trans *trans, struct bbpos start, struct bbpos end, struct bkey_s_c_backpointer bp, - struct bpos *last_flushed_pos) + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -925,20 +903,18 @@ static int check_one_backpointer(struct btree_trans *trans, if (ret) return ret; - if (!k.k && !bpos_eq(*last_flushed_pos, bp.k->p)) { - *last_flushed_pos = bp.k->p; - ret = bch2_btree_write_buffer_flush_sync(trans) ?: - -BCH_ERR_transaction_restart_write_buffer_flush; - goto out; - } + if (!k.k) { + ret = bch2_btree_write_buffer_maybe_flush(trans, bp.s_c, last_flushed); + if (ret) + goto out; - if (fsck_err_on(!k.k, - trans, backpointer_to_missing_ptr, - "backpointer for missing %s\n %s", - bp.v->level ? "btree node" : "extent", - (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { - ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); - goto out; + if (fsck_err(trans, backpointer_to_missing_ptr, + "backpointer for missing %s\n %s", + bp.v->level ? "btree node" : "extent", + (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { + ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); + goto out; + } } out: fsck_err: @@ -951,14 +927,20 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct bbpos start, struct bbpos end) { - struct bpos last_flushed_pos = SPOS_MAX; + struct bkey_buf last_flushed; - return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_one_backpointer(trans, start, end, bkey_s_c_to_backpointer(k), - &last_flushed_pos)); + &last_flushed)); + + bch2_bkey_buf_exit(&last_flushed, trans->c); + return ret; } int bch2_check_backpointers_to_extents(struct bch_fs *c) diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 372bc339..65e46225 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -980,7 +980,7 @@ struct bch_fs { struct bch_fs_rebalance rebalance; /* COPYGC */ - struct task_struct *copygc_thread; + struct task_struct __rcu *copygc_thread; struct write_point copygc_write_point; s64 copygc_wait_at; s64 copygc_wait; diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c index 94a1d198..587d7318 100644 --- a/libbcachefs/bkey.c +++ b/libbcachefs/bkey.c @@ -660,8 +660,9 @@ int bch2_bkey_format_invalid(struct bch_fs *c, bch2_bkey_format_field_overflows(f, i)) { unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); - u64 packed_max = f->bits_per_field[i] - ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) + unsigned packed_bits = min(64, f->bits_per_field[i]); + u64 packed_max = packed_bits + ? ~((~0ULL << 1) << (packed_bits - 1)) : 0; prt_printf(err, "field %u too large: %llu + %llu > %llu", diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index fcd43915..93635714 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -194,6 +194,13 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r) return bkey_gt(l, r) ? 
l : r; } +static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) +{ + return bpos_eq(l.k->p, r.k->p) && + bkey_bytes(l.k) == bkey_bytes(r.k) && + !memcmp(l.v, r.v, bkey_val_bytes(l.k)); +} + void bch2_bpos_swab(struct bpos *); void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 2e9ccb20..1e4c3e30 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -675,15 +675,26 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in } /* root */ - mutex_lock(&c->btree_root_lock); - struct btree *b = bch2_btree_id_root(c, btree)->b; - if (!btree_node_fake(b)) { + do { + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, + 0, bch2_btree_id_root(c, btree)->b->c.level, 0); + struct btree *b = bch2_btree_iter_peek_node(&iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto err_root; + + if (b != btree_node_root(c, b)) { + bch2_trans_iter_exit(trans, &iter); + continue; + } + gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX)); - ret = lockrestart_do(trans, - bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, - NULL, NULL, bkey_i_to_s_c(&b->key), initial)); - } - mutex_unlock(&c->btree_root_lock); + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial); +err_root: + bch2_trans_iter_exit(trans, &iter); + } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); err: bch_err_fn(c, ret); return ret; @@ -818,6 +829,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans, return ret; } + gc.fragmentation_lru = alloc_lru_idx_fragmentation(gc, ca); + if (fsck_err_on(new.data_type != gc.data_type, trans, alloc_key_data_type_wrong, "bucket %llu:%llu gen %u has wrong data_type" @@ -832,25 +845,20 @@ static int bch2_alloc_write_key(struct btree_trans *trans, if (fsck_err_on(new._f != gc._f, \ trans, _errtype, \ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ - ": got %u, should be %u", \ + ": got %llu, should be %llu", \ iter->pos.inode, iter->pos.offset, \ gc.gen, \ bch2_data_type_str(gc.data_type), \ - new._f, gc._f)) \ + (u64) new._f, (u64) gc._f)) \ new._f = gc._f; \ - copy_bucket_field(alloc_key_gen_wrong, - gen); - copy_bucket_field(alloc_key_dirty_sectors_wrong, - dirty_sectors); - copy_bucket_field(alloc_key_stripe_sectors_wrong, - stripe_sectors); - copy_bucket_field(alloc_key_cached_sectors_wrong, - cached_sectors); - copy_bucket_field(alloc_key_stripe_wrong, - stripe); - copy_bucket_field(alloc_key_stripe_redundancy_wrong, - stripe_redundancy); + copy_bucket_field(alloc_key_gen_wrong, gen); + copy_bucket_field(alloc_key_dirty_sectors_wrong, dirty_sectors); + copy_bucket_field(alloc_key_stripe_sectors_wrong, stripe_sectors); + copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors); + copy_bucket_field(alloc_key_stripe_wrong, stripe); + copy_bucket_field(alloc_key_stripe_redundancy_wrong, stripe_redundancy); + copy_bucket_field(alloc_key_fragmentation_lru_wrong, fragmentation_lru); #undef copy_bucket_field if (!bch2_alloc_v4_cmp(*old, new)) @@ -864,7 +872,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, a->v = new; /* - * The trigger normally makes sure this is set, but we're not running + * The trigger normally makes sure these are set, but we're not running * triggers: */ if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ]) diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 
80f4a395..803cc58f 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -987,8 +987,7 @@ retry_all: bch2_trans_unlock(trans); cond_resched(); - trans->locked = true; - trans->last_unlock_ip = 0; + trans_set_locked(trans); if (unlikely(trans->memory_allocation_failure)) { struct closure cl; @@ -3088,8 +3087,8 @@ u32 bch2_trans_begin(struct btree_trans *trans) bch2_trans_srcu_unlock(trans); trans->last_begin_ip = _RET_IP_; - trans->locked = true; - trans->last_unlock_ip = 0; + + trans_set_locked(trans); if (trans->restarted) { bch2_btree_path_traverse_all(trans); @@ -3159,7 +3158,6 @@ got_trans: trans->last_begin_time = local_clock(); trans->fn_idx = fn_idx; trans->locking_wait.task = current; - trans->locked = true; trans->journal_replay_not_finished = unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) && atomic_inc_not_zero(&c->journal_keys.ref); @@ -3173,6 +3171,9 @@ got_trans: trans->paths_allocated[0] = 1; + static struct lock_class_key lockdep_key; + lockdep_init_map(&trans->dep_map, "bcachefs_btree", &lockdep_key, 0); + if (fn_idx < BCH_TRANSACTIONS_NR) { trans->fn = bch2_btree_transaction_fns[fn_idx]; @@ -3193,6 +3194,7 @@ got_trans: trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); trans->srcu_lock_time = jiffies; trans->srcu_held = true; + trans_set_locked(trans); closure_init_stack_release(&trans->ref); return trans; @@ -3441,7 +3443,22 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, BTREE_TRANS_MEM_MAX) ?: init_srcu_struct(&c->btree_trans_barrier); - if (!ret) - c->btree_trans_barrier_initialized = true; - return ret; + if (ret) + return ret; + + /* + * static annotation (hackily done) for lock ordering of reclaim vs. + * btree node locks: + */ +#ifdef CONFIG_LOCKDEP + fs_reclaim_acquire(GFP_KERNEL); + struct btree_trans *trans = bch2_trans_get(c); + trans_set_locked(trans); + bch2_trans_put(trans); + fs_reclaim_release(GFP_KERNEL); +#endif + + c->btree_trans_barrier_initialized = true; + return 0; + } diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c index 79057cde..dd52372a 100644 --- a/libbcachefs/btree_locking.c +++ b/libbcachefs/btree_locking.c @@ -10,7 +10,7 @@ void bch2_btree_lock_init(struct btree_bkey_cached_common *b, enum six_lock_init_flags flags) { __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags); - lockdep_set_novalidate_class(&b->lock); + lockdep_set_notrack_class(&b->lock); } #ifdef CONFIG_LOCKDEP @@ -792,8 +792,7 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) return bch2_trans_relock_fail(trans, path, &f, trace); } - trans->locked = true; - trans->last_unlock_ip = 0; + trans_set_locked(trans); out: bch2_trans_verify_locks(trans); return 0; @@ -813,16 +812,14 @@ void bch2_trans_unlock_noassert(struct btree_trans *trans) { __bch2_trans_unlock(trans); - trans->locked = false; - trans->last_unlock_ip = _RET_IP_; + trans_set_unlocked(trans); } void bch2_trans_unlock(struct btree_trans *trans) { __bch2_trans_unlock(trans); - trans->locked = false; - trans->last_unlock_ip = _RET_IP_; + trans_set_unlocked(trans); } void bch2_trans_unlock_long(struct btree_trans *trans) diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 2530fdb2..b9fd14e3 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -13,6 +13,8 @@ #include "btree_iter.h" #include "six.h" +#include + void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags); #ifdef 
CONFIG_LOCKDEP @@ -194,6 +196,30 @@ int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); /* lock: */ +static inline void trans_set_locked(struct btree_trans *trans) +{ + if (!trans->locked) { + lock_acquire_exclusive(&trans->dep_map, 0, 0, NULL, _THIS_IP_); + trans->locked = true; + trans->last_unlock_ip = 0; + + trans->pf_memalloc_nofs = (current->flags & PF_MEMALLOC_NOFS) != 0; + current->flags |= PF_MEMALLOC_NOFS; + } +} + +static inline void trans_set_unlocked(struct btree_trans *trans) +{ + if (trans->locked) { + lock_release(&trans->dep_map, _THIS_IP_); + trans->locked = false; + trans->last_unlock_ip = _RET_IP_; + + if (!trans->pf_memalloc_nofs) + current->flags &= ~PF_MEMALLOC_NOFS; + } +} + static inline int __btree_node_lock_nopath(struct btree_trans *trans, struct btree_bkey_cached_common *b, enum six_lock_type type, diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index c9c9864a..79898f68 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -483,6 +483,7 @@ struct btree_trans { bool lock_may_not_fail:1; bool srcu_held:1; bool locked:1; + bool pf_memalloc_nofs:1; bool write_locked:1; bool used_mempool:1; bool in_traverse_all:1; @@ -522,6 +523,9 @@ struct btree_trans { unsigned journal_u64s; unsigned extra_disk_res; /* XXX kill */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* Entries before this are zeroed out on every bch2_trans_get() call */ struct list_head list; diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c index 5e1262a5..b9fe7368 100644 --- a/libbcachefs/btree_write_buffer.c +++ b/libbcachefs/btree_write_buffer.c @@ -1,12 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "btree_locking.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" #include "disk_accounting.h" #include "error.h" +#include "extents.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" @@ -558,6 +560,41 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) return ret; } +/** + * In check and repair code, when checking references to write buffer btrees we + * need to issue a flush before we have a definitive error: this issues a flush + * if this is a key we haven't yet checked. 
+ */ +int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, + struct bkey_s_c referring_k, + struct bkey_buf *last_flushed) +{ + struct bch_fs *c = trans->c; + struct bkey_buf tmp; + int ret = 0; + + bch2_bkey_buf_init(&tmp); + + if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) { + bch2_bkey_buf_reassemble(&tmp, c, referring_k); + + if (bkey_is_btree_ptr(referring_k.k)) { + bch2_trans_unlock(trans); + bch2_btree_interior_updates_flush(c); + } + + ret = bch2_btree_write_buffer_flush_sync(trans); + if (ret) + goto err; + + bch2_bkey_buf_copy(last_flushed, c, tmp.k); + ret = -BCH_ERR_transaction_restart_write_buffer_flush; + } +err: + bch2_bkey_buf_exit(&tmp, c); + return ret; +} + static void bch2_btree_write_buffer_flush_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work); diff --git a/libbcachefs/btree_write_buffer.h b/libbcachefs/btree_write_buffer.h index ee3710ea..725e7965 100644 --- a/libbcachefs/btree_write_buffer.h +++ b/libbcachefs/btree_write_buffer.h @@ -24,6 +24,9 @@ int bch2_btree_write_buffer_flush_sync(struct btree_trans *); int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); int bch2_btree_write_buffer_tryflush(struct btree_trans *); +struct bkey_buf; +int bch2_btree_write_buffer_maybe_flush(struct btree_trans *, struct bkey_s_c, struct bkey_buf *); + struct journal_keys_to_wb { struct btree_write_buffer_keys *wb; size_t room; diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c index 36364445..df3763c1 100644 --- a/libbcachefs/clock.c +++ b/libbcachefs/clock.c @@ -15,18 +15,15 @@ static inline long io_timer_cmp(io_timer_heap *h, void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) { - size_t i; - spin_lock(&clock->timer_lock); - if (time_after_eq((unsigned long) atomic64_read(&clock->now), - timer->expire)) { + if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) { spin_unlock(&clock->timer_lock); timer->fn(timer); return; } - for (i = 0; i < clock->timers.used; i++) + for (size_t i = 0; i < clock->timers.used; i++) if (clock->timers.data[i] == timer) goto out; @@ -37,11 +34,9 @@ out: void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) { - size_t i; - spin_lock(&clock->timer_lock); - for (i = 0; i < clock->timers.used; i++) + for (size_t i = 0; i < clock->timers.used; i++) if (clock->timers.data[i] == timer) { heap_del(&clock->timers, i, io_timer_cmp, NULL); break; @@ -75,33 +70,31 @@ static void io_clock_cpu_timeout(struct timer_list *timer) wake_up_process(wait->task); } -void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until) +void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until) { - struct io_clock_wait wait; + struct io_clock_wait wait = { + .io_timer.expire = until, + .io_timer.fn = io_clock_wait_fn, + .io_timer.fn2 = (void *) _RET_IP_, + .task = current, + }; - /* XXX: calculate sleep time rigorously */ - wait.io_timer.expire = until; - wait.io_timer.fn = io_clock_wait_fn; - wait.task = current; - wait.expired = 0; bch2_io_timer_add(clock, &wait.io_timer); - schedule(); - bch2_io_timer_del(clock, &wait.io_timer); } void bch2_kthread_io_clock_wait(struct io_clock *clock, - unsigned long io_until, - unsigned long cpu_timeout) + u64 io_until, unsigned long cpu_timeout) { bool kthread = (current->flags & PF_KTHREAD) != 0; - struct io_clock_wait wait; + struct io_clock_wait wait = { + .io_timer.expire = io_until, + .io_timer.fn = io_clock_wait_fn, + .io_timer.fn2 = 
(void *) _RET_IP_, + .task = current, + }; - wait.io_timer.expire = io_until; - wait.io_timer.fn = io_clock_wait_fn; - wait.task = current; - wait.expired = 0; bch2_io_timer_add(clock, &wait.io_timer); timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); @@ -127,44 +120,41 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, bch2_io_timer_del(clock, &wait.io_timer); } -static struct io_timer *get_expired_timer(struct io_clock *clock, - unsigned long now) +static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now) { struct io_timer *ret = NULL; - spin_lock(&clock->timer_lock); - if (clock->timers.used && - time_after_eq(now, clock->timers.data[0]->expire)) + time_after_eq64(now, clock->timers.data[0]->expire)) heap_pop(&clock->timers, ret, io_timer_cmp, NULL); - - spin_unlock(&clock->timer_lock); - return ret; } -void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) +void __bch2_increment_clock(struct io_clock *clock, u64 sectors) { struct io_timer *timer; - unsigned long now = atomic64_add_return(sectors, &clock->now); + u64 now = atomic64_add_return(sectors, &clock->now); + spin_lock(&clock->timer_lock); while ((timer = get_expired_timer(clock, now))) timer->fn(timer); + spin_unlock(&clock->timer_lock); } void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) { - unsigned long now; - unsigned i; - out->atomic++; spin_lock(&clock->timer_lock); - now = atomic64_read(&clock->now); + u64 now = atomic64_read(&clock->now); - for (i = 0; i < clock->timers.used; i++) - prt_printf(out, "%ps:\t%li\n", + printbuf_tabstop_push(out, 40); + prt_printf(out, "current time:\t%llu\n", now); + + for (unsigned i = 0; i < clock->timers.used; i++) + prt_printf(out, "%ps %ps:\t%llu\n", clock->timers.data[i]->fn, - clock->timers.data[i]->expire - now); + clock->timers.data[i]->fn2, + clock->timers.data[i]->expire); spin_unlock(&clock->timer_lock); --out->atomic; } diff --git a/libbcachefs/clock.h b/libbcachefs/clock.h index 70a0f743..85c975df 100644 --- a/libbcachefs/clock.h +++ b/libbcachefs/clock.h @@ -4,12 +4,11 @@ void bch2_io_timer_add(struct io_clock *, struct io_timer *); void bch2_io_timer_del(struct io_clock *, struct io_timer *); -void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, - unsigned long); +void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long); -void __bch2_increment_clock(struct io_clock *, unsigned); +void __bch2_increment_clock(struct io_clock *, u64); -static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, +static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors, int rw) { struct io_clock *clock = &c->io_clock[rw]; @@ -19,7 +18,7 @@ static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); } -void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); +void bch2_io_clock_schedule_timeout(struct io_clock *, u64); #define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ ({ \ diff --git a/libbcachefs/clock_types.h b/libbcachefs/clock_types.h index 5fae0012..9c25d0fc 100644 --- a/libbcachefs/clock_types.h +++ b/libbcachefs/clock_types.h @@ -17,7 +17,8 @@ typedef void (*io_timer_fn)(struct io_timer *); struct io_timer { io_timer_fn fn; - unsigned long expire; + void *fn2; + u64 expire; }; /* Amount to buffer up on a percpu counter */ diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index 1a0072ee..0087b855 100644 --- 
a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -5,7 +5,9 @@ #include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" +#include "compress.h" #include "data_update.h" +#include "disk_groups.h" #include "ec.h" #include "error.h" #include "extents.h" @@ -454,6 +456,38 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, } } +void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + printbuf_tabstop_push(out, 20); + prt_str(out, "rewrite ptrs:\t"); + bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); + prt_newline(out); + + prt_str(out, "kill ptrs:\t"); + bch2_prt_u64_base2(out, data_opts->kill_ptrs); + prt_newline(out); + + prt_str(out, "target:\t"); + bch2_target_to_text(out, c, data_opts->target); + prt_newline(out); + + prt_str(out, "compression:\t"); + bch2_compression_opt_to_text(out, background_compression(*io_opts)); + prt_newline(out); + + prt_str(out, "extra replicas:\t"); + prt_u64(out, data_opts->extra_replicas); +} + +void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) +{ + bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); + prt_newline(out); + bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); +} + int bch2_extent_drop_ptrs(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -643,6 +677,16 @@ int bch2_data_update_init(struct btree_trans *trans, if (!(durability_have + durability_removing)) m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1); + if (!m->op.nr_replicas) { + struct printbuf buf = PRINTBUF; + + bch2_data_update_to_text(&buf, m); + WARN(1, "trying to move an extent, but nr_replicas=0\n%s", buf.buf); + printbuf_exit(&buf); + ret = -BCH_ERR_data_update_done; + goto done; + } + m->op.nr_replicas_required = m->op.nr_replicas; if (reserve_sectors) { diff --git a/libbcachefs/data_update.h b/libbcachefs/data_update.h index 991095bb..8d36365b 100644 --- a/libbcachefs/data_update.h +++ b/libbcachefs/data_update.h @@ -17,6 +17,9 @@ struct data_update_opts { unsigned write_flags; }; +void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, + struct bch_io_opts *, struct data_update_opts *); + struct data_update { /* extent being updated: */ enum btree_id btree_id; @@ -27,6 +30,8 @@ struct data_update { struct bch_write_op op; }; +void bch2_data_update_to_text(struct printbuf *, struct data_update *); + int bch2_data_update_index_update(struct bch_write_op *); void bch2_data_update_read_done(struct data_update *, diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index f0d4727c..ebabab17 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -610,7 +610,7 @@ restart: list_sort(&c->btree_trans_list, list_ptr_order_cmp); list_for_each_entry(trans, &c->btree_trans_list, list) { - if ((ulong) trans < i->iter) + if ((ulong) trans <= i->iter) continue; i->iter = (ulong) trans; @@ -832,16 +832,16 @@ static const struct file_operations btree_transaction_stats_op = { static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_trans *trans; - pid_t iter = 0; + ulong iter = 0; restart: seqmutex_lock(&c->btree_trans_lock); - list_for_each_entry(trans, &c->btree_trans_list, list) { - struct task_struct *task = READ_ONCE(trans->locking_wait.task); + list_sort(&c->btree_trans_list, list_ptr_order_cmp); - if (!task || task->pid <= iter) + list_for_each_entry(trans, &c->btree_trans_list, list) { + if ((ulong) trans <= iter) 
continue; - iter = task->pid; + iter = (ulong) trans; if (!closure_get_not_zero(&trans->ref)) continue; diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 057df38f..07973198 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -37,8 +37,8 @@ static void bch2_extent_crc_pack(union bch_extent_crc *, struct bch_extent_crc_unpacked, enum bch_extent_entry_type); -static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, - unsigned dev) +struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f, + unsigned dev) { struct bch_dev_io_failures *i; @@ -52,7 +52,7 @@ static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, void bch2_mark_io_failure(struct bch_io_failures *failed, struct extent_ptr_decoded *p) { - struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); + struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev); if (!f) { BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); @@ -140,7 +140,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) continue; - f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; + f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL; if (f) p.idx = f->nr_failed < f->nr_retries ? f->idx diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 530686aa..facdb8a8 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -399,6 +399,8 @@ out: \ /* utility code common to all keys with pointers: */ +struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *, + unsigned); void bch2_mark_io_failure(struct bch_io_failures *, struct extent_ptr_decoded *); int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, diff --git a/libbcachefs/eytzinger.h b/libbcachefs/eytzinger.h index c6e078f2..0541192d 100644 --- a/libbcachefs/eytzinger.h +++ b/libbcachefs/eytzinger.h @@ -48,7 +48,7 @@ static inline unsigned eytzinger1_right_child(unsigned i) static inline unsigned eytzinger1_first(unsigned size) { - return rounddown_pow_of_two(size); + return size ? rounddown_pow_of_two(size) : 0; } static inline unsigned eytzinger1_last(unsigned size) @@ -101,7 +101,9 @@ static inline unsigned eytzinger1_prev(unsigned i, unsigned size) static inline unsigned eytzinger1_extra(unsigned size) { - return (size + 1 - rounddown_pow_of_two(size)) << 1; + return size + ? 
(size + 1 - rounddown_pow_of_two(size)) << 1 + : 0; } static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 189d408e..77b85da3 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -515,7 +515,7 @@ static int inode_update_times_fn(struct btree_trans *trans, return 0; } -static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) +static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; u64 end = offset + len; @@ -554,7 +554,7 @@ err: return ret; } -static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, +static noinline long bchfs_fcollapse_finsert(struct bch_inode_info *inode, loff_t offset, loff_t len, bool insert) { @@ -590,7 +590,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, return ret; } -static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, +static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, u64 start_sector, u64 end_sector) { struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -711,7 +711,7 @@ bkey_err: return ret; } -static long bchfs_fallocate(struct bch_inode_info *inode, int mode, +static noinline long bchfs_fallocate(struct bch_inode_info *inode, int mode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index d34d628f..3ea8dbc4 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -194,6 +194,12 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino * discard_new_inode() expects it to be set... */ inode->v.i_flags |= I_NEW; + /* + * We don't want bch2_evict_inode() to delete the inode on disk, + * we just raced and had another inode in cache. Normally new + * inodes don't have nlink == 0 - except tmpfiles do... 
+ */ + set_nlink(&inode->v, 1); discard_new_inode(&inode->v); inode = old; } else { @@ -511,11 +517,11 @@ static int __bch2_link(struct bch_fs *c, struct bch_inode_info *dir, struct dentry *dentry) { - struct btree_trans *trans = bch2_trans_get(c); struct bch_inode_unpacked dir_u, inode_u; int ret; mutex_lock(&inode->ei_update_lock); + struct btree_trans *trans = bch2_trans_get(c); ret = commit_do(trans, NULL, NULL, 0, bch2_link_trans(trans, @@ -562,11 +568,12 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_inode_unpacked dir_u, inode_u; - struct btree_trans *trans = bch2_trans_get(c); int ret; bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); + struct btree_trans *trans = bch2_trans_get(c); + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_unlink_trans(trans, @@ -589,8 +596,8 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, set_nlink(&inode->v, 0); } err: - bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); bch2_trans_put(trans); + bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); return ret; } @@ -675,14 +682,14 @@ static int bch2_rename2(struct mnt_idmap *idmap, return ret; } - trans = bch2_trans_get(c); - bch2_lock_inodes(INODE_UPDATE_LOCK, src_dir, dst_dir, src_inode, dst_inode); + trans = bch2_trans_get(c); + ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?: bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol); if (ret) @@ -2019,6 +2026,8 @@ out: err: darray_exit(&devs_to_fs); bch2_darray_str_exit(&devs); + if (ret) + pr_err("error: %s", bch2_err_str(ret)); /* * On an inconsistency error in recovery we might see an -EROFS derived * errorcode (from the journal), but we don't want to return that to @@ -2131,7 +2140,8 @@ int __init bch2_vfs_init(void) { int ret = -ENOMEM; - bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT); + bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT); if (!bch2_inode_cache) goto err; diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c index 2a5c4371..8b484c75 100644 --- a/libbcachefs/io_read.c +++ b/libbcachefs/io_read.c @@ -93,21 +93,24 @@ static const struct rhashtable_params bch_promote_params = { static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, struct bpos pos, struct bch_io_opts opts, - unsigned flags) + unsigned flags, + struct bch_io_failures *failed) { - BUG_ON(!opts.promote_target); + if (!failed) { + BUG_ON(!opts.promote_target); - if (!(flags & BCH_READ_MAY_PROMOTE)) - return -BCH_ERR_nopromote_may_not; + if (!(flags & BCH_READ_MAY_PROMOTE)) + return -BCH_ERR_nopromote_may_not; - if (bch2_bkey_has_target(c, k, opts.promote_target)) - return -BCH_ERR_nopromote_already_promoted; + if (bch2_bkey_has_target(c, k, opts.promote_target)) + return -BCH_ERR_nopromote_already_promoted; - if (bkey_extent_is_unwritten(k)) - return -BCH_ERR_nopromote_unwritten; + if (bkey_extent_is_unwritten(k)) + return -BCH_ERR_nopromote_unwritten; - if (bch2_target_congested(c, opts.promote_target)) - return -BCH_ERR_nopromote_congested; + if (bch2_target_congested(c, opts.promote_target)) + return -BCH_ERR_nopromote_congested; + } if (rhashtable_lookup_fast(&c->promote_table, &pos, bch_promote_params)) @@ -164,7 +167,8 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, struct extent_ptr_decoded *pick, struct bch_io_opts opts, unsigned sectors, - struct bch_read_bio **rbio) + struct bch_read_bio **rbio, + 
struct bch_io_failures *failed) { struct bch_fs *c = trans->c; struct promote_op *op = NULL; @@ -217,14 +221,28 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, bio = &op->write.op.wbio.bio; bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); + struct data_update_opts update_opts = {}; + + if (!failed) { + update_opts.target = opts.promote_target; + update_opts.extra_replicas = 1; + update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED; + } else { + update_opts.target = opts.foreground_target; + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned i = 0; + bkey_for_each_ptr(ptrs, ptr) { + if (bch2_dev_io_failures(failed, ptr->dev)) + update_opts.rewrite_ptrs |= BIT(i); + i++; + } + } + ret = bch2_data_update_init(trans, NULL, NULL, &op->write, writepoint_hashed((unsigned long) current), opts, - (struct data_update_opts) { - .target = opts.promote_target, - .extra_replicas = 1, - .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, - }, + update_opts, btree_id, k); /* * possible errors: -BCH_ERR_nocow_lock_blocked, @@ -258,10 +276,17 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, unsigned flags, struct bch_read_bio **rbio, bool *bounce, - bool *read_full) + bool *read_full, + struct bch_io_failures *failed) { struct bch_fs *c = trans->c; - bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); + /* + * if failed != NULL we're not actually doing a promote, we're + * recovering from an io/checksum error + */ + bool promote_full = (failed || + *read_full || + READ_ONCE(c->promote_whole_extents)); /* data might have to be decompressed in the write path: */ unsigned sectors = promote_full ? max(pick->crc.compressed_size, pick->crc.live_size) @@ -272,7 +297,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, struct promote_op *promote; int ret; - ret = should_promote(c, k, pos, opts, flags); + ret = should_promote(c, k, pos, opts, flags, failed); if (ret) goto nopromote; @@ -280,7 +305,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, k.k->type == KEY_TYPE_reflink_v ? 
BTREE_ID_reflink : BTREE_ID_extents, - k, pos, pick, opts, sectors, rbio); + k, pos, pick, opts, sectors, rbio, failed); ret = PTR_ERR_OR_ZERO(promote); if (ret) goto nopromote; @@ -910,9 +935,9 @@ retry_pick: bounce = true; } - if (orig->opts.promote_target) + if (orig->opts.promote_target)// || failed) promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, - &rbio, &bounce, &read_full); + &rbio, &bounce, &read_full, failed); if (!read_full) { EBUG_ON(crc_is_compressed(pick.crc)); @@ -1003,6 +1028,9 @@ get_bio: rbio->promote = promote; INIT_WORK(&rbio->work, NULL); + if (flags & BCH_READ_NODECODE) + orig->pick = pick; + rbio->bio.bi_opf = orig->bio.bi_opf; rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 6209d778..649e3a01 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -1094,7 +1094,7 @@ unlock: return ret; } -int bch2_dev_journal_alloc(struct bch_dev *ca) +int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) { unsigned nr; int ret; @@ -1116,7 +1116,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) min(1 << 13, (1 << 24) / ca->mi.bucket_size)); - ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL); + ret = __bch2_set_nr_journal_buckets(ca, nr, new_fs, NULL); err: bch_err_fn(ca, ret); return ret; @@ -1128,7 +1128,7 @@ int bch2_fs_journal_alloc(struct bch_fs *c) if (ca->journal.nr) continue; - int ret = bch2_dev_journal_alloc(ca); + int ret = bch2_dev_journal_alloc(ca, true); if (ret) { percpu_ref_put(&ca->io_ref); return ret; @@ -1183,9 +1183,11 @@ void bch2_fs_journal_stop(struct journal *j) journal_quiesce(j); cancel_delayed_work_sync(&j->write_work); - BUG_ON(!bch2_journal_error(j) && - test_bit(JOURNAL_replay_done, &j->flags) && - j->last_empty_seq != journal_cur_seq(j)); + WARN(!bch2_journal_error(j) && + test_bit(JOURNAL_replay_done, &j->flags) && + j->last_empty_seq != journal_cur_seq(j), + "journal shutdown error: cur seq %llu but last empty seq %llu", + journal_cur_seq(j), j->last_empty_seq); if (!bch2_journal_error(j)) clear_bit(JOURNAL_running, &j->flags); @@ -1417,8 +1419,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) unsigned long now = jiffies; u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes; - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 28); + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 28); out->atomic++; rcu_read_lock(); diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 00d0f4fc..377a3750 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -433,7 +433,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned nr); -int bch2_dev_journal_alloc(struct bch_dev *); +int bch2_dev_journal_alloc(struct bch_dev *, bool); int bch2_fs_journal_alloc(struct bch_fs *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index ff832d20..7a833a3f 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -415,6 +415,8 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, flags|BCH_VALIDATE_journal); if (ret == FSCK_DELETED_KEY) continue; + else if (ret) + return ret; k = bkey_next(k); } @@ -1766,11 +1768,13 @@ static CLOSURE_CALLBACK(journal_write_preflush) if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { spin_lock(&j->lock); - 
closure_wait(&j->async_wait, cl); + if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { + closure_wait(&j->async_wait, cl); + spin_unlock(&j->lock); + continue_at(cl, journal_write_preflush, j->wq); + return; + } spin_unlock(&j->lock); - - continue_at(cl, journal_write_preflush, j->wq); - return; } if (w->separate_flush) { diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c index c535ba27..83b1586c 100644 --- a/libbcachefs/lru.c +++ b/libbcachefs/lru.c @@ -77,6 +77,45 @@ static const char * const bch2_lru_types[] = { NULL }; +int bch2_lru_check_set(struct btree_trans *trans, + u16 lru_id, u64 time, + struct bkey_s_c referring_k, + struct bkey_buf *last_flushed) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + struct btree_iter lru_iter; + struct bkey_s_c lru_k = + bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, + lru_pos(lru_id, + bucket_to_u64(referring_k.k->p), + time), 0); + int ret = bkey_err(lru_k); + if (ret) + return ret; + + if (lru_k.k->type != KEY_TYPE_set) { + ret = bch2_btree_write_buffer_maybe_flush(trans, referring_k, last_flushed); + if (ret) + goto err; + + if (fsck_err(trans, alloc_key_to_missing_lru_entry, + "missing %s lru entry\n" + " %s", + bch2_lru_types[lru_type(lru_k)], + (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) { + ret = bch2_lru_set(trans, lru_id, bucket_to_u64(referring_k.k->p), time); + if (ret) + goto err; + } + } +err: +fsck_err: + bch2_trans_iter_exit(trans, &lru_iter); + printbuf_exit(&buf); + return ret; +} + static int bch2_check_lru_key(struct btree_trans *trans, struct btree_iter *lru_iter, struct bkey_s_c lru_k, diff --git a/libbcachefs/lru.h b/libbcachefs/lru.h index 425ba732..5bd8974a 100644 --- a/libbcachefs/lru.h +++ b/libbcachefs/lru.h @@ -49,6 +49,9 @@ int bch2_lru_del(struct btree_trans *, u16, u64, u64); int bch2_lru_set(struct btree_trans *, u16, u64, u64); int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); +struct bkey_buf; +int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *); + int bch2_check_lrus(struct bch_fs *); #endif /* _BCACHEFS_LRU_H */ diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 60e4ef94..7d3920e0 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -36,31 +36,6 @@ const char * const bch2_data_ops_strs[] = { NULL }; -static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - printbuf_tabstop_push(out, 20); - prt_str(out, "rewrite ptrs:\t"); - bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); - prt_newline(out); - - prt_str(out, "kill ptrs:\t"); - bch2_prt_u64_base2(out, data_opts->kill_ptrs); - prt_newline(out); - - prt_str(out, "target:\t"); - bch2_target_to_text(out, c, data_opts->target); - prt_newline(out); - - prt_str(out, "compression:\t"); - bch2_compression_opt_to_text(out, background_compression(*io_opts)); - prt_newline(out); - - prt_str(out, "extra replicas:\t"); - prt_u64(out, data_opts->extra_replicas); -} - static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index eb49dd04..f7aa835d 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -290,18 +290,23 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) { - prt_printf(out, "Currently waiting for: "); + printbuf_tabstop_push(out, 32); + 
prt_printf(out, "running:\t%u\n", c->copygc_running); + prt_printf(out, "copygc_wait:\t%llu\n", c->copygc_wait); + prt_printf(out, "copygc_wait_at:\t%llu\n", c->copygc_wait_at); + + prt_printf(out, "Currently waiting for:\t"); prt_human_readable_u64(out, max(0LL, c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)) << 9); prt_newline(out); - prt_printf(out, "Currently waiting since: "); + prt_printf(out, "Currently waiting since:\t"); prt_human_readable_u64(out, max(0LL, atomic64_read(&c->io_clock[WRITE].now) - c->copygc_wait_at) << 9); prt_newline(out); - prt_printf(out, "Currently calculated wait: "); + prt_printf(out, "Currently calculated wait:\t"); prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); prt_newline(out); } @@ -352,19 +357,18 @@ static int bch2_copygc_thread(void *arg) } last = atomic64_read(&clock->now); - wait = bch2_copygc_wait_amount(c); + wait = max_t(long, 0, bch2_copygc_wait_amount(c) - clock->max_slop); - if (wait > clock->max_slop) { + if (wait > 0) { c->copygc_wait_at = last; c->copygc_wait = last + wait; move_buckets_wait(&ctxt, buckets, true); - trace_and_count(c, copygc_wait, c, wait, last + wait); - bch2_kthread_io_clock_wait(clock, last + wait, - MAX_SCHEDULE_TIMEOUT); + trace_and_count(c, copygc_wait, c, wait, c->copygc_wait); + bch2_io_clock_schedule_timeout(clock, c->copygc_wait); continue; } - c->copygc_wait = 0; + c->copygc_wait = c->copygc_wait_at = 0; c->copygc_running = true; ret = bch2_copygc(&ctxt, buckets, &did_work); @@ -379,8 +383,7 @@ static int bch2_copygc_thread(void *arg) min_member_capacity = 128 * 2048; bch2_trans_unlock_long(ctxt.trans); - bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), - MAX_SCHEDULE_TIMEOUT); + bch2_io_clock_schedule_timeout(clock, last + (min_member_capacity >> 8)); } } @@ -396,9 +399,10 @@ static int bch2_copygc_thread(void *arg) void bch2_copygc_stop(struct bch_fs *c) { - if (c->copygc_thread) { - kthread_stop(c->copygc_thread); - put_task_struct(c->copygc_thread); + struct task_struct *t = rcu_dereference_protected(c->copygc_thread, true); + if (t) { + kthread_stop(t); + put_task_struct(t); } c->copygc_thread = NULL; } @@ -425,8 +429,8 @@ int bch2_copygc_start(struct bch_fs *c) get_task_struct(t); - c->copygc_thread = t; - wake_up_process(c->copygc_thread); + rcu_assign_pointer(c->copygc_thread, t); + wake_up_process(t); return 0; } diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h index 67648b77..d1b2f2aa 100644 --- a/libbcachefs/sb-errors_format.h +++ b/libbcachefs/sb-errors_format.h @@ -286,7 +286,8 @@ enum bch_fsck_flags { x(accounting_mismatch, 272, 0) \ x(accounting_replicas_not_marked, 273, 0) \ x(invalid_btree_id, 274, 0) \ - x(alloc_key_io_time_bad, 275, 0) + x(alloc_key_io_time_bad, 275, 0) \ + x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) enum bch_sb_error_id { #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, diff --git a/libbcachefs/super.c b/libbcachefs/super.c index ced63397..0455a100 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -968,7 +968,7 @@ static void print_mount_opts(struct bch_fs *c) struct printbuf p = PRINTBUF; bool first = true; - prt_str(&p, "mounting version "); + prt_str(&p, "starting version "); bch2_version_to_text(&p, c->sb.version); if (c->opts.read_only) { @@ -1766,7 +1766,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (ret) goto err; - ret = bch2_dev_journal_alloc(ca); + ret = bch2_dev_journal_alloc(ca, true); bch_err_msg(c, ret, "allocating journal"); if (ret) goto err; @@ -1928,7 +1928,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) } if (!ca->journal.nr) { - ret = bch2_dev_journal_alloc(ca); + ret = bch2_dev_journal_alloc(ca, false); bch_err_msg(ca, ret, "allocating journal"); if (ret) goto err; diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 8df52a23..7c9cafdc 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -141,6 +141,7 @@ write_attribute(trigger_gc); write_attribute(trigger_discards); write_attribute(trigger_invalidates); write_attribute(trigger_journal_flush); +write_attribute(trigger_journal_writes); write_attribute(trigger_btree_cache_shrink); write_attribute(trigger_btree_key_cache_shrink); rw_attribute(gc_gens_pos); @@ -471,6 +472,9 @@ STORE(bch2_fs) bch2_journal_meta(&c->journal); } + if (attr == &sysfs_trigger_journal_writes) + bch2_journal_do_writes(&c->journal); + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -589,6 +593,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_discards, &sysfs_trigger_invalidates, &sysfs_trigger_journal_flush, + &sysfs_trigger_journal_writes, &sysfs_trigger_btree_cache_shrink, &sysfs_trigger_btree_key_cache_shrink, diff --git a/linux/closure.c b/linux/closure.c index c971216d..116afae2 100644 --- a/linux/closure.c +++ b/linux/closure.c @@ -244,6 +244,9 @@ void closure_debug_destroy(struct closure *cl) { unsigned long flags; + if (cl->magic == CLOSURE_MAGIC_STACK) + return; + BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); cl->magic = CLOSURE_MAGIC_DEAD;