diff --git a/.bcachefs_revision b/.bcachefs_revision
index 521fa21a..d1058838 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-ba398d29060ecc2e2c9d6292a94ddc181761de1a
+a0d7001b0f35580ec941acc553cf5fe28d6efea9
diff --git a/cmd_migrate.c b/cmd_migrate.c
index 4772b3bd..4da3ab1b 100644
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -702,6 +702,7 @@ static int migrate_fs(const char *fs_path,
 	opt_set(opts, sb,	sb_offset);
 	opt_set(opts, nostart,	true);
 	opt_set(opts, noexcl,	true);
+	opt_set(opts, buckets_nouse, true);
 
 	c = bch2_fs_open(path, 1, opts);
 	if (IS_ERR(c))
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index 36c4c884..a21a3923 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -358,7 +358,7 @@ TRACE_EVENT(btree_node_relock_fail,
 	TP_STRUCT__entry(
 		__array(char,			trans_fn, 24	)
-		__array(char,			caller, 32	)
+		__field(unsigned long,		caller_ip	)
 		__field(u8,			btree_id	)
 		__field(u64,			pos_inode	)
 		__field(u64,			pos_offset	)
@@ -370,7 +370,7 @@ TRACE_EVENT(btree_node_relock_fail,
 	TP_fast_assign(
 		strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
-		snprintf(__entry->caller, sizeof(__entry->caller), "%pS", (void *) caller_ip);
+		__entry->caller_ip		= caller_ip;
 		__entry->btree_id		= btree_id;
 		__entry->pos_inode		= pos->inode;
 		__entry->pos_offset		= pos->offset;
@@ -380,9 +380,9 @@ TRACE_EVENT(btree_node_relock_fail,
 		__entry->node_lock_seq	= node_lock_seq;
 	),
 
-	TP_printk("%s %s btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u",
+	TP_printk("%s %pS btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u",
 		  __entry->trans_fn,
-		  __entry->caller,
+		  (void *) __entry->caller_ip,
 		  __entry->btree_id,
 		  __entry->pos_inode,
 		  __entry->pos_offset,
@@ -673,7 +673,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter,
 	TP_STRUCT__entry(
 		__array(char,			trans_fn, 24	)
-		__array(char,			caller, 32	)
+		__field(unsigned long,		caller_ip	)
 		__field(u8,			btree_id	)
 		__field(u64,			pos_inode	)
 		__field(u64,			pos_offset	)
@@ -682,16 +682,16 @@ DECLARE_EVENT_CLASS(transaction_restart_iter,
 	TP_fast_assign(
 		strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
-		snprintf(__entry->caller, sizeof(__entry->caller), "%pS", (void *) caller_ip);
+		__entry->caller_ip		= caller_ip;
 		__entry->btree_id		= btree_id;
 		__entry->pos_inode		= pos->inode;
 		__entry->pos_offset		= pos->offset;
 		__entry->pos_snapshot		= pos->snapshot;
 	),
 
-	TP_printk("%s %s btree %u pos %llu:%llu:%u",
+	TP_printk("%s %pS btree %u pos %llu:%llu:%u",
 		  __entry->trans_fn,
-		  __entry->caller,
+		  (void *) __entry->caller_ip,
 		  __entry->btree_id,
 		  __entry->pos_inode,
 		  __entry->pos_offset,
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index 0a634125..9b81ed26 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -151,22 +151,6 @@ static void open_bucket_free_unused(struct bch_fs *c,
 	}
 }
 
-static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-	struct open_bucket *ob;
-	unsigned i;
-
-	rcu_read_lock();
-	open_bucket_for_each(c, obs, ob, i) {
-		struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
-
-		BUG_ON(*bucket_gen(ca, ob->bucket) != ob->gen);
-	}
-	rcu_read_unlock();
-#endif
-}
-
 /* _only_ for allocating the journal on a new device: */
 long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
 {
@@ -857,8 +841,6 @@ alloc_done:
 
 	BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
 
-	verify_not_stale(c, &wp->ptrs);
-
 	return wp;
err:
 	open_bucket_for_each(c, &wp->ptrs, ob, i)
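Note on the tracepoint change above: these events previously burned 32 bytes per entry on a string preformatted with snprintf("%pS") in the hot TP_fast_assign stage. Storing the raw caller_ip and printing it with %pS in TP_printk moves symbolization to trace read time. A minimal sketch of the pattern — the event name and arguments here are illustrative, not part of this patch:

	#include <linux/tracepoint.h>

	TRACE_EVENT(example_caller_ip,
		TP_PROTO(unsigned long caller_ip),
		TP_ARGS(caller_ip),

		TP_STRUCT__entry(
			__field(unsigned long,	caller_ip	)
		),

		TP_fast_assign(
			/* record time: store 8 bytes, no string formatting */
			__entry->caller_ip = caller_ip;
		),

		/* read time: the %pS vsprintf extension symbolizes the address */
		TP_printk("%pS", (void *) __entry->caller_ip)
	);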
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index eec02f8a..0e9689f6 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -281,9 +281,6 @@ do {								\
 		"significantly affect performance")		\
 	BCH_DEBUG_PARAM(debug_check_iterators,			\
 		"Enables extra verification for btree iterators")	\
-	BCH_DEBUG_PARAM(debug_check_bkeys,			\
-		"Run bkey_debugcheck (primarily checking GC/allocation "\
-		"information) when iterating over keys")	\
 	BCH_DEBUG_PARAM(debug_check_btree_accounting,		\
 		"Verify btree accounting for keys within a node")	\
 	BCH_DEBUG_PARAM(journal_seq_verify,			\
@@ -807,6 +804,7 @@ struct bch_fs {
 	 * it's not while a gc is in progress.
 	 */
 	struct rw_semaphore	gc_lock;
+	struct mutex		gc_gens_lock;
 
 	/* IO PATH */
 	struct semaphore	io_in_flight;
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 5c900cf8..e83aeb68 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -212,22 +212,6 @@ const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
 	return NULL;
 }
 
-void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
-{
-	const char *invalid;
-
-	BUG_ON(!k.k->u64s);
-
-	invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?:
-		bch2_bkey_in_btree_node(b, k);
-	if (invalid) {
-		char buf[160];
-
-		bch2_bkey_val_to_text(&PBUF(buf), c, k);
-		bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid);
-	}
-}
-
 void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
 {
 	if (!bpos_cmp(pos, POS_MIN))
diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h
index 3012035d..4fdac545 100644
--- a/libbcachefs/bkey_methods.h
+++ b/libbcachefs/bkey_methods.h
@@ -34,8 +34,6 @@ const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
 			      enum btree_node_type);
 const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
 
-void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
-
 void bch2_bpos_to_text(struct printbuf *, struct bpos);
 void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
 void bch2_val_to_text(struct printbuf *, struct bch_fs *,
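With bch2_bkey_debugcheck() and the debug_check_bkeys parameter gone, there is no longer a global knob that validates every key on iteration. A caller that wants an explicit one-off check can still compose the two predicates left in bkey_methods.h; this sketch simply inlines the removed helper's logic (the function name is hypothetical):

	static void check_key_by_hand(struct bch_fs *c, struct btree *b,
				      struct bkey_s_c k)
	{
		const char *invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?:
			bch2_bkey_in_btree_node(b, k);

		if (invalid) {
			char buf[160];

			bch2_bkey_val_to_text(&PBUF(buf), c, k);
			bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid);
		}
	}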
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index d9944fb8..648779cc 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -726,11 +726,9 @@ fsck_err:
 static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
 			    unsigned level, bool is_root,
 			    struct bkey_s_c *k,
-			    u8 *max_stale, bool initial)
+			    bool initial)
 {
 	struct bch_fs *c = trans->c;
-	struct bkey_ptrs_c ptrs;
-	const struct bch_extent_ptr *ptr;
 	struct bkey deleted = KEY(0, 0, 0);
 	struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
 	unsigned flags =
@@ -755,17 +753,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
 		atomic64_set(&c->key_version, k->k->version.lo);
 	}
 
-	ptrs = bch2_bkey_ptrs_c(*k);
-	bkey_for_each_ptr(ptrs, ptr) {
-		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-		struct bucket *g = PTR_GC_BUCKET(ca, ptr);
-
-		if (gen_after(g->oldest_gen, ptr->gen))
-			g->oldest_gen = ptr->gen;
-
-		*max_stale = max(*max_stale, ptr_stale(ca, ptr));
-	}
-
 	ret = bch2_mark_key(trans, old, *k, flags);
fsck_err:
err:
@@ -774,8 +761,7 @@ err:
 	return ret;
 }
 
-static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *max_stale,
-			      bool initial)
+static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_node_iter iter;
@@ -784,8 +770,6 @@ static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *ma
 	struct bkey_buf prev, cur;
 	int ret = 0;
 
-	*max_stale = 0;
-
 	if (!btree_node_type_needs_gc(btree_node_type(b)))
 		return 0;
 
@@ -796,7 +780,7 @@ static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *ma
 	while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
 		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
-				       &k, max_stale, initial);
+				       &k, initial);
 		if (ret)
 			break;
 
@@ -827,7 +811,6 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
 		: bch2_expensive_debug_checks		? 0
 		: !btree_node_type_needs_gc(btree_id)	? 1
 		: 0;
-	u8 max_stale = 0;
 	int ret = 0;
 
 	gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
@@ -838,21 +821,9 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
 		gc_pos_set(c, gc_pos_btree_node(b));
 
-		ret = btree_gc_mark_node(trans, b, &max_stale, initial);
+		ret = btree_gc_mark_node(trans, b, initial);
 		if (ret)
 			break;
-
-		if (!initial) {
-			if (max_stale > 64)
-				bch2_btree_node_rewrite(trans, &iter, b,
-						BTREE_INSERT_NOWAIT|
-						BTREE_INSERT_GC_LOCK_HELD);
-			else if (!bch2_btree_gc_rewrite_disabled &&
-				 (bch2_btree_gc_always_rewrite || max_stale > 16))
-				bch2_btree_node_rewrite(trans, &iter,
-						b, BTREE_INSERT_NOWAIT|
-						BTREE_INSERT_GC_LOCK_HELD);
-		}
 	}
 	bch2_trans_iter_exit(trans, &iter);
 
@@ -864,8 +835,8 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
 		if (!btree_node_fake(b)) {
 			struct bkey_s_c k = bkey_i_to_s_c(&b->key);
 
-			ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true,
-					       &k, &max_stale, initial);
+			ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
+					       true, &k, initial);
 		}
 		gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
 		mutex_unlock(&c->btree_root_lock);
@@ -880,7 +851,6 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
 	struct btree_and_journal_iter iter;
 	struct bkey_s_c k;
 	struct bkey_buf cur, prev;
-	u8 max_stale = 0;
 	char buf[200];
 	int ret = 0;
 
@@ -893,8 +863,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
 		BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
 		BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
 
-		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
-				       &k, &max_stale, true);
+		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
+				       false, &k, true);
 		if (ret) {
 			bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret);
 			goto fsck_err;
@@ -985,7 +955,6 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
 		: bch2_expensive_debug_checks		? 0
 		: !btree_node_type_needs_gc(btree_id)	? 1
 		: 0;
-	u8 max_stale = 0;
 	char buf[100];
 	int ret = 0;
 
@@ -1018,7 +987,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
 		struct bkey_s_c k = bkey_i_to_s_c(&b->key);
 
 		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true,
-				       &k, &max_stale, true);
+				       &k, true);
 	}
fsck_err:
 	six_unlock_read(&b->c.lock);
@@ -1313,7 +1282,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 		.dev		= iter->pos.inode,
 		.bucket		= iter->pos.offset,
 		.gen		= g->mark.gen,
-		.oldest_gen	= g->oldest_gen,
 		.data_type	= g->mark.data_type,
 		.dirty_sectors	= g->mark.dirty_sectors,
 		.cached_sectors	= g->mark.cached_sectors,
@@ -1330,8 +1298,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 	    gc_u.data_type != BCH_DATA_btree)
 		return 0;
 
-	if (!bkey_alloc_unpacked_cmp(old_u, gc_u) ||
-	    gen_after(old_u.gen, gc_u.gen))
+	if (gen_after(old_u.gen, gc_u.gen))
 		return 0;
 
 #define copy_bucket_field(_f)					\
@@ -1353,8 +1320,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 	copy_bucket_field(stripe);
 #undef copy_bucket_field
 
-	new_u.oldest_gen = gc_u.oldest_gen;
-
 	if (!bkey_alloc_unpacked_cmp(old_u, new_u))
 		return 0;
 
@@ -1905,6 +1870,9 @@ int bch2_gc_gens(struct bch_fs *c)
 	 * introduces a deadlock in the RO path - we currently take the state
 	 * lock at the start of going RO, thus the gc thread may get stuck:
 	 */
+	if (!mutex_trylock(&c->gc_gens_lock))
+		return 0;
+
 	down_read(&c->gc_lock);
 	bch2_trans_init(&trans, c, 0, 0);
 
@@ -1964,6 +1932,7 @@ err:
 	bch2_trans_exit(&trans);
 	up_read(&c->gc_lock);
+	mutex_unlock(&c->gc_gens_lock);
 
 	return ret;
 }
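bch2_gc_gens() is now self-excluding via the new gc_gens_lock rather than relying on its callers: a second invocation returns immediately instead of queueing up behind the first. The guard pattern, reduced to its shape (function name invented, body elided):

	int run_gc_gens_once(struct bch_fs *c)
	{
		if (!mutex_trylock(&c->gc_gens_lock))
			return 0;	/* someone else is already doing the work */

		down_read(&c->gc_lock);
		/* ... walk buckets, update generation info ... */
		up_read(&c->gc_lock);

		mutex_unlock(&c->gc_gens_lock);
		return 0;
	}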
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index b9cb2ebf..ae63ecbc 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -58,6 +58,9 @@ static inline int __btree_path_cmp(const struct btree_path *l,
 				   struct bpos	r_pos,
 				   unsigned	r_level)
 {
+	/*
+	 * Must match lock ordering as defined by __bch2_btree_node_lock:
+	 */
 	return   cmp_int(l->btree_id,	r_btree_id) ?:
 		 cmp_int((int) l->cached,	(int) r_cached) ?:
 		 bpos_cmp(l->pos,	r_pos) ?:
@@ -162,7 +165,7 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b)
 	else
 		this_cpu_sub(*b->c.lock.readers, readers);
 
-	btree_node_lock_type(trans->c, b, SIX_LOCK_write);
+	six_lock_write(&b->c.lock, NULL, NULL);
 
 	if (!b->c.lock.readers)
 		atomic64_add(__SIX_VAL(read_lock, readers),
@@ -300,10 +303,8 @@ bool __bch2_btree_node_lock(struct btree_trans *trans,
 			    six_lock_should_sleep_fn should_sleep_fn, void *p,
 			    unsigned long ip)
 {
-	struct btree_path *linked, *deadlock_path = NULL;
-	u64 start_time = local_clock();
-	unsigned reason = 9;
-	bool ret;
+	struct btree_path *linked;
+	unsigned reason;
 
 	/* Check if it's safe to block: */
 	trans_for_each_path(trans, linked) {
@@ -324,28 +325,28 @@ bool __bch2_btree_node_lock(struct btree_trans *trans,
 		 */
 		if (type == SIX_LOCK_intent &&
 		    linked->nodes_locked != linked->nodes_intent_locked) {
-			deadlock_path = linked;
 			reason = 1;
+			goto deadlock;
 		}
 
 		if (linked->btree_id != path->btree_id) {
-			if (linked->btree_id > path->btree_id) {
-				deadlock_path = linked;
-				reason = 3;
-			}
-			continue;
+			if (linked->btree_id < path->btree_id)
+				continue;
+
+			reason = 3;
+			goto deadlock;
 		}
 
 		/*
-		 * Within the same btree, cached paths come before non
-		 * cached paths:
+		 * Within the same btree, non-cached paths come before cached
+		 * paths:
 		 */
 		if (linked->cached != path->cached) {
-			if (path->cached) {
-				deadlock_path = linked;
-				reason = 4;
-			}
-			continue;
+			if (!linked->cached)
+				continue;
+
+			reason = 4;
+			goto deadlock;
 		}
 
 		/*
@@ -354,50 +355,33 @@ bool __bch2_btree_node_lock(struct btree_trans *trans,
 		 * we're about to lock, it must have the ancestors locked too:
 		 */
 		if (level > __fls(linked->nodes_locked)) {
-			deadlock_path = linked;
 			reason = 5;
+			goto deadlock;
 		}
 
 		/* Must lock btree nodes in key order: */
 		if (btree_node_locked(linked, level) &&
 		    bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
 						 linked->cached)) <= 0) {
-			deadlock_path = linked;
-			reason = 7;
 			BUG_ON(trans->in_traverse_all);
+			reason = 7;
+			goto deadlock;
 		}
 	}
 
-	if (unlikely(deadlock_path)) {
-		trace_trans_restart_would_deadlock(trans->fn, ip,
-				trans->in_traverse_all, reason,
-				deadlock_path->btree_id,
-				deadlock_path->cached,
-				&deadlock_path->pos,
-				path->btree_id,
-				path->cached,
-				&pos);
-		btree_trans_restart(trans);
-		return false;
-	}
-
-	if (six_trylock_type(&b->c.lock, type))
-		return true;
-
-	trans->locking_path_idx = path->idx;
-	trans->locking_pos	= pos;
-	trans->locking_btree_id	= path->btree_id;
-	trans->locking_level	= level;
-	trans->locking		= b;
-
-	ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0;
-
-	trans->locking = NULL;
-
-	if (ret)
-		bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
-				       start_time);
-	return ret;
+	return btree_node_lock_type(trans, path, b, pos, level,
+				    type, should_sleep_fn, p);
+deadlock:
+	trace_trans_restart_would_deadlock(trans->fn, ip,
+			trans->in_traverse_all, reason,
+			linked->btree_id,
+			linked->cached,
+			&linked->pos,
+			path->btree_id,
+			path->cached,
+			&pos);
+	btree_trans_restart(trans);
+	return false;
 }
 
 /* Btree iterator locking: */
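The refactored checks above enforce one global lock acquisition order — by btree id, then non-cached before cached paths within a btree, then ascending key position — and __btree_path_cmp now carries a comment saying it must sort paths in exactly that order. A standalone illustration of such a cmp_int chain (types and names are invented for the example; compiles with GCC/Clang thanks to the ?: extension the kernel also uses):

	#include <stdio.h>

	#define cmp_int(a, b)	(((a) > (b)) - ((a) < (b)))

	struct path_key {
		int		btree_id;	/* first: which btree */
		int		cached;		/* then: non-cached before cached */
		long long	pos;		/* then: key position, ascending */
	};

	static int lock_order(const struct path_key *l, const struct path_key *r)
	{
		return cmp_int(l->btree_id, r->btree_id) ?:
		       cmp_int(l->cached, r->cached) ?:
		       cmp_int(l->pos, r->pos);
	}

	int main(void)
	{
		struct path_key a = { 0, 0, 10 }, b = { 0, 1, 5 };

		/* non-cached (a) orders before cached (b) in the same btree: */
		printf("%d\n", lock_order(&a, &b));	/* prints -1 */
		return 0;
	}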
But this is ok, because - * those keys are never written out, we just have to avoid a spurious - * assertion here: - */ - if (bch2_debug_check_bkeys && !bkey_deleted(ret.k)) - bch2_bkey_debugcheck(c, l->b, ret); - - return ret; + return bkey_disassemble(l->b, k, u); } static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, @@ -1504,17 +1474,17 @@ retry_all: while (i < trans->nr_sorted) { path = trans->paths + trans->sorted[i]; - EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); - - ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); - if (ret) - goto retry_all; - - EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); - - if (path->nodes_locked || - !btree_path_node(path, path->level)) + /* + * Traversing a path can cause another path to be added at about + * the same position: + */ + if (path->uptodate) { + ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); + if (ret) + goto retry_all; + } else { i++; + } } /* @@ -3092,6 +3062,8 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *fn) __acquires(&c->btree_trans_barrier) { + BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); + memset(trans, 0, sizeof(*trans)); trans->c = c; trans->fn = fn; @@ -3213,6 +3185,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) struct btree_trans *trans; struct btree_path *path; struct btree *b; + static char lock_types[] = { 'r', 'i', 'w' }; unsigned l; mutex_lock(&c->btree_trans_lock); @@ -3249,10 +3222,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) b = READ_ONCE(trans->locking); if (b) { path = &trans->paths[trans->locking_path_idx]; - pr_buf(out, " locking path %u %c l=%u %s:", + pr_buf(out, " locking path %u %c l=%u %c %s:", trans->locking_path_idx, path->cached ? 
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index df016c98..928aab61 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -320,7 +320,6 @@ retry:
 		if (!trans->restarted)
 			goto retry;
 
-		trace_transaction_restart_ip(trans->fn, _THIS_IP_);
 		ret = -EINTR;
 		goto err;
 	}
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index d599008c..b4434eca 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -128,23 +128,35 @@ static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
 	}
 }
 
-/*
- * wrapper around six locks that just traces lock contended time
- */
-static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b,
-					  enum six_lock_type type)
+static inline bool btree_node_lock_type(struct btree_trans *trans,
+					struct btree_path *path,
+					struct btree *b,
+					struct bpos pos, unsigned level,
+					enum six_lock_type type,
+					six_lock_should_sleep_fn should_sleep_fn, void *p)
 {
-	u64 start_time = local_clock();
+	struct bch_fs *c = trans->c;
+	u64 start_time;
+	bool ret;
 
-	six_lock_type(&b->c.lock, type, NULL, NULL);
-	bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
-}
+	if (six_trylock_type(&b->c.lock, type))
+		return true;
 
-static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
-					enum six_lock_type type)
-{
-	if (!six_trylock_type(&b->c.lock, type))
-		__btree_node_lock_type(c, b, type);
+	start_time = local_clock();
+
+	trans->locking_path_idx		= path->idx;
+	trans->locking_pos		= pos;
+	trans->locking_btree_id		= path->btree_id;
+	trans->locking_level		= level;
+	trans->locking_lock_type	= type;
+	trans->locking			= b;
+	ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0;
+	trans->locking = NULL;
+
+	if (ret)
+		bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
+
+	return ret;
 }
 
 /*
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 989129f9..68272f26 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -377,6 +377,7 @@ struct btree_trans {
 	struct bpos		locking_pos;
 	u8			locking_btree_id;
 	u8			locking_level;
+	u8			locking_lock_type;
 	pid_t			pid;
 	int			srcu_idx;
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index a0f7a9f0..088c3204 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -620,8 +620,8 @@ err:
 	 * we're in journal error state:
 	 */
 
-	btree_node_lock_type(c, b, SIX_LOCK_intent);
-	btree_node_lock_type(c, b, SIX_LOCK_write);
+	six_lock_intent(&b->c.lock, NULL, NULL);
+	six_lock_write(&b->c.lock, NULL, NULL);
 
 	mutex_lock(&c->btree_interior_update_lock);
 	list_del(&as->write_blocked_list);
@@ -675,7 +675,7 @@ err:
 	for (i = 0; i < as->nr_new_nodes; i++) {
 		b = as->new_nodes[i];
 
-		btree_node_lock_type(c, b, SIX_LOCK_read);
+		six_lock_read(&b->c.lock, NULL, NULL);
 		btree_node_write_if_need(c, b, SIX_LOCK_read);
 		six_unlock_read(&b->c.lock);
 	}
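The new btree_node_lock_type() folds the old trylock-then-sleep wrapper together with the bookkeeping that __bch2_btree_node_lock used to do inline: on the slow path it publishes what the transaction is blocked on (trans->locking plus position, level and lock type) so bch2_btree_trans_to_text() can print it, and it only counts contended acquisitions in the time stats. A hypothetical caller, just to show the calling convention (sites that don't need the bookkeeping, like the interior-update paths above, now call six_lock_*() directly):

	static void lock_for_insert(struct btree_trans *trans,
				    struct btree_path *path, struct btree *b)
	{
		/* fast path: an uncontended trylock records nothing */
		btree_node_lock_type(trans, path, b, path->pos, path->level,
				     SIX_LOCK_write, NULL, NULL);
		/*
		 * slow path (inside the helper): trans->locking points at b
		 * while we sleep, and is cleared once the lock is acquired
		 */
	}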
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 0e672164..4b37a486 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -168,7 +168,7 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
 	struct btree_write *w = container_of(pin, struct btree_write, journal);
 	struct btree *b = container_of(w, struct btree, writes[i]);
 
-	btree_node_lock_type(c, b, SIX_LOCK_read);
+	six_lock_read(&b->c.lock, NULL, NULL);
 	bch2_btree_node_write_cond(c, b,
 		(btree_current_write(b) == w && w->journal.seq == seq));
 	six_unlock_read(&b->c.lock);
@@ -619,8 +619,10 @@ static inline int trans_lock_write(struct btree_trans *trans)
 			if (have_conflicting_read_lock(trans, i->path))
 				goto fail;
 
-			__btree_node_lock_type(trans->c, insert_l(i)->b,
-					       SIX_LOCK_write);
+			btree_node_lock_type(trans, i->path,
+					     insert_l(i)->b,
+					     i->path->pos, i->level,
+					     SIX_LOCK_write, NULL, NULL);
 		}
 
 		bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 50fcc075..ea331c6e 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -666,49 +666,50 @@ static int check_bucket_ref(struct bch_fs *c,
 			    struct bkey_s_c k,
 			    const struct bch_extent_ptr *ptr,
 			    s64 sectors, enum bch_data_type ptr_data_type,
-			    u8 bucket_gen, u8 bucket_data_type,
+			    u8 b_gen, u8 bucket_data_type,
 			    u16 dirty_sectors, u16 cached_sectors)
 {
-	size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr);
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+	size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
 	u16 bucket_sectors = !ptr->cached
 		? dirty_sectors
 		: cached_sectors;
 	char buf[200];
 
-	if (gen_after(ptr->gen, bucket_gen)) {
+	if (gen_after(ptr->gen, b_gen)) {
 		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
 			"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
 			"while marking %s",
-			ptr->dev, bucket_nr, bucket_gen,
+			ptr->dev, bucket_nr, b_gen,
 			bch2_data_types[bucket_data_type ?: ptr_data_type],
 			ptr->gen,
 			(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
 		return -EIO;
 	}
 
-	if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
+	if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
 		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
 			"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
 			"while marking %s",
-			ptr->dev, bucket_nr, bucket_gen,
+			ptr->dev, bucket_nr, b_gen,
 			bch2_data_types[bucket_data_type ?: ptr_data_type],
 			ptr->gen,
 			(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
 		return -EIO;
 	}
 
-	if (bucket_gen != ptr->gen && !ptr->cached) {
+	if (b_gen != ptr->gen && !ptr->cached) {
 		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
 			"bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n"
 			"while marking %s",
-			ptr->dev, bucket_nr, bucket_gen,
+			ptr->dev, bucket_nr, b_gen,
 			bch2_data_types[bucket_data_type ?: ptr_data_type],
 			ptr->gen,
 			(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
 		return -EIO;
 	}
 
-	if (bucket_gen != ptr->gen)
+	if (b_gen != ptr->gen)
 		return 1;
 
 	if (bucket_data_type && ptr_data_type &&
@@ -716,7 +717,7 @@ static int check_bucket_ref(struct bch_fs *c,
 		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
 			"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
 			"while marking %s",
-			ptr->dev, bucket_nr, bucket_gen,
+			ptr->dev, bucket_nr, b_gen,
 			bch2_data_types[bucket_data_type],
 			bch2_data_types[ptr_data_type],
 			(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
@@ -725,9 +726,10 @@ static int check_bucket_ref(struct bch_fs *c,
 
 	if ((unsigned) (bucket_sectors + sectors) > U16_MAX) {
 		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
-			"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n"
+			"bucket %u:%zu gen %u (mem gen %u) data type %s sector count overflow: %u + %lli > U16_MAX\n"
 			"while marking %s",
-			ptr->dev, bucket_nr, bucket_gen,
+			ptr->dev, bucket_nr, b_gen,
+			*bucket_gen(ca, bucket_nr),
 			bch2_data_types[bucket_data_type ?: ptr_data_type],
 			bucket_sectors, sectors,
 			(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
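The bucket_gen → b_gen rename above makes room for the bucket_gen() accessor in the overflow message, which now prints both the gen the caller passed in and the current in-memory gen. It is worth spelling out why these comparisons go through gen_cmp()/gen_after() rather than plain '<': bucket generations are 8-bit counters that wrap, so ordering is defined by signed difference, valid while the two gens are within 128 of each other. A standalone sketch of the usual trick — see the tree's own buckets.h for the authoritative definitions:

	#include <stdint.h>
	#include <stdio.h>

	static int gen_cmp(uint8_t a, uint8_t b)
	{
		return (int8_t) (a - b);	/* signed difference handles wrap */
	}

	static int gen_after(uint8_t a, uint8_t b)
	{
		return gen_cmp(a, b) > 0;
	}

	int main(void)
	{
		/* 1 is "after" 255 once the counter wraps: */
		printf("%d\n", gen_after(1, 255));	/* 1 */
		printf("%d\n", gen_cmp(255, 250));	/* 5: ptr is 5 gens stale */
		return 0;
	}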
@@ -2141,9 +2143,10 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 			    GFP_KERNEL|__GFP_ZERO)) ||
 	    !(bucket_gens	= kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
 			    GFP_KERNEL|__GFP_ZERO)) ||
-	    !(buckets_nouse	= kvpmalloc(BITS_TO_LONGS(nbuckets) *
+	    (c->opts.buckets_nouse &&
+	     !(buckets_nouse	= kvpmalloc(BITS_TO_LONGS(nbuckets) *
 					    sizeof(unsigned long),
-					    GFP_KERNEL|__GFP_ZERO)) ||
+					    GFP_KERNEL|__GFP_ZERO))) ||
 	    !init_fifo(&free[RESERVE_MOVINGGC],
 		       copygc_reserve, GFP_KERNEL) ||
 	    !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
@@ -2176,9 +2179,10 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 		memcpy(bucket_gens->b,
 		       old_bucket_gens->b,
 		       n);
-		memcpy(buckets_nouse,
-		       ca->buckets_nouse,
-		       BITS_TO_LONGS(n) * sizeof(unsigned long));
+		if (buckets_nouse)
+			memcpy(buckets_nouse,
+			       ca->buckets_nouse,
+			       BITS_TO_LONGS(n) * sizeof(unsigned long));
 	}
 
 	rcu_assign_pointer(ca->buckets[0], buckets);
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 3b9b96e5..1d0871f6 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -1062,8 +1062,6 @@ retry:
 		sectors = min(sectors, k.k->size - offset_into_extent);
 
-		bch2_trans_unlock(trans);
-
 		if (readpages_iter)
 			readpage_bio_extend(readpages_iter, &rbio->bio, sectors,
 					    extent_partial_reads_expensive(k));
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 43b6159b..ced4d671 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -1316,8 +1316,9 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
 		if (i->inode.bi_nlink == i->count)
 			continue;
 
-		count2 = lockrestart_do(trans,
-				bch2_count_subdirs(trans, w->cur_inum, i->snapshot));
+		count2 = bch2_count_subdirs(trans, w->cur_inum, i->snapshot);
+		if (count2 < 0)
+			return count2;
 
 		if (i->count != count2) {
 			bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
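Until now the buckets_nouse bitmap was allocated unconditionally — one bit per bucket that only the migrate path actually uses. With the new option (defined in opts.h further down, and set by cmd_migrate at the top of this patch), allocation is opt-in, and every consumer must tolerate a NULL bitmap. The shape of the pattern, condensed from bch2_dev_buckets_resize():

	unsigned long *buckets_nouse = NULL;

	if (c->opts.buckets_nouse) {
		buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
					  sizeof(unsigned long),
					  GFP_KERNEL|__GFP_ZERO);
		if (!buckets_nouse)
			return -ENOMEM;
	}
	/* ... */
	if (buckets_nouse)	/* copy only when the bitmap exists */
		memcpy(buckets_nouse, ca->buckets_nouse,
		       BITS_TO_LONGS(n) * sizeof(unsigned long));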
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index f0e93de4..10f8b3ae 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -1953,6 +1953,33 @@ err:
 	return ret;
 }
 
+static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
+						   struct bkey_s_c k,
+						   struct bch_extent_ptr ptr)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
+	struct btree_iter iter;
+	char buf[200];
+	int ret;
+
+	bch2_bkey_val_to_text(&PBUF(buf), c, k);
+	bch2_fs_inconsistent(c, "Attempting to read from stale dirty pointer: %s", buf);
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+			     POS(ptr.dev, PTR_BUCKET_NR(ca, &ptr)),
+			     BTREE_ITER_CACHED);
+
+	ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+	if (ret)
+		return;
+
+	bch2_bkey_val_to_text(&PBUF(buf), c, k);
+	bch_err(c, "%s", buf);
+	bch_err(c, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
+	bch2_trans_iter_exit(trans, &iter);
+}
+
 int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
 		       struct bvec_iter iter, struct bpos read_pos,
 		       enum btree_id data_btree, struct bkey_s_c k,
@@ -1962,7 +1989,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
 	struct bch_fs *c = trans->c;
 	struct extent_ptr_decoded pick;
 	struct bch_read_bio *rbio = NULL;
-	struct bch_dev *ca;
+	struct bch_dev *ca = NULL;
 	struct promote_op *promote = NULL;
 	bool bounce = false, read_full = false, narrow_crcs = false;
 	struct bpos data_pos = bkey_start_pos(k.k);
@@ -1979,7 +2006,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
 		zero_fill_bio_iter(&orig->bio, iter);
 		goto out_read_done;
 	}
-
+retry_pick:
 	pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
 
 	/* hole or reservation - just zero fill: */
@@ -1992,8 +2019,20 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
 		goto err;
 	}
 
-	if (pick_ret > 0)
-		ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+	ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+	if (!pick.ptr.cached &&
+	    unlikely(ptr_stale(ca, &pick.ptr))) {
+		read_from_stale_dirty_pointer(trans, k, pick.ptr);
+		bch2_mark_io_failure(failed, &pick);
+		goto retry_pick;
+	}
+
+	/*
+	 * Unlock the iterator while the btree node's lock is still in
+	 * cache, before doing the IO:
+	 */
+	bch2_trans_unlock(trans);
 
 	if (flags & BCH_READ_NODECODE) {
 		/*
@@ -2281,12 +2320,6 @@ retry:
 		 */
 		sectors = min(sectors, k.k->size - offset_into_extent);
 
-		/*
-		 * Unlock the iterator while the btree node's lock is still in
-		 * cache, before doing the IO:
-		 */
-		bch2_trans_unlock(&trans);
-
 		bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
 		swap(bvec_iter.bi_size, bytes);
 
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 651828b8..b5c204e7 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -299,11 +299,17 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs
 					     struct jset_entry *entry)
 {
 	struct bkey_i *k;
+	bool first = true;
 
-	pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
-
-	vstruct_for_each(entry, k)
+	vstruct_for_each(entry, k) {
+		if (!first) {
+			printbuf_newline(out);
+			pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+		}
+		pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
 		bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
+		first = false;
+	}
 }
 
 static int journal_entry_btree_root_validate(struct bch_fs *c,
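The read path now cross-checks a non-cached pointer's generation against the in-memory bucket gen before issuing IO. A stale dirty pointer means the key and the allocator disagree — a filesystem inconsistency — so read_from_stale_dirty_pointer() dumps the extent plus the corresponding alloc key, and the pointer is then treated like an IO failure so that device selection retries with the remaining replicas. Condensed control flow (the wrapper function name is invented; error handling elided):

	static int read_one_extent(struct btree_trans *trans, struct bkey_s_c k,
				   struct bch_io_failures *failed)
	{
		struct bch_fs *c = trans->c;
		struct extent_ptr_decoded pick;
		struct bch_dev *ca;
		int pick_ret;
	retry_pick:
		pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
		if (pick_ret <= 0)
			return pick_ret;	/* hole, reservation, or no device left */

		ca = bch_dev_bkey_exists(c, pick.ptr.dev);

		/*
		 * A dirty pointer whose gen doesn't match the bucket gen means
		 * the key and the allocator disagree - log it and fall back to
		 * another replica rather than reading garbage:
		 */
		if (!pick.ptr.cached && ptr_stale(ca, &pick.ptr)) {
			read_from_stale_dirty_pointer(trans, k, pick.ptr);
			bch2_mark_io_failure(failed, &pick);
			goto retry_pick;
		}

		/* ... issue the read against pick ... */
		return 0;
	}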
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 3e3dcec3..7ca7ce39 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -487,19 +487,22 @@ static void move_read_endio(struct bio *bio)
 	closure_put(&ctxt->cl);
 }
 
-static void do_pending_writes(struct moving_context *ctxt)
+static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *trans)
 {
 	struct moving_io *io;
 
+	if (trans)
+		bch2_trans_unlock(trans);
+
 	while ((io = next_pending_write(ctxt))) {
 		list_del(&io->list);
 		closure_call(&io->cl, move_write, NULL, &ctxt->cl);
 	}
 }
 
-#define move_ctxt_wait_event(_ctxt, _cond)			\
+#define move_ctxt_wait_event(_ctxt, _trans, _cond)		\
 do {								\
-	do_pending_writes(_ctxt);				\
+	do_pending_writes(_ctxt, _trans);			\
 								\
 	if (_cond)						\
 		break;						\
@@ -507,11 +510,12 @@ do {								\
 			     next_pending_write(_ctxt) || (_cond));	\
 } while (1)
 
-static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
+static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
+				       struct btree_trans *trans)
 {
 	unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
 
-	move_ctxt_wait_event(ctxt,
+	move_ctxt_wait_event(ctxt, trans,
 		!atomic_read(&ctxt->write_sectors) ||
 		atomic_read(&ctxt->write_sectors) != sectors_pending);
 }
@@ -533,14 +537,6 @@ static int bch2_move_extent(struct btree_trans *trans,
 	unsigned sectors = k.k->size, pages;
 	int ret = -ENOMEM;
 
-	move_ctxt_wait_event(ctxt,
-		atomic_read(&ctxt->write_sectors) <
-		SECTORS_IN_FLIGHT_PER_DEVICE);
-
-	move_ctxt_wait_event(ctxt,
-		atomic_read(&ctxt->read_sectors) <
-		SECTORS_IN_FLIGHT_PER_DEVICE);
-
 	/* write path might have to decompress data: */
 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
 		sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
@@ -691,12 +687,19 @@ static int __bch2_move_data(struct bch_fs *c,
 			schedule_timeout(delay);
 
 			if (unlikely(freezing(current))) {
-				bch2_trans_unlock(&trans);
-				move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
+				move_ctxt_wait_event(ctxt, &trans, list_empty(&ctxt->reads));
 				try_to_freeze();
 			}
 		} while (delay);
 
+		move_ctxt_wait_event(ctxt, &trans,
+				     atomic_read(&ctxt->write_sectors) <
+				     SECTORS_IN_FLIGHT_PER_DEVICE);
+
+		move_ctxt_wait_event(ctxt, &trans,
+				     atomic_read(&ctxt->read_sectors) <
+				     SECTORS_IN_FLIGHT_PER_DEVICE);
+
 		bch2_trans_begin(&trans);
 
 		k = bch2_btree_iter_peek(&iter);
@@ -748,10 +751,12 @@ static int __bch2_move_data(struct bch_fs *c,
 			BUG();
 		}
 
-		/* unlock before doing IO: */
+		/*
+		 * The iterator gets unlocked by __bch2_read_extent - need to
+		 * save a copy of @k elsewhere:
+		 */
 		bch2_bkey_buf_reassemble(&sk, c, k);
 		k = bkey_i_to_s_c(sk.k);
-		bch2_trans_unlock(&trans);
 
 		ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k,
 					data_cmd, data_opts);
@@ -761,7 +766,7 @@ static int __bch2_move_data(struct bch_fs *c,
 
 			if (ret2 == -ENOMEM) {
 				/* memory allocation failure, wait for some IO to finish */
-				bch2_move_ctxt_wait_for_io(ctxt);
+				bch2_move_ctxt_wait_for_io(ctxt, &trans);
 				continue;
 			}
 
@@ -846,7 +851,7 @@ int bch2_move_data(struct bch_fs *c,
 		}
 	}
 
-	move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
+	move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads));
 	closure_sync(&ctxt.cl);
 
 	EBUG_ON(atomic_read(&ctxt.write_sectors));
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index c325a094..affe9233 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -365,6 +365,11 @@ enum opt_type {
 	  NO_SB_OPT,		false,				\
 	  NULL,		"Set superblock to latest version,\n"	\
 			"allowing any new features to be used")	\
+	x(buckets_nouse,		u8,			\
+	  0,							\
+	  OPT_BOOL(),						\
+	  NO_SB_OPT,		false,				\
+	  NULL,		"Allocate the buckets_nouse bitmap")	\
 	x(project,			u8,			\
 	  OPT_INODE,						\
 	  OPT_BOOL(),						\
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 7e4400cc..543db58f 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -1126,12 +1126,12 @@ use_clean:
 	    test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
 		bool metadata_only = c->opts.norecovery;
 
-		bch_info(c, "starting mark and sweep");
+		bch_info(c, "checking allocations");
 		err = "error in mark and sweep";
 		ret = bch2_gc(c, true, metadata_only);
 		if (ret)
 			goto err;
-		bch_verbose(c, "mark and sweep done");
+		bch_verbose(c, "done checking allocations");
 	}
 
 	bch2_stripes_heap_start(c);
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index d8b72d8d..b36e6216 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -674,6 +674,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
 
 	init_rwsem(&c->gc_lock);
+	mutex_init(&c->gc_gens_lock);
 
 	for (i = 0; i < BCH_TIME_STAT_NR; i++)
 		bch2_time_stats_init(&c->times[i]);
@@ -1879,20 +1880,14 @@ err:
 }
 
 /* return with ref on ca->ref: */
-struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path)
+struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
 {
 	struct bch_dev *ca;
-	dev_t dev;
 	unsigned i;
-	int ret;
-
-	ret = lookup_bdev(path, &dev);
-	if (ret)
-		return ERR_PTR(ret);
 
 	rcu_read_lock();
 	for_each_member_device_rcu(ca, c, i, NULL)
-		if (ca->dev == dev)
+		if (!strcmp(name, ca->name))
 			goto found;
 	ca = ERR_PTR(-ENOENT);
found:
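move_ctxt_wait_event() now takes the btree_trans so that do_pending_writes() can drop btree locks before the thread blocks on IO — sleeping while holding node locks would stall every other transaction that needs them. Callers that run after the transaction has been torn down pass NULL. Usage, as in the throttling calls added to __bch2_move_data() above:

	/* drops trans's btree locks, flushes pending writes, then waits: */
	move_ctxt_wait_event(ctxt, &trans,
			     atomic_read(&ctxt->write_sectors) <
			     SECTORS_IN_FLIGHT_PER_DEVICE);

	/* after bch2_trans_exit(), there are no locks left to drop: */
	move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads));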
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index 3196bc30..e55407dc 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -238,6 +238,7 @@ do {								\
 struct printbuf {
 	char		*pos;
 	char		*end;
+	unsigned	indent;
 };
 
 static inline size_t printbuf_remaining(struct printbuf *buf)
@@ -259,6 +260,27 @@ do {								\
 		 __VA_ARGS__);					\
} while (0)
 
+static inline void printbuf_indent_push(struct printbuf *buf, unsigned spaces)
+{
+	buf->indent += spaces;
+	while (spaces--)
+		pr_buf(buf, " ");
+}
+
+static inline void printbuf_indent_pop(struct printbuf *buf, unsigned spaces)
+{
+	buf->indent -= spaces;
+}
+
+static inline void printbuf_newline(struct printbuf *buf)
+{
+	unsigned i;
+
+	pr_buf(buf, "\n");
+	for (i = 0; i < buf->indent; i++)
+		pr_buf(buf, " ");
+}
+
 void bch_scnmemcpy(struct printbuf *, const char *, size_t);
 
 int bch2_strtoint_h(const char *, int *);
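These indent helpers give multi-line printbuf output (such as the journal-entry printing earlier in this patch) a persistent indentation level: printbuf_indent_push() pads immediately and raises the level applied after each printbuf_newline(). A usage sketch, assuming the PBUF() stack-buffer initializer used throughout the tree:

	char _buf[256];
	struct printbuf out = PBUF(_buf);

	pr_buf(&out, "btree node:");
	printbuf_indent_push(&out, 2);	/* pads now; newlines indent by 2 after */
	printbuf_newline(&out);
	pr_buf(&out, "key 1");
	printbuf_newline(&out);
	pr_buf(&out, "key 2");
	printbuf_indent_pop(&out, 2);
	/* _buf now holds: "btree node:  \n  key 1\n  key 2" */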