diff --git a/.bcachefs_revision b/.bcachefs_revision index 295ed546..d5115b93 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -26202210393adf3fce3d98a3a2598c21d07b5634 +24c6361e202cc09de0159505eb3ab3ca265520d8 diff --git a/include/linux/kernel.h b/include/linux/kernel.h index a0d3e467..d31b5f56 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -229,6 +229,7 @@ static inline int __must_check kstrtos32(const char *s, unsigned int base, s32 * } struct printbuf; +extern __printf(2, 0) void prt_vprintf(struct printbuf *out, const char *fmt, va_list args); extern __printf(2, 3) void prt_printf(struct printbuf *out, const char *fmt, ...); static const char hex_asc[] = "0123456789abcdef"; diff --git a/include/linux/six.h b/include/linux/six.h index f336ae04..362a577b 100644 --- a/include/linux/six.h +++ b/include/linux/six.h @@ -107,6 +107,7 @@ struct six_lock { struct task_struct *owner; unsigned __percpu *readers; unsigned intent_lock_recurse; + unsigned long ip; raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -119,6 +120,7 @@ struct six_lock_waiter { struct task_struct *task; enum six_lock_type lock_want; bool lock_acquired; + u64 start_time; }; typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index ff5e6f7c..d3d9e965 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -401,6 +401,7 @@ TRACE_EVENT(btree_path_relock_fail, __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(u8, btree_id ) + __field(u8, level ) TRACE_BPOS_entries(pos) __array(char, node, 24 ) __field(u32, iter_lock_seq ) @@ -413,6 +414,7 @@ TRACE_EVENT(btree_path_relock_fail, strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; + __entry->level = path->level; TRACE_BPOS_assign(pos, path->pos); if (IS_ERR(b)) strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node)); @@ -422,13 +424,14 @@ TRACE_EVENT(btree_path_relock_fail, __entry->node_lock_seq = is_btree_node(path, level) ? 
path->l[level].b->c.lock.state.seq : 0; ), - TP_printk("%s %pS btree %s pos %llu:%llu:%u, node %s iter seq %u lock seq %u", + TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, bch2_btree_ids[__entry->btree_id], __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot, + __entry->level, __entry->node, __entry->iter_lock_seq, __entry->node_lock_seq) @@ -445,12 +448,15 @@ TRACE_EVENT(btree_path_upgrade_fail, __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(u8, btree_id ) + __field(u8, level ) TRACE_BPOS_entries(pos) __field(u8, locked ) __field(u8, self_read_count ) __field(u8, self_intent_count) __field(u8, read_count ) __field(u8, intent_count ) + __field(u32, iter_lock_seq ) + __field(u32, node_lock_seq ) ), TP_fast_assign( @@ -459,6 +465,7 @@ TRACE_EVENT(btree_path_upgrade_fail, strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; + __entry->level = level; TRACE_BPOS_assign(pos, path->pos); __entry->locked = btree_node_locked(path, level); @@ -468,20 +475,25 @@ TRACE_EVENT(btree_path_upgrade_fail, c = six_lock_counts(&path->l[level].b->c.lock); __entry->read_count = c.n[SIX_LOCK_read]; __entry->intent_count = c.n[SIX_LOCK_read]; + __entry->iter_lock_seq = path->l[level].lock_seq; + __entry->node_lock_seq = is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0; ), - TP_printk("%s %pS btree %s pos %llu:%llu:%u, locked %u held %u:%u lock count %u:%u", + TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, bch2_btree_ids[__entry->btree_id], __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot, + __entry->level, __entry->locked, __entry->self_read_count, __entry->self_intent_count, __entry->read_count, - __entry->intent_count) + __entry->intent_count, + __entry->iter_lock_seq, + __entry->node_lock_seq) ); /* Garbage collection */ @@ -499,22 +511,29 @@ DEFINE_EVENT(bch_fs, gc_gens_end, /* Allocator */ TRACE_EVENT(bucket_alloc, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), - TP_ARGS(ca, alloc_reserve), + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + bool user, u64 bucket), + TP_ARGS(ca, alloc_reserve, user, bucket), TP_STRUCT__entry( __field(dev_t, dev ) __array(char, reserve, 16 ) + __field(bool, user ) + __field(u64, bucket ) ), TP_fast_assign( __entry->dev = ca->dev; strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + __entry->user = user; + __entry->bucket = bucket; ), - TP_printk("%d,%d reserve %s", + TP_printk("%d,%d reserve %s user %u bucket %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->reserve) + __entry->reserve, + __entry->user, + __entry->bucket) ); TRACE_EVENT(bucket_alloc_fail, @@ -544,7 +563,7 @@ TRACE_EVENT(bucket_alloc_fail, __field(u64, need_journal_commit ) __field(u64, nouse ) __field(bool, nonblocking ) - __array(char, err, 16 ) + __array(char, err, 32 ) ), TP_fast_assign( @@ -881,18 +900,41 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, TP_ARGS(trans, caller_ip, path) ); -DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade, +TRACE_EVENT(trans_restart_upgrade, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); + struct btree_path *path, + unsigned old_locks_want, + unsigned 
new_locks_want), + TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want), -DEFINE_EVENT(transaction_restart_iter, trans_restart_iter_upgrade, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u8, old_locks_want ) + __field(u8, new_locks_want ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + __entry->old_locks_want = old_locks_want; + __entry->new_locks_want = new_locks_want; + TRACE_BPOS_assign(pos, path->pos) + ), + + TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u", + __entry->trans_fn, + (void *) __entry->caller_ip, + bch2_btree_ids[__entry->btree_id], + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->old_locks_want, + __entry->new_locks_want) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, @@ -964,57 +1006,16 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, TP_ARGS(trans, caller_ip, path) ); -TRACE_EVENT(trans_restart_would_deadlock, +DEFINE_EVENT(transaction_event, trans_restart_would_deadlock, TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - unsigned reason, - struct btree_path *have, - struct btree_path *want, - struct bpos *want_pos), - TP_ARGS(trans, caller_ip, reason, - have, want, want_pos), + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(u8, in_traverse_all ) - __field(u8, reason ) - __field(u8, have_btree_id ) - __field(u8, have_type ) - __field(u8, want_btree_id ) - __field(u8, want_type ) - TRACE_BPOS_entries(have_pos) - TRACE_BPOS_entries(want_pos) - ), - - TP_fast_assign( - strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->in_traverse_all = trans->in_traverse_all; - __entry->reason = reason; - __entry->have_btree_id = have->btree_id; - __entry->have_type = have->cached; - __entry->want_btree_id = want->btree_id; - __entry->want_type = want->cached; - TRACE_BPOS_assign(have_pos, have->pos); - TRACE_BPOS_assign(want_pos, *want_pos); - ), - - TP_printk("%s %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u", - __entry->trans_fn, - (void *) __entry->caller_ip, - __entry->in_traverse_all, - __entry->reason, - __entry->have_btree_id, - __entry->have_type, - __entry->have_pos_inode, - __entry->have_pos_offset, - __entry->have_pos_snapshot, - __entry->want_btree_id, - __entry->want_type, - __entry->want_pos_inode, - __entry->want_pos_offset, - __entry->want_pos_snapshot) +DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) ); TRACE_EVENT(trans_restart_would_deadlock_write, diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index dce227c5..ce365fec 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -268,7 +268,6 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * spin_unlock(&c->freelist_lock); - trace_and_count(c, bucket_alloc, ca, bch2_alloc_reserves[reserve]); return ob; } @@ -575,7 +574,10 @@ err: if (!ob) ob = 
ERR_PTR(-BCH_ERR_no_buckets_found); - if (IS_ERR(ob)) + if (!IS_ERR(ob)) + trace_and_count(c, bucket_alloc, ca, bch2_alloc_reserves[reserve], + may_alloc_partial, ob->bucket); + else trace_and_count(c, bucket_alloc_fail, ca, bch2_alloc_reserves[reserve], usage.d[BCH_DATA_free].buckets, @@ -1223,7 +1225,9 @@ err: if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) || bch2_err_matches(ret, BCH_ERR_freelist_empty)) - return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC); + return cl + ? ERR_PTR(-EAGAIN) + : ERR_PTR(-BCH_ERR_ENOSPC_bucket_alloc); if (bch2_err_matches(ret, BCH_ERR_insufficient_devices)) return ERR_PTR(-EROFS); diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index 029b1ec1..955f3ee9 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -529,14 +529,22 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, bch2_trans_iter_exit(trans, iter); if (bp.level) { + struct btree *b; + /* * If a backpointer for a btree node wasn't found, it may be * because it was overwritten by a new btree node that hasn't * been written out yet - backpointer_get_node() checks for * this: */ - bch2_backpointer_get_node(trans, iter, bucket, bp_offset, bp); + b = bch2_backpointer_get_node(trans, iter, bucket, bp_offset, bp); + if (!IS_ERR_OR_NULL(b)) + return bkey_i_to_s_c(&b->key); + bch2_trans_iter_exit(trans, iter); + + if (IS_ERR(b)) + return bkey_s_c_err(PTR_ERR(b)); return bkey_s_c_null; } diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index fc451e46..ccac2a3f 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -226,9 +226,11 @@ do { \ dynamic_fault("bcachefs:meta:write:" name) #ifdef __KERNEL__ -#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) +#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name) +#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") #define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) #else +#define bch2_log_msg(_c, fmt) fmt #define bch2_fmt(_c, fmt) fmt "\n" #define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) #endif @@ -812,7 +814,6 @@ struct bch_fs { struct mutex gc_gens_lock; /* IO PATH */ - struct semaphore io_in_flight; struct bio_set bio_read; struct bio_set bio_read_split; struct bio_set bio_write; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 7730e955..7b5fd726 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1411,7 +1411,8 @@ struct bch_sb_field_disk_groups { x(trans_restart_key_cache_upgrade, 70) \ x(trans_traverse_all, 71) \ x(transaction_commit, 72) \ - x(write_super, 73) + x(write_super, 73) \ + x(trans_restart_would_deadlock_recursion_limit, 74) \ enum bch_persistent_counters { #define x(t, n, ...) 
BCH_COUNTER_##t, diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index dabdb25c..4d1fc39c 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -110,14 +110,17 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) return 0; } -static struct btree *__btree_node_mem_alloc(struct bch_fs *c) +static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) { - struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL); + struct btree *b = kzalloc(sizeof(struct btree), gfp); if (!b) return NULL; bkey_btree_ptr_init(&b->key); __six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + lockdep_set_no_check_recursion(&b->c.lock.dep_map); +#endif INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); b->byte_order = ilog2(btree_bytes(c)); @@ -127,7 +130,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c) struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; - struct btree *b = __btree_node_mem_alloc(c); + struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL); if (!b) return NULL; @@ -150,8 +153,6 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) /* Cause future lookups for this node to fail: */ b->hash_val = 0; - - six_lock_wakeup_all(&b->c.lock); } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) @@ -281,20 +282,17 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, struct btree *b, *t; unsigned long nr = sc->nr_to_scan; unsigned long can_free = 0; - unsigned long touched = 0; unsigned long freed = 0; + unsigned long touched = 0; unsigned i, flags; unsigned long ret = SHRINK_STOP; + bool trigger_writes = atomic_read(&bc->dirty) + nr >= + bc->used * 3 / 4; if (bch2_btree_shrinker_disabled) return SHRINK_STOP; - /* Return -1 if we can't do anything right now */ - if (sc->gfp_mask & __GFP_FS) - mutex_lock(&bc->lock); - else if (!mutex_trylock(&bc->lock)) - goto out_norestore; - + mutex_lock(&bc->lock); flags = memalloc_nofs_save(); /* @@ -319,7 +317,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, touched++; if (touched >= nr) - break; + goto out; if (!btree_node_reclaim(c, b)) { btree_node_data_free(c, b); @@ -330,52 +328,43 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, } restart: list_for_each_entry_safe(b, t, &bc->live, list) { - /* tweak this */ + touched++; + if (btree_node_accessed(b)) { clear_btree_node_accessed(b); - goto touched; - } - - if (!btree_node_reclaim(c, b)) { - /* can't call bch2_btree_node_hash_remove under lock */ + } else if (!btree_node_reclaim(c, b)) { freed++; - if (&t->list != &bc->live) - list_move_tail(&bc->live, &t->list); - btree_node_data_free(c, b); - mutex_unlock(&bc->lock); bch2_btree_node_hash_remove(bc, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); - if (freed >= nr) - goto out; - - if (sc->gfp_mask & __GFP_FS) - mutex_lock(&bc->lock); - else if (!mutex_trylock(&bc->lock)) + if (freed == nr) goto out; + } else if (trigger_writes && + btree_node_dirty(b) && + !btree_node_will_make_reachable(b) && + !btree_node_write_blocked(b) && + six_trylock_read(&b->c.lock)) { + list_move(&bc->live, &b->list); + mutex_unlock(&bc->lock); + __bch2_btree_node_write(c, b, 0); + six_unlock_read(&b->c.lock); + if (touched >= nr) + goto out_nounlock; + mutex_lock(&bc->lock); goto restart; - } else { - continue; } -touched: - touched++; - if (touched >= nr) { - 
/* Save position */ - if (&t->list != &bc->live) - list_move_tail(&bc->live, &t->list); + if (touched >= nr) break; - } } - - mutex_unlock(&bc->lock); out: + mutex_unlock(&bc->lock); +out_nounlock: ret = freed; memalloc_nofs_restore(flags); -out_norestore: trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret); return ret; } @@ -596,9 +585,14 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks) goto got_node; } - b = __btree_node_mem_alloc(c); - if (!b) - goto err_locked; + b = __btree_node_mem_alloc(c, __GFP_NOWARN); + if (!b) { + mutex_unlock(&bc->lock); + b = __btree_node_mem_alloc(c, GFP_KERNEL); + if (!b) + goto err; + mutex_lock(&bc->lock); + } if (pcpu_read_locks) six_lock_pcpu_alloc(&b->c.lock); @@ -651,7 +645,7 @@ out: return b; err: mutex_lock(&bc->lock); -err_locked: + /* Try to cannibalize another cached btree node: */ if (bc->alloc_lock == current) { b2 = btree_node_cannibalize(c); @@ -763,16 +757,6 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, return b; } -static int lock_node_check_fn(struct six_lock *lock, void *p) -{ - struct btree *b = container_of(lock, struct btree, c.lock); - const struct bkey_i *k = p; - - if (b->hash_val != btree_ptr_hash_val(k)) - return BCH_ERR_lock_fail_node_reused; - return 0; -} - static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) { struct printbuf buf = PRINTBUF; @@ -894,15 +878,11 @@ lock_node: if (btree_node_read_locked(path, level + 1)) btree_node_unlock(trans, path, level + 1); - ret = btree_node_lock(trans, path, &b->c, k->k.p, level, lock_type, - lock_node_check_fn, (void *) k, trace_ip); - if (unlikely(ret)) { - if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused)) - goto retry; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ERR_PTR(ret); - BUG(); - } + ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ERR_PTR(ret); + + BUG_ON(ret); if (unlikely(b->hash_val != btree_ptr_hash_val(k) || b->c.level != level || @@ -1008,13 +988,10 @@ retry: } else { lock_node: ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read); - if (unlikely(ret)) { - if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused)) - goto retry; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ERR_PTR(ret); - BUG(); - } + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ERR_PTR(ret); + + BUG_ON(ret); if (unlikely(b->hash_val != btree_ptr_hash_val(k) || b->c.btree_id != btree_id || diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 663c66d0..5b7f7cd3 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -536,9 +536,9 @@ static int bch2_repair_topology(struct bch_fs *c) if (btree_node_fake(b)) continue; - six_unlock_read(&b->c.lock); btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); ret = bch2_btree_repair_topology_recurse(&trans, b); + six_unlock_read(&b->c.lock); if (ret == DROP_THIS_NODE) { bch_err(c, "empty btree root - repair unimplemented"); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 177fd49d..13ce2975 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -513,9 +513,11 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct btree *b, struct bset *i, unsigned offset, int write) { - prt_printf(out, "error validating btree node "); - if (write) - prt_printf(out, "before write "); + prt_printf(out, bch2_log_msg(c, "")); + if (!write) + 
prt_str(out, "error validating btree node "); + else + prt_str(out, "corrupt btree node before write "); if (ca) prt_printf(out, "on %s ", ca->name); prt_printf(out, "at btree "); @@ -524,6 +526,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, prt_printf(out, "\n node offset %u", b->written); if (i) prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); + prt_str(out, ": "); } enum btree_err_type { @@ -543,7 +546,7 @@ enum btree_validate_ret { struct printbuf out = PRINTBUF; \ \ btree_err_msg(&out, c, ca, b, i, b->written, write); \ - prt_printf(&out, ": " msg, ##__VA_ARGS__); \ + prt_printf(&out, msg, ##__VA_ARGS__); \ \ if (type == BTREE_ERR_FIXABLE && \ write == READ && \ @@ -552,10 +555,10 @@ enum btree_validate_ret { goto out; \ } \ \ + bch2_print_string_as_lines(KERN_ERR, out.buf); \ + \ switch (write) { \ case READ: \ - bch_err(c, "%s", out.buf); \ - \ switch (type) { \ case BTREE_ERR_FIXABLE: \ ret = -BCH_ERR_fsck_errors_not_fixed; \ @@ -575,8 +578,6 @@ enum btree_validate_ret { } \ break; \ case WRITE: \ - bch_err(c, "corrupt metadata before write: %s", out.buf);\ - \ if (bch2_fs_inconsistent(c)) { \ ret = -BCH_ERR_fsck_errors_not_fixed; \ goto fsck_err; \ diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 512c3b2b..a7ff5df4 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -179,7 +179,7 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, if (!btree_path_node(path, level)) return; - if (!bch2_btree_node_relock(trans, path, level)) + if (!bch2_btree_node_relock_notrace(trans, path, level)) return; BUG_ON(!btree_path_pos_in_node(path, l->b)); @@ -627,61 +627,6 @@ static inline bool btree_path_advance_to_pos(struct btree_path *path, return true; } -/* - * Verify that iterator for parent node points to child node: - */ -static void btree_path_verify_new_node(struct btree_trans *trans, - struct btree_path *path, struct btree *b) -{ - struct bch_fs *c = trans->c; - struct btree_path_level *l; - unsigned plevel; - bool parent_locked; - struct bkey_packed *k; - - if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - return; - - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - return; - - plevel = b->c.level + 1; - if (!btree_path_node(path, plevel)) - return; - - parent_locked = btree_node_locked(path, plevel); - - if (!bch2_btree_node_relock(trans, path, plevel)) - return; - - l = &path->l[plevel]; - k = bch2_btree_node_iter_peek_all(&l->iter, l->b); - if (!k || - bkey_deleted(k) || - bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - struct printbuf buf3 = PRINTBUF; - struct printbuf buf4 = PRINTBUF; - struct bkey uk = bkey_unpack_key(b, k); - - bch2_dump_btree_node(c, l->b); - bch2_bpos_to_text(&buf1, path->pos); - bch2_bkey_to_text(&buf2, &uk); - bch2_bpos_to_text(&buf3, b->data->min_key); - bch2_bpos_to_text(&buf3, b->data->max_key); - panic("parent iter doesn't point to new node:\n" - "iter pos %s %s\n" - "iter key %s\n" - "new node %s-%s\n", - bch2_btree_ids[path->btree_id], - buf1.buf, buf2.buf, buf3.buf, buf4.buf); - } - - if (!parent_locked) - btree_node_unlock(trans, path, plevel); -} - static inline void __btree_path_level_init(struct btree_path *path, unsigned level) { @@ -697,14 +642,12 @@ static inline void __btree_path_level_init(struct btree_path *path, bch2_btree_node_iter_peek(&l->iter, l->b); } -static inline void btree_path_level_init(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) +inline void 
bch2_btree_path_level_init(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) { BUG_ON(path->cached); - btree_path_verify_new_node(trans, path, b); - EBUG_ON(!btree_path_pos_in_node(path, b)); EBUG_ON(b->c.lock.state.seq & 1); @@ -736,7 +679,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) mark_btree_node_locked(trans, path, b->c.level, t); } - btree_path_level_init(trans, path, b); + bch2_btree_path_level_init(trans, path, b); } } @@ -754,16 +697,6 @@ void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) /* Btree path: traverse, set_pos: */ -static int lock_root_check_fn(struct six_lock *lock, void *p) -{ - struct btree *b = container_of(lock, struct btree, c.lock); - struct btree **rootp = p; - - if (b != *rootp) - return BCH_ERR_lock_fail_root_changed; - return 0; -} - static inline int btree_path_lock_root(struct btree_trans *trans, struct btree_path *path, unsigned depth_want, @@ -795,10 +728,8 @@ static inline int btree_path_lock_root(struct btree_trans *trans, } lock_type = __btree_lock_want(path, path->level); - ret = btree_node_lock(trans, path, &b->c, SPOS_MAX, - path->level, lock_type, - lock_root_check_fn, rootp, - trace_ip); + ret = btree_node_lock(trans, path, &b->c, + path->level, lock_type, trace_ip); if (unlikely(ret)) { if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed)) continue; @@ -817,7 +748,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, path->l[i].b = NULL; mark_btree_node_locked(trans, path, path->level, lock_type); - btree_path_level_init(trans, path, b); + bch2_btree_path_level_init(trans, path, b); return 0; } @@ -990,7 +921,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans, mark_btree_node_locked(trans, path, level, lock_type); path->level = level; - btree_path_level_init(trans, path, b); + bch2_btree_path_level_init(trans, path, b); bch2_btree_path_verify_locks(path); err: @@ -1006,7 +937,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) struct bch_fs *c = trans->c; struct btree_path *path; unsigned long trace_ip = _RET_IP_; - int i, ret = 0; + int ret = 0; if (trans->in_traverse_all) return -BCH_ERR_transaction_restart_in_traverse_all; @@ -1021,17 +952,6 @@ retry_all: btree_trans_verify_sorted(trans); - for (i = trans->nr_sorted - 2; i >= 0; --i) { - struct btree_path *path1 = trans->paths + trans->sorted[i]; - struct btree_path *path2 = trans->paths + trans->sorted[i + 1]; - - if (path1->btree_id == path2->btree_id && - path1->locks_want < path2->locks_want) - __bch2_btree_path_upgrade(trans, path1, path2->locks_want); - else if (!path1->locks_want && path2->locks_want) - __bch2_btree_path_upgrade(trans, path1, 1); - } - bch2_trans_unlock(trans); cond_resched(); @@ -1120,7 +1040,7 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, int check_pos) { unsigned i, l = path->level; - +again: while (btree_path_node(path, l) && !btree_path_good_node(trans, path, l, check_pos)) __btree_path_set_level_up(trans, path, l++); @@ -1129,9 +1049,11 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, for (i = l + 1; i < path->locks_want && btree_path_node(path, i); i++) - if (!bch2_btree_node_relock(trans, path, i)) + if (!bch2_btree_node_relock(trans, path, i)) { while (l <= i) __btree_path_set_level_up(trans, path, l++); + goto again; + } return l; } @@ -1175,6 +1097,9 @@ static int btree_path_traverse_one(struct btree_trans *trans, path->level = 
btree_path_up_until_good_node(trans, path, 0); + EBUG_ON(btree_path_node(path, path->level) && + !btree_node_locked(path, path->level)); + /* * Note: path->nodes[path->level] may be temporarily NULL here - that * would indicate to other code that we got to the end of the btree, @@ -1431,7 +1356,7 @@ void bch2_dump_trans_updates(struct btree_trans *trans) struct printbuf buf = PRINTBUF; bch2_trans_updates_to_text(&buf, trans); - bch_err(trans->c, "%s", buf.buf); + bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); } @@ -1467,11 +1392,10 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) struct printbuf buf = PRINTBUF; bch2_trans_paths_to_text(&buf, trans); + bch2_trans_updates_to_text(&buf, trans); - printk(KERN_ERR "%s", buf.buf); + bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); - - bch2_dump_trans_updates(trans); } noinline @@ -1485,7 +1409,8 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans) if (!buf.allocation_failure) { mutex_lock(&s->lock); if (s->nr_max_paths < hweight64(trans->paths_allocated)) { - s->nr_max_paths = hweight64(trans->paths_allocated); + s->nr_max_paths = trans->nr_max_paths = + hweight64(trans->paths_allocated); swap(s->max_paths_text, buf.buf); } mutex_unlock(&s->lock); @@ -1494,23 +1419,26 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans) printbuf_exit(&buf); } -static struct btree_path *btree_path_alloc(struct btree_trans *trans, - struct btree_path *pos) +static noinline void btree_path_overflow(struct btree_trans *trans) +{ + bch2_dump_trans_paths_updates(trans); + panic("trans path oveflow\n"); +} + +static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, + struct btree_path *pos) { - struct btree_transaction_stats *s = btree_trans_stats(trans); struct btree_path *path; unsigned idx; if (unlikely(trans->paths_allocated == - ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) { - bch2_dump_trans_paths_updates(trans); - panic("trans path oveflow\n"); - } + ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) + btree_path_overflow(trans); idx = __ffs64(~trans->paths_allocated); trans->paths_allocated |= 1ULL << idx; - if (s && unlikely(hweight64(trans->paths_allocated) > s->nr_max_paths)) + if (unlikely(idx > trans->nr_max_paths)) bch2_trans_update_max_paths(trans); path = &trans->paths[idx]; @@ -2649,15 +2577,18 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) iter->key_cache_path = NULL; } -static void __bch2_trans_iter_init(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - unsigned locks_want, - unsigned depth, - unsigned flags, - unsigned long ip) +static inline void __bch2_trans_iter_init(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned locks_want, + unsigned depth, + unsigned flags, + unsigned long ip) { - EBUG_ON(trans->restarted); + if (trans->restarted) + panic("bch2_trans_iter_init(): in transaction restart, %s by %pS\n", + bch2_err_str(trans->restarted), + (void *) trans->last_restarted_ip); if (flags & BTREE_ITER_ALL_LEVELS) flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; @@ -2742,37 +2673,34 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) dst->key_cache_path = NULL; } -void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) { unsigned new_top = trans->mem_top + size; + size_t old_bytes = trans->mem_bytes; + size_t 
new_bytes = roundup_pow_of_two(new_top); + void *new_mem; void *p; trans->mem_max = max(trans->mem_max, new_top); - if (new_top > trans->mem_bytes) { - size_t old_bytes = trans->mem_bytes; - size_t new_bytes = roundup_pow_of_two(new_top); - void *new_mem; + WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); - WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); + if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { + new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); + new_bytes = BTREE_TRANS_MEM_MAX; + kfree(trans->mem); + } - new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); - if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { - new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); - new_bytes = BTREE_TRANS_MEM_MAX; - kfree(trans->mem); - } + if (!new_mem) + return ERR_PTR(-ENOMEM); - if (!new_mem) - return ERR_PTR(-ENOMEM); + trans->mem = new_mem; + trans->mem_bytes = new_bytes; - trans->mem = new_mem; - trans->mem_bytes = new_bytes; - - if (old_bytes) { - trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); - } + if (old_bytes) { + trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); } p = trans->mem + trans->mem_top; @@ -2898,8 +2826,9 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char * trans->c = c; trans->fn = fn; trans->last_begin_time = ktime_get_ns(); - trans->task = current; trans->fn_idx = bch2_trans_get_fn_idx(trans, c, fn); + trans->locking_wait.task = current; + closure_init_stack(&trans->ref); bch2_trans_alloc_paths(trans, c); @@ -2909,6 +2838,7 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char * trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes); trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL); + trans->nr_max_paths = s->nr_max_paths; if (!unlikely(trans->mem)) { trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); @@ -2920,7 +2850,7 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char * mutex_lock(&c->btree_trans_lock); list_for_each_entry(pos, &c->btree_trans_list, list) { - if (trans->task->pid < pos->task->pid) { + if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) { list_add_tail(&trans->list, &pos->list); goto list_add_done; } @@ -2961,6 +2891,8 @@ void bch2_trans_exit(struct btree_trans *trans) bch2_trans_unlock(trans); + closure_sync(&trans->ref); + if (s) s->max_mem = max(s->max_mem, trans->mem_max); @@ -3009,8 +2941,8 @@ void bch2_trans_exit(struct btree_trans *trans) } static void __maybe_unused -bch2_btree_path_node_to_text(struct printbuf *out, - struct btree_bkey_cached_common *b) +bch2_btree_bkey_cached_common_to_text(struct printbuf *out, + struct btree_bkey_cached_common *b) { struct six_lock_count c = six_lock_counts(&b->lock); struct task_struct *owner; @@ -3021,11 +2953,13 @@ bch2_btree_path_node_to_text(struct printbuf *out, pid = owner ? owner->pid : 0;; rcu_read_unlock(); - prt_printf(out, " l=%u %s:", - b->level, bch2_btree_ids[b->btree_id]); + prt_tab(out); + prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 
'c' : 'b', + b->level, bch2_btree_ids[b->btree_id]); bch2_bpos_to_text(out, btree_node_pos(b)); - prt_printf(out, " locks %u:%u:%u held by pid %u", + prt_tab(out); + prt_printf(out, " locks %u:%u:%u held by pid %u", c.n[0], c.n[1], c.n[2], pid); } @@ -3036,7 +2970,12 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) static char lock_types[] = { 'r', 'i', 'w' }; unsigned l; - prt_printf(out, "%i %s\n", trans->task->pid, trans->fn); + if (!out->nr_tabstops) { + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 32); + } + + prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn); trans_for_each_path(trans, path) { if (!path->nodes_locked) @@ -3048,33 +2987,26 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) path->level, bch2_btree_ids[path->btree_id]); bch2_bpos_to_text(out, path->pos); - prt_printf(out, "\n"); + prt_newline(out); for (l = 0; l < BTREE_MAX_DEPTH; l++) { if (btree_node_locked(path, l) && !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) { prt_printf(out, " %c l=%u ", lock_types[btree_node_locked_type(path, l)], l); - bch2_btree_path_node_to_text(out, b); - prt_printf(out, "\n"); + bch2_btree_bkey_cached_common_to_text(out, b); + prt_newline(out); } } } b = READ_ONCE(trans->locking); if (b) { - path = &trans->paths[trans->locking_path_idx]; - prt_printf(out, " locking path %u %c l=%u %c %s:", - trans->locking_path_idx, - path->cached ? 'c' : 'b', - trans->locking_level, - lock_types[trans->locking_lock_type], - bch2_btree_ids[trans->locking_btree_id]); - bch2_bpos_to_text(out, trans->locking_pos); - - prt_printf(out, " node "); - bch2_btree_path_node_to_text(out, b); - prt_printf(out, "\n"); + prt_str(out, " want"); + prt_newline(out); + prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]); + bch2_btree_bkey_cached_common_to_text(out, b); + prt_newline(out); } } diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 7b47b880..4ec873aa 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -74,11 +74,14 @@ __trans_next_path(struct btree_trans *trans, unsigned idx) void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int); -#define trans_for_each_path(_trans, _path) \ - for (_path = __trans_next_path((_trans), 0); \ +#define trans_for_each_path_from(_trans, _path, _start) \ + for (_path = __trans_next_path((_trans), _start); \ (_path); \ _path = __trans_next_path((_trans), (_path)->idx + 1)) +#define trans_for_each_path(_trans, _path) \ + trans_for_each_path_from(_trans, _path, 0) + static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) { unsigned idx = path ? 
path->sorted_idx + 1 : 0; @@ -143,6 +146,9 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bke struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, struct btree_iter *, struct bpos); +inline void bch2_btree_path_level_init(struct btree_trans *, + struct btree_path *, struct btree *); + #ifdef CONFIG_BCACHEFS_DEBUG void bch2_trans_verify_paths(struct btree_trans *); void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, @@ -286,7 +292,23 @@ static inline void set_btree_iter_dontneed(struct btree_iter *iter) iter->path->preserve = false; } -void *bch2_trans_kmalloc(struct btree_trans *, size_t); +void *__bch2_trans_kmalloc(struct btree_trans *, size_t); + +static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +{ + unsigned new_top = trans->mem_top + size; + void *p = trans->mem + trans->mem_top; + + if (likely(new_top <= trans->mem_bytes)) { + trans->mem_top += size; + memset(p, 0, size); + return p; + } else { + return __bch2_trans_kmalloc(trans, size); + + } +} + u32 bch2_trans_begin(struct btree_trans *); static inline struct btree * diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index d900ff42..918dde31 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -398,20 +398,9 @@ err: return ret; } -static int bkey_cached_check_fn(struct six_lock *lock, void *p) -{ - struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); - const struct btree_path *path = p; - - if (ck->key.btree_id != path->btree_id && - bpos_cmp(ck->key.pos, path->pos)) - return BCH_ERR_lock_fail_node_reused; - return 0; -} - -__flatten -int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, - unsigned flags) +noinline static int +bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path, + unsigned flags) { struct bch_fs *c = trans->c; struct bkey_cached *ck; @@ -440,16 +429,12 @@ retry: } else { enum six_lock_type lock_want = __btree_lock_want(path, 0); - ret = btree_node_lock(trans, path, (void *) ck, path->pos, 0, - lock_want, - bkey_cached_check_fn, path, _THIS_IP_); - if (ret) { - if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused)) - goto retry; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto err; - BUG(); - } + ret = btree_node_lock(trans, path, (void *) ck, 0, + lock_want, _THIS_IP_); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto err; + + BUG_ON(ret); if (ck->key.btree_id != path->btree_id || bpos_cmp(ck->key.pos, path->pos)) { @@ -496,6 +481,60 @@ err: return ret; } +int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck; + int ret = 0; + + EBUG_ON(path->level); + + path->l[1].b = NULL; + + if (bch2_btree_node_relock(trans, path, 0)) { + ck = (void *) path->l[0].b; + goto fill; + } +retry: + ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); + if (!ck) { + return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); + } else { + enum six_lock_type lock_want = __btree_lock_want(path, 0); + + ret = btree_node_lock(trans, path, (void *) ck, 0, + lock_want, _THIS_IP_); + EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)); + + if (ret) + return ret; + + if (ck->key.btree_id != path->btree_id || + bpos_cmp(ck->key.pos, path->pos)) { + six_unlock_type(&ck->c.lock, lock_want); + goto retry; + } + + 
mark_btree_node_locked(trans, path, 0, lock_want); + } + + path->l[0].lock_seq = ck->c.lock.state.seq; + path->l[0].b = (void *) ck; +fill: + if (!ck->valid) + return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); + + if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + set_bit(BKEY_CACHED_ACCESSED, &ck->flags); + + path->uptodate = BTREE_ITER_UPTODATE; + EBUG_ON(!ck->valid); + EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); + + return ret; +} + static int btree_key_cache_flush_pos(struct btree_trans *trans, struct bkey_cached_key key, u64 journal_seq, diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c index 1cdf7d4f..339d44ce 100644 --- a/libbcachefs/btree_locking.c +++ b/libbcachefs/btree_locking.c @@ -52,10 +52,257 @@ void bch2_btree_node_unlock_write(struct btree_trans *trans, /* lock */ -void __bch2_btree_node_lock_write(struct btree_trans *trans, - struct btree_bkey_cached_common *b) +/* + * @trans wants to lock @b with type @type + */ +struct trans_waiting_for_lock { + struct btree_trans *trans; + struct btree_bkey_cached_common *node_want; + enum six_lock_type lock_want; + + /* for iterating over held locks :*/ + u8 path_idx; + u8 level; + u64 lock_start_time; +}; + +struct lock_graph { + struct trans_waiting_for_lock g[8]; + unsigned nr; +}; + +static void lock_graph_pop(struct lock_graph *g) +{ + closure_put(&g->g[--g->nr].trans->ref); +} + +static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + prt_printf(out, "Found lock cycle (%u entries):", g->nr); + prt_newline(out); + + for (i = g->g; i < g->g + g->nr; i++) + bch2_btree_trans_to_text(out, i->trans); +} + +static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) +{ + int ret; + + if (i == g->g) { + trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_); + ret = btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); + } else { + i->trans->lock_must_abort = true; + ret = 0; + } + + for (i = g->g + 1; i < g->g + g->nr; i++) + wake_up_process(i->trans->locking_wait.task); + return ret; +} + +static noinline int break_cycle(struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + for (i = g->g; i < g->g + g->nr; i++) { + if (i->trans->lock_may_not_fail || + i->trans->locking_wait.lock_want == SIX_LOCK_write) + continue; + + return abort_lock(g, i); + } + + for (i = g->g; i < g->g + g->nr; i++) { + if (i->trans->lock_may_not_fail || + !i->trans->in_traverse_all) + continue; + + return abort_lock(g, i); + } + + for (i = g->g; i < g->g + g->nr; i++) { + if (i->trans->lock_may_not_fail) + continue; + + return abort_lock(g, i); + } + + BUG(); +} + +static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, + struct printbuf *cycle) +{ + struct btree_trans *orig_trans = g->g->trans; + struct trans_waiting_for_lock *i; + int ret = 0; + + for (i = g->g; i < g->g + g->nr; i++) { + if (i->trans->locking != i->node_want) + while (g->g + g->nr >= i) { + lock_graph_pop(g); + return 0; + } + + if (i->trans == trans) { + if (cycle) { + /* Only checking: */ + print_cycle(cycle, g); + ret = -1; + } else { + ret = break_cycle(g); + } + + if (ret) + goto deadlock; + /* + * If we didn't abort (instead telling another + * transaction to abort), keep checking: + */ + } + } + + if (g->nr == ARRAY_SIZE(g->g)) { + if (orig_trans->lock_may_not_fail) + return 0; + + trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, 
_RET_IP_); + ret = btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit); + goto deadlock; + } + + closure_get(&trans->ref); + + g->g[g->nr++] = (struct trans_waiting_for_lock) { + .trans = trans, + .node_want = trans->locking, + .lock_want = trans->locking_wait.lock_want, + }; + + return 0; +deadlock: + while (g->nr) + lock_graph_pop(g); + return ret; +} + +static noinline void lock_graph_remove_non_waiters(struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + for (i = g->g + 1; i < g->g + g->nr; i++) + if (i->trans->locking != i->node_want || + i->trans->locking_wait.start_time != i[-1].lock_start_time) { + while (g->g + g->nr >= i) + lock_graph_pop(g); + return; + } + BUG(); +} + +static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2) +{ + return t1 + t2 > 1; +} + +int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) +{ + struct lock_graph g; + struct trans_waiting_for_lock *top; + struct btree_bkey_cached_common *b; + struct btree_path *path; + int ret; + + if (trans->lock_must_abort) { + trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); + } + + g.nr = 0; + ret = lock_graph_descend(&g, trans, cycle); + BUG_ON(ret); +next: + if (!g.nr) + return 0; + + top = &g.g[g.nr - 1]; + + trans_for_each_path_from(top->trans, path, top->path_idx) { + if (!path->nodes_locked) + continue; + + if (top->path_idx != path->idx) { + top->path_idx = path->idx; + top->level = 0; + top->lock_start_time = 0; + } + + for (; + top->level < BTREE_MAX_DEPTH; + top->level++, top->lock_start_time = 0) { + int lock_held = btree_node_locked_type(path, top->level); + + if (lock_held == BTREE_NODE_UNLOCKED) + continue; + + b = &READ_ONCE(path->l[top->level].b)->c; + + if (unlikely(IS_ERR_OR_NULL(b))) { + lock_graph_remove_non_waiters(&g); + goto next; + } + + if (list_empty_careful(&b->lock.wait_list)) + continue; + + raw_spin_lock(&b->lock.wait_lock); + list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) { + BUG_ON(b != trans->locking); + + if (top->lock_start_time && + time_after_eq64(top->lock_start_time, trans->locking_wait.start_time)) + continue; + + top->lock_start_time = trans->locking_wait.start_time; + + /* Don't check for self deadlock: */ + if (trans == top->trans || + !lock_type_conflicts(lock_held, trans->locking_wait.lock_want)) + continue; + + ret = lock_graph_descend(&g, trans, cycle); + raw_spin_unlock(&b->lock.wait_lock); + + if (ret) + return ret < 0 ? 
ret : 0; + goto next; + + } + raw_spin_unlock(&b->lock.wait_lock); + } + } + + lock_graph_pop(&g); + goto next; +} + +int bch2_six_check_for_deadlock(struct six_lock *lock, void *p) +{ + struct btree_trans *trans = p; + + return bch2_check_for_deadlock(trans, NULL); +} + +int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path, + struct btree_bkey_cached_common *b, + bool lock_may_not_fail) { int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read]; + int ret; /* * Must drop our read locks before calling six_lock_write() - @@ -64,98 +311,13 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans, * locked: */ six_lock_readers_add(&b->lock, -readers); - btree_node_lock_nopath_nofail(trans, b, SIX_LOCK_write); + ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, lock_may_not_fail); six_lock_readers_add(&b->lock, readers); -} -static inline bool path_has_read_locks(struct btree_path *path) -{ - unsigned l; + if (ret) + mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent); - for (l = 0; l < BTREE_MAX_DEPTH; l++) - if (btree_node_read_locked(path, l)) - return true; - return false; -} - -/* Slowpath: */ -int __bch2_btree_node_lock(struct btree_trans *trans, - struct btree_path *path, - struct btree_bkey_cached_common *b, - struct bpos pos, unsigned level, - enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) -{ - struct btree_path *linked; - unsigned reason; - - /* Check if it's safe to block: */ - trans_for_each_path(trans, linked) { - if (!linked->nodes_locked) - continue; - - /* - * Can't block taking an intent lock if we have _any_ nodes read - * locked: - * - * - Our read lock blocks another thread with an intent lock on - * the same node from getting a write lock, and thus from - * dropping its intent lock - * - * - And the other thread may have multiple nodes intent locked: - * both the node we want to intent lock, and the node we - * already have read locked - deadlock: - */ - if (type == SIX_LOCK_intent && - path_has_read_locks(linked)) { - reason = 1; - goto deadlock; - } - - if (linked->btree_id != path->btree_id) { - if (linked->btree_id < path->btree_id) - continue; - - reason = 3; - goto deadlock; - } - - /* - * Within the same btree, non-cached paths come before cached - * paths: - */ - if (linked->cached != path->cached) { - if (!linked->cached) - continue; - - reason = 4; - goto deadlock; - } - - /* - * Interior nodes must be locked before their descendants: if - * another path has possible descendants locked of the node - * we're about to lock, it must have the ancestors locked too: - */ - if (level > btree_path_highest_level_locked(linked)) { - reason = 5; - goto deadlock; - } - - /* Must lock btree nodes in key order: */ - if (btree_node_locked(linked, level) && - bpos_cmp(pos, btree_node_pos(&linked->l[level].b->c)) <= 0) { - reason = 7; - goto deadlock; - } - } - - return btree_node_lock_type(trans, path, b, pos, level, - type, should_sleep_fn, p); -deadlock: - trace_and_count(trans->c, trans_restart_would_deadlock, trans, ip, reason, linked, path, &pos); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); + return ret; } /* relock */ @@ -205,7 +367,8 @@ static inline bool btree_path_get_locks(struct btree_trans *trans, } bool __bch2_btree_node_relock(struct btree_trans *trans, - struct btree_path *path, unsigned level) + struct btree_path *path, unsigned level, + bool trace) { struct btree *b = btree_path_node(path, 
level); int want = __btree_lock_want(path, level); @@ -220,7 +383,8 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, return true; } fail: - trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level); + if (trace) + trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level); return false; } @@ -230,6 +394,7 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, struct btree_path *path, unsigned level) { struct btree *b = path->l[level].b; + struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level); if (!is_btree_node(path, level)) return false; @@ -253,11 +418,24 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, if (race_fault()) return false; - if (btree_node_locked(path, level) - ? six_lock_tryupgrade(&b->c.lock) - : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) - goto success; + if (btree_node_locked(path, level)) { + bool ret; + six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]); + ret = six_lock_tryupgrade(&b->c.lock); + six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]); + + if (ret) + goto success; + } else { + if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) + goto success; + } + + /* + * Do we already have an intent lock via another path? If so, just bump + * lock count: + */ if (btree_node_lock_seq_matches(path, b, level) && btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) { btree_node_unlock(trans, path, level); diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 3bc490bc..61d5038a 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -184,49 +184,44 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat void bch2_btree_node_unlock_write(struct btree_trans *, struct btree_path *, struct btree *); +int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); + /* lock: */ +static inline int __btree_node_lock_nopath(struct btree_trans *trans, + struct btree_bkey_cached_common *b, + enum six_lock_type type, + bool lock_may_not_fail) +{ + int ret; + trans->lock_may_not_fail = lock_may_not_fail; + trans->lock_must_abort = false; + trans->locking = b; + + ret = six_lock_type_waiter(&b->lock, type, &trans->locking_wait, + bch2_six_check_for_deadlock, trans); + WRITE_ONCE(trans->locking, NULL); + WRITE_ONCE(trans->locking_wait.start_time, 0); + return ret; +} + static inline int __must_check btree_node_lock_nopath(struct btree_trans *trans, struct btree_bkey_cached_common *b, enum six_lock_type type) { - six_lock_type(&b->lock, type, NULL, NULL); - return 0; + return __btree_node_lock_nopath(trans, b, type, false); } static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans, struct btree_bkey_cached_common *b, enum six_lock_type type) { - int ret = btree_node_lock_nopath(trans, b, type); + int ret = __btree_node_lock_nopath(trans, b, type, true); BUG_ON(ret); } -static inline int btree_node_lock_type(struct btree_trans *trans, - struct btree_path *path, - struct btree_bkey_cached_common *b, - struct bpos pos, unsigned level, - enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p) -{ - int ret; - - if (six_trylock_type(&b->lock, type)) - return 0; - - trans->locking_path_idx = path->idx; - trans->locking_pos = pos; - trans->locking_btree_id = path->btree_id; - trans->locking_level = level; - trans->locking_lock_type = type; - trans->locking = b; - ret = six_lock_type(&b->lock, type, 
should_sleep_fn, p); - trans->locking = NULL; - return ret; -} - /* * Lock a btree node if we already have it locked on one of our linked * iterators: @@ -248,19 +243,11 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, return false; } -int __bch2_btree_node_lock(struct btree_trans *, struct btree_path *, - struct btree_bkey_cached_common *, - struct bpos, unsigned, - enum six_lock_type, - six_lock_should_sleep_fn, void *, - unsigned long); - static inline int btree_node_lock(struct btree_trans *trans, struct btree_path *path, struct btree_bkey_cached_common *b, - struct bpos pos, unsigned level, + unsigned level, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip) { int ret = 0; @@ -270,8 +257,7 @@ static inline int btree_node_lock(struct btree_trans *trans, if (likely(six_trylock_type(&b->lock, type)) || btree_node_lock_increment(trans, b, level, type) || - !(ret = __bch2_btree_node_lock(trans, path, b, pos, level, type, - should_sleep_fn, p, ip))) { + !(ret = btree_node_lock_nopath(trans, b, type))) { #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS path->l[b->level].lock_taken_time = ktime_get_ns(); #endif @@ -280,11 +266,13 @@ static inline int btree_node_lock(struct btree_trans *trans, return ret; } -void __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *); +int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *, + struct btree_bkey_cached_common *b, bool); -static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, - struct btree_path *path, - struct btree_bkey_cached_common *b) +static inline int __btree_node_lock_write(struct btree_trans *trans, + struct btree_path *path, + struct btree_bkey_cached_common *b, + bool lock_may_not_fail) { EBUG_ON(&path->l[b->level].b->c != b); EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq); @@ -297,8 +285,17 @@ static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, */ mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write); - if (unlikely(!six_trylock_write(&b->lock))) - __bch2_btree_node_lock_write(trans, b); + return likely(six_trylock_write(&b->lock)) + ? 
0 + : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail); +} + +static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, + struct btree_path *path, + struct btree_bkey_cached_common *b) +{ + int ret = __btree_node_lock_write(trans, path, b, true); + BUG_ON(ret); } static inline int __must_check @@ -306,15 +303,14 @@ bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path, struct btree_bkey_cached_common *b) { - bch2_btree_node_lock_write_nofail(trans, path, b); - return 0; + return __btree_node_lock_write(trans, path, b, false); } /* relock: */ bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *, unsigned long); -bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned); +bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace); static inline bool bch2_btree_node_relock(struct btree_trans *trans, struct btree_path *path, unsigned level) @@ -325,7 +321,19 @@ static inline bool bch2_btree_node_relock(struct btree_trans *trans, return likely(btree_node_locked(path, level)) || (!IS_ERR_OR_NULL(path->l[level].b) && - __bch2_btree_node_relock(trans, path, level)); + __bch2_btree_node_relock(trans, path, level, true)); +} + +static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ + EBUG_ON(btree_node_locked(path, level) && + !btree_node_write_locked(path, level) && + btree_node_locked_type(path, level) != __btree_lock_want(path, level)); + + return likely(btree_node_locked(path, level)) || + (!IS_ERR_OR_NULL(path->l[level].b) && + __bch2_btree_node_relock(trans, path, level, false)); } static inline int bch2_btree_path_relock(struct btree_trans *trans, @@ -346,15 +354,22 @@ bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, bool __bch2_btree_path_upgrade(struct btree_trans *, struct btree_path *, unsigned); -static inline bool bch2_btree_path_upgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) +static inline int bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) { + unsigned old_locks_want = path->locks_want; + new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); - return path->locks_want < new_locks_want - ? __bch2_btree_path_upgrade(trans, path, new_locks_want) - : path->uptodate == BTREE_ITER_UPTODATE; + if (path->locks_want < new_locks_want + ? 
__bch2_btree_path_upgrade(trans, path, new_locks_want) + : path->uptodate == BTREE_ITER_UPTODATE) + return 0; + + trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path, + old_locks_want, new_locks_want); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); } /* misc: */ @@ -389,6 +404,7 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *, struct btree_bkey_cached_common *b, unsigned); +int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *); #ifdef CONFIG_BCACHEFS_DEBUG void bch2_btree_path_verify_locks(struct btree_path *); diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 7c016637..af226eed 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -389,15 +389,15 @@ struct btree_trans_commit_hook { struct btree_trans { struct bch_fs *c; const char *fn; + struct closure ref; struct list_head list; u64 last_begin_time; + + u8 lock_may_not_fail; + u8 lock_must_abort; struct btree_bkey_cached_common *locking; - unsigned locking_path_idx; - struct bpos locking_pos; - u8 locking_btree_id; - u8 locking_level; - u8 locking_lock_type; - struct task_struct *task; + struct six_lock_waiter locking_wait; + int srcu_idx; u8 fn_idx; @@ -417,6 +417,7 @@ struct btree_trans { * extent: */ unsigned extra_journal_res; + unsigned nr_max_paths; u64 paths_allocated; diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index d31c6eeb..cf9b9ec4 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -28,6 +28,21 @@ static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *, struct keylist *, unsigned); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); +static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans, + enum btree_id btree_id, + unsigned level, + struct bpos pos) +{ + struct btree_path *path; + + path = bch2_path_get(trans, btree_id, pos, level + 1, level, + BTREE_ITER_INTENT, _THIS_IP_); + path = bch2_btree_path_make_mut(trans, path, true, _THIS_IP_); + bch2_btree_path_downgrade(trans, path); + __bch2_btree_path_unlock(trans, path); + return path; +} + /* Debug code: */ /* @@ -608,6 +623,7 @@ static void btree_update_nodes_written(struct btree_update *as) ret = commit_do(&trans, &as->disk_res, &journal_seq, BTREE_INSERT_NOFAIL| BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_USE_RESERVE| BTREE_INSERT_JOURNAL_RECLAIM| JOURNAL_WATERMARK_reserved, btree_update_nodes_written_trans(&trans, as)); @@ -617,7 +633,10 @@ static void btree_update_nodes_written(struct btree_update *as) "error %i in btree_update_nodes_written()", ret); err: if (as->b) { + struct btree_path *path; + b = as->b; + path = get_unlocked_mut_path(&trans, as->btree_id, b->c.level, b->key.k.p); /* * @b is the node we did the final insert into: * @@ -631,7 +650,11 @@ err: */ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent); - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_write); + mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(&trans, path, b); + + bch2_btree_node_lock_write_nofail(&trans, path, &b->c); + mutex_lock(&c->btree_interior_update_lock); list_del(&as->write_blocked_list); @@ -665,10 +688,13 @@ err: } mutex_unlock(&c->btree_interior_update_lock); + + mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent); six_unlock_write(&b->c.lock); btree_node_write_if_need(c, b, SIX_LOCK_intent); - 
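
bch2_btree_path_upgrade() now returns an errcode and issues the trace_and_count()/btree_trans_restart() itself, so call sites shrink from an if/trace/restart block to a plain "ret = bch2_btree_path_upgrade(...); if (ret) return ret;" check. A minimal stand-alone sketch of that bool-to-errcode refactor; the struct, helper names and the errcode value below are made up for illustration, not the bcachefs ones:

#include <stdbool.h>
#include <stdio.h>

#define ERR_RESTART_UPGRADE (-2048)	/* made-up private errcode */

struct path { unsigned locks_want; bool uptodate; };

/* old shape: returns bool, every caller had to trace + restart on its own */
static bool path_upgrade_bool(struct path *p, unsigned want)
{
	if (p->locks_want >= want)
		return p->uptodate;
	p->locks_want = want;
	return true;
}

/* new shape: the failure path lives in one place, callers just propagate */
static int path_upgrade(struct path *p, unsigned want)
{
	if (path_upgrade_bool(p, want))
		return 0;

	/* trace_and_count(...) would go here */
	return ERR_RESTART_UPGRADE;
}

int main(void)
{
	struct path p = { .locks_want = 1, .uptodate = false };
	int ret = path_upgrade(&p, 1);	/* caller: if (ret) return ret; */

	printf("ret=%d\n", ret);
	return 0;
}
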
six_unlock_intent(&b->c.lock); + btree_node_unlock(&trans, path, b->c.level); + bch2_path_put(&trans, path, true); } bch2_journal_pin_drop(&c->journal, &as->journal); @@ -1002,11 +1028,9 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (update_level < BTREE_MAX_DEPTH) nr_nodes[1] += 1; - if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { - trace_and_count(c, trans_restart_iter_upgrade, trans, _RET_IP_, path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); + ret = bch2_btree_path_upgrade(trans, path, U8_MAX); + if (ret) return ERR_PTR(ret); - } if (flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&c->gc_lock); @@ -1084,16 +1108,16 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, goto err; ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL); - if (ret == -EAGAIN || - ret == -ENOMEM) { + if (bch2_err_matches(ret, ENOSPC) || + bch2_err_matches(ret, ENOMEM)) { struct closure cl; closure_init_stack(&cl); - bch2_trans_unlock(trans); - do { ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); + + bch2_trans_unlock(trans); closure_sync(&cl); } while (ret == -EAGAIN); } @@ -1429,6 +1453,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, struct bch_fs *c = as->c; struct btree *parent = btree_node_parent(path, b); struct btree *n1, *n2 = NULL, *n3 = NULL; + struct btree_path *path1 = NULL, *path2 = NULL; u64 start_time = local_clock(); BUG_ON(!parent && (b != btree_node_root(c, b))); @@ -1451,6 +1476,16 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, six_unlock_write(&n2->c.lock); six_unlock_write(&n1->c.lock); + path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); + six_lock_increment(&n1->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, path1, n1); + + path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p); + six_lock_increment(&n2->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path2, n2->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, path2, n2); + bch2_btree_update_add_new_node(as, n1); bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); @@ -1468,6 +1503,12 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, /* Depth increases, make a new root */ n3 = __btree_root_alloc(as, trans, b->c.level + 1); + path2->locks_want++; + BUG_ON(btree_node_locked(path2, n3->c.level)); + six_lock_increment(&n3->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path2, n3->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, path2, n3); + n3->sib_u64s[0] = U16_MAX; n3->sib_u64s[1] = U16_MAX; @@ -1481,6 +1522,11 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_build_aux_trees(n1); six_unlock_write(&n1->c.lock); + path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); + six_lock_increment(&n1->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, path1, n1); + bch2_btree_update_add_new_node(as, n1); bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); @@ -1527,6 +1573,15 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, six_unlock_intent(&n2->c.lock); six_unlock_intent(&n1->c.lock); + if (path2) { + __bch2_btree_path_unlock(trans, path2); + bch2_path_put(trans, path2, true); + 
} + if (path1) { + __bch2_btree_path_unlock(trans, path1); + bch2_path_put(trans, path1, true); + } + bch2_trans_verify_locks(trans); bch2_time_stats_update(&c->times[n2 @@ -1643,7 +1698,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, enum btree_node_sibling sib) { struct bch_fs *c = trans->c; - struct btree_path *sib_path = NULL; + struct btree_path *sib_path = NULL, *new_path = NULL; struct btree_update *as; struct bkey_format_state new_s; struct bkey_format new_f; @@ -1767,6 +1822,11 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_build_aux_trees(n); six_unlock_write(&n->c.lock); + new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p); + six_lock_increment(&n->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, new_path, n); + bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); bkey_init(&delete.k); @@ -1796,6 +1856,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); out: err: + if (new_path) + bch2_path_put(trans, new_path, true); bch2_path_put(trans, sib_path, true); bch2_trans_verify_locks(trans); return ret; @@ -1810,6 +1872,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; + struct btree_path *new_path = NULL; struct btree *n, *parent; struct btree_update *as; int ret; @@ -1831,6 +1894,11 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, bch2_btree_build_aux_trees(n); six_unlock_write(&n->c.lock); + new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p); + six_lock_increment(&n->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, new_path, n); + trace_and_count(c, btree_node_rewrite, c, b); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); @@ -1851,6 +1919,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, six_unlock_intent(&n->c.lock); bch2_btree_update_done(as, trans); + bch2_path_put(trans, new_path, true); out: bch2_btree_path_downgrade(trans, iter->path); return ret; @@ -2035,9 +2104,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite struct closure cl; int ret = 0; - if (!btree_node_intent_locked(path, b->c.level) && - !bch2_btree_path_upgrade(trans, path, b->c.level + 1)) - return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); + ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1); + if (ret) + return ret; closure_init_stack(&cl); diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index e9518fbc..08d7001f 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -734,79 +734,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, return ret; } -static inline void path_upgrade_readers(struct btree_trans *trans, struct btree_path *path) -{ - unsigned l; - - for (l = 0; l < BTREE_MAX_DEPTH; l++) - if (btree_node_read_locked(path, l)) - BUG_ON(!bch2_btree_node_upgrade(trans, path, l)); -} - -static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path) -{ - struct btree *b = path_l(path)->b; - unsigned l; - - do { - for (l = 0; l < BTREE_MAX_DEPTH; l++) - if (btree_node_read_locked(path, l)) - path_upgrade_readers(trans, path); - } while ((path = prev_btree_path(trans, path)) && - path_l(path)->b == b); -} - -/* - * Check for 
nodes that we have both read and intent locks on, and upgrade the - * readers to intent: - */ -static inline void normalize_read_intent_locks(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i, nr_read = 0, nr_intent = 0; - - trans_for_each_path_inorder(trans, path, i) { - struct btree_path *next = i + 1 < trans->nr_sorted - ? trans->paths + trans->sorted[i + 1] - : NULL; - - switch (btree_node_locked_type(path, path->level)) { - case BTREE_NODE_READ_LOCKED: - nr_read++; - break; - case BTREE_NODE_INTENT_LOCKED: - nr_intent++; - break; - } - - if (!next || path_l(path)->b != path_l(next)->b) { - if (nr_read && nr_intent) - upgrade_readers(trans, path); - - nr_read = nr_intent = 0; - } - } - - bch2_trans_verify_locks(trans); -} - -static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct btree_path *pos) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path_inorder(trans, path, i) { - //if (path == pos) - // break; - - if (btree_node_read_locked(path, path->level) && - !bch2_btree_path_upgrade(trans, path, path->level + 1)) - return true; - } - - return false; -} - static inline int trans_lock_write(struct btree_trans *trans) { struct btree_insert_entry *i; @@ -816,31 +743,15 @@ static inline int trans_lock_write(struct btree_trans *trans) if (same_leaf_as_prev(trans, i)) continue; - /* - * six locks are unfair, and read locks block while a thread - * wants a write lock: thus, we need to tell the cycle detector - * we have a write lock _before_ taking the lock: - */ - mark_btree_node_locked_noreset(i->path, i->level, SIX_LOCK_write); - - if (!six_trylock_write(&insert_l(i)->b->c.lock)) { - if (have_conflicting_read_lock(trans, i->path)) - goto fail; - - ret = btree_node_lock_type(trans, i->path, - &insert_l(i)->b->c, - i->path->pos, i->level, - SIX_LOCK_write, NULL, NULL); - BUG_ON(ret); - } + ret = bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c); + if (ret) + goto fail; bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); } return 0; fail: - mark_btree_node_locked_noreset(i->path, i->level, SIX_LOCK_intent); - while (--i >= trans->updates) { if (same_leaf_as_prev(trans, i)) continue; @@ -926,8 +837,6 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (unlikely(ret)) return ret; - normalize_read_intent_locks(trans); - ret = trans_lock_write(trans); if (unlikely(ret)) return ret; @@ -1031,9 +940,11 @@ int bch2_trans_commit_error(struct btree_trans *trans, } BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); - BUG_ON(ret == -ENOSPC && - !(trans->flags & BTREE_INSERT_NOWAIT) && - (trans->flags & BTREE_INSERT_NOFAIL)); + + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && + !(trans->flags & BTREE_INSERT_NOWAIT) && + (trans->flags & BTREE_INSERT_NOFAIL), c, + "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); return ret; } @@ -1123,11 +1034,9 @@ int __bch2_trans_commit(struct btree_trans *trans) trans_for_each_update(trans, i) { BUG_ON(!i->path->should_be_locked); - if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) { - trace_and_count(c, trans_restart_upgrade, trans, _RET_IP_, i->path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); + ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1); + if (unlikely(ret)) goto out; - } BUG_ON(!btree_node_intent_locked(i->path, i->level)); @@ -1191,7 +1100,7 @@ err: goto retry; } -static int check_pos_snapshot_overwritten(struct btree_trans *trans, +static 
noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans, enum btree_id id, struct bpos pos) { @@ -1200,12 +1109,6 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans, struct bkey_s_c k; int ret; - if (!btree_type_has_snapshots(id)) - return 0; - - if (!snapshot_t(c, pos.snapshot)->children[0]) - return 0; - bch2_trans_iter_init(trans, &iter, id, pos, BTREE_ITER_NOT_EXTENTS| BTREE_ITER_ALL_SNAPSHOTS); @@ -1231,6 +1134,18 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans, return ret; } +static inline int check_pos_snapshot_overwritten(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) +{ + if (!btree_type_has_snapshots(id) || + pos.snapshot == U32_MAX || + !snapshot_t(trans->c, pos.snapshot)->children[0]) + return 0; + + return __check_pos_snapshot_overwritten(trans, id, pos); +} + int bch2_trans_update_extent(struct btree_trans *trans, struct btree_iter *orig_iter, struct bkey_i *insert, @@ -1716,15 +1631,18 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, int ret = 0; bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); -retry: - while ((k = bch2_btree_iter_peek(&iter)).k && - !(ret = bkey_err(k) ?: - btree_trans_too_many_iters(trans)) && - bkey_cmp(iter.pos, end) < 0) { + while ((k = bch2_btree_iter_peek(&iter)).k) { struct disk_reservation disk_res = bch2_disk_reservation_init(trans->c, 0); struct bkey_i delete; + ret = bkey_err(k); + if (ret) + goto err; + + if (bkey_cmp(iter.pos, end) >= 0) + break; + bkey_init(&delete.k); /* @@ -1753,23 +1671,27 @@ retry: ret = bch2_extent_trim_atomic(trans, &iter, &delete); if (ret) - break; + goto err; } ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: bch2_trans_commit(trans, &disk_res, journal_seq, BTREE_INSERT_NOFAIL); bch2_disk_reservation_put(trans->c, &disk_res); +err: + /* + * the bch2_trans_begin() call is in a weird place because we + * need to call it after every transaction commit, to avoid path + * overflow, but don't want to call it if the delete operation + * is a no-op and we have no work to do: + */ + bch2_trans_begin(trans); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; if (ret) break; } - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - bch2_trans_begin(trans); - ret = 0; - goto retry; - } - bch2_trans_iter_exit(trans, &iter); if (!ret && trans_was_restarted(trans, restart_count)) diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index b4be2122..8af0dd02 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -1999,7 +1999,7 @@ recalculate: ret = 0; } else { atomic64_set(&c->sectors_available, sectors_available); - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_disk_reservation; } mutex_unlock(&c->sectors_available_lock); diff --git a/libbcachefs/counters.c b/libbcachefs/counters.c index 745f856e..edd1b253 100644 --- a/libbcachefs/counters.c +++ b/libbcachefs/counters.c @@ -36,7 +36,7 @@ void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, for (i = 0; i < nr; i++) { if (i < BCH_COUNTER_NR) - prt_printf(out, "%s", bch2_counter_names[i]); + prt_printf(out, "%s ", bch2_counter_names[i]); else prt_printf(out, "(unknown)"); diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index fb518d59..bff5e9b6 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -11,6 +11,7 @@ #include "btree_cache.h" #include "btree_io.h" #include "btree_iter.h" +#include "btree_locking.h" #include "btree_update.h" #include "buckets.h" #include "debug.h" 
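
bch2_btree_delete_range_trans() loses its retry label: as the comment in the new version explains, bch2_trans_begin() now runs after every commit so paths cannot pile up across iterations, and a transaction-restart return simply means the same key is retried. A generic sketch of that loop shape, with made-up peek/commit_delete/begin helpers standing in for the transaction API:

#include <stdio.h>

#define RESTART 1	/* stand-in for a transaction-restart errcode */

static int peek(int pos)	/* nonzero while keys remain */
{
	return pos < 3;
}

static int commit_delete(int pos)
{
	static int restarted;

	/* pretend the commit for key 1 needs one restart */
	if (pos == 1 && !restarted++)
		return RESTART;
	return 0;
}

static void begin(void)
{
	/* would reset paths / start a new transaction */
}

static int delete_range(void)
{
	int pos = 0, ret = 0;

	while (peek(pos)) {
		ret = commit_delete(pos);

		/*
		 * begin() after *every* commit, even a successful one, so the
		 * number of live paths stays bounded; a restart just means
		 * "try this key again".
		 */
		begin();
		if (ret == RESTART)
			ret = 0;
		else if (ret)
			break;
		else
			pos++;
	}
	return ret;
}

int main(void)
{
	printf("%d\n", delete_range());
	return 0;
}
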
@@ -534,7 +535,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, mutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { - if (trans->task->pid <= i->iter) + if (trans->locking_wait.task->pid <= i->iter) continue; ret = flush_buf(i); @@ -546,11 +547,11 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, prt_printf(&i->buf, "backtrace:"); prt_newline(&i->buf); printbuf_indent_add(&i->buf, 2); - prt_backtrace(&i->buf, trans->task); + prt_backtrace(&i->buf, trans->locking_wait.task); printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); - i->iter = trans->task->pid; + i->iter = trans->locking_wait.task->pid; } mutex_unlock(&c->btree_trans_lock); @@ -707,6 +708,45 @@ static const struct file_operations lock_held_stats_op = { .read = lock_held_stats_read, }; +static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + struct btree_trans *trans; + ssize_t ret = 0; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + if (i->iter) + goto out; + + mutex_lock(&c->btree_trans_lock); + list_for_each_entry(trans, &c->btree_trans_list, list) + if (bch2_check_for_deadlock(trans, &i->buf)) { + i->iter = 1; + break; + } + mutex_unlock(&c->btree_trans_lock); +out: + if (i->buf.allocation_failure) + ret = -ENOMEM; + + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; +} + +static const struct file_operations btree_deadlock_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_btree_deadlock_read, +}; + void bch2_fs_debug_exit(struct bch_fs *c) { if (!IS_ERR_OR_NULL(c->fs_debug_dir)) @@ -738,6 +778,9 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, c, &lock_held_stats_op); + debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, + c->btree_debug, &btree_deadlock_ops); + c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); if (IS_ERR_OR_NULL(c->btree_debug_dir)) return; diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c index 22b6b841..19b44408 100644 --- a/libbcachefs/disk_groups.c +++ b/libbcachefs/disk_groups.c @@ -276,7 +276,7 @@ static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, groups = bch2_sb_resize_disk_groups(sb, u64s); if (!groups) - return -ENOSPC; + return -BCH_ERR_ENOSPC_disk_label_add; nr_groups = disk_groups_nr(groups); } diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index f33acf1a..aa830114 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -731,7 +731,7 @@ static int ec_stripe_bkey_insert(struct btree_trans *trans, continue; } - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_stripe_create; break; } @@ -1388,7 +1388,7 @@ static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, idx = get_existing_stripe(c, h); if (idx < 0) { bch_err(c, "failed to find an existing stripe"); - return -ENOSPC; + return -BCH_ERR_ENOSPC_stripe_reuse; } h->s->have_existing_stripe = true; diff --git a/libbcachefs/errcode.c b/libbcachefs/errcode.c index 9da8a597..cc9ce0be 100644 --- a/libbcachefs/errcode.c +++ b/libbcachefs/errcode.c @@ -15,7 +15,7 @@ static const char * const bch2_errcode_strs[] = { #define BCH_ERR_0 0 static unsigned bch2_errcode_parents[] = { -#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = BCH_ERR_##class, +#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class, BCH_ERRCODES() 
#undef x }; @@ -49,3 +49,14 @@ bool __bch2_err_matches(int err, int class) return err == class; } + +int __bch2_err_class(int err) +{ + err = -err; + BUG_ON((unsigned) err >= BCH_ERR_MAX); + + while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START]) + err = bch2_errcode_parents[err - BCH_ERR_START]; + + return -err; +} diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index 232f7c79..fc0bb5f8 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -2,53 +2,67 @@ #ifndef _BCACHEFS_ERRCODE_H #define _BCACHEFS_ERRCODE_H -#define BCH_ERRCODES() \ - x(0, open_buckets_empty) \ - x(0, freelist_empty) \ - x(freelist_empty, no_buckets_found) \ - x(0, insufficient_devices) \ - x(0, transaction_restart) \ - x(transaction_restart, transaction_restart_fault_inject) \ - x(transaction_restart, transaction_restart_relock) \ - x(transaction_restart, transaction_restart_relock_path) \ - x(transaction_restart, transaction_restart_relock_path_intent) \ - x(transaction_restart, transaction_restart_relock_after_fill) \ - x(transaction_restart, transaction_restart_too_many_iters) \ - x(transaction_restart, transaction_restart_lock_node_reused) \ - x(transaction_restart, transaction_restart_fill_relock) \ - x(transaction_restart, transaction_restart_fill_mem_alloc_fail)\ - x(transaction_restart, transaction_restart_mem_realloced) \ - x(transaction_restart, transaction_restart_in_traverse_all) \ - x(transaction_restart, transaction_restart_would_deadlock) \ - x(transaction_restart, transaction_restart_would_deadlock_write)\ - x(transaction_restart, transaction_restart_upgrade) \ - x(transaction_restart, transaction_restart_key_cache_upgrade) \ - x(transaction_restart, transaction_restart_key_cache_fill) \ - x(transaction_restart, transaction_restart_key_cache_raced) \ - x(transaction_restart, transaction_restart_key_cache_realloced)\ - x(transaction_restart, transaction_restart_journal_preres_get) \ - x(transaction_restart, transaction_restart_nested) \ - x(0, no_btree_node) \ - x(no_btree_node, no_btree_node_relock) \ - x(no_btree_node, no_btree_node_upgrade) \ - x(no_btree_node, no_btree_node_drop) \ - x(no_btree_node, no_btree_node_lock_root) \ - x(no_btree_node, no_btree_node_up) \ - x(no_btree_node, no_btree_node_down) \ - x(no_btree_node, no_btree_node_init) \ - x(no_btree_node, no_btree_node_cached) \ - x(0, backpointer_to_overwritten_btree_node) \ - x(0, lock_fail_node_reused) \ - x(0, lock_fail_root_changed) \ - x(0, journal_reclaim_would_deadlock) \ - x(0, fsck) \ - x(fsck, fsck_fix) \ - x(fsck, fsck_ignore) \ - x(fsck, fsck_errors_not_fixed) \ - x(fsck, fsck_repair_unimplemented) \ - x(fsck, fsck_repair_impossible) \ - x(0, need_snapshot_cleanup) \ - x(0, need_topology_repair) +#define BCH_ERRCODES() \ + x(ENOSPC, ENOSPC_disk_reservation) \ + x(ENOSPC, ENOSPC_bucket_alloc) \ + x(ENOSPC, ENOSPC_disk_label_add) \ + x(ENOSPC, ENOSPC_stripe_create) \ + x(ENOSPC, ENOSPC_stripe_reuse) \ + x(ENOSPC, ENOSPC_inode_create) \ + x(ENOSPC, ENOSPC_str_hash_create) \ + x(ENOSPC, ENOSPC_snapshot_create) \ + x(ENOSPC, ENOSPC_subvolume_create) \ + x(ENOSPC, ENOSPC_sb) \ + x(ENOSPC, ENOSPC_sb_journal) \ + x(ENOSPC, ENOSPC_sb_quota) \ + x(ENOSPC, ENOSPC_sb_replicas) \ + x(ENOSPC, ENOSPC_sb_members) \ + x(0, open_buckets_empty) \ + x(0, freelist_empty) \ + x(BCH_ERR_freelist_empty, no_buckets_found) \ + x(0, insufficient_devices) \ + x(0, transaction_restart) \ + x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \ + x(BCH_ERR_transaction_restart, transaction_restart_relock) 
\ + x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \ + x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \ + x(BCH_ERR_transaction_restart, transaction_restart_relock_after_fill) \ + x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \ + x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \ + x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \ + x(BCH_ERR_transaction_restart, transaction_restart_fill_mem_alloc_fail)\ + x(BCH_ERR_transaction_restart, transaction_restart_mem_realloced) \ + x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \ + x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \ + x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\ + x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\ + x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\ + x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \ + x(BCH_ERR_transaction_restart, transaction_restart_nested) \ + x(0, no_btree_node) \ + x(BCH_ERR_no_btree_node, no_btree_node_relock) \ + x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \ + x(BCH_ERR_no_btree_node, no_btree_node_drop) \ + x(BCH_ERR_no_btree_node, no_btree_node_lock_root) \ + x(BCH_ERR_no_btree_node, no_btree_node_up) \ + x(BCH_ERR_no_btree_node, no_btree_node_down) \ + x(BCH_ERR_no_btree_node, no_btree_node_init) \ + x(BCH_ERR_no_btree_node, no_btree_node_cached) \ + x(0, backpointer_to_overwritten_btree_node) \ + x(0, lock_fail_root_changed) \ + x(0, journal_reclaim_would_deadlock) \ + x(0, fsck) \ + x(BCH_ERR_fsck, fsck_fix) \ + x(BCH_ERR_fsck, fsck_ignore) \ + x(BCH_ERR_fsck, fsck_errors_not_fixed) \ + x(BCH_ERR_fsck, fsck_repair_unimplemented) \ + x(BCH_ERR_fsck, fsck_repair_impossible) \ + x(0, need_snapshot_cleanup) \ + x(0, need_topology_repair) enum bch_errcode { BCH_ERR_START = 2048, @@ -72,4 +86,11 @@ static inline bool _bch2_err_matches(int err, int class) _bch2_err_matches(_err, _class); \ }) +int __bch2_err_class(int); + +static inline long bch2_err_class(long err) +{ + return err < 0 ? __bch2_err_class(err) : err; +} + #endif /* _BCACHFES_ERRCODE_H */ diff --git a/libbcachefs/error.c b/libbcachefs/error.c index f6a895b2..762abdf2 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -68,102 +68,135 @@ void bch2_io_error(struct bch_dev *ca) #include "tools-util.h" #endif -int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) 
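
The errcode table now names each code's parent class inline (ENOSPC subcodes, transaction_restart subcodes and so on), and __bch2_err_class() walks the resulting parent array so only the outermost class, typically a plain errno, is handed back to userspace. A self-contained sketch of the same x-macro/parent-table scheme; the shortened table and the ERR_START/MY_ERRCODES/err_class names are mine, not the real ones:

#include <errno.h>
#include <stdio.h>

#define ERR_START 2048

/* same shape as the new BCH_ERRCODES() list: x(parent class, name) */
#define MY_ERRCODES()						\
	x(ENOSPC,			ENOSPC_sb)		\
	x(ENOSPC,			ENOSPC_bucket_alloc)	\
	x(0,				transaction_restart)	\
	x(ERR_transaction_restart,	transaction_restart_upgrade)

enum {
	ERR_DUMMY = ERR_START - 1,
#define x(class, err)	ERR_##err,
	MY_ERRCODES()
#undef x
	ERR_MAX
};

/* each code points at its parent; 0 terminates the chain */
static const unsigned err_parent[] = {
#define x(class, err)	[ERR_##err - ERR_START] = class,
	MY_ERRCODES()
#undef x
};

/* walk up to the outermost class before reporting to userspace */
static int err_class(int err)
{
	while (err >= ERR_START && err_parent[err - ERR_START])
		err = err_parent[err - ERR_START];
	return err;
}

int main(void)
{
	printf("%d -> %d (ENOSPC = %d)\n",
	       ERR_ENOSPC_sb, err_class(ERR_ENOSPC_sb), ENOSPC);
	return 0;
}
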
+static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) { - struct fsck_err_state *s = NULL; - va_list args; - bool fix = false, print = true, suppressing = false; - char _buf[sizeof(s->buf)], *buf = _buf; + struct fsck_err_state *s; - if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { - va_start(args, fmt); - vprintk(fmt, args); - va_end(args); - - if (c->opts.errors == BCH_ON_ERROR_continue) { - bch_err(c, "fixing"); - return -BCH_ERR_fsck_fix; - } else { - bch2_inconsistent_error(c); - return -BCH_ERR_fsck_errors_not_fixed; - } - } - - mutex_lock(&c->fsck_error_lock); + if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) + return NULL; list_for_each_entry(s, &c->fsck_errors, list) - if (s->fmt == fmt) - goto found; + if (s->fmt == fmt) { + /* + * move it to the head of the list: repeated fsck errors + * are common + */ + list_move(&s->list, &c->fsck_errors); + return s; + } s = kzalloc(sizeof(*s), GFP_NOFS); if (!s) { if (!c->fsck_alloc_err) bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); c->fsck_alloc_err = true; - buf = _buf; - goto print; + return NULL; } INIT_LIST_HEAD(&s->list); s->fmt = fmt; -found: - list_move(&s->list, &c->fsck_errors); - s->nr++; - if (c->opts.ratelimit_errors && - !(flags & FSCK_NO_RATELIMIT) && - s->nr >= FSCK_ERR_RATELIMIT_NR) { - if (s->nr == FSCK_ERR_RATELIMIT_NR) - suppressing = true; - else - print = false; + s->buf = PRINTBUF; + list_add(&s->list, &c->fsck_errors); + return s; +} + +int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) +{ + struct fsck_err_state *s = NULL; + va_list args; + bool print = true, suppressing = false; + struct printbuf buf = PRINTBUF, *out = &buf; + int ret = -BCH_ERR_fsck_ignore; + + mutex_lock(&c->fsck_error_lock); + s = fsck_err_get(c, fmt); + if (s) { + if (c->opts.ratelimit_errors && + !(flags & FSCK_NO_RATELIMIT) && + s->nr >= FSCK_ERR_RATELIMIT_NR) { + if (s->nr == FSCK_ERR_RATELIMIT_NR) + suppressing = true; + else + print = false; + } + + printbuf_reset(&s->buf); + out = &s->buf; + s->nr++; } - buf = s->buf; -print: + + if (!strncmp(fmt, "bcachefs:", 9)) + prt_printf(out, bch2_log_msg(c, "")); + va_start(args, fmt); - vscnprintf(buf, sizeof(_buf), fmt, args); + prt_vprintf(out, fmt, args); va_end(args); - if (c->opts.fix_errors == FSCK_OPT_EXIT) { - bch_err(c, "%s, exiting", buf); + if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { + if (c->opts.errors != BCH_ON_ERROR_continue || + !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { + prt_str(out, ", shutting down"); + bch2_inconsistent_error(c); + ret = -BCH_ERR_fsck_errors_not_fixed; + } else if (flags & FSCK_CAN_FIX) { + prt_str(out, ", fixing"); + ret = -BCH_ERR_fsck_fix; + } else { + prt_str(out, ", continuing"); + ret = -BCH_ERR_fsck_ignore; + } + } else if (c->opts.fix_errors == FSCK_OPT_EXIT) { + prt_str(out, ", exiting"); + ret = -BCH_ERR_fsck_errors_not_fixed; } else if (flags & FSCK_CAN_FIX) { if (c->opts.fix_errors == FSCK_OPT_ASK) { - printk(KERN_ERR "%s: fix?", buf); - fix = ask_yn(); + prt_str(out, ": fix?"); + bch2_print_string_as_lines(KERN_ERR, out->buf); + print = false; + ret = ask_yn() + ? 
-BCH_ERR_fsck_fix + : -BCH_ERR_fsck_ignore; } else if (c->opts.fix_errors == FSCK_OPT_YES || (c->opts.nochanges && !(flags & FSCK_CAN_IGNORE))) { - if (print) - bch_err(c, "%s, fixing", buf); - fix = true; + prt_str(out, ", fixing"); + ret = -BCH_ERR_fsck_fix; } else { - if (print) - bch_err(c, "%s, not fixing", buf); - fix = false; + prt_str(out, ", not fixing"); } } else if (flags & FSCK_NEED_FSCK) { - if (print) - bch_err(c, "%s (run fsck to correct)", buf); + prt_str(out, " (run fsck to correct)"); } else { - if (print) - bch_err(c, "%s (repair unimplemented)", buf); + prt_str(out, " (repair unimplemented)"); } - if (suppressing) + if (ret == -BCH_ERR_fsck_ignore && + (c->opts.fix_errors == FSCK_OPT_EXIT || + !(flags & FSCK_CAN_IGNORE))) + ret = -BCH_ERR_fsck_errors_not_fixed; + + if (print) + bch2_print_string_as_lines(KERN_ERR, out->buf); + + if (!test_bit(BCH_FS_FSCK_DONE, &c->flags) && + (ret != -BCH_ERR_fsck_fix && + ret != -BCH_ERR_fsck_ignore)) + bch_err(c, "Unable to continue, halting"); + else if (suppressing) bch_err(c, "Ratelimiting new instances of previous error"); mutex_unlock(&c->fsck_error_lock); - if (fix) { + printbuf_exit(&buf); + + if (ret == -BCH_ERR_fsck_fix) { set_bit(BCH_FS_ERRORS_FIXED, &c->flags); - return -BCH_ERR_fsck_fix; } else { set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); set_bit(BCH_FS_ERROR, &c->flags); - return c->opts.fix_errors == FSCK_OPT_EXIT || - !(flags & FSCK_CAN_IGNORE) - ? -BCH_ERR_fsck_errors_not_fixed - : -BCH_ERR_fsck_ignore; } + + return ret; } void bch2_flush_fsck_errs(struct bch_fs *c) @@ -174,9 +207,10 @@ void bch2_flush_fsck_errs(struct bch_fs *c) list_for_each_entry_safe(s, n, &c->fsck_errors, list) { if (s->ratelimited) - bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf); + bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf.buf); list_del(&s->list); + printbuf_exit(&s->buf); kfree(s); } diff --git a/libbcachefs/error.h b/libbcachefs/error.h index b603d738..bbf9b6d8 100644 --- a/libbcachefs/error.h +++ b/libbcachefs/error.h @@ -103,7 +103,7 @@ struct fsck_err_state { const char *fmt; u64 nr; bool ratelimited; - char buf[512]; + struct printbuf buf; }; #define FSCK_CAN_FIX (1 << 0) @@ -121,7 +121,6 @@ void bch2_flush_fsck_errs(struct bch_fs *); \ if (_ret != -BCH_ERR_fsck_fix && \ _ret != -BCH_ERR_fsck_ignore) { \ - bch_err(c, "Unable to continue, halting"); \ ret = _ret; \ goto fsck_err; \ } \ diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 0a7f172f..95b84c3c 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -1213,7 +1213,7 @@ int bch2_read_folio(struct file *file, struct folio *folio) ret = bch2_read_single_page(page, page->mapping); folio_unlock(folio); - return ret; + return bch2_err_class(ret); } /* writepages: */ @@ -1249,8 +1249,6 @@ static void bch2_writepage_io_done(struct closure *cl) struct bio_vec *bvec; unsigned i; - up(&io->op.c->io_in_flight); - if (io->op.error) { set_bit(EI_INODE_ERROR, &io->inode->ei_flags); @@ -1313,8 +1311,6 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w) { struct bch_writepage_io *io = w->io; - down(&io->op.c->io_in_flight); - w->io = NULL; closure_call(&io->op.cl, bch2_write, NULL, &io->cl); continue_at(&io->cl, bch2_writepage_io_done, NULL); @@ -1501,7 +1497,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc if (w.io) bch2_writepage_do_io(&w); blk_finish_plug(&plug); - return ret; + return bch2_err_class(ret); } /* buffered writes: */ @@ -1586,7 +1582,7 @@ err_unlock: 
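
bch2_fsck_err() now builds the description and its verdict (", fixing", ", not fixing", ", exiting" and so on) in a printbuf, prints the result once with bch2_print_string_as_lines(), and returns one of the -BCH_ERR_fsck_* codes instead of a bool. A toy sketch of that accumulate-then-emit shape, using a plain char buffer and made-up names rather than the printbuf API:

#include <stdarg.h>
#include <stdio.h>

enum { FSCK_FIX = 1, FSCK_IGNORE };

static int fsck_report(int can_fix, const char *fmt, ...)
{
	char buf[256];
	va_list args;
	int n, ret;

	/* 1) format the description */
	va_start(args, fmt);
	n = vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);

	/* 2) append the verdict to the same message */
	ret = can_fix ? FSCK_FIX : FSCK_IGNORE;
	if (n >= 0 && (size_t) n < sizeof(buf))
		snprintf(buf + n, sizeof(buf) - n,
			 can_fix ? ", fixing" : ", not fixing");

	/* 3) emit once, and hand the verdict back as the return value */
	fprintf(stderr, "%s\n", buf);
	return ret;
}

int main(void)
{
	return fsck_report(1, "bad key at %u:%u", 1U, 2U) != FSCK_FIX;
}
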
bch2_pagecache_add_put(&inode->ei_pagecache_lock); kfree(res); *fsdata = NULL; - return ret; + return bch2_err_class(ret); } int bch2_write_end(struct file *file, struct address_space *mapping, @@ -2010,7 +2006,7 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_pos, iocb->ki_pos + count - 1); if (ret < 0) - return ret; + goto out; file_accessed(file); @@ -2025,8 +2021,8 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) ret = generic_file_read_iter(iocb, iter); bch2_pagecache_add_put(&inode->ei_pagecache_lock); } - - return ret; +out: + return bch2_err_class(ret); } /* O_DIRECT writes */ @@ -2094,8 +2090,6 @@ static long bch2_dio_write_loop(struct dio_write *dio) if (dio->loop) goto loop; - down(&c->io_in_flight); - while (1) { iter_count = dio->iter.count; @@ -2226,7 +2220,6 @@ loop: ret = dio->op.error ?: ((long) dio->written << 9); err: - up(&c->io_in_flight); bch2_pagecache_block_put(&inode->ei_pagecache_lock); bch2_quota_reservation_put(c, inode, &dio->quota_res); @@ -2347,8 +2340,10 @@ ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) struct bch_inode_info *inode = file_bch_inode(file); ssize_t ret; - if (iocb->ki_flags & IOCB_DIRECT) - return bch2_direct_write(iocb, from); + if (iocb->ki_flags & IOCB_DIRECT) { + ret = bch2_direct_write(iocb, from); + goto out; + } /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(&inode->v); @@ -2375,8 +2370,8 @@ unlock: if (ret > 0) ret = generic_write_sync(iocb, ret); - - return ret; +out: + return bch2_err_class(ret); } /* fsync: */ @@ -2410,7 +2405,7 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret2 = sync_inode_metadata(&inode->v, 1); ret3 = bch2_flush_inode(c, inode_inum(inode)); - return ret ?: ret2 ?: ret3; + return bch2_err_class(ret ?: ret2 ?: ret3); } /* truncate: */ @@ -2716,7 +2711,7 @@ int bch2_truncate(struct user_namespace *mnt_userns, ret = bch2_setattr_nonsize(mnt_userns, inode, iattr); err: bch2_pagecache_block_put(&inode->ei_pagecache_lock); - return ret; + return bch2_err_class(ret); } /* fallocate: */ @@ -3044,7 +3039,7 @@ bkey_err: bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */ mark_pagecache_reserved(inode, start_sector, iter.pos.offset); - if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) { + if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) { struct quota_res quota_res = { 0 }; s64 i_sectors_delta = 0; @@ -3095,7 +3090,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, * so that the VFS cache i_size is consistent with the btree i_size: */ if (ret && - !(ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE))) + !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE))) return ret; if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) @@ -3146,7 +3141,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, inode_unlock(&inode->v); percpu_ref_put(&c->writes); - return ret; + return bch2_err_class(ret); } loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, @@ -3224,7 +3219,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, err: bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); - return ret; + return bch2_err_class(ret); } /* fseek: */ @@ -3447,18 +3442,26 @@ err: loff_t bch2_llseek(struct file *file, loff_t offset, int whence) { + loff_t ret; + switch (whence) { case SEEK_SET: case SEEK_CUR: case SEEK_END: - return generic_file_llseek(file, offset, 
whence); + ret = generic_file_llseek(file, offset, whence); + break; case SEEK_DATA: - return bch2_seek_data(file, offset); + ret = bch2_seek_data(file, offset); + break; case SEEK_HOLE: - return bch2_seek_hole(file, offset); + ret = bch2_seek_hole(file, offset); + break; + default: + ret = -EINVAL; + break; } - return -EINVAL; + return bch2_err_class(ret); } void bch2_fs_fsio_exit(struct bch_fs *c) diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 9f329a62..bab0707b 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -455,51 +455,67 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; + long ret; switch (cmd) { case FS_IOC_GETFLAGS: - return bch2_ioc_getflags(inode, (int __user *) arg); + ret = bch2_ioc_getflags(inode, (int __user *) arg); + break; case FS_IOC_SETFLAGS: - return bch2_ioc_setflags(c, file, inode, (int __user *) arg); + ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg); + break; case FS_IOC_FSGETXATTR: - return bch2_ioc_fsgetxattr(inode, (void __user *) arg); + ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg); + break; + case FS_IOC_FSSETXATTR: - return bch2_ioc_fssetxattr(c, file, inode, - (void __user *) arg); + ret = bch2_ioc_fssetxattr(c, file, inode, + (void __user *) arg); + break; case BCHFS_IOC_REINHERIT_ATTRS: - return bch2_ioc_reinherit_attrs(c, file, inode, - (void __user *) arg); + ret = bch2_ioc_reinherit_attrs(c, file, inode, + (void __user *) arg); + break; case FS_IOC_GETVERSION: - return -ENOTTY; + ret = -ENOTTY; + break; + case FS_IOC_SETVERSION: - return -ENOTTY; + ret = -ENOTTY; + break; case FS_IOC_GOINGDOWN: - return bch2_ioc_goingdown(c, (u32 __user *) arg); + ret = bch2_ioc_goingdown(c, (u32 __user *) arg); + break; case BCH_IOCTL_SUBVOLUME_CREATE: { struct bch_ioctl_subvolume i; - if (copy_from_user(&i, (void __user *) arg, sizeof(i))) - return -EFAULT; - return bch2_ioctl_subvolume_create(c, file, i); + ret = copy_from_user(&i, (void __user *) arg, sizeof(i)) + ? -EFAULT + : bch2_ioctl_subvolume_create(c, file, i); + break; } case BCH_IOCTL_SUBVOLUME_DESTROY: { struct bch_ioctl_subvolume i; - if (copy_from_user(&i, (void __user *) arg, sizeof(i))) - return -EFAULT; - return bch2_ioctl_subvolume_destroy(c, file, i); + ret = copy_from_user(&i, (void __user *) arg, sizeof(i)) + ? 
-EFAULT + : bch2_ioctl_subvolume_destroy(c, file, i); + break; } default: - return bch2_fs_ioctl(c, cmd, (void __user *) arg); + ret = bch2_fs_ioctl(c, cmd, (void __user *) arg); + break; } + + return bch2_err_class(ret); } #ifdef CONFIG_COMPAT diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 3e2b6097..b5977c46 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -769,7 +769,7 @@ err_trans: err: mutex_unlock(&inode->ei_update_lock); - return ret; + return bch2_err_class(ret); } static int bch2_getattr(struct user_namespace *mnt_userns, @@ -1453,7 +1453,7 @@ static int bch2_vfs_write_inode(struct inode *vinode, ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); - return ret; + return bch2_err_class(ret); } static void bch2_evict_inode(struct inode *vinode) @@ -1557,6 +1557,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) static int bch2_sync_fs(struct super_block *sb, int wait) { struct bch_fs *c = sb->s_fs_info; + int ret; if (c->opts.journal_flush_disabled) return 0; @@ -1566,7 +1567,8 @@ static int bch2_sync_fs(struct super_block *sb, int wait) return 0; } - return bch2_journal_flush(&c->journal); + ret = bch2_journal_flush(&c->journal); + return bch2_err_class(ret); } static struct bch_fs *bch2_path_to_fs(const char *path) @@ -1622,7 +1624,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) ret = bch2_parse_mount_opts(c, &opts, data); if (ret) - return ret; + goto err; if (opts.read_only != c->opts.read_only) { down_write(&c->state_lock); @@ -1636,7 +1638,8 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) if (ret) { bch_err(c, "error going rw: %i", ret); up_write(&c->state_lock); - return -EINVAL; + ret = -EINVAL; + goto err; } sb->s_flags &= ~SB_RDONLY; @@ -1649,8 +1652,8 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) if (opts.errors >= 0) c->opts.errors = opts.errors; - - return ret; +err: + return bch2_err_class(ret); } static int bch2_show_devname(struct seq_file *seq, struct dentry *root) diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 08310600..1f2782fc 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -567,7 +567,7 @@ again: } if (!ret && start == min) - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_inode_create; if (ret) { bch2_trans_iter_exit(trans, iter); diff --git a/libbcachefs/io.c b/libbcachefs/io.c index a683a689..e047ef28 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -535,17 +535,11 @@ static void bch2_write_done(struct closure *cl) } } -/** - * bch_write_index - after a write, update index to point to new data - */ -static void __bch2_write_index(struct bch_write_op *op) +static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) { - struct bch_fs *c = op->c; struct keylist *keys = &op->insert_keys; struct bch_extent_ptr *ptr; - struct bkey_i *src, *dst = keys->keys, *n, *k; - unsigned dev; - int ret; + struct bkey_i *src, *dst = keys->keys, *n; for (src = keys->keys; src != keys->top; src = n) { n = bkey_next(src); @@ -554,10 +548,8 @@ static void __bch2_write_index(struct bch_write_op *op) bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, test_bit(ptr->dev, op->failed.d)); - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) { - ret = -EIO; - goto err; - } + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) + return -EIO; } if (dst != src) @@ -566,6 +558,25 @@ static void __bch2_write_index(struct bch_write_op *op) } keys->top = dst; + return 0; +} + +/** + * bch_write_index - after a write, update index to point 
to new data + */ +static void __bch2_write_index(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct keylist *keys = &op->insert_keys; + struct bkey_i *k; + unsigned dev; + int ret; + + if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + ret = bch2_write_drop_io_error_ptrs(op); + if (ret) + goto err; + } /* * probably not the ideal place to hook this in, but I don't @@ -640,8 +651,10 @@ static void bch2_write_endio(struct bio *bio) op->pos.inode, op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */ "data write error: %s", - bch2_blk_status_to_str(bio->bi_status))) + bch2_blk_status_to_str(bio->bi_status))) { set_bit(wbio->dev, op->failed.d); + op->flags |= BCH_WRITE_IO_ERROR; + } if (wbio->have_ioref) { bch2_latency_acct(ca, wbio->submit_time, WRITE); diff --git a/libbcachefs/io.h b/libbcachefs/io.h index fb511451..3ae31758 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -40,6 +40,7 @@ enum bch_write_flags { BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10), BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11), BCH_WRITE_DONE = (1 << 12), + BCH_WRITE_IO_ERROR = (1 << 13), }; static inline u64 *op_journal_seq(struct bch_write_op *op) diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 3e8972c2..ab594623 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -809,14 +809,16 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (new_fs) { bu[nr_got] = bch2_bucket_alloc_new_fs(ca); if (bu[nr_got] < 0) { - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_bucket_alloc; break; } } else { ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, false, cl); if (IS_ERR(ob[nr_got])) { - ret = cl ? -EAGAIN : -ENOSPC; + ret = cl + ? -EAGAIN + : -BCH_ERR_ENOSPC_bucket_alloc; break; } @@ -943,10 +945,11 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, * reservation to ensure we'll actually be able to allocate: */ - if (bch2_disk_reservation_get(c, &disk_res, - bucket_to_sector(ca, nr - ja->nr), 1, 0)) { + ret = bch2_disk_reservation_get(c, &disk_res, + bucket_to_sector(ca, nr - ja->nr), 1, 0); + if (ret) { mutex_unlock(&c->sb_lock); - return -ENOSPC; + return ret; } ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 55b86cbd..253a6ae2 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -187,30 +187,57 @@ static void journal_entry_null_range(void *start, void *end) #define JOURNAL_ENTRY_NONE 6 #define JOURNAL_ENTRY_BAD 7 -#define journal_entry_err(c, msg, ...) \ +static void journal_entry_err_msg(struct printbuf *out, + struct jset *jset, + struct jset_entry *entry) +{ + prt_str(out, "invalid journal entry "); + if (entry) + prt_printf(out, "%s ", bch2_jset_entry_types[entry->type]); + + if (!jset) + prt_printf(out, "in superblock"); + else if (!entry) + prt_printf(out, "at seq %llu", le64_to_cpu(jset->seq)); + else + prt_printf(out, "at offset %zi/%u seq %llu", + (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s), + le64_to_cpu(jset->seq)); + prt_str(out, ": "); +} + +#define journal_entry_err(c, jset, entry, msg, ...) 
\ ({ \ + struct printbuf buf = PRINTBUF; \ + \ + journal_entry_err_msg(&buf, jset, entry); \ + prt_printf(&buf, msg, ##__VA_ARGS__); \ + \ switch (write) { \ case READ: \ - mustfix_fsck_err(c, msg, ##__VA_ARGS__); \ + mustfix_fsck_err(c, "%s", buf.buf); \ break; \ case WRITE: \ - bch_err(c, "corrupt metadata before write:\n" \ - msg, ##__VA_ARGS__); \ + bch_err(c, "corrupt metadata before write: %s\n", buf.buf);\ if (bch2_fs_inconsistent(c)) { \ ret = -BCH_ERR_fsck_errors_not_fixed; \ goto fsck_err; \ } \ break; \ } \ + \ + printbuf_exit(&buf); \ true; \ }) -#define journal_entry_err_on(cond, c, msg, ...) \ - ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false) +#define journal_entry_err_on(cond, c, jset, entry, msg, ...) \ + ((cond) ? journal_entry_err(c, jset, entry, msg, ##__VA_ARGS__) : false) #define FSCK_DELETED_KEY 5 -static int journal_validate_key(struct bch_fs *c, const char *where, +static int journal_validate_key(struct bch_fs *c, + struct jset *jset, struct jset_entry *entry, unsigned level, enum btree_id btree_id, struct bkey_i *k, @@ -220,33 +247,24 @@ static int journal_validate_key(struct bch_fs *c, const char *where, struct printbuf buf = PRINTBUF; int ret = 0; - if (journal_entry_err_on(!k->k.u64s, c, - "invalid key in %s at %s offset %zi/%u: k->u64s 0", - bch2_jset_entry_types[entry->type], where, - (u64 *) k - entry->_data, - le16_to_cpu(entry->u64s))) { + if (journal_entry_err_on(!k->k.u64s, c, jset, entry, "k->u64s 0")) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); return FSCK_DELETED_KEY; } if (journal_entry_err_on((void *) bkey_next(k) > - (void *) vstruct_next(entry), c, - "invalid key in %s at %s offset %zi/%u: extends past end of journal entry", - bch2_jset_entry_types[entry->type], where, - (u64 *) k - entry->_data, - le16_to_cpu(entry->u64s))) { + (void *) vstruct_next(entry), + c, jset, entry, + "extends past end of journal entry")) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); return FSCK_DELETED_KEY; } - if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, - "invalid key in %s at %s offset %zi/%u: bad format %u", - bch2_jset_entry_types[entry->type], where, - (u64 *) k - entry->_data, - le16_to_cpu(entry->u64s), - k->k.format)) { + if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, + c, jset, entry, + "bad format %u", k->k.format)) { le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); journal_entry_null_range(vstruct_next(entry), next); @@ -260,10 +278,11 @@ static int journal_validate_key(struct bch_fs *c, const char *where, if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), __btree_node_type(level, btree_id), write, &buf)) { printbuf_reset(&buf); - prt_printf(&buf, "invalid key in %s at %s offset %zi/%u:", - bch2_jset_entry_types[entry->type], where, - (u64 *) k - entry->_data, - le16_to_cpu(entry->u64s)); + prt_printf(&buf, "invalid journal entry %s at offset %zi/%u seq %llu:", + bch2_jset_entry_types[entry->type], + (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s), + le64_to_cpu(jset->seq)); prt_newline(&buf); printbuf_indent_add(&buf, 2); @@ -291,14 +310,14 @@ fsck_err: } static int journal_entry_btree_keys_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { struct bkey_i *k = entry->start; while (k != vstruct_last(entry)) { - int ret = journal_validate_key(c, where, 
entry, + int ret = journal_validate_key(c, jset, entry, entry->level, entry->btree_id, k, version, big_endian, write); @@ -329,7 +348,7 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_btree_root_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -337,7 +356,8 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(!entry->u64s || - le16_to_cpu(entry->u64s) != k->k.u64s, c, + le16_to_cpu(entry->u64s) != k->k.u64s, + c, jset, entry, "invalid btree root journal entry: wrong number of keys")) { void *next = vstruct_next(entry); /* @@ -350,7 +370,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, return 0; } - return journal_validate_key(c, where, entry, 1, entry->btree_id, k, + return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, version, big_endian, write); fsck_err: return ret; @@ -363,7 +383,7 @@ static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_prio_ptrs_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -377,13 +397,14 @@ static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_blacklist_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { int ret = 0; - if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c, + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, + c, jset, entry, "invalid journal seq blacklist entry: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); } @@ -401,14 +422,15 @@ static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_blacklist_v2_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { struct jset_entry_blacklist_v2 *bl_entry; int ret = 0; - if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c, + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, + c, jset, entry, "invalid journal seq blacklist entry: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); goto out; @@ -417,7 +439,8 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c, bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > - le64_to_cpu(bl_entry->end), c, + le64_to_cpu(bl_entry->end), + c, jset, entry, "invalid journal seq blacklist entry: start > end")) { journal_entry_null_range(entry, vstruct_next(entry)); } @@ -438,7 +461,7 @@ static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_ } static int journal_entry_usage_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -448,7 +471,7 @@ static int journal_entry_usage_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(bytes < sizeof(*u), - c, + c, jset, entry, "invalid journal entry usage: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -470,7 +493,7 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, } static int journal_entry_data_usage_validate(struct bch_fs *c, - 
const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -481,7 +504,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, if (journal_entry_err_on(bytes < sizeof(*u) || bytes < sizeof(*u) + u->r.nr_devs, - c, + c, jset, entry, "invalid journal entry usage: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -502,7 +525,7 @@ static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_clock_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -512,13 +535,13 @@ static int journal_entry_clock_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(bytes != sizeof(*clock), - c, "invalid journal entry clock: bad size")) { + c, jset, entry, "bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } if (journal_entry_err_on(clock->rw > 1, - c, "invalid journal entry clock: bad rw")) { + c, jset, entry, "bad rw")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } @@ -537,7 +560,7 @@ static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, } static int journal_entry_dev_usage_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -549,7 +572,7 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(bytes < expected, - c, "invalid journal entry dev usage: bad size (%u < %u)", + c, jset, entry, "bad size (%u < %u)", bytes, expected)) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -558,13 +581,13 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, dev = le32_to_cpu(u->dev); if (journal_entry_err_on(!bch2_dev_exists2(c, dev), - c, "invalid journal entry dev usage: bad dev")) { + c, jset, entry, "bad dev")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } if (journal_entry_err_on(u->pad, - c, "invalid journal entry dev usage: bad pad")) { + c, jset, entry, "bad pad")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } @@ -597,7 +620,7 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_log_validate(struct bch_fs *c, - const char *where, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { @@ -613,11 +636,12 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "%.*s", bytes, l->d); } -static int journal_entry_overwrite_validate(struct bch_fs *c, const char *where, +static int journal_entry_overwrite_validate(struct bch_fs *c, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { - return journal_entry_btree_keys_validate(c, where, entry, version, big_endian, write); + return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, write); } static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, @@ -627,7 +651,7 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs } struct jset_entry_ops { - int (*validate)(struct bch_fs *, const char *, + int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, int); void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); }; @@ -642,12 +666,13 @@ 
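
journal_entry_err_msg() gives every validator a common prefix identifying the failing entry by type, offset within the jset and sequence number, so the individual validators only supply the short reason ("bad size", "bad dev", ...). A stand-alone sketch of that shared prefix builder, with toy jset/entry structs in place of the real ones:

#include <stdint.h>
#include <stdio.h>

/* toy stand-ins, just enough to show the message shape */
struct toy_jset  { uint64_t seq; uint32_t u64s; uint64_t data[32]; };
struct toy_entry { const char *type; const uint64_t *pos; };

/* one helper builds the "which entry is this" prefix for every validator */
static void entry_err_msg(char *buf, size_t len, const struct toy_jset *j,
			  const struct toy_entry *e, const char *reason)
{
	snprintf(buf, len, "invalid journal entry %s at offset %ld/%u seq %llu: %s",
		 e->type, (long) (e->pos - j->data), j->u64s,
		 (unsigned long long) j->seq, reason);
}

int main(void)
{
	struct toy_jset  j = { .seq = 123, .u64s = 32 };
	struct toy_entry e = { .type = "clock", .pos = j.data + 4 };
	char buf[128];

	entry_err_msg(buf, sizeof(buf), &j, &e, "bad size");
	puts(buf);
	return 0;
}
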
static const struct jset_entry_ops bch2_jset_entry_ops[] = { #undef x }; -int bch2_journal_entry_validate(struct bch_fs *c, const char *where, +int bch2_journal_entry_validate(struct bch_fs *c, + struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, int write) { return entry->type < BCH_JSET_ENTRY_NR - ? bch2_jset_entry_ops[entry->type].validate(c, where, entry, + ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, version, big_endian, write) : 0; } @@ -666,24 +691,18 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, static int jset_validate_entries(struct bch_fs *c, struct jset *jset, int write) { - char buf[100]; struct jset_entry *entry; int ret = 0; vstruct_for_each(jset, entry) { - scnprintf(buf, sizeof(buf), "jset %llu entry offset %zi/%u", - le64_to_cpu(jset->seq), - (u64 *) entry - jset->_data, - le32_to_cpu(jset->u64s)); - if (journal_entry_err_on(vstruct_next(entry) > - vstruct_last(jset), c, + vstruct_last(jset), c, jset, entry, "journal entry extends past end of jset")) { jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); break; } - ret = bch2_journal_entry_validate(c, buf, entry, + ret = bch2_journal_entry_validate(c, jset, entry, le32_to_cpu(jset->version), JSET_BIG_ENDIAN(jset), write); if (ret) @@ -711,7 +730,8 @@ static int jset_validate(struct bch_fs *c, version = le32_to_cpu(jset->version); if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && version < bcachefs_metadata_version_min) || - version >= bcachefs_metadata_version_max, c, + version >= bcachefs_metadata_version_max, + c, jset, NULL, "%s sector %llu seq %llu: unknown journal entry version %u", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), @@ -724,7 +744,8 @@ static int jset_validate(struct bch_fs *c, sectors_read < bucket_sectors_left) return JOURNAL_ENTRY_REREAD; - if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, + if (journal_entry_err_on(bytes > bucket_sectors_left << 9, + c, jset, NULL, "%s sector %llu seq %llu: journal entry too big (%zu bytes)", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), bytes)) { @@ -733,7 +754,8 @@ static int jset_validate(struct bch_fs *c, -((bytes - (bucket_sectors_left << 9)) / 8)); } - if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, + if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), + c, jset, NULL, "%s sector %llu seq %llu: journal entry with unknown csum type %llu", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), @@ -746,7 +768,8 @@ static int jset_validate(struct bch_fs *c, goto csum_done; csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); - if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, + if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), + c, jset, NULL, "%s sector %llu seq %llu: journal checksum bad", ca ? 
ca->name : c->name, sector, le64_to_cpu(jset->seq))) @@ -760,7 +783,8 @@ static int jset_validate(struct bch_fs *c, csum_done: /* last_seq is ignored when JSET_NO_FLUSH is true */ if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && - le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, + le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), + c, jset, NULL, "invalid journal entry: last_seq > seq (%llu > %llu)", le64_to_cpu(jset->last_seq), le64_to_cpu(jset->seq))) { diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h index 30e995c8..1a91f2c0 100644 --- a/libbcachefs/journal_io.h +++ b/libbcachefs/journal_io.h @@ -44,7 +44,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ vstruct_for_each_safe(entry, k, _n) -int bch2_journal_entry_validate(struct bch_fs *, const char *, +int bch2_journal_entry_validate(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, int); void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, struct jset_entry *); diff --git a/libbcachefs/journal_sb.c b/libbcachefs/journal_sb.c index 001cecec..cfdbd92d 100644 --- a/libbcachefs/journal_sb.c +++ b/libbcachefs/journal_sb.c @@ -197,7 +197,7 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca) j = bch2_sb_resize_journal_v2(&ca->disk_sb, (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64)); if (!j) - return -ENOSPC; + return -BCH_ERR_ENOSPC_sb_journal; bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c index 454c76e0..c12d715f 100644 --- a/libbcachefs/quota.c +++ b/libbcachefs/quota.c @@ -665,7 +665,7 @@ static int bch2_quota_set_info(struct super_block *sb, int type, sb_quota = bch2_sb_resize_quota(&c->disk_sb, sizeof(*sb_quota) / sizeof(u64)); if (!sb_quota) - return -ENOSPC; + return -BCH_ERR_ENOSPC_sb_quota; } if (info->i_fieldmask & QC_SPC_TIMER) diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index 9cb47ba6..fcf73d72 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -478,7 +478,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) { n = cpu_replicas_add_entry(&c->replicas_gc, e); if (!n.entries) { - ret = -ENOSPC; + ret = -ENOMEM; goto err; } @@ -487,10 +487,9 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) } } - if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) { - ret = -ENOSPC; + ret = bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); + if (ret) goto err; - } ret = replicas_table_update(c, &c->replicas_gc); err: @@ -593,10 +592,9 @@ retry: bch2_cpu_replicas_sort(&new); - if (bch2_cpu_replicas_to_sb_replicas(c, &new)) { - ret = -ENOSPC; + ret = bch2_cpu_replicas_to_sb_replicas(c, &new); + if (ret) goto err; - } ret = replicas_table_update(c, &new); err: @@ -751,7 +749,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) - return -ENOSPC; + return -BCH_ERR_ENOSPC_sb_replicas; bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb); @@ -796,7 +794,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, sb_r = bch2_sb_resize_replicas(&c->disk_sb, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) - return -ENOSPC; + return -BCH_ERR_ENOSPC_sb_replicas; bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); sb_r = 
bch2_sb_get_replicas(c->disk_sb.sb); diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index 560983df..6178ae62 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -207,7 +207,7 @@ bch2_hash_hole(struct btree_trans *trans, return 0; bch2_trans_iter_exit(trans, iter); - return ret ?: -ENOSPC; + return ret ?: -BCH_ERR_ENOSPC_str_hash_create; } static __always_inline @@ -277,7 +277,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans, } if (!ret) - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_str_hash_create; out: bch2_trans_iter_exit(trans, &slot); bch2_trans_iter_exit(trans, &iter); diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c index fb3f8e40..8c98bacc 100644 --- a/libbcachefs/subvolume.c +++ b/libbcachefs/subvolume.c @@ -517,7 +517,7 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, goto err; if (!k.k || !k.k->p.offset) { - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_snapshot_create; goto err; } @@ -1031,7 +1031,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, } if (!ret) - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_subvolume_create; goto err; found_slot: snapshot_subvols[0] = dst_iter.pos.offset; diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index e1e70d35..d34aa6b6 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -132,7 +132,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) pr_err("%s: superblock too big: want %zu but have %llu", bdevname(sb->bdev, buf), new_bytes, max_bytes); - return -ENOSPC; + return -BCH_ERR_ENOSPC_sb; } } @@ -1156,7 +1156,7 @@ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *cle for (entry = clean->start; entry < (struct jset_entry *) vstruct_end(&clean->field); entry = vstruct_next(entry)) { - ret = bch2_journal_entry_validate(c, "superblock", entry, + ret = bch2_journal_entry_validate(c, NULL, entry, le16_to_cpu(c->disk_sb.sb->version), BCH_SB_BIG_ENDIAN(c->disk_sb.sb), write); @@ -1477,7 +1477,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, unsigned nr_devices = 0; if (!out->nr_tabstops) - printbuf_tabstop_push(out, 32); + printbuf_tabstop_push(out, 44); mi = bch2_sb_get_members(sb); if (mi) { diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 8b3ce780..b1809f8c 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -686,8 +686,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) seqcount_init(&c->usage_lock); - sema_init(&c->io_in_flight, 64); - c->copy_gc_enabled = 1; c->rebalance.enabled = 1; c->promote_whole_extents = true; @@ -785,7 +783,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; } - ret = bch2_io_clock_init(&c->io_clock[READ]) ?: + ret = bch2_fs_counters_init(c) ?: + bch2_io_clock_init(&c->io_clock[READ]) ?: bch2_io_clock_init(&c->io_clock[WRITE]) ?: bch2_fs_journal_init(&c->journal) ?: bch2_fs_replicas_init(c) ?: @@ -799,8 +798,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_encryption_init(c) ?: bch2_fs_compress_init(c) ?: bch2_fs_ec_init(c) ?: - bch2_fs_fsio_init(c) ?: - bch2_fs_counters_init(c); + bch2_fs_fsio_init(c); if (ret) goto err; @@ -1592,7 +1590,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) le32_to_cpu(mi->field.u64s) + sizeof(dev_mi) / sizeof(u64))) { bch_err(c, "device add error: new device superblock too small"); - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_sb_members; goto err_unlock; } @@ -1605,7 +1603,7 @@ int 
bch2_dev_add(struct bch_fs *c, const char *path) goto have_slot; no_slot: bch_err(c, "device add error: already have maximum number of devices"); - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_sb_members; goto err_unlock; have_slot: @@ -1616,7 +1614,7 @@ have_slot: mi = bch2_sb_resize_members(&c->disk_sb, u64s); if (!mi) { bch_err(c, "device add error: no room in superblock for member info"); - ret = -ENOSPC; + ret = -BCH_ERR_ENOSPC_sb_members; goto err_unlock; } diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 98449e42..f1b0f001 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -41,14 +41,14 @@ #include "util.h" #define SYSFS_OPS(type) \ -const struct sysfs_ops type ## _sysfs_ops = { \ +const struct sysfs_ops type ## _sysfs_ops = { \ .show = type ## _show, \ .store = type ## _store \ } #define SHOW(fn) \ static ssize_t fn ## _to_text(struct printbuf *, \ - struct kobject *, struct attribute *);\ + struct kobject *, struct attribute *); \ \ static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ char *buf) \ @@ -67,15 +67,24 @@ static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ memcpy(buf, out.buf, ret); \ } \ printbuf_exit(&out); \ - return ret; \ + return bch2_err_class(ret); \ } \ \ static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\ struct attribute *attr) #define STORE(fn) \ +static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\ + const char *, size_t); \ + \ static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ const char *buf, size_t size) \ +{ \ + return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \ +} \ + \ +static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\ + const char *buf, size_t size) #define __sysfs_attribute(_name, _mode) \ static struct attribute sysfs_##_name = \ @@ -157,6 +166,7 @@ write_attribute(trigger_gc); write_attribute(trigger_discards); write_attribute(trigger_invalidates); write_attribute(prune_cache); +write_attribute(btree_wakeup); rw_attribute(btree_gc_periodic); rw_attribute(gc_gens_pos); @@ -363,6 +373,21 @@ static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) prt_printf(out, "\n"); } +static void bch2_btree_wakeup_all(struct bch_fs *c) +{ + struct btree_trans *trans; + + mutex_lock(&c->btree_trans_lock); + list_for_each_entry(trans, &c->btree_trans_list, list) { + struct btree_bkey_cached_common *b = READ_ONCE(trans->locking); + + if (b) + six_lock_wakeup_all(&b->lock); + + } + mutex_unlock(&c->btree_trans_lock); +} + SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); @@ -480,6 +505,9 @@ STORE(bch2_fs) c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); } + if (attr == &sysfs_btree_wakeup) + bch2_btree_wakeup_all(c); + if (attr == &sysfs_trigger_gc) { /* * Full gc is currently incompatible with btree key cache: @@ -610,6 +638,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_discards, &sysfs_trigger_invalidates, &sysfs_prune_cache, + &sysfs_btree_wakeup, &sysfs_gc_gens_pos, diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c index 56058a56..d0588618 100644 --- a/libbcachefs/tests.c +++ b/libbcachefs/tests.c @@ -675,7 +675,7 @@ static int rand_mixed_trans(struct btree_trans *trans, bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX)); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(iter))); + k = bch2_btree_iter_peek(iter); ret = bkey_err(k); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) 
bch_err(trans->c, "lookup error in rand_mixed: %s", bch2_err_str(ret)); diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 42da6623..81befc43 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -274,6 +275,27 @@ void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits) prt_char(out, '0' + ((v >> --nr_bits) & 1)); } +void bch2_print_string_as_lines(const char *prefix, const char *lines) +{ + const char *p; + + if (!lines) { + printk("%s (null)\n", prefix); + return; + } + + console_lock(); + while (1) { + p = strchrnul(lines, '\n'); + printk("%s%.*s\n", prefix, (int) (p - lines), lines); + if (!*p) + break; + lines = p + 1; + prefix = KERN_CONT; + } + console_unlock(); +} + /* time stats: */ static void bch2_time_stats_update_one(struct time_stats *stats, diff --git a/libbcachefs/util.h b/libbcachefs/util.h index ab7e43d4..aa8b416a 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -355,6 +355,8 @@ u64 bch2_read_flag_list(char *, const char * const[]); void bch2_prt_u64_binary(struct printbuf *, u64, unsigned); +void bch2_print_string_as_lines(const char *prefix, const char *lines); + #define NR_QUANTILES 15 #define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) #define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index 186ffab5..6a5be6c9 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -350,17 +350,19 @@ err: bch2_trans_exit(&trans); if (ret) - return ret; + goto out; ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false); if (ret) - return ret; + goto out; ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); if (ret) - return ret; + goto out; return buf.used; +out: + return bch2_err_class(ret); } static int bch2_xattr_get_handler(const struct xattr_handler *handler, diff --git a/linux/printbuf_userspace.c b/linux/printbuf_userspace.c index 84187f1f..df9567c5 100644 --- a/linux/printbuf_userspace.c +++ b/linux/printbuf_userspace.c @@ -2,15 +2,15 @@ #include #include -void prt_printf(struct printbuf *out, const char *fmt, ...) +void prt_vprintf(struct printbuf *out, const char *fmt, va_list args) { - va_list args; int len; do { - va_start(args, fmt); - len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args); - va_end(args); + va_list args2; + + va_copy(args2, args); + len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2); } while (len + 1 >= printbuf_remaining(out) && !printbuf_make_room(out, len + 1)); @@ -18,3 +18,12 @@ void prt_printf(struct printbuf *out, const char *fmt, ...) printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); out->pos += len; } + +void prt_printf(struct printbuf *out, const char *fmt, ...) 
+{ + va_list args; + + va_start(args, fmt); + prt_vprintf(out, fmt, args); + va_end(args); +} diff --git a/linux/six.c b/linux/six.c index d2275055..b11660af 100644 --- a/linux/six.c +++ b/linux/six.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -16,7 +17,7 @@ #define EBUG_ON(cond) do {} while (0) #endif -#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_) +#define six_acquire(l, t, r) lock_acquire(l, 0, t, r, 1, NULL, _RET_IP_) #define six_release(l) lock_release(l, _RET_IP_) static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); @@ -124,7 +125,6 @@ static int __do_six_trylock_type(struct six_lock *lock, */ if (type == SIX_LOCK_read && lock->readers) { -retry: preempt_disable(); this_cpu_inc(*lock->readers); /* signal that we own lock */ @@ -136,27 +136,6 @@ retry: this_cpu_sub(*lock->readers, !ret); preempt_enable(); - /* - * If we failed from the lock path and the waiting bit wasn't - * set, set it: - */ - if (!try && !ret) { - v = old.v; - - do { - new.v = old.v = v; - - if (!(old.v & l[type].lock_fail)) - goto retry; - - if (new.waiters & (1 << type)) - break; - - new.waiters |= 1 << type; - } while ((v = atomic64_cmpxchg(&lock->state.counter, - old.v, new.v)) != old.v); - } - /* * If we failed because a writer was trying to take the * lock, issue a wakeup because we might have caused a @@ -300,7 +279,7 @@ static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) return false; if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 1); + six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read); return true; } @@ -337,7 +316,7 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, six_lock_wakeup(lock, old, SIX_LOCK_write); if (ret) - six_acquire(&lock->dep_map, 1); + six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read); return ret; } @@ -354,7 +333,7 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, six_set_owner(lock, type, old, current); if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 1); + six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read); return true; } @@ -436,13 +415,27 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty wait->lock_acquired = false; raw_spin_lock(&lock->wait_lock); + if (!(lock->state.waiters & (1 << type))) + set_bit(waitlist_bitnr(type), (unsigned long *) &lock->state.v); /* * Retry taking the lock after taking waitlist lock, have raced with an * unlock: */ ret = __do_six_trylock_type(lock, type, current, false); - if (ret <= 0) + if (ret <= 0) { + wait->start_time = local_clock(); + + if (!list_empty(&lock->wait_list)) { + struct six_lock_waiter *last = + list_last_entry(&lock->wait_list, + struct six_lock_waiter, list); + + if (time_before_eq64(wait->start_time, last->start_time)) + wait->start_time = last->start_time + 1; + } + list_add_tail(&wait->list, &lock->wait_list); + } raw_spin_unlock(&lock->wait_lock); if (unlikely(ret > 0)) { @@ -481,7 +474,7 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty __set_current_state(TASK_RUNNING); out: - if (ret && type == SIX_LOCK_write) { + if (ret && type == SIX_LOCK_write && lock->state.write_locking) { old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1), &lock->state.counter); six_lock_wakeup(lock, old, SIX_LOCK_read); @@ -497,8 +490,10 @@ static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type { int ret; + wait->start_time = 0; + if (type != 
SIX_LOCK_write) - six_acquire(&lock->dep_map, 0); + six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read); ret = do_six_trylock_type(lock, type, true) ? 0 : __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p); @@ -668,7 +663,7 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) { const struct six_lock_vals l[] = LOCK_VALS; - six_acquire(&lock->dep_map, 0); + six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read); /* XXX: assert already locked, and that we don't overflow: */ @@ -695,8 +690,13 @@ EXPORT_SYMBOL_GPL(six_lock_increment); void six_lock_wakeup_all(struct six_lock *lock) { + union six_lock_state state = lock->state; struct six_lock_waiter *w; + six_lock_wakeup(lock, state, SIX_LOCK_read); + six_lock_wakeup(lock, state, SIX_LOCK_intent); + six_lock_wakeup(lock, state, SIX_LOCK_write); + raw_spin_lock(&lock->wait_lock); list_for_each_entry(w, &lock->wait_list, list) wake_up_process(w->task);
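/*
 * Sketch only (not part of the patch): how two of the additions above
 * compose.  prt_vprintf() lets printf-style wrappers forward their
 * va_list into a printbuf, and bch2_print_string_as_lines() emits a
 * multi-line buffer under console_lock() so its lines are not
 * interleaved with other printk() output.  The PRINTBUF initializer is
 * assumed from existing bcachefs printbuf conventions; the wrapper name
 * below is hypothetical.
 */
__printf(2, 3)
static void demo_log_multiline(const char *prefix, const char *fmt, ...)
{
	struct printbuf buf = PRINTBUF;
	va_list args;

	va_start(args, fmt);
	prt_vprintf(&buf, fmt, args);
	va_end(args);

	bch2_print_string_as_lines(prefix, buf.buf);
	printbuf_exit(&buf);
}

/* Example call: demo_log_multiline(KERN_ERR, "line one\nline two\n"); */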