diff --git a/.bcachefs_revision b/.bcachefs_revision index 82c9b19f..feafaff4 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -fe72e70682cd2430a099c08c3135253675030d28 +3c41353bc185e0a0da4c6f63b1203575c41a2da1 diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index aade5624..ce058d55 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -259,7 +259,11 @@ do { \ BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ "Disables rewriting of btree nodes during mark and sweep")\ BCH_DEBUG_PARAM(btree_shrinker_disabled, \ - "Disables the shrinker callback for the btree node cache") + "Disables the shrinker callback for the btree node cache")\ + BCH_DEBUG_PARAM(verify_btree_ondisk, \ + "Reread btree nodes at various points to verify the " \ + "mergesort in the read path against modifications " \ + "done in memory") /* Parameters that should only be compiled in in debug mode: */ #define BCH_DEBUG_PARAMS_DEBUG() \ @@ -273,10 +277,6 @@ do { \ "information) when iterating over keys") \ BCH_DEBUG_PARAM(debug_check_btree_accounting, \ "Verify btree accounting for keys within a node") \ - BCH_DEBUG_PARAM(verify_btree_ondisk, \ - "Reread btree nodes at various points to verify the " \ - "mergesort in the read path against modifications " \ - "done in memory") \ BCH_DEBUG_PARAM(journal_seq_verify, \ "Store the journal sequence number in the version " \ "number of every btree key, and verify that btree " \ @@ -545,6 +545,8 @@ struct btree_iter_buf { struct btree_iter *iter; }; +#define REPLICAS_DELTA_LIST_MAX (1U << 16) + struct bch_fs { struct closure cl; @@ -572,6 +574,7 @@ struct bch_fs { struct bch_replicas_cpu replicas; struct bch_replicas_cpu replicas_gc; struct mutex replicas_gc_lock; + mempool_t replicas_delta_pool; struct journal_entry_res btree_root_journal_res; struct journal_entry_res replicas_journal_res; @@ -644,6 +647,7 @@ struct bch_fs { struct mutex btree_trans_lock; struct list_head btree_trans_list; mempool_t btree_iters_pool; 
+ mempool_t btree_trans_mem_pool; struct btree_iter_buf __percpu *btree_iters_bufs; struct srcu_struct btree_trans_barrier; @@ -813,11 +817,9 @@ struct bch_fs { /* DEBUG JUNK */ struct dentry *debug; struct btree_debug btree_debug[BTREE_ID_NR]; -#ifdef CONFIG_BCACHEFS_DEBUG struct btree *verify_data; struct btree_node *verify_ondisk; struct mutex verify_lock; -#endif u64 *unused_inode_hints; unsigned inode_shard_bits; diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 450b613d..9f869bed 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -100,7 +100,6 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) static unsigned bch2_key_types_allowed[] = { [BKEY_TYPE_extents] = - (1U << KEY_TYPE_discard)| (1U << KEY_TYPE_error)| (1U << KEY_TYPE_extent)| (1U << KEY_TYPE_reservation)| diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 9f963179..edc3c5ed 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -33,21 +33,21 @@ static inline unsigned btree_cache_can_free(struct btree_cache *bc) return max_t(int, 0, bc->used - bc->reserve); } -static void __btree_node_data_free(struct bch_fs *c, struct btree *b) -{ - EBUG_ON(btree_node_write_in_flight(b)); - - kvpfree(b->data, btree_bytes(c)); - b->data = NULL; - vfree(b->aux_data); - b->aux_data = NULL; -} - static void btree_node_data_free(struct bch_fs *c, struct btree *b) { struct btree_cache *bc = &c->btree_cache; - __btree_node_data_free(c, b); + EBUG_ON(btree_node_write_in_flight(b)); + + kvpfree(b->data, btree_bytes(c)); + b->data = NULL; +#ifdef __KERNEL__ + vfree(b->aux_data); +#else + munmap(b->aux_data, btree_aux_data_bytes(b)); +#endif + b->aux_data = NULL; + bc->used--; list_move(&b->list, &bc->freed); } @@ -75,8 +75,13 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->data = kvpmalloc(btree_bytes(c), gfp); if (!b->data) return -ENOMEM; - +#ifdef __KERNEL__ b->aux_data = 
vmalloc_exec(btree_aux_data_bytes(b), gfp); +#else + b->aux_data = mmap(NULL, btree_aux_data_bytes(b), + PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); +#endif if (!b->aux_data) { kvpfree(b->data, btree_bytes(c)); b->data = NULL; @@ -100,7 +105,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c) return b; } -static struct btree *btree_node_mem_alloc(struct bch_fs *c) +struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; struct btree *b = __btree_node_mem_alloc(c); @@ -360,12 +365,10 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) flags = memalloc_nofs_save(); mutex_lock(&bc->lock); -#ifdef CONFIG_BCACHEFS_DEBUG if (c->verify_data) list_move(&c->verify_data->list, &bc->live); kvpfree(c->verify_ondisk, btree_bytes(c)); -#endif for (i = 0; i < BTREE_ID_NR; i++) if (c->btree_roots[i].b) @@ -419,31 +422,15 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bch2_recalc_btree_reserve(c); for (i = 0; i < bc->reserve; i++) - if (!btree_node_mem_alloc(c)) { + if (!__bch2_btree_node_mem_alloc(c)) { ret = -ENOMEM; goto out; } list_splice_init(&bc->live, &bc->freeable); -#ifdef CONFIG_BCACHEFS_DEBUG mutex_init(&c->verify_lock); - c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); - if (!c->verify_ondisk) { - ret = -ENOMEM; - goto out; - } - - c->verify_data = btree_node_mem_alloc(c); - if (!c->verify_data) { - ret = -ENOMEM; - goto out; - } - - list_del_init(&c->verify_data->list); -#endif - bc->shrink.count_objects = bch2_btree_cache_count; bc->shrink.scan_objects = bch2_btree_cache_scan; bc->shrink.seeks = 4; @@ -703,6 +690,41 @@ static int lock_node_check_fn(struct six_lock *lock, void *p) return b->hash_val == btree_ptr_hash_val(k) ? 
0 : -1; } +static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) +{ + char buf1[100], buf2[100], buf3[100], buf4[100]; + + if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + return; + + bch2_bpos_to_text(&PBUF(buf1), b->key.k.type == KEY_TYPE_btree_ptr_v2 + ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key + : POS_MIN); + bch2_bpos_to_text(&PBUF(buf2), b->data->min_key); + + bch2_bpos_to_text(&PBUF(buf3), b->key.k.p); + bch2_bpos_to_text(&PBUF(buf4), b->data->max_key); + bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n" + "btree: ptr %u header %llu\n" + "level: ptr %u header %llu\n" + "min ptr %s node header %s\n" + "max ptr %s node header %s", + b->c.btree_id, BTREE_NODE_ID(b->data), + b->c.level, BTREE_NODE_LEVEL(b->data), + buf1, buf2, buf3, buf4); +} + +static inline void btree_check_header(struct bch_fs *c, struct btree *b) +{ + if (b->c.btree_id != BTREE_NODE_ID(b->data) || + b->c.level != BTREE_NODE_LEVEL(b->data) || + bpos_cmp(b->data->max_key, b->key.k.p) || + (b->key.k.type == KEY_TYPE_btree_ptr_v2 && + bpos_cmp(b->data->min_key, + bkey_i_to_btree_ptr_v2(&b->key)->v.min_key))) + btree_bad_header(c, b); +} + /** * bch_btree_node_get - find a btree node in the cache and lock it, reading it * in from disk if necessary. 
@@ -833,10 +855,7 @@ lock_node: EBUG_ON(b->c.btree_id != iter->btree_id); EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); - EBUG_ON(bpos_cmp(b->data->max_key, k->k.p)); - EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && - bpos_cmp(b->data->min_key, - bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); + btree_check_header(c, b); return b; } @@ -916,10 +935,7 @@ lock_node: EBUG_ON(b->c.btree_id != btree_id); EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); - EBUG_ON(bpos_cmp(b->data->max_key, k->k.p)); - EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && - bpos_cmp(b->data->min_key, - bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); + btree_check_header(c, b); out: bch2_btree_cache_cannibalize_unlock(c); return b; diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index 4791c3b6..c517cc02 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -17,6 +17,7 @@ int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); +struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 536947cc..864931ea 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -330,6 +330,10 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, BUG_ON(bch2_journal_seq_verify && k->k->version.lo > journal_cur_seq(&c->journal)); + ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k); + if (ret) + goto err; + if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, "key version number higher than recorded: %llu > %llu", k->k->version.lo, @@ -346,8 +350,6 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, goto err; } } - - ret = bch2_check_fix_ptrs(c, btree_id, level, 
is_root, k); } ptrs = bch2_bkey_ptrs_c(*k); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index c8d8df96..2de31a6b 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1340,6 +1340,13 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, return ret; } +static void btree_write_submit(struct work_struct *work) +{ + struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); + + bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &wbio->key); +} + void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) { struct btree_write_bio *wbio; @@ -1347,7 +1354,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) struct bset *i; struct btree_node *bn = NULL; struct btree_node_entry *bne = NULL; - struct bkey_buf k; struct bch_extent_ptr *ptr; struct sort_iter sort_iter; struct nonce nonce; @@ -1358,8 +1364,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) bool validate_before_checksum = false; void *data; - bch2_bkey_buf_init(&k); - if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) return; @@ -1536,6 +1540,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) wbio_init(&wbio->wbio.bio); wbio->data = data; wbio->bytes = bytes; + wbio->wbio.c = c; wbio->wbio.used_mempool = used_mempool; wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META; wbio->wbio.bio.bi_end_io = btree_node_write_endio; @@ -1558,9 +1563,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) * just make all btree node writes FUA to keep things sane. 
*/ - bch2_bkey_buf_copy(&k, c, &b->key); + bkey_copy(&wbio->key, &b->key); - bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(k.k)), ptr) + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&wbio->key)), ptr) ptr->offset += b->written; b->written += sectors_to_write; @@ -1568,9 +1573,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) atomic64_inc(&c->btree_writes_nr); atomic64_add(sectors_to_write, &c->btree_writes_sectors); - /* XXX: submitting IO with btree locks held: */ - bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, k.k); - bch2_bkey_buf_exit(&k, c); + INIT_WORK(&wbio->work, btree_write_submit); + schedule_work(&wbio->work); return; err: set_btree_node_noevict(b); diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 95c35161..c8a8b05a 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -42,6 +42,7 @@ struct btree_read_bio { struct btree_write_bio { struct work_struct work; + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); void *data; unsigned bytes; struct bch_write_bio wbio; diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index c8f527bc..93194e62 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -2145,7 +2145,16 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) if (new_top > trans->mem_bytes) { size_t old_bytes = trans->mem_bytes; size_t new_bytes = roundup_pow_of_two(new_top); - void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); + void *new_mem; + + WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + + new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); + if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { + new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); + new_bytes = BTREE_TRANS_MEM_MAX; + kfree(trans->mem); + } if (!new_mem) return ERR_PTR(-ENOMEM); @@ -2249,6 +2258,11 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, if (expected_mem_bytes) { trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes); 
trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL); + + if (unlikely(!trans->mem)) { /* hint: a NULL result is the rare case */ + trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); + trans->mem_bytes = BTREE_TRANS_MEM_MAX; + } } trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); @@ -2290,8 +2304,19 @@ int bch2_trans_exit(struct btree_trans *trans) bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); - kfree(trans->fs_usage_deltas); - kfree(trans->mem); + if (trans->fs_usage_deltas) { + if (trans->fs_usage_deltas->size + sizeof(*trans->fs_usage_deltas) == + REPLICAS_DELTA_LIST_MAX) /* header + payload: the formula replicas_deltas_realloc() uses for pool-backed lists */ + mempool_free(trans->fs_usage_deltas, + &trans->c->replicas_delta_pool); + else + kfree(trans->fs_usage_deltas); + } + + if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) + mempool_free(trans->mem, &trans->c->btree_trans_mem_pool); + else + kfree(trans->mem); #ifdef __KERNEL__ /* @@ -2299,6 +2324,7 @@ int bch2_trans_exit(struct btree_trans *trans) */ trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters); #endif + if (trans->iters) mempool_free(trans->iters, &trans->c->btree_iters_pool); @@ -2392,6 +2418,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) void bch2_fs_btree_iter_exit(struct bch_fs *c) { + mempool_exit(&c->btree_trans_mem_pool); mempool_exit(&c->btree_iters_pool); cleanup_srcu_struct(&c->btree_trans_barrier); } @@ -2407,5 +2434,7 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, sizeof(struct btree_iter) * nr + sizeof(struct btree_insert_entry) * nr + - sizeof(struct btree_insert_entry) * nr); + sizeof(struct btree_insert_entry) * nr) ?: + mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, + BTREE_TRANS_MEM_MAX); } diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 53191c99..a5181a96 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -218,8 +218,14 @@ static int btree_key_cache_fill(struct btree_trans *trans, goto err;
} - if (k.k->u64s > ck->u64s) { - new_u64s = roundup_pow_of_two(k.k->u64s); + /* + * bch2_varint_decode can read past the end of the buffer by at + * most 7 bytes (it won't be used): + */ + new_u64s = k.k->u64s + 1; + + if (new_u64s > ck->u64s) { + new_u64s = roundup_pow_of_two(new_u64s); new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); if (!new_k) { ret = -ENOMEM; @@ -385,12 +391,18 @@ retry: goto evict; } + /* + * Since journal reclaim depends on us making progress here, and the + * allocator/copygc depend on journal reclaim making progress, we need + * to be using alloc reserves: + * */ ret = bch2_btree_iter_traverse(b_iter) ?: bch2_trans_update(trans, b_iter, ck->k, BTREE_TRIGGER_NORUN) ?: bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| (ck->journal.seq == journal_last_seq(j) ? BTREE_INSERT_JOURNAL_RESERVED : 0)| diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index f942ccf6..06a2c412 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -352,6 +352,8 @@ struct btree_trans_commit_hook { struct btree_trans_commit_hook *next; }; +#define BTREE_TRANS_MEM_MAX 4096 + struct btree_trans { struct bch_fs *c; #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 07c92534..87426d17 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -887,6 +887,14 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, btree_update_drop_new_node(c, b); btree_update_will_delete_key(as, &b->key); + + /* + * XXX: Waiting on io with btree node locks held, we don't want to be + * doing this. We can't have btree writes happening after the space has + * been freed, but we really only need to block before + * btree_update_nodes_written_trans() happens. 
+ */ + btree_node_wait_on_io(b); } void bch2_btree_update_done(struct btree_update *as) @@ -1146,6 +1154,24 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b set_btree_node_need_write(b); } +static void +__bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, + struct btree_iter *iter, struct keylist *keys, + struct btree_node_iter node_iter) +{ + struct bkey_i *insert = bch2_keylist_front(keys); + struct bkey_packed *k; + + BUG_ON(btree_node_type(b) != BKEY_TYPE_btree); + + while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && + (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) + ; + + for_each_keylist_key(keys, insert) + bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter); +} + /* * Move keys from n1 (original replacement node, now lower node) to n2 (higher * node) @@ -1276,16 +1302,9 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, struct bkey_packed *src, *dst, *n; struct bset *i; - BUG_ON(btree_node_type(b) != BKEY_TYPE_btree); - bch2_btree_node_iter_init(&node_iter, b, &k->k.p); - while (!bch2_keylist_empty(keys)) { - k = bch2_keylist_front(keys); - - bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter); - bch2_keylist_pop_front(keys); - } + __bch2_btree_insert_keys_interior(as, b, iter, keys, node_iter); /* * We can't tolerate whiteouts here - with whiteouts there can be @@ -1431,24 +1450,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, struct btree_iter *iter, struct keylist *keys) { struct btree_iter *linked; - struct btree_node_iter node_iter; - struct bkey_i *insert = bch2_keylist_front(keys); - struct bkey_packed *k; - /* Don't screw up @iter's position: */ - node_iter = iter->l[b->c.level].iter; - - /* - * btree_split(), btree_gc_coalesce() will insert keys before - * the iterator's current position - they know the keys go in - * the node the iterator points to: - */ - while ((k = 
bch2_btree_node_iter_prev_all(&node_iter, b)) && - (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) - ; - - for_each_keylist_key(keys, insert) - bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter); + __bch2_btree_insert_keys_interior(as, b, iter, keys, iter->l[b->c.level].iter); btree_update_updated_node(as, b); @@ -1598,7 +1601,19 @@ retry: next = m; } - BUG_ON(bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)); + if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) { + char buf1[100], buf2[100]; + + bch2_bpos_to_text(&PBUF(buf1), prev->data->max_key); + bch2_bpos_to_text(&PBUF(buf2), next->data->min_key); + bch2_fs_inconsistent(c, + "btree topology error in btree merge:\n" + "prev ends at %s\n" + "next starts at %s\n", + buf1, buf2); + ret = -EIO; + goto err; + } bch2_bkey_format_init(&new_s); bch2_bkey_format_add_pos(&new_s, prev->data->min_key); diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index afdcc98d..b793ab77 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -293,6 +293,12 @@ btree_key_can_insert_cached(struct btree_trans *trans, !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)) return BTREE_INSERT_NEED_JOURNAL_RECLAIM; + /* + * bch2_varint_decode can read past the end of the buffer by at most 7 + * bytes (it won't be used): + */ + u64s += 1; + if (u64s <= ck->u64s) return BTREE_INSERT_OK; diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 6b99f127..c3ad0bc8 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -396,20 +396,22 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, bch2_wake_allocator(ca); } -static inline void update_replicas(struct bch_fs *c, +static inline int update_replicas(struct bch_fs *c, struct bch_fs_usage *fs_usage, struct bch_replicas_entry *r, s64 sectors) { int idx = bch2_replicas_entry_idx(c, r); - BUG_ON(idx < 0); + if (idx < 0) + return -1; 
fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); fs_usage->replicas[idx] += sectors; + return 0; } -static inline void update_cached_sectors(struct bch_fs *c, +static inline int update_cached_sectors(struct bch_fs *c, struct bch_fs_usage *fs_usage, unsigned dev, s64 sectors) { @@ -417,7 +419,7 @@ static inline void update_cached_sectors(struct bch_fs *c, bch2_replicas_entry_cached(&r.e, dev); - update_replicas(c, fs_usage, &r.e, sectors); + return update_replicas(c, fs_usage, &r.e, sectors); } static struct replicas_delta_list * @@ -425,10 +427,26 @@ replicas_deltas_realloc(struct btree_trans *trans, unsigned more) { struct replicas_delta_list *d = trans->fs_usage_deltas; unsigned new_size = d ? (d->size + more) * 2 : 128; + unsigned alloc_size = sizeof(*d) + new_size; + + WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX); if (!d || d->used + more > d->size) { - d = krealloc(d, sizeof(*d) + new_size, GFP_NOIO|__GFP_ZERO); - BUG_ON(!d); + d = krealloc(d, alloc_size, GFP_NOIO|__GFP_ZERO); + + BUG_ON(!d && alloc_size > REPLICAS_DELTA_LIST_MAX); + + if (!d) { + d = mempool_alloc(&trans->c->replicas_delta_pool, GFP_NOIO); + memset(d, 0, REPLICAS_DELTA_LIST_MAX); + + if (trans->fs_usage_deltas) + memcpy(d, trans->fs_usage_deltas, + trans->fs_usage_deltas->size + sizeof(*d)); + + new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d); + kfree(trans->fs_usage_deltas); + } d->size = new_size; trans->fs_usage_deltas = d; @@ -553,8 +571,12 @@ static int bch2_mark_alloc(struct bch_fs *c, if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && old_m.cached_sectors) { - update_cached_sectors(c, fs_usage, ca->dev_idx, - -old_m.cached_sectors); + if (update_cached_sectors(c, fs_usage, ca->dev_idx, + -old_m.cached_sectors)) { + bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); + return -1; + } + trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), old_m.cached_sectors); } @@ -936,8 +958,12 @@ static int bch2_mark_extent(struct 
bch_fs *c, if (p.ptr.cached) { if (!stale) - update_cached_sectors(c, fs_usage, p.ptr.dev, - disk_sectors); + if (update_cached_sectors(c, fs_usage, p.ptr.dev, + disk_sectors)) { + bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); + return -1; + + } } else if (!p.has_ec) { dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; @@ -956,8 +982,15 @@ static int bch2_mark_extent(struct bch_fs *c, } } - if (r.e.nr_devs) - update_replicas(c, fs_usage, &r.e, dirty_sectors); + if (r.e.nr_devs) { + if (update_replicas(c, fs_usage, &r.e, dirty_sectors)) { + char buf[200]; + + bch2_bkey_val_to_text(&PBUF(buf), c, k); + bch2_fs_fatal_error(c, "no replicas entry for %s", buf); + return -1; + } + } return 0; } @@ -1031,8 +1064,14 @@ static int bch2_mark_stripe(struct bch_fs *c, return ret; } - update_replicas(c, fs_usage, &m->r.e, - ((s64) m->sectors * m->nr_redundant)); + if (update_replicas(c, fs_usage, &m->r.e, + ((s64) m->sectors * m->nr_redundant))) { + char buf[200]; + + bch2_bkey_val_to_text(&PBUF(buf), c, new); + bch2_fs_fatal_error(c, "no replicas entry for %s", buf); + return -1; + } } return 0; @@ -1292,7 +1331,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, added += d->delta; } - update_replicas(c, dst, &d->r, d->delta); + BUG_ON(update_replicas(c, dst, &d->r, d->delta)); } dst->nr_inodes += deltas->nr_inodes; diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index 90364b55..4215c119 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -29,40 +29,19 @@ static struct dentry *bch_debug; -#ifdef CONFIG_BCACHEFS_DEBUG - -void __bch2_btree_verify(struct bch_fs *c, struct btree *b) +static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, + struct extent_ptr_decoded pick) { struct btree *v = c->verify_data; - struct btree_node *n_ondisk, *n_sorted, *n_inmemory; - struct bset *sorted, *inmemory; - struct extent_ptr_decoded pick; - struct bch_dev *ca; + struct btree_node 
*n_ondisk = c->verify_ondisk; + struct btree_node *n_sorted = c->verify_data->data; + struct bset *sorted, *inmemory = &b->data->keys; + struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); struct bio *bio; + bool failed = false; - if (c->opts.nochanges) - return; - - btree_node_io_lock(b); - mutex_lock(&c->verify_lock); - - n_ondisk = c->verify_ondisk; - n_sorted = c->verify_data->data; - n_inmemory = b->data; - - bkey_copy(&v->key, &b->key); - v->written = 0; - v->c.level = b->c.level; - v->c.btree_id = b->c.btree_id; - bch2_btree_keys_init(v); - - if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), - NULL, &pick) <= 0) - return; - - ca = bch_dev_bkey_exists(c, pick.ptr.dev); if (!bch2_dev_get_ioref(ca, READ)) - return; + return false; bio = bio_alloc_bioset(GFP_NOIO, buf_pages(n_sorted, btree_bytes(c)), @@ -79,12 +58,12 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) memcpy(n_ondisk, n_sorted, btree_bytes(c)); + v->written = 0; if (bch2_btree_node_read_done(c, ca, v, false)) - goto out; + return false; n_sorted = c->verify_data->data; sorted = &n_sorted->keys; - inmemory = &n_inmemory->keys; if (inmemory->u64s != sorted->u64s || memcmp(inmemory->start, @@ -102,8 +81,8 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) printk(KERN_ERR "*** read back in:\n"); bch2_dump_bset(c, v, sorted, 0); - while (offset < b->written) { - if (!offset ) { + while (offset < v->written) { + if (!offset) { i = &n_ondisk->keys; sectors = vstruct_blocks(n_ondisk, c->block_bits) << c->block_bits; @@ -122,25 +101,84 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) offset += sectors; } - printk(KERN_ERR "*** block %u/%u not written\n", - offset >> c->block_bits, btree_blocks(c)); - for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) if (inmemory->_data[j] != sorted->_data[j]) break; - printk(KERN_ERR "b->written %u\n", b->written); - console_unlock(); - panic("verify failed at %u\n", j); + bch_err(c, "verify failed at key %u", j); + + 
failed = true; + } + + if (v->written != b->written) { + bch_err(c, "written wrong: expected %u, got %u", + b->written, v->written); + failed = true; + } + + return failed; +} + +void __bch2_btree_verify(struct bch_fs *c, struct btree *b) +{ + struct bkey_ptrs_c ptrs; + struct extent_ptr_decoded p; + const union bch_extent_entry *entry; + struct btree *v; + struct bset *inmemory = &b->data->keys; + struct bkey_packed *k; + bool failed = false; + + if (c->opts.nochanges) + return; + + btree_node_io_lock(b); + mutex_lock(&c->verify_lock); + + if (!c->verify_ondisk) { + c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); + if (!c->verify_ondisk) + goto out; + } + + if (!c->verify_data) { + c->verify_data = __bch2_btree_node_mem_alloc(c); + if (!c->verify_data) + goto out; + + list_del_init(&c->verify_data->list); + } + + BUG_ON(b->nsets != 1); + + for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_next(k)) + if (k->type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k); + v->mem_ptr = 0; + } + + v = c->verify_data; + bkey_copy(&v->key, &b->key); + v->c.level = b->c.level; + v->c.btree_id = b->c.btree_id; + bch2_btree_keys_init(v); + + ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)); + bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry) + failed |= bch2_btree_verify_replica(c, b, p); + + if (failed) { + char buf[200]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)); + bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf); } out: mutex_unlock(&c->verify_lock); btree_node_io_unlock(b); } -#endif - #ifdef CONFIG_DEBUG_FS /* XXX: bch_fs refcounting */ diff --git a/libbcachefs/debug.h b/libbcachefs/debug.h index 7ac1615e..0b86736e 100644 --- a/libbcachefs/debug.h +++ b/libbcachefs/debug.h @@ -8,11 +8,7 @@ struct bio; struct btree; struct bch_fs; -#ifdef CONFIG_BCACHEFS_DEBUG void __bch2_btree_verify(struct bch_fs *, struct btree *); -#else -static inline void 
__bch2_btree_verify(struct bch_fs *c, struct btree *b) {} -#endif static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) { diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index f712f685..7062ab9c 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -1621,6 +1621,7 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) if (ret) break; } + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index eb8ac164..26fbd8c2 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -38,9 +38,9 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) return ret ?: sectors; } -static int lookup_inode(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode, - u32 *snapshot) +static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode, + u32 *snapshot) { struct btree_iter *iter; struct bkey_s_c k; @@ -63,19 +63,34 @@ err: return ret; } -static int write_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) +static int lookup_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode, + u32 *snapshot) +{ + return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot)); +} + +static int __write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) { struct btree_iter *inode_iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, SPOS(0, inode->bi_inum, snapshot), BTREE_ITER_INTENT); + int ret = bch2_inode_write(trans, inode_iter, inode); + bch2_trans_iter_put(trans, inode_iter); + return ret; +} + +static int write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ int ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - bch2_inode_write(trans, inode_iter, inode)); - bch2_trans_iter_put(trans, inode_iter); + __write_inode(trans, inode, snapshot)); if 
(ret) bch_err(trans->c, "error in fsck: error %i updating inode", ret); return ret; @@ -114,57 +129,101 @@ static int remove_dirent(struct btree_trans *trans, struct bpos pos) return ret; } -static int __reattach_inode(struct btree_trans *trans, - struct bch_inode_unpacked *lostfound, - u64 inum) +/* Get lost+found, create if it doesn't exist: */ +static int lookup_lostfound(struct btree_trans *trans, + struct bch_inode_unpacked *lostfound) { - struct bch_hash_info dir_hash = - bch2_hash_info_init(trans->c, lostfound); - struct bch_inode_unpacked inode_u; - char name_buf[20]; - struct qstr name; - u64 dir_offset = 0; + struct bch_fs *c = trans->c; + struct bch_inode_unpacked root; + struct bch_hash_info root_hash_info; + struct qstr lostfound_str = QSTR("lost+found"); + u64 inum; u32 snapshot; int ret; - snprintf(name_buf, sizeof(name_buf), "%llu", inum); - name = (struct qstr) QSTR(name_buf); + ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root, &snapshot); + if (ret && ret != -ENOENT) + return ret; - ret = lookup_inode(trans, inum, &inode_u, &snapshot); + root_hash_info = bch2_hash_info_init(c, &root); + inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, + &lostfound_str); + if (!inum) { + bch_notice(c, "creating lost+found"); + goto create_lostfound; + } + + ret = lookup_inode(trans, inum, lostfound, &snapshot); + if (ret && ret != -ENOENT) { + /* + * The check_dirents pass has already run, dangling dirents + * shouldn't exist here: + */ + bch_err(c, "error looking up lost+found: %i", ret); + return ret; + } + + if (ret == -ENOENT) { +create_lostfound: + bch2_inode_init_early(c, lostfound); + + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_create_trans(trans, + BCACHEFS_ROOT_INO, &root, + lostfound, + &lostfound_str, + 0, 0, S_IFDIR|0700, 0, NULL, NULL)); + if (ret) + bch_err(c, "error creating lost+found: %i", ret); + } + + return ret; /* 0 on every success path; propagates a failed lost+found create to the caller */ +} + +static int reattach_inode(struct btree_trans *trans, +
struct bch_inode_unpacked *inode) +{ + struct bch_hash_info dir_hash; + struct bch_inode_unpacked lostfound; + char name_buf[20]; + struct qstr name; + u64 dir_offset = 0; + int ret; + + ret = lookup_lostfound(trans, &lostfound); if (ret) return ret; - if (S_ISDIR(inode_u.bi_mode)) { - lostfound->bi_nlink++; + if (S_ISDIR(inode->bi_mode)) { + lostfound.bi_nlink++; - ret = write_inode(trans, lostfound, U32_MAX); + ret = write_inode(trans, &lostfound, U32_MAX); if (ret) return ret; } - ret = bch2_dirent_create(trans, lostfound->bi_inum, &dir_hash, - mode_to_type(inode_u.bi_mode), - &name, inum, &dir_offset, - BCH_HASH_SET_MUST_CREATE); - if (ret) + dir_hash = bch2_hash_info_init(trans->c, &lostfound); + + snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); + name = (struct qstr) QSTR(name_buf); + + ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, + bch2_dirent_create(trans, lostfound.bi_inum, &dir_hash, + mode_to_type(inode->bi_mode), + &name, inode->bi_inum, &dir_offset, + BCH_HASH_SET_MUST_CREATE)); + if (ret) { + bch_err(trans->c, "error %i reattaching inode %llu", + ret, inode->bi_inum); return ret; + } - inode_u.bi_dir = lostfound->bi_inum; - inode_u.bi_dir_offset = dir_offset; + inode->bi_dir = lostfound.bi_inum; + inode->bi_dir_offset = dir_offset; - return write_inode(trans, &inode_u, U32_MAX); -} - -static int reattach_inode(struct btree_trans *trans, - struct bch_inode_unpacked *lostfound, - u64 inum) -{ - int ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, - __reattach_inode(trans, lostfound, inum)); - if (ret) - bch_err(trans->c, "error %i reattaching inode %llu", ret, inum); - - return ret; + return write_inode(trans, inode, U32_MAX); } static int remove_backpointer(struct btree_trans *trans, @@ -931,58 +990,6 @@ create_root: BTREE_INSERT_LAZY_RW); } -/* Get lost+found, create if it doesn't exist: */ -static int check_lostfound(struct bch_fs *c, - struct bch_inode_unpacked *root_inode, - struct bch_inode_unpacked 
*lostfound_inode) -{ - struct qstr lostfound = QSTR("lost+found"); - struct bch_hash_info root_hash_info = - bch2_hash_info_init(c, root_inode); - u64 inum; - u32 snapshot; - int ret; - - bch_verbose(c, "checking lost+found"); - - inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, - &lostfound); - if (!inum) { - bch_notice(c, "creating lost+found"); - goto create_lostfound; - } - - ret = bch2_trans_do(c, NULL, NULL, 0, - lookup_inode(&trans, inum, lostfound_inode, &snapshot)); - if (ret && ret != -ENOENT) - return ret; - - if (fsck_err_on(ret, c, "lost+found missing")) - goto create_lostfound; - - if (fsck_err_on(!S_ISDIR(lostfound_inode->bi_mode), c, - "lost+found inode not a directory")) - goto create_lostfound; - - return 0; -fsck_err: - return ret; -create_lostfound: - bch2_inode_init_early(c, lostfound_inode); - - ret = bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_create_trans(&trans, - BCACHEFS_ROOT_INO, root_inode, - lostfound_inode, &lostfound, - 0, 0, S_IFDIR|0700, 0, NULL, NULL)); - if (ret) - bch_err(c, "error creating lost+found: %i", ret); - - return ret; -} - struct pathbuf { size_t nr; size_t size; @@ -1014,7 +1021,6 @@ static int path_down(struct pathbuf *p, u64 inum) } static int check_path(struct btree_trans *trans, - struct bch_inode_unpacked *lostfound, struct pathbuf *p, struct bch_inode_unpacked *inode) { @@ -1038,7 +1044,7 @@ static int check_path(struct btree_trans *trans, inode->bi_nlink, inode->bi_dir, inode->bi_dir_offset)) - ret = reattach_inode(trans, lostfound, inode->bi_inum); + ret = reattach_inode(trans, inode); break; } ret = 0; @@ -1067,12 +1073,11 @@ static int check_path(struct btree_trans *trans, break; } - ret = reattach_inode(trans, lostfound, inode->bi_inum); + ret = reattach_inode(trans, inode); break; } - ret = lockrestart_do(trans, - lookup_inode(trans, inode->bi_dir, inode, &snapshot)); + ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); if (ret) { /* Should 
have been caught in dirents pass */ bch_err(c, "error looking up parent directory: %i", ret); @@ -1090,8 +1095,7 @@ fsck_err: * After check_dirents(), if an inode backpointer doesn't exist that means it's * unreachable: */ -static int check_directory_structure(struct bch_fs *c, - struct bch_inode_unpacked *lostfound) +static int check_directory_structure(struct bch_fs *c) { struct btree_trans trans; struct btree_iter *iter; @@ -1113,7 +1117,7 @@ static int check_directory_structure(struct bch_fs *c, break; } - ret = check_path(&trans, lostfound, &path, &u); + ret = check_path(&trans, &path, &u); if (ret) break; } @@ -1190,7 +1194,6 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, } static int check_inode_nlink(struct btree_trans *trans, - struct bch_inode_unpacked *lostfound_inode, struct btree_iter *iter, struct bkey_s_c_inode inode, unsigned nlink) @@ -1238,7 +1241,6 @@ fsck_err: noinline_for_stack static int bch2_gc_walk_inodes(struct bch_fs *c, - struct bch_inode_unpacked *lostfound_inode, nlink_table *links, u64 range_start, u64 range_end) { @@ -1259,7 +1261,7 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, continue; link = genradix_ptr(links, k.k->p.offset - range_start); - ret = check_inode_nlink(&trans, lostfound_inode, iter, + ret = check_inode_nlink(&trans, iter, bkey_s_c_to_inode(k), link ? 
link->count : 0); if (ret) break; @@ -1275,8 +1277,7 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, } noinline_for_stack -static int check_nlinks(struct bch_fs *c, - struct bch_inode_unpacked *lostfound_inode) +static int check_nlinks(struct bch_fs *c) { nlink_table links; u64 this_iter_range_start, next_iter_range_start = 0; @@ -1296,7 +1297,7 @@ static int check_nlinks(struct bch_fs *c, if (ret) break; - ret = bch2_gc_walk_inodes(c, lostfound_inode, &links, + ret = bch2_gc_walk_inodes(c, &links, this_iter_range_start, next_iter_range_start); if (ret) @@ -1316,16 +1317,15 @@ static int check_nlinks(struct bch_fs *c, */ int bch2_fsck_full(struct bch_fs *c) { - struct bch_inode_unpacked root_inode, lostfound_inode; + struct bch_inode_unpacked root_inode; return check_inodes(c, true) ?: check_extents(c) ?: check_dirents(c) ?: check_xattrs(c) ?: check_root(c, &root_inode) ?: - check_lostfound(c, &root_inode, &lostfound_inode) ?: - check_directory_structure(c, &lostfound_inode) ?: - check_nlinks(c, &lostfound_inode); + check_directory_structure(c) ?: + check_nlinks(c); } int bch2_fsck_walk_inodes_only(struct bch_fs *c) diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index f117d361..24d04e51 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -634,7 +634,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) msecs_to_jiffies(j->reclaim_delay_ms))) min_nr = 1; - if (j->prereserved.reserved * 2 > j->prereserved.remaining) + if (j->prereserved.reserved * 4 > j->prereserved.remaining) min_nr = 1; if (fifo_free(&j->pin) <= 32) diff --git a/libbcachefs/keylist.c b/libbcachefs/keylist.c index 864dfaa6..cda77835 100644 --- a/libbcachefs/keylist.c +++ b/libbcachefs/keylist.c @@ -62,6 +62,6 @@ void bch2_verify_keylist_sorted(struct keylist *l) for_each_keylist_key(l, k) BUG_ON(bkey_next(k) != l->top && - bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0); + bpos_cmp(k->k.p, bkey_next(k)->k.p) >= 0); } #endif 
diff --git a/libbcachefs/move.c b/libbcachefs/move.c index aa8e8c25..778ff72c 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -762,7 +762,7 @@ static int bch2_move_btree(struct bch_fs *c, id == start_btree_id ? start_pos : POS_MIN, BTREE_ITER_PREFETCH, b) { if (kthread && kthread_should_stop()) - goto out; + break; if ((cmp_int(id, end_btree_id) ?: bkey_cmp(b->key.k.p, end_pos)) > 0) @@ -789,8 +789,10 @@ next: } ret = bch2_trans_iter_free(&trans, iter) ?: ret; + if (kthread && kthread_should_stop()) + break; } -out: + bch2_trans_exit(&trans); if (ret) diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 80772cff..4ac7e61f 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -87,9 +87,20 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, if (i >= 0 && p.ptr.offset < h->data[i].offset + ca->mi.bucket_size && p.ptr.gen == h->data[i].gen) { + /* + * We need to use the journal reserve here, because + * - journal reclaim depends on btree key cache + * flushing to make forward progress, + * - which has to make forward progress when the + * journal is pre-reservation full, + * - and depends on allocation - meaning allocator and + * copygc + */ + data_opts->target = io_opts->background_target; data_opts->nr_replicas = 1; - data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; + data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_JOURNAL_RESERVED; data_opts->rewrite_dev = p.ptr.dev; if (p.has_ec) diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index 4128a1b3..8e6cccd3 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -1063,11 +1063,27 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) return ret; } +void bch2_fs_replicas_exit(struct bch_fs *c) +{ + unsigned i; + + kfree(c->usage_scratch); + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + free_percpu(c->usage[i]); + kfree(c->usage_base); + kfree(c->replicas.entries); + kfree(c->replicas_gc.entries); + + 
mempool_exit(&c->replicas_delta_pool); +} + int bch2_fs_replicas_init(struct bch_fs *c) { bch2_journal_entry_res_resize(&c->journal, &c->replicas_journal_res, reserve_journal_replicas(c, &c->replicas)); - return replicas_table_update(c, &c->replicas); + return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1, + REPLICAS_DELTA_LIST_MAX) ?: + replicas_table_update(c, &c->replicas); } diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h index c77e873e..72ac544f 100644 --- a/libbcachefs/replicas.h +++ b/libbcachefs/replicas.h @@ -102,6 +102,7 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; +void bch2_fs_replicas_exit(struct bch_fs *); int bch2_fs_replicas_init(struct bch_fs *); #endif /* _BCACHEFS_REPLICAS_H */ diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 61fd1144..b6e449a7 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -477,6 +477,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_fs_btree_iter_exit(c); bch2_fs_btree_key_cache_exit(&c->btree_key_cache); bch2_fs_btree_cache_exit(c); + bch2_fs_replicas_exit(c); bch2_fs_journal_exit(&c->journal); bch2_io_clock_exit(&c->io_clock[WRITE]); bch2_io_clock_exit(&c->io_clock[READ]); @@ -484,10 +485,6 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_journal_keys_free(&c->journal_keys); bch2_journal_entries_free(&c->journal_entries); percpu_free_rwsem(&c->mark_lock); - kfree(c->usage_scratch); - for (i = 0; i < ARRAY_SIZE(c->usage); i++) - free_percpu(c->usage[i]); - kfree(c->usage_base); if (c->btree_iters_bufs) for_each_possible_cpu(cpu) @@ -501,8 +498,6 @@ static void __bch2_fs_free(struct bch_fs *c) bioset_exit(&c->btree_bio); mempool_exit(&c->fill_iter); percpu_ref_exit(&c->writes); - kfree(c->replicas.entries); - kfree(c->replicas_gc.entries); kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); 
kfree(c->unused_inode_hints);