diff --git a/.bcachefs_revision b/.bcachefs_revision
index 03f5be4e..609a75dd 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-af2c94ff96a44454a785878c6674fcf210c5a426
+fe41b3880c63cd911e1f0036312fa9d846d81c4f
diff --git a/include/linux/types.h b/include/linux/types.h
index a1473592..44cdff5b 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -32,6 +32,7 @@ typedef unsigned gfp_t;
 #define GFP_NOFS	0
 #define GFP_NOIO	0
 #define GFP_NOWAIT	0
+#define __GFP_RECLAIM	0
 #define __GFP_FS	0
 #define __GFP_IO	0
 #define __GFP_NOWARN	0
diff --git a/libbcachefs/btree/cache.c b/libbcachefs/btree/cache.c
index d73bd4b1..fb0edd29 100644
--- a/libbcachefs/btree/cache.c
+++ b/libbcachefs/btree/cache.c
@@ -18,10 +18,15 @@
 #include "sb/counters.h"
 
+#include <linux/moduleparam.h>
 #include <linux/prefetch.h>
 #include <linux/sched/mm.h>
 #include <linux/swap.h>
 
+bool bch2_mm_avoid_compaction = true;
+module_param_named(mm_avoid_compaction, bch2_mm_avoid_compaction, bool, 0644);
+MODULE_PARM_DESC(mm_avoid_compaction, "Avoid blocking on memory compaction for btree node allocations");
+
 const char * const bch2_btree_node_flags[] = {
 	"typebit",
 	"typebit",
@@ -90,7 +95,20 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
 	six_unlock_intent(&b->c.lock);
 }
 
-void __btree_node_data_free(struct btree *b)
+static void __btree_node_data_free(struct btree *b)
+{
+	kvfree(b->data);
+	b->data = NULL;
+#ifdef __KERNEL__
+	kvfree(b->aux_data);
+#else
+	if (b->aux_data)
+		munmap(b->aux_data, btree_aux_data_bytes(b));
+#endif
+	b->aux_data = NULL;
+}
+
+void bch2_btree_node_data_free_locked(struct btree *b)
 {
 	BUG_ON(!list_empty(&b->list));
 	BUG_ON(btree_node_hashed(b));
@@ -108,23 +126,15 @@ void __btree_node_data_free(struct btree *b)
 	EBUG_ON(btree_node_write_in_flight(b));
 
 	clear_btree_node_just_written(b);
-
-	kvfree(b->data);
-	b->data = NULL;
-#ifdef __KERNEL__
-	kvfree(b->aux_data);
-#else
-	munmap(b->aux_data, btree_aux_data_bytes(b));
-#endif
-	b->aux_data = NULL;
+	__btree_node_data_free(b);
 }
 
-static void btree_node_data_free(struct bch_fs_btree_cache *bc, struct btree *b)
+static void bch2_btree_node_data_free(struct bch_fs_btree_cache *bc, struct btree *b)
 {
 	BUG_ON(list_empty(&b->list));
 	list_del_init(&b->list);
-	__btree_node_data_free(b);
+	bch2_btree_node_data_free_locked(b);
 
 	--bc->nr_freeable;
 	btree_node_to_freedlist(bc, b);
@@ -147,13 +157,29 @@ static const struct rhashtable_params bch_btree_cache_params = {
 	.automatic_shrinking	= true,
 };
 
-static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
+static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp,
+				 bool avoid_compaction)
 {
 	BUG_ON(b->data || b->aux_data);
 
 	gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
 
-	b->data = kvmalloc(btree_buf_bytes(b), gfp);
+	if (avoid_compaction && bch2_mm_avoid_compaction) {
+		/*
+		 * Cursed hack: mm doesn't know how to limit the amount of time
+		 * we spend blocked on compaction, even if we specified a
+		 * vmalloc fallback.
+		 *
+		 * So we have to do that ourselves: only try for a high order
+		 * page allocation if we're GFP_NOWAIT, otherwise straight to
+		 * vmalloc.
+		 */
+		b->data = gfp & __GFP_RECLAIM
+			? __vmalloc(btree_buf_bytes(b), gfp)
+			: kmalloc(btree_buf_bytes(b), gfp);
+	} else {
+		b->data = kvmalloc(btree_buf_bytes(b), gfp);
+	}
 	if (!b->data)
 		return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
 #ifdef __KERNEL__
@@ -165,20 +191,15 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
 	if (b->aux_data == MAP_FAILED)
 		b->aux_data = NULL;
 #endif
-	if (!b->aux_data) {
-		kvfree(b->data);
-		b->data = NULL;
+	if (!b->aux_data)
 		return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
-	}
 
 	return 0;
 }
 
 static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
 {
-	struct btree *b;
-
-	b = kzalloc(sizeof(struct btree), gfp);
+	struct btree *b = kzalloc(sizeof(struct btree), gfp);
 	if (!b)
 		return NULL;
 
@@ -195,7 +216,8 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
 	if (!b)
 		return NULL;
 
-	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
+	if (btree_node_data_alloc(c, b, GFP_KERNEL, false)) {
+		__btree_node_data_free(b);
 		kfree(b);
 		return NULL;
 	}
@@ -262,6 +284,9 @@ void __bch2_btree_node_hash_remove(struct bch_fs_btree_cache *bc, struct btree *
 	if (b->c.btree_id < BTREE_ID_NR)
 		--bc->nr_by_btree[b->c.btree_id];
 	--bc->live[btree_node_pinned(b)].nr;
+
+	bc->nr_vmalloc -= is_vmalloc_addr(b->data);
+
 	list_del_init(&b->list);
 }
 
@@ -279,6 +304,8 @@ int __bch2_btree_node_hash_insert(struct bch_fs_btree_cache *bc, struct btree *b
 	b->hash_val = btree_ptr_hash_val(&b->key);
 	try(rhashtable_lookup_insert_fast(&bc->table, &b->hash, bch_btree_cache_params));
 
+	bc->nr_vmalloc += is_vmalloc_addr(b->data);
+
 	if (b->c.btree_id < BTREE_ID_NR)
 		bc->nr_by_btree[b->c.btree_id]++;
 
@@ -502,7 +529,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
 			goto out;
 
 		if (!btree_node_reclaim(c, b)) {
-			btree_node_data_free(bc, b);
+			bch2_btree_node_data_free(bc, b);
 			six_unlock_write(&b->c.lock);
 			six_unlock_intent(&b->c.lock);
 			freed++;
@@ -519,7 +546,7 @@ restart:
 			--touched;
 		} else if (!btree_node_reclaim(c, b)) {
 			__bch2_btree_node_hash_remove(bc, b);
-			__btree_node_data_free(b);
+			bch2_btree_node_data_free_locked(b);
 			btree_node_to_freedlist(bc, b);
 
 			freed++;
@@ -606,7 +633,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
 		BUG_ON(btree_node_read_in_flight(b) ||
 		       btree_node_write_in_flight(b));
 
-		btree_node_data_free(bc, b);
+		bch2_btree_node_data_free(bc, b);
 		cond_resched();
 	}
 
@@ -830,10 +857,12 @@ got_node:
 
 	mutex_unlock(&bc->lock);
 
-	if (btree_node_data_alloc(c, b, GFP_NOWAIT)) {
+	if (btree_node_data_alloc(c, b, GFP_NOWAIT, true)) {
 		bch2_trans_unlock(trans);
-		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
+		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN, true)) {
+			__btree_node_data_free(b);
 			goto err;
+		}
 	}
 
 got_mem:
@@ -1371,7 +1400,7 @@ wait_on_io:
 
 	mutex_lock(&bc->lock);
 	bch2_btree_node_hash_remove(bc, b);
-	btree_node_data_free(bc, b);
+	bch2_btree_node_data_free(bc, b);
 	mutex_unlock(&bc->lock);
 out:
 	six_unlock_write(&b->c.lock);
@@ -1484,6 +1513,7 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs_btree_ca
 
 	prt_btree_cache_line(out, c, "live:",		bc->live[0].nr);
 	prt_btree_cache_line(out, c, "pinned:",		bc->live[1].nr);
+	prt_btree_cache_line(out, c, "vmalloc:",	bc->nr_vmalloc);
 	prt_btree_cache_line(out, c, "reserve:",	bc->nr_reserve);
 	prt_btree_cache_line(out, c, "freed:",		bc->nr_freeable);
 	prt_btree_cache_line(out, c, "dirty:",		atomic_long_read(&bc->nr_dirty));
diff --git a/libbcachefs/btree/cache.h b/libbcachefs/btree/cache.h
index 20d7899f..61235e95 100644
--- a/libbcachefs/btree/cache.h
+++ b/libbcachefs/btree/cache.h
@@ -30,7 +30,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsig
 void bch2_btree_cache_cannibalize_unlock(struct btree_trans *);
 int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *);
 
-void __btree_node_data_free(struct btree *);
+void bch2_btree_node_data_free_locked(struct btree *);
 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
 struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
diff --git a/libbcachefs/btree/node_scan.c b/libbcachefs/btree/node_scan.c
index 912fe923..09e19f7f 100644
--- a/libbcachefs/btree/node_scan.c
+++ b/libbcachefs/btree/node_scan.c
@@ -261,7 +261,7 @@ static int read_btree_nodes_worker(void *p)
 	}
 err:
 	if (b)
-		__btree_node_data_free(b);
+		bch2_btree_node_data_free_locked(b);
 	kfree(b);
 	bio_put(bio);
 	enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
diff --git a/libbcachefs/btree/read.c b/libbcachefs/btree/read.c
index 03464b5d..c855e970 100644
--- a/libbcachefs/btree/read.c
+++ b/libbcachefs/btree/read.c
@@ -95,6 +95,7 @@ void bch2_btree_node_wait_on_write(struct btree *b)
 		       TASK_UNINTERRUPTIBLE);
 }
 
+__printf(7, 0)
 static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
 			  struct bch_dev *ca,
 			  struct btree *b, struct bset *i, struct bkey_packed *k,
 			  const char *fmt, va_list args)
diff --git a/libbcachefs/btree/types.h b/libbcachefs/btree/types.h
index 1dd51c56..4c99e2eb 100644
--- a/libbcachefs/btree/types.h
+++ b/libbcachefs/btree/types.h
@@ -200,6 +200,7 @@ struct bch_fs_btree_cache {
 	struct list_head	freed_nonpcpu;
 	struct btree_cache_list	live[2];
 
+	size_t			nr_vmalloc;
 	size_t			nr_freeable;
 	size_t			nr_reserve;
 	size_t			nr_by_btree[BTREE_ID_NR];