diff --git a/.bcachefs_revision b/.bcachefs_revision
index baa146ea..4649f2ba 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-938f680845d1be28979e23aee972dba010c464ba
+783085c3cc440183ba5e987b1aa7791cc1ca42ba
diff --git a/Makefile b/Makefile
index 61a62455..3bd84874 100644
--- a/Makefile
+++ b/Makefile
@@ -18,7 +18,7 @@ else
 	Q = @
 endif
 
-CFLAGS+=-std=gnu11 -O2 -g -MMD -Wall -fPIC \
+CFLAGS+=-std=gnu11 -O2 -g -MMD -Wall -fPIC \
 	-Wno-pointer-sign \
 	-Wno-deprecated-declarations \
 	-fno-strict-aliasing \
diff --git a/cmd_data.c b/cmd_data.c
index 6d709883..1ef689bc 100644
--- a/cmd_data.c
+++ b/cmd_data.c
@@ -5,6 +5,7 @@
 
 #include "libbcachefs/bcachefs_ioctl.h"
 #include "libbcachefs/btree_cache.h"
+#include "libbcachefs/move.h"
 
 #include "cmds.h"
 #include "libbcachefs.h"
@@ -55,7 +56,7 @@ int cmd_data_rereplicate(int argc, char *argv[])
 		die("too many arguments");
 
 	return bchu_data(bcache_fs_open(fs_path), (struct bch_ioctl_data) {
-		.op		= BCH_DATA_OP_REREPLICATE,
+		.op		= BCH_DATA_OP_rereplicate,
 		.start_btree	= 0,
 		.start_pos	= POS_MIN,
 		.end_btree	= BTREE_ID_NR,
@@ -70,7 +71,7 @@ static void data_job_usage(void)
 	     "\n"
 	     "Kick off a data job and report progress\n"
 	     "\n"
-	     "job: one of scrub, rereplicate, migrate, or rewrite_old_nodes\n"
+	     "job: one of scrub, rereplicate, migrate, rewrite_old_nodes, or drop_extra_replicas\n"
 	     "\n"
 	     "Options:\n"
 	     "  -b btree                    btree to operate on\n"
@@ -81,14 +82,6 @@ static void data_job_usage(void)
 	exit(EXIT_SUCCESS);
 }
 
-const char * const data_jobs[] = {
-	"scrub",
-	"rereplicate",
-	"migrate",
-	"rewrite_old_nodes",
-	NULL
-};
-
 int cmd_data_job(int argc, char *argv[])
 {
 	struct bch_ioctl_data op = {
@@ -121,10 +114,7 @@ int cmd_data_job(int argc, char *argv[])
 	if (!job)
 		die("please specify which type of job");
 
-	op.op = read_string_list_or_die(job, data_jobs, "bad job type");
-
-	if (op.op == BCH_DATA_OP_SCRUB)
-		die("scrub not implemented yet");
+	op.op = read_string_list_or_die(job, bch2_data_ops_strs, "bad job type");
 
 	char *fs_path = arg_pop();
 	if (!fs_path)
diff --git a/cmd_device.c b/cmd_device.c
index 1cb31ab8..bd496835 100644
--- a/cmd_device.c
+++ b/cmd_device.c
@@ -332,7 +332,7 @@ int cmd_device_evacuate(int argc, char *argv[])
 	}
 
 	return bchu_data(fs, (struct bch_ioctl_data) {
-		.op		= BCH_DATA_OP_MIGRATE,
+		.op		= BCH_DATA_OP_migrate,
 		.start_btree	= 0,
 		.start_pos	= POS_MIN,
 		.end_btree	= BTREE_ID_NR,
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 2c983cd4..7effc161 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -161,6 +161,13 @@ static inline i_type a_type##_read(const a_type##_t *v)	\
 	return __ATOMIC_READ(&v->counter);				\
 }									\
 									\
+static inline i_type a_type##_read_acquire(const a_type##_t *v)	\
+{									\
+	i_type ret = __ATOMIC_READ(&v->counter);			\
+	smp_mb__after_atomic();						\
+	return ret;							\
+}									\
+									\
 static inline void a_type##_set(a_type##_t *v, i_type i)		\
 {									\
 	return __ATOMIC_SET(&v->counter, i);				\
diff --git a/include/linux/closure.h b/include/linux/closure.h
index de7bb47d..c554c6a0 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -104,7 +104,7 @@
 
 struct closure;
 struct closure_syncer;
-typedef void (closure_fn) (struct closure *);
+typedef void (closure_fn) (struct work_struct *);
 extern struct dentry *bcache_debug;
 
 struct closure_waitlist {
@@ -254,7 +254,7 @@ static inline void closure_queue(struct closure *cl)
 		INIT_WORK(&cl->work, cl->work.func);
 		BUG_ON(!queue_work(wq, &cl->work));
 	} else
-		cl->fn(cl);
+		cl->fn(&cl->work);
 }
 
 /**
@@ -309,6 +309,11 @@ static inline void closure_wake_up(struct closure_waitlist *list)
 	__closure_wake_up(list);
 }
 
+#define CLOSURE_CALLBACK(name)	void name(struct work_struct *ws)
+#define closure_type(name, type, member)				\
+	struct closure *cl = container_of(ws, struct closure, work);	\
+	type *name = container_of(cl, type, member)
+
 /**
  * continue_at - jump to another function with barrier
  *
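The CLOSURE_CALLBACK()/closure_type() pair added above exists because closures now run as ordinary work items: every callback receives a struct work_struct * and has to recover its container through two container_of() steps, which the macros hide. A minimal sketch of a conversion, using made-up names (struct my_op, my_op_done) - the real conversions are in the btree_io.c, btree_update_interior.c and fs-io-direct.c hunks later in this patch:

struct my_op {
	struct closure	cl;
	int		result;
};

/* Old style:
 * static void my_op_done(struct closure *cl)
 * {
 *	struct my_op *op = container_of(cl, struct my_op, cl);
 *	...
 * }
 */

/* New style: the callback really receives a work_struct... */
static CLOSURE_CALLBACK(my_op_done)
{
	/* ...and closure_type() recovers both the closure and container: */
	closure_type(op, struct my_op, cl);

	pr_info("op done: %i\n", op->result);
}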
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index bca00d61..2d1adabf 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -22,10 +22,18 @@ struct shrinker {
 	int seeks;	/* seeks to recreate an obj */
 	long batch;	/* reclaim batch size, 0 = default */
 	struct list_head list;
+	void *private_data;
 };
 
-int register_shrinker(struct shrinker *, const char *, ...);
-void unregister_shrinker(struct shrinker *);
+static inline void shrinker_free(struct shrinker *s)
+{
+	free(s);
+}
+
+struct shrinker *shrinker_alloc(unsigned int, const char *, ...);
+
+int shrinker_register(struct shrinker *);
+void shrinker_unregister(struct shrinker *);
 
 void run_shrinkers(gfp_t gfp_mask, bool);
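This shim tracks the kernel's shrinker API rework: shrinkers are now heap-allocated via shrinker_alloc() instead of embedded in their containing structure, and private_data replaces the container_of() idiom. A sketch of the conversion pattern this forces on users (compare the btree_cache.c and btree_key_cache.c hunks below); "my_cache" is a made-up example type:

struct my_cache {
	struct shrinker	*shrink;	/* was: struct shrinker shrink; */
};

static unsigned long my_cache_scan(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	/* was: container_of(shrink, struct my_cache, shrink) */
	struct my_cache *c = shrink->private_data;

	return 0;	/* nr of objects freed */
}

static int my_cache_init(struct my_cache *c, const char *name)
{
	struct shrinker *shrink = shrinker_alloc(0, "%s", name);

	if (!shrink)
		return -ENOMEM;
	c->shrink = shrink;
	shrink->scan_objects	= my_cache_scan;
	shrink->private_data	= c;
	shrinker_register(shrink);
	return 0;
}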
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 113273b2..1ed8506c 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -847,6 +847,19 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
 			return ret;
 	}
 
+	/*
+	 * need to know if we're getting called from the invalidate path or
+	 * not:
+	 */
+
+	if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
+	    old_a->cached_sectors) {
+		ret = bch2_update_cached_sectors_list(trans, new->k.p.inode,
+						      -((s64) old_a->cached_sectors));
+		if (ret)
+			return ret;
+	}
+
 	return 0;
 }
 
@@ -1212,7 +1225,7 @@ fsck_err:
 	return ret;
 }
 
-static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans,
+static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans,
 						 struct btree_iter *iter)
 {
 	struct bch_fs *c = trans->c;
@@ -1271,24 +1284,6 @@ delete:
 	goto out;
 }
 
-static int bch2_check_discard_freespace_key(struct btree_trans *trans,
-					    struct btree_iter *iter,
-					    struct bpos end)
-{
-	if (!btree_id_is_extents(iter->btree_id)) {
-		return __bch2_check_discard_freespace_key(trans, iter);
-	} else {
-		int ret = 0;
-
-		while (!bkey_eq(iter->pos, end) &&
-		       !(ret = btree_trans_too_many_iters(trans) ?:
-			       __bch2_check_discard_freespace_key(trans, iter)))
-			bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
-
-		return ret;
-	}
-}
-
 /*
  * We've already checked that generation numbers in the bucket_gens btree are
  * valid for buckets that exist; this just checks for keys for nonexistent
@@ -1445,12 +1440,40 @@ bkey_err:
 	ret = for_each_btree_key2(trans, iter,
 			BTREE_ID_need_discard, POS_MIN,
 			BTREE_ITER_PREFETCH, k,
-			bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
-	      for_each_btree_key2(trans, iter,
-			BTREE_ID_freespace, POS_MIN,
-			BTREE_ITER_PREFETCH, k,
-			bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
-	      for_each_btree_key_commit(trans, iter,
+			bch2_check_discard_freespace_key(trans, &iter));
+	if (ret)
+		goto err;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
+			     BTREE_ITER_PREFETCH);
+	while (1) {
+		bch2_trans_begin(trans);
+		k = bch2_btree_iter_peek(&iter);
+		if (!k.k)
+			break;
+
+		ret = bkey_err(k) ?:
+			bch2_check_discard_freespace_key(trans, &iter);
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+			ret = 0;
+			continue;
+		}
+		if (ret) {
+			struct printbuf buf = PRINTBUF;
+			bch2_bkey_val_to_text(&buf, c, k);
+
+			bch_err(c, "while checking %s", buf.buf);
+			printbuf_exit(&buf);
+			break;
+		}
+
+		bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
+	}
+	bch2_trans_iter_exit(trans, &iter);
+	if (ret)
+		goto err;
+
+	ret = for_each_btree_key_commit(trans, iter,
 			BTREE_ID_bucket_gens, POS_MIN,
 			BTREE_ITER_PREFETCH, k,
 			NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
@@ -1802,7 +1825,7 @@ static void bch2_do_invalidates_work(struct work_struct *work)
 	unsigned i;
 	int ret = 0;
 
-	ret = bch2_btree_write_buffer_flush(trans);
+	ret = bch2_btree_write_buffer_tryflush(trans);
 	if (ret)
 		goto err;
 
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index b85c7765..eef6fa8d 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -1297,6 +1297,30 @@ out:
 	return wp;
 }
 
+static noinline void
+deallocate_extra_replicas(struct bch_fs *c,
+			  struct open_buckets *ptrs,
+			  struct open_buckets *ptrs_no_use,
+			  unsigned extra_replicas)
+{
+	struct open_buckets ptrs2 = { 0 };
+	struct open_bucket *ob;
+	unsigned i;
+
+	open_bucket_for_each(c, ptrs, ob, i) {
+		unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
+
+		if (d && d <= extra_replicas) {
+			extra_replicas -= d;
+			ob_push(c, ptrs_no_use, ob);
+		} else {
+			ob_push(c, &ptrs2, ob);
+		}
+	}
+
+	*ptrs = ptrs2;
+}
+
 /*
  * Get us an open_bucket we can allocate from, return with it locked:
  */
@@ -1382,6 +1406,9 @@ alloc_done:
 	if (ret)
 		goto err;
 
+	if (nr_effective > nr_replicas)
+		deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
+
 	/* Free buckets we didn't use: */
 	open_bucket_for_each(c, &wp->ptrs, ob, i)
 		open_bucket_free_unused(c, ob);
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 3117ab44..53f93f03 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -406,6 +406,7 @@ BCH_DEBUG_PARAMS_DEBUG()
 	x(blocked_journal_max_in_flight)	\
 	x(blocked_allocate)			\
 	x(blocked_allocate_open_bucket)		\
+	x(blocked_write_buffer_full)		\
 	x(nocow_lock_contended)
 
 enum bch_time_stats {
@@ -640,6 +641,8 @@ struct journal_keys {
 	size_t		gap;
 	size_t		nr;
 	size_t		size;
+	atomic_t	ref;
+	bool		initial_ref_held;
 };
 
 struct btree_trans_buf {
@@ -664,7 +667,8 @@ struct btree_trans_buf {
 	x(invalidate)						\
 	x(delete_dead_snapshots)				\
 	x(snapshot_delete_pagecache)				\
-	x(sysfs)
+	x(sysfs)						\
+	x(btree_write_buffer)
 
 enum bch_write_ref {
 #define x(n) BCH_WRITE_REF_##n,
@@ -1064,6 +1068,16 @@ static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
 #endif
 }
 
+static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+	return !test_bit(BCH_FS_GOING_RO, &c->flags) &&
+		atomic_long_inc_not_zero(&c->writes[ref]);
+#else
+	return percpu_ref_tryget(&c->writes);
+#endif
+}
+
 static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
 {
 #ifdef BCH_WRITE_REF_DEBUG
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 0a750953..ad0f298c 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -303,6 +303,13 @@ struct bkey_i {
 	struct bch_val	v;
 };
 
+#define POS_KEY(_pos)							\
+((struct bkey) {							\
+	.u64s		= BKEY_U64s,					\
+	.format		= KEY_FORMAT_CURRENT,				\
+	.p		= _pos,						\
+})
+
 #define KEY(_inode, _offset, _size)					\
 ((struct bkey) {							\
 	.u64s		= BKEY_U64s,					\
@@ -1436,7 +1443,7 @@ struct bch_sb_field_replicas_v0 {
 	struct bch_replicas_entry_v0 entries[];
 } __packed __aligned(8);
 
-struct bch_replicas_entry {
+struct bch_replicas_entry_v1 {
 	__u8			data_type;
 	__u8			nr_devs;
 	__u8			nr_required;
@@ -1448,7 +1455,7 @@ struct bch_replicas_entry {
 
 struct bch_sb_field_replicas {
 	struct bch_sb_field	field;
-	struct bch_replicas_entry entries[];
+	struct bch_replicas_entry_v1 entries[];
 } __packed __aligned(8);
 
 /* BCH_SB_FIELD_quota: */
@@ -2124,7 +2131,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
 	x(clock,		7)		\
 	x(dev_usage,		8)		\
 	x(log,			9)		\
-	x(overwrite,		10)
+	x(overwrite,		10)		\
+	x(write_buffer_keys,	11)
 
 enum {
 #define x(f, nr)	BCH_JSET_ENTRY_##f	= nr,
@@ -2174,7 +2182,7 @@ struct jset_entry_usage {
 struct jset_entry_data_usage {
 	struct jset_entry	entry;
 	__le64			v;
-	struct bch_replicas_entry r;
+	struct bch_replicas_entry_v1 r;
 } __packed;
 
 struct jset_entry_clock {
diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h
index f05881f7..18eb3254 100644
--- a/libbcachefs/bcachefs_ioctl.h
+++ b/libbcachefs/bcachefs_ioctl.h
@@ -173,12 +173,18 @@ struct bch_ioctl_disk_set_state {
 	__u64			dev;
 };
 
+#define BCH_DATA_OPS()			\
+	x(scrub,		0)	\
+	x(rereplicate,		1)	\
+	x(migrate,		2)	\
+	x(rewrite_old_nodes,	3)	\
+	x(drop_extra_replicas,	4)
+
 enum bch_data_ops {
-	BCH_DATA_OP_SCRUB	= 0,
-	BCH_DATA_OP_REREPLICATE	= 1,
-	BCH_DATA_OP_MIGRATE	= 2,
-	BCH_DATA_OP_REWRITE_OLD_NODES = 3,
-	BCH_DATA_OP_NR		= 4,
+#define x(t, n) BCH_DATA_OP_##t = n,
+	BCH_DATA_OPS()
+#undef x
+	BCH_DATA_OP_NR
 };
 
 /*
@@ -237,7 +243,7 @@ struct bch_ioctl_data_event {
 
 struct bch_replicas_usage {
 	__u64			sectors;
-	struct bch_replicas_entry r;
+	struct bch_replicas_entry_v1 r;
 } __packed;
 
 static inline struct bch_replicas_usage *
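The BCH_DATA_OPS() x-macro keeps the ioctl enum and its user-visible string list in sync from one definition; that is what lets cmd_data.c above drop its hand-maintained data_jobs[] array in favor of bch2_data_ops_strs. The actual table lives in libbcachefs (not part of this diff), but with this pattern it is presumably generated along these lines:

const char * const bch2_data_ops_strs[] = {
#define x(t, n) [n] = #t,		/* [0] = "scrub", [1] = "rereplicate", ... */
	BCH_DATA_OPS()
#undef x
	NULL				/* terminator for read_string_list_or_die() */
};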
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 84636ad0..72dea90e 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -318,8 +318,7 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
 static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
 					   struct shrink_control *sc)
 {
-	struct bch_fs *c = container_of(shrink, struct bch_fs,
-					btree_cache.shrink);
+	struct bch_fs *c = shrink->private_data;
 	struct btree_cache *bc = &c->btree_cache;
 	struct btree *b, *t;
 	unsigned long nr = sc->nr_to_scan;
@@ -420,8 +419,7 @@ out_nounlock:
 static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
 					    struct shrink_control *sc)
 {
-	struct bch_fs *c = container_of(shrink, struct bch_fs,
-					btree_cache.shrink);
+	struct bch_fs *c = shrink->private_data;
 	struct btree_cache *bc = &c->btree_cache;
 
 	if (bch2_btree_shrinker_disabled)
@@ -432,8 +430,7 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
 
 static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
 {
-	struct bch_fs *c = container_of(shrink, struct bch_fs,
-					btree_cache.shrink);
+	struct bch_fs *c = shrink->private_data;
 	char *cbuf;
 	size_t buflen = seq_buf_get_buf(s, &cbuf);
 	struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
@@ -448,7 +445,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
 	struct btree *b;
 	unsigned i, flags;
 
-	unregister_shrinker(&bc->shrink);
+	shrinker_free(bc->shrink);
 
 	/* vfree() can allocate memory: */
 	flags = memalloc_nofs_save();
@@ -502,6 +499,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
 int bch2_fs_btree_cache_init(struct bch_fs *c)
 {
 	struct btree_cache *bc = &c->btree_cache;
+	struct shrinker *shrink;
 	unsigned i;
 	int ret = 0;
 
@@ -521,13 +519,16 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
 
 	mutex_init(&c->verify_lock);
 
-	bc->shrink.count_objects	= bch2_btree_cache_count;
-	bc->shrink.scan_objects		= bch2_btree_cache_scan;
-	bc->shrink.to_text		= bch2_btree_cache_shrinker_to_text;
-	bc->shrink.seeks		= 4;
-	ret = register_shrinker(&bc->shrink, "%s-btree_cache", c->name);
-	if (ret)
+	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
+	if (!shrink)
 		goto err;
+	bc->shrink = shrink;
+	shrink->count_objects	= bch2_btree_cache_count;
+	shrink->scan_objects	= bch2_btree_cache_scan;
+	shrink->to_text		= bch2_btree_cache_shrinker_to_text;
+	shrink->seeks		= 4;
+	shrink->private_data	= c;
+	shrinker_register(shrink);
 
 	return 0;
 err:
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index c4922bd3..7e5d52f8 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -1287,7 +1287,7 @@ static int bch2_gc_done(struct bch_fs *c,
 	}
 
 	for (i = 0; i < c->replicas.nr; i++) {
-		struct bch_replicas_entry *e =
+		struct bch_replicas_entry_v1 *e =
 			cpu_replicas_entry(&c->replicas, i);
 
 		if (metadata_only &&
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 1f73ee0e..3c663c59 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1358,10 +1358,9 @@ static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *
 	return offset;
 }
 
-static void btree_node_read_all_replicas_done(struct closure *cl)
+static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
 {
-	struct btree_node_read_all *ra =
-		container_of(cl, struct btree_node_read_all, cl);
+	closure_type(ra, struct btree_node_read_all, cl);
 	struct bch_fs *c = ra->c;
 	struct btree *b = ra->b;
 	struct printbuf buf = PRINTBUF;
@@ -1567,7 +1566,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
 
 	if (sync) {
 		closure_sync(&ra->cl);
-		btree_node_read_all_replicas_done(&ra->cl);
+		btree_node_read_all_replicas_done(&ra->cl.work);
 	} else {
 		continue_at(&ra->cl, btree_node_read_all_replicas_done,
 			    c->io_complete_wq);
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 31286950..a52fd206 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -1854,19 +1854,11 @@ static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
 					      struct btree_iter *iter,
 					      struct bpos end_pos)
 {
-	struct bkey_i *k;
-
-	if (bpos_lt(iter->path->pos, iter->journal_pos))
-		iter->journal_idx = 0;
-
-	k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
-					iter->path->level,
-					iter->path->pos,
-					end_pos,
-					&iter->journal_idx);
-
-	iter->journal_pos = k ? k->k.p : end_pos;
-	return k;
+	return bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
+					   iter->path->level,
+					   iter->path->pos,
+					   end_pos,
+					   &iter->journal_idx);
 }
 
 static noinline
@@ -2874,7 +2866,8 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
 	trans->fn_idx		= fn_idx;
 	trans->locking_wait.task = current;
 	trans->journal_replay_not_finished =
-		!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
+		unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
+		atomic_inc_not_zero(&c->journal_keys.ref);
 	closure_init_stack(&trans->ref);
 
 	s = btree_trans_stats(trans);
@@ -2991,6 +2984,9 @@ void bch2_trans_put(struct btree_trans *trans)
 		kfree(trans->fs_usage_deltas);
 	}
 
+	if (unlikely(trans->journal_replay_not_finished))
+		bch2_journal_keys_put(c);
+
 	if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
 		mempool_free(trans->mem, &c->btree_trans_mem_pool);
 	else
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index e5b989a8..a4fec7cc 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -445,14 +445,16 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
 					  unsigned flags,
 					  unsigned long ip)
 {
-	memset(iter, 0, sizeof(*iter));
-	iter->trans	= trans;
-	iter->btree_id	= btree_id;
-	iter->flags	= flags;
-	iter->snapshot	= pos.snapshot;
-	iter->pos	= pos;
-	iter->k.p	= pos;
-
+	iter->trans		= trans;
+	iter->update_path	= NULL;
+	iter->key_cache_path	= NULL;
+	iter->btree_id		= btree_id;
+	iter->min_depth		= 0;
+	iter->flags		= flags;
+	iter->snapshot		= pos.snapshot;
+	iter->pos		= pos;
+	iter->k			= POS_KEY(pos);
+	iter->journal_idx	= 0;
 #ifdef CONFIG_BCACHEFS_DEBUG
 	iter->ip_allocated = ip;
 #endif
diff --git a/libbcachefs/btree_journal_iter.c b/libbcachefs/btree_journal_iter.c
index 58a981bc..7a5e0a89 100644
--- a/libbcachefs/btree_journal_iter.c
+++ b/libbcachefs/btree_journal_iter.c
@@ -73,6 +73,7 @@ static size_t bch2_journal_key_search(struct journal_keys *keys,
 	return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
 }
 
+/* Returns first non-overwritten key >= search key: */
 struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
 					   unsigned level, struct bpos pos,
 					   struct bpos end_pos, size_t *idx)
@@ -80,16 +81,32 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btre
 	struct journal_keys *keys = &c->journal_keys;
 	unsigned iters = 0;
 	struct journal_key *k;
+
+	BUG_ON(*idx > keys->nr);
 search:
 	if (!*idx)
 		*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
 
+	while (*idx &&
+	       __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
+		--(*idx);
+		iters++;
+		if (iters == 10) {
+			*idx = 0;
+			goto search;
+		}
+	}
+
 	while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
 		if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
 			return NULL;
 
-		if (__journal_key_cmp(btree_id, level, pos, k) <= 0 &&
-		    !k->overwritten)
+		if (k->overwritten) {
+			(*idx)++;
+			continue;
+		}
+
+		if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
 			return k->k;
 
 		(*idx)++;
@@ -189,10 +206,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
 		/* Since @keys was full, there was no gap: */
 		memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
 		kvfree(keys->d);
-		*keys = new_keys;
+		keys->d		= new_keys.d;
+		keys->nr	= new_keys.nr;
+		keys->size	= new_keys.size;
 
 		/* And now the gap is at the end: */
-		keys->gap = keys->nr;
+		keys->gap	= keys->nr;
 	}
 
 	journal_iters_move_gap(c, keys->gap, idx);
@@ -415,10 +434,16 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
 		cmp_int(l->journal_offset, r->journal_offset);
 }
 
-void bch2_journal_keys_free(struct journal_keys *keys)
+void bch2_journal_keys_put(struct bch_fs *c)
 {
+	struct journal_keys *keys = &c->journal_keys;
 	struct journal_key *i;
 
+	BUG_ON(atomic_read(&keys->ref) <= 0);
+
+	if (!atomic_dec_and_test(&keys->ref))
+		return;
+
 	move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
 	keys->gap = keys->nr;
 
@@ -429,6 +454,8 @@ void bch2_journal_keys_free(struct journal_keys *keys)
 	kvfree(keys->d);
 	keys->d = NULL;
 	keys->nr = keys->gap = keys->size = 0;
+
+	bch2_journal_entries_free(c);
 }
 
 static void __journal_keys_sort(struct journal_keys *keys)
diff --git a/libbcachefs/btree_journal_iter.h b/libbcachefs/btree_journal_iter.h
index 5d64e7e2..8ca4c100 100644
--- a/libbcachefs/btree_journal_iter.h
+++ b/libbcachefs/btree_journal_iter.h
@@ -49,7 +49,15 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
 						struct bch_fs *,
 						struct btree *);
 
-void bch2_journal_keys_free(struct journal_keys *);
+void bch2_journal_keys_put(struct bch_fs *);
+
+static inline void bch2_journal_keys_put_initial(struct bch_fs *c)
+{
+	if (c->journal_keys.initial_ref_held)
+		bch2_journal_keys_put(c);
+	c->journal_keys.initial_ref_held = false;
+}
+
 void bch2_journal_entries_free(struct bch_fs *);
 
 int bch2_journal_keys_sort(struct bch_fs *);
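journal_keys now carries a reference count, so btree transactions can keep using the in-memory journal keys while replay is finishing: __bch2_trans_get() takes a ref (see the btree_iter.c hunk above), bch2_trans_put() drops it, and recovery's initial ref is dropped through bch2_journal_keys_put_initial(). A sketch of the lifetime rule, with a made-up caller (the names are from this patch):

static void example_use_journal_keys(struct bch_fs *c)
{
	/* Pin the keys; fails once the last ref has been dropped: */
	if (!atomic_inc_not_zero(&c->journal_keys.ref))
		return;

	/* ... safe to walk c->journal_keys here ... */

	bch2_journal_keys_put(c);	/* frees keys and entries on last ref */
}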
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 8e80a5b6..e14e9b4c 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -646,11 +646,19 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
 	if (journal_seq && ck->journal.seq != journal_seq)
 		goto out;
 
+	trans->journal_res.seq = ck->journal.seq;
+
 	/*
-	 * Since journal reclaim depends on us making progress here, and the
-	 * allocator/copygc depend on journal reclaim making progress, we need
-	 * to be using alloc reserves:
+	 * If we're at the end of the journal, we really want to free up space
+	 * in the journal right away - we don't want to pin that old journal
+	 * sequence number with a new btree node write, we want to re-journal
+	 * the update
 	 */
+	if (ck->journal.seq == journal_last_seq(j))
+		commit_flags |= BCH_WATERMARK_reclaim;
+	else
+		commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
+
 	ret   = bch2_btree_iter_traverse(&b_iter) ?:
 		bch2_trans_update(trans, &b_iter, ck->k,
 				  BTREE_UPDATE_KEY_CACHE_RECLAIM|
@@ -659,9 +667,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
 		bch2_trans_commit(trans, NULL, NULL,
 				  BCH_TRANS_COMMIT_no_check_rw|
 				  BCH_TRANS_COMMIT_no_enospc|
-				  (ck->journal.seq == journal_last_seq(j)
-				   ? BCH_WATERMARK_reclaim
-				   : 0)|
 				  commit_flags);
 
 	bch2_fs_fatal_err_on(ret &&
@@ -830,8 +835,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
 static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
 					       struct shrink_control *sc)
 {
-	struct bch_fs *c = container_of(shrink, struct bch_fs,
-					btree_key_cache.shrink);
+	struct bch_fs *c = shrink->private_data;
 	struct btree_key_cache *bc = &c->btree_key_cache;
 	struct bucket_table *tbl;
 	struct bkey_cached *ck, *t;
@@ -932,8 +936,7 @@ out:
 static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
 						struct shrink_control *sc)
 {
-	struct bch_fs *c = container_of(shrink, struct bch_fs,
-					btree_key_cache.shrink);
+	struct bch_fs *c = shrink->private_data;
 	struct btree_key_cache *bc = &c->btree_key_cache;
 	long nr = atomic_long_read(&bc->nr_keys) -
 		  atomic_long_read(&bc->nr_dirty);
@@ -953,7 +956,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
 	int cpu;
 #endif
 
-	unregister_shrinker(&bc->shrink);
+	shrinker_free(bc->shrink);
 
 	mutex_lock(&bc->lock);
 
@@ -1028,8 +1031,8 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
 
 static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
 {
-	struct btree_key_cache *bc =
-		container_of(shrink, struct btree_key_cache, shrink);
+	struct bch_fs *c = shrink->private_data;
+	struct btree_key_cache *bc = &c->btree_key_cache;
 	char *cbuf;
 	size_t buflen = seq_buf_get_buf(s, &cbuf);
 	struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
@@ -1041,6 +1044,7 @@ static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shri
 int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
 {
 	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+	struct shrinker *shrink;
 
 #ifdef __KERNEL__
 	bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
@@ -1053,12 +1057,16 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
 
 	bc->table_init_done = true;
 
-	bc->shrink.seeks		= 0;
-	bc->shrink.count_objects	= bch2_btree_key_cache_count;
-	bc->shrink.scan_objects		= bch2_btree_key_cache_scan;
-	bc->shrink.to_text		= bch2_btree_key_cache_shrinker_to_text;
-	if (register_shrinker(&bc->shrink, "%s-btree_key_cache", c->name))
+	shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
+	if (!shrink)
 		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+	bc->shrink = shrink;
+	shrink->seeks		= 0;
+	shrink->count_objects	= bch2_btree_key_cache_count;
+	shrink->scan_objects	= bch2_btree_key_cache_scan;
+	shrink->to_text		= bch2_btree_key_cache_shrinker_to_text;
+	shrink->private_data	= c;
+	shrinker_register(shrink);
 
 	return 0;
 }
diff --git a/libbcachefs/btree_key_cache_types.h b/libbcachefs/btree_key_cache_types.h
index cfd09f51..290e4e57 100644
--- a/libbcachefs/btree_key_cache_types.h
+++ b/libbcachefs/btree_key_cache_types.h
@@ -17,7 +17,7 @@ struct btree_key_cache {
 	struct list_head	freed_nonpcpu;
 	size_t			nr_freed_nonpcpu;
 
-	struct shrinker		shrink;
+	struct shrinker		*shrink;
 	unsigned		shrink_iter;
 
 	struct btree_key_cache_freelist __percpu *pcpu_freed;
diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c
index 70077efa..09e94cc4 100644
--- a/libbcachefs/btree_trans_commit.c
+++ b/libbcachefs/btree_trans_commit.c
@@ -660,10 +660,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
 			i->k->k.needs_whiteout = false;
 	}
 
-	if (trans->nr_wb_updates &&
-	    trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size)
-		return -BCH_ERR_btree_insert_need_flush_buffer;
-
 	/*
 	 * Don't get journal reservation until after we know insert will
 	 * succeed:
@@ -698,14 +694,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
 	    bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
 		return -BCH_ERR_btree_insert_need_mark_replicas;
 
-	if (trans->nr_wb_updates) {
-		EBUG_ON(flags & BCH_TRANS_COMMIT_no_journal_res);
-
-		ret = bch2_btree_insert_keys_write_buffer(trans);
-		if (ret)
-			goto revert_fs_usage;
-	}
-
 	h = trans->hooks;
 	while (h) {
 		ret = h->fn(trans, h);
@@ -767,7 +755,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
 
 	trans_for_each_wb_update(trans, wb) {
 		entry = bch2_journal_add_entry(j, &trans->journal_res,
-				       BCH_JSET_ENTRY_btree_keys,
+				       BCH_JSET_ENTRY_write_buffer_keys,
 				       wb->btree, 0,
 				       wb->k.k.u64s);
 		bkey_copy((struct bkey_i *) entry->start, &wb->k);
@@ -951,30 +939,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
 		ret = bch2_trans_relock(trans);
 		break;
-	case -BCH_ERR_btree_insert_need_flush_buffer: {
-		struct btree_write_buffer *wb = &c->btree_write_buffer;
-
-		ret = 0;
-
-		if (wb->state.nr > wb->size * 3 / 4) {
-			bch2_trans_unlock(trans);
-			mutex_lock(&wb->flush_lock);
-
-			if (wb->state.nr > wb->size * 3 / 4) {
-				bch2_trans_begin(trans);
-				ret = __bch2_btree_write_buffer_flush(trans,
-						flags|BCH_TRANS_COMMIT_no_check_rw, true);
-				if (!ret) {
-					trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
-					ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
-				}
-			} else {
-				mutex_unlock(&wb->flush_lock);
-				ret = bch2_trans_relock(trans);
-			}
-		}
-		break;
-	}
 	default:
 		BUG_ON(ret >= 0);
 		break;
@@ -1073,20 +1037,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 		goto out_reset;
 	}
 
-	if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 &&
-	    mutex_trylock(&c->btree_write_buffer.flush_lock)) {
-		bch2_trans_begin(trans);
-		bch2_trans_unlock(trans);
-
-		ret = __bch2_btree_write_buffer_flush(trans,
-					flags|BCH_TRANS_COMMIT_no_check_rw, true);
-		if (!ret) {
-			trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
-			ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
-		}
-		goto out;
-	}
-
 	EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
 
 	trans->journal_u64s = trans->extra_journal_entries.nr;
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index e58575ad..14983e77 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -173,7 +173,7 @@ struct btree_cache {
 	unsigned	not_freed_will_make_reachable;
 	unsigned	not_freed_access_bit;
 	atomic_t	dirty;
-	struct shrinker	shrink;
+	struct shrinker	*shrink;
 
 	/*
 	 * If we need to allocate memory for a new btree node and that
@@ -297,8 +297,7 @@ struct btree_iter {
 	struct btree_path	*key_cache_path;
 
 	enum btree_id		btree_id:8;
-	unsigned		min_depth:3;
-	unsigned		advanced:1;
+	u8			min_depth;
 
 	/* btree_iter_copy starts here: */
 	u16			flags;
@@ -315,7 +314,6 @@ struct btree_iter {
 
 	/* BTREE_ITER_WITH_JOURNAL: */
 	size_t			journal_idx;
-	struct bpos		journal_pos;
 #ifdef TRACK_PATH_ALLOCATED
 	unsigned long		ip_allocated;
 #endif
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 18e5a751..bfe4d797 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -774,9 +774,9 @@ static void btree_interior_update_work(struct work_struct *work)
 	}
 }
 
-static void btree_update_set_nodes_written(struct closure *cl)
+static CLOSURE_CALLBACK(btree_update_set_nodes_written)
 {
-	struct btree_update *as = container_of(cl, struct btree_update, cl);
+	closure_type(as, struct btree_update, cl);
 	struct bch_fs *c = as->c;
 
 	mutex_lock(&c->btree_interior_update_lock);
diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c
index a6bf6ed3..0c2db1fa 100644
--- a/libbcachefs/btree_write_buffer.c
+++ b/libbcachefs/btree_write_buffer.c
@@ -7,43 +7,132 @@
 #include "btree_write_buffer.h"
 #include "error.h"
 #include "journal.h"
+#include "journal_io.h"
 #include "journal_reclaim.h"
 
-#include <linux/sort.h>
+#include <linux/prefetch.h>
 
 static int bch2_btree_write_buffer_journal_flush(struct journal *,
 						 struct journal_entry_pin *, u64);
 
-static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
-{
-	const struct btree_write_buffered_key *l = _l;
-	const struct btree_write_buffered_key *r = _r;
+static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *);
 
-	return  cmp_int(l->btree, r->btree) ?:
-		bpos_cmp(l->k.k.p, r->k.k.p) ?:
-		cmp_int(l->journal_seq, r->journal_seq) ?:
-		cmp_int(l->journal_offset, r->journal_offset);
+static inline bool __wb_key_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
+{
+	return (cmp_int(l->hi, r->hi) ?:
+		cmp_int(l->mi, r->mi) ?:
+		cmp_int(l->lo, r->lo)) >= 0;
 }
 
-static int btree_write_buffered_journal_cmp(const void *_l, const void *_r)
+static inline bool wb_key_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
 {
-	const struct btree_write_buffered_key *l = _l;
-	const struct btree_write_buffered_key *r = _r;
+#ifdef CONFIG_X86_64
+	int cmp;
 
-	return  cmp_int(l->journal_seq, r->journal_seq);
+	asm(".intel_syntax noprefix;"
+	    "mov rax, [%[l]];"
+	    "sub rax, [%[r]];"
+	    "mov rax, [%[l] + 8];"
+	    "sbb rax, [%[r] + 8];"
+	    "mov rax, [%[l] + 16];"
+	    "sbb rax, [%[r] + 16];"
+	    ".att_syntax prefix;"
+	    : "=@ccae" (cmp)
+	    : [l] "r" (l), [r] "r" (r)
+	    : "rax", "cc");
+
+	EBUG_ON(cmp != __wb_key_cmp(l, r));
+	return cmp;
+#else
+	return __wb_key_cmp(l, r);
+#endif
 }
 
-static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
-					     struct btree_iter *iter,
-					     struct btree_write_buffered_key *wb,
-					     unsigned commit_flags,
-					     bool *write_locked,
-					     size_t *fast)
+/* Compare excluding idx, the low 24 bits: */
+static inline bool wb_key_eq(const void *_l, const void *_r)
+{
+	const struct wb_key_ref *l = _l;
+	const struct wb_key_ref *r = _r;
+
+	return !((l->hi ^ r->hi)|
+		 (l->mi ^ r->mi)|
+		 ((l->lo >> 24) ^ (r->lo >> 24)));
+}
+
+static noinline void wb_sort(struct wb_key_ref *base, size_t num)
+{
+	size_t n = num, a = num / 2;
+
+	if (!a)		/* num < 2 || size == 0 */
+		return;
+
+	for (;;) {
+		size_t b, c, d;
+
+		if (a)			/* Building heap: sift down --a */
+			--a;
+		else if (--n)		/* Sorting: Extract root to --n */
+			swap(base[0], base[n]);
+		else			/* Sort complete */
+			break;
+
+		/*
+		 * Sift element at "a" down into heap.  This is the
+		 * "bottom-up" variant, which significantly reduces
+		 * calls to cmp_func(): we find the sift-down path all
+		 * the way to the leaves (one compare per level), then
+		 * backtrack to find where to insert the target element.
+		 *
+		 * Because elements tend to sift down close to the leaves,
+		 * this uses fewer compares than doing two per level
+		 * on the way down.  (A bit more than half as many on
+		 * average, 3/4 worst-case.)
+		 */
+		for (b = a; c = 2*b + 1, (d = c + 1) < n;)
+			b = wb_key_cmp(base + c, base + d) ? c : d;
+		if (d == n)		/* Special case last leaf with no sibling */
+			b = c;
+
+		/* Now backtrack from "b" to the correct location for "a" */
+		while (b != a && wb_key_cmp(base + a, base + b))
+			b = (b - 1) / 2;
+		c = b;			/* Where "a" belongs */
+		while (b != a) {	/* Shift it into place */
+			b = (b - 1) / 2;
+			swap(base[b], base[c]);
+		}
+	}
+}
+
+static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
+					  struct btree_iter *iter,
+					  struct btree_write_buffered_key *wb)
+{
+	bch2_btree_node_unlock_write(trans, iter->path, iter->path->l[0].b);
+
+	trans->journal_res.seq = wb->journal_seq;
+
+	return bch2_trans_update(trans, iter, &wb->k,
+				 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+		bch2_trans_commit(trans, NULL, NULL,
+				  BCH_TRANS_COMMIT_no_enospc|
+				  BCH_TRANS_COMMIT_no_check_rw|
+				  BCH_TRANS_COMMIT_no_journal_res|
+				  BCH_TRANS_COMMIT_journal_reclaim);
+}
+
+static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
+			       struct btree_write_buffered_key *wb,
+			       bool *write_locked, size_t *fast)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_path *path;
 	int ret;
 
+	EBUG_ON(!wb->journal_seq);
+	EBUG_ON(!c->btree_write_buffer.flushing.pin.seq);
+	EBUG_ON(c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
+
 	ret = bch2_btree_iter_traverse(iter);
 	if (ret)
 		return ret;
@@ -66,46 +155,14 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
 		*write_locked = true;
 	}
 
-	if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) {
-		bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+	if (unlikely(!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s))) {
 		*write_locked = false;
-		goto trans_commit;
+		return wb_flush_one_slowpath(trans, iter, wb);
 	}
 
 	bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
 	(*fast)++;
 	return 0;
-trans_commit:
-	trans->journal_res.seq = wb->journal_seq;
-
-	return  bch2_trans_update(trans, iter, &wb->k,
-				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
-		bch2_trans_commit(trans, NULL, NULL,
-				  commit_flags|
-				  BCH_TRANS_COMMIT_no_check_rw|
-				  BCH_TRANS_COMMIT_no_enospc|
-				  BCH_TRANS_COMMIT_no_journal_res|
-				  BCH_TRANS_COMMIT_journal_reclaim);
-}
-
-static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
-{
-	union btree_write_buffer_state old, new;
-	u64 v = READ_ONCE(wb->state.v);
-
-	do {
-		old.v = new.v = v;
-
-		new.nr = 0;
-		new.idx++;
-	} while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
-
-	while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1)
-		cpu_relax();
-
-	smp_mb();
-
-	return old;
 }
 
 /*
@@ -137,35 +194,79 @@ btree_write_buffered_insert(struct btree_trans *trans,
 	return ret;
 }
 
-int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags,
-				    bool locked)
+static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb)
+{
+	struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer);
+	struct journal *j = &c->journal;
+
+	if (!wb->inc.keys.nr)
+		return;
+
+	bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin,
+			     bch2_btree_write_buffer_journal_flush);
+
+	darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr));
+	darray_resize(&wb->sorted, wb->flushing.keys.size);
+
+	if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) {
+		swap(wb->flushing.keys, wb->inc.keys);
+		goto out;
+	}
+
+	size_t nr = min(darray_room(wb->flushing.keys),
+			wb->sorted.size - wb->flushing.keys.nr);
+	nr = min(nr, wb->inc.keys.nr);
+
+	memcpy(&darray_top(wb->flushing.keys),
+	       wb->inc.keys.data,
+	       sizeof(wb->inc.keys.data[0]) * nr);
+
+	memmove(wb->inc.keys.data,
+		wb->inc.keys.data + nr,
+		sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr));
+
+	wb->flushing.keys.nr	+= nr;
+	wb->inc.keys.nr		-= nr;
+out:
+	if (!wb->inc.keys.nr)
+		bch2_journal_pin_drop(j, &wb->inc.pin);
+	else
+		bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin,
+					bch2_btree_write_buffer_journal_flush);
+
+	if (j->watermark) {
+		spin_lock(&j->lock);
+		bch2_journal_set_watermark(j);
+		spin_unlock(&j->lock);
+	}
+
+	BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
+}
+
+static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 {
 	struct bch_fs *c = trans->c;
 	struct journal *j = &c->journal;
 	struct btree_write_buffer *wb = &c->btree_write_buffer;
-	struct journal_entry_pin pin;
-	struct btree_write_buffered_key *i, *keys;
+	struct wb_key_ref *i;
 	struct btree_iter iter = { NULL };
-	size_t nr = 0, skipped = 0, fast = 0, slowpath = 0;
+	size_t skipped = 0, fast = 0, slowpath = 0;
 	bool write_locked = false;
-	union btree_write_buffer_state s;
 	int ret = 0;
 
-	memset(&pin, 0, sizeof(pin));
+	bch2_trans_unlock(trans);
+	bch2_trans_begin(trans);
 
-	if (!locked && !mutex_trylock(&wb->flush_lock))
-		return 0;
+	mutex_lock(&wb->inc.lock);
+	move_keys_from_inc_to_flushing(wb);
+	mutex_unlock(&wb->inc.lock);
 
-	bch2_journal_pin_copy(j, &pin, &wb->journal_pin,
-			      bch2_btree_write_buffer_journal_flush);
-	bch2_journal_pin_drop(j, &wb->journal_pin);
-
-	s = btree_write_buffer_switch(wb);
-	keys = wb->keys[s.idx];
-	nr = s.nr;
-
-	if (race_fault())
-		goto slowpath;
+	for (size_t i = 0; i < wb->flushing.keys.nr; i++) {
+		wb->sorted.data[i].idx = i;
+		wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree;
+		memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos));
+	}
+	wb->sorted.nr = wb->flushing.keys.nr;
 
 	/*
 	 * We first sort so that we can detect and skip redundant updates, and
@@ -181,110 +282,178 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
 	 * If that happens, simply skip the key so we can optimistically insert
 	 * as many keys as possible in the fast path.
 	 */
-	sort(keys, nr, sizeof(keys[0]),
-	     btree_write_buffered_key_cmp, NULL);
+	wb_sort(wb->sorted.data, wb->sorted.nr);
+
+	darray_for_each(wb->sorted, i) {
+		struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
+
+		for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
+			prefetch(&wb->flushing.keys.data[n->idx]);
+
+		BUG_ON(!k->journal_seq);
+
+		if (i + 1 < &darray_top(wb->sorted) &&
+		    wb_key_eq(i, i + 1)) {
+			struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
 
-	for (i = keys; i < keys + nr; i++) {
-		if (i + 1 < keys + nr &&
-		    i[0].btree == i[1].btree &&
-		    bpos_eq(i[0].k.k.p, i[1].k.k.p)) {
 			skipped++;
-			i->journal_seq = 0;
+			n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
+			k->journal_seq = 0;
 			continue;
 		}
 
 		if (write_locked &&
-		    (iter.path->btree_id != i->btree ||
-		     bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) {
+		    (iter.path->btree_id != k->btree ||
+		     bpos_gt(k->k.k.p, iter.path->l[0].b->key.k.p))) {
 			bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
 			write_locked = false;
 		}
 
-		if (!iter.path || iter.path->btree_id != i->btree) {
+		if (!iter.path || iter.path->btree_id != k->btree) {
 			bch2_trans_iter_exit(trans, &iter);
-			bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p,
+			bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p,
 					     BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
 		}
 
-		bch2_btree_iter_set_pos(&iter, i->k.k.p);
+		bch2_btree_iter_set_pos(&iter, k->k.k.p);
 		iter.path->preserve = false;
 
 		do {
-			ret = bch2_btree_write_buffer_flush_one(trans, &iter, i,
-						commit_flags, &write_locked, &fast);
+			if (race_fault()) {
+				ret = -BCH_ERR_journal_reclaim_would_deadlock;
+				break;
+			}
+
+			ret = wb_flush_one(trans, &iter, k, &write_locked, &fast);
 			if (!write_locked)
 				bch2_trans_begin(trans);
 		} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
 
-		if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
+		if (!ret) {
+			k->journal_seq = 0;
+		} else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
 			slowpath++;
-			continue;
-		}
-		if (ret)
+			ret = 0;
+		} else
 			break;
-
-		i->journal_seq = 0;
 	}
 
 	if (write_locked)
 		bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
 	bch2_trans_iter_exit(trans, &iter);
 
-	trace_write_buffer_flush(trans, nr, skipped, fast, wb->size);
+	if (ret)
+		goto err;
 
-	if (slowpath)
-		goto slowpath;
+	if (slowpath) {
+		/*
+		 * Flush in the order they were present in the journal, so that
+		 * we can release journal pins:
+		 * The fastpath zapped the seq of keys that were successfully flushed so
+		 * we can skip those here.
+		 */
+		trace_write_buffer_flush_slowpath(trans, slowpath, wb->flushing.keys.nr);
 
-	bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
-out:
-	bch2_journal_pin_drop(j, &pin);
-	mutex_unlock(&wb->flush_lock);
-	return ret;
-slowpath:
-	trace_write_buffer_flush_slowpath(trans, i - keys, nr);
+		struct btree_write_buffered_key *i;
+		darray_for_each(wb->flushing.keys, i) {
+			if (!i->journal_seq)
+				continue;
 
-	/*
-	 * Now sort the rest by journal seq and bump the journal pin as we go.
-	 * The slowpath zapped the seq of keys that were successfully flushed so
-	 * we can skip those here.
-	 */
-	sort(keys, nr, sizeof(keys[0]),
-	     btree_write_buffered_journal_cmp,
-	     NULL);
+			bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
+						bch2_btree_write_buffer_journal_flush);
 
-	commit_flags &= ~BCH_WATERMARK_MASK;
-	commit_flags |= BCH_WATERMARK_reclaim;
+			bch2_trans_begin(trans);
 
-	for (i = keys; i < keys + nr; i++) {
-		if (!i->journal_seq)
-			continue;
-
-		bch2_journal_pin_update(j, i->journal_seq, &pin,
-					bch2_btree_write_buffer_journal_flush);
-
-		ret = commit_do(trans, NULL, NULL,
-				commit_flags|
-				BCH_TRANS_COMMIT_no_enospc|
-				BCH_TRANS_COMMIT_no_journal_res|
-				BCH_TRANS_COMMIT_journal_reclaim,
-				btree_write_buffered_insert(trans, i));
-		if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
-			break;
+			ret = commit_do(trans, NULL, NULL,
+					BCH_WATERMARK_reclaim|
+					BCH_TRANS_COMMIT_no_check_rw|
+					BCH_TRANS_COMMIT_no_enospc|
+					BCH_TRANS_COMMIT_no_journal_res|
+					BCH_TRANS_COMMIT_journal_reclaim,
+					btree_write_buffered_insert(trans, i));
+			if (ret)
+				goto err;
+		}
 	}
+err:
+	bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
+	trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
+	bch2_journal_pin_drop(j, &wb->flushing.pin);
+	wb->flushing.keys.nr = 0;
+	return ret;
+}
 
-	goto out;
+static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq)
+{
+	struct journal *j = &c->journal;
+	struct journal_buf *buf;
+	int ret = 0;
+
+	mutex_lock(&j->buf_lock);
+	while ((buf = bch2_next_write_buffer_flush_journal_buf(j, seq)))
+		if (bch2_journal_keys_to_write_buffer(c, buf)) {
+			ret = -ENOMEM;
+			break;
+		}
+	mutex_unlock(&j->buf_lock);
+
+	return ret;
 }
 
 int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
 {
+	struct bch_fs *c = trans->c;
+	struct btree_write_buffer *wb = &c->btree_write_buffer;
+	int ret = 0, fetch_from_journal_err;
+
+	trace_write_buffer_flush_sync(trans, _RET_IP_);
+retry:
 	bch2_trans_unlock(trans);
-	mutex_lock(&trans->c->btree_write_buffer.flush_lock);
-	return __bch2_btree_write_buffer_flush(trans, 0, true);
+
+	bch2_journal_block_reservations(&c->journal);
+	fetch_from_journal_err = fetch_wb_keys_from_journal(c, U64_MAX);
+	bch2_journal_unblock(&c->journal);
+
+	/*
+	 * On memory allocation failure, bch2_btree_write_buffer_flush_locked()
+	 * is not guaranteed to empty wb->inc:
+	 */
+	mutex_lock(&wb->flushing.lock);
+	while (!ret &&
+	       (wb->flushing.keys.nr || wb->inc.keys.nr))
+		ret = bch2_btree_write_buffer_flush_locked(trans);
+	mutex_unlock(&wb->flushing.lock);
+
+	if (!ret && fetch_from_journal_err)
+		goto retry;
+
+	return ret;
 }
 
-int bch2_btree_write_buffer_flush(struct btree_trans *trans)
+int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans)
 {
-	return __bch2_btree_write_buffer_flush(trans, 0, false);
+	struct bch_fs *c = trans->c;
+	struct btree_write_buffer *wb = &c->btree_write_buffer;
+	int ret = 0;
+
+	if (mutex_trylock(&wb->flushing.lock)) {
+		ret = bch2_btree_write_buffer_flush_locked(trans);
+		mutex_unlock(&wb->flushing.lock);
+	}
+
+	return ret;
+}
+
+int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
+{
+	struct bch_fs *c = trans->c;
+
+	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer))
+		return -BCH_ERR_erofs_no_writes;
+
+	int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans);
+	bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+	return ret;
 }
 
 static int bch2_btree_write_buffer_journal_flush(struct journal *j,
@@ -292,84 +461,195 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j,
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct btree_write_buffer *wb = &c->btree_write_buffer;
+	int ret, fetch_from_journal_err;
 
-	mutex_lock(&wb->flush_lock);
+	do {
+		fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq);
 
-	return bch2_trans_run(c,
-		__bch2_btree_write_buffer_flush(trans, BCH_TRANS_COMMIT_no_check_rw, true));
+		mutex_lock(&wb->flushing.lock);
+		ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans));
+		mutex_unlock(&wb->flushing.lock);
+	} while (!ret &&
+		 (fetch_from_journal_err ||
+		  (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq) ||
+		  (wb->inc.pin.seq && wb->inc.pin.seq <= seq)));
+
+	return ret;
 }
 
-static inline u64 btree_write_buffer_ref(int idx)
+static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
 {
-	return ((union btree_write_buffer_state) {
-		.ref0 = idx == 0,
-		.ref1 = idx == 1,
-	}).v;
-}
-
-int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans)
-{
-	struct bch_fs *c = trans->c;
+	struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work);
 	struct btree_write_buffer *wb = &c->btree_write_buffer;
-	struct btree_write_buffered_key *i;
-	union btree_write_buffer_state old, new;
-	int ret = 0;
-	u64 v;
+	int ret;
 
-	trans_for_each_wb_update(trans, i) {
-		EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+	mutex_lock(&wb->flushing.lock);
+	do {
+		ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans));
+	} while (!ret && bch2_btree_write_buffer_should_flush(c));
+	mutex_unlock(&wb->flushing.lock);
 
-		i->journal_seq = trans->journal_res.seq;
-		i->journal_offset = trans->journal_res.offset;
+	bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+}
+
+int __bch2_journal_key_to_wb(struct bch_fs *c,
+			     struct journal_keys_to_wb *dst,
+			     enum btree_id btree, struct bkey_i *k)
+{
+	struct btree_write_buffer *wb = &c->btree_write_buffer;
+	int ret;
+retry:
+	ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL);
+	if (!ret && dst->wb == &wb->flushing)
+		ret = darray_resize(&wb->sorted, wb->flushing.keys.size);
+
+	if (unlikely(ret)) {
+		if (dst->wb == &c->btree_write_buffer.flushing) {
+			mutex_unlock(&dst->wb->lock);
+			dst->wb = &c->btree_write_buffer.inc;
+			bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin,
+					     bch2_btree_write_buffer_journal_flush);
+			goto retry;
+		}
+
+		return ret;
 	}
 
-	preempt_disable();
-	v = READ_ONCE(wb->state.v);
-	do {
-		old.v = new.v = v;
+	dst->room = darray_room(dst->wb->keys);
+	if (dst->wb == &wb->flushing)
+		dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
+	BUG_ON(!dst->room);
+	BUG_ON(!dst->seq);
 
-		new.v += btree_write_buffer_ref(new.idx);
-		new.nr += trans->nr_wb_updates;
-		if (new.nr > wb->size) {
-			ret = -BCH_ERR_btree_insert_need_flush_buffer;
-			goto out;
+	struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
+	wb_k->journal_seq	= dst->seq;
+	wb_k->btree		= btree;
+	bkey_copy(&wb_k->k, k);
+	dst->wb->keys.nr++;
+	dst->room--;
+	return 0;
+}
+
+void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq)
+{
+	struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+	if (mutex_trylock(&wb->flushing.lock)) {
+		mutex_lock(&wb->inc.lock);
+		move_keys_from_inc_to_flushing(wb);
+
+		/*
+		 * Attempt to skip wb->inc, and add keys directly to
+		 * wb->flushing, saving us a copy later:
+		 */
+
+		if (!wb->inc.keys.nr) {
+			dst->wb = &wb->flushing;
+		} else {
+			mutex_unlock(&wb->flushing.lock);
+			dst->wb = &wb->inc;
 		}
-	} while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
+	} else {
+		mutex_lock(&wb->inc.lock);
+		dst->wb = &wb->inc;
+	}
 
-	memcpy(wb->keys[new.idx] + old.nr,
-	       trans->wb_updates,
-	       sizeof(trans->wb_updates[0]) * trans->nr_wb_updates);
+	dst->room = darray_room(dst->wb->keys);
+	if (dst->wb == &wb->flushing)
+		dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
+	dst->seq = seq;
 
-	bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin,
+	bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin,
 			     bch2_btree_write_buffer_journal_flush);
+}
 
-	atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter);
+void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
+{
+	struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+	if (!dst->wb->keys.nr)
+		bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
+
+	if (bch2_btree_write_buffer_should_flush(c) &&
+	    __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) &&
+	    !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work))
+		bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+
+	if (dst->wb == &wb->flushing)
+		mutex_unlock(&wb->flushing.lock);
+	mutex_unlock(&wb->inc.lock);
+}
+
+static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
+{
+	struct journal_keys_to_wb dst;
+	struct jset_entry *entry;
+	struct bkey_i *k;
+	int ret = 0;
+
+	bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
+
+	for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
+		jset_entry_for_each_key(entry, k) {
+			ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
+			if (ret)
+				goto out;
+		}
+
+		entry->type = BCH_JSET_ENTRY_btree_keys;
+	}
+
+	buf->need_flush_to_write_buffer = false;
 out:
-	preempt_enable();
+	bch2_journal_keys_to_write_buffer_end(c, &dst);
 	return ret;
 }
 
+static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size)
+{
+	if (wb->keys.size >= new_size)
+		return 0;
+
+	if (!mutex_trylock(&wb->lock))
+		return -EINTR;
+
+	int ret = darray_resize(&wb->keys, new_size);
+	mutex_unlock(&wb->lock);
+	return ret;
+}
+
+int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size)
+{
+	struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+	return wb_keys_resize(&wb->flushing, new_size) ?:
+		wb_keys_resize(&wb->inc, new_size);
+}
+
 void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
 {
 	struct btree_write_buffer *wb = &c->btree_write_buffer;
 
-	BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal));
+	BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) &&
+	       !bch2_journal_error(&c->journal));
 
-	kvfree(wb->keys[1]);
-	kvfree(wb->keys[0]);
+	darray_exit(&wb->sorted);
+	darray_exit(&wb->flushing.keys);
+	darray_exit(&wb->inc.keys);
 }
 
 int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
 {
 	struct btree_write_buffer *wb = &c->btree_write_buffer;
 
-	mutex_init(&wb->flush_lock);
-	wb->size = c->opts.btree_write_buffer_size;
+	mutex_init(&wb->inc.lock);
+	mutex_init(&wb->flushing.lock);
+	INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work);
 
-	wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL);
-	wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL);
-	if (!wb->keys[0] || !wb->keys[1])
-		return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init;
+	/* Will be resized by journal as needed: */
+	unsigned initial_size = 1 << 16;
 
-	return 0;
+	return  darray_make_room(&wb->inc.keys, initial_size) ?:
+		darray_make_room(&wb->flushing.keys, initial_size) ?:
+		darray_make_room(&wb->sorted, initial_size);
 }
diff --git a/libbcachefs/btree_write_buffer.h b/libbcachefs/btree_write_buffer.h
index 322df1c8..1f645f52 100644
--- a/libbcachefs/btree_write_buffer.h
+++ b/libbcachefs/btree_write_buffer.h
@@ -2,12 +2,59 @@
 #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
 #define _BCACHEFS_BTREE_WRITE_BUFFER_H
 
-int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool);
+#include "bkey.h"
+
+static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c)
+{
+	struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+	return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4;
+}
+
+static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c)
+{
+	struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+	return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4;
+}
+
+struct btree_trans;
 int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
-int bch2_btree_write_buffer_flush(struct btree_trans *);
+int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *);
+int bch2_btree_write_buffer_tryflush(struct btree_trans *);
 
-int bch2_btree_insert_keys_write_buffer(struct btree_trans *);
+struct journal_keys_to_wb {
+	struct btree_write_buffer_keys	*wb;
+	size_t				room;
+	u64				seq;
+};
+
+int __bch2_journal_key_to_wb(struct bch_fs *,
+			     struct journal_keys_to_wb *,
+			     enum btree_id, struct bkey_i *);
+
+static inline int bch2_journal_key_to_wb(struct bch_fs *c,
+					 struct journal_keys_to_wb *dst,
+					 enum btree_id btree, struct bkey_i *k)
+{
+	EBUG_ON(!dst->seq);
+
+	if (unlikely(!dst->room))
+		return __bch2_journal_key_to_wb(c, dst, btree, k);
+
+	struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
+	wb_k->journal_seq	= dst->seq;
+	wb_k->btree		= btree;
+	bkey_copy(&wb_k->k, k);
+	dst->wb->keys.nr++;
+	dst->room--;
+	return 0;
+}
+
+void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64);
+void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *);
+
+int bch2_btree_write_buffer_resize(struct bch_fs *, size_t);
 void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
 int bch2_fs_btree_write_buffer_init(struct bch_fs *);
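With this rework, write-buffer keys are no longer copied into the buffer at transaction commit; they are journalled as BCH_JSET_ENTRY_write_buffer_keys entries and later slurped out of journal buffers through the start/add/end API above. A sketch of the calling convention, under the assumption of a hypothetical single-key producer (the real in-tree caller is bch2_journal_keys_to_write_buffer() in btree_write_buffer.c above):

static int example_feed_one_key(struct bch_fs *c, u64 seq,
				enum btree_id btree, struct bkey_i *k)
{
	struct journal_keys_to_wb dst;
	int ret;

	/* Locks wb->inc (or wb->flushing) and pins journal seq: */
	bch2_journal_keys_to_write_buffer_start(c, &dst, seq);

	/* Fast path is inline; the slow path reallocates the darray: */
	ret = bch2_journal_key_to_wb(c, &dst, btree, k);

	/* Drops locks, kicks off a background flush if over threshold: */
	bch2_journal_keys_to_write_buffer_end(c, &dst);
	return ret;
}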
diff --git a/libbcachefs/btree_write_buffer_types.h b/libbcachefs/btree_write_buffer_types.h
index 99993ba7..9b9433de 100644
--- a/libbcachefs/btree_write_buffer_types.h
+++ b/libbcachefs/btree_write_buffer_types.h
@@ -2,43 +2,56 @@
 #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
 #define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
 
+#include "darray.h"
 #include "journal_types.h"
 
 #define BTREE_WRITE_BUFERED_VAL_U64s_MAX	4
 #define BTREE_WRITE_BUFERED_U64s_MAX	(BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
 
+struct wb_key_ref {
+union {
+	struct {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+		unsigned			idx:24;
+		u8				pos[sizeof(struct bpos)];
+		enum btree_id			btree:8;
+#else
+		enum btree_id			btree:8;
+		u8				pos[sizeof(struct bpos)];
+		unsigned			idx:24;
+#endif
+	} __packed;
+	struct {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+		u64 lo;
+		u64 mi;
+		u64 hi;
+#else
+		u64 hi;
+		u64 mi;
+		u64 lo;
+#endif
+	};
+};
+};
+
 struct btree_write_buffered_key {
-	u64			journal_seq;
-	unsigned		journal_offset;
-	enum btree_id		btree;
+	enum btree_id		btree:8;
+	u64			journal_seq:56;
 	__BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
 };
 
-union btree_write_buffer_state {
-	struct {
-		atomic64_t	counter;
-	};
-
-	struct {
-		u64		v;
-	};
-
-	struct {
-		u64		nr:23;
-		u64		idx:1;
-		u64		ref0:20;
-		u64		ref1:20;
-	};
+struct btree_write_buffer_keys {
+	DARRAY(struct btree_write_buffered_key) keys;
+	struct journal_entry_pin	pin;
+	struct mutex			lock;
 };
 
 struct btree_write_buffer {
-	struct mutex		flush_lock;
-	struct journal_entry_pin journal_pin;
-
-	union btree_write_buffer_state state;
-	size_t			size;
-
-	struct btree_write_buffered_key *keys[2];
+	DARRAY(struct wb_key_ref)	sorted;
+	struct btree_write_buffer_keys	inc;
+	struct btree_write_buffer_keys	flushing;
+	struct work_struct		flush_work;
 };
 
 #endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 58d8c6ff..5bfa102a 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -61,7 +61,7 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
 		usage->reserved += usage->persistent_reserved[i];
 
 	for (i = 0; i < c->replicas.nr; i++) {
-		struct bch_replicas_entry *e =
+		struct bch_replicas_entry_v1 *e =
 			cpu_replicas_entry(&c->replicas, i);
 
 		fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
@@ -214,7 +214,7 @@ void bch2_fs_usage_to_text(struct printbuf *out,
 	}
 
 	for (i = 0; i < c->replicas.nr; i++) {
-		struct bch_replicas_entry *e =
+		struct bch_replicas_entry_v1 *e =
 			cpu_replicas_entry(&c->replicas, i);
 
 		prt_printf(out, "\t");
@@ -345,7 +345,7 @@ static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
 
 static inline int __update_replicas(struct bch_fs *c,
 				    struct bch_fs_usage *fs_usage,
-				    struct bch_replicas_entry *r,
+				    struct bch_replicas_entry_v1 *r,
 				    s64 sectors)
 {
 	int idx = bch2_replicas_entry_idx(c, r);
@@ -359,7 +359,7 @@ static inline int __update_replicas(struct bch_fs *c,
 }
 
 static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
-			struct bch_replicas_entry *r, s64 sectors,
+			struct bch_replicas_entry_v1 *r, s64 sectors,
 			unsigned journal_seq, bool gc)
 {
 	struct bch_fs_usage *fs_usage;
@@ -453,9 +453,9 @@ int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
 			    __replicas_deltas_realloc(trans, more, _gfp));
 }
 
-static inline int update_replicas_list(struct btree_trans *trans,
-				       struct bch_replicas_entry *r,
-				       s64 sectors)
+int bch2_update_replicas_list(struct btree_trans *trans,
+			      struct bch_replicas_entry_v1 *r,
+			      s64 sectors)
 {
 	struct replicas_delta_list *d;
 	struct replicas_delta *n;
@@ -481,14 +481,13 @@ static inline int update_replicas_list(struct btree_trans *trans,
 	return 0;
 }
 
-static inline int update_cached_sectors_list(struct btree_trans *trans,
-					     unsigned dev, s64 sectors)
+int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors)
 {
 	struct bch_replicas_padded r;
 
 	bch2_replicas_entry_cached(&r.e, dev);
 
-	return update_replicas_list(trans, &r.e, sectors);
+	return bch2_update_replicas_list(trans, &r.e, sectors);
 }
 
 int bch2_mark_alloc(struct btree_trans *trans,
@@ -580,23 +579,6 @@ int bch2_mark_alloc(struct btree_trans *trans,
 	}
 	percpu_up_read(&c->mark_lock);
 
-	/*
-	 * need to know if we're getting called from the invalidate path or
-	 * not:
-	 */
-
-	if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
-	    old_a->cached_sectors) {
-		ret = update_cached_sectors(c, new, ca->dev_idx,
-					    -((s64) old_a->cached_sectors),
-					    journal_seq, gc);
-		if (ret) {
-			bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
-					    __func__);
-			return ret;
-		}
-	}
-
 	if (new_a->data_type == BCH_DATA_free &&
 	    (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
 		closure_wake_up(&c->freelist_wait);
@@ -1470,7 +1452,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
 	bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
 	r.e.data_type = data_type;
 
-	ret = update_replicas_list(trans, &r.e, sectors);
+	ret = bch2_update_replicas_list(trans, &r.e, sectors);
 err:
 	bch2_trans_iter_exit(trans, &iter);
 	return ret;
@@ -1513,8 +1495,8 @@ static int __trans_mark_extent(struct btree_trans *trans,
 
 		if (p.ptr.cached) {
 			if (!stale) {
-				ret = update_cached_sectors_list(trans, p.ptr.dev,
-								 disk_sectors);
+				ret = bch2_update_cached_sectors_list(trans, p.ptr.dev,
+								      disk_sectors);
 				if (ret)
 					return ret;
 			}
@@ -1532,7 +1514,7 @@ static int __trans_mark_extent(struct btree_trans *trans,
 	}
 
 	if (r.e.nr_devs)
-		ret = update_replicas_list(trans, &r.e, dirty_sectors);
+		ret = bch2_update_replicas_list(trans, &r.e, dirty_sectors);
 
 	return ret;
 }
@@ -1669,7 +1651,7 @@ int bch2_trans_mark_stripe(struct btree_trans *trans,
 		s64 sectors = le16_to_cpu(new_s->sectors);
 
 		bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new));
-		ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
+		ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
 		if (ret)
 			return ret;
 	}
@@ -1678,7 +1660,7 @@ int bch2_trans_mark_stripe(struct btree_trans *trans,
 		s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
 
 		bch2_bkey_to_replicas(&r.e, old);
-		ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
+		ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
 		if (ret)
 			return ret;
 	}
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 21f6cb35..5574b62e 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -315,6 +315,9 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
 		: c->usage[journal_seq & JOURNAL_BUF_MASK]);
 }
 
+int bch2_update_replicas_list(struct btree_trans *,
+			      struct bch_replicas_entry_v1 *, s64);
+int bch2_update_cached_sectors_list(struct btree_trans *, unsigned, s64);
 int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned);
 
 void bch2_fs_usage_initialize(struct bch_fs *);
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index 4bb88aef..de3d82de 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -444,7 +444,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
 	dst_end = (void *) arg->replicas + replica_entries_bytes;
 
 	for (i = 0; i < c->replicas.nr; i++) {
-		struct bch_replicas_entry *src_e =
+		struct bch_replicas_entry_v1 *src_e =
 			cpu_replicas_entry(&c->replicas, i);
 
 		/* check that we have enough space for one replicas entry */
diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c
index f4188909..c36bfc62 100644
--- a/libbcachefs/clock.c
+++ b/libbcachefs/clock.c
@@ -95,7 +95,6 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
 			      unsigned long io_until,
 			      unsigned long cpu_timeout)
 {
-	bool kthread = (current->flags & PF_KTHREAD) != 0;
 	struct io_clock_wait wait;
 
 	wait.io_timer.expire = io_until;
@@ -111,7 +110,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (kthread && kthread_should_stop())
+		if (kthread_should_stop())
 			break;
 
 		if (wait.expired)
diff --git a/libbcachefs/darray.c b/libbcachefs/darray.c
index aae07be1..4c900c85 100644
--- a/libbcachefs/darray.c
+++ b/libbcachefs/darray.c
@@ -9,10 +9,12 @@ int __bch2_darray_resize(darray_void *d, size_t element_size, size_t new_size, g
 	if (new_size > d->size) {
 		new_size = roundup_pow_of_two(new_size);
 
-		void *data = krealloc_array(d->data, new_size, element_size, gfp);
+		void *data = kvmalloc_array(new_size, element_size, gfp);
 		if (!data)
 			return -ENOMEM;
 
+		memcpy(data, d->data, d->size * element_size);
+		kvfree(d->data);
 		d->data	= data;
 		d->size = new_size;
 	}
-ENOMEM; + memcpy(data, d->data, d->size * element_size); + kvfree(d->data); d->data = data; d->size = new_size; } diff --git a/libbcachefs/darray.h b/libbcachefs/darray.h index 43ea21ad..6157c53d 100644 --- a/libbcachefs/darray.h +++ b/libbcachefs/darray.h @@ -92,7 +92,7 @@ do { \ #define darray_exit(_d) \ do { \ - kfree((_d)->data); \ + kvfree((_d)->data); \ darray_init(_d); \ } while (0) diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index c730f093..2a02bf00 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -1005,7 +1005,7 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) unsigned i, nr_data = v->nr_blocks - v->nr_redundant; int ret = 0; - ret = bch2_btree_write_buffer_flush(trans); + ret = bch2_btree_write_buffer_flush_sync(trans); if (ret) goto err; diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h index e2b02a82..976426da 100644 --- a/libbcachefs/ec_types.h +++ b/libbcachefs/ec_types.h @@ -5,7 +5,7 @@ #include "bcachefs_format.h" struct bch_replicas_padded { - struct bch_replicas_entry e; + struct bch_replicas_entry_v1 e; u8 pad[BCH_BKEY_PTRS_MAX]; }; diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index e5c3262c..e42b4529 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -150,7 +150,6 @@ x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ - x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \ x(0, backpointer_to_overwritten_btree_node) \ x(0, lock_fail_root_changed) \ x(0, journal_reclaim_would_deadlock) \ diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c index 5b42a76c..9a479e4d 100644 --- a/libbcachefs/fs-io-direct.c +++ b/libbcachefs/fs-io-direct.c @@ -35,9 +35,9 @@ static void bio_check_or_release(struct bio *bio, bool check_dirty) } } -static void bch2_dio_read_complete(struct closure *cl) +static CLOSURE_CALLBACK(bch2_dio_read_complete) { - struct dio_read *dio = container_of(cl, struct dio_read, cl); + closure_type(dio, struct dio_read, cl); dio->req->ki_complete(dio->req, dio->ret); bio_check_or_release(&dio->rbio.bio, dio->should_dirty); @@ -325,9 +325,9 @@ static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) return 0; } -static void bch2_dio_write_flush_done(struct closure *cl) +static CLOSURE_CALLBACK(bch2_dio_write_flush_done) { - struct dio_write *dio = container_of(cl, struct dio_write, op.cl); + closure_type(dio, struct dio_write, op.cl); struct bch_fs *c = dio->op.c; closure_debug_destroy(cl); diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index b0e8144e..31f40e58 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -861,7 +861,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, abs(pos_src - pos_dst) < len) return -EINVAL; - bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + lock_two_nondirectories(&src->v, &dst->v); + bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst); inode_dio_wait(&src->v); inode_dio_wait(&dst->v); @@ -914,7 +915,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, ret = bch2_flush_inode(c, dst); err: bch2_quota_reservation_put(c, dst, "a_res); - bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst); + unlock_two_nondirectories(&src->v, &dst->v); return bch2_err_class(ret); } diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 
d7c1b05a..5a39bcb5 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -453,35 +453,33 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, struct bch_ioctl_subvolume arg) { - struct filename *name; struct path path; struct inode *dir; - struct dentry *victim; int ret = 0; if (arg.flags) return -EINVAL; - name = getname((const char __user *)(unsigned long)arg.dst_ptr); - victim = filename_path_locked(arg.dirfd, name, &path); - putname(name); - if (IS_ERR(victim)) - return PTR_ERR(victim); + ret = user_path_at(arg.dirfd, + (const char __user *)(unsigned long)arg.dst_ptr, + LOOKUP_FOLLOW, &path); + if (ret) + return ret; - if (victim->d_sb->s_fs_info != c) { + if (path.dentry->d_sb->s_fs_info != c) { ret = -EXDEV; goto err; } - dir = d_inode(path.dentry); - ret = __bch2_unlink(dir, victim, true); - if (!ret) { - fsnotify_rmdir(dir, victim); - d_delete(victim); - } - inode_unlock(dir); + dir = path.dentry->d_parent->d_inode; + + ret = __bch2_unlink(dir, path.dentry, true); + if (ret) + goto err; + + fsnotify_rmdir(dir, path.dentry); + d_delete(path.dentry); err: - dput(victim); path_put(&path); return ret; } diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index f76d403c..0d0a37ca 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -1667,8 +1667,7 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root) if (!first) seq_putc(seq, ':'); first = false; - seq_puts(seq, "/dev/"); - seq_puts(seq, ca->name); + seq_puts(seq, ca->disk_sb.sb_name); } return 0; @@ -1901,7 +1900,7 @@ got_sb: sb->s_flags |= SB_POSIXACL; #endif - sb->s_shrink.seeks = 0; + sb->s_shrink->seeks = 0; vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); ret = PTR_ERR_OR_ZERO(vinode); diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index 5edf1d4b..c3af7225 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -77,9 +77,8 @@ static inline int ptrcmp(void *l, void *r) } enum bch_inode_lock_op { - INODE_LOCK = (1U << 0), - INODE_PAGECACHE_BLOCK = (1U << 1), - INODE_UPDATE_LOCK = (1U << 2), + INODE_PAGECACHE_BLOCK = (1U << 0), + INODE_UPDATE_LOCK = (1U << 1), }; #define bch2_lock_inodes(_locks, ...) 
\ @@ -91,8 +90,6 @@ do { \ \ for (i = 1; i < ARRAY_SIZE(a); i++) \ if (a[i] != a[i - 1]) { \ - if ((_locks) & INODE_LOCK) \ - down_write_nested(&a[i]->v.i_rwsem, i); \ if ((_locks) & INODE_PAGECACHE_BLOCK) \ bch2_pagecache_block_get(a[i]);\ if ((_locks) & INODE_UPDATE_LOCK) \ @@ -109,8 +106,6 @@ do { \ \ for (i = 1; i < ARRAY_SIZE(a); i++) \ if (a[i] != a[i - 1]) { \ - if ((_locks) & INODE_LOCK) \ - up_write(&a[i]->v.i_rwsem); \ if ((_locks) & INODE_PAGECACHE_BLOCK) \ bch2_pagecache_block_put(a[i]);\ if ((_locks) & INODE_UPDATE_LOCK) \ diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index cc90279f..b1c89a48 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -826,6 +826,18 @@ fsck_err: goto out; } +static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); + int ret = bkey_err(k); + if (ret) + return ret; + + bch2_trans_iter_exit(trans, &iter); + return k.k->type == KEY_TYPE_set; +} + static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -890,6 +902,17 @@ static int check_inode(struct btree_trans *trans, return 0; } + if (u.bi_flags & BCH_INODE_unlinked && + c->sb.version >= bcachefs_metadata_version_deleted_inodes) { + ret = check_inode_deleted_list(trans, k.k->p); + if (ret) + return ret; + + fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list, + "inode %llu:%u unlinked, but not on deleted list", + u.bi_inum, k.k->p.snapshot); + } + if (u.bi_flags & BCH_INODE_unlinked && (!c->sb.clean || fsck_err(c, inode_unlinked_but_clean, diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index b9d6dbf3..a2c96f7c 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -1157,10 +1157,6 @@ int bch2_delete_dead_inodes(struct bch_fs *c) again: need_another_pass = false; - ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; - /* * Weird transaction restart handling here because on successful delete, * bch2_inode_rm_snapshot() will return a nested transaction restart, @@ -1191,8 +1187,12 @@ again: } bch2_trans_iter_exit(trans, &iter); - if (!ret && need_another_pass) + if (!ret && need_another_pass) { + ret = bch2_btree_write_buffer_flush_sync(trans); + if (ret) + goto err; goto again; + } err: bch2_trans_put(trans); diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c index 75376f04..d6bd8f78 100644 --- a/libbcachefs/io_write.c +++ b/libbcachefs/io_write.c @@ -580,9 +580,9 @@ static inline void wp_update_state(struct write_point *wp, bool running) __wp_update_state(wp, state); } -static void bch2_write_index(struct closure *cl) +static CLOSURE_CALLBACK(bch2_write_index) { - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + closure_type(op, struct bch_write_op, cl); struct write_point *wp = op->wp; struct workqueue_struct *wq = index_update_wq(op); unsigned long flags; @@ -1208,9 +1208,9 @@ static void __bch2_nocow_write_done(struct bch_write_op *op) bch2_nocow_write_convert_unwritten(op); } -static void bch2_nocow_write_done(struct closure *cl) +static CLOSURE_CALLBACK(bch2_nocow_write_done) { - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + closure_type(op, struct bch_write_op, cl); __bch2_nocow_write_done(op); bch2_write_done(cl); @@ -1363,7 +1363,7 @@ err: op->insert_keys.top = op->insert_keys.keys; } else if (op->flags & BCH_WRITE_SYNC) { closure_sync(&op->cl); - bch2_nocow_write_done(&op->cl); + 
bch2_nocow_write_done(&op->cl.work); } else { /* * XXX @@ -1566,9 +1566,9 @@ err: * If op->discard is true, instead of inserting the data it invalidates the * region of the cache represented by op->bio and op->inode. */ -void bch2_write(struct closure *cl) +CLOSURE_CALLBACK(bch2_write) { - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + closure_type(op, struct bch_write_op, cl); struct bio *bio = &op->wbio.bio; struct bch_fs *c = op->c; unsigned data_len; diff --git a/libbcachefs/io_write.h b/libbcachefs/io_write.h index 93231672..6c276a48 100644 --- a/libbcachefs/io_write.h +++ b/libbcachefs/io_write.h @@ -90,8 +90,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->devs_need_flush = NULL; } -void bch2_write(struct closure *); - +CLOSURE_CALLBACK(bch2_write); void bch2_write_point_do_index_updates(struct work_struct *); static inline struct bch_write_bio *wbio_init(struct bio *bio) diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 7d448136..86b148d9 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -10,6 +10,7 @@ #include "bkey_methods.h" #include "btree_gc.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "error.h" #include "journal.h" @@ -147,6 +148,7 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) bch2_journal_reclaim_fast(j); if (write) closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + wake_up(&j->wait); } /* @@ -184,6 +186,8 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val) /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); + trace_journal_entry_close(c, vstruct_bytes(buf->data)); + sectors = vstruct_blocks_plus(buf->data, c->block_bits, buf->u64s_reserved) << c->block_bits; BUG_ON(sectors > buf->sectors); @@ -328,6 +332,7 @@ static int journal_entry_open(struct journal *j) buf->must_flush = false; buf->separate_flush = false; buf->flush_time = 0; + buf->need_flush_to_write_buffer = true; memset(buf->data, 0, sizeof(*buf->data)); buf->data->seq = cpu_to_le64(journal_cur_seq(j)); @@ -764,6 +769,75 @@ void bch2_journal_block(struct journal *j) journal_quiesce(j); } +/* + * XXX: ideally this would not be closing the current journal entry, but + * otherwise we do not have a way to avoid racing with res_get() - j->blocked + * will race. 
+ */ +static bool journal_reservations_stopped(struct journal *j) +{ + union journal_res_state s; + + journal_entry_close(j); + + s.v = atomic64_read_acquire(&j->reservations.counter); + + return s.buf0_count == 0 && + s.buf1_count == 0 && + s.buf2_count == 0 && + s.buf3_count == 0; +} + +void bch2_journal_block_reservations(struct journal *j) +{ + spin_lock(&j->lock); + j->blocked++; + spin_unlock(&j->lock); + + wait_event(j->wait, journal_reservations_stopped(j)); +} + +static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +{ + spin_lock(&j->lock); + max_seq = min(max_seq, journal_cur_seq(j)); + + for (u64 seq = journal_last_unwritten_seq(j); + seq <= max_seq; + seq++) { + unsigned idx = seq & JOURNAL_BUF_MASK; + struct journal_buf *buf = j->buf + idx; + union journal_res_state s; + + if (!buf->need_flush_to_write_buffer) + continue; + + if (seq == journal_cur_seq(j)) + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + + s.v = atomic64_read_acquire(&j->reservations.counter); + + if (journal_state_count(s, idx)) { + spin_unlock(&j->lock); + return ERR_PTR(-EAGAIN); + } + + spin_unlock(&j->lock); + return buf; + } + + spin_unlock(&j->lock); + return NULL; +} + +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +{ + struct journal_buf *ret; + + wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN)); + return ret; +} + /* allocate journal on a device: */ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, @@ -1215,6 +1289,7 @@ int bch2_fs_journal_init(struct journal *j) static struct lock_class_key res_key; unsigned i; + mutex_init(&j->buf_lock); spin_lock_init(&j->lock); spin_lock_init(&j->err_lock); init_waitqueue_head(&j->wait); diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index c85d01cf..b5185f97 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -259,7 +259,7 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u { union journal_res_state s; - s.v = atomic64_sub_return(((union journal_res_state) { + s.v = atomic64_sub_return_release(((union journal_res_state) { .buf0_count = idx == 0, .buf1_count = idx == 1, .buf2_count = idx == 2, @@ -427,6 +427,8 @@ static inline void bch2_journal_set_replay_done(struct journal *j) void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); +void bch2_journal_block_reservations(struct journal *); +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq); void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 109c1157..4ec5d5d3 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -4,6 +4,7 @@ #include "alloc_foreground.h" #include "btree_io.h" #include "btree_update_interior.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "checksum.h" #include "disk_groups.h" @@ -713,6 +714,22 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs journal_entry_btree_keys_to_text(out, c, entry); } +static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + return journal_entry_btree_keys_validate(c, jset, entry, + version, big_endian, 
READ); +} + +static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + journal_entry_btree_keys_to_text(out, c, entry); +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, @@ -1025,10 +1042,9 @@ next_block: return 0; } -static void bch2_journal_read_device(struct closure *cl) +static CLOSURE_CALLBACK(bch2_journal_read_device) { - struct journal_device *ja = - container_of(cl, struct journal_device, read); + closure_type(ja, struct journal_device, read); struct bch_dev *ca = container_of(ja, struct bch_dev, journal); struct bch_fs *c = ca->fs; struct journal_list *jlist = @@ -1494,6 +1510,8 @@ done: static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + /* we aren't holding j->lock: */ unsigned new_size = READ_ONCE(j->buf_size_want); void *new_buf; @@ -1501,6 +1519,11 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) if (buf->buf_size >= new_size) return; + size_t btree_write_buffer_size = new_size / 64; + + if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) + return; + new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); if (!new_buf) return; @@ -1520,9 +1543,9 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); } -static void journal_write_done(struct closure *cl) +static CLOSURE_CALLBACK(journal_write_done) { - struct journal *j = container_of(cl, struct journal, io); + closure_type(j, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *w = journal_last_unwritten_buf(j); struct bch_replicas_padded replicas; @@ -1590,6 +1613,7 @@ static void journal_write_done(struct closure *cl) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); + bch2_journal_reclaim_fast(j); bch2_journal_space_available(j); track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], @@ -1641,9 +1665,9 @@ static void journal_write_endio(struct bio *bio) percpu_ref_put(&ca->io_ref); } -static void do_journal_write(struct closure *cl) +static CLOSURE_CALLBACK(do_journal_write) { - struct journal *j = container_of(cl, struct journal, io); + closure_type(j, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; struct journal_buf *w = journal_last_unwritten_buf(j); @@ -1693,9 +1717,11 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct jset_entry *start, *end, *i, *next, *prev = NULL; struct jset *jset = w->data; + struct journal_keys_to_wb wb = { NULL }; unsigned sectors, bytes, u64s; - bool validate_before_checksum = false; unsigned long btree_roots_have = 0; + bool validate_before_checksum = false; + u64 seq = le64_to_cpu(jset->seq); int ret; /* @@ -1723,9 +1749,28 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) * to c->btree_roots we have to get any missing btree roots and * add them to this journal entry: */ - if (i->type == BCH_JSET_ENTRY_btree_root) { + switch (i->type) { + case BCH_JSET_ENTRY_btree_root: bch2_journal_entry_to_btree_root(c, i); __set_bit(i->btree_id, &btree_roots_have); + break; + case BCH_JSET_ENTRY_write_buffer_keys: + EBUG_ON(!w->need_flush_to_write_buffer); + + if (!wb.wb) + 
bch2_journal_keys_to_write_buffer_start(c, &wb, seq); + + struct bkey_i *k; + jset_entry_for_each_key(i, k) { + ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); + if (ret) { + bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer"); + bch2_journal_keys_to_write_buffer_end(c, &wb); + return ret; + } + } + i->type = BCH_JSET_ENTRY_btree_keys; + break; } /* Can we merge with previous entry? */ @@ -1748,6 +1793,10 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) memmove_u64s_down(prev, i, jset_u64s(u64s)); } + if (wb.wb) + bch2_journal_keys_to_write_buffer_end(c, &wb); + w->need_flush_to_write_buffer = false; + prev = prev ? vstruct_next(prev) : jset->start; jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); @@ -1755,8 +1804,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); - bch2_journal_super_entries_add_common(c, &end, - le64_to_cpu(jset->seq)); + bch2_journal_super_entries_add_common(c, &end, seq); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); @@ -1779,7 +1827,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) - j->last_empty_seq = le64_to_cpu(jset->seq); + j->last_empty_seq = seq; if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) validate_before_checksum = true; @@ -1838,7 +1886,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * (!w->must_flush && (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { - w->noflush = true; + w->noflush = true; SET_JSET_NO_FLUSH(w->data, true); w->data->last_seq = 0; w->last_seq = 0; @@ -1853,9 +1901,9 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * return 0; } -void bch2_journal_write(struct closure *cl) +CLOSURE_CALLBACK(bch2_journal_write) { - struct journal *j = container_of(cl, struct journal, io); + closure_type(j, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; struct journal_buf *w = journal_last_unwritten_buf(j); @@ -1875,9 +1923,11 @@ void bch2_journal_write(struct closure *cl) if (ret) goto err; + mutex_lock(&j->buf_lock); journal_buf_realloc(j, w); ret = bch2_journal_write_prep(j, w); + mutex_unlock(&j->buf_lock); if (ret) goto err; diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h index a88d097b..c035e7c1 100644 --- a/libbcachefs/journal_io.h +++ b/libbcachefs/journal_io.h @@ -60,6 +60,6 @@ void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); -void bch2_journal_write(struct closure *); +CLOSURE_CALLBACK(bch2_journal_write); #endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 8fa05bed..2aa4c0c6 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "btree_key_cache.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "errcode.h" #include "error.h" @@ -50,20 +51,23 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, return available; } -static inline void journal_set_watermark(struct journal *j) +void bch2_journal_set_watermark(struct journal *j) 
{ struct bch_fs *c = container_of(j, struct bch_fs, journal); bool low_on_space = j->space[journal_space_clean].total * 4 <= j->space[journal_space_total].total; bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4; - unsigned watermark = low_on_space || low_on_pin + bool low_on_wb = bch2_btree_write_buffer_must_wait(c); + unsigned watermark = low_on_space || low_on_pin || low_on_wb ? BCH_WATERMARK_reclaim : BCH_WATERMARK_stripe; if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], &j->low_on_space_start, low_on_space) || track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], - &j->low_on_pin_start, low_on_pin)) + &j->low_on_pin_start, low_on_pin) || + track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], + &j->write_buffer_full_start, low_on_wb)) trace_and_count(c, journal_full, c); swap(watermark, j->watermark); @@ -230,7 +234,7 @@ void bch2_journal_space_available(struct journal *j) else clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); - journal_set_watermark(j); + bch2_journal_set_watermark(j); out: j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; @@ -303,6 +307,7 @@ void bch2_journal_reclaim_fast(struct journal *j) * all btree nodes got written out */ while (!fifo_empty(&j->pin) && + j->pin.front <= j->seq_ondisk && !atomic_read(&fifo_peek_front(&j->pin).count)) { j->pin.front++; popped = true; @@ -635,7 +640,6 @@ static u64 journal_seq_to_flush(struct journal *j) static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - bool kthread = (current->flags & PF_KTHREAD) != 0; u64 seq_to_flush; size_t min_nr, min_key_cache, nr_flushed; unsigned flags; @@ -651,7 +655,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) flags = memalloc_noreclaim_save(); do { - if (kthread && kthread_should_stop()) + if (kthread_should_stop()) break; if (bch2_journal_error(j)) { diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h index 7b15d682..ec84c334 100644 --- a/libbcachefs/journal_reclaim.h +++ b/libbcachefs/journal_reclaim.h @@ -16,6 +16,7 @@ static inline void journal_reclaim_kick(struct journal *j) unsigned bch2_journal_dev_buckets_available(struct journal *, struct journal_device *, enum journal_space_from); +void bch2_journal_set_watermark(struct journal *); void bch2_journal_space_available(struct journal *); static inline bool journal_pin_active(struct journal_entry_pin *pin) diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 2427cce6..85c543af 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -36,6 +36,7 @@ struct journal_buf { bool noflush; /* write has already been kicked off, and was noflush */ bool must_flush; /* something wants a flush */ bool separate_flush; + bool need_flush_to_write_buffer; }; /* @@ -181,6 +182,12 @@ struct journal { */ darray_u64 early_journal_entries; + /* + * Protects journal_buf->data, when accessing without a journal + * reservation: for synchronization between the btree write buffer code + * and the journal write path: + */ + struct mutex buf_lock; /* * Two journal entries -- one is currently open for new entries, the * other is possibly being written out. 
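
[Editor's note: the journal_io.c hunk above drains BCH_JSET_ENTRY_write_buffer_keys entries into the btree write buffer at journal-write time, via the journal_keys_to_wb accumulator declared in btree_write_buffer.h. A minimal sketch of that pattern follows, using only helpers declared in this series; drain_one_entry() itself is a hypothetical wrapper, not a function from this diff, and the helpers' internals are assumed from their declarations.]

static int drain_one_entry(struct bch_fs *c, struct jset_entry *entry, u64 seq)
{
	struct journal_keys_to_wb wb = { NULL };
	struct bkey_i *k;
	int ret = 0;

	/* begin accumulating keys for this journal sequence number: */
	bch2_journal_keys_to_write_buffer_start(c, &wb, seq);

	jset_entry_for_each_key(entry, k) {
		/*
		 * The inline fast path copies into preallocated room;
		 * __bch2_journal_key_to_wb() is the slowpath that makes
		 * more room first:
		 */
		ret = bch2_journal_key_to_wb(c, &wb, entry->btree_id, k);
		if (ret)
			break;
	}

	bch2_journal_keys_to_write_buffer_end(c, &wb);
	return ret;
}
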
@@ -271,6 +278,7 @@ struct journal { u64 low_on_space_start; u64 low_on_pin_start; u64 max_in_flight_start; + u64 write_buffer_full_start; struct bch2_time_stats *flush_write_time; struct bch2_time_stats *noflush_write_time; diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 4f7d1758..cf36f2b0 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -27,6 +27,13 @@ #include #include +const char * const bch2_data_ops_strs[] = { +#define x(t, n, ...) [n] = #t, + BCH_DATA_OPS() +#undef x + NULL +}; + static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k) { if (trace_move_extent_enabled()) { @@ -163,12 +170,17 @@ void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) atomic_read(&ctxt->write_sectors) != sectors_pending); } +static void bch2_moving_ctxt_flush_all(struct moving_context *ctxt) +{ + move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); + closure_sync(&ctxt->cl); +} + void bch2_moving_ctxt_exit(struct moving_context *ctxt) { struct bch_fs *c = ctxt->trans->c; - move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); - closure_sync(&ctxt->cl); + bch2_moving_ctxt_flush_all(ctxt); EBUG_ON(atomic_read(&ctxt->write_sectors)); EBUG_ON(atomic_read(&ctxt->write_ios)); @@ -216,7 +228,7 @@ void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c) trace_move_data(c, stats); } -void bch2_move_stats_init(struct bch_move_stats *stats, char *name) +void bch2_move_stats_init(struct bch_move_stats *stats, const char *name) { memset(stats, 0, sizeof(*stats)); stats->data_type = BCH_DATA_user; @@ -484,8 +496,8 @@ int bch2_move_ratelimit(struct moving_context *ctxt) struct bch_fs *c = ctxt->trans->c; u64 delay; - if (ctxt->wait_on_copygc && !c->copygc_running) { - bch2_trans_unlock_long(ctxt->trans); + if (ctxt->wait_on_copygc && c->copygc_running) { + bch2_moving_ctxt_flush_all(ctxt); wait_event_killable(c->copygc_running_wq, !c->copygc_running || kthread_should_stop()); @@ -503,7 +515,7 @@ int bch2_move_ratelimit(struct moving_context *ctxt) set_current_state(TASK_INTERRUPTIBLE); } - if ((current->flags & PF_KTHREAD) && kthread_should_stop()) { + if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); return 1; } @@ -512,7 +524,7 @@ int bch2_move_ratelimit(struct moving_context *ctxt) schedule_timeout(delay); if (unlikely(freezing(current))) { - move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); + bch2_moving_ctxt_flush_all(ctxt); try_to_freeze(); } } while (delay); @@ -721,11 +733,10 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size; fragmentation = a->fragmentation_lru; - ret = bch2_btree_write_buffer_flush(trans); - if (ret) { - bch_err_msg(c, ret, "flushing btree write buffer"); + ret = bch2_btree_write_buffer_tryflush(trans); + bch_err_msg(c, ret, "flushing btree write buffer"); + if (ret) goto err; - } while (!(ret = bch2_move_ratelimit(ctxt))) { bch2_trans_begin(trans); @@ -856,18 +867,17 @@ typedef bool (*move_btree_pred)(struct bch_fs *, void *, struct data_update_opts *); static int bch2_move_btree(struct bch_fs *c, - enum btree_id start_btree_id, struct bpos start_pos, - enum btree_id end_btree_id, struct bpos end_pos, + struct bbpos start, + struct bbpos end, move_btree_pred pred, void *arg, struct bch_move_stats *stats) { - bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct moving_context ctxt; struct btree_trans *trans; struct btree_iter iter; struct btree *b; - enum btree_id id; + enum 
btree_id btree; struct data_update_opts data_opts; int ret = 0; @@ -878,26 +888,26 @@ static int bch2_move_btree(struct bch_fs *c, stats->data_type = BCH_DATA_btree; - for (id = start_btree_id; - id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); - id++) { - stats->pos = BBPOS(id, POS_MIN); + for (btree = start.btree; + btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); + btree ++) { + stats->pos = BBPOS(btree, POS_MIN); - if (!bch2_btree_id_root(c, id)->b) + if (!bch2_btree_id_root(c, btree)->b) continue; - bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0, + bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0, BTREE_ITER_PREFETCH); retry: ret = 0; while (bch2_trans_begin(trans), (b = bch2_btree_iter_peek_node(&iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { - if (kthread && kthread_should_stop()) + if (kthread_should_stop()) break; - if ((cmp_int(id, end_btree_id) ?: - bpos_cmp(b->key.k.p, end_pos)) > 0) + if ((cmp_int(btree, end.btree) ?: + bpos_cmp(b->key.k.p, end.pos)) > 0) break; stats->pos = BBPOS(iter.btree_id, iter.pos); @@ -918,7 +928,7 @@ next: bch2_trans_iter_exit(trans, &iter); - if (kthread && kthread_should_stop()) + if (kthread_should_stop()) break; } @@ -1034,8 +1044,8 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) int ret; ret = bch2_move_btree(c, - 0, POS_MIN, - BTREE_ID_NR, SPOS_MAX, + BBPOS_MIN, + BBPOS_MAX, rewrite_old_nodes_pred, c, stats); if (!ret) { mutex_lock(&c->sb_lock); @@ -1050,71 +1060,101 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) return ret; } +static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + unsigned durability = bch2_bkey_durability(c, k); + unsigned replicas = bkey_is_btree_ptr(k.k) + ? 
c->opts.metadata_replicas + : io_opts->data_replicas; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned i = 0; + + bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { + unsigned d = bch2_extent_ptr_durability(c, &p); + + if (d && durability - d >= replicas) { + data_opts->kill_ptrs |= BIT(i); + durability -= d; + } + + i++; + } + + return data_opts->kill_ptrs != 0; +} + +static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); +} + int bch2_data_job(struct bch_fs *c, struct bch_move_stats *stats, struct bch_ioctl_data op) { + struct bbpos start = BBPOS(op.start_btree, op.start_pos); + struct bbpos end = BBPOS(op.end_btree, op.end_pos); int ret = 0; + if (op.op >= BCH_DATA_OP_NR) + return -EINVAL; + + bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]); + switch (op.op) { - case BCH_DATA_OP_REREPLICATE: - bch2_move_stats_init(stats, "rereplicate"); + case BCH_DATA_OP_rereplicate: stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); - - ret = bch2_move_btree(c, - op.start_btree, op.start_pos, - op.end_btree, op.end_pos, + ret = bch2_move_btree(c, start, end, rereplicate_btree_pred, c, stats) ?: ret; - ret = bch2_replicas_gc2(c) ?: ret; - - ret = bch2_move_data(c, - (struct bbpos) { op.start_btree, op.start_pos }, - (struct bbpos) { op.end_btree, op.end_pos }, + ret = bch2_move_data(c, start, end, NULL, stats, writepoint_hashed((unsigned long) current), true, rereplicate_pred, c) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; - - bch2_move_stats_exit(stats, c); break; - case BCH_DATA_OP_MIGRATE: + case BCH_DATA_OP_migrate: if (op.migrate.dev >= c->sb.nr_devices) return -EINVAL; - bch2_move_stats_init(stats, "migrate"); stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); - - ret = bch2_move_btree(c, - op.start_btree, op.start_pos, - op.end_btree, op.end_pos, + ret = bch2_move_btree(c, start, end, migrate_btree_pred, &op, stats) ?: ret; - ret = bch2_replicas_gc2(c) ?: ret; - - ret = bch2_move_data(c, - (struct bbpos) { op.start_btree, op.start_pos }, - (struct bbpos) { op.end_btree, op.end_pos }, + ret = bch2_move_data(c, start, end, NULL, stats, writepoint_hashed((unsigned long) current), true, migrate_pred, &op) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; - - bch2_move_stats_exit(stats, c); break; - case BCH_DATA_OP_REWRITE_OLD_NODES: - bch2_move_stats_init(stats, "rewrite_old_nodes"); + case BCH_DATA_OP_rewrite_old_nodes: ret = bch2_scan_old_btree_nodes(c, stats); - bch2_move_stats_exit(stats, c); + break; + case BCH_DATA_OP_drop_extra_replicas: + ret = bch2_move_btree(c, start, end, + drop_extra_replicas_btree_pred, c, stats) ?: ret; + ret = bch2_move_data(c, start, end, NULL, stats, + writepoint_hashed((unsigned long) current), + true, + drop_extra_replicas_pred, c) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; break; default: ret = -EINVAL; } + bch2_move_stats_exit(stats, c); return ret; } diff --git a/libbcachefs/move.h b/libbcachefs/move.h index 07cf9d42..cedde6ee 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -56,6 +56,8 @@ do { \ typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, struct bch_io_opts *, struct data_update_opts *); +extern const char * const bch2_data_ops_strs[]; + void bch2_moving_ctxt_exit(struct moving_context *); void 
bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *, struct bch_ratelimit *, struct bch_move_stats *, @@ -130,7 +132,7 @@ int bch2_data_job(struct bch_fs *, void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *); void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *); -void bch2_move_stats_init(struct bch_move_stats *, char *); +void bch2_move_stats_init(struct bch_move_stats *, const char *); void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *); diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 0a057632..e884324b 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -153,8 +153,11 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, move_buckets_wait(ctxt, buckets_in_flight, false); - ret = bch2_btree_write_buffer_flush(trans); - if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()", + ret = bch2_btree_write_buffer_tryflush(trans); + if (bch2_err_matches(ret, EROFS)) + return ret; + + if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_tryflush()", __func__, bch2_err_str(ret))) return ret; diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 8526f177..b7f9990c 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -233,11 +233,6 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Stash pointer to in memory btree node in btree ptr")\ - x(btree_write_buffer_size, u32, \ - OPT_FS|OPT_MOUNT, \ - OPT_UINT(16, (1U << 20) - 1), \ - BCH2_NO_SB_OPT, 1U << 13, \ - NULL, "Number of btree write buffer entries") \ x(gc_reserve_percent, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(5, 21), \ diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 83fc121f..a94f2b5e 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -159,6 +159,8 @@ static int bch2_journal_replay(struct bch_fs *c) goto err; } + BUG_ON(!atomic_read(&keys->ref)); + /* * First, attempt to replay keys in sorted order. 
This is more * efficient - better locality of btree access - but some might fail if @@ -218,14 +220,15 @@ static int bch2_journal_replay(struct bch_fs *c) bch2_trans_put(trans); trans = NULL; + if (!c->opts.keep_journal) + bch2_journal_keys_put_initial(c); + replay_now_at(j, j->replay_journal_seq_end); j->replay_journal_seq = 0; bch2_journal_set_replay_done(j); - bch2_journal_flush_all_pins(j); - ret = bch2_journal_error(j); - if (keys->nr && !ret) + if (keys->nr) bch2_journal_log_msg(c, "journal replay finished"); err: if (trans) @@ -935,8 +938,12 @@ use_clean: bch2_move_stats_init(&stats, "recovery"); - bch_info(c, "scanning for old btree nodes"); - ret = bch2_fs_read_write(c) ?: + struct printbuf buf = PRINTBUF; + bch2_version_to_text(&buf, c->sb.version_min); + bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf); + printbuf_exit(&buf); + + ret = bch2_fs_read_write_early(c) ?: bch2_scan_old_btree_nodes(c, &stats); if (ret) goto err; @@ -953,10 +960,8 @@ out: bch2_flush_fsck_errs(c); if (!c->opts.keep_journal && - test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) { - bch2_journal_keys_free(&c->journal_keys); - bch2_journal_entries_free(c); - } + test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + bch2_journal_keys_put_initial(c); kfree(clean); if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) { diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index 1c3ae13b..820f9989 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -11,7 +11,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, /* Replicas tracking - in memory: */ -static void verify_replicas_entry(struct bch_replicas_entry *e) +static void verify_replicas_entry(struct bch_replicas_entry_v1 *e) { #ifdef CONFIG_BCACHEFS_DEBUG unsigned i; @@ -26,7 +26,7 @@ static void verify_replicas_entry(struct bch_replicas_entry *e) #endif } -void bch2_replicas_entry_sort(struct bch_replicas_entry *e) +void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) { bubble_sort(e->devs, e->nr_devs, u8_cmp); } @@ -53,7 +53,7 @@ static void bch2_replicas_entry_v0_to_text(struct printbuf *out, } void bch2_replicas_entry_to_text(struct printbuf *out, - struct bch_replicas_entry *e) + struct bch_replicas_entry_v1 *e) { unsigned i; @@ -71,7 +71,7 @@ void bch2_replicas_entry_to_text(struct printbuf *out, void bch2_cpu_replicas_to_text(struct printbuf *out, struct bch_replicas_cpu *r) { - struct bch_replicas_entry *e; + struct bch_replicas_entry_v1 *e; bool first = true; for_each_cpu_replicas_entry(r, e) { @@ -84,7 +84,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out, } static void extent_to_replicas(struct bkey_s_c k, - struct bch_replicas_entry *r) + struct bch_replicas_entry_v1 *r) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -104,7 +104,7 @@ static void extent_to_replicas(struct bkey_s_c k, } static void stripe_to_replicas(struct bkey_s_c k, - struct bch_replicas_entry *r) + struct bch_replicas_entry_v1 *r) { struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); const struct bch_extent_ptr *ptr; @@ -117,7 +117,7 @@ static void stripe_to_replicas(struct bkey_s_c k, r->devs[r->nr_devs++] = ptr->dev; } -void bch2_bkey_to_replicas(struct bch_replicas_entry *e, +void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e, struct bkey_s_c k) { e->nr_devs = 0; @@ -142,7 +142,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, bch2_replicas_entry_sort(e); } -void bch2_devlist_to_replicas(struct bch_replicas_entry *e, +void 
bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e, enum bch_data_type data_type, struct bch_devs_list devs) { @@ -164,7 +164,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e, static struct bch_replicas_cpu cpu_replicas_add_entry(struct bch_replicas_cpu *old, - struct bch_replicas_entry *new_entry) + struct bch_replicas_entry_v1 *new_entry) { unsigned i; struct bch_replicas_cpu new = { @@ -194,7 +194,7 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old, } static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, - struct bch_replicas_entry *search) + struct bch_replicas_entry_v1 *search) { int idx, entry_size = replicas_entry_bytes(search); @@ -212,7 +212,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, } int bch2_replicas_entry_idx(struct bch_fs *c, - struct bch_replicas_entry *search) + struct bch_replicas_entry_v1 *search) { bch2_replicas_entry_sort(search); @@ -220,13 +220,13 @@ int bch2_replicas_entry_idx(struct bch_fs *c, } static bool __replicas_has_entry(struct bch_replicas_cpu *r, - struct bch_replicas_entry *search) + struct bch_replicas_entry_v1 *search) { return __replicas_entry_idx(r, search) >= 0; } bool bch2_replicas_marked(struct bch_fs *c, - struct bch_replicas_entry *search) + struct bch_replicas_entry_v1 *search) { bool marked; @@ -343,7 +343,7 @@ err: static unsigned reserve_journal_replicas(struct bch_fs *c, struct bch_replicas_cpu *r) { - struct bch_replicas_entry *e; + struct bch_replicas_entry_v1 *e; unsigned journal_res_u64s = 0; /* nr_inodes: */ @@ -368,7 +368,7 @@ static unsigned reserve_journal_replicas(struct bch_fs *c, noinline static int bch2_mark_replicas_slowpath(struct bch_fs *c, - struct bch_replicas_entry *new_entry) + struct bch_replicas_entry_v1 *new_entry) { struct bch_replicas_cpu new_r, new_gc; int ret = 0; @@ -433,7 +433,7 @@ err: goto out; } -int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) +int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r) { return likely(bch2_replicas_marked(c, r)) ? 
0 : bch2_mark_replicas_slowpath(c, r); @@ -484,7 +484,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) { - struct bch_replicas_entry *e; + struct bch_replicas_entry_v1 *e; unsigned i = 0; lockdep_assert_held(&c->replicas_gc_lock); @@ -559,7 +559,7 @@ retry: } for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); if (e->data_type == BCH_DATA_journal || @@ -590,7 +590,7 @@ retry: } int bch2_replicas_set_usage(struct bch_fs *c, - struct bch_replicas_entry *r, + struct bch_replicas_entry_v1 *r, u64 sectors) { int ret, idx = bch2_replicas_entry_idx(c, r); @@ -623,7 +623,7 @@ static int __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, struct bch_replicas_cpu *cpu_r) { - struct bch_replicas_entry *e, *dst; + struct bch_replicas_entry_v1 *e, *dst; unsigned nr = 0, entry_size = 0, idx = 0; for_each_replicas_entry(sb_r, e) { @@ -661,7 +661,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, nr++; } - entry_size += sizeof(struct bch_replicas_entry) - + entry_size += sizeof(struct bch_replicas_entry_v1) - sizeof(struct bch_replicas_entry_v0); cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); @@ -672,7 +672,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, cpu_r->entry_size = entry_size; for_each_replicas_entry(sb_r, e) { - struct bch_replicas_entry *dst = + struct bch_replicas_entry_v1 *dst = cpu_replicas_entry(cpu_r, idx++); dst->data_type = e->data_type; @@ -716,7 +716,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, { struct bch_sb_field_replicas_v0 *sb_r; struct bch_replicas_entry_v0 *dst; - struct bch_replicas_entry *src; + struct bch_replicas_entry_v1 *src; size_t bytes; bytes = sizeof(struct bch_sb_field_replicas); @@ -754,7 +754,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, struct bch_replicas_cpu *r) { struct bch_sb_field_replicas *sb_r; - struct bch_replicas_entry *dst, *src; + struct bch_replicas_entry_v1 *dst, *src; bool need_v1 = false; size_t bytes; @@ -805,7 +805,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, memcmp, NULL); for (i = 0; i < cpu_r->nr; i++) { - struct bch_replicas_entry *e = + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(cpu_r, i); if (e->data_type >= BCH_DATA_NR) { @@ -835,7 +835,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, } if (i + 1 < cpu_r->nr) { - struct bch_replicas_entry *n = + struct bch_replicas_entry_v1 *n = cpu_replicas_entry(cpu_r, i + 1); BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); @@ -872,7 +872,7 @@ static void bch2_sb_replicas_to_text(struct printbuf *out, struct bch_sb_field *f) { struct bch_sb_field_replicas *r = field_to_type(f, replicas); - struct bch_replicas_entry *e; + struct bch_replicas_entry_v1 *e; bool first = true; for_each_replicas_entry(r, e) { @@ -934,7 +934,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, unsigned flags, bool print) { - struct bch_replicas_entry *e; + struct bch_replicas_entry_v1 *e; bool ret = true; percpu_down_read(&c->mark_lock); @@ -994,7 +994,7 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) replicas_v0 = bch2_sb_field_get(sb, replicas_v0); if (replicas) { - struct bch_replicas_entry *r; + struct bch_replicas_entry_v1 *r; for_each_replicas_entry(replicas, r) 
for (i = 0; i < r->nr_devs; i++) diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h index 4887675a..b2bb12a9 100644 --- a/libbcachefs/replicas.h +++ b/libbcachefs/replicas.h @@ -6,26 +6,26 @@ #include "eytzinger.h" #include "replicas_types.h" -void bch2_replicas_entry_sort(struct bch_replicas_entry *); +void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *); void bch2_replicas_entry_to_text(struct printbuf *, - struct bch_replicas_entry *); + struct bch_replicas_entry_v1 *); void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); -static inline struct bch_replicas_entry * +static inline struct bch_replicas_entry_v1 * cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) { return (void *) r->entries + r->entry_size * i; } int bch2_replicas_entry_idx(struct bch_fs *, - struct bch_replicas_entry *); + struct bch_replicas_entry_v1 *); -void bch2_devlist_to_replicas(struct bch_replicas_entry *, +void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *, enum bch_data_type, struct bch_devs_list); -bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); +bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *); int bch2_mark_replicas(struct bch_fs *, - struct bch_replicas_entry *); + struct bch_replicas_entry_v1 *); static inline struct replicas_delta * replicas_delta_next(struct replicas_delta *d) @@ -35,9 +35,9 @@ replicas_delta_next(struct replicas_delta *d) int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); -void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); +void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c); -static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, +static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e, unsigned dev) { e->data_type = BCH_DATA_cached; @@ -57,7 +57,7 @@ int bch2_replicas_gc_start(struct bch_fs *, unsigned); int bch2_replicas_gc2(struct bch_fs *); int bch2_replicas_set_usage(struct bch_fs *, - struct bch_replicas_entry *, + struct bch_replicas_entry_v1 *, u64); #define for_each_cpu_replicas_entry(_r, _i) \ diff --git a/libbcachefs/replicas_types.h b/libbcachefs/replicas_types.h index 5cfff489..03032407 100644 --- a/libbcachefs/replicas_types.h +++ b/libbcachefs/replicas_types.h @@ -5,12 +5,12 @@ struct bch_replicas_cpu { unsigned nr; unsigned entry_size; - struct bch_replicas_entry *entries; + struct bch_replicas_entry_v1 *entries; }; struct replicas_delta { s64 delta; - struct bch_replicas_entry r; + struct bch_replicas_entry_v1 r; } __packed; struct replicas_delta_list { diff --git a/libbcachefs/sb-clean.c b/libbcachefs/sb-clean.c index e151ada1..fedc9e10 100644 --- a/libbcachefs/sb-clean.c +++ b/libbcachefs/sb-clean.c @@ -235,7 +235,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, } for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); struct jset_entry_data_usage *u = container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), diff --git a/libbcachefs/sb-errors.h b/libbcachefs/sb-errors.h index 5a09a539..57bce14c 100644 --- a/libbcachefs/sb-errors.h +++ b/libbcachefs/sb-errors.h @@ -248,7 +248,8 @@ x(root_inode_not_dir, 240) \ x(dir_loop, 241) \ x(hash_table_key_duplicate, 242) \ - x(hash_table_key_wrong_offset, 243) + x(hash_table_key_wrong_offset, 243) \ + x(unlinked_inode_not_on_deleted_list, 244) enum bch_sb_error_id { #define x(t, n) 
BCH_FSCK_ERR_##t = n, diff --git a/libbcachefs/six.c b/libbcachefs/six.c index f37d5ad2..3a494c5d 100644 --- a/libbcachefs/six.c +++ b/libbcachefs/six.c @@ -324,7 +324,7 @@ bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, } EXPORT_SYMBOL_GPL(six_relock_ip); -#ifdef CONFIG_LOCK_SPIN_ON_OWNER +#ifdef CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN static inline bool six_owner_running(struct six_lock *lock) { diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index f4cad903..fa853478 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -166,6 +166,7 @@ void bch2_free_super(struct bch_sb_handle *sb) if (!IS_ERR_OR_NULL(sb->bdev)) blkdev_put(sb->bdev, sb->holder); kfree(sb->holder); + kfree(sb->sb_name); kfree(sb->sb); memset(sb, 0, sizeof(*sb)); @@ -657,12 +658,13 @@ reread: return 0; } -int bch2_read_super(const char *path, struct bch_opts *opts, - struct bch_sb_handle *sb) +int __bch2_read_super(const char *path, struct bch_opts *opts, + struct bch_sb_handle *sb, bool ignore_notbchfs_msg) { u64 offset = opt_get(*opts, sb); struct bch_sb_layout layout; struct printbuf err = PRINTBUF; + struct printbuf err2 = PRINTBUF; __le64 *i; int ret; #ifndef __KERNEL__ @@ -675,6 +677,10 @@ retry: if (!sb->holder) return -ENOMEM; + sb->sb_name = kstrdup(path, GFP_KERNEL); + if (!sb->sb_name) + return -ENOMEM; + #ifndef __KERNEL__ if (opt_get(*opts, direct_io) == false) sb->mode |= BLK_OPEN_BUFFERED; @@ -721,8 +727,14 @@ retry: if (opt_defined(*opts, sb)) goto err; - printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n", + prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n", path, err.buf); + if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg) + printk(KERN_INFO "%s", err2.buf); + else + printk(KERN_ERR "%s", err2.buf); + + printbuf_exit(&err2); printbuf_reset(&err); /* @@ -798,6 +810,20 @@ err_no_print: goto out; } +int bch2_read_super(const char *path, struct bch_opts *opts, + struct bch_sb_handle *sb) +{ + return __bch2_read_super(path, opts, sb, false); +} + +/* provide a silenced version for mount.bcachefs */ + +int bch2_read_super_silent(const char *path, struct bch_opts *opts, + struct bch_sb_handle *sb) +{ + return __bch2_read_super(path, opts, sb, true); +} + /* write superblock: */ static void write_super_endio(struct bio *bio) diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index f5abd102..e6f40a05 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -74,6 +74,7 @@ void bch2_free_super(struct bch_sb_handle *); int bch2_sb_realloc(struct bch_sb_handle *, unsigned); int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); +int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_write_super(struct bch_fs *); void __bch2_check_set_feature(struct bch_fs *, unsigned); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index bb945108..91f75717 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -314,7 +314,8 @@ void bch2_fs_read_only(struct bch_fs *c) BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); BUG_ON(atomic_read(&c->btree_cache.dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); - BUG_ON(c->btree_write_buffer.state.nr); + BUG_ON(c->btree_write_buffer.inc.keys.nr); + BUG_ON(c->btree_write_buffer.flushing.keys.nr); bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c); @@ -504,8 +505,8 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[WRITE]); 
bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); - bch2_journal_keys_free(&c->journal_keys); - bch2_journal_entries_free(c); + bch2_journal_keys_put_initial(c); + BUG_ON(atomic_read(&c->journal_keys.ref)); bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); free_percpu(c->online_reserved); @@ -704,6 +705,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) init_rwsem(&c->gc_lock); mutex_init(&c->gc_gens_lock); + atomic_set(&c->journal_keys.ref, 1); + c->journal_keys.initial_ref_held = true; for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h index 7dda4985..9c1fd4ca 100644 --- a/libbcachefs/super_types.h +++ b/libbcachefs/super_types.h @@ -5,6 +5,7 @@ struct bch_sb_handle { struct bch_sb *sb; struct block_device *bdev; + char *sb_name; struct bio *bio; void *holder; size_t buffer_size; diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 8df45da5..264c46b4 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -496,7 +496,7 @@ STORE(bch2_fs) sc.gfp_mask = GFP_KERNEL; sc.nr_to_scan = strtoul_or_return(buf); - c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); + c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc); } if (attr == &sysfs_btree_wakeup) diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h index 09a53032..7b24e7fe 100644 --- a/libbcachefs/trace.h +++ b/libbcachefs/trace.h @@ -188,6 +188,25 @@ DEFINE_EVENT(bch_fs, journal_entry_full, TP_ARGS(c) ); +TRACE_EVENT(journal_entry_close, + TP_PROTO(struct bch_fs *c, unsigned bytes), + TP_ARGS(c, bytes), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u32, bytes ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->bytes = bytes; + ), + + TP_printk("%d,%d entry bytes %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->bytes) +); + DEFINE_EVENT(bio, journal_write, TP_PROTO(struct bio *bio), TP_ARGS(bio) @@ -1313,21 +1332,38 @@ TRACE_EVENT(write_buffer_flush, __entry->nr, __entry->size, __entry->skipped, __entry->fast) ); -TRACE_EVENT(write_buffer_flush_slowpath, - TP_PROTO(struct btree_trans *trans, size_t nr, size_t size), - TP_ARGS(trans, nr, size), +TRACE_EVENT(write_buffer_flush_sync, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), + TP_ARGS(trans, caller_ip), TP_STRUCT__entry( - __field(size_t, nr ) - __field(size_t, size ) + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) ), TP_fast_assign( - __entry->nr = nr; - __entry->size = size; + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; ), - TP_printk("%zu/%zu", __entry->nr, __entry->size) + TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) +); + +TRACE_EVENT(write_buffer_flush_slowpath, + TP_PROTO(struct btree_trans *trans, size_t slowpath, size_t total), + TP_ARGS(trans, slowpath, total), + + TP_STRUCT__entry( + __field(size_t, slowpath ) + __field(size_t, total ) + ), + + TP_fast_assign( + __entry->slowpath = slowpath; + __entry->total = total; + ), + + TP_printk("%zu/%zu", __entry->slowpath, __entry->total) ); #endif /* _TRACE_BCACHEFS_H */ diff --git a/linux/closure.c b/linux/closure.c index f86c9eea..c1654055 100644 --- a/linux/closure.c +++ b/linux/closure.c @@ -36,7 +36,7 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) closure_debug_destroy(cl); if (destructor) - destructor(cl); + destructor(&cl->work); if (parent) closure_put(parent); 
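
[Editor's note: every closure callback in this series is converted the same way - the function now receives the embedded work_struct, and the CLOSURE_CALLBACK()/closure_type() pair (used above in fs-io-direct.c, io_write.c, journal_io.c, and here in closure.c) recovers the enclosing object. A before/after sketch; struct my_op and my_op_done are hypothetical names, not from this diff:]

struct my_op {
	struct closure	cl;
	int		ret;
};

/* Old style: the callback took the closure directly: */
static void my_op_done_old(struct closure *cl)
{
	struct my_op *op = container_of(cl, struct my_op, cl);

	pr_info("op done: %i\n", op->ret);
}

/*
 * New style: the callback takes a work_struct; closure_type() recovers
 * first the closure, then the containing struct my_op:
 */
static CLOSURE_CALLBACK(my_op_done)
{
	closure_type(op, struct my_op, cl);

	pr_info("op done: %i\n", op->ret);
}

[The same shape explains the one manual call-site conversion in io_write.c above, where the synchronous path now invokes bch2_nocow_write_done(&op->cl.work) rather than passing the closure itself.]
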
@@ -108,8 +108,9 @@ struct closure_syncer { int done; }; -static void closure_sync_fn(struct closure *cl) +static CLOSURE_CALLBACK(closure_sync_fn) { + struct closure *cl = container_of(ws, struct closure, work); struct closure_syncer *s = cl->s; struct task_struct *p; diff --git a/linux/shrinker.c b/linux/shrinker.c index dfefad40..eae2df6a 100644 --- a/linux/shrinker.c +++ b/linux/shrinker.c @@ -12,7 +12,12 @@ static LIST_HEAD(shrinker_list); static DEFINE_MUTEX(shrinker_lock); -int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) +struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...) +{ + return calloc(sizeof(struct shrinker), 1); +} + +int shrinker_register(struct shrinker *shrinker) { mutex_lock(&shrinker_lock); list_add_tail(&shrinker->list, &shrinker_list);
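
[Editor's note: the shrinker.c shim above mirrors the kernel's newer split of allocation and registration, and the sysfs.c hunk shows why callers change - c->btree_cache.shrink is now a pointer. A sketch of how a caller migrates, written against the shim's signatures; struct my_cache and the count/scan stubs are hypothetical, not taken from this diff:]

struct my_cache {
	struct shrinker *shrink;
};

static unsigned long my_cache_count(struct shrinker *s, struct shrink_control *sc)
{
	return 0;	/* stub: nothing to reclaim */
}

static unsigned long my_cache_scan(struct shrinker *s, struct shrink_control *sc)
{
	return 0;	/* stub: nothing freed */
}

static int my_cache_shrinker_init(struct my_cache *c)
{
	/* old API: register_shrinker(&embedded_shrinker, "name", ...) */
	struct shrinker *s = shrinker_alloc(0, "my-cache");

	if (!s)
		return -ENOMEM;

	s->count_objects = my_cache_count;
	s->scan_objects	 = my_cache_scan;
	c->shrink	 = s;

	/* the shim keeps register_shrinker()'s int return: */
	return shrinker_register(s);
}
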