From e783d814e83b2309930e1f6459212da6da8c8a54 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Apr 2017 20:11:10 -0800 Subject: [PATCH] Update bcachefs sources to da037866e6 --- .bcachefs_revision | 2 +- cmd_format.c | 6 - include/linux/bitops.h | 25 ---- include/linux/log2.h | 25 ++++ libbcachefs.c | 11 -- libbcachefs.h | 1 - libbcachefs/bcachefs_format.h | 2 +- libbcachefs/bset.h | 6 + libbcachefs/btree_cache.c | 70 +++++----- libbcachefs/btree_gc.c | 2 +- libbcachefs/btree_io.c | 60 ++++----- libbcachefs/btree_io.h | 32 ++++- libbcachefs/btree_update.c | 60 ++++----- libbcachefs/chardev.c | 2 +- libbcachefs/fifo.h | 37 ++--- libbcachefs/journal.c | 246 ++++++++++++++++++++-------------- libbcachefs/journal.h | 18 +-- libbcachefs/journal_types.h | 17 ++- libbcachefs/super-io.c | 7 - libbcachefs/super.c | 13 +- libbcachefs/sysfs.c | 3 - libbcachefs/util.h | 31 +++-- 22 files changed, 351 insertions(+), 325 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index f4cee9aa..35e8c14b 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -297c81ae4d608707fdabedc60158ff1f4fbec257 +da037866e669b09edc6b049ce09535d3456474cb diff --git a/cmd_format.c b/cmd_format.c index ae6dd33d..a4d12d77 100644 --- a/cmd_format.c +++ b/cmd_format.c @@ -41,7 +41,6 @@ x(0, metadata_replicas, "#", NULL) \ x(0, encrypted, NULL, "Enable whole filesystem encryption (chacha20/poly1305)")\ x(0, no_passphrase, NULL, "Don't encrypt master encryption key")\ x('e', error_action, "(continue|readonly|panic)", NULL) \ -x(0, max_journal_entry_size, "size", NULL) \ x('L', label, "label", NULL) \ x('U', uuid, "uuid", NULL) \ x('f', force, NULL, NULL) \ @@ -80,7 +79,6 @@ static void usage(void) " --no_passphrase Don't encrypt master encryption key\n" " --error_action=(continue|readonly|panic)\n" " Action to take on filesystem error\n" - " --max_journal_entry_size=size\n" " -l, --label=label\n" " --uuid=uuid\n" " -f, --force\n" @@ -185,10 +183,6 @@ int cmd_format(int argc, char *argv[]) read_string_list_or_die(optarg, bch2_error_actions, "error action"); break; - case O_max_journal_entry_size: - opts.max_journal_entry_size = - hatoi_validate(optarg, "journal entry size"); - break; case O_label: case 'L': opts.label = strdup(optarg); diff --git a/include/linux/bitops.h b/include/linux/bitops.h index a0c6508c..47fffb79 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -247,29 +247,4 @@ unsigned long rounddown_pow_of_two(unsigned long n) return 1UL << (fls_long(n) - 1); } -static inline __attribute_const__ -int __get_order(unsigned long size) -{ - int order; - - size--; - size >>= PAGE_SHIFT; -#if BITS_PER_LONG == 32 - order = fls(size); -#else - order = fls64(size); -#endif - return order; -} - -#define get_order(n) \ -( \ - __builtin_constant_p(n) ? ( \ - ((n) == 0UL) ? BITS_PER_LONG - PAGE_SHIFT : \ - (((n) < (1UL << PAGE_SHIFT)) ? 0 : \ - ilog2((n) - 1) - PAGE_SHIFT + 1) \ - ) : \ - __get_order(n) \ -) - #endif diff --git a/include/linux/log2.h b/include/linux/log2.h index 395cda29..6fecd393 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -184,4 +184,29 @@ unsigned long __rounddown_pow_of_two(unsigned long n) __rounddown_pow_of_two(n) \ ) +static inline __attribute_const__ +int __get_order(unsigned long size) +{ + int order; + + size--; + size >>= PAGE_SHIFT; +#if BITS_PER_LONG == 32 + order = fls(size); +#else + order = fls64(size); +#endif + return order; +} + +#define get_order(n) \ +( \ + __builtin_constant_p(n) ? ( \ + ((n) == 0UL) ? 
BITS_PER_LONG - PAGE_SHIFT : \ + (((n) < (1UL << PAGE_SHIFT)) ? 0 : \ + ilog2((n) - 1) - PAGE_SHIFT + 1) \ + ) : \ + __get_order(n) \ +) + #endif /* _TOOLS_LINUX_LOG2_H */ diff --git a/libbcachefs.c b/libbcachefs.c index 0fdf5da4..16bcd0c6 100644 --- a/libbcachefs.c +++ b/libbcachefs.c @@ -149,14 +149,6 @@ struct bch_sb *bch2_format(struct format_opts opts, min(opts.btree_node_size, i->bucket_size); } - if (!opts.max_journal_entry_size) { - /* 2 MB default: */ - opts.max_journal_entry_size = 4096; - } - - opts.max_journal_entry_size = - roundup_pow_of_two(opts.max_journal_entry_size); - if (uuid_is_null(opts.uuid.b)) uuid_generate(opts.uuid.b); @@ -191,7 +183,6 @@ struct bch_sb *bch2_format(struct format_opts opts, SET_BCH_SB_DATA_REPLICAS_REQ(sb, opts.data_replicas_required); SET_BCH_SB_ERROR_ACTION(sb, opts.on_error_action); SET_BCH_SB_STR_HASH_TYPE(sb, BCH_STR_HASH_SIPHASH); - SET_BCH_SB_JOURNAL_ENTRY_SIZE(sb, ilog2(opts.max_journal_entry_size)); struct timespec now; if (clock_gettime(CLOCK_REALTIME, &now)) @@ -319,7 +310,6 @@ void bch2_super_print(struct bch_sb *sb, int units) "Version: %llu\n" "Block_size: %s\n" "Btree node size: %s\n" - "Max journal entry size: %s\n" "Error action: %s\n" "Clean: %llu\n" @@ -342,7 +332,6 @@ void bch2_super_print(struct bch_sb *sb, int units) le64_to_cpu(sb->version), pr_units(le16_to_cpu(sb->block_size), units), pr_units(BCH_SB_BTREE_NODE_SIZE(sb), units), - pr_units(1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb), units), BCH_SB_ERROR_ACTION(sb) < BCH_NR_ERROR_ACTIONS ? bch2_error_actions[BCH_SB_ERROR_ACTION(sb)] diff --git a/libbcachefs.h b/libbcachefs.h index e5f3b867..35ff73b2 100644 --- a/libbcachefs.h +++ b/libbcachefs.h @@ -13,7 +13,6 @@ struct format_opts { uuid_le uuid; unsigned on_error_action; - unsigned max_journal_entry_size; /* will be removed */ unsigned block_size; unsigned btree_node_size; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 8d780d27..a99d96cd 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -971,7 +971,7 @@ LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); -LE64_BITMASK(BCH_SB_JOURNAL_ENTRY_SIZE, struct bch_sb, flags[1], 14, 20); +/* 14-20 unused, was JOURNAL_ENTRY_SIZE */ LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h index 76a83fcb..660a7283 100644 --- a/libbcachefs/bset.h +++ b/libbcachefs/bset.h @@ -191,6 +191,12 @@ bkey_unpack_key_format_checked(const struct btree *b, if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); + /* + * hack around a harmless race when compacting whiteouts + * for a write: + */ + dst2.needs_whiteout = dst.needs_whiteout; + BUG_ON(memcmp(&dst, &dst2, sizeof(dst))); } } diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index e98d9c16..bd47aecf 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -87,6 +87,7 @@ static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp) if (!b) return NULL; + bkey_extent_init(&b->key); six_lock_init(&b->lock); INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); @@ -141,8 +142,10 @@ static inline struct btree *mca_find(struct bch_fs *c, * this version is for btree nodes that have already been freed (we're not 
* reaping a real btree node) */ -static int mca_reap_notrace(struct bch_fs *c, struct btree *b, bool flush) +static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) { + int ret = 0; + lockdep_assert_held(&c->btree_cache_lock); if (!six_trylock_intent(&b->lock)) @@ -155,45 +158,48 @@ static int mca_reap_notrace(struct bch_fs *c, struct btree *b, bool flush) btree_node_noevict(b)) goto out_unlock; - if (!list_empty(&b->write_blocked)) + if (!btree_node_may_write(b)) goto out_unlock; - if (!flush && - (btree_node_dirty(b) || - btree_node_write_in_flight(b))) - goto out_unlock; + if (btree_node_dirty(b) || + btree_node_write_in_flight(b)) { + if (!flush) + goto out_unlock; - /* - * Using the underscore version because we don't want to compact bsets - * after the write, since this node is about to be evicted - unless - * btree verify mode is enabled, since it runs out of the post write - * cleanup: - */ - if (btree_node_dirty(b)) { + /* + * Using the underscore version because we don't want to compact + * bsets after the write, since this node is about to be evicted + * - unless btree verify mode is enabled, since it runs out of + * the post write cleanup: + */ if (verify_btree_ondisk(c)) - bch2_btree_node_write(c, b, NULL, SIX_LOCK_intent, -1); + bch2_btree_node_write(c, b, NULL, SIX_LOCK_intent); else - __bch2_btree_node_write(c, b, NULL, SIX_LOCK_read, -1); + __bch2_btree_node_write(c, b, NULL, SIX_LOCK_read); + + /* wait for any in flight btree write */ + btree_node_wait_on_io(b); } - - /* wait for any in flight btree write */ - wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, - TASK_UNINTERRUPTIBLE); - - return 0; +out: + if (PTR_HASH(&b->key)) + trace_btree_node_reap(c, b, ret); + return ret; out_unlock: six_unlock_write(&b->lock); out_unlock_intent: six_unlock_intent(&b->lock); - return -ENOMEM; + ret = -ENOMEM; + goto out; } -static int mca_reap(struct bch_fs *c, struct btree *b, bool flush) +static int btree_node_reclaim(struct bch_fs *c, struct btree *b) { - int ret = mca_reap_notrace(c, b, flush); + return __btree_node_reclaim(c, b, false); +} - trace_btree_node_reap(c, b, ret); - return ret; +static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) +{ + return __btree_node_reclaim(c, b, true); } static unsigned long bch2_mca_scan(struct shrinker *shrink, @@ -239,7 +245,7 @@ static unsigned long bch2_mca_scan(struct shrinker *shrink, break; if (++i > 3 && - !mca_reap_notrace(c, b, false)) { + !btree_node_reclaim(c, b)) { mca_data_free(c, b); six_unlock_write(&b->lock); six_unlock_intent(&b->lock); @@ -258,7 +264,7 @@ restart: } if (!btree_node_accessed(b) && - !mca_reap(c, b, false)) { + !btree_node_reclaim(c, b)) { /* can't call bch2_btree_node_hash_remove under btree_cache_lock */ freed++; if (&t->list != &c->btree_cache) @@ -445,12 +451,12 @@ static struct btree *mca_cannibalize(struct bch_fs *c) struct btree *b; list_for_each_entry_reverse(b, &c->btree_cache, list) - if (!mca_reap(c, b, false)) + if (!btree_node_reclaim(c, b)) return b; while (1) { list_for_each_entry_reverse(b, &c->btree_cache, list) - if (!mca_reap(c, b, true)) + if (!btree_node_write_and_reclaim(c, b)) return b; /* @@ -474,7 +480,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) * the list. 
Check if there's any freed nodes there: */ list_for_each_entry(b, &c->btree_cache_freeable, list) - if (!mca_reap_notrace(c, b, false)) + if (!btree_node_reclaim(c, b)) goto out_unlock; /* @@ -482,7 +488,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) * disk node. Check the freed list before allocating a new one: */ list_for_each_entry(b, &c->btree_cache_freed, list) - if (!mca_reap_notrace(c, b, false)) { + if (!btree_node_reclaim(c, b)) { mca_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO); if (b->data) goto out_unlock; diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index e07a3f97..fc06a63a 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -685,7 +685,7 @@ static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES], bch2_btree_build_aux_trees(n); six_unlock_write(&n->lock); - bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1); + bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent); } /* diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index d827692b..b56b1735 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1311,8 +1311,7 @@ static void btree_node_write_endio(struct bio *bio) void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, struct closure *parent, - enum six_lock_type lock_type_held, - int idx_to_write) + enum six_lock_type lock_type_held) { struct bio *bio; struct bch_write_bio *wbio; @@ -1344,14 +1343,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, if (!(old & (1 << BTREE_NODE_dirty))) return; - if (idx_to_write >= 0 && - idx_to_write != !!(old & (1 << BTREE_NODE_write_idx))) - return; - if (old & (1 << BTREE_NODE_write_in_flight)) { - wait_on_bit_io(&b->flags, - BTREE_NODE_write_in_flight, - TASK_UNINTERRUPTIBLE); + btree_node_wait_on_io(b); continue; } @@ -1614,37 +1607,29 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) */ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, struct closure *parent, - enum six_lock_type lock_type_held, - int idx_to_write) + enum six_lock_type lock_type_held) { BUG_ON(lock_type_held == SIX_LOCK_write); if (lock_type_held == SIX_LOCK_intent || six_trylock_convert(&b->lock, SIX_LOCK_read, SIX_LOCK_intent)) { - __bch2_btree_node_write(c, b, parent, SIX_LOCK_intent, idx_to_write); + __bch2_btree_node_write(c, b, parent, SIX_LOCK_intent); - six_lock_write(&b->lock); - bch2_btree_post_write_cleanup(c, b); - six_unlock_write(&b->lock); + /* don't cycle lock unnecessarily: */ + if (btree_node_just_written(b)) { + six_lock_write(&b->lock); + bch2_btree_post_write_cleanup(c, b); + six_unlock_write(&b->lock); + } if (lock_type_held == SIX_LOCK_read) six_lock_downgrade(&b->lock); } else { - __bch2_btree_node_write(c, b, parent, SIX_LOCK_read, idx_to_write); + __bch2_btree_node_write(c, b, parent, SIX_LOCK_read); } } -static void bch2_btree_node_write_dirty(struct bch_fs *c, struct btree *b, - struct closure *parent) -{ - six_lock_read(&b->lock); - BUG_ON(b->level); - - bch2_btree_node_write(c, b, parent, SIX_LOCK_read, -1); - six_unlock_read(&b->lock); -} - /* * Write all dirty btree nodes to disk, including roots */ @@ -1654,7 +1639,7 @@ void bch2_btree_flush(struct bch_fs *c) struct btree *b; struct bucket_table *tbl; struct rhash_head *pos; - bool dropped_lock; + bool saw_dirty; unsigned i; closure_init_stack(&cl); @@ -1662,26 +1647,27 @@ void bch2_btree_flush(struct bch_fs *c) rcu_read_lock(); do { - dropped_lock = false; + saw_dirty = false; i = 0; restart: tbl = 
rht_dereference_rcu(c->btree_cache_table.tbl, &c->btree_cache_table); for (; i < tbl->size; i++) - rht_for_each_entry_rcu(b, pos, tbl, i, hash) - /* - * XXX - locking for b->level, when called from - * bch2_journal_move() - */ - if (!b->level && btree_node_dirty(b)) { + rht_for_each_entry_rcu(b, pos, tbl, i, hash) { + saw_dirty |= btree_node_dirty(b); + + if (btree_node_dirty(b) && + btree_node_may_write(b)) { rcu_read_unlock(); - bch2_btree_node_write_dirty(c, b, &cl); - dropped_lock = true; + six_lock_read(&b->lock); + bch2_btree_node_write_dirty(c, b, &cl, 1); + six_unlock_read(&b->lock); rcu_read_lock(); goto restart; } - } while (dropped_lock); + } + } while (saw_dirty); rcu_read_unlock(); diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 290fb5d7..84731144 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -19,6 +19,17 @@ static inline void btree_node_io_lock(struct btree *b) TASK_UNINTERRUPTIBLE); } +static inline void btree_node_wait_on_io(struct btree *b) +{ + wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, + TASK_UNINTERRUPTIBLE); +} + +static inline bool btree_node_may_write(struct btree *b) +{ + return list_empty_careful(&b->write_blocked); +} + enum compact_mode { COMPACT_LAZY, COMPACT_WRITTEN, @@ -60,11 +71,28 @@ void bch2_btree_complete_write(struct bch_fs *, struct btree *, struct btree_write *); void __bch2_btree_node_write(struct bch_fs *, struct btree *, - struct closure *, enum six_lock_type, int); + struct closure *, enum six_lock_type); bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); void bch2_btree_node_write(struct bch_fs *, struct btree *, - struct closure *, enum six_lock_type, int); + struct closure *, enum six_lock_type); + +#define bch2_btree_node_write_dirty(_c, _b, _cl, cond) \ +do { \ + while ((_b)->written && btree_node_dirty(_b) && (cond)) { \ + if (!btree_node_may_write(_b)) \ + break; \ + \ + if (!btree_node_write_in_flight(_b)) { \ + bch2_btree_node_write(_c, _b, _cl, SIX_LOCK_read);\ + break; \ + } \ + \ + six_unlock_read(&(_b)->lock); \ + btree_node_wait_on_io(_b); \ + six_lock_read(&(_b)->lock); \ + } \ +} while (0) void bch2_btree_flush(struct bch_fs *); void bch2_btree_node_flush_journal_entries(struct bch_fs *, struct btree *, diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c index 51dff1b7..cdbc0de4 100644 --- a/libbcachefs/btree_update.c +++ b/libbcachefs/btree_update.c @@ -614,7 +614,7 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id, b = __btree_root_alloc(c, 0, id, reserve); - bch2_btree_node_write(c, b, writes, SIX_LOCK_intent, -1); + bch2_btree_node_write(c, b, writes, SIX_LOCK_intent); bch2_btree_set_root_initial(c, b, reserve); bch2_btree_open_bucket_put(c, b); @@ -750,39 +750,27 @@ overwrite: } static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, - unsigned i) + unsigned i, u64 seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct btree_write *w = container_of(pin, struct btree_write, journal); struct btree *b = container_of(w, struct btree, writes[i]); six_lock_read(&b->lock); - /* - * Reusing a btree node can race with the journal reclaim code calling - * the journal pin flush fn, and there's no good fix for this: we don't - * really want journal_pin_drop() to block until the flush fn is no - * longer running, because journal_pin_drop() is called from the btree - * node write endio function, and we can't wait on the flush fn to - * finish running in mca_reap() - where we make reused btree nodes 
ready - * to use again - because there, we're holding the lock this function - * needs - deadlock. - * - * So, the b->level check is a hack so we don't try to write nodes we - * shouldn't: - */ - if (!b->level) - bch2_btree_node_write(c, b, NULL, SIX_LOCK_read, i); + bch2_btree_node_write_dirty(c, b, NULL, + (btree_current_write(b) == w && + w->journal.pin_list == journal_seq_pin(j, seq))); six_unlock_read(&b->lock); } -static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin) +static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) { - return __btree_node_flush(j, pin, 0); + return __btree_node_flush(j, pin, 0, seq); } -static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin) +static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) { - return __btree_node_flush(j, pin, 1); + return __btree_node_flush(j, pin, 1, seq); } void bch2_btree_journal_key(struct btree_insert *trans, @@ -799,10 +787,11 @@ void bch2_btree_journal_key(struct btree_insert *trans, test_bit(JOURNAL_REPLAY_DONE, &j->flags)); if (!journal_pin_active(&w->journal)) - bch2_journal_pin_add(j, &w->journal, - btree_node_write_idx(b) == 0 - ? btree_node_flush0 - : btree_node_flush1); + bch2_journal_pin_add(j, &trans->journal_res, + &w->journal, + btree_node_write_idx(b) == 0 + ? btree_node_flush0 + : btree_node_flush1); if (trans->journal_res.ref) { u64 seq = trans->journal_res.seq; @@ -972,9 +961,9 @@ retry: closure_wait(&btree_current_write(b)->wait, cl); list_del(&as->write_blocked_list); + mutex_unlock(&c->btree_interior_update_lock); - if (list_empty(&b->write_blocked)) - bch2_btree_node_write(c, b, NULL, SIX_LOCK_read, -1); + bch2_btree_node_write_dirty(c, b, NULL, true); six_unlock_read(&b->lock); break; @@ -991,6 +980,7 @@ retry: * and then we have to wait on that btree_interior_update to finish: */ closure_wait(&as->parent_as->wait, cl); + mutex_unlock(&c->btree_interior_update_lock); break; case BTREE_INTERIOR_UPDATING_ROOT: @@ -1017,8 +1007,9 @@ retry: * can reuse the old nodes it'll have to do a journal commit: */ six_unlock_read(&b->lock); + mutex_unlock(&c->btree_interior_update_lock); + break; } - mutex_unlock(&c->btree_interior_update_lock); continue_at(cl, btree_interior_update_nodes_reachable, system_wq); } @@ -1083,7 +1074,8 @@ static void btree_interior_update_updated_root(struct bch_fs *c, system_freezable_wq); } -static void interior_update_flush(struct journal *j, struct journal_entry_pin *pin) +static void interior_update_flush(struct journal *j, + struct journal_entry_pin *pin, u64 seq) { struct btree_interior_update *as = container_of(pin, struct btree_interior_update, journal); @@ -1441,7 +1433,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter, six_unlock_write(&n2->lock); six_unlock_write(&n1->lock); - bch2_btree_node_write(c, n2, &as->cl, SIX_LOCK_intent, -1); + bch2_btree_node_write(c, n2, &as->cl, SIX_LOCK_intent); /* * Note that on recursive parent_keys == insert_keys, so we @@ -1461,7 +1453,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter, btree_split_insert_keys(iter, n3, &as->parent_keys, reserve); - bch2_btree_node_write(c, n3, &as->cl, SIX_LOCK_intent, -1); + bch2_btree_node_write(c, n3, &as->cl, SIX_LOCK_intent); } } else { trace_btree_node_compact(c, b, b->nr.live_u64s); @@ -1472,7 +1464,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter, bch2_keylist_add(&as->parent_keys, &n1->key); } - bch2_btree_node_write(c, n1, 
&as->cl, SIX_LOCK_intent, -1); + bch2_btree_node_write(c, n1, &as->cl, SIX_LOCK_intent); /* New nodes all written, now make them visible: */ @@ -1773,7 +1765,7 @@ retry: bch2_keylist_add(&as->parent_keys, &delete); bch2_keylist_add(&as->parent_keys, &n->key); - bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1); + bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent); bch2_btree_insert_node(parent, iter, &as->parent_keys, reserve, as); @@ -2323,7 +2315,7 @@ int bch2_btree_node_rewrite(struct btree_iter *iter, struct btree *b, trace_btree_gc_rewrite_node(c, b); - bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1); + bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent); if (parent) { bch2_btree_insert_node(parent, iter, diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index d3cfb00b..2d20061d 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -49,7 +49,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, if (ca->disk_sb.bdev == bdev) goto found; - ca = NULL; + ca = ERR_PTR(-ENOENT); found: bdput(bdev); } diff --git a/libbcachefs/fifo.h b/libbcachefs/fifo.h index a391277e..853815f8 100644 --- a/libbcachefs/fifo.h +++ b/libbcachefs/fifo.h @@ -1,45 +1,30 @@ #ifndef _BCACHE_FIFO_H #define _BCACHE_FIFO_H +#include "util.h" + #define DECLARE_FIFO(type, name) \ struct { \ size_t front, back, size, mask; \ type *data; \ } name +#define fifo_buf_size(fifo) \ + (roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0])) + #define init_fifo(fifo, _size, _gfp) \ ({ \ - bool _ret = true; \ - gfp_t gfp_flags = (_gfp); \ - \ - if (gfp_flags & GFP_KERNEL) \ - gfp_flags |= __GFP_NOWARN; \ - \ - (fifo)->size = (_size); \ (fifo)->front = (fifo)->back = 0; \ - (fifo)->data = NULL; \ - \ - if ((fifo)->size) { \ - size_t _allocated_size, _bytes; \ - \ - _allocated_size = roundup_pow_of_two((fifo)->size); \ - _bytes = _allocated_size * sizeof(*(fifo)->data); \ - \ - (fifo)->mask = _allocated_size - 1; \ - \ - if (_bytes < KMALLOC_MAX_SIZE) \ - (fifo)->data = kmalloc(_bytes, gfp_flags); \ - if ((!(fifo)->data) && (gfp_flags & GFP_KERNEL)) \ - (fifo)->data = vmalloc(_bytes); \ - if ((!(fifo)->data)) \ - _ret = false; \ - } \ - _ret; \ + (fifo)->size = (_size); \ + (fifo)->mask = (fifo)->size \ + ? 
roundup_pow_of_two((fifo)->size) - 1 \ + : 0; \ + (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ }) #define free_fifo(fifo) \ do { \ - kvfree((fifo)->data); \ + kvpfree((fifo)->data, fifo_buf_size(fifo)); \ (fifo)->data = NULL; \ } while (0) diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 0fc680b4..9e290618 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -138,7 +138,7 @@ static inline void bch2_journal_add_prios(struct journal *j, } static void journal_seq_blacklist_flush(struct journal *j, - struct journal_entry_pin *pin) + struct journal_entry_pin *pin, u64 seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); @@ -406,7 +406,8 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist, if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) break; list_del(&i->list); - kfree(i); + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); } list_for_each_entry_reverse(i, jlist->head, list) { @@ -429,7 +430,7 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist, where = jlist->head; add: - i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); + i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); if (!i) { ret = -ENOMEM; goto out; @@ -646,12 +647,16 @@ static int journal_read_buf_realloc(struct journal_read_buf *b, { void *n; + /* the bios are sized for this many pages, max: */ + if (new_size > JOURNAL_ENTRY_SIZE_MAX) + return -ENOMEM; + new_size = roundup_pow_of_two(new_size); - n = (void *) __get_free_pages(GFP_KERNEL, get_order(new_size)); + n = kvpmalloc(new_size, GFP_KERNEL); if (!n) return -ENOMEM; - free_pages((unsigned long) b->data, get_order(b->size)); + kvpfree(b->data, b->size); b->data = n; b->size = new_size; return 0; @@ -894,7 +899,7 @@ search_done: !read_bucket(i)) break; out: - free_pages((unsigned long) buf.data, get_order(buf.size)); + kvpfree(buf.data, buf.size); percpu_ref_put(&ca->io_ref); closure_return(cl); err: @@ -912,7 +917,8 @@ void bch2_journal_entries_free(struct list_head *list) struct journal_replay *i = list_first_entry(list, struct journal_replay, list); list_del(&i->list); - kvfree(i); + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); } } @@ -958,14 +964,14 @@ static inline bool journal_has_keys(struct list_head *list) int bch2_journal_read(struct bch_fs *c, struct list_head *list) { + struct journal *j = &c->journal; struct jset_entry *prio_ptrs; struct journal_list jlist; struct journal_replay *i; - struct jset *j; struct journal_entry_pin_list *p; struct bch_dev *ca; u64 cur_seq, end_seq; - unsigned iter; + unsigned iter, keys = 0, entries = 0; int ret = 0; closure_init_stack(&jlist.cl); @@ -994,63 +1000,59 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) fsck_err_on(c->sb.clean && journal_has_keys(list), c, "filesystem marked clean but journal has keys to replay"); - j = &list_entry(list->prev, struct journal_replay, list)->j; + i = list_last_entry(list, struct journal_replay, list); - unfixable_fsck_err_on(le64_to_cpu(j->seq) - - le64_to_cpu(j->last_seq) + 1 > - c->journal.pin.size, c, + unfixable_fsck_err_on(le64_to_cpu(i->j.seq) - + le64_to_cpu(i->j.last_seq) + 1 > j->pin.size, c, "too many journal entries open for refcount fifo"); - c->journal.pin.back = le64_to_cpu(j->seq) - - le64_to_cpu(j->last_seq) + 1; + atomic64_set(&j->seq, le64_to_cpu(i->j.seq)); + j->last_seq_ondisk = le64_to_cpu(i->j.last_seq); - atomic64_set(&c->journal.seq, le64_to_cpu(j->seq)); - 
c->journal.last_seq_ondisk = le64_to_cpu(j->last_seq); + j->pin.front = le64_to_cpu(i->j.last_seq); + j->pin.back = le64_to_cpu(i->j.seq) + 1; - BUG_ON(last_seq(&c->journal) != le64_to_cpu(j->last_seq)); - - i = list_first_entry(list, struct journal_replay, list); - - mutex_lock(&c->journal.blacklist_lock); - - fifo_for_each_entry_ptr(p, &c->journal.pin, iter) { - u64 seq = journal_pin_seq(&c->journal, p); + BUG_ON(last_seq(j) != le64_to_cpu(i->j.last_seq)); + BUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) != + &fifo_peek_back(&j->pin)); + fifo_for_each_entry_ptr(p, &j->pin, iter) { INIT_LIST_HEAD(&p->list); + atomic_set(&p->count, 0); + } - if (i && le64_to_cpu(i->j.seq) == seq) { - atomic_set(&p->count, 1); + mutex_lock(&j->blacklist_lock); - if (journal_seq_blacklist_read(&c->journal, i, p)) { - mutex_unlock(&c->journal.blacklist_lock); - return -ENOMEM; - } + list_for_each_entry(i, list, list) { + p = journal_seq_pin(j, le64_to_cpu(i->j.seq)); - i = list_is_last(&i->list, list) - ? NULL - : list_next_entry(i, list); - } else { - atomic_set(&p->count, 0); + atomic_set(&p->count, 1); + + if (journal_seq_blacklist_read(j, i, p)) { + mutex_unlock(&j->blacklist_lock); + return -ENOMEM; } } - mutex_unlock(&c->journal.blacklist_lock); + mutex_unlock(&j->blacklist_lock); - cur_seq = last_seq(&c->journal); + cur_seq = last_seq(j); end_seq = le64_to_cpu(list_last_entry(list, struct journal_replay, list)->j.seq); list_for_each_entry(i, list, list) { + struct jset_entry *entry; + struct bkey_i *k, *_n; bool blacklisted; - mutex_lock(&c->journal.blacklist_lock); + mutex_lock(&j->blacklist_lock); while (cur_seq < le64_to_cpu(i->j.seq) && - journal_seq_blacklist_find(&c->journal, cur_seq)) + journal_seq_blacklist_find(j, cur_seq)) cur_seq++; - blacklisted = journal_seq_blacklist_find(&c->journal, + blacklisted = journal_seq_blacklist_find(j, le64_to_cpu(i->j.seq)); - mutex_unlock(&c->journal.blacklist_lock); + mutex_unlock(&j->blacklist_lock); fsck_err_on(blacklisted, c, "found blacklisted journal entry %llu", @@ -1059,17 +1061,25 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c, "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", cur_seq, le64_to_cpu(i->j.seq) - 1, - last_seq(&c->journal), end_seq); + last_seq(j), end_seq); cur_seq = le64_to_cpu(i->j.seq) + 1; + + for_each_jset_key(k, _n, entry, &i->j) + keys++; + entries++; } - prio_ptrs = bch2_journal_find_entry(j, JOURNAL_ENTRY_PRIO_PTRS, 0); + bch_info(c, "journal read done, %i keys in %i entries, seq %llu", + keys, entries, (u64) atomic64_read(&j->seq)); + + i = list_last_entry(list, struct journal_replay, list); + prio_ptrs = bch2_journal_find_entry(&i->j, JOURNAL_ENTRY_PRIO_PTRS, 0); if (prio_ptrs) { - memcpy_u64s(c->journal.prio_buckets, + memcpy_u64s(j->prio_buckets, prio_ptrs->_data, le16_to_cpu(prio_ptrs->u64s)); - c->journal.nr_prio_buckets = le16_to_cpu(prio_ptrs->u64s); + j->nr_prio_buckets = le16_to_cpu(prio_ptrs->u64s); } fsck_err: return ret; @@ -1105,6 +1115,9 @@ static bool journal_entry_is_open(struct journal *j) void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *w = journal_prev_buf(j); + + atomic_dec_bug(&journal_seq_pin(j, w->data->seq)->count); if (!need_write_just_set && test_bit(JOURNAL_NEED_WRITE, &j->flags)) @@ -1120,8 +1133,7 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set) #endif } -static struct journal_entry_pin_list * -__journal_entry_new(struct journal *j, int count) +static void __journal_entry_new(struct journal *j, int count) { struct journal_entry_pin_list *p = fifo_push_ref(&j->pin); @@ -1131,25 +1143,18 @@ __journal_entry_new(struct journal *j, int count) */ atomic64_inc(&j->seq); - BUG_ON(journal_pin_seq(j, p) != atomic64_read(&j->seq)); + BUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) != + &fifo_peek_back(&j->pin)); INIT_LIST_HEAD(&p->list); atomic_set(&p->count, count); - - return p; } static void __bch2_journal_next_entry(struct journal *j) { - struct journal_entry_pin_list *p; struct journal_buf *buf; - p = __journal_entry_new(j, 1); - - if (test_bit(JOURNAL_REPLAY_DONE, &j->flags)) { - smp_wmb(); - j->cur_pin_list = p; - } + __journal_entry_new(j, 1); buf = journal_cur_buf(j); memset(buf->has_inode, 0, sizeof(buf->has_inode)); @@ -1181,6 +1186,8 @@ static enum { union journal_res_state old, new; u64 v = atomic64_read(&j->reservations.counter); + lockdep_assert_held(&j->lock); + do { old.v = new.v = v; if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) @@ -1221,7 +1228,6 @@ static enum { BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors); - atomic_dec_bug(&fifo_peek_back(&j->pin).count); __bch2_journal_next_entry(j); cancel_delayed_work(&j->write_work); @@ -1295,7 +1301,7 @@ static int journal_entry_sectors(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); - unsigned sectors_available = j->entry_size_max >> 9; + unsigned sectors_available = UINT_MAX; unsigned i, nr_online = 0, nr_devs = 0; lockdep_assert_held(&j->lock); @@ -1363,6 +1369,10 @@ static int journal_entry_open(struct journal *j) if (sectors <= 0) return sectors; + buf->disk_sectors = sectors; + + sectors = min_t(unsigned, sectors, buf->size >> 9); + j->cur_buf_sectors = sectors; buf->nr_prio_buckets = j->nr_prio_buckets; @@ -1464,18 +1474,15 @@ void bch2_journal_start(struct bch_fs *c) int bch2_journal_replay(struct bch_fs *c, struct list_head *list) { - int ret = 0, keys = 0, entries = 0; struct journal *j = &c->journal; struct bkey_i *k, *_n; struct jset_entry *entry; struct 
journal_replay *i, *n; + int ret = 0, did_replay = 0; list_for_each_entry_safe(i, n, list, list) { - j->cur_pin_list = - &j->pin.data[((j->pin.back - 1 - - (atomic64_read(&j->seq) - - le64_to_cpu(i->j.seq))) & - j->pin.mask)]; + j->replay_pin_list = + journal_seq_pin(j, le64_to_cpu(i->j.seq)); for_each_jset_key(k, _n, entry, &i->j) { struct disk_reservation disk_res; @@ -1499,16 +1506,16 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) } cond_resched(); - keys++; + did_replay = true; } - if (atomic_dec_and_test(&j->cur_pin_list->count)) + if (atomic_dec_and_test(&j->replay_pin_list->count)) wake_up(&j->wait); - - entries++; } - if (keys) { + j->replay_pin_list = NULL; + + if (did_replay) { bch2_btree_flush(c); /* @@ -1517,17 +1524,14 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) * arbitrarily far in the future vs. the most recently written journal * entry on disk, if we crash before writing the next journal entry: */ - ret = bch2_journal_meta(&c->journal); + ret = bch2_journal_meta(j); if (ret) { bch_err(c, "journal replay: error %d flushing journal", ret); goto err; } } - bch_info(c, "journal replay done, %i keys in %i entries, seq %llu", - keys, entries, (u64) atomic64_read(&j->seq)); - - bch2_journal_set_replay_done(&c->journal); + bch2_journal_set_replay_done(j); err: bch2_journal_entries_free(list); return ret; @@ -1763,11 +1767,16 @@ static void journal_pin_add_entry(struct journal *j, } void bch2_journal_pin_add(struct journal *j, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) + struct journal_res *res, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) { + struct journal_entry_pin_list *pin_list = res->ref + ? journal_seq_pin(j, res->seq) + : j->replay_pin_list; + spin_lock_irq(&j->pin_lock); - __journal_pin_add(j, j->cur_pin_list, pin, flush_fn); + __journal_pin_add(j, pin_list, pin, flush_fn); spin_unlock_irq(&j->pin_lock); } @@ -1828,7 +1837,7 @@ void bch2_journal_pin_add_if_older(struct journal *j, } static struct journal_entry_pin * -journal_get_next_pin(struct journal *j, u64 seq_to_flush) +journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *ret = NULL; @@ -1851,6 +1860,7 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush) if (ret) { /* must be list_del_init(), see bch2_journal_pin_drop() */ list_del_init(&ret->list); + *seq = journal_pin_seq(j, pin_list); break; } } @@ -1875,9 +1885,10 @@ static bool journal_has_pins(struct journal *j) void bch2_journal_flush_pins(struct journal *j) { struct journal_entry_pin *pin; + u64 seq; - while ((pin = journal_get_next_pin(j, U64_MAX))) - pin->flush(j, pin); + while ((pin = journal_get_next_pin(j, U64_MAX, &seq))) + pin->flush(j, pin, seq); wait_event(j->wait, !journal_has_pins(j) || bch2_journal_error(j)); } @@ -1920,7 +1931,7 @@ static void journal_reclaim_work(struct work_struct *work) struct journal *j = &c->journal; struct bch_dev *ca; struct journal_entry_pin *pin; - u64 seq_to_flush = 0; + u64 seq, seq_to_flush = 0; unsigned iter, bucket_to_flush; unsigned long next_flush; bool reclaim_lock_held = false, need_flush; @@ -1994,9 +2005,9 @@ static void journal_reclaim_work(struct work_struct *work) while ((pin = journal_get_next_pin(j, need_flush ? 
U64_MAX - : seq_to_flush))) { + : seq_to_flush, &seq))) { __set_current_state(TASK_RUNNING); - pin->flush(j, pin); + pin->flush(j, pin, seq); need_flush = false; j->last_flushed = jiffies; @@ -2196,17 +2207,39 @@ static void journal_write_done(struct closure *cl) mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0); } +static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) +{ + /* we aren't holding j->lock: */ + unsigned new_size = READ_ONCE(j->buf_size_want); + void *new_buf; + + if (buf->size >= new_size) + return; + + new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); + if (!new_buf) + return; + + memcpy(new_buf, buf->data, buf->size); + kvpfree(buf->data, buf->size); + buf->data = new_buf; + buf->size = new_size; +} + static void journal_write(struct closure *cl) { struct journal *j = container_of(cl, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; struct journal_buf *w = journal_prev_buf(j); - struct jset *jset = w->data; + struct jset *jset; struct bio *bio; struct bch_extent_ptr *ptr; unsigned i, sectors, bytes; + journal_buf_realloc(j, w); + jset = w->data; + j->write_start_time = local_clock(); bch2_journal_add_prios(j, w); @@ -2346,6 +2379,7 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, unsigned u64s_min, unsigned u64s_max) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *buf; int ret; retry: ret = journal_res_get_fast(j, res, u64s_min, u64s_max); @@ -2365,7 +2399,18 @@ retry: } /* - * Ok, no more room in the current journal entry - try to start a new + * If we couldn't get a reservation because the current buf filled up, + * and we had room for a bigger entry on disk, signal that we want to + * realloc the journal bufs: + */ + buf = journal_cur_buf(j); + if (journal_entry_is_open(j) && + buf->size >> 9 < buf->disk_sectors && + buf->size < JOURNAL_ENTRY_SIZE_MAX) + j->buf_size_want = max(j->buf_size_want, buf->size << 1); + + /* + * Close the current journal entry if necessary, then try to start a new * one: */ switch (journal_buf_switch(j, false)) { @@ -2765,11 +2810,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) struct journal_device *ja = &ca->journal; struct bch_sb_field_journal *journal_buckets = bch2_sb_get_journal(sb); - unsigned i, journal_entry_pages; - - journal_entry_pages = - DIV_ROUND_UP(1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb), - PAGE_SECTORS); + unsigned i; ja->nr = bch2_nr_journal_buckets(journal_buckets); @@ -2777,7 +2818,8 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) if (!ja->bucket_seq) return -ENOMEM; - ca->journal.bio = bio_kmalloc(GFP_KERNEL, journal_entry_pages); + ca->journal.bio = bio_kmalloc(GFP_KERNEL, + DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE)); if (!ca->journal.bio) return -ENOMEM; @@ -2793,17 +2835,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) void bch2_fs_journal_exit(struct journal *j) { - unsigned order = get_order(j->entry_size_max); - - free_pages((unsigned long) j->buf[1].data, order); - free_pages((unsigned long) j->buf[0].data, order); + kvpfree(j->buf[1].data, j->buf[1].size); + kvpfree(j->buf[0].data, j->buf[0].size); free_fifo(&j->pin); } -int bch2_fs_journal_init(struct journal *j, unsigned entry_size_max) +int bch2_fs_journal_init(struct journal *j) { static struct lock_class_key res_key; - unsigned order = get_order(entry_size_max); spin_lock_init(&j->lock); spin_lock_init(&j->pin_lock); @@ -2817,7 +2856,8 @@ int 
bch2_fs_journal_init(struct journal *j, unsigned entry_size_max) lockdep_init_map(&j->res_map, "journal res", &res_key, 0); - j->entry_size_max = entry_size_max; + j->buf[0].size = JOURNAL_ENTRY_SIZE_MIN; + j->buf[1].size = JOURNAL_ENTRY_SIZE_MIN; j->write_delay_ms = 100; j->reclaim_delay_ms = 100; @@ -2828,9 +2868,11 @@ int bch2_fs_journal_init(struct journal *j, unsigned entry_size_max) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || - !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) || - !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order))) + !(j->buf[0].data = kvpmalloc(j->buf[0].size, GFP_KERNEL)) || + !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL))) return -ENOMEM; + j->pin.front = j->pin.back = 1; + return 0; } diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 3825f0dc..9ad82c60 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -121,15 +121,21 @@ struct journal_replay { struct jset j; }; -#define JOURNAL_PIN ((32 * 1024) - 1) +#define JOURNAL_PIN (32 * 1024) static inline bool journal_pin_active(struct journal_entry_pin *pin) { return pin->pin_list != NULL; } -void bch2_journal_pin_add(struct journal *, struct journal_entry_pin *, - journal_pin_flush_fn); +static inline struct journal_entry_pin_list * +journal_seq_pin(struct journal *j, u64 seq) +{ + return &j->pin.data[(size_t) seq & j->pin.mask]; +} + +void bch2_journal_pin_add(struct journal *, struct journal_res *, + struct journal_entry_pin *, journal_pin_flush_fn); void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); void bch2_journal_pin_add_if_older(struct journal *, struct journal_entry_pin *, @@ -343,12 +349,8 @@ int bch2_journal_replay(struct bch_fs *, struct list_head *); static inline void bch2_journal_set_replay_done(struct journal *j) { - spin_lock(&j->lock); BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); - set_bit(JOURNAL_REPLAY_DONE, &j->flags); - j->cur_pin_list = &fifo_peek_back(&j->pin); - spin_unlock(&j->lock); } ssize_t bch2_journal_print_debug(struct journal *, char *); @@ -368,6 +370,6 @@ void bch2_fs_journal_stop(struct journal *); void bch2_dev_journal_exit(struct bch_dev *); int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); void bch2_fs_journal_exit(struct journal *); -int bch2_fs_journal_init(struct journal *, unsigned); +int bch2_fs_journal_init(struct journal *); #endif /* _BCACHE_JOURNAL_H */ diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index ebc340ad..75712aed 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -15,8 +15,12 @@ struct journal_res; */ struct journal_buf { struct jset *data; + struct closure_waitlist wait; + unsigned size; + unsigned disk_sectors; + /* * ugh, prio_buckets are stupid - need to convert them to new * transaction machinery when it arrives @@ -39,7 +43,8 @@ struct journal_entry_pin_list { struct journal; struct journal_entry_pin; -typedef void (*journal_pin_flush_fn)(struct journal *j, struct journal_entry_pin *); +typedef void (*journal_pin_flush_fn)(struct journal *j, + struct journal_entry_pin *, u64); struct journal_entry_pin { struct list_head list; @@ -90,11 +95,13 @@ union journal_res_state { }; }; -/* 4 mb, in bytes: */ -#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) +/* bytes: */ +#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ +#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ /* * We stash some journal state as sentinal values in cur_entry_offset: + * 
note - cur_entry_offset is in units of u64s */ #define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) @@ -123,7 +130,7 @@ struct journal { unsigned cur_entry_u64s; unsigned prev_buf_sectors; unsigned cur_buf_sectors; - unsigned entry_size_max; /* bytes */ + unsigned buf_size_want; /* * Two journal entries -- one is currently open for new entries, the @@ -162,7 +169,7 @@ struct journal { * longer needed, the bucket can be discarded and reused. */ DECLARE_FIFO(struct journal_entry_pin_list, pin); - struct journal_entry_pin_list *cur_pin_list; + struct journal_entry_pin_list *replay_pin_list; /* * Protects the pin lists - the fifo itself is still protected by diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 9f41d71d..fa020af3 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -377,13 +377,6 @@ const char *bch2_validate_cache_super(struct bcache_superblock *disk_sb) if (BCH_SB_GC_RESERVE(sb) < 5) return "gc reserve percentage too small"; - if (1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb) < block_size) - return "max journal entry size too small"; - - /* 4 mb max: */ - if (512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb) > JOURNAL_ENTRY_SIZE_MAX) - return "max journal entry size too big"; - if (!sb->time_precision || le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) return "invalid time precision"; diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 19f96921..6cbfc801 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -460,14 +460,11 @@ void bch2_fs_stop(struct bch_fs *c) bch2_fs_exit(c); } -#define alloc_bucket_pages(gfp, ca) \ - ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca)))) - static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) { struct bch_sb_field_members *mi; struct bch_fs *c; - unsigned i, iter_size, journal_entry_bytes; + unsigned i, iter_size; c = kzalloc(sizeof(struct bch_fs), GFP_KERNEL); if (!c) @@ -555,8 +552,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) iter_size = (btree_blocks(c) + 1) * 2 * sizeof(struct btree_node_iter_set); - journal_entry_bytes = 512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb); - if (!(c->wq = alloc_workqueue("bcachefs", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || !(c->copygc_wq = alloc_workqueue("bcache_copygc", @@ -583,7 +578,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bdi_setup_and_register(&c->bdi, "bcachefs") || bch2_io_clock_init(&c->io_clock[READ]) || bch2_io_clock_init(&c->io_clock[WRITE]) || - bch2_fs_journal_init(&c->journal, journal_entry_bytes) || + bch2_fs_journal_init(&c->journal) || bch2_fs_btree_init(c) || bch2_fs_encryption_init(c) || bch2_fs_compress_init(c) || @@ -974,7 +969,7 @@ static void bch2_dev_free(struct bch_dev *ca) free_percpu(ca->sectors_written); bioset_exit(&ca->replica_set); free_percpu(ca->usage_percpu); - free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); + kvpfree(ca->disk_buckets, bucket_bytes(ca)); kfree(ca->prio_buckets); kfree(ca->bio_prio); vfree(ca->buckets); @@ -1144,7 +1139,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->mi.nbuckets)) || !(ca->prio_buckets = kzalloc(sizeof(u64) * prio_buckets(ca) * 2, GFP_KERNEL)) || - !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || + !(ca->disk_buckets = kvpmalloc(bucket_bytes(ca), GFP_KERNEL)) || !(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) || !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) || bioset_init(&ca->replica_set, 4, diff --git a/libbcachefs/sysfs.c 
b/libbcachefs/sysfs.c
index a0b9faeb..808b3089 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -175,7 +175,6 @@ read_attribute(cache_read_races);
 
 rw_attribute(journal_write_delay_ms);
 rw_attribute(journal_reclaim_delay_ms);
-read_attribute(journal_entry_size_max);
 
 rw_attribute(discard);
 rw_attribute(cache_replacement_policy);
@@ -406,7 +405,6 @@ SHOW(bch2_fs)
 
 	sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms);
 	sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
-	sysfs_hprint(journal_entry_size_max, c->journal.entry_size_max);
 
 	sysfs_hprint(block_size,		block_bytes(c));
 	sysfs_print(block_size_bytes,		block_bytes(c));
@@ -561,7 +559,6 @@ SYSFS_OPS(bch2_fs);
 struct attribute *bch2_fs_files[] = {
 	&sysfs_journal_write_delay_ms,
 	&sysfs_journal_reclaim_delay_ms,
-	&sysfs_journal_entry_size_max,
 
 	&sysfs_block_size,
 	&sysfs_block_size_bytes,
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index 5f13c824..5669cb8a 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -9,6 +9,7 @@
 #include <linux/freezer.h>
 #include <linux/kernel.h>
 #include <linux/llist.h>
+#include <linux/log2.h>
 #include <linux/ratelimit.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
@@ -78,16 +79,22 @@ do {									\
 	(__builtin_types_compatible_p(typeof(_val), _type) ||		\
	 __builtin_types_compatible_p(typeof(_val), const _type))
 
-static inline void *kvmalloc(size_t bytes, gfp_t gfp)
+static inline void kvpfree(void *p, size_t size)
 {
-	if (bytes <= PAGE_SIZE ||
-	    !(gfp & GFP_KERNEL))
-		return kmalloc(bytes, gfp);
+	if (size < PAGE_SIZE)
+		kfree(p);
+	else if (is_vmalloc_addr(p))
+		vfree(p);
+	else
+		free_pages((unsigned long) p, get_order(size));
 
-	return ((bytes <= KMALLOC_MAX_SIZE)
-		? kmalloc(bytes, gfp|__GFP_NOWARN)
-		: NULL) ?:
-		vmalloc(bytes);
+}
+
+static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
+{
+	return size < PAGE_SIZE ? kmalloc(size, gfp_mask)
+		: (void *) __get_free_pages(gfp_mask, get_order(size))
+		?: __vmalloc(size, gfp_mask, PAGE_KERNEL);
 }
 
 #define DECLARE_HEAP(type, name)					\
@@ -98,17 +105,15 @@ static inline void *kvmalloc(size_t bytes, gfp_t gfp)
 
 #define init_heap(heap, _size, gfp)					\
 ({									\
-	size_t _bytes;							\
 	(heap)->used = 0;						\
 	(heap)->size = (_size);						\
-	_bytes = (heap)->size * sizeof(*(heap)->data);			\
-	(heap)->data = kvmalloc(_bytes, (gfp));				\
-	(heap)->data;							\
+	(heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\
+				 (gfp));				\
 })
 
 #define free_heap(heap)							\
 do {									\
-	kvfree((heap)->data);						\
+	kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0]));	\
 	(heap)->data = NULL;						\
 } while (0)
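
A note on the allocation helpers this patch introduces (illustration only,
not part of the patch): kvpmalloc() chooses kmalloc(), __get_free_pages()
or __vmalloc() by size, and since none of those paths stores the allocation
size, kvpfree() requires the caller to pass the size back. That contract is
why fifo_buf_size() and the heap macros recompute the size from the data
structure, and why journal replay entries are freed with
offsetof(struct journal_replay, j) + vstruct_bytes(&i->j). A minimal sketch
of the pattern, assuming kvpmalloc()/kvpfree() from util.h above are in
scope; grow_buffer() is a hypothetical helper modeled on
journal_buf_realloc():

	static int grow_buffer(void **data, size_t *size, size_t new_size)
	{
		void *n;

		if (*size >= new_size)
			return 0;

		/* allocate first, so failure leaves the old buffer intact: */
		n = kvpmalloc(new_size, GFP_KERNEL|__GFP_NOWARN);
		if (!n)
			return -ENOMEM;

		memcpy(n, *data, *size);
		kvpfree(*data, *size);	/* must be the size passed to kvpmalloc() */
		*data = n;
		*size = new_size;
		return 0;
	}

journal_buf_realloc() follows the same order of operations, so an
allocation failure there simply leaves the journal buffer at its old size.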