diff --git a/.bcachefs_revision b/.bcachefs_revision index af8b3eda..2aea1a7d 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -f1c9030ccbf6d7b5c46f08f92ee878bfc9f6ee6b +be2d60d9484734b4c619ac0ddf54b3103210c9c0 diff --git a/libbcachefs.c b/libbcachefs.c index 4fe2c3db..092a54a6 100644 --- a/libbcachefs.c +++ b/libbcachefs.c @@ -39,7 +39,7 @@ static void init_layout(struct bch_sb_layout *l, memset(l, 0, sizeof(*l)); - l->magic = BCACHE_MAGIC; + l->magic = BCHFS_MAGIC; l->layout_type = 0; l->nr_superblocks = 2; l->sb_max_size_bits = ilog2(sb_size); @@ -188,7 +188,7 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, sb.sb->version = le16_to_cpu(opts.version); sb.sb->version_min = le16_to_cpu(opts.version); - sb.sb->magic = BCACHE_MAGIC; + sb.sb->magic = BCHFS_MAGIC; sb.sb->user_uuid = opts.uuid; sb.sb->nr_devices = nr_devs; @@ -353,7 +353,8 @@ struct bch_sb *__bch2_super_read(int fd, u64 sector) xpread(fd, &sb, sizeof(sb), sector << 9); - if (memcmp(&sb.magic, &BCACHE_MAGIC, sizeof(sb.magic))) + if (memcmp(&sb.magic, &BCACHE_MAGIC, sizeof(sb.magic)) && + memcmp(&sb.magic, &BCHFS_MAGIC, sizeof(sb.magic))) die("not a bcachefs superblock"); size_t bytes = vstruct_bytes(&sb); diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 0aa522b7..f29e3e7e 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1578,7 +1578,7 @@ struct bch_sb_layout { * @version_min - Oldest metadata version this filesystem contains; so we can * safely drop compatibility code and refuse to mount filesystems * we'd need it for - * @magic - identifies as a bcachefs superblock (BCACHE_MAGIC) + * @magic - identifies as a bcachefs superblock (BCHFS_MAGIC) * @seq - incremented each time superblock is written * @uuid - used for generating various magic numbers and identifying * member devices, never changes @@ -1894,6 +1894,9 @@ enum bch_compression_opts { #define BCACHE_MAGIC \ UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \ 
0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) +#define BCHFS_MAGIC \ + UUID_LE(0xf67385c6, 0xce66, 0xa990, \ + 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) #define BCACHEFS_STATFS_MAGIC 0xca451a4e diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 0d280e60..f9ccc216 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -1249,7 +1249,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos new_pos, bool intent, unsigned long ip, int cmp) { - unsigned l = path->level; + unsigned level = path->level; EBUG_ON(trans->restarted); EBUG_ON(!path->ref); @@ -1267,10 +1267,12 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, goto out; } - l = btree_path_up_until_good_node(trans, path, cmp); + level = btree_path_up_until_good_node(trans, path, cmp); - if (btree_path_node(path, l)) { - BUG_ON(!btree_node_locked(path, l)); + if (btree_path_node(path, level)) { + struct btree_path_level *l = &path->l[level]; + + BUG_ON(!btree_node_locked(path, level)); /* * We might have to skip over many keys, or just a few: try * advancing the node iterator, and if we have to skip over too @@ -1278,11 +1280,18 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, * is expensive). 
*/ if (cmp < 0 || - !btree_path_advance_to_pos(path, &path->l[l], 8)) - __btree_path_level_init(path, l); + !btree_path_advance_to_pos(path, l, 8)) + bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); + + /* + * Iterators to interior nodes should always be pointed at the first non + * whiteout: + */ + if (unlikely(level)) + bch2_btree_node_iter_peek(&l->iter, l->b); } - if (unlikely(l != path->level)) { + if (unlikely(level != path->level)) { btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); __bch2_btree_path_unlock(trans, path); } diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 179361be..2b1974a9 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -987,7 +987,6 @@ static void bch2_journal_read_device(struct closure *cl) struct journal_replay *r, **_r; struct genradix_iter iter; struct journal_read_buf buf = { NULL, 0 }; - u64 min_seq = U64_MAX; unsigned i; int ret = 0; @@ -1006,45 +1005,27 @@ static void bch2_journal_read_device(struct closure *cl) goto err; } - /* Find the journal bucket with the highest sequence number: */ - for (i = 0; i < ja->nr; i++) { - if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx]) - ja->cur_idx = i; - - min_seq = min(ja->bucket_seq[i], min_seq); - } - - /* - * If there's duplicate journal entries in multiple buckets (which - * definitely isn't supposed to happen, but...) 
- make sure to start - * cur_idx at the last of those buckets, so we don't deadlock trying to - * allocate - */ - while (ja->bucket_seq[ja->cur_idx] > min_seq && - ja->bucket_seq[ja->cur_idx] == - ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->sectors_free = ca->mi.bucket_size; mutex_lock(&jlist->lock); - genradix_for_each(&c->journal_entries, iter, _r) { + genradix_for_each_reverse(&c->journal_entries, iter, _r) { r = *_r; if (!r) continue; for (i = 0; i < r->nr_ptrs; i++) { - if (r->ptrs[i].dev == ca->dev_idx && - sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) { + if (r->ptrs[i].dev == ca->dev_idx) { unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + vstruct_sectors(&r->j, c->block_bits); - ja->sectors_free = min(ja->sectors_free, - ca->mi.bucket_size - wrote); + ja->cur_idx = r->ptrs[i].bucket; + ja->sectors_free = ca->mi.bucket_size - wrote; + goto found; } } } +found: mutex_unlock(&jlist->lock); if (ja->bucket_seq[ja->cur_idx] && @@ -1660,20 +1641,42 @@ void bch2_journal_write(struct closure *cl) j->write_start_time = local_clock(); spin_lock(&j->lock); - if (bch2_journal_error(j) || - w->noflush || - (!w->must_flush && - (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && - test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { + + /* + * If the journal is in an error state - we did an emergency shutdown - + * we prefer to continue doing journal writes. We just mark them as + * noflush so they'll never be used, but they'll still be visible by the + * list_journal tool - this helps in debugging. + * + * There's a caveat: the first journal write after marking the + * superblock dirty must always be a flush write, because on startup + * from a clean shutdown we didn't necessarily read the journal and the + * new journal write might overwrite whatever was in the journal + * previously - we can't leave the journal without any flush writes in + * it. 
+ * + * So if we're in an error state, and we're still starting up, we don't + * write anything at all. + */ + if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) && + (bch2_journal_error(j) || + w->noflush || + (!w->must_flush && + (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) { w->noflush = true; SET_JSET_NO_FLUSH(jset, true); jset->last_seq = 0; w->last_seq = 0; j->nr_noflush_writes++; - } else { + } else if (!bch2_journal_error(j)) { j->last_flush_write = jiffies; j->nr_flush_writes++; + clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); + } else { + spin_unlock(&j->lock); + goto err; } spin_unlock(&j->lock); diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index a6cdb885..045ee95a 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -141,10 +141,11 @@ enum journal_space_from { journal_space_nr, }; -enum { +enum journal_flags { JOURNAL_REPLAY_DONE, JOURNAL_STARTED, JOURNAL_MAY_SKIP_FLUSH, + JOURNAL_NEED_FLUSH_WRITE, }; #define JOURNAL_WATERMARKS() \ diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 6d99a581..0aa243f5 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -211,7 +211,8 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out u64 offset, prev_offset, max_sectors; unsigned i; - if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) { + if (uuid_le_cmp(layout->magic, BCACHE_MAGIC) && + uuid_le_cmp(layout->magic, BCHFS_MAGIC)) { prt_printf(out, "Not a bcachefs superblock layout"); return -BCH_ERR_invalid_sb_layout; } @@ -538,7 +539,8 @@ reread: return ret; } - if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) { + if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC) && + uuid_le_cmp(sb->sb->magic, BCHFS_MAGIC)) { prt_printf(err, "Not a bcachefs superblock"); return -BCH_ERR_invalid_sb_magic; } diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 47ca2153..7cac0567 100644 --- 
a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -367,6 +367,14 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); + /* + * First journal write must be a flush write: after a clean shutdown we + * don't read the journal, so the first journal write may end up + * overwriting whatever was there previously, and there must always be + * at least one flush write in the journal or recovery will fail: + */ + set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags); + for_each_rw_member(ca, c, i) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c);