Update bcachefs sources to be2d60d948 bcachefs: New magic number

This commit is contained in:
Kent Overstreet 2022-12-02 19:47:25 -05:00
parent f82cd58008
commit 3867739e56
8 changed files with 73 additions and 46 deletions

View File

@ -1 +1 @@
f1c9030ccbf6d7b5c46f08f92ee878bfc9f6ee6b
be2d60d9484734b4c619ac0ddf54b3103210c9c0

View File

@ -39,7 +39,7 @@ static void init_layout(struct bch_sb_layout *l,
memset(l, 0, sizeof(*l));
l->magic = BCACHE_MAGIC;
l->magic = BCHFS_MAGIC;
l->layout_type = 0;
l->nr_superblocks = 2;
l->sb_max_size_bits = ilog2(sb_size);
@ -188,7 +188,7 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs,
sb.sb->version = le16_to_cpu(opts.version);
sb.sb->version_min = le16_to_cpu(opts.version);
sb.sb->magic = BCACHE_MAGIC;
sb.sb->magic = BCHFS_MAGIC;
sb.sb->user_uuid = opts.uuid;
sb.sb->nr_devices = nr_devs;
@ -353,7 +353,8 @@ struct bch_sb *__bch2_super_read(int fd, u64 sector)
xpread(fd, &sb, sizeof(sb), sector << 9);
if (memcmp(&sb.magic, &BCACHE_MAGIC, sizeof(sb.magic)))
if (memcmp(&sb.magic, &BCACHE_MAGIC, sizeof(sb.magic)) &&
memcmp(&sb.magic, &BCHFS_MAGIC, sizeof(sb.magic)))
die("not a bcachefs superblock");
size_t bytes = vstruct_bytes(&sb);

View File

@ -1578,7 +1578,7 @@ struct bch_sb_layout {
* @version_min - Oldest metadata version this filesystem contains; so we can
* safely drop compatibility code and refuse to mount filesystems
* we'd need it for
* @magic - identifies as a bcachefs superblock (BCACHE_MAGIC)
* @magic - identifies as a bcachefs superblock (BCHFS_MAGIC)
* @seq - incremented each time superblock is written
* @uuid - used for generating various magic numbers and identifying
* member devices, never changes
@ -1894,6 +1894,9 @@ enum bch_compression_opts {
#define BCACHE_MAGIC \
UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \
0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
#define BCHFS_MAGIC \
UUID_LE(0xf67385c6, 0xce66, 0xa990, \
0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
#define BCACHEFS_STATFS_MAGIC 0xca451a4e

View File

@ -1249,7 +1249,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
struct btree_path *path, struct bpos new_pos,
bool intent, unsigned long ip, int cmp)
{
unsigned l = path->level;
unsigned level = path->level;
EBUG_ON(trans->restarted);
EBUG_ON(!path->ref);
@ -1267,10 +1267,12 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
goto out;
}
l = btree_path_up_until_good_node(trans, path, cmp);
level = btree_path_up_until_good_node(trans, path, cmp);
if (btree_path_node(path, l)) {
BUG_ON(!btree_node_locked(path, l));
if (btree_path_node(path, level)) {
struct btree_path_level *l = &path->l[level];
BUG_ON(!btree_node_locked(path, level));
/*
* We might have to skip over many keys, or just a few: try
* advancing the node iterator, and if we have to skip over too
@ -1278,11 +1280,18 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
* is expensive).
*/
if (cmp < 0 ||
!btree_path_advance_to_pos(path, &path->l[l], 8))
__btree_path_level_init(path, l);
!btree_path_advance_to_pos(path, l, 8))
bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
/*
* Iterators to interior nodes should always be pointed at the first
* non-whiteout:
*/
if (unlikely(level))
bch2_btree_node_iter_peek(&l->iter, l->b);
}
if (unlikely(l != path->level)) {
if (unlikely(level != path->level)) {
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
__bch2_btree_path_unlock(trans, path);
}

View File

@ -987,7 +987,6 @@ static void bch2_journal_read_device(struct closure *cl)
struct journal_replay *r, **_r;
struct genradix_iter iter;
struct journal_read_buf buf = { NULL, 0 };
u64 min_seq = U64_MAX;
unsigned i;
int ret = 0;
@ -1006,45 +1005,27 @@ static void bch2_journal_read_device(struct closure *cl)
goto err;
}
/* Find the journal bucket with the highest sequence number: */
for (i = 0; i < ja->nr; i++) {
if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
ja->cur_idx = i;
min_seq = min(ja->bucket_seq[i], min_seq);
}
/*
* If there's duplicate journal entries in multiple buckets (which
* definitely isn't supposed to happen, but...) - make sure to start
* cur_idx at the last of those buckets, so we don't deadlock trying to
* allocate
*/
while (ja->bucket_seq[ja->cur_idx] > min_seq &&
ja->bucket_seq[ja->cur_idx] ==
ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
ja->sectors_free = ca->mi.bucket_size;
mutex_lock(&jlist->lock);
genradix_for_each(&c->journal_entries, iter, _r) {
genradix_for_each_reverse(&c->journal_entries, iter, _r) {
r = *_r;
if (!r)
continue;
for (i = 0; i < r->nr_ptrs; i++) {
if (r->ptrs[i].dev == ca->dev_idx &&
sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) {
if (r->ptrs[i].dev == ca->dev_idx) {
unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
vstruct_sectors(&r->j, c->block_bits);
ja->sectors_free = min(ja->sectors_free,
ca->mi.bucket_size - wrote);
ja->cur_idx = r->ptrs[i].bucket;
ja->sectors_free = ca->mi.bucket_size - wrote;
goto found;
}
}
}
found:
mutex_unlock(&jlist->lock);
if (ja->bucket_seq[ja->cur_idx] &&
@ -1660,20 +1641,42 @@ void bch2_journal_write(struct closure *cl)
j->write_start_time = local_clock();
spin_lock(&j->lock);
if (bch2_journal_error(j) ||
w->noflush ||
(!w->must_flush &&
(jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
/*
* If the journal is in an error state - we did an emergency shutdown -
* we prefer to continue doing journal writes. We just mark them as
* noflush so they'll never be used, but they'll still be visible by the
* list_journal tool - this helps in debugging.
*
* There's a caveat: the first journal write after marking the
* superblock dirty must always be a flush write, because on startup
* from a clean shutdown we didn't necessarily read the journal and the
* new journal write might overwrite whatever was in the journal
* previously - we can't leave the journal without any flush writes in
* it.
*
* So if we're in an error state, and we're still starting up, we don't
* write anything at all.
*/
if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) &&
(bch2_journal_error(j) ||
w->noflush ||
(!w->must_flush &&
(jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) {
w->noflush = true;
SET_JSET_NO_FLUSH(jset, true);
jset->last_seq = 0;
w->last_seq = 0;
j->nr_noflush_writes++;
} else {
} else if (!bch2_journal_error(j)) {
j->last_flush_write = jiffies;
j->nr_flush_writes++;
clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
} else {
spin_unlock(&j->lock);
goto err;
}
spin_unlock(&j->lock);

View File

@ -141,10 +141,11 @@ enum journal_space_from {
journal_space_nr,
};
enum {
enum journal_flags {
JOURNAL_REPLAY_DONE,
JOURNAL_STARTED,
JOURNAL_MAY_SKIP_FLUSH,
JOURNAL_NEED_FLUSH_WRITE,
};
#define JOURNAL_WATERMARKS() \

View File

@ -211,7 +211,8 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out
u64 offset, prev_offset, max_sectors;
unsigned i;
if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) {
if (uuid_le_cmp(layout->magic, BCACHE_MAGIC) &&
uuid_le_cmp(layout->magic, BCHFS_MAGIC)) {
prt_printf(out, "Not a bcachefs superblock layout");
return -BCH_ERR_invalid_sb_layout;
}
@ -538,7 +539,8 @@ reread:
return ret;
}
if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) {
if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC) &&
uuid_le_cmp(sb->sb->magic, BCHFS_MAGIC)) {
prt_printf(err, "Not a bcachefs superblock");
return -BCH_ERR_invalid_sb_magic;
}

View File

@ -367,6 +367,14 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
/*
* First journal write must be a flush write: after a clean shutdown we
* don't read the journal, so the first journal write may end up
* overwriting whatever was there previously, and there must always be
at least one flush write in the journal or recovery will fail:
*/
set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags);
for_each_rw_member(ca, c, i)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);