diff --git a/.bcachefs_revision b/.bcachefs_revision index 9abb2e20..0370dc87 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -794723fc10c4a1ff28d4b11c436277ba783f47e6 +09d4c2acbf4c864fef0f520bbcba256c9a19102e diff --git a/Makefile.compiler b/Makefile.compiler index 8fcb4274..92be0c9a 100644 --- a/Makefile.compiler +++ b/Makefile.compiler @@ -38,7 +38,7 @@ as-option = $(call try-run,\ # Usage: aflags-y += $(call as-instr,instr,option1,option2) as-instr = $(call try-run,\ - printf "%b\n" "$(1)" | $(CC) -Werror $(CLANG_FLAGS) $(KBUILD_AFLAGS) -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3)) + printf "%b\n" "$(1)" | $(CC) -Werror $(CLANG_FLAGS) $(KBUILD_AFLAGS) -Wa$(comma)--fatal-warnings -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3)) # __cc-option # Usage: MY_CFLAGS += $(call __cc-option,$(CC),$(MY_CFLAGS),-march=winchip-c6,-march=i586) diff --git a/c_src/cmd_migrate.c b/c_src/cmd_migrate.c index e26b3bb8..5ac326dd 100644 --- a/c_src/cmd_migrate.c +++ b/c_src/cmd_migrate.c @@ -676,12 +676,12 @@ static int migrate_fs(const char *fs_path, struct dev_opts dev = dev_opts_default(); dev.path = dev_t_to_path(stat.st_dev); - dev.handle = bdev_open_by_path(dev.path, BLK_OPEN_READ|BLK_OPEN_WRITE, &dev, NULL); + dev.file = bdev_file_open_by_path(dev.path, BLK_OPEN_READ|BLK_OPEN_WRITE, &dev, NULL); - int ret = PTR_ERR_OR_ZERO(dev.handle); + int ret = PTR_ERR_OR_ZERO(dev.file); if (ret < 0) die("Error opening device to format %s: %s", dev.path, strerror(-ret)); - dev.bdev = dev.handle->bdev; + dev.bdev = file_bdev(dev.file); opt_set(fs_opts, block_size, get_blocksize(dev.bdev->bd_fd)); diff --git a/c_src/libbcachefs.h b/c_src/libbcachefs.h index 739783f0..05720573 100644 --- a/c_src/libbcachefs.h +++ b/c_src/libbcachefs.h @@ -52,7 +52,7 @@ static inline struct format_opts format_opts_default() } struct dev_opts { - struct bdev_handle *handle; + struct file *file; struct block_device *bdev; char *path; u64 size; /* bytes*/ diff --git a/c_src/tools-util.c 
b/c_src/tools-util.c index d8371c24..7da1d689 100644 --- a/c_src/tools-util.c +++ b/c_src/tools-util.c @@ -189,13 +189,13 @@ int open_for_format(struct dev_opts *dev, bool force) const char *fs_type = NULL, *fs_label = NULL; size_t fs_type_len, fs_label_len; - dev->handle = bdev_open_by_path(dev->path, + dev->file = bdev_file_open_by_path(dev->path, BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL|BLK_OPEN_BUFFERED, dev, NULL); - int ret = PTR_ERR_OR_ZERO(dev->handle); + int ret = PTR_ERR_OR_ZERO(dev->file); if (ret < 0) die("Error opening device to format %s: %s", dev->path, strerror(-ret)); - dev->bdev = dev->handle->bdev; + dev->bdev = file_bdev(dev->file); if (!(pr = blkid_new_probe())) die("blkid error 1"); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 80560ab6..3cbf8c9e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -16,6 +16,28 @@ typedef void (bio_end_io_t) (struct bio *); #define BDEVNAME_SIZE 32 +typedef unsigned int __bitwise blk_mode_t; + +/* open for reading */ +#define BLK_OPEN_READ ((__force blk_mode_t)(1 << 0)) +/* open for writing */ +#define BLK_OPEN_WRITE ((__force blk_mode_t)(1 << 1)) +/* open exclusively (vs other exclusive openers */ +#define BLK_OPEN_EXCL ((__force blk_mode_t)(1 << 2)) +/* opened with O_NDELAY */ +#define BLK_OPEN_NDELAY ((__force blk_mode_t)(1 << 3)) +/* open for "writes" only for ioctls (specialy hack for floppy.c) */ +#define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4)) + +#define BLK_OPEN_BUFFERED ((__force blk_mode_t)(1 << 5)) + +struct inode { + unsigned long i_ino; + loff_t i_size; + struct super_block *i_sb; + blk_mode_t mode; +}; + struct request_queue { struct backing_dev_info *backing_dev_info; }; @@ -34,6 +56,7 @@ struct block_device { dev_t bd_dev; char name[BDEVNAME_SIZE]; struct inode *bd_inode; + struct inode __bd_inode; struct request_queue queue; void *bd_holder; struct gendisk * bd_disk; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 
998f5e2c..1d825a70 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -23,27 +23,6 @@ struct user_namespace; #define MINOR(dev) ((unsigned int) ((dev) & MINORMASK)) #define MKDEV(ma,mi) (((ma) << MINORBITS) | (mi)) -typedef unsigned int __bitwise blk_mode_t; - -/* open for reading */ -#define BLK_OPEN_READ ((__force blk_mode_t)(1 << 0)) -/* open for writing */ -#define BLK_OPEN_WRITE ((__force blk_mode_t)(1 << 1)) -/* open exclusively (vs other exclusive openers */ -#define BLK_OPEN_EXCL ((__force blk_mode_t)(1 << 2)) -/* opened with O_NDELAY */ -#define BLK_OPEN_NDELAY ((__force blk_mode_t)(1 << 3)) -/* open for "writes" only for ioctls (specialy hack for floppy.c) */ -#define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4)) - -#define BLK_OPEN_BUFFERED ((__force blk_mode_t)(1 << 5)) - -struct inode { - unsigned long i_ino; - loff_t i_size; - struct super_block *i_sb; -}; - struct file { struct inode *f_inode; }; @@ -89,15 +68,14 @@ struct blk_holder_ops { void (*mark_dead)(struct block_device *bdev); }; -struct bdev_handle { - struct block_device *bdev; - void *holder; - blk_mode_t mode; -}; +static inline struct block_device *file_bdev(struct file *file) +{ + return container_of(file->f_inode, struct block_device, __bd_inode); +} -void bdev_release(struct bdev_handle *); -struct bdev_handle *bdev_open_by_path(const char *, blk_mode_t, void *, - const struct blk_holder_ops *); +void fput(struct file *); +struct file *bdev_file_open_by_path(const char *, blk_mode_t, void *, + const struct blk_holder_ops *); int lookup_bdev(const char *path, dev_t *); struct super_block { diff --git a/include/linux/poison.h b/include/linux/poison.h index 27a7dad1..1f0ee245 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -92,4 +92,7 @@ /********** VFS **********/ #define VFS_PTR_POISON ((void *)(0xF5 + POISON_POINTER_DELTA)) +/********** lib/stackdepot.c **********/ +#define STACK_DEPOT_POISON ((void *)(0xD390 + POISON_POINTER_DELTA)) + #endif 
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 893e38f9..4ff56fa4 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -1713,34 +1713,37 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, if (ret) goto out; - if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { - a->v.gen++; - SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); - goto write; - } - - if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { - bch2_trans_inconsistent(trans, - "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" - "%s", - a->v.journal_seq, - c->journal.flushed_seq_ondisk, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + if (a->v.dirty_sectors) { + if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, + trans, "attempting to discard bucket with dirty data\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = -EIO; - } goto out; } if (a->v.data_type != BCH_DATA_need_discard) { - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { - bch2_trans_inconsistent(trans, - "bucket incorrectly set in need_discard btree\n" - "%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; + if (data_type_is_empty(a->v.data_type) && + BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { + a->v.gen++; + SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); + goto write; } + if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, + trans, "bucket incorrectly set in need_discard btree\n" + "%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = -EIO; + goto out; + } + + if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { + if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, + trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s", + a->v.journal_seq, + c->journal.flushed_seq_ondisk, + (bch2_bkey_val_to_text(&buf, c, k), 
buf.buf))) + ret = -EIO; goto out; } @@ -1835,6 +1838,7 @@ static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpo if (ret) goto err; + BUG_ON(a->v.dirty_sectors); SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); a->v.data_type = alloc_data_type(a->v, a->v.data_type); @@ -1942,6 +1946,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, goto out; BUG_ON(a->v.data_type != BCH_DATA_cached); + BUG_ON(a->v.dirty_sectors); if (!a->v.cached_sectors) bch_err(c, "invalidating empty bucket, confused"); diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index 214b15c8..a1fc30ad 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -188,8 +188,10 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) static inline unsigned open_buckets_reserved(enum bch_watermark watermark) { switch (watermark) { - case BCH_WATERMARK_reclaim: + case BCH_WATERMARK_interior_updates: return 0; + case BCH_WATERMARK_reclaim: + return OPEN_BUCKETS_COUNT / 6; case BCH_WATERMARK_btree: case BCH_WATERMARK_btree_copygc: return OPEN_BUCKETS_COUNT / 4; diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index b91b7a46..c2226e94 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -22,7 +22,8 @@ struct bucket_alloc_state { x(copygc) \ x(btree) \ x(btree_copygc) \ - x(reclaim) + x(reclaim) \ + x(interior_updates) enum bch_watermark { #define x(name) BCH_WATERMARK_##name, diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index 762c8ddf..114328ac 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -8,6 +8,7 @@ #include "btree_update.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" +#include "checksum.h" #include "error.h" #include @@ -418,6 +419,84 @@ struct extents_to_bp_state { struct bkey_buf last_flushed; }; +static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree, + struct bkey_s_c extent, unsigned 
dev) +{ + struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent); + int ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bch2_bkey_drop_device(bkey_i_to_s(n), dev); + return bch2_btree_insert_trans(trans, btree, n, 0); +} + +static int check_extent_checksum(struct btree_trans *trans, + enum btree_id btree, struct bkey_s_c extent, + enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct printbuf buf = PRINTBUF; + void *data_buf = NULL; + struct bio *bio = NULL; + size_t bytes; + int ret = 0; + + if (bkey_is_btree_ptr(extent.k)) + return false; + + bkey_for_each_ptr_decode(extent.k, ptrs, p, entry) + if (p.ptr.dev == dev) + goto found; + BUG(); +found: + if (!p.crc.csum_type) + return false; + + bytes = p.crc.compressed_size << 9; + + struct bch_dev *ca = bch_dev_bkey_exists(c, dev); + if (!bch2_dev_get_ioref(ca, READ)) + return false; + + data_buf = kvmalloc(bytes, GFP_KERNEL); + if (!data_buf) { + ret = -ENOMEM; + goto err; + } + + bio = bio_alloc(ca->disk_sb.bdev, 1, REQ_OP_READ, GFP_KERNEL); + bio->bi_iter.bi_sector = p.ptr.offset; + bch2_bio_map(bio, data_buf, bytes); + ret = submit_bio_wait(bio); + if (ret) + goto err; + + prt_str(&buf, "extents pointing to same space, but first extent checksum bad:"); + prt_printf(&buf, "\n %s ", bch2_btree_id_str(btree)); + bch2_bkey_val_to_text(&buf, c, extent); + prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree)); + bch2_bkey_val_to_text(&buf, c, extent2); + + struct nonce nonce = extent_nonce(extent.k->version, p.crc); + struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes); + if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum), + c, dup_backpointer_to_bad_csum_extent, + "%s", buf.buf)) + ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1; +fsck_err: +err: + if (bio) + bio_put(bio); + kvfree(data_buf); + 
percpu_ref_put(&ca->io_ref); + printbuf_exit(&buf); + return ret; +} + static int check_bp_exists(struct btree_trans *trans, struct extents_to_bp_state *s, struct bpos bucket, @@ -425,7 +504,8 @@ static int check_bp_exists(struct btree_trans *trans, struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; - struct btree_iter bp_iter = { NULL }; + struct btree_iter bp_iter = {}; + struct btree_iter other_extent_iter = {}; struct printbuf buf = PRINTBUF; struct bkey_s_c bp_k; struct bkey_buf tmp; @@ -433,13 +513,20 @@ static int check_bp_exists(struct btree_trans *trans, bch2_bkey_buf_init(&tmp); + if (!bch2_dev_bucket_exists(c, bucket)) { + prt_str(&buf, "extent for nonexistent device:bucket "); + bch2_bpos_to_text(&buf, bucket); + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, orig_k); + bch_err(c, "%s", buf.buf); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; + } + if (bpos_lt(bucket, s->bucket_start) || bpos_gt(bucket, s->bucket_end)) return 0; - if (!bch2_dev_bucket_exists(c, bucket)) - goto missing; - bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bucket_pos_to_bp(c, bucket, bp.bucket_offset), 0); @@ -465,21 +552,94 @@ static int check_bp_exists(struct btree_trans *trans, ret = -BCH_ERR_transaction_restart_write_buffer_flush; goto out; } - goto missing; + + goto check_existing_bp; } out: err: fsck_err: + bch2_trans_iter_exit(trans, &other_extent_iter); bch2_trans_iter_exit(trans, &bp_iter); bch2_bkey_buf_exit(&tmp, c); printbuf_exit(&buf); return ret; +check_existing_bp: + /* Do we have a backpointer for a different extent?
*/ + if (bp_k.k->type != KEY_TYPE_backpointer) + goto missing; + + struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v; + + struct bkey_s_c other_extent = + bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0); + ret = bkey_err(other_extent); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) + ret = 0; + if (ret) + goto err; + + if (!other_extent.k) + goto missing; + + if (bch2_extents_match(orig_k, other_extent)) { + printbuf_reset(&buf); + prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n "); + bch2_bkey_val_to_text(&buf, c, orig_k); + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, other_extent); + bch_err(c, "%s", buf.buf); + + if (other_extent.k->size <= orig_k.k->size) { + ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode); + if (ret) + goto err; + goto out; + } else { + ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode); + if (ret) + goto err; + goto missing; + } + } + + ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode); + if (ret < 0) + goto err; + if (ret) { + ret = 0; + goto missing; + } + + ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode); + if (ret < 0) + goto err; + if (ret) { + ret = 0; + goto out; + } + + printbuf_reset(&buf); + prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bucket.inode); + bch2_bkey_val_to_text(&buf, c, orig_k); + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, other_extent); + bch_err(c, "%s", buf.buf); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; missing: + printbuf_reset(&buf); prt_printf(&buf, "missing backpointer for btree=%s l=%u ", bch2_btree_id_str(bp.btree_id), bp.level); bch2_bkey_val_to_text(&buf, c, orig_k); - prt_printf(&buf, "\nbp pos "); - bch2_bpos_to_text(&buf, bp_iter.pos); + prt_printf(&buf, "\n got: "); + bch2_bkey_val_to_text(&buf, c, 
bp_k); + + struct bkey_i_backpointer n_bp_k; + bkey_backpointer_init(&n_bp_k.k_i); + n_bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + n_bp_k.v = bp; + prt_printf(&buf, "\n want: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i)); if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index f129fc4f..a31a5f70 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -615,6 +615,7 @@ struct bch_dev { */ #define BCH_FS_FLAGS() \ + x(new_fs) \ x(started) \ x(may_go_rw) \ x(rw) \ @@ -797,6 +798,7 @@ struct bch_fs { u64 features; u64 compat; unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)]; + u64 btrees_lost_data; } sb; @@ -826,7 +828,6 @@ struct bch_fs { struct btree_root btree_roots_known[BTREE_ID_NR]; DARRAY(struct btree_root) btree_roots_extra; struct mutex btree_root_lock; - unsigned long btrees_lost_data; /* bitmask */ struct btree_cache btree_cache; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index bff8750a..63102992 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -818,6 +818,7 @@ struct bch_sb_field_ext { struct bch_sb_field field; __le64 recovery_passes_required[2]; __le64 errors_silent[8]; + __le64 btrees_lost_data; }; struct bch_sb_field_downgrade_entry { diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 9cd9b657..d7de82ac 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1264,11 +1264,11 @@ out: return retry_read; fsck_err: if (ret == -BCH_ERR_btree_node_read_err_want_retry || - ret == -BCH_ERR_btree_node_read_err_must_retry) + ret == -BCH_ERR_btree_node_read_err_must_retry) { retry_read = 1; - else { + } else { set_btree_node_read_error(b); - set_bit(b->c.btree_id, &c->btrees_lost_data); + bch2_btree_lost_data(c, b->c.btree_id); } goto out; } @@ -1330,7 +1330,7 @@ start: if 
(!can_retry) { set_btree_node_read_error(b); - set_bit(b->c.btree_id, &c->btrees_lost_data); + bch2_btree_lost_data(c, b->c.btree_id); break; } } @@ -1532,7 +1532,7 @@ fsck_err: if (ret) { set_btree_node_read_error(b); - set_bit(b->c.btree_id, &c->btrees_lost_data); + bch2_btree_lost_data(c, b->c.btree_id); } else if (*saw_error) bch2_btree_node_rewrite_async(c, b); @@ -1669,7 +1669,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, bch2_fatal_error(c); set_btree_node_read_error(b); - set_bit(b->c.btree_id, &c->btrees_lost_data); + bch2_btree_lost_data(c, b->c.btree_id); clear_btree_node_read_in_flight(b); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); printbuf_exit(&buf); @@ -1866,7 +1866,7 @@ static void btree_node_write_work(struct work_struct *work) } else { ret = bch2_trans_do(c, NULL, NULL, 0, bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, - BCH_WATERMARK_reclaim| + BCH_WATERMARK_interior_updates| BCH_TRANS_COMMIT_journal_reclaim| BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_check_rw, diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c index 96669fed..aa9da497 100644 --- a/libbcachefs/btree_trans_commit.c +++ b/libbcachefs/btree_trans_commit.c @@ -887,6 +887,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, int ret, unsigned long trace_ip) { struct bch_fs *c = trans->c; + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; switch (ret) { case -BCH_ERR_btree_insert_btree_node_full: @@ -905,7 +906,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, * flag */ if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && - (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { + watermark < BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; break; } diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 8d74dfcb..32397b99 100644 --- a/libbcachefs/btree_update_interior.c +++ 
b/libbcachefs/btree_update_interior.c @@ -26,6 +26,13 @@ #include +const char * const bch2_btree_update_modes[] = { +#define x(t) #t, + BTREE_UPDATE_MODES() +#undef x + NULL +}; + static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, btree_path_idx_t, struct btree *, struct keylist *); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); @@ -303,7 +310,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct open_buckets obs = { .nr = 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim + unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim ? BTREE_NODE_RESERVE : 0; int ret; @@ -687,7 +694,7 @@ static void btree_update_nodes_written(struct btree_update *as) * which may require allocations as well. */ ret = commit_do(trans, &as->disk_res, &journal_seq, - BCH_WATERMARK_reclaim| + BCH_WATERMARK_interior_updates| BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_journal_reclaim, @@ -846,11 +853,11 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) mutex_lock(&c->btree_interior_update_lock); list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + BUG_ON(as->mode != BTREE_UPDATE_none); BUG_ON(!btree_node_dirty(b)); BUG_ON(!b->c.level); - as->mode = BTREE_INTERIOR_UPDATING_NODE; + as->mode = BTREE_UPDATE_node; as->b = b; set_btree_node_write_blocked(b); @@ -873,7 +880,7 @@ static void btree_update_reparent(struct btree_update *as, lockdep_assert_held(&c->btree_interior_update_lock); child->b = NULL; - child->mode = BTREE_INTERIOR_UPDATING_AS; + child->mode = BTREE_UPDATE_update; bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, bch2_update_reparent_journal_pin_flush); @@ -884,7 +891,7 @@ static void
btree_update_updated_root(struct btree_update *as, struct btree *b) struct bkey_i *insert = &b->key; struct bch_fs *c = as->c; - BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + BUG_ON(as->mode != BTREE_UPDATE_none); BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ARRAY_SIZE(as->journal_entries)); @@ -898,7 +905,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b) mutex_lock(&c->btree_interior_update_lock); list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - as->mode = BTREE_INTERIOR_UPDATING_ROOT; + as->mode = BTREE_UPDATE_root; mutex_unlock(&c->btree_interior_update_lock); } @@ -1076,7 +1083,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * struct bch_fs *c = as->c; u64 start_time = as->start_time; - BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); + BUG_ON(as->mode == BTREE_UPDATE_none); if (as->took_gc_lock) up_read(&as->c->gc_lock); @@ -1121,7 +1128,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK; if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark != BCH_WATERMARK_reclaim) + watermark < BCH_WATERMARK_reclaim) journal_flags |= JOURNAL_RES_GET_NONBLOCK; ret = drop_locks_do(trans, @@ -1172,7 +1179,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, as->c = c; as->start_time = start_time; as->ip_started = _RET_IP_; - as->mode = BTREE_INTERIOR_NO_UPDATE; + as->mode = BTREE_UPDATE_none; + as->watermark = watermark; as->took_gc_lock = true; as->btree_id = path->btree_id; as->update_level = update_level; @@ -1217,7 +1225,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, */ if (bch2_err_matches(ret, ENOSPC) && (flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark != BCH_WATERMARK_reclaim) { + watermark < BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; goto err; } @@ -2509,18 +2517,25 @@ void 
bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned lev bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level)); } +static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) +{ + prt_printf(out, "%ps: btree=%s watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", + (void *) as->ip_started, + bch2_btree_id_str(as->btree_id), + bch2_watermarks[as->watermark], + bch2_btree_update_modes[as->mode], + as->nodes_written, + closure_nr_remaining(&as->cl), + as->journal.seq); +} + void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_update *as; mutex_lock(&c->btree_interior_update_lock); list_for_each_entry(as, &c->btree_interior_update_list, list) - prt_printf(out, "%ps: mode=%u nodes_written=%u cl.remaining=%u journal_seq=%llu\n", - (void *) as->ip_started, - as->mode, - as->nodes_written, - closure_nr_remaining(&as->cl), - as->journal.seq); + bch2_btree_update_to_text(out, as); mutex_unlock(&c->btree_interior_update_lock); } diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index cffb2c67..88dcf5a2 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -12,6 +12,18 @@ int bch2_btree_node_check_topology(struct btree_trans *, struct btree *); +#define BTREE_UPDATE_MODES() \ + x(none) \ + x(node) \ + x(root) \ + x(update) + +enum btree_update_mode { +#define x(n) BTREE_UPDATE_##n, + BTREE_UPDATE_MODES() +#undef x +}; + /* * Tracks an in progress split/rewrite of a btree node and the update to the * parent node: @@ -39,14 +51,8 @@ struct btree_update { struct list_head list; struct list_head unwritten_list; - /* What kind of update are we doing? 
*/ - enum { - BTREE_INTERIOR_NO_UPDATE, - BTREE_INTERIOR_UPDATING_NODE, - BTREE_INTERIOR_UPDATING_ROOT, - BTREE_INTERIOR_UPDATING_AS, - } mode; - + enum btree_update_mode mode; + enum bch_watermark watermark; unsigned nodes_written:1; unsigned took_gc_lock:1; @@ -56,7 +62,7 @@ struct btree_update { struct disk_reservation disk_res; /* - * BTREE_INTERIOR_UPDATING_NODE: + * BTREE_UPDATE_node: * The update that made the new nodes visible was a regular update to an * existing interior node - @b. We can't write out the update to @b * until the new nodes we created are finished writing, so we block @b diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 6387e039..00aaf4bb 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -226,6 +226,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma fallthrough; case BCH_WATERMARK_btree_copygc: case BCH_WATERMARK_reclaim: + case BCH_WATERMARK_interior_updates: break; } diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index b564404d..34731ee0 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -580,8 +580,7 @@ int bch2_data_update_init(struct btree_trans *trans, move_ctxt_wait_event(ctxt, (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, PTR_BUCKET_POS(c, &p.ptr), 0)) || - (!atomic_read(&ctxt->read_sectors) && - !atomic_read(&ctxt->write_sectors))); + list_empty(&ctxt->ios)); if (!locked) bch2_bucket_nocow_lock(&c->nocow_locks, diff --git a/libbcachefs/error.c b/libbcachefs/error.c index f942a394..82a6656c 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "error.h" +#include "journal.h" #include "recovery_passes.h" #include "super.h" #include "thread_with_file.h" @@ -16,7 +17,8 @@ bool bch2_inconsistent_error(struct bch_fs *c) return false; case BCH_ON_ERROR_ro: if (bch2_fs_emergency_read_only(c)) - bch_err(c, "inconsistency detected - 
emergency read only"); + bch_err(c, "inconsistency detected - emergency read only at journal seq %llu", + journal_cur_seq(&c->journal)); return true; case BCH_ON_ERROR_panic: panic(bch2_fmt(c, "panic after error")); diff --git a/libbcachefs/eytzinger.c b/libbcachefs/eytzinger.c index a064ccff..4ce5e957 100644 --- a/libbcachefs/eytzinger.c +++ b/libbcachefs/eytzinger.c @@ -115,7 +115,7 @@ static void swap_bytes(void *a, void *b, size_t n) struct wrapper { cmp_func_t cmp; - swap_func_t swap_f; + swap_func_t swap; }; /* @@ -125,7 +125,7 @@ struct wrapper { static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) { if (swap_func == SWAP_WRAPPER) { - ((const struct wrapper *)priv)->swap_f(a, b, (int)size); + ((const struct wrapper *)priv)->swap(a, b, (int)size); return; } @@ -174,7 +174,7 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size, int i, c, r; /* called from 'sort' without swap function, let's pick the default */ - if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_f) + if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap) swap_func = NULL; if (!swap_func) { @@ -227,7 +227,7 @@ void eytzinger0_sort(void *base, size_t n, size_t size, { struct wrapper w = { .cmp = cmp_func, - .swap_f = swap_func, + .swap = swap_func, }; return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index aca6fae4..8e201021 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -63,9 +63,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, u32 *snapshot, u64 *inum) { struct bch_subvolume s; - int ret; - - ret = bch2_subvolume_get(trans, subvol, false, 0, &s); + int ret = bch2_subvolume_get(trans, subvol, false, 0, &s); *snapshot = le32_to_cpu(s.snapshot); *inum = le64_to_cpu(s.inode); @@ -158,9 +156,10 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, 
pos, BTREE_ITER_INTENT); - ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, &iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash_info, &iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); bch2_trans_iter_exit(trans, &iter); err: bch_err_fn(c, ret); @@ -169,7 +168,8 @@ err: /* Get lost+found, create if it doesn't exist: */ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, - struct bch_inode_unpacked *lostfound) + struct bch_inode_unpacked *lostfound, + u64 reattaching_inum) { struct bch_fs *c = trans->c; struct qstr lostfound_str = QSTR("lost+found"); @@ -184,19 +184,36 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, return ret; subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; - u32 subvol_snapshot; - ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol), - &subvol_snapshot, &root_inum.inum); - bch_err_msg(c, ret, "looking up root subvol"); + struct bch_subvolume subvol; + ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), + false, 0, &subvol); + bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u", + le32_to_cpu(st.master_subvol), snapshot); if (ret) return ret; + if (!subvol.inode) { + struct btree_iter iter; + struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)), + 0, subvolume); + ret = PTR_ERR_OR_ZERO(subvol); + if (ret) + return ret; + + subvol->v.inode = cpu_to_le64(reattaching_inum); + bch2_trans_iter_exit(trans, &iter); + } + + root_inum.inum = le64_to_cpu(subvol.inode); + struct bch_inode_unpacked root_inode; struct bch_hash_info root_hash_info; u32 root_inode_snapshot = snapshot; ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot); - bch_err_msg(c, ret, "looking up root inode"); + bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", + 
root_inum.inum, le32_to_cpu(st.master_subvol)); if (ret) return ret; @@ -292,7 +309,7 @@ static int reattach_inode(struct btree_trans *trans, snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); } - ret = lookup_lostfound(trans, dirent_snapshot, &lostfound); + ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum); if (ret) return ret; @@ -363,6 +380,112 @@ static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume return ret; } +static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum) +{ + struct bch_fs *c = trans->c; + + if (!bch2_snapshot_is_leaf(c, snapshotid)) { + bch_err(c, "need to reconstruct subvol, but have interior node snapshot"); + return -BCH_ERR_fsck_repair_unimplemented; + } + + /* + * If inum isn't set, that means we're being called from check_dirents, + * not check_inodes - the root of this subvolume doesn't exist or we + * would have found it there: + */ + if (!inum) { + struct btree_iter inode_iter = {}; + struct bch_inode_unpacked new_inode; + u64 cpu = raw_smp_processor_id(); + + bch2_inode_init_early(c, &new_inode); + bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); + + new_inode.bi_subvol = subvolid; + + int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?: + bch2_btree_iter_traverse(&inode_iter) ?: + bch2_inode_write(trans, &inode_iter, &new_inode); + bch2_trans_iter_exit(trans, &inode_iter); + if (ret) + return ret; + + inum = new_inode.bi_inum; + } + + bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum); + + struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); + int ret = PTR_ERR_OR_ZERO(new_subvol); + if (ret) + return ret; + + bkey_subvolume_init(&new_subvol->k_i); + new_subvol->k.p.offset = subvolid; + new_subvol->v.snapshot = cpu_to_le32(snapshotid); + new_subvol->v.inode = cpu_to_le64(inum); + ret = 
bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0); + if (ret) + return ret; + + struct btree_iter iter; + struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshots, POS(0, snapshotid), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(s); + bch_err_msg(c, ret, "getting snapshot %u", snapshotid); + if (ret) + return ret; + + u32 snapshot_tree = le32_to_cpu(s->v.tree); + + s->v.subvol = cpu_to_le32(subvolid); + SET_BCH_SNAPSHOT_SUBVOL(&s->v, true); + bch2_trans_iter_exit(trans, &iter); + + struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshot_trees, POS(0, snapshot_tree), + 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(st); + bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree); + if (ret) + return ret; + + if (!st->v.master_subvol) + st->v.master_subvol = cpu_to_le32(subvolid); + + bch2_trans_iter_exit(trans, &iter); + return 0; +} + +static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked new_inode; + + bch2_inode_init_early(c, &new_inode); + bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, mode|0755, 0, NULL); + new_inode.bi_size = size; + new_inode.bi_inum = inum; + + return __bch2_fsck_write_inode(trans, &new_inode, snapshot); +} + +static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum) +{ + struct btree_iter iter = {}; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); + struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter); + bch2_trans_iter_exit(trans, &iter); + int ret = bkey_err(k); + if (ret) + return ret; + + return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG); +} + struct snapshots_seen_entry { u32 id; u32 equiv; @@ -1064,6 +1187,11 @@ static int check_inode(struct btree_trans *trans, if (ret && !bch2_err_matches(ret, ENOENT)) goto err; + if (ret && 
(c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { + ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum); + goto do_update; + } + if (fsck_err_on(ret, c, inode_bi_subvol_missing, "inode %llu:%u bi_subvol points to missing subvolume %u", @@ -1081,7 +1209,7 @@ static int check_inode(struct btree_trans *trans, do_update = true; } } - +do_update: if (do_update) { ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck updating inode"); @@ -1130,8 +1258,8 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal i->count = count2; if (i->count != count2) { - bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->snapshot, i->count, count2); + bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", + w->last_pos.inode, i->snapshot, i->count, count2); return -BCH_ERR_internal_fsck_err; } @@ -1371,10 +1499,6 @@ static int check_overlapping_extents(struct btree_trans *trans, goto err; } - ret = extent_ends_at(c, extent_ends, seen, k); - if (ret) - goto err; - extent_ends->last_pos = k.k->p; err: return ret; @@ -1438,6 +1562,17 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto err; if (k.k->type != KEY_TYPE_whiteout) { + if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { + ret = reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + + inode->last_pos.inode--; + ret = -BCH_ERR_transaction_restart_nested; + goto err; + } + if (fsck_err_on(!i, c, extent_in_missing_inode, "extent in missing inode:\n %s", (printbuf_reset(&buf), @@ -1504,6 +1639,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, i->seen_this_pos = true; } + + if (k.k->type != KEY_TYPE_whiteout) { + ret = extent_ends_at(c, extent_ends, s, k); + if (ret) + 
goto err; + } out: err: fsck_err: @@ -1584,8 +1725,8 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ return count2; if (i->count != count2) { - bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", - i->count, count2); + bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", + w->last_pos.inode, i->snapshot, i->count, count2); i->count = count2; if (i->inode.bi_nlink == i->count) continue; @@ -1782,6 +1923,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); u32 parent_snapshot; + u32 new_parent_subvol = 0; u64 parent_inum; struct printbuf buf = PRINTBUF; int ret = 0; @@ -1790,6 +1932,27 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * if (ret && !bch2_err_matches(ret, ENOENT)) return ret; + if (ret || + (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) { + int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); + if (ret2 && !bch2_err_matches(ret2, ENOENT)) + return ret2; + } + + if (ret && + !new_parent_subvol && + (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { + /* + * Couldn't find a subvol for dirent's snapshot - but we lost + * subvols, so we need to reconstruct: + */ + ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0); + if (ret) + return ret; + + parent_snapshot = d.k->p.snapshot; + } + if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol, "dirent parent_subvol points to missing subvolume\n%s", (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || @@ -1798,10 +1961,10 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", parent_snapshot, (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { - u32 
new_parent_subvol; - ret = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); - if (ret) - goto err; + if (!new_parent_subvol) { + bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot); + return -BCH_ERR_fsck_repair_unimplemented; + } struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); ret = PTR_ERR_OR_ZERO(new_dirent); @@ -1847,9 +2010,16 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; + goto err; - if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol, + if (ret) { + bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); + ret = -BCH_ERR_fsck_repair_unimplemented; + ret = 0; + goto err; + } + + if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol, c, inode_bi_parent_wrong, "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", target_inum, @@ -1857,13 +2027,13 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * subvol_root.bi_parent_subvol = parent_subvol; ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); if (ret) - return ret; + goto err; } ret = check_dirent_target(trans, iter, d, &subvol_root, target_snapshot); if (ret) - return ret; + goto err; out: err: fsck_err: @@ -1880,7 +2050,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct snapshots_seen *s) { struct bch_fs *c = trans->c; - struct bkey_s_c_dirent d; struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; struct bpos equiv; @@ -1919,6 +2088,17 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); dir->first_this_inode = false; + if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { + ret = 
reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + + dir->last_pos.inode--; + ret = -BCH_ERR_transaction_restart_nested; + goto err; + } + if (fsck_err_on(!i, c, dirent_in_missing_dir_inode, "dirent in nonexisting directory:\n%s", (printbuf_reset(&buf), @@ -1953,7 +2133,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (k.k->type != KEY_TYPE_dirent) goto out; - d = bkey_s_c_to_dirent(k); + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); if (d.v->d_type == DT_SUBVOL) { ret = check_dirent_to_subvol(trans, iter, d); diff --git a/libbcachefs/logged_ops.c b/libbcachefs/logged_ops.c index 9fac838d..b82f8209 100644 --- a/libbcachefs/logged_ops.c +++ b/libbcachefs/logged_ops.c @@ -37,7 +37,6 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type); struct bkey_buf sk; u32 restart_count = trans->restart_count; - int ret; if (!fn) return 0; @@ -45,11 +44,11 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, bch2_bkey_buf_init(&sk); bch2_bkey_buf_reassemble(&sk, c, k); - ret = drop_locks_do(trans, (bch2_fs_lazy_rw(c), 0)) ?: - fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count); + fn->resume(trans, sk.k); bch2_bkey_buf_exit(&sk, c); - return ret; + + return trans_was_restarted(trans, restart_count); } int bch2_resume_logged_ops(struct bch_fs *c) diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index 08ea0cfc..e1800c41 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -7,6 +7,7 @@ #include "disk_groups.h" #include "error.h" #include "opts.h" +#include "recovery_passes.h" #include "super-io.h" #include "util.h" @@ -205,6 +206,9 @@ const struct bch_option bch2_opt_table[] = { #define OPT_STR(_choices) .type = BCH_OPT_STR, \ .min = 0, .max = ARRAY_SIZE(_choices), \ .choices = _choices 
+#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \ + .min = 0, .max = U64_MAX, \ + .choices = _choices #define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn #define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 9da66395..1ac4135c 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -362,7 +362,12 @@ enum fsck_err_opts { OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ - NULL, "Don't replay the journal") \ + NULL, "Exit recovery immediately prior to journal replay")\ + x(recovery_pass_last, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_STR_NOLIMIT(bch2_recovery_passes), \ + BCH2_NO_SB_OPT, 0, \ + NULL, "Exit recovery after specified pass") \ x(retain_recovery_info, u8, \ 0, \ OPT_BOOL(), \ diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index b2b8fdf2..b76c1615 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -33,6 +33,20 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } +void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) +{ + u64 b = BIT_ULL(btree); + + if (!(c->sb.btrees_lost_data & b)) { + bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree)); + + mutex_lock(&c->sb_lock); + bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } +} + static bool btree_id_is_alloc(enum btree_id id) { switch (id) { @@ -272,7 +286,8 @@ int bch2_journal_replay(struct bch_fs *c) bch2_trans_put(trans); trans = NULL; - if (!c->opts.retain_recovery_info) + if (!c->opts.retain_recovery_info && + c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) bch2_journal_keys_put_initial(c); replay_now_at(j, j->replay_journal_seq_end); @@ -468,8 +483,8 @@ static int read_btree_roots(struct bch_fs *c) c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); } - set_bit(i, &c->btrees_lost_data); ret = 0; + bch2_btree_lost_data(c, i); } } @@ -590,27 
+605,14 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (c->opts.fsck && c->opts.norecovery) { - bch_err(c, "cannot select both norecovery and fsck"); - ret = -EINVAL; - goto err; - } - - c->opts.retain_recovery_info |= c->opts.norecovery; - c->opts.nochanges |= c->opts.norecovery; + if (c->opts.norecovery) + c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1; if (!c->opts.nochanges) { mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); bool write_sb = false; - struct bch_sb_field_ext *ext = - bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64)); - if (!ext) { - ret = -BCH_ERR_ENOSPC_sb; - mutex_unlock(&c->sb_lock); - goto err; - } - if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) { ext->recovery_passes_required[0] |= cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology))); @@ -841,6 +843,7 @@ use_clean: } mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); bool write_sb = false; if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) { @@ -854,15 +857,18 @@ use_clean: write_sb = true; } - if (!test_bit(BCH_FS_error, &c->flags)) { - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - if (ext && - (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) || - !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) { - memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required)); - memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); - write_sb = true; - } + if (!test_bit(BCH_FS_error, &c->flags) && + !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) { + memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); + write_sb = true; + } + + if (c->opts.fsck && + !test_bit(BCH_FS_error, &c->flags) && + c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 && + ext->btrees_lost_data) { + 
ext->btrees_lost_data = 0; + write_sb = true; } if (c->opts.fsck && @@ -932,6 +938,7 @@ int bch2_fs_initialize(struct bch_fs *c) int ret; bch_notice(c, "initializing new filesystem"); + set_bit(BCH_FS_new_fs, &c->flags); mutex_lock(&c->sb_lock); c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); diff --git a/libbcachefs/recovery.h b/libbcachefs/recovery.h index 3962fd87..4bf818de 100644 --- a/libbcachefs/recovery.h +++ b/libbcachefs/recovery.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H +void bch2_btree_lost_data(struct bch_fs *, enum btree_id); + int bch2_journal_replay(struct bch_fs *); int bch2_fs_recovery(struct bch_fs *); diff --git a/libbcachefs/recovery_passes.c b/libbcachefs/recovery_passes.c index 6fa095fe..cb501460 100644 --- a/libbcachefs/recovery_passes.c +++ b/libbcachefs/recovery_passes.c @@ -17,6 +17,7 @@ #include "snapshot.h" #include "subvolume.h" #include "super.h" +#include "super-io.h" const char * const bch2_recovery_passes[] = { #define x(_fn, ...) #_fn, @@ -27,7 +28,7 @@ const char * const bch2_recovery_passes[] = { static int bch2_check_allocations(struct bch_fs *c) { - return bch2_gc(c, true, c->opts.norecovery); + return bch2_gc(c, true, false); } static int bch2_set_may_go_rw(struct bch_fs *c) @@ -59,18 +60,23 @@ static struct recovery_pass_fn recovery_pass_fns[] = { #undef x }; -u64 bch2_recovery_passes_to_stable(u64 v) -{ - static const u8 map[] = { +static const u8 passes_to_stable_map[] = { #define x(n, id, ...) 
[BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, BCH_RECOVERY_PASSES() #undef x - }; +}; +static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) +{ + return passes_to_stable_map[pass]; +} + +u64 bch2_recovery_passes_to_stable(u64 v) +{ u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(map); i++) + for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) if (v & BIT_ULL(i)) - ret |= BIT_ULL(map[i]); + ret |= BIT_ULL(passes_to_stable_map[i]); return ret; } @@ -113,6 +119,38 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, } } +int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + if (!test_bit_le64(s, ext->recovery_passes_required)) { + __set_bit_le64(s, ext->recovery_passes_required); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + + return bch2_run_explicit_recovery_pass(c, pass); +} + +static void bch2_clear_recovery_pass_required(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + if (test_bit_le64(s, ext->recovery_passes_required)) { + __clear_bit_le64(s, ext->recovery_passes_required); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); +} + u64 bch2_fsck_recovery_passes(void) { u64 ret = 0; @@ -127,8 +165,6 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa { struct recovery_pass_fn *p = recovery_pass_fns + pass; - if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) - return false; if (c->recovery_passes_explicit & BIT_ULL(pass)) return true; if ((p->when & PASS_FSCK) && c->opts.fsck) @@ -184,6 +220,10 @@ int bch2_run_recovery_passes(struct 
bch_fs *c) int ret = 0; while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + if (c->opts.recovery_pass_last && + c->curr_recovery_pass > c->opts.recovery_pass_last) + break; + if (should_run_recovery_pass(c, c->curr_recovery_pass)) { unsigned pass = c->curr_recovery_pass; @@ -196,8 +236,13 @@ int bch2_run_recovery_passes(struct bch_fs *c) c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); } - c->curr_recovery_pass++; + c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); + + if (!test_bit(BCH_FS_error, &c->flags)) + bch2_clear_recovery_pass_required(c, c->curr_recovery_pass); + + c->curr_recovery_pass++; } return ret; diff --git a/libbcachefs/recovery_passes.h b/libbcachefs/recovery_passes.h index abefa677..99b464e1 100644 --- a/libbcachefs/recovery_passes.h +++ b/libbcachefs/recovery_passes.h @@ -9,6 +9,7 @@ u64 bch2_recovery_passes_from_stable(u64 v); u64 bch2_fsck_recovery_passes(void); int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); +int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass); int bch2_run_online_recovery_passes(struct bch_fs *); int bch2_run_recovery_passes(struct bch_fs *); diff --git a/libbcachefs/recovery_passes_types.h b/libbcachefs/recovery_passes_types.h index 840542cf..773aea9a 100644 --- a/libbcachefs/recovery_passes_types.h +++ b/libbcachefs/recovery_passes_types.h @@ -32,6 +32,7 @@ x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ x(bucket_gens_init, 17, 0) \ + x(reconstruct_snapshots, 38, 0) \ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ diff --git a/libbcachefs/sb-errors_types.h b/libbcachefs/sb-errors_types.h index 2fec03a2..d7d60913 100644 --- a/libbcachefs/sb-errors_types.h +++ b/libbcachefs/sb-errors_types.h @@ -268,7 +268,9 @@ x(btree_node_bkey_bad_u64s, 260) \ 
x(btree_node_topology_empty_interior_node, 261) \ x(btree_ptr_v2_min_key_bad, 262) \ - x(btree_root_unreadable_and_scan_found_nothing, 263) + x(btree_root_unreadable_and_scan_found_nothing, 263) \ + x(snapshot_node_missing, 264) \ + x(dup_backpointer_to_bad_csum_extent, 265) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c index 4e074136..0e806f04 100644 --- a/libbcachefs/snapshot.c +++ b/libbcachefs/snapshot.c @@ -8,6 +8,7 @@ #include "errcode.h" #include "error.h" #include "fs.h" +#include "recovery_passes.h" #include "snapshot.h" #include @@ -131,7 +132,7 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) rcu_read_lock(); struct snapshot_table *t = rcu_dereference(c->snapshots); - if (unlikely(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots)) { + if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) { ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor); goto out; } @@ -574,6 +575,13 @@ static int check_snapshot_tree(struct btree_trans *trans, u32 subvol_id; ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); + bch_err_fn(c, ret); + + if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */ + ret = 0; + goto err; + } + if (ret) goto err; @@ -731,7 +739,6 @@ static int check_snapshot(struct btree_trans *trans, u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); u32 real_depth; struct printbuf buf = PRINTBUF; - bool should_have_subvol; u32 i, id; int ret = 0; @@ -777,7 +784,7 @@ static int check_snapshot(struct btree_trans *trans, } } - should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && + bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && !BCH_SNAPSHOT_DELETED(&s); if (should_have_subvol) { @@ -879,6 +886,154 @@ int bch2_check_snapshots(struct bch_fs *c) return ret; } +static int check_snapshot_exists(struct btree_trans *trans, u32 id) +{ + struct bch_fs *c = trans->c; + + if (bch2_snapshot_equiv(c, 
id)) + return 0; + + u32 tree_id; + int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); + if (ret) + return ret; + + struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot)); + ret = PTR_ERR_OR_ZERO(snapshot); + if (ret) + return ret; + + bkey_snapshot_init(&snapshot->k_i); + snapshot->k.p = POS(0, id); + snapshot->v.tree = cpu_to_le32(tree_id); + snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c)); + + return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: + bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, + bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?: + bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i)); +} + +/* Figure out which snapshot nodes belong in the same tree: */ +struct snapshot_tree_reconstruct { + enum btree_id btree; + struct bpos cur_pos; + snapshot_id_list cur_ids; + DARRAY(snapshot_id_list) trees; +}; + +static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r) +{ + darray_for_each(r->trees, i) + darray_exit(i); + darray_exit(&r->trees); + darray_exit(&r->cur_ids); +} + +static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos) +{ + return r->btree == BTREE_ID_inodes + ? 
r->cur_pos.offset == pos.offset + : r->cur_pos.inode == pos.inode; +} + +static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r) +{ + darray_for_each(*l, i) + if (snapshot_list_has_id(r, *i)) + return true; + return false; +} + +static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s) +{ + bool first = true; + darray_for_each(*s, i) { + if (!first) + prt_char(out, ' '); + first = false; + prt_printf(out, "%u", *i); + } +} + +static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r) +{ + if (r->cur_ids.nr) { + darray_for_each(r->trees, i) + if (snapshot_id_lists_have_common(i, &r->cur_ids)) { + int ret = snapshot_list_merge(c, i, &r->cur_ids); + if (ret) + return ret; + goto out; + } + darray_push(&r->trees, r->cur_ids); + darray_init(&r->cur_ids); + } +out: + r->cur_ids.nr = 0; + return 0; +} + +static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos) +{ + if (!same_snapshot(r, pos)) + snapshot_tree_reconstruct_next(c, r); + r->cur_pos = pos; + return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot); +} + +int bch2_reconstruct_snapshots(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct printbuf buf = PRINTBUF; + struct snapshot_tree_reconstruct r = {}; + int ret = 0; + + for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { + if (btree_type_has_snapshots(btree)) { + r.btree = btree; + + ret = for_each_btree_key(trans, iter, btree, POS_MIN, + BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_PREFETCH, k, ({ + get_snapshot_trees(c, &r, k.k->p); + })); + if (ret) + goto err; + + snapshot_tree_reconstruct_next(c, &r); + } + } + + darray_for_each(r.trees, t) { + printbuf_reset(&buf); + snapshot_id_list_to_text(&buf, t); + + darray_for_each(*t, id) { + if (fsck_err_on(!bch2_snapshot_equiv(c, *id), + c, snapshot_node_missing, + "snapshot node %u from tree %s missing", *id, buf.buf)) { + if (t->nr > 1) { + 
bch_err(c, "cannot reconstruct snapshot trees with multiple nodes"); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; + } + + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_snapshot_exists(trans, *id)); + if (ret) + goto err; + } + } + } +fsck_err: +err: + bch2_trans_put(trans); + snapshot_tree_reconstruct_exit(&r); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + /* * Mark a snapshot as deleted, for future cleanup: */ @@ -1689,6 +1844,20 @@ int bch2_snapshots_read(struct bch_fs *c) POS_MIN, 0, k, (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); bch_err_fn(c, ret); + + /* + * It's important that we check if we need to reconstruct snapshots + * before going RW, so we mark that pass as required in the superblock - + * otherwise, we could end up deleting keys with missing snapshot nodes + * instead + */ + BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) && + test_bit(BCH_FS_may_go_rw, &c->flags)); + + if (bch2_err_matches(ret, EIO) || + (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))) + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots); + return ret; } diff --git a/libbcachefs/snapshot.h b/libbcachefs/snapshot.h index 331f20fd..b7d2fed3 100644 --- a/libbcachefs/snapshot.h +++ b/libbcachefs/snapshot.h @@ -209,15 +209,34 @@ static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) { - int ret; - BUG_ON(snapshot_list_has_id(s, id)); - ret = darray_push(s, id); + int ret = darray_push(s, id); if (ret) bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); return ret; } +static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id) +{ + int ret = snapshot_list_has_id(s, id) + ? 
0 + : darray_push(s, id); + if (ret) + bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); + return ret; +} + +static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src) +{ + darray_for_each(*src, i) { + int ret = snapshot_list_add_nodup(c, dst, *i); + if (ret) + return ret; + } + + return 0; +} + int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot *s); int bch2_snapshot_get_subvol(struct btree_trans *, u32, @@ -229,6 +248,7 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, int bch2_check_snapshot_trees(struct bch_fs *); int bch2_check_snapshots(struct bch_fs *); +int bch2_reconstruct_snapshots(struct bch_fs *); int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); void bch2_delete_dead_snapshots_work(struct work_struct *); diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 61fc0ce7..e0aa3655 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -142,8 +142,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb, void bch2_free_super(struct bch_sb_handle *sb) { kfree(sb->bio); - if (!IS_ERR_OR_NULL(sb->bdev_handle)) - bdev_release(sb->bdev_handle); + if (!IS_ERR_OR_NULL(sb->s_bdev_file)) + fput(sb->s_bdev_file); kfree(sb->holder); kfree(sb->sb_name); @@ -527,9 +527,11 @@ static void bch2_sb_update(struct bch_fs *c) memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent)); struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext); - if (ext) + if (ext) { le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent, sizeof(c->sb.errors_silent) * 8); + c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data); + } for_each_member_device(c, ca) { struct bch_member m = bch2_sb_member_get(src, ca->dev_idx); @@ -712,23 +714,23 @@ retry: if (!opt_get(*opts, nochanges)) sb->mode |= BLK_OPEN_WRITE; - sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (IS_ERR(sb->bdev_handle) && - 
PTR_ERR(sb->bdev_handle) == -EACCES && + sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (IS_ERR(sb->s_bdev_file) && + PTR_ERR(sb->s_bdev_file) == -EACCES && opt_get(*opts, read_only)) { sb->mode &= ~BLK_OPEN_WRITE; - sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (!IS_ERR(sb->bdev_handle)) + sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (!IS_ERR(sb->s_bdev_file)) opt_set(*opts, nochanges, true); } - if (IS_ERR(sb->bdev_handle)) { - ret = PTR_ERR(sb->bdev_handle); + if (IS_ERR(sb->s_bdev_file)) { + ret = PTR_ERR(sb->s_bdev_file); prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret)); goto err; } - sb->bdev = sb->bdev_handle->bdev; + sb->bdev = file_bdev(sb->s_bdev_file); ret = bch2_sb_realloc(sb, 0); if (ret) { @@ -1162,6 +1164,11 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb, kfree(errors_silent); } + + prt_printf(out, "Btrees with missing data:"); + prt_tab(out); + prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data)); + prt_newline(out); } static const struct bch_sb_field_ops bch_sb_field_ops_ext = { diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 6269bec9..ed63018f 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -366,7 +366,7 @@ void bch2_fs_read_only(struct bch_fs *c) !test_bit(BCH_FS_emergency_ro, &c->flags) && test_bit(BCH_FS_started, &c->flags) && test_bit(BCH_FS_clean_shutdown, &c->flags) && - !c->opts.norecovery) { + c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) { BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); BUG_ON(atomic_read(&c->btree_cache.dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); @@ -511,7 +511,8 @@ err: int bch2_fs_read_write(struct bch_fs *c) { - if (c->opts.norecovery) + if (c->opts.recovery_pass_last && + c->opts.recovery_pass_last < 
BCH_RECOVERY_PASS_journal_replay) return -BCH_ERR_erofs_norecovery; if (c->opts.nochanges) @@ -1018,8 +1019,16 @@ int bch2_fs_start(struct bch_fs *c) for_each_online_member(c, ca) bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); + struct bch_sb_field_ext *ext = + bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64)); mutex_unlock(&c->sb_lock); + if (!ext) { + bch_err(c, "insufficient space in superblock for sb_field_ext"); + ret = -BCH_ERR_ENOSPC_sb; + goto err; + } + for_each_rw_member(c, ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h index 0e5a14fc..ec784d97 100644 --- a/libbcachefs/super_types.h +++ b/libbcachefs/super_types.h @@ -4,7 +4,7 @@ struct bch_sb_handle { struct bch_sb *sb; - struct bdev_handle *bdev_handle; + struct file *s_bdev_file; struct block_device *bdev; char *sb_name; struct bio *bio; diff --git a/libbcachefs/util.h b/libbcachefs/util.h index b4621b68..b7e7c292 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -793,4 +793,14 @@ static inline void __set_bit_le64(size_t bit, __le64 *addr) addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64)); } +static inline void __clear_bit_le64(size_t bit, __le64 *addr) +{ + addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64)); +} + +static inline bool test_bit_le64(size_t bit, __le64 *addr) +{ + return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0; +} + #endif /* _BCACHEFS_UTIL_H */ diff --git a/linux/blkdev.c b/linux/blkdev.c index b7f58737..54122955 100644 --- a/linux/blkdev.c +++ b/linux/blkdev.c @@ -162,16 +162,18 @@ sector_t get_capacity(struct gendisk *disk) return bytes >> 9; } -void bdev_release(struct bdev_handle *handle) +void fput(struct file *file) { - fdatasync(handle->bdev->bd_fd); - close(handle->bdev->bd_fd); - free(handle->bdev); - free(handle); + struct block_device *bdev = file_bdev(file); + + fdatasync(bdev->bd_fd); + close(bdev->bd_fd); + 
free(bdev); + free(file); } -struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, - void *holder, const struct blk_holder_ops *hop) +struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, + void *holder, const struct blk_holder_ops *hop) { int fd, flags = 0; @@ -204,13 +206,12 @@ struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, bdev->bd_disk = &bdev->__bd_disk; bdev->bd_disk->bdi = &bdev->bd_disk->__bdi; bdev->queue.backing_dev_info = bdev->bd_disk->bdi; + bdev->bd_inode = &bdev->__bd_inode; - struct bdev_handle *handle = calloc(sizeof(*handle), 1); - handle->bdev = bdev; - handle->holder = holder; - handle->mode = mode; + struct file *file = calloc(1, sizeof(*file)); + file->f_inode = bdev->bd_inode; - return handle; + return file; } int lookup_bdev(const char *path, dev_t *dev)