From 1cf4d51dc4661f336f5318c176a3561ddf5bf04f Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Thu, 21 Dec 2017 18:00:30 -0500
Subject: [PATCH] Update bcachefs sources to 14ce2a2031 bcachefs: fixes for
 building in userspace

---
 .bcachefs_revision                  |   2 +-
 cmd_debug.c                         |   6 +-
 cmd_fsck.c                          |   7 +-
 cmd_migrate.c                       |  12 +-
 include/linux/bio.h                 |  14 +-
 include/linux/blk_types.h           |  24 +-
 include/linux/blkdev.h              |   3 +
 include/linux/bug.h                 |   2 +-
 include/linux/time64.h              |  15 +
 libbcachefs/acl.c                   |   3 +-
 libbcachefs/alloc.c                 |  18 +-
 libbcachefs/bcachefs.h              |  15 +-
 libbcachefs/bcachefs_format.h       |  37 +-
 libbcachefs/bkey.c                  |  37 +-
 libbcachefs/bkey.h                  |   4 +-
 libbcachefs/bkey_methods.c          | 127 +++--
 libbcachefs/bkey_methods.h          |  16 +-
 libbcachefs/btree_gc.c              |  21 +-
 libbcachefs/btree_io.c              | 141 ++++--
 libbcachefs/btree_iter.c            |  23 +-
 libbcachefs/btree_locking.h         |   3 +-
 libbcachefs/btree_types.h           |   2 +
 libbcachefs/btree_update.h          |   4 +-
 libbcachefs/btree_update_interior.c | 192 ++++----
 libbcachefs/buckets.c               |   6 +-
 libbcachefs/buckets_types.h         |   6 +-
 libbcachefs/chardev.c               |   5 +-
 libbcachefs/checksum.h              |  13 +-
 libbcachefs/error.c                 |  10 +-
 libbcachefs/error.h                 |   9 +-
 libbcachefs/extents.c               |  73 ++-
 libbcachefs/extents.h               |   9 +-
 libbcachefs/fs-io.c                 | 643 +++++++++++------------
 libbcachefs/fs-ioctl.c              |   8 +-
 libbcachefs/fs.c                    |  44 +-
 libbcachefs/fs.h                    |  17 +-
 libbcachefs/fsck.c                  |  16 +-
 libbcachefs/inode.c                 |  29 +-
 libbcachefs/inode.h                 |  44 +-
 libbcachefs/io.c                    | 128 +++--
 libbcachefs/io.h                    |  50 +-
 libbcachefs/io_types.h              |   8 +-
 libbcachefs/journal.c               | 119 ++++-
 libbcachefs/journal.h               |   4 +
 libbcachefs/journal_types.h         |   1 +
 libbcachefs/migrate.c               | 199 +++++---
 libbcachefs/migrate.h               |   5 +-
 libbcachefs/move.c                  |  36 +-
 libbcachefs/opts.c                  |  71 ++-
 libbcachefs/opts.h                  |  28 +-
 libbcachefs/super-io.c              | 736 ++++++++++++++--------------
 libbcachefs/super-io.h              |  28 +-
 libbcachefs/super.c                 | 193 ++++----
 libbcachefs/super.h                 |  30 +-
 libbcachefs/sysfs.c                 |   2 +-
 libbcachefs/tier.c                  |   3 +-
 libbcachefs/util.h                  |   6 +-
 libbcachefs/vstructs.h              |   8 +-
 libbcachefs/xattr.c                 | 133 ++++-
 linux/bio.c                         |  58 ++-
 linux/blkdev.c                      |  10 +-
 61 files changed, 2074 insertions(+), 1442 deletions(-)

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 77247162..03838458 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-e57b5958cf4e8530d26f7c36a6e1427fb284cc70
+14ce2a2031f3761a4b957aa2e5aac446ce18b87c

diff --git a/cmd_debug.c b/cmd_debug.c
index b1bdda8c..1a2c1dbd 100644
--- a/cmd_debug.c
+++ b/cmd_debug.c
@@ -293,11 +293,11 @@ int cmd_list(int argc, char *argv[])
 					    list_modes, "list mode");
 			break;
 		case 'f':
-			opts.fix_errors = FSCK_ERR_YES;
-			opts.norecovery = false;
+			opt_set(opts, fix_errors, FSCK_OPT_YES);
+			opt_set(opts, norecovery, false);
 			break;
 		case 'v':
-			opts.verbose_recovery = true;
+			opt_set(opts, verbose_recovery, true);
 			break;
 		case 'h':
 			list_keys_usage();

diff --git a/cmd_fsck.c b/cmd_fsck.c
index 9b01524a..556a4e1b 100644
--- a/cmd_fsck.c
+++ b/cmd_fsck.c
@@ -28,18 +28,19 @@ int cmd_fsck(int argc, char *argv[])
 	int opt;
 
 	opt_set(opts, degraded, true);
+	opt_set(opts, fix_errors, FSCK_OPT_ASK);
 
 	while ((opt = getopt(argc, argv, "pynfvh")) != -1)
 		switch (opt) {
 		case 'p':
-			opt_set(opts, fix_errors, FSCK_ERR_YES);
+			opt_set(opts, fix_errors, FSCK_OPT_YES);
 			break;
 		case 'y':
-			opt_set(opts, fix_errors, FSCK_ERR_YES);
+			opt_set(opts, fix_errors, FSCK_OPT_YES);
 			break;
 		case 'n':
 			opt_set(opts, nochanges, true);
-			opt_set(opts, fix_errors, FSCK_ERR_NO);
+			opt_set(opts, fix_errors, FSCK_OPT_NO);
 			break;
 		case 'f':
 			/* force check, even if filesystem marked clean: */

diff --git a/cmd_migrate.c b/cmd_migrate.c
index 58c0bb96..f46a09dd 100644
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -164,7 +164,7 @@ static struct bch_inode_unpacked create_file(struct bch_fs *c,
 	struct bch_inode_unpacked new_inode;
 	int ret;
 
-	bch2_inode_init(c, &new_inode, uid, gid, mode, rdev);
+	bch2_inode_init(c, &new_inode, uid, gid, mode, rdev, parent);
 
 	ret = bch2_inode_create(c, &new_inode, BLOCKDEV_INODE_MAX, 0,
 				&c->unused_inode_hint);
@@ -247,7 +247,6 @@ static void write_data(struct bch_fs *c,
 		       struct bch_inode_unpacked *dst_inode,
 		       u64 dst_offset, void *buf, size_t len)
 {
-	struct disk_reservation res;
 	struct bch_write_op op;
 	struct bio_vec bv;
 	struct closure cl;
@@ -261,12 +260,15 @@ static void write_data(struct bch_fs *c,
 	op.wbio.bio.bi_iter.bi_size = len;
 	bch2_bio_map(&op.wbio.bio, buf);
 
-	int ret = bch2_disk_reservation_get(c, &res, len >> 9, 0);
+	bch2_write_op_init(&op, c);
+
+	op.write_point	= writepoint_hashed(0);
+	op.pos		= POS(dst_inode->bi_inum, dst_offset >> 9);
+
+	int ret = bch2_disk_reservation_get(c, &op.res, len >> 9, 0);
 	if (ret)
 		die("error reserving space in new filesystem: %s", strerror(-ret));
 
-	bch2_write_op_init(&op, c, res, NULL, writepoint_hashed(0),
-			   POS(dst_inode->bi_inum, dst_offset >> 9), NULL, 0);
 	closure_call(&op.cl, bch2_write, NULL, &cl);
 	closure_sync(&cl);

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 10cad5cc..7293eef0 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -243,7 +243,8 @@ static inline void bioset_free(struct bio_set *bs)
 
 static inline int bioset_init(struct bio_set *bs,
 			      unsigned pool_size,
-			      unsigned front_pad)
+			      unsigned front_pad,
+			      int flags)
 {
 	bs->front_pad = front_pad;
 	return 0;
@@ -251,6 +252,10 @@ static inline int bioset_init(struct bio_set *bs,
 
 extern struct bio_set *bioset_create(unsigned int, unsigned int);
 extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int);
+enum {
+	BIOSET_NEED_BVECS	= 1 << 0,
+	BIOSET_NEED_RESCUER	= 1 << 1,
+};
 
 extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
 extern void bio_put(struct bio *);
@@ -271,13 +276,6 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
 }
 
 extern void bio_endio(struct bio *);
-extern void bio_endio_nodec(struct bio *);
-
-static inline void bio_io_error(struct bio *bio)
-{
-	bio->bi_error = -EIO;
-	bio_endio(bio);
-}
 
 extern void bio_advance(struct bio *, unsigned);
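A note on the bio.h shim above: bioset_init() now takes the flags argument (BIOSET_NEED_BVECS / BIOSET_NEED_RESCUER) that newer kernels require, although the userspace stub only records front_pad. A minimal sketch of a caller written against this shim; the pool size and front_pad value are illustrative assumptions, not taken from this patch:

static int example_init_bioset(struct bio_set *bs)
{
	/*
	 * BIOSET_NEED_BVECS asks the bioset to also back biovec
	 * allocations; BIOSET_NEED_RESCUER requests a rescuer
	 * workqueue. Both are effectively no-ops in this userspace
	 * shim, which only stores front_pad.
	 */
	return bioset_init(bs, 4, 0, BIOSET_NEED_BVECS);
}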
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 55179944..42cd0032 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -13,7 +13,27 @@ struct bio_set;
 struct bio;
 struct block_device;
 typedef void (bio_end_io_t) (struct bio *);
-typedef void (bio_destructor_t) (struct bio *);
+
+/*
+ * Block error status values. See block/blk-core:blk_errors for the details.
+ */
+typedef u8 __bitwise blk_status_t;
+#define	BLK_STS_OK 0
+#define BLK_STS_NOTSUPP		((__force blk_status_t)1)
+#define BLK_STS_TIMEOUT		((__force blk_status_t)2)
+#define BLK_STS_NOSPC		((__force blk_status_t)3)
+#define BLK_STS_TRANSPORT	((__force blk_status_t)4)
+#define BLK_STS_TARGET		((__force blk_status_t)5)
+#define BLK_STS_NEXUS		((__force blk_status_t)6)
+#define BLK_STS_MEDIUM		((__force blk_status_t)7)
+#define BLK_STS_PROTECTION	((__force blk_status_t)8)
+#define BLK_STS_RESOURCE	((__force blk_status_t)9)
+#define BLK_STS_IOERR		((__force blk_status_t)10)
+
+/* hack for device mapper, don't use elsewhere: */
+#define BLK_STS_DM_REQUEUE	((__force blk_status_t)11)
+
+#define BLK_STS_AGAIN		((__force blk_status_t)12)
 
 /*
  * main unit of I/O for the block layer and lower layers (ie drivers and
@@ -22,7 +42,7 @@ typedef void (bio_destructor_t) (struct bio *);
 struct bio {
 	struct bio		*bi_next;	/* request queue link */
 	struct block_device	*bi_bdev;
-	int			bi_error;
+	blk_status_t		bi_status;
 	unsigned int		bi_opf;		/* bottom bits req flags,
 						 * top bits REQ_OP. Use
 						 * accessors.

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f196c704..1d5581dc 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -197,5 +197,8 @@ static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx)
 
 #define capable(cap)		true
 
+int blk_status_to_errno(blk_status_t status);
+blk_status_t errno_to_blk_status(int errno);
+
 #endif /* __TOOLS_LINUX_BLKDEV_H */

diff --git a/include/linux/bug.h b/include/linux/bug.h
index 89cdd30d..e25568c8 100644
--- a/include/linux/bug.h
+++ b/include/linux/bug.h
@@ -14,7 +14,7 @@
 #define BUG()			do { assert(0); unreachable(); } while (0)
 #define BUG_ON(cond)		assert(!(cond))
 
-#define WARN_ON_ONCE(cond)	assert(!(cond))
+#define WARN_ON_ONCE(cond)	({ bool _r = (cond); if (_r) assert(0); _r; })
 #define WARN_ONCE(cond, msg)	({ bool _r = (cond); if (_r) assert(0); _r; })
 
 #define __WARN()		assert(0)

diff --git a/include/linux/time64.h b/include/linux/time64.h
index 2d9f8291..870bdef4 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -204,4 +204,19 @@ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns)
 extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
 					     const struct timespec64 rhs);
 
+static inline struct timespec timespec_trunc(struct timespec t, unsigned gran)
+{
+	/* Avoid division in the common cases 1 ns and 1 s. */
+	if (gran == 1) {
+		/* nothing */
+	} else if (gran == NSEC_PER_SEC) {
+		t.tv_nsec = 0;
+	} else if (gran > 1 && gran < NSEC_PER_SEC) {
+		t.tv_nsec -= t.tv_nsec % gran;
+	} else {
+		WARN(1, "illegal file time granularity: %u", gran);
+	}
+	return t;
+}
+
 #endif /* _LINUX_TIME64_H */

diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c
index 2632d21c..480941d6 100644
--- a/libbcachefs/acl.c
+++ b/libbcachefs/acl.c
@@ -193,8 +193,7 @@ int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
 	if (ret < 0)
 		return ret;
 	else {
-		inode->v.i_ctime =
-			current_fs_time(inode->v.i_sb);
+		inode->v.i_ctime = current_time(&inode->v);
 		mark_inode_dirty(&inode->v);
 		if (ret == 0)
 			acl = NULL;
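The alloc.c hunks below (and many later ones) replace raw c->devs[] indexing with bch_dev_bkey_exists(). That helper is not defined anywhere in this patch; based on its call sites, a plausible shape for it is the following sketch, an assumption rather than quoted source:

/*
 * Sketch only: an assertion-checked replacement for c->devs[idx], for
 * device indices that bkey validation has already guaranteed to be
 * present (so no NULL check is needed at the use site):
 */
static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c,
						  unsigned idx)
{
	EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
	return rcu_dereference_check(c->devs[idx], 1);
}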
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index d29d871a..29799df6 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -257,7 +257,7 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
 		return;
 
 	a = bkey_s_c_to_alloc(k);
-	ca = c->devs[a.k->p.inode];
+	ca = bch_dev_bkey_exists(c, a.k->p.inode);
 
 	if (a.k->p.offset >= ca->mi.nbuckets)
 		return;
@@ -305,10 +305,12 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
 			bch2_alloc_read_key(c, bkey_i_to_s_c(k));
 	}
 
+	mutex_lock(&c->bucket_lock);
 	for_each_member_device(ca, c, i) {
 		bch2_recalc_min_prio(c, ca, READ);
 		bch2_recalc_min_prio(c, ca, WRITE);
 	}
+	mutex_unlock(&c->bucket_lock);
 
 	return 0;
 }
@@ -368,7 +370,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
 	if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
 		return 0;
 
-	ca = c->devs[pos.inode];
+	ca = bch_dev_bkey_exists(c, pos.inode);
 
 	if (pos.offset >= ca->mi.nbuckets)
 		return 0;
@@ -461,7 +463,7 @@ static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
 
 /* Bucket heap / gen */
 
-void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
+static void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
 {
 	struct prio_clock *clock = &c->prio_clock[rw];
 	struct bucket *g;
@@ -975,7 +977,7 @@ static int bch2_allocator_thread(void *arg)
 
 void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
 {
-	struct bch_dev *ca = c->devs[ob->ptr.dev];
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 
 	spin_lock(&ob->lock);
 	bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), false,
@@ -1303,7 +1305,7 @@ static void writepoint_drop_ptrs(struct bch_fs *c,
 
 	for (i = wp->nr_ptrs - 1; i >= 0; --i) {
 		struct open_bucket *ob = wp->ptrs[i];
-		struct bch_dev *ca = c->devs[ob->ptr.dev];
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 
 		if (nr_ptrs_dislike && !test_bit(ob->ptr.dev, devs->d)) {
 			BUG_ON(ca->open_buckets_partial_nr >=
@@ -1331,7 +1333,7 @@ static void verify_not_stale(struct bch_fs *c, const struct write_point *wp)
 	unsigned i;
 
 	writepoint_for_each_ptr(wp, ob, i) {
-		struct bch_dev *ca = c->devs[ob->ptr.dev];
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 
 		BUG_ON(ptr_stale(ca, &ob->ptr));
 	}
@@ -1537,7 +1539,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
 
 	for (i = 0; i < wp->nr_ptrs_can_use; i++) {
 		struct open_bucket *ob = wp->ptrs[i];
-		struct bch_dev *ca = c->devs[ob->ptr.dev];
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 		struct bch_extent_ptr tmp = ob->ptr;
 
 		EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptr.dev));
@@ -1589,7 +1591,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
 			ra_pages += bdi->ra_pages;
 		}
 
-	c->bdi.ra_pages = ra_pages;
+	bch2_set_ra_pages(c, ra_pages);
 
 	/* Find fastest, slowest tiers with devices: */

diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index b679dd16..e25baf56 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -326,9 +326,9 @@ struct io_count {
 struct bch_dev {
 	struct kobject		kobj;
 	struct percpu_ref	ref;
+	struct completion	ref_completion;
 	struct percpu_ref	io_ref;
-	struct completion	stop_complete;
-	struct completion	offline_complete;
+	struct completion	io_ref_completion;
 
 	struct bch_fs		*fs;
 
@@ -515,12 +515,11 @@ struct bch_fs {
 	struct closure		sb_write;
 	struct mutex		sb_lock;
 
-	struct backing_dev_info bdi;
-
 	/* BTREE CACHE */
 	struct bio_set		btree_read_bio;
 
 	struct btree_root	btree_roots[BTREE_ID_NR];
+	bool			btree_roots_dirty;
 	struct mutex		btree_root_lock;
 
 	struct btree_cache	btree_cache;
@@ -710,6 +709,14 @@ struct bch_fs {
 #undef BCH_TIME_STAT
 };
 
+static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
+{
+#ifndef NO_BCACHEFS_FS
+	if (c->vfs_sb)
+		c->vfs_sb->s_bdi->ra_pages = ra_pages;
+#endif
+}
+
 static inline bool bch2_fs_running(struct bch_fs *c)
 {
 	return c->state == BCH_FS_RO || c->state == BCH_FS_RW;

diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 2dc9a7e0..6e0e0452 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -593,18 +593,24 @@ struct bch_inode_generation {
 } __attribute__((packed, aligned(8)));
 BKEY_VAL_TYPE(inode_generation,	BCH_INODE_GENERATION);
 
-#define BCH_INODE_FIELDS()			\
-	BCH_INODE_FIELD(bi_atime, 64)		\
-	BCH_INODE_FIELD(bi_ctime, 64)		\
-	BCH_INODE_FIELD(bi_mtime, 64)		\
-	BCH_INODE_FIELD(bi_otime, 64)		\
-	BCH_INODE_FIELD(bi_size, 64)		\
-	BCH_INODE_FIELD(bi_sectors, 64)		\
-	BCH_INODE_FIELD(bi_uid, 32)		\
-	BCH_INODE_FIELD(bi_gid, 32)		\
-	BCH_INODE_FIELD(bi_nlink, 32)		\
-	BCH_INODE_FIELD(bi_generation, 32)	\
-	BCH_INODE_FIELD(bi_dev, 32)
+#define BCH_INODE_FIELDS()			\
+	BCH_INODE_FIELD(bi_atime, 64)		\
+	BCH_INODE_FIELD(bi_ctime, 64)		\
+	BCH_INODE_FIELD(bi_mtime, 64)		\
+	BCH_INODE_FIELD(bi_otime, 64)		\
+	BCH_INODE_FIELD(bi_size, 64)		\
+	BCH_INODE_FIELD(bi_sectors, 64)		\
+	BCH_INODE_FIELD(bi_uid, 32)		\
+	BCH_INODE_FIELD(bi_gid, 32)		\
+	BCH_INODE_FIELD(bi_nlink, 32)		\
+	BCH_INODE_FIELD(bi_generation, 32)	\
+	BCH_INODE_FIELD(bi_dev, 32)		\
+	BCH_INODE_FIELD(bi_data_checksum, 8)	\
+	BCH_INODE_FIELD(bi_compression, 8)
+
+#define BCH_INODE_FIELDS_INHERIT()		\
+	BCH_INODE_FIELD(bi_data_checksum)	\
+	BCH_INODE_FIELD(bi_compression)
 
 enum {
 	/*
@@ -794,7 +800,7 @@ struct bch_sb_layout {
 	__u8			sb_max_size_bits; /* base 2 of 512 byte sectors */
 	__u8			nr_superblocks;
 	__u8			pad[5];
-	__u64			sb_offset[61];
+	__le64			sb_offset[61];
} __attribute__((packed, aligned(8)));
 
 #define BCH_SB_LAYOUT_SECTOR	7
@@ -1089,6 +1095,11 @@ struct jset_entry {
 	};
 };
 
+struct jset_entry_blacklist {
+	struct jset_entry	entry;
+	__le64			seq;
+};
+
 #define JSET_KEYS_U64s	(sizeof(struct jset_entry) / sizeof(__u64))
 
 enum {
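BCH_INODE_FIELDS() above is an x-macro: the inode's on-disk fields are listed once, and each consumer defines BCH_INODE_FIELD to generate its own code from the list (pack/unpack live in inode.c, which this patch does not show). A standalone sketch of the expansion pattern, for illustration:

/*
 * Sketch of how an x-macro consumer expands BCH_INODE_FIELDS(): here,
 * generating one struct member per listed field (u64 bi_atime; ...).
 */
struct bch_inode_unpacked_example {
#define BCH_INODE_FIELD(_name, _bits)	u##_bits _name;
	BCH_INODE_FIELDS()
#undef BCH_INODE_FIELD
};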
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index 73089a90..97015084 100644
--- a/libbcachefs/bkey.c
+++ b/libbcachefs/bkey.c
@@ -1,6 +1,7 @@
 
 #include "bcachefs.h"
 #include "bkey.h"
+#include "bkey_methods.h"
 #include "bset.h"
 #include "util.h"
 
@@ -80,37 +81,6 @@ static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
 					 const struct bkey_format *format) {}
 #endif
 
-int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
-{
-	char *out = buf, *end = buf + size;
-
-#define p(...)	(out += scnprintf(out, end - out, __VA_ARGS__))
-
-	p("u64s %u type %u %llu:%llu snap %u len %u ver %llu",
-	  k->u64s, k->type, k->p.inode, k->p.offset,
-	  k->p.snapshot, k->size, k->version.lo);
-
-	BUG_ON(bkey_packed(k));
-
-	switch (k->type) {
-	case KEY_TYPE_DELETED:
-		p(" deleted");
-		break;
-	case KEY_TYPE_DISCARD:
-		p(" discard");
-		break;
-	case KEY_TYPE_ERROR:
-		p(" error");
-		break;
-	case KEY_TYPE_COOKIE:
-		p(" cookie");
-		break;
-	}
-#undef p
-
-	return out - buf;
-}
-
 struct pack_state {
 	const struct bkey_format *format;
 	unsigned		bits;	/* bits remaining in current word */
@@ -336,7 +306,8 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
 	 * Extents - we have to guarantee that if an extent is packed, a trimmed
 	 * version will also pack:
 	 */
-	if (bkey_start_offset(in) < format->field_offset[BKEY_FIELD_OFFSET])
+	if (bkey_start_offset(in) <
+	    le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET]))
 		return false;
 
 	pack_state_finish(&state, out);
@@ -800,7 +771,7 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
 			      bool *eax_zeroed)
 {
 	unsigned bits = format->bits_per_field[field];
-	u64 offset = format->field_offset[field];
+	u64 offset = le64_to_cpu(format->field_offset[field]);
 	unsigned i, byte, bit_offset, align, shl, shr;
 
 	if (!bits && !offset) {

diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index dc0b88f7..89697956 100644
--- a/libbcachefs/bkey.h
+++ b/libbcachefs/bkey.h
@@ -8,7 +8,6 @@
 #include "vstructs.h"
 
 void bch2_to_binary(char *, const u64 *, unsigned);
-int bch2_bkey_to_text(char *, size_t, const struct bkey *);
 
 #define BKEY_PADDED(key)	__BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
 
@@ -377,7 +376,8 @@ static inline u64 bkey_field_max(const struct bkey_format *f,
 				 enum bch_bkey_fields nr)
 {
 	return f->bits_per_field[nr] < 64
-		? f->field_offset[nr] + ~(~0ULL << f->bits_per_field[nr])
+		? (le64_to_cpu(f->field_offset[nr]) +
+		   ~(~0ULL << f->bits_per_field[nr]))
 		: U64_MAX;
 }
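The bkey_methods.c rework below reuses the p() macro built on scnprintf(). The pattern is safe because scnprintf() returns the number of characters actually written (never more than size - 1, excluding the NUL), so repeated calls can walk the output pointer toward the end of the buffer without overflow. A self-contained sketch of the same idiom:

/*
 * Illustration of the bounded-append pattern used by the p() macro
 * below: each call advances `out` by what was actually written, and
 * `end - out` shrinks accordingly, so the buffer can never overrun.
 */
static int print_point(char *buf, size_t size, u64 inode, u64 offset)
{
	char *out = buf, *end = buf + size;

	out += scnprintf(out, end - out, "%llu", inode);
	out += scnprintf(out, end - out, ":%llu", offset);

	return out - buf;	/* bytes written */
}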
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 23894158..1736a483 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -18,28 +18,11 @@ const struct bkey_ops *bch2_bkey_ops[] = {
 	[BKEY_TYPE_BTREE]	= &bch2_bkey_btree_ops,
 };
 
-/* Returns string indicating reason for being invalid, or NULL if valid: */
-const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
-			      struct bkey_s_c k)
+const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
+				  struct bkey_s_c k)
 {
 	const struct bkey_ops *ops = bch2_bkey_ops[type];
 
-	if (k.k->u64s < BKEY_U64s)
-		return "u64s too small";
-
-	if (!ops->is_extents) {
-		if (k.k->size)
-			return "nonzero size field";
-	} else {
-		if ((k.k->size == 0) != bkey_deleted(k.k))
-			return "bad size field";
-	}
-
-	if (ops->is_extents &&
-	    !k.k->size &&
-	    !bkey_deleted(k.k))
-		return "zero size field";
-
 	switch (k.k->type) {
 	case KEY_TYPE_DELETED:
 	case KEY_TYPE_DISCARD:
@@ -63,8 +46,41 @@ const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
 	}
 }
 
-const char *bch2_btree_bkey_invalid(struct bch_fs *c, struct btree *b,
-				    struct bkey_s_c k)
+const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
+				struct bkey_s_c k)
+{
+	const struct bkey_ops *ops = bch2_bkey_ops[type];
+
+	if (k.k->u64s < BKEY_U64s)
+		return "u64s too small";
+
+	if (!ops->is_extents) {
+		if (k.k->size)
+			return "nonzero size field";
+	} else {
+		if ((k.k->size == 0) != bkey_deleted(k.k))
+			return "bad size field";
+	}
+
+	if (ops->is_extents &&
+	    !k.k->size &&
+	    !bkey_deleted(k.k))
+		return "zero size field";
+
+	if (k.k->p.snapshot)
+		return "nonzero snapshot";
+
+	return NULL;
+}
+
+const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
+			      struct bkey_s_c k)
+{
+	return __bch2_bkey_invalid(c, type, k) ?:
+		bch2_bkey_val_invalid(c, type, k);
+}
+
+const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
 {
 	if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
 		return "key before start of btree node";
@@ -72,10 +88,7 @@ const char *bch2_btree_bkey_invalid(struct bch_fs *c, struct btree *b,
 	if (bkey_cmp(k.k->p, b->data->max_key) > 0)
 		return "key past end of btree node";
 
-	if (k.k->p.snapshot)
-		return "nonzero snapshot";
-
-	return bch2_bkey_invalid(c, btree_node_type(b), k);
+	return NULL;
 }
 
 void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
@@ -86,7 +99,8 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
 
 	BUG_ON(!k.k->u64s);
 
-	invalid = bch2_btree_bkey_invalid(c, b, k);
+	invalid = bch2_bkey_invalid(c, type, k) ?:
+		bch2_bkey_in_btree_node(b, k);
 	if (invalid) {
 		char buf[160];
 
@@ -100,33 +114,62 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
 		ops->key_debugcheck(c, b, k);
 }
 
-char *bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
-		       char *buf, size_t size, struct bkey_s_c k)
+#define p(...)	(out += scnprintf(out, end - out, __VA_ARGS__))
+
+int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
 {
-	const struct bkey_ops *ops = bch2_bkey_ops[type];
+	char *out = buf, *end = buf + size;
 
-	if (k.k->type >= KEY_TYPE_GENERIC_NR &&
-	    ops->val_to_text)
-		ops->val_to_text(c, buf, size, k);
+	p("u64s %u type %u ", k->u64s, k->type);
 
-	return buf;
+	if (bkey_cmp(k->p, POS_MAX))
+		p("%llu:%llu", k->p.inode, k->p.offset);
+	else
+		p("POS_MAX");
+
+	p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo);
+
+	return out - buf;
 }
 
-char *bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
-			    char *buf, size_t size, struct bkey_s_c k)
+int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
+		     char *buf, size_t size, struct bkey_s_c k)
 {
 	const struct bkey_ops *ops = bch2_bkey_ops[type];
 	char *out = buf, *end = buf + size;
 
-	out += bch2_bkey_to_text(out, end - out, k.k);
-
-	if (k.k->type >= KEY_TYPE_GENERIC_NR &&
-	    ops->val_to_text) {
-		out += scnprintf(out, end - out, ": ");
-		ops->val_to_text(c, out, end - out, k);
+	switch (k.k->type) {
+	case KEY_TYPE_DELETED:
+		p(" deleted");
+		break;
+	case KEY_TYPE_DISCARD:
+		p(" discard");
+		break;
+	case KEY_TYPE_ERROR:
+		p(" error");
+		break;
+	case KEY_TYPE_COOKIE:
+		p(" cookie");
+		break;
+	default:
+		if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text)
+			ops->val_to_text(c, buf, size, k);
+		break;
 	}
 
-	return buf;
+	return out - buf;
+}
+
+int bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
+			  char *buf, size_t size, struct bkey_s_c k)
+{
+	char *out = buf, *end = buf + size;
+
+	out += bch2_bkey_to_text(out, end - out, k.k);
+	out += scnprintf(out, end - out, ": ");
+	out += bch2_val_to_text(c, type, out, end - out, k);
+
+	return out - buf;
 }
 
 void bch2_bkey_swab(enum bkey_type type,

diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h
index 29c1abd3..59db3037 100644
--- a/libbcachefs/bkey_methods.h
+++ b/libbcachefs/bkey_methods.h
@@ -64,15 +64,19 @@ struct bkey_ops {
 	bool		is_extents;
 };
 
+const char *bch2_bkey_val_invalid(struct bch_fs *, enum bkey_type,
+				  struct bkey_s_c);
+const char *__bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
 const char *bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
-const char *bch2_btree_bkey_invalid(struct bch_fs *, struct btree *,
-				    struct bkey_s_c);
+const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
 
 void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
-char *bch2_val_to_text(struct bch_fs *, enum bkey_type,
-		       char *, size_t, struct bkey_s_c);
-char *bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
-			    char *, size_t, struct bkey_s_c);
+
+int bch2_bkey_to_text(char *, size_t, const struct bkey *);
+int bch2_val_to_text(struct bch_fs *, enum bkey_type,
+		     char *, size_t, struct bkey_s_c);
+int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
+			  char *, size_t, struct bkey_s_c);
 
 void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
 		    struct bkey_packed *);
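The validity checks above are split so they compose: each checker returns a human-readable error string or NULL, and bch2_bkey_invalid() chains them with GCC's `a ?: b` extension (evaluate the left side, return it if non-NULL, else the right side). A standalone illustration of the idiom:

/* Illustration only: composing string-or-NULL validators with ?: */
static const char *check_nonzero(int v)
{
	return !v ? "value is zero" : NULL;
}

static const char *check_even(int v)
{
	return v & 1 ? "value is odd" : NULL;
}

static const char *check_value(int v)
{
	/* first failing check wins; NULL means valid: */
	return check_nonzero(v) ?: check_even(v);
}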
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 1198fe39..2294cc3a 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -96,7 +96,7 @@ u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
 		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
 
 		extent_for_each_ptr(e, ptr) {
-			struct bch_dev *ca = c->devs[ptr->dev];
+			struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
 			size_t b = PTR_BUCKET_NR(ca, ptr);
 
 			if (gen_after(ca->oldest_gens[b], ptr->gen))
@@ -159,14 +159,15 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
 		if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
 		    (!c->opts.nofsck &&
 		     fsck_err_on(!bch2_sb_has_replicas(c, e, data_type), c,
-				 "superblock not marked as containing replicas"))) {
+				 "superblock not marked as containing replicas (type %u)",
+				 data_type))) {
 			ret = bch2_check_mark_super(c, e, data_type);
 			if (ret)
 				return ret;
 		}
 
 		extent_for_each_ptr(e, ptr) {
-			struct bch_dev *ca = c->devs[ptr->dev];
+			struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
 			struct bucket *g = PTR_BUCKET(ca, ptr);
 
 			if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
@@ -315,14 +316,14 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
 	lockdep_assert_held(&c->sb_lock);
 
 	for (i = 0; i < layout->nr_superblocks; i++) {
-		if (layout->sb_offset[i] == BCH_SB_SECTOR)
+		u64 offset = le64_to_cpu(layout->sb_offset[i]);
+
+		if (offset == BCH_SB_SECTOR)
 			mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
 					      BUCKET_SB, flags);
 
-		mark_metadata_sectors(c, ca,
-				      layout->sb_offset[i],
-				      layout->sb_offset[i] +
-				      (1 << layout->sb_max_size_bits),
+		mark_metadata_sectors(c, ca, offset,
+				      offset + (1 << layout->sb_max_size_bits),
 				      BUCKET_SB, flags);
 	}
 
@@ -414,7 +415,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
 			spin_lock(&ob->lock);
 			if (ob->valid) {
 				gc_pos_set(c, gc_pos_alloc(c, ob));
-				ca = c->devs[ob->ptr.dev];
+				ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 				bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), true,
 						       gc_pos_alloc(c, ob),
 						       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
@@ -424,7 +425,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
 	}
 }
 
-void bch2_gc_start(struct bch_fs *c)
+static void bch2_gc_start(struct bch_fs *c)
 {
 	struct bch_dev *ca;
 	struct bucket *g;

diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 38c373c6..87a8ddf9 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -556,7 +556,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 	struct bset_tree *t;
 	struct bset *start_bset = bset(b, &b->set[start_idx]);
 	bool used_mempool = false;
-	u64 start_time;
+	u64 start_time, seq = 0;
 	unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
 	bool sorting_entire_node = start_idx == 0 &&
 		end_idx == b->nsets;
@@ -595,12 +595,9 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 	bch2_time_stats_update(&c->btree_sort_time, start_time);
 
 	/* Make sure we preserve bset journal_seq: */
-	for (t = b->set + start_idx + 1;
-	     t < b->set + end_idx;
-	     t++)
-		start_bset->journal_seq =
-			max(start_bset->journal_seq,
-			    bset(b, t)->journal_seq);
+	for (t = b->set + start_idx; t < b->set + end_idx; t++)
+		seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
+	start_bset->journal_seq = cpu_to_le64(seq);
 
 	if (sorting_entire_node) {
 		unsigned u64s = le16_to_cpu(out->keys.u64s);
@@ -958,6 +955,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
 {
 	struct bkey_packed *k, *prev = NULL;
 	struct bpos prev_pos = POS_MIN;
+	enum bkey_type type = btree_node_type(b);
 	bool seen_non_whiteout = false;
 	const char *err;
 	int ret = 0;
@@ -1025,7 +1023,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
 
 		if (!BSET_SEPARATE_WHITEOUTS(i)) {
 			seen_non_whiteout = true;
-			whiteout_u64s = 0;
+			*whiteout_u64s = 0;
 		}
 
 		for (k = i->start;
@@ -1059,16 +1057,17 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
 		}
 
 		if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
-			bch2_bkey_swab(btree_node_type(b), &b->format, k);
+			bch2_bkey_swab(type, &b->format, k);
 
 		u = bkey_disassemble(b, k, &tmp);
 
-		invalid = bch2_btree_bkey_invalid(c, b, u);
+		invalid = __bch2_bkey_invalid(c, type, u) ?:
+			bch2_bkey_in_btree_node(b, u) ?:
+			(write ? bch2_bkey_val_invalid(c, type, u) : NULL);
 		if (invalid) {
 			char buf[160];
 
-			bch2_bkey_val_to_text(c, btree_node_type(b),
-					      buf, sizeof(buf), u);
+			bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
 			btree_err(BTREE_ERR_FIXABLE, c, b, i,
 				  "invalid bkey %s: %s", buf, invalid);
 
@@ -1114,6 +1113,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
 	struct btree_node_entry *bne;
 	struct btree_node_iter *iter;
 	struct btree_node *sorted;
+	struct bkey_packed *k;
+	struct bset *i;
 	bool used_mempool;
 	unsigned u64s;
 	int ret, retry_read = 0, write = READ;
@@ -1137,7 +1138,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
 		unsigned sectors, whiteout_u64s = 0;
 		struct nonce nonce;
 		struct bch_csum csum;
-		struct bset *i;
 
 		if (!b->written) {
 			i = &b->data->keys;
@@ -1238,6 +1238,31 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
 
 	btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
 
+	i = &b->data->keys;
+	for (k = i->start; k != vstruct_last(i);) {
+		enum bkey_type type = btree_node_type(b);
+		struct bkey tmp;
+		struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
+		const char *invalid = bch2_bkey_val_invalid(c, type, u);
+
+		if (invalid) {
+			char buf[160];
+
+			bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
+			btree_err(BTREE_ERR_FIXABLE, c, b, i,
+				  "invalid bkey %s: %s", buf, invalid);
+
+			btree_keys_account_key_drop(&b->nr, 0, k);
+
+			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+			memmove_u64s_down(k, bkey_next(k),
+					  (u64 *) vstruct_end(i) - (u64 *) k);
+			continue;
+		}
+
+		k = bkey_next(k);
+	}
+
 	bch2_bset_build_aux_tree(b, b->set, false);
 
 	set_needs_whiteout(btree_bset_first(b));
@@ -1278,13 +1303,13 @@ static void btree_node_read_work(struct work_struct *work)
 		bio->bi_iter.bi_size	= btree_bytes(c);
 		submit_bio_wait(bio);
 start:
-		bch2_dev_io_err_on(bio->bi_error, rb->pick.ca, "btree read");
+		bch2_dev_io_err_on(bio->bi_status, rb->pick.ca, "btree read");
 		percpu_ref_put(&rb->pick.ca->io_ref);
 
 		__set_bit(rb->pick.ca->dev_idx, avoid.d);
 		rb->pick = bch2_btree_pick_ptr(c, b, &avoid);
 
-		if (!bio->bi_error &&
+		if (!bio->bi_status &&
 		    !bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca)))
 			goto out;
 	} while (!IS_ERR_OR_NULL(rb->pick.ca));
@@ -1377,17 +1402,24 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
 	BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
 
 	bch2_btree_node_read(c, b, true);
-	six_unlock_write(&b->lock);
 
 	if (btree_node_read_error(b)) {
-		six_unlock_intent(&b->lock);
-		return -EIO;
+		bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+		mutex_lock(&c->btree_cache.lock);
+		list_move(&b->list, &c->btree_cache.freeable);
+		mutex_unlock(&c->btree_cache.lock);
+
+		ret = -EIO;
+		goto err;
 	}
 
 	bch2_btree_set_root_for_read(c, b);
+err:
+	six_unlock_write(&b->lock);
 	six_unlock_intent(&b->lock);
 
-	return 0;
+	return ret;
 }
 
 void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
@@ -1412,35 +1444,57 @@ static void bch2_btree_node_write_error(struct bch_fs *c,
 	struct closure *cl = wbio->cl;
 	__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
 	struct bkey_i_extent *new_key;
+	struct bkey_s_extent e;
+	struct bch_extent_ptr *ptr;
+	struct btree_iter iter;
+	int ret;
 
-	six_lock_read(&b->lock);
-	bkey_copy(&tmp.k, &b->key);
-	six_unlock_read(&b->lock);
+	__bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
+			       BTREE_MAX_DEPTH,
+			       b->level, 0);
+retry:
+	ret = bch2_btree_iter_traverse(&iter);
+	if (ret)
+		goto err;
 
-	if (!bkey_extent_is_data(&tmp.k.k) || !PTR_HASH(&tmp.k)) {
-		/* Node has been freed: */
+	/* has node been freed? */
+	if (iter.nodes[b->level] != b) {
+		/* node has been freed: */
+		if (!btree_node_dying(b))
+			panic("foo4\n");
 		goto out;
 	}
 
+	if (!btree_node_hashed(b))
+		panic("foo5\n");
+
+	bkey_copy(&tmp.k, &b->key);
+
 	new_key = bkey_i_to_extent(&tmp.k);
+	e = extent_i_to_s(new_key);
+	extent_for_each_ptr_backwards(e, ptr)
+		if (bch2_dev_list_has_dev(wbio->failed, ptr->dev))
+			bch2_extent_drop_ptr(e, ptr);
 
-	while (wbio->replicas_failed) {
-		unsigned idx = __fls(wbio->replicas_failed);
+	if (!bch2_extent_nr_ptrs(e.c))
+		goto err;
 
-		bch2_extent_drop_ptr_idx(extent_i_to_s(new_key), idx);
-		wbio->replicas_failed ^= 1 << idx;
-	}
-
-	if (!bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)) ||
-	    bch2_btree_node_update_key(c, b, new_key)) {
-		set_btree_node_noevict(b);
-		bch2_fatal_error(c);
-	}
+	ret = bch2_btree_node_update_key(c, &iter, b, new_key);
+	if (ret == -EINTR)
+		goto retry;
+	if (ret)
+		goto err;
 out:
+	bch2_btree_iter_unlock(&iter);
 	bio_put(&wbio->bio);
 	btree_node_write_done(c, b);
 	if (cl)
 		closure_put(cl);
+	return;
+err:
+	set_btree_node_noevict(b);
+	bch2_fs_fatal_error(c, "fatal error writing btree node");
+	goto out;
 }
 
 void bch2_btree_write_error_work(struct work_struct *work)
@@ -1470,12 +1524,17 @@ static void btree_node_write_endio(struct bio *bio)
 	struct closure *cl	= !wbio->split ? wbio->cl : NULL;
 	struct bch_fs *c	= wbio->c;
 	struct bch_dev *ca	= wbio->ca;
+	unsigned long flags;
 
 	bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
 
-	if (bch2_dev_io_err_on(bio->bi_error, ca, "btree write") ||
-	    bch2_meta_write_fault("btree"))
-		set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);
+	if (bio->bi_status == BLK_STS_REMOVED ||
+	    bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
+	    bch2_meta_write_fault("btree")) {
+		spin_lock_irqsave(&c->btree_write_error_lock, flags);
+		bch2_dev_list_add_dev(&orig->failed, ca->dev_idx);
+		spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+	}
 
 	if (wbio->have_io_ref)
 		percpu_ref_put(&ca->io_ref);
@@ -1491,12 +1550,11 @@ static void btree_node_write_endio(struct bio *bio)
 				  wbio->used_mempool,
 				  wbio->data);
 
-	if (wbio->replicas_failed) {
-		unsigned long flags;
-
+	if (wbio->failed.nr) {
 		spin_lock_irqsave(&c->btree_write_error_lock, flags);
 		bio_list_add(&c->btree_write_error_list, &wbio->bio);
 		spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+
 		queue_work(c->wq, &c->btree_write_error_work);
 		return;
 	}
@@ -1707,6 +1765,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	wbio			= wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write));
 	wbio->cl		= parent;
+	wbio->failed.nr		= 0;
 	wbio->order		= order;
 	wbio->used_mempool	= used_mempool;
 	wbio->data		= data;
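The validate_bset() change above (`whiteout_u64s = 0` becoming `*whiteout_u64s = 0`) fixes a classic out-parameter bug: the old code nulled the local pointer instead of writing through it, so the caller never saw the cleared count. Distilled into a standalone illustration:

static void reset_count(unsigned *count)
{
	count = 0;	/* BUG: only the local pointer changes */
}

static void reset_count_fixed(unsigned *count)
{
	*count = 0;	/* writes through to the caller's variable */
}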
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index b0e64957..0b505a73 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -75,8 +75,8 @@ bool bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
 {
 	struct btree_iter *linked;
 	struct btree *b = iter->nodes[level];
-	enum btree_node_locked_type want = btree_lock_want(iter, level);
-	enum btree_node_locked_type have = btree_node_locked_type(iter, level);
+	int want = btree_lock_want(iter, level);
+	int have = btree_node_locked_type(iter, level);
 
 	if (want == have)
 		return true;
@@ -108,6 +108,17 @@ success:
 	return true;
 }
 
+bool bch2_btree_iter_relock(struct btree_iter *iter)
+{
+	unsigned l;
+
+	for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
+		if (!bch2_btree_node_relock(iter, l))
+			return false;
+
+	return true;
+}
+
 /* Slowpath: */
 bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
 			    unsigned level,
@@ -214,7 +225,6 @@ bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
 				      unsigned new_locks_want)
 {
 	struct btree_iter *linked;
-	unsigned l;
 
 	/* Drop locks we don't want anymore: */
 	if (new_locks_want < iter->locks_want)
@@ -228,12 +238,9 @@ bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
 	iter->locks_want = new_locks_want;
 	btree_iter_drop_extra_locks(iter);
 
-	for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
-		if (!bch2_btree_node_relock(iter, l))
-			goto fail;
+	if (bch2_btree_iter_relock(iter))
+		return true;
 
-	return true;
-fail:
 	/*
 	 * Just an optimization: ancestor nodes must be locked before child
 	 * nodes, so set locks_want on iterators that might lock ancestors

diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index c2711892..acfe5b59 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -75,7 +75,7 @@ static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
 	mark_btree_node_locked(iter, level, SIX_LOCK_intent);
 }
 
-static inline int btree_lock_want(struct btree_iter *iter, int level)
+static inline enum six_lock_type btree_lock_want(struct btree_iter *iter, int level)
 {
 	return level < iter->locks_want
 		? SIX_LOCK_intent
@@ -111,6 +111,7 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos,
 }
 
 bool bch2_btree_node_relock(struct btree_iter *, unsigned);
+bool bch2_btree_iter_relock(struct btree_iter *);
 
 void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
 void bch2_btree_node_lock_write(struct btree *, struct btree_iter *);

diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index f1e06a37..f0e6896a 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -196,6 +196,7 @@ enum btree_flags {
 	BTREE_NODE_accessed,
 	BTREE_NODE_write_in_flight,
 	BTREE_NODE_just_written,
+	BTREE_NODE_dying,
 };
 
 BTREE_FLAG(read_in_flight);
@@ -207,6 +208,7 @@ BTREE_FLAG(write_idx);
 BTREE_FLAG(accessed);
 BTREE_FLAG(write_in_flight);
 BTREE_FLAG(just_written);
+BTREE_FLAG(dying);
 
 static inline struct btree_write *btree_current_write(struct btree *b)
 {

diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index e11fcec9..c7c29306 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -130,7 +130,7 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
 
 int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
 			    __le64, unsigned);
-int bch2_btree_node_update_key(struct bch_fs *, struct btree *,
-			       struct bkey_i_extent *);
+int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
+			       struct btree *, struct bkey_i_extent *);
 
 #endif /* _BCACHEFS_BTREE_UPDATE_H */
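bch2_btree_iter_relock() above exists to support a drop-locks/block/relock discipline, used by the reworked bch2_btree_node_update_key() in the next diff: node locks cannot be held across a blocking acquire, so they are dropped, the blocking operation completes, and the iterator either relocks everything or the whole operation restarts with -EINTR. A hedged sketch of that calling convention (illustrative, not code from this patch):

static int do_blocking_op(struct bch_fs *c, struct btree_iter *iter)
{
	if (!down_read_trylock(&c->gc_lock)) {
		bch2_btree_iter_unlock(iter);	/* can't block holding node locks */
		down_read(&c->gc_lock);

		if (!bch2_btree_iter_relock(iter)) {
			up_read(&c->gc_lock);
			return -EINTR;		/* caller restarts traversal */
		}
	}
	/* ... work under gc_lock, with btree locks reacquired ... */
	up_read(&c->gc_lock);
	return 0;
}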
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 1fe8fff8..04854532 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -21,7 +21,7 @@ static void btree_node_will_make_reachable(struct btree_update *,
 					   struct btree *);
 static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
-static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *);
+static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
 
 /* Debug code: */
 
@@ -686,7 +686,7 @@ retry:
 		BUG_ON(c->btree_roots[b->btree_id].as != as);
 		c->btree_roots[b->btree_id].as = NULL;
 
-		bch2_btree_set_root_ondisk(c, b);
+		bch2_btree_set_root_ondisk(c, b, WRITE);
 
 		/*
 		 * We don't have to wait anything anything here (before
@@ -914,6 +914,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
 	struct btree_write *w;
 	struct bset_tree *t;
 
+	set_btree_node_dying(b);
 	btree_interior_update_add_node_reference(as, b);
 
 	/*
@@ -925,7 +926,8 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
 	 * in with keys that aren't in the journal anymore:
 	 */
 	for_each_bset(b, t)
-		as->journal_seq = max(as->journal_seq, bset(b, t)->journal_seq);
+		as->journal_seq = max(as->journal_seq,
+				      le64_to_cpu(bset(b, t)->journal_seq));
 
 	mutex_lock(&c->btree_interior_update_lock);
 
@@ -1027,6 +1029,10 @@ static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
 	mutex_unlock(&c->btree_cache.lock);
 
 	mutex_lock(&c->btree_root_lock);
+	BUG_ON(btree_node_root(c, b) &&
+	       (b->level < btree_node_root(c, b)->level ||
+		!btree_node_dying(btree_node_root(c, b))));
+
 	btree_node_root(c, b) = b;
 	mutex_unlock(&c->btree_root_lock);
 
@@ -1054,7 +1060,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
 			      gc_pos_btree_root(b->btree_id));
 }
 
-static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b)
+static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
 {
 	struct btree_root *r = &c->btree_roots[b->btree_id];
 
@@ -1064,6 +1070,8 @@ static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b)
 	bkey_copy(&r->key, &b->key);
 	r->level = b->level;
 	r->alive = true;
+	if (rw == WRITE)
+		c->btree_roots_dirty = true;
 
 	mutex_unlock(&c->btree_root_lock);
 }
@@ -1787,64 +1795,16 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
 	return ret;
 }
 
-int bch2_btree_node_update_key(struct bch_fs *c, struct btree *b,
-			       struct bkey_i_extent *new_key)
+static void __bch2_btree_node_update_key(struct bch_fs *c,
+					 struct btree_update *as,
+					 struct btree_iter *iter,
+					 struct btree *b, struct btree *new_hash,
+					 struct bkey_i_extent *new_key)
 {
-	struct btree_update *as = NULL;
-	struct btree *parent, *new_hash = NULL;
-	struct btree_iter iter;
-	struct closure cl;
+	struct btree *parent;
 	bool must_rewrite_parent = false;
 	int ret;
 
-	__bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
-			       BTREE_MAX_DEPTH,
-			       b->level, 0);
-	closure_init_stack(&cl);
-
-	ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
-	if (ret)
-		return ret;
-
-retry:
-	down_read(&c->gc_lock);
-	ret = bch2_btree_iter_traverse(&iter);
-	if (ret)
-		goto err;
-
-	/* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
-	if (!new_hash &&
-	    PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
-		/* bch2_btree_reserve_get will unlock */
-		do {
-			ret = bch2_btree_cache_cannibalize_lock(c, &cl);
-			closure_sync(&cl);
-		} while (ret == -EAGAIN);
-
-		BUG_ON(ret);
-
-		new_hash = bch2_btree_node_mem_alloc(c);
-	}
-
-	as = bch2_btree_update_start(c, iter.btree_id,
-				     btree_update_reserve_required(c, b),
-				     BTREE_INSERT_NOFAIL|
-				     BTREE_INSERT_USE_RESERVE|
-				     BTREE_INSERT_USE_ALLOC_RESERVE,
-				     &cl);
-	if (IS_ERR(as)) {
-		ret = PTR_ERR(as);
-		if (ret == -EAGAIN || ret == -EINTR) {
-			bch2_btree_iter_unlock(&iter);
-			up_read(&c->gc_lock);
-			closure_sync(&cl);
-			goto retry;
-		}
-		goto err;
-	}
-
-	mutex_lock(&c->btree_interior_update_lock);
-
 	/*
 	 * Two corner cases that need to be thought about here:
 	 *
@@ -1869,22 +1829,12 @@ retry:
 	if (b->will_make_reachable)
 		must_rewrite_parent = true;
 
-	/* other case: btree node being freed */
-	if (iter.nodes[b->level] != b) {
-		/* node has been freed: */
-		BUG_ON(btree_node_hashed(b));
-		mutex_unlock(&c->btree_interior_update_lock);
-		goto err;
-	}
-
-	mutex_unlock(&c->btree_interior_update_lock);
-
 	if (must_rewrite_parent)
 		as->flags |= BTREE_INTERIOR_UPDATE_MUST_REWRITE;
 
 	btree_interior_update_add_node_reference(as, b);
 
-	parent = iter.nodes[b->level + 1];
+	parent = iter->nodes[b->level + 1];
 	if (parent) {
 		if (new_hash) {
 			bkey_copy(&new_hash->key, &new_key->k_i);
@@ -1893,8 +1843,8 @@ retry:
 			BUG_ON(ret);
 		}
 
-		bch2_btree_insert_node(as, parent, &iter,
-				       &keylist_single(&new_key->k_i));
+		bch2_keylist_add(&as->parent_keys, &new_key->k_i);
+		bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
 
 		if (new_hash) {
 			mutex_lock(&c->btree_cache.lock);
@@ -1914,7 +1864,7 @@ retry:
 
 		BUG_ON(btree_node_root(c, b) != b);
 
-		bch2_btree_node_lock_write(b, &iter);
+		bch2_btree_node_lock_write(b, iter);
 
 		bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i),
 			      c->opts.btree_node_size, true,
@@ -1925,14 +1875,94 @@ retry:
 			      &stats);
 		bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
 				    gc_pos_btree_root(b->btree_id));
-		bkey_copy(&b->key, &new_key->k_i);
+
+		if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+			mutex_lock(&c->btree_cache.lock);
+			bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+			bkey_copy(&b->key, &new_key->k_i);
+			ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+			BUG_ON(ret);
+			mutex_unlock(&c->btree_cache.lock);
+		} else {
+			bkey_copy(&b->key, &new_key->k_i);
+		}
 
 		btree_update_updated_root(as);
-		bch2_btree_node_unlock_write(b, &iter);
+		bch2_btree_node_unlock_write(b, iter);
 	}
 
 	bch2_btree_update_done(as);
-out:
+}
+
+int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
+			       struct btree *b, struct bkey_i_extent *new_key)
+{
+	struct btree_update *as = NULL;
+	struct btree *new_hash = NULL;
+	struct closure cl;
+	int ret;
+
+	closure_init_stack(&cl);
+
+	if (!down_read_trylock(&c->gc_lock)) {
+		bch2_btree_iter_unlock(iter);
+		down_read(&c->gc_lock);
+
+		if (!bch2_btree_iter_relock(iter)) {
+			ret = -EINTR;
+			goto err;
+		}
+	}
+
+	/* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
+	if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+		/* bch2_btree_reserve_get will unlock */
+		ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+		if (ret) {
+			ret = -EINTR;
+
+			bch2_btree_iter_unlock(iter);
+			up_read(&c->gc_lock);
+			closure_sync(&cl);
+			down_read(&c->gc_lock);
+
+			if (!bch2_btree_iter_relock(iter))
+				goto err;
+		}
+
+		new_hash = bch2_btree_node_mem_alloc(c);
+	}
+
+	as = bch2_btree_update_start(c, iter->btree_id,
+				     btree_update_reserve_required(c, b),
+				     BTREE_INSERT_NOFAIL|
+				     BTREE_INSERT_USE_RESERVE|
+				     BTREE_INSERT_USE_ALLOC_RESERVE,
+				     &cl);
+	if (IS_ERR(as)) {
+		ret = PTR_ERR(as);
+		if (ret == -EAGAIN)
+			ret = -EINTR;
+
+		if (ret != -EINTR)
+			goto err;
+
+		bch2_btree_iter_unlock(iter);
+		up_read(&c->gc_lock);
+		closure_sync(&cl);
+		down_read(&c->gc_lock);
+
+		if (!bch2_btree_iter_relock(iter))
+			goto err;
+	}
+
+	ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
+	if (ret)
+		goto err_free_update;
+
+	__bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
+err:
 	if (new_hash) {
 		mutex_lock(&c->btree_cache.lock);
 		list_move(&new_hash->list, &c->btree_cache.freeable);
@@ -1941,14 +1971,12 @@ out:
 		six_unlock_write(&new_hash->lock);
 		six_unlock_intent(&new_hash->lock);
 	}
-	bch2_btree_iter_unlock(&iter);
 	up_read(&c->gc_lock);
 	closure_sync(&cl);
 	return ret;
-err:
-	if (as)
-		bch2_btree_update_free(as);
-	goto out;
+err_free_update:
+	bch2_btree_update_free(as);
+	goto err;
 }
 
 /* Init code: */
@@ -1962,7 +1990,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
 	BUG_ON(btree_node_root(c, b));
 
 	__bch2_btree_set_root_inmem(c, b);
-	bch2_btree_set_root_ondisk(c, b);
+	bch2_btree_set_root_ondisk(c, b, READ);
 }
 
 int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
@@ -1998,7 +2026,7 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
 	BUG_ON(btree_node_root(c, b));
 
 	bch2_btree_set_root_inmem(as, b);
-	bch2_btree_set_root_ondisk(c, b);
+	bch2_btree_set_root_ondisk(c, b, WRITE);
 
 	bch2_btree_open_bucket_put(c, b);
 	six_unlock_intent(&b->lock);

diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index b73002de..f0a63232 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -174,9 +174,11 @@ do {						\
 
 #define bch2_usage_read_raw(_stats)					\
 ({									\
-	typeof(*this_cpu_ptr(_stats)) _acc = { 0 };			\
+	typeof(*this_cpu_ptr(_stats)) _acc;				\
 	int cpu;							\
 									\
+	memset(&_acc, 0, sizeof(_acc));					\
+									\
 	for_each_possible_cpu(cpu)					\
 		bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu));	\
 									\
@@ -479,7 +481,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
 {
 	struct bucket_mark old, new;
 	unsigned saturated;
-	struct bch_dev *ca = c->devs[ptr->dev];
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
 	struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr);
 	unsigned data_type = type == S_META
 		? BUCKET_BTREE : BUCKET_DATA;

diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index 0bd8d2d8..6f9b1226 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -68,16 +68,14 @@ struct bch_dev_usage {
 struct bch_fs_usage {
 	/* all fields are in units of 512 byte sectors: */
 
-	/* _uncompressed_ sectors: */
+	u64			online_reserved;
+	u64			available_cache;
 
 	struct {
 		u64		data[S_ALLOC_NR];
 		u64		persistent_reserved;
 	}			s[BCH_REPLICAS_MAX];
-
-	u64			online_reserved;
-	u64			available_cache;
 };
 
 /*

diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index d9a3212c..24af2ca1 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -2,6 +2,7 @@
 
 #include "bcachefs.h"
 #include "bcachefs_ioctl.h"
+#include "chardev.h"
 #include "super.h"
 #include "super-io.h"
 
@@ -25,7 +26,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
 			return ERR_PTR(-EINVAL);
 
 		rcu_read_lock();
-		ca = c->devs[dev];
+		ca = rcu_dereference(c->devs[dev]);
 		if (ca)
 			percpu_ref_get(&ca->ref);
 		rcu_read_unlock();
@@ -80,7 +81,7 @@ static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
 
 	devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
 
-	if (copy_from_user(user_devs, arg.devs,
+	if (copy_from_user(user_devs, user_arg->devs,
 			   sizeof(u64) * arg.nr_devs))
 		goto err;
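The chardev.c hunk above completes the RCU lookup idiom: the c->devs[] slot must be read with rcu_dereference() inside the read-side critical section, and the object pinned with a reference before rcu_read_unlock(). Sketch of the full lookup-and-pin pattern, for illustration:

static struct bch_dev *dev_lookup_and_pin(struct bch_fs *c, unsigned dev)
{
	struct bch_dev *ca;

	rcu_read_lock();
	ca = rcu_dereference(c->devs[dev]);	/* paired with rcu_assign_pointer() */
	if (ca)
		percpu_ref_get(&ca->ref);	/* keeps *ca alive past unlock */
	rcu_read_unlock();

	return ca;
}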
diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h
index 1a089417..b0c8a50e 100644
--- a/libbcachefs/checksum.h
+++ b/libbcachefs/checksum.h
@@ -72,14 +72,15 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
 	}
 }
 
-static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c)
+static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
+							 unsigned opt)
 {
 	if (c->sb.encryption_type)
 		return c->opts.wide_macs
 			? BCH_CSUM_CHACHA20_POLY1305_128
 			: BCH_CSUM_CHACHA20_POLY1305_80;
 
-	return bch2_csum_opt_to_type(c->opts.data_checksum, true);
+	return bch2_csum_opt_to_type(opt, true);
 }
 
 static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
@@ -143,6 +144,14 @@ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
 	return nonce;
 }
 
+static inline struct nonce null_nonce(void)
+{
+	struct nonce ret;
+
+	memset(&ret, 0, sizeof(ret));
+	return ret;
+}
+
 static inline struct nonce extent_nonce(struct bversion version,
 					struct bch_extent_crc_unpacked crc)
 {

diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index 8357c8de..ca2a06e2 100644
--- a/libbcachefs/error.c
+++ b/libbcachefs/error.c
@@ -95,11 +95,17 @@ print:
 	vscnprintf(buf, sizeof(_buf), fmt, args);
 	va_end(args);
 
+	if (c->opts.fix_errors == FSCK_OPT_EXIT) {
+		bch_err(c, "%s, exiting", buf);
+		mutex_unlock(&c->fsck_error_lock);
+		return FSCK_ERR_EXIT;
+	}
+
 	if (flags & FSCK_CAN_FIX) {
-		if (c->opts.fix_errors == FSCK_ERR_ASK) {
+		if (c->opts.fix_errors == FSCK_OPT_ASK) {
 			printk(KERN_ERR "%s: fix?", buf);
 			fix = ask_yn();
-		} else if (c->opts.fix_errors == FSCK_ERR_YES ||
+		} else if (c->opts.fix_errors == FSCK_OPT_YES ||
 			   (c->opts.nochanges &&
 			    !(flags & FSCK_CAN_IGNORE))) {
 			if (print)

diff --git a/libbcachefs/error.h b/libbcachefs/error.h
index 68635eee..28fe4fce 100644
--- a/libbcachefs/error.h
+++ b/libbcachefs/error.h
@@ -96,9 +96,10 @@ enum {
 };
 
 enum fsck_err_opts {
-	FSCK_ERR_NO,
-	FSCK_ERR_YES,
-	FSCK_ERR_ASK,
+	FSCK_OPT_EXIT,
+	FSCK_OPT_YES,
+	FSCK_OPT_NO,
+	FSCK_OPT_ASK,
 };
 
 enum fsck_err_ret {
@@ -217,7 +218,7 @@ do {								\
 #define bcache_io_error(c, bio, fmt, ...)			\
 do {								\
 	__bcache_io_error(c, fmt, ##__VA_ARGS__);		\
-	(bio)->bi_error = -EIO;					\
+	(bio)->bi_status = BLK_STS_IOERR;			\
 } while (0)
 
 #endif /* _BCACHEFS_ERROR_H */
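bch2_data_checksum_type() now takes the checksum option as a parameter instead of reading c->opts directly, which is what lets the new per-inode bi_data_checksum field (added in bcachefs_format.h above) override the filesystem-wide default. A sketch of the intended call pattern; the zero-means-unset, stored-plus-one convention here is an assumption for illustration, not something this patch shows:

static enum bch_csum_type inode_csum_type(struct bch_fs *c,
					  struct bch_inode_unpacked *bi)
{
	/* assumed convention: 0 = unset, otherwise option value + 1 */
	unsigned opt = bi->bi_data_checksum
		? bi->bi_data_checksum - 1
		: c->opts.data_checksum;

	return bch2_data_checksum_type(c, opt);
}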
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 6e79f491..176978ca 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -18,6 +18,7 @@
 #include "extents.h"
 #include "inode.h"
 #include "journal.h"
+#include "super.h"
 #include "super-io.h"
 #include "util.h"
 #include "xattr.h"
@@ -156,6 +157,19 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
 	return nr_ptrs;
 }
 
+unsigned bch2_extent_nr_good_ptrs(struct bch_fs *c, struct bkey_s_c_extent e)
+{
+	const struct bch_extent_ptr *ptr;
+	unsigned nr_ptrs = 0;
+
+	extent_for_each_ptr(e, ptr)
+		nr_ptrs += (!ptr->cached &&
+			    bch_dev_bkey_exists(c, ptr->dev)->mi.state !=
+			    BCH_MEMBER_STATE_FAILED);
+
+	return nr_ptrs;
+}
+
 unsigned bch2_extent_is_compressed(struct bkey_s_c k)
 {
 	struct bkey_s_c_extent e;
@@ -362,7 +376,7 @@ static bool should_drop_ptr(const struct bch_fs *c,
 			    struct bkey_s_c_extent e,
 			    const struct bch_extent_ptr *ptr)
 {
-	return ptr->cached && ptr_stale(c->devs[ptr->dev], ptr);
+	return ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr);
 }
 
 static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
@@ -411,8 +425,10 @@ static void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
 			entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
 			break;
 		case BCH_EXTENT_ENTRY_crc128:
-			entry->crc128.csum.hi = swab64(entry->crc64.csum_hi);
-			entry->crc128.csum.lo = swab64(entry->crc64.csum_lo);
+			entry->crc128.csum.hi = (__force __le64)
+				swab64((__force u64) entry->crc128.csum.hi);
+			entry->crc128.csum.lo = (__force __le64)
+				swab64((__force u64) entry->crc128.csum.lo);
 			break;
 		case BCH_EXTENT_ENTRY_ptr:
 			break;
@@ -432,10 +448,11 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
 	const struct bch_extent_ptr *ptr2;
 	struct bch_dev *ca;
 
-	if (ptr->dev >= c->sb.nr_devices)
+	if (ptr->dev >= c->sb.nr_devices ||
+	    !c->devs[ptr->dev])
 		return "pointer to invalid device";
 
-	ca = c->devs[ptr->dev];
+	ca = bch_dev_bkey_exists(c, ptr->dev);
 	if (!ca)
 		return "pointer to invalid device";
 
@@ -487,7 +504,9 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
 			break;
 		case BCH_EXTENT_ENTRY_ptr:
 			ptr = entry_to_ptr(entry);
-			ca = c->devs[ptr->dev];
+			ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+				? bch_dev_bkey_exists(c, ptr->dev)
+				: NULL;
 
 			p("ptr: %u:%llu gen %u%s", ptr->dev,
 			  (u64) ptr->offset, ptr->gen,
@@ -528,7 +547,7 @@ static void extent_pick_read_device(struct bch_fs *c,
 	struct bch_extent_crc_unpacked crc;
 
 	extent_for_each_ptr_crc(e, ptr, crc) {
-		struct bch_dev *ca = c->devs[ptr->dev];
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
 
 		if (ptr->cached && ptr_stale(ca, ptr))
 			continue;
@@ -621,7 +640,7 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
 	bool bad;
 
 	extent_for_each_ptr(e, ptr) {
-		ca = c->devs[ptr->dev];
+		ca = bch_dev_bkey_exists(c, ptr->dev);
 		g = PTR_BUCKET(ca, ptr);
 		replicas++;
 
@@ -1730,7 +1749,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
 	memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier));
 
 	extent_for_each_ptr(e, ptr) {
-		ca = c->devs[ptr->dev];
+		ca = bch_dev_bkey_exists(c, ptr->dev);
 		g = PTR_BUCKET(ca, ptr);
 		replicas++;
 		ptrs_per_tier[ca->mi.tier]++;
@@ -1844,7 +1863,7 @@ static void bch2_extent_to_text(struct bch_fs *c, char *buf,
 
 static unsigned PTR_TIER(struct bch_fs *c,
 			 const struct bch_extent_ptr *ptr)
 {
-	return c->devs[ptr->dev]->mi.tier;
+	return bch_dev_bkey_exists(c, ptr->dev)->mi.tier;
 }
 
 static void bch2_extent_crc_init(union bch_extent_crc *crc,
@@ -1971,14 +1990,10 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
 				      struct bkey_s_extent e)
 {
 	struct bch_extent_ptr *ptr;
-	unsigned tier = 0, nr_cached = 0, nr_good = 0;
+	unsigned tier = 0, nr_cached = 0;
+	unsigned nr_good = bch2_extent_nr_good_ptrs(c, e.c);
 	bool have_higher_tier;
 
-	extent_for_each_ptr(e, ptr)
-		if (!ptr->cached &&
-		    c->devs[ptr->dev]->mi.state != BCH_MEMBER_STATE_FAILED)
-			nr_good++;
-
 	if (nr_good <= c->opts.data_replicas)
 		return;
 
@@ -2103,7 +2118,7 @@ static enum merge_result bch2_extent_merge(struct bch_fs *c,
 			return BCH_MERGE_NOMERGE;
 
 		/* We don't allow extents to straddle buckets: */
-		ca = c->devs[lp->dev];
+		ca = bch_dev_bkey_exists(c, lp->dev);
 
 		if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
 			return BCH_MERGE_NOMERGE;
@@ -2347,6 +2362,30 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
 	}
 }
 
+int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
+{
+	struct btree_iter iter;
+	struct bpos end = pos;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	end.offset += size;
+
+	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
+			   BTREE_ITER_WITH_HOLES, k) {
+		if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
+			break;
+
+		if (!bch2_extent_is_fully_allocated(k)) {
+			ret = -ENOSPC;
+			break;
+		}
+	}
+	bch2_btree_iter_unlock(&iter);
+
+	return ret;
+}
+
 const struct bkey_ops bch2_bkey_extent_ops = {
 	.key_invalid	= bch2_extent_invalid,
 	.key_debugcheck	= bch2_extent_debugcheck,
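bch2_check_range_allocated() above iterates extents with holes visible and returns -ENOSPC as soon as it finds anything not fully allocated. The likely consumer is the O_DIRECT write path in fs-io.c (reworked later in this patch but not fully shown here), which can skip taking a disk reservation when overwriting fully-allocated space. A hedged usage sketch; this exact caller is an illustration, not code from the patch:

static bool can_skip_reservation(struct bch_fs *c, u64 inum,
				 u64 start_sector, u64 nr_sectors)
{
	/* 0 means the whole range is already backed by allocated extents: */
	return !bch2_check_range_allocated(c, POS(inum, start_sector),
					   nr_sectors);
}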
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 1ec2db5e..ab7993ab 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -45,6 +45,7 @@ bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
 
 unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
 unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
+unsigned bch2_extent_nr_good_ptrs(struct bch_fs *, struct bkey_s_c_extent);
 unsigned bch2_extent_is_compressed(struct bkey_s_c);
 
 bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
@@ -243,14 +244,14 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
 	case BCH_EXTENT_CRC32:
 		return (struct bch_extent_crc_unpacked) {
 			common_fields(crc->crc32),
-			.csum.lo		= crc->crc32.csum,
+			.csum.lo		= (__force __le64) crc->crc32.csum,
 		};
 	case BCH_EXTENT_CRC64:
 		return (struct bch_extent_crc_unpacked) {
 			common_fields(crc->crc64),
 			.nonce			= crc->crc64.nonce,
-			.csum.lo		= crc->crc64.csum_lo,
-			.csum.hi		= crc->crc64.csum_hi,
+			.csum.lo		= (__force __le64) crc->crc64.csum_lo,
+			.csum.hi		= (__force __le64) crc->crc64.csum_hi,
 		};
 	case BCH_EXTENT_CRC128:
 		return (struct bch_extent_crc_unpacked) {
@@ -425,4 +426,6 @@ bool bch2_cut_front(struct bpos, struct bkey_i *);
 bool bch2_cut_back(struct bpos, struct bkey *);
 void bch2_key_resize(struct bkey *, unsigned);
 
+int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);
+
 #endif /* _BCACHEFS_EXTENTS_H */
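The fs-io.c rework below replaces ad-hoc i_size/i_sectors dirty counters with an explicit start/finish pair around multi-key operations, built on the new i_sectors_hook helpers. A sketch of the intended lifecycle, assembled from the helpers this patch defines (error paths elided; illustrative only):

static int truncate_sketch(struct bch_fs *c, struct bch_inode_info *inode,
			   u64 new_i_size)
{
	struct i_sectors_hook i_sectors_hook =
		i_sectors_hook_init(inode, BCH_INODE_I_SIZE_DIRTY);
	int ret;

	i_sectors_hook.new_i_size = new_i_size;

	/* persist the dirty flags before touching extents: */
	ret = i_sectors_dirty_start(c, &i_sectors_hook);
	if (ret)
		return ret;

	/* ... delete/insert extents; the hook accumulates ->sectors ... */

	/* write back the accumulated sector count, clear dirty flags: */
	return i_sectors_dirty_finish(c, &i_sectors_hook);
}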
BCH_INODE_I_SECTORS_DIRTY)); h->sectors += sectors * sign; return BTREE_INSERT_OK; } -static int inode_set_i_sectors_dirty(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, void *p) +static int i_sectors_dirty_finish_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) { - BUG_ON(bi->bi_flags & BCH_INODE_I_SECTORS_DIRTY); + struct i_sectors_hook *h = p; - bi->bi_flags |= BCH_INODE_I_SECTORS_DIRTY; + if (h->new_i_size != U64_MAX && + (!h->appending || + h->new_i_size > bi->bi_size)) + bi->bi_size = h->new_i_size; + bi->bi_sectors += h->sectors; + bi->bi_flags &= ~h->flags; return 0; } -static int inode_clear_i_sectors_dirty(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) +static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h) { - BUG_ON(!(bi->bi_flags & BCH_INODE_I_SECTORS_DIRTY)); + int ret; - bi->bi_sectors = atomic64_read(&inode->ei_sectors); - bi->bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; - return 0; -} + mutex_lock(&h->inode->ei_update_lock); + if (h->new_i_size != U64_MAX) + i_size_write(&h->inode->v, h->new_i_size); -static void i_sectors_dirty_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct i_sectors_hook *h) -{ - if (h->sectors) { - spin_lock(&inode->v.i_lock); - inode->v.i_blocks += h->sectors; - spin_unlock(&inode->v.i_lock); + __i_sectors_acct(c, h->inode, h->sectors); - atomic64_add(h->sectors, &inode->ei_sectors); - EBUG_ON(atomic64_read(&inode->ei_sectors) < 0); - } + ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h); + mutex_unlock(&h->inode->ei_update_lock); - EBUG_ON(atomic_long_read(&inode->ei_sectors_dirty_count) <= 0); - - mutex_lock(&inode->ei_update_lock); - - if (atomic_long_dec_and_test(&inode->ei_sectors_dirty_count)) { - int ret = __bch2_write_inode(c, inode, - inode_clear_i_sectors_dirty, NULL); - - ret = ret; - } - - mutex_unlock(&inode->ei_update_lock); -} - -static int __must_check i_sectors_dirty_get(struct bch_fs *c, - struct bch_inode_info *inode, - struct i_sectors_hook *h) -{ - int ret = 0; - - h->hook.fn = i_sectors_hook_fn; - h->sectors = 0; -#ifdef CONFIG_BCACHEFS_DEBUG - h->inode = inode; -#endif - - if (atomic_long_inc_not_zero(&inode->ei_sectors_dirty_count)) - return 0; - - mutex_lock(&inode->ei_update_lock); - - if (!(inode->ei_flags & BCH_INODE_I_SECTORS_DIRTY)) - ret = __bch2_write_inode(c, inode, inode_set_i_sectors_dirty, - NULL); - - if (!ret) - atomic_long_inc(&inode->ei_sectors_dirty_count); - - mutex_unlock(&inode->ei_update_lock); + h->sectors = 0; return ret; } +static int i_sectors_dirty_start_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, void *p) +{ + struct i_sectors_hook *h = p; + + if (h->flags & BCH_INODE_I_SIZE_DIRTY) + bi->bi_size = h->new_i_size; + + bi->bi_flags |= h->flags; + return 0; +} + +static int i_sectors_dirty_start(struct bch_fs *c, struct i_sectors_hook *h) +{ + int ret; + + mutex_lock(&h->inode->ei_update_lock); + ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h); + mutex_unlock(&h->inode->ei_update_lock); + + return ret; +} + +static inline struct i_sectors_hook +i_sectors_hook_init(struct bch_inode_info *inode, unsigned flags) +{ + return (struct i_sectors_hook) { + .hook.fn = i_sectors_hook_fn, + .inode = inode, + .sectors = 0, + .new_i_size = U64_MAX, + .flags = flags|BCH_INODE_I_SECTORS_DIRTY, + }; +} + +/* normal i_size/i_sectors update machinery: */ + struct bchfs_extent_trans_hook { struct bchfs_write_op *op; struct extent_insert_hook hook; @@ -289,18 
+264,18 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook, BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE)); /* XXX: inode->i_size locking */ - if (offset > inode->ei_size) { - BUG_ON(inode->ei_flags & BCH_INODE_I_SIZE_DIRTY); - + if (offset > inode->ei_inode.bi_size) { if (!h->need_inode_update) { h->need_inode_update = true; return BTREE_INSERT_NEED_TRAVERSE; } + BUG_ON(h->inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY); + h->inode_u.bi_size = offset; do_pack = true; - inode->ei_size = offset; + inode->ei_inode.bi_size = offset; if (h->op->is_dio) i_size_write(&inode->v, offset); @@ -315,15 +290,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook, h->inode_u.bi_sectors += sectors; do_pack = true; - atomic64_add(sectors, &inode->ei_sectors); - h->op->sectors_added += sectors; - - if (h->op->is_dio) { - spin_lock(&inode->v.i_lock); - inode->v.i_blocks += sectors; - spin_unlock(&inode->v.i_lock); - } } if (do_pack) @@ -340,6 +307,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop) struct btree_iter extent_iter, inode_iter; struct bchfs_extent_trans_hook hook; struct bkey_i *k = bch2_keylist_front(keys); + s64 orig_sectors_added = op->sectors_added; int ret; BUG_ON(k->k.p.inode != op->inode->v.i_ino); @@ -362,7 +330,8 @@ static int bchfs_write_index_update(struct bch_write_op *wop) /* XXX: inode->i_size locking */ k = bch2_keylist_front(keys); - if (min(k->k.p.offset << 9, op->new_i_size) > op->inode->ei_size) + if (min(k->k.p.offset << 9, op->new_i_size) > + op->inode->ei_inode.bi_size) hook.need_inode_update = true; if (hook.need_inode_update) { @@ -430,9 +399,41 @@ err: bch2_btree_iter_unlock(&extent_iter); bch2_btree_iter_unlock(&inode_iter); + if (op->is_dio) + i_sectors_acct(wop->c, op->inode, + op->sectors_added - orig_sectors_added); + return ret; } +static inline void bch2_fswrite_op_init(struct bchfs_write_op *op, + struct bch_fs *c, + struct bch_inode_info *inode, + struct bch_io_opts opts, + bool is_dio) +{ + op->inode = inode; + op->sectors_added = 0; + op->is_dio = is_dio; + op->unalloc = false; + op->new_i_size = U64_MAX; + + bch2_write_op_init(&op->op, c); + op->op.csum_type = bch2_data_checksum_type(c, opts.data_checksum); + op->op.compression_type = bch2_compression_opt_to_type(opts.compression); + op->op.devs = c->fastest_devs; + op->op.index_update_fn = bchfs_write_index_update; + op_journal_seq_set(&op->op, &inode->ei_journal_seq); +} + +static inline struct bch_io_opts io_opts(struct bch_fs *c, struct bch_inode_info *inode) +{ + struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); + + bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode->ei_inode)); + return opts; +} + /* page state: */ /* stored in page->private: */ @@ -551,11 +552,8 @@ static void bch2_clear_page_bits(struct page *page) s = xchg(page_state(page), (struct bch_page_state) { .v = 0 }); ClearPagePrivate(page); - if (s.dirty_sectors) { - spin_lock(&inode->v.i_lock); - inode->v.i_blocks -= s.dirty_sectors; - spin_unlock(&inode->v.i_lock); - } + if (s.dirty_sectors) + i_sectors_acct(c, inode, -s.dirty_sectors); if (s.reserved) bch2_disk_reservation_put(c, &res); @@ -563,19 +561,16 @@ static void bch2_clear_page_bits(struct page *page) int bch2_set_page_dirty(struct page *page) { + struct bch_inode_info *inode = to_bch_ei(page->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_page_state old, new; old = page_state_cmpxchg(page_state(page), new, new.dirty_sectors = PAGE_SECTORS - new.sectors; ); - if (old.dirty_sectors != 
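
Note how the new io_opts()/bch2_fswrite_op_init() pair above threads per-inode options into every buffered and direct write: the filesystem-wide defaults are converted to inode-option form, anything set on the inode overrides them, and the result selects the checksum and compression type for the bch_write_op. A sketch of that layering, using only helpers introduced in this patch (the function name is illustrative):

/* Sketch: per-inode options layered over filesystem defaults. */
static void init_write_for_inode(struct bch_fs *c, struct bchfs_write_op *op,
				 struct bch_inode_info *inode)
{
	/* start from the filesystem-wide defaults... */
	struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);

	/* ...override with whatever is set on this inode... */
	bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode->ei_inode));

	/* ...and let the op derive csum/compression types from the result: */
	bch2_fswrite_op_init(op, c, inode, opts, false);
}
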
new.dirty_sectors) { - struct bch_inode_info *inode = to_bch_ei(page->mapping->host); - - spin_lock(&inode->v.i_lock); - inode->v.i_blocks += new.dirty_sectors - old.dirty_sectors; - spin_unlock(&inode->v.i_lock); - } + if (old.dirty_sectors != new.dirty_sectors) + i_sectors_acct(c, inode, new.dirty_sectors - old.dirty_sectors); return __set_page_dirty_nobuffers(page); } @@ -624,7 +619,7 @@ static void bch2_readpages_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - if (!bio->bi_error) { + if (!bio->bi_status) { SetPageUptodate(page); } else { ClearPageUptodate(page); @@ -846,6 +841,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping, { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts = io_opts(c, inode); struct btree_iter iter; struct page *page; struct readpages_iter readpages_iter = { @@ -868,7 +864,8 @@ int bch2_readpages(struct file *file, struct address_space *mapping, c->sb.encoded_extent_max >> PAGE_SECTOR_SHIFT); struct bch_read_bio *rbio = - to_rbio(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read)); + rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read), + opts); rbio->bio.bi_end_io = bch2_readpages_end_io; bio_add_page_contig(&rbio->bio, page); @@ -914,9 +911,10 @@ int bch2_readpage(struct file *file, struct page *page) { struct bch_inode_info *inode = to_bch_ei(page->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts = io_opts(c, inode); struct bch_read_bio *rbio; - rbio = to_rbio(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read)); + rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts); rbio->bio.bi_end_io = bch2_readpages_end_io; __bchfs_readpage(c, rbio, inode->v.i_ino, page); @@ -925,8 +923,15 @@ int bch2_readpage(struct file *file, struct page *page) struct bch_writepage_state { struct bch_writepage_io *io; + struct bch_io_opts opts; }; +static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, + struct bch_inode_info *inode) +{ + return (struct bch_writepage_state) { .opts = io_opts(c, inode) }; +} + static void bch2_writepage_io_free(struct closure *cl) { struct bch_writepage_io *io = container_of(cl, @@ -982,13 +987,8 @@ static void bch2_writepage_io_done(struct closure *cl) * PageWriteback is effectively our ref on the inode - fixup i_blocks * before calling end_page_writeback: */ - if (io->op.sectors_added) { - struct bch_inode_info *inode = io->op.inode; - - spin_lock(&inode->v.i_lock); - inode->v.i_blocks += io->op.sectors_added; - spin_unlock(&inode->v.i_lock); - } + if (io->op.sectors_added) + i_sectors_acct(c, io->op.inode, io->op.sectors_added); bio_for_each_segment_all(bvec, bio, i) end_page_writeback(bvec->bv_page); @@ -1004,8 +1004,6 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w) w->io = NULL; atomic_add(bio->bi_vcnt, &io->op.op.c->writeback_pages); - io->op.op.pos.offset = bio->bi_iter.bi_sector; - closure_call(&io->op.op.cl, bch2_write, NULL, &io->cl); continue_at(&io->cl, bch2_writepage_io_done, NULL); } @@ -1017,46 +1015,26 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w) static void bch2_writepage_io_alloc(struct bch_fs *c, struct bch_writepage_state *w, struct bch_inode_info *inode, - struct page *page) + struct page *page, + struct bch_page_state s) { - u64 inum = inode->v.i_ino; - unsigned nr_replicas = page_state(page)->nr_replicas; + struct bch_write_op *op; + u64 offset = (u64) page->index << 
PAGE_SECTOR_SHIFT; - EBUG_ON(!nr_replicas); - /* XXX: disk_reservation->gen isn't plumbed through */ + w->io = container_of(bio_alloc_bioset(GFP_NOFS, + BIO_MAX_PAGES, + &c->writepage_bioset), + struct bch_writepage_io, op.op.wbio.bio); + op = &w->io->op.op; - if (!w->io) { -alloc_io: - w->io = container_of(bio_alloc_bioset(GFP_NOFS, - BIO_MAX_PAGES, - &c->writepage_bioset), - struct bch_writepage_io, op.op.wbio.bio); + closure_init(&w->io->cl, NULL); - closure_init(&w->io->cl, NULL); - bch2_fswrite_op_init(&w->io->op, inode, false); - bch2_write_op_init(&w->io->op.op, c, - (struct disk_reservation) { - .nr_replicas = c->opts.data_replicas, - }, - c->fastest_devs, - writepoint_hashed(inode->ei_last_dirtied), - POS(inum, 0), - &inode->ei_journal_seq, - 0); - w->io->op.op.index_update_fn = bchfs_write_index_update; - } - - if (w->io->op.op.res.nr_replicas != nr_replicas || - bio_add_page_contig(&w->io->op.op.wbio.bio, page)) { - bch2_writepage_do_io(w); - goto alloc_io; - } - - /* - * We shouldn't ever be handed pages for multiple inodes in a single - * pass - right? - */ - BUG_ON(inode != w->io->op.inode); + bch2_fswrite_op_init(&w->io->op, c, inode, w->opts, false); + op->nr_replicas = s.nr_replicas; + op->res.nr_replicas = s.nr_replicas; + op->write_point = writepoint_hashed(inode->ei_last_dirtied); + op->pos = POS(inode->v.i_ino, offset); + op->wbio.bio.bi_iter.bi_sector = offset; } static int __bch2_writepage(struct bch_fs *c, struct page *page, @@ -1091,7 +1069,32 @@ static int __bch2_writepage(struct bch_fs *c, struct page *page, */ zero_user_segment(page, offset, PAGE_SIZE); do_io: - bch2_writepage_io_alloc(c, w, inode, page); + /* Before unlocking the page, transfer reservation to w->io: */ + old = page_state_cmpxchg(page_state(page), new, { + EBUG_ON(!new.reserved && + (new.sectors != PAGE_SECTORS || + !new.allocated)); + + if (new.allocated && w->opts.compression) + new.allocated = 0; + else if (!new.reserved) + break; + new.reserved = 0; + }); + + if (w->io && + (w->io->op.op.res.nr_replicas != old.nr_replicas || + !bio_can_add_page_contig(&w->io->op.op.wbio.bio, page))) + bch2_writepage_do_io(w); + + if (!w->io) + bch2_writepage_io_alloc(c, w, inode, page, old); + + BUG_ON(inode != w->io->op.inode); + BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page)); + + if (old.reserved) + w->io->op.op.res.sectors += old.nr_replicas * PAGE_SECTORS; /* while page is locked: */ w->io->op.new_i_size = i_size; @@ -1099,24 +1102,6 @@ do_io: if (wbc->sync_mode == WB_SYNC_ALL) w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC; - /* Before unlocking the page, transfer reservation to w->io: */ - old = page_state_cmpxchg(page_state(page), new, { - EBUG_ON(!new.reserved && - (new.sectors != PAGE_SECTORS || - !new.allocated)); - - if (new.allocated && - w->io->op.op.compression_type != BCH_COMPRESSION_NONE) - new.allocated = 0; - else if (!new.reserved) - goto out; - new.reserved = 0; - }); - - w->io->op.op.res.sectors += PAGE_SECTORS * - (old.reserved - new.reserved) * - old.nr_replicas; -out: BUG_ON(PageWriteback(page)); set_page_writeback(page); unlock_page(page); @@ -1127,7 +1112,8 @@ out: int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct bch_fs *c = mapping->host->i_sb->s_fs_info; - struct bch_writepage_state w = { NULL }; + struct bch_writepage_state w = + bch_writepage_state_init(c, to_bch_ei(mapping->host)); struct pagecache_iter iter; struct page *page; int ret = 0; @@ -1275,7 +1261,8 @@ continue_unlock: int bch2_writepage(struct page *page, struct 
writeback_control *wbc) { struct bch_fs *c = page->mapping->host->i_sb->s_fs_info; - struct bch_writepage_state w = { NULL }; + struct bch_writepage_state w = + bch_writepage_state_init(c, to_bch_ei(page->mapping->host)); int ret; ret = __bch2_writepage(c, page, wbc, &w); @@ -1306,7 +1293,7 @@ static int bch2_read_single_page(struct page *page, __bchfs_readpage(c, rbio, inode->v.i_ino, page); wait_for_completion(&done); - ret = rbio->bio.bi_error; + ret = blk_status_to_errno(rbio->bio.bi_status); bio_put(&rbio->bio); if (ret < 0) @@ -1440,8 +1427,8 @@ static void bch2_direct_IO_read_endio(struct bio *bio) { struct dio_read *dio = bio->bi_private; - if (bio->bi_error) - dio->ret = bio->bi_error; + if (bio->bi_status) + dio->ret = blk_status_to_errno(bio->bi_status); closure_put(&dio->cl); } @@ -1456,6 +1443,7 @@ static int bch2_direct_IO_read(struct bch_fs *c, struct kiocb *req, struct file *file, struct bch_inode_info *inode, struct iov_iter *iter, loff_t offset) { + struct bch_io_opts opts = io_opts(c, inode); struct dio_read *dio; struct bio *bio; bool sync = is_sync_kiocb(req); @@ -1512,7 +1500,7 @@ start: ret = bio_iov_iter_get_pages(bio, iter); if (ret < 0) { /* XXX: fault inject this path */ - bio->bi_error = ret; + bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); break; } @@ -1523,7 +1511,7 @@ start: if (iter->count) closure_get(&dio->cl); - bch2_read(c, to_rbio(bio), inode->v.i_ino); + bch2_read(c, rbio_init(bio, opts), inode->v.i_ino); } if (sync) { @@ -1542,9 +1530,9 @@ static long __bch2_dio_write_complete(struct dio_write *dio) struct file *file = dio->req->ki_filp; struct address_space *mapping = file->f_mapping; struct bch_inode_info *inode = file_bch_inode(file); - long ret = dio->error ?: dio->written; + long ret = dio->iop.op.error ?: ((long) dio->iop.op.written << 9); - bch2_disk_reservation_put(dio->c, &dio->res); + bch2_disk_reservation_put(dio->c, &dio->iop.op.res); __pagecache_block_put(&mapping->add_lock); inode_dio_end(&inode->v); @@ -1569,11 +1557,6 @@ static void bch2_dio_write_done(struct dio_write *dio) struct bio_vec *bv; int i; - dio->written += dio->iop.op.written << 9; - - if (dio->iop.op.error) - dio->error = dio->iop.op.error; - bio_for_each_segment_all(bv, &dio->iop.op.wbio.bio, i) put_page(bv->bv_page); @@ -1586,38 +1569,15 @@ static void bch2_do_direct_IO_write(struct dio_write *dio) struct file *file = dio->req->ki_filp; struct bch_inode_info *inode = file_bch_inode(file); struct bio *bio = &dio->iop.op.wbio.bio; - unsigned flags = 0; int ret; - if ((dio->req->ki_flags & IOCB_DSYNC) && - !dio->c->opts.journal_flush_disabled) - flags |= BCH_WRITE_FLUSH; - ret = bio_iov_iter_get_pages(bio, &dio->iter); if (ret < 0) { - /* - * these didn't get initialized, but bch2_dio_write_done() will - * look at them: - */ - dio->iop.op.error = 0; - dio->iop.op.written = 0; - dio->error = ret; + dio->iop.op.error = ret; return; } - dio->iop.sectors_added = 0; - bch2_write_op_init(&dio->iop.op, dio->c, dio->res, - dio->c->fastest_devs, - writepoint_hashed((unsigned long) dio->task), - POS(inode->v.i_ino, (dio->offset + dio->written) >> 9), - &inode->ei_journal_seq, - flags); - dio->iop.op.index_update_fn = bchfs_write_index_update; - - if (!dio->iop.unalloc) { - dio->res.sectors -= bio_sectors(bio); - dio->iop.op.res.sectors = bio_sectors(bio); - } + dio->iop.op.pos = POS(inode->v.i_ino, (dio->offset >> 9) + dio->iop.op.written); task_io_account_write(bio->bi_iter.bi_size); @@ -1632,7 +1592,7 @@ static void bch2_dio_write_loop_async(struct closure *cl) 
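
The bi_error-to-bi_status conversions in these hunks follow one rule consistently: inside the I/O stack the opaque blk_status_t is stored and compared as-is, and blk_status_to_errno() is called only where a plain errno leaves the stack (read completion, direct-IO return values). A minimal standalone sketch of the idiom, with hypothetical names:

#include <linux/bio.h>
#include <linux/completion.h>

/* Sketch of the bi_status idiom (names are illustrative, not bcachefs code). */
struct sketch_io {
	struct completion	done;
	int			ret;	/* errno handed to the caller */
};

static void sketch_endio(struct bio *bio)
{
	struct sketch_io *io = bio->bi_private;

	/* keep blk_status_t opaque in the stack; convert only at the boundary: */
	io->ret = blk_status_to_errno(bio->bi_status);
	complete(&io->done);
}
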
bch2_dio_write_done(dio); - if (dio->iter.count && !dio->error) { + if (dio->iter.count && !dio->iop.op.error) { use_mm(dio->task->mm); pagecache_block_get(&mapping->add_lock); @@ -1652,31 +1612,6 @@ static void bch2_dio_write_loop_async(struct closure *cl) } } -static int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, - u64 size) -{ - struct btree_iter iter; - struct bpos end = pos; - struct bkey_s_c k; - int ret = 0; - - end.offset += size; - - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos, - BTREE_ITER_WITH_HOLES, k) { - if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) - break; - - if (!bch2_extent_is_fully_allocated(k)) { - ret = -ENOSPC; - break; - } - } - bch2_btree_iter_unlock(&iter); - - return ret; -} - static int bch2_direct_IO_write(struct bch_fs *c, struct kiocb *req, struct file *file, struct bch_inode_info *inode, @@ -1703,13 +1638,17 @@ static int bch2_direct_IO_write(struct bch_fs *c, closure_init(&dio->cl, NULL); dio->req = req; dio->c = c; - dio->written = 0; - dio->error = 0; dio->offset = offset; dio->iovec = NULL; dio->iter = *iter; dio->task = current; - bch2_fswrite_op_init(&dio->iop, inode, true); + bch2_fswrite_op_init(&dio->iop, c, inode, io_opts(c, inode), true); + dio->iop.op.write_point = writepoint_hashed((unsigned long) dio->task); + dio->iop.op.flags |= BCH_WRITE_NOPUT_RESERVATION; + + if ((dio->req->ki_flags & IOCB_DSYNC) && + !c->opts.journal_flush_disabled) + dio->iop.op.flags |= BCH_WRITE_FLUSH; if (offset + iter->count > inode->v.i_size) sync = true; @@ -1722,7 +1661,7 @@ static int bch2_direct_IO_write(struct bch_fs *c, * Have to then guard against racing with truncate (deleting data that * we would have been overwriting) */ - ret = bch2_disk_reservation_get(c, &dio->res, iter->count >> 9, 0); + ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9, 0); if (unlikely(ret)) { if (bch2_check_range_allocated(c, POS(inode->v.i_ino, offset >> 9), @@ -1735,6 +1674,8 @@ static int bch2_direct_IO_write(struct bch_fs *c, dio->iop.unalloc = true; } + dio->iop.op.nr_replicas = dio->iop.op.res.nr_replicas; + inode_dio_begin(&inode->v); __pagecache_block_get(&mapping->add_lock); @@ -1744,20 +1685,20 @@ static int bch2_direct_IO_write(struct bch_fs *c, closure_sync(&dio->cl); bch2_dio_write_done(dio); - } while (dio->iter.count && !dio->error); + } while (dio->iter.count && !dio->iop.op.error); closure_debug_destroy(&dio->cl); return __bch2_dio_write_complete(dio); } else { bch2_do_direct_IO_write(dio); - if (dio->iter.count && !dio->error) { + if (dio->iter.count && !dio->iop.op.error) { if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { dio->iovec = kmalloc(dio->iter.nr_segs * sizeof(struct iovec), GFP_KERNEL); if (!dio->iovec) - dio->error = -ENOMEM; + dio->iop.op.error = -ENOMEM; } else { dio->iovec = dio->inline_vecs; } @@ -1965,11 +1906,11 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) return bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq); } -static int __bch2_truncate_page(struct address_space *mapping, +static int __bch2_truncate_page(struct bch_inode_info *inode, pgoff_t index, loff_t start, loff_t end) { - struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct address_space *mapping = inode->v.i_mapping; unsigned start_offset = start & (PAGE_SIZE - 1); unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; struct page *page; @@ -2049,10 +1990,10 @@ out: return ret; } -static int bch2_truncate_page(struct address_space 
*mapping, loff_t from) +static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) { - return __bch2_truncate_page(mapping, from >> PAGE_SHIFT, - from, from + PAGE_SIZE); + return __bch2_truncate_page(inode, from >> PAGE_SHIFT, + from, from + PAGE_SIZE); } int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) @@ -2060,6 +2001,8 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; bool shrink = iattr->ia_size <= inode->v.i_size; + struct i_sectors_hook i_sectors_hook = + i_sectors_hook_init(inode, BCH_INODE_I_SIZE_DIRTY); int ret = 0; inode_dio_wait(&inode->v); @@ -2069,17 +2012,15 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) /* sync appends.. */ /* XXX what protects inode->i_size? */ - if (iattr->ia_size > inode->ei_size) + if (iattr->ia_size > inode->ei_inode.bi_size) ret = filemap_write_and_wait_range(mapping, - inode->ei_size, S64_MAX); + inode->ei_inode.bi_size, S64_MAX); if (ret) goto err_put_pagecache; - mutex_lock(&inode->ei_update_lock); - i_size_dirty_get(inode); - ret = bch2_write_inode_size(c, inode, inode->v.i_size); - mutex_unlock(&inode->ei_update_lock); + i_sectors_hook.new_i_size = iattr->ia_size; + ret = i_sectors_dirty_start(c, &i_sectors_hook); if (unlikely(ret)) goto err; @@ -2090,45 +2031,32 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) * here (new i_size < current i_size): */ if (shrink) { - struct i_sectors_hook i_sectors_hook; - int ret; - - ret = i_sectors_dirty_get(c, inode, &i_sectors_hook); + ret = bch2_truncate_page(inode, iattr->ia_size); if (unlikely(ret)) goto err; - ret = bch2_truncate_page(inode->v.i_mapping, iattr->ia_size); - if (unlikely(ret)) { - i_sectors_dirty_put(c, inode, &i_sectors_hook); - goto err; - } - ret = bch2_inode_truncate(c, inode->v.i_ino, - round_up(iattr->ia_size, PAGE_SIZE) >> 9, - &i_sectors_hook.hook, - &inode->ei_journal_seq); - - i_sectors_dirty_put(c, inode, &i_sectors_hook); - + round_up(iattr->ia_size, PAGE_SIZE) >> 9, + &i_sectors_hook.hook, + &inode->ei_journal_seq); if (unlikely(ret)) goto err; } - mutex_lock(&inode->ei_update_lock); setattr_copy(&inode->v, iattr); - inode->v.i_mtime = inode->v.i_ctime = current_fs_time(inode->v.i_sb); -out: - /* clear I_SIZE_DIRTY: */ - i_size_dirty_put(inode); - ret = bch2_write_inode_size(c, inode, inode->v.i_size); - mutex_unlock(&inode->ei_update_lock); + inode->v.i_mtime = inode->v.i_ctime = current_time(&inode->v); +err: + /* + * On error - in particular, bch2_truncate_page() error - don't clear + * I_SIZE_DIRTY, as we've left data above i_size!: + */ + if (ret) + i_sectors_hook.flags &= ~BCH_INODE_I_SIZE_DIRTY; + ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; err_put_pagecache: pagecache_block_put(&mapping->add_lock); return ret; -err: - mutex_lock(&inode->ei_update_lock); - goto out; } static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) @@ -2144,33 +2072,41 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) inode_dio_wait(&inode->v); pagecache_block_get(&mapping->add_lock); - ret = __bch2_truncate_page(mapping, + ret = __bch2_truncate_page(inode, offset >> PAGE_SHIFT, offset, offset + len); if (unlikely(ret)) - goto out; + goto err; if (offset >> PAGE_SHIFT != (offset + len) >> PAGE_SHIFT) { - ret = __bch2_truncate_page(mapping, + ret = __bch2_truncate_page(inode, (offset + len) >> PAGE_SHIFT, offset, offset + len); 
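
__bch2_truncate_page() now takes the bch_inode_info directly, and bch2_truncate() (next hunk) is restructured around a single i_sectors_hook carrying BCH_INODE_I_SIZE_DIRTY. The error rule there is subtle enough to spell out: i_sectors_dirty_finish() clears exactly the flags in h->flags, so on failure the truncate path removes I_SIZE_DIRTY from that set, deliberately leaving the on-disk inode marked dirty. The tail of that function, as a hypothetical helper:

/* Sketch: truncate_finish() is an illustrative refactor of the tail of
 * bch2_truncate() in the hunk below. */
static int truncate_finish(struct bch_fs *c, struct i_sectors_hook *h, int ret)
{
	/*
	 * On error (e.g. from bch2_truncate_page()) data may remain past the
	 * new i_size; keeping BCH_INODE_I_SIZE_DIRTY out of the flags that
	 * finish will clear leaves the inode marked dirty on disk, so
	 * recovery re-truncates instead of exposing stale data.
	 */
	if (ret)
		h->flags &= ~BCH_INODE_I_SIZE_DIRTY;

	return i_sectors_dirty_finish(c, h) ?: ret;
}
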
if (unlikely(ret)) - goto out; + goto err; } truncate_pagecache_range(&inode->v, offset, offset + len - 1); if (discard_start < discard_end) { struct disk_reservation disk_res; - struct i_sectors_hook i_sectors_hook; + struct i_sectors_hook i_sectors_hook = + i_sectors_hook_init(inode, 0); int ret; - BUG_ON(bch2_disk_reservation_get(c, &disk_res, 0, 0)); - - ret = i_sectors_dirty_get(c, inode, &i_sectors_hook); + ret = i_sectors_dirty_start(c, &i_sectors_hook); if (unlikely(ret)) - goto out; + goto err; + + /* + * We need to pass in a disk reservation here because we might + * be splitting a compressed extent into two. This isn't a + * problem with truncate because truncate will never split an + * extent, only truncate it... + */ + ret = bch2_disk_reservation_get(c, &disk_res, 0, 0); + BUG_ON(ret); ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, @@ -2180,11 +2116,11 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) &disk_res, &i_sectors_hook.hook, &inode->ei_journal_seq); - - i_sectors_dirty_put(c, inode, &i_sectors_hook); bch2_disk_reservation_put(c, &disk_res); + + ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; } -out: +err: pagecache_block_put(&mapping->add_lock); inode_unlock(&inode->v); @@ -2200,7 +2136,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode, struct btree_iter dst; BKEY_PADDED(k) copy; struct bkey_s_c k; - struct i_sectors_hook i_sectors_hook; + struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0); loff_t new_size; int ret; @@ -2237,7 +2173,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode, if (ret) goto err; - ret = i_sectors_dirty_get(c, inode, &i_sectors_hook); + ret = i_sectors_dirty_start(c, &i_sectors_hook); if (ret) goto err; @@ -2278,8 +2214,14 @@ static long bch2_fcollapse(struct bch_inode_info *inode, BTREE_INSERT_ENTRY(&dst, ©.k)); bch2_disk_reservation_put(c, &disk_res); btree_iter_err: - if (ret < 0 && ret != -EINTR) - goto err_unwind; + if (ret == -EINTR) + ret = 0; + if (ret) + goto err_put_sectors_dirty; + /* + * XXX: if we error here we've left data with multiple + * pointers... which isn't a _super_ serious problem... + */ bch2_btree_iter_cond_resched(&src); } @@ -2292,30 +2234,18 @@ btree_iter_err: &i_sectors_hook.hook, &inode->ei_journal_seq); if (ret) - goto err_unwind; + goto err_put_sectors_dirty; - i_sectors_dirty_put(c, inode, &i_sectors_hook); - - mutex_lock(&inode->ei_update_lock); i_size_write(&inode->v, new_size); - ret = bch2_write_inode_size(c, inode, inode->v.i_size); - mutex_unlock(&inode->ei_update_lock); - + i_sectors_hook.new_i_size = new_size; +err_put_sectors_dirty: + ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; +err: pagecache_block_put(&mapping->add_lock); inode_unlock(&inode->v); - return ret; -err_unwind: - /* - * XXX: we've left data with multiple pointers... which isn't a _super_ - * serious problem... 
- */ - i_sectors_dirty_put(c, inode, &i_sectors_hook); -err: bch2_btree_iter_unlock(&src); bch2_btree_iter_unlock(&dst); - pagecache_block_put(&mapping->add_lock); - inode_unlock(&inode->v); return ret; } @@ -2324,11 +2254,11 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, { struct address_space *mapping = inode->v.i_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct i_sectors_hook i_sectors_hook; + struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0); struct btree_iter iter; - struct bpos end; + struct bpos end_pos; loff_t block_start, block_end; - loff_t new_size = offset + len; + loff_t end = offset + len; unsigned sectors; unsigned replicas = READ_ONCE(c->opts.data_replicas); int ret; @@ -2340,45 +2270,43 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, inode_dio_wait(&inode->v); pagecache_block_get(&mapping->add_lock); - if (!(mode & FALLOC_FL_KEEP_SIZE) && - new_size > inode->v.i_size) { - ret = inode_newsize_ok(&inode->v, new_size); + if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { + ret = inode_newsize_ok(&inode->v, end); if (ret) goto err; } if (mode & FALLOC_FL_ZERO_RANGE) { - ret = __bch2_truncate_page(mapping, + ret = __bch2_truncate_page(inode, offset >> PAGE_SHIFT, - offset, offset + len); + offset, end); if (!ret && - offset >> PAGE_SHIFT != - (offset + len) >> PAGE_SHIFT) - ret = __bch2_truncate_page(mapping, - (offset + len) >> PAGE_SHIFT, - offset, offset + len); + offset >> PAGE_SHIFT != end >> PAGE_SHIFT) + ret = __bch2_truncate_page(inode, + end >> PAGE_SHIFT, + offset, end); if (unlikely(ret)) goto err; - truncate_pagecache_range(&inode->v, offset, offset + len - 1); + truncate_pagecache_range(&inode->v, offset, end - 1); block_start = round_up(offset, PAGE_SIZE); - block_end = round_down(offset + len, PAGE_SIZE); + block_end = round_down(end, PAGE_SIZE); } else { block_start = round_down(offset, PAGE_SIZE); - block_end = round_up(offset + len, PAGE_SIZE); + block_end = round_up(end, PAGE_SIZE); } bch2_btree_iter_set_pos(&iter, POS(inode->v.i_ino, block_start >> 9)); - end = POS(inode->v.i_ino, block_end >> 9); + end_pos = POS(inode->v.i_ino, block_end >> 9); - ret = i_sectors_dirty_get(c, inode, &i_sectors_hook); + ret = i_sectors_dirty_start(c, &i_sectors_hook); if (unlikely(ret)) goto err; - while (bkey_cmp(iter.pos, end) < 0) { + while (bkey_cmp(iter.pos, end_pos) < 0) { struct disk_reservation disk_res = { 0 }; struct bkey_i_reservation reservation; struct bkey_s_c k; @@ -2407,7 +2335,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, reservation.k.size = k.k->size; bch2_cut_front(iter.pos, &reservation.k_i); - bch2_cut_back(end, &reservation.k); + bch2_cut_back(end_pos, &reservation.k); sectors = reservation.k.size; reservation.v.nr_replicas = bch2_extent_nr_dirty_ptrs(k); @@ -2435,11 +2363,11 @@ btree_iter_err: } bch2_btree_iter_unlock(&iter); - i_sectors_dirty_put(c, inode, &i_sectors_hook); + ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; if (!(mode & FALLOC_FL_KEEP_SIZE) && - new_size > inode->v.i_size) { - i_size_write(&inode->v, new_size); + end > inode->v.i_size) { + i_size_write(&inode->v, end); mutex_lock(&inode->ei_update_lock); ret = bch2_write_inode_size(c, inode, inode->v.i_size); @@ -2449,14 +2377,14 @@ btree_iter_err: /* blech */ if ((mode & FALLOC_FL_KEEP_SIZE) && (mode & FALLOC_FL_ZERO_RANGE) && - inode->ei_size != inode->v.i_size) { + inode->ei_inode.bi_size != inode->v.i_size) { /* sync appends.. 
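
fpunch, fcollapse and fallocate now all share the same bracket shape around their btree updates: i_sectors_hook_init() builds the hook, i_sectors_dirty_start() marks the inode's sector count dirty in the btree, the extent hooks accumulate a signed sector delta, and i_sectors_dirty_finish() writes i_size/i_sectors back and clears the dirty flags in a single inode update. The caller shape, reduced to a sketch:

/* Sketch of the start/finish bracket used by the hunks above and below. */
static int extent_op_example(struct bch_fs *c, struct bch_inode_info *inode)
{
	struct i_sectors_hook h = i_sectors_hook_init(inode, 0);
	int ret;

	ret = i_sectors_dirty_start(c, &h);	/* sets I_SECTORS_DIRTY */
	if (ret)
		return ret;

	ret = 0; /* ... btree updates pass &h.hook, accumulating h.sectors ... */

	/* finish always runs; '?:' keeps an earlier error from being masked: */
	return i_sectors_dirty_finish(c, &h) ?: ret;
}
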
*/ ret = filemap_write_and_wait_range(mapping, - inode->ei_size, S64_MAX); + inode->ei_inode.bi_size, S64_MAX); if (ret) goto err; - if (inode->ei_size != inode->v.i_size) { + if (inode->ei_inode.bi_size != inode->v.i_size) { mutex_lock(&inode->ei_update_lock); ret = bch2_write_inode_size(c, inode, inode->v.i_size); mutex_unlock(&inode->ei_update_lock); @@ -2468,7 +2396,7 @@ btree_iter_err: return 0; err_put_sectors_dirty: - i_sectors_dirty_put(c, inode, &i_sectors_hook); + ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; err: bch2_btree_iter_unlock(&iter); pagecache_block_put(&mapping->add_lock); @@ -2669,11 +2597,14 @@ void bch2_fs_fsio_exit(struct bch_fs *c) int bch2_fs_fsio_init(struct bch_fs *c) { if (bioset_init(&c->writepage_bioset, - 4, offsetof(struct bch_writepage_io, op.op.wbio.bio)) || + 4, offsetof(struct bch_writepage_io, op.op.wbio.bio), + BIOSET_NEED_BVECS) || bioset_init(&c->dio_read_bioset, - 4, offsetof(struct dio_read, rbio.bio)) || + 4, offsetof(struct dio_read, rbio.bio), + BIOSET_NEED_BVECS) || bioset_init(&c->dio_write_bioset, - 4, offsetof(struct dio_write, iop.op.wbio.bio))) + 4, offsetof(struct dio_write, iop.op.wbio.bio), + BIOSET_NEED_BVECS)) return -ENOMEM; return 0; diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index bd915fec..24228c8e 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -75,7 +75,7 @@ do { \ /* Set VFS inode flags from bcachefs inode: */ void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) { - set_flags(bch_flags_to_vfs, inode->ei_flags, inode->v.i_flags); + set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); } static int bch2_inode_flags_set(struct bch_inode_info *inode, @@ -99,13 +99,13 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode, return -EINVAL; bi->bi_flags = newflags; - inode->v.i_ctime = current_fs_time(inode->v.i_sb); + inode->v.i_ctime = current_time(&inode->v); return 0; } static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg) { - unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_flags); + unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags); return put_user(flags, arg); } @@ -153,7 +153,7 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, { struct fsxattr fa = { 0 }; - fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_flags); + fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); return copy_to_user(arg, &fa, sizeof(fa)); } diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 43688cd3..cb0397f1 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -12,6 +12,7 @@ #include "fs-ioctl.h" #include "fsck.h" #include "inode.h" +#include "io.h" #include "journal.h" #include "keylist.h" #include "super.h" @@ -130,10 +131,8 @@ int __must_check __bch2_write_inode(struct bch_fs *c, BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i)); } while (ret == -EINTR); - if (!ret) { - inode->ei_size = inode_u.bi_size; - inode->ei_flags = inode_u.bi_flags; - } + if (!ret) + inode->ei_inode = inode_u; out: bch2_btree_iter_unlock(&iter); @@ -146,7 +145,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, return __bch2_write_inode(c, inode, NULL, NULL); } -int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode) +static int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode) { int ret; @@ -158,7 +157,7 @@ int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode) return ret; } -int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode) +static 
int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode) { int ret = 0; @@ -223,7 +222,9 @@ static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c, bch2_inode_init(c, &inode_u, i_uid_read(&inode->v), i_gid_read(&inode->v), - inode->v.i_mode, rdev); + inode->v.i_mode, rdev, + &dir->ei_inode); + ret = bch2_inode_create(c, &inode_u, BLOCKDEV_INODE_MAX, 0, &c->unused_inode_hint); @@ -277,7 +278,7 @@ static int bch2_vfs_dirent_create(struct bch_fs *c, if (unlikely(ret)) return ret; - dir->v.i_mtime = dir->v.i_ctime = current_fs_time(c->vfs_sb); + dir->v.i_mtime = dir->v.i_ctime = current_time(&dir->v); mark_inode_dirty_sync(&dir->v); return 0; } @@ -344,7 +345,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, lockdep_assert_held(&inode->v.i_rwsem); - inode->v.i_ctime = current_fs_time(dir->v.i_sb); + inode->v.i_ctime = current_time(&dir->v); ret = bch2_inc_nlink(c, inode); if (ret) @@ -473,7 +474,7 @@ static int bch2_rename(struct bch_fs *c, { struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode); struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode); - struct timespec now = current_fs_time(old_dir->v.i_sb); + struct timespec now = current_time(&old_dir->v); int ret; lockdep_assert_held(&old_dir->v.i_rwsem); @@ -551,7 +552,7 @@ static int bch2_rename_exchange(struct bch_fs *c, { struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode); struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode); - struct timespec now = current_fs_time(old_dir->v.i_sb); + struct timespec now = current_time(&old_dir->v); int ret; ret = bch2_dirent_rename(c, @@ -909,10 +910,8 @@ static void bch2_vfs_inode_init(struct bch_fs *c, inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); inode->ei_journal_seq = 0; - inode->ei_size = bi->bi_size; - inode->ei_flags = bi->bi_flags; - atomic64_set(&inode->ei_sectors, bi->bi_sectors); inode->ei_str_hash = bch2_hash_info_init(c, bi); + inode->ei_inode = *bi; bch2_inode_flags_to_vfs(inode); @@ -949,8 +948,6 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) inode_init_once(&inode->v); mutex_init(&inode->ei_update_lock); inode->ei_journal_seq = 0; - atomic_long_set(&inode->ei_size_dirty_count, 0); - atomic_long_set(&inode->ei_sectors_dirty_count, 0); return &inode->v; } @@ -995,12 +992,6 @@ static void bch2_evict_inode(struct inode *vinode) truncate_inode_pages_final(&inode->v.i_data); - if (!bch2_journal_error(&c->journal) && !is_bad_inode(&inode->v)) { - /* XXX - we want to check this stuff iff there weren't IO errors: */ - BUG_ON(atomic_long_read(&inode->ei_sectors_dirty_count)); - BUG_ON(atomic64_read(&inode->ei_sectors) != inode->v.i_blocks); - } - clear_inode(&inode->v); if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { @@ -1272,9 +1263,16 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, sb->s_magic = BCACHEFS_STATFS_MAGIC; sb->s_time_gran = c->sb.time_precision; c->vfs_sb = sb; - sb->s_bdi = &c->bdi; strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); + ret = super_setup_bdi(sb); + if (ret) + goto err_put_super; + + sb->s_bdi->congested_fn = bch2_congested; + sb->s_bdi->congested_data = c; + sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + for_each_online_member(ca, c, i) { struct block_device *bdev = ca->disk_sb.bdev; diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index d255ca7c..652105fb 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -1,6 +1,7 @@ #ifndef _BCACHEFS_FS_H #define _BCACHEFS_FS_H +#include "opts.h" #include 
"str_hash.h" #include @@ -11,22 +12,12 @@ struct bch_inode_info { struct mutex ei_update_lock; u64 ei_journal_seq; - - atomic_long_t ei_size_dirty_count; - - /* - * these are updated whenever we update the inode in the btree - for - * e.g. fsync - */ - u64 ei_size; - u32 ei_flags; - - atomic_long_t ei_sectors_dirty_count; - atomic64_t ei_sectors; + unsigned long ei_last_dirtied; struct bch_hash_info ei_str_hash; - unsigned long ei_last_dirtied; + /* copy of inode in btree: */ + struct bch_inode_unpacked ei_inode; }; #define to_bch_ei(_inode) \ diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 4760b16e..696926fe 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -204,7 +204,7 @@ static int hash_check_key(const struct bch_hash_desc desc, "hash table key at wrong offset: %llu, " "hashed to %llu chain starts at %llu\n%s", k.k->p.offset, hashed, h->chain.pos.offset, - bch2_bkey_val_to_text(c, desc.btree_id, + bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id), buf, sizeof(buf), k))) { ret = hash_redo_key(desc, h, c, k_iter, k, hashed); if (ret) { @@ -224,7 +224,7 @@ static int hash_check_key(const struct bch_hash_desc desc, if (fsck_err_on(k2.k->type == desc.key_type && !desc.cmp_bkey(k, k2), c, "duplicate hash table keys:\n%s", - bch2_bkey_val_to_text(c, desc.btree_id, + bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id), buf, sizeof(buf), k))) { ret = bch2_hash_delete_at(desc, &h->info, &h->iter, NULL); if (ret) @@ -397,9 +397,9 @@ static int check_dirents(struct bch_fs *c) if (fsck_err_on(have_target && d.v->d_type != - mode_to_type(le16_to_cpu(target.bi_mode)), c, + mode_to_type(target.bi_mode), c, "incorrect d_type: should be %u:\n%s", - mode_to_type(le16_to_cpu(target.bi_mode)), + mode_to_type(target.bi_mode), bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS, buf, sizeof(buf), k))) { struct bkey_i_dirent *n; @@ -411,7 +411,7 @@ static int check_dirents(struct bch_fs *c) } bkey_reassemble(&n->k_i, d.s_c); - n->v.d_type = mode_to_type(le16_to_cpu(target.bi_mode)); + n->v.d_type = mode_to_type(target.bi_mode); ret = bch2_btree_insert_at(c, NULL, NULL, NULL, BTREE_INSERT_NOFAIL, @@ -493,7 +493,8 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) fsck_err: return ret; create_root: - bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0); + bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, + 0, NULL); root_inode->bi_inum = BCACHEFS_ROOT_INO; bch2_inode_pack(&packed, root_inode); @@ -545,7 +546,8 @@ create_lostfound: if (ret) return ret; - bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0); + bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, + 0, root_inode); ret = bch2_inode_create(c, lostfound_inode, BLOCKDEV_INODE_MAX, 0, &c->unused_inode_hint); diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 05f617ae..71a24cc6 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -198,6 +198,12 @@ static const char *bch2_inode_invalid(const struct bch_fs *c, if (bch2_inode_unpack(inode, &unpacked)) return "invalid variable length fields"; + if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) + return "invalid data checksum type"; + + if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) + return "invalid data checksum type"; + return NULL; } case BCH_INODE_BLOCKDEV: @@ -221,6 +227,7 @@ static const char *bch2_inode_invalid(const struct bch_fs *c, static void bch2_inode_to_text(struct bch_fs *c, char *buf, size_t size, struct bkey_s_c k) { + char 
*out = buf, *end = out + size; struct bkey_s_c_inode inode; struct bch_inode_unpacked unpacked; @@ -228,11 +235,14 @@ static void bch2_inode_to_text(struct bch_fs *c, char *buf, case BCH_INODE_FS: inode = bkey_s_c_to_inode(k); if (bch2_inode_unpack(inode, &unpacked)) { - scnprintf(buf, size, "(unpack error)"); + out += scnprintf(out, end - out, "(unpack error)"); break; } - scnprintf(buf, size, "i_size %llu", unpacked.bi_size); +#define BCH_INODE_FIELD(_name, _bits) \ + out += scnprintf(out, end - out, #_name ": %llu ", (u64) unpacked._name); + BCH_INODE_FIELDS() +#undef BCH_INODE_FIELD break; } } @@ -243,9 +253,12 @@ const struct bkey_ops bch2_bkey_inode_ops = { }; void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, - uid_t uid, gid_t gid, umode_t mode, dev_t rdev) + uid_t uid, gid_t gid, umode_t mode, dev_t rdev, + struct bch_inode_unpacked *parent) { - s64 now = timespec_to_bch2_time(c, CURRENT_TIME); + s64 now = timespec_to_bch2_time(c, + timespec_trunc(current_kernel_time(), + c->sb.time_precision)); memset(inode_u, 0, sizeof(*inode_u)); @@ -261,6 +274,12 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, inode_u->bi_mtime = now; inode_u->bi_ctime = now; inode_u->bi_otime = now; + + if (parent) { +#define BCH_INODE_FIELD(_name) inode_u->_name = parent->_name; + BCH_INODE_FIELDS_INHERIT() +#undef BCH_INODE_FIELD + } } int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u, @@ -416,7 +435,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) struct bch_inode_unpacked inode_u; if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) - bi_generation = cpu_to_le32(inode_u.bi_generation) + 1; + bi_generation = inode_u.bi_generation + 1; break; } case BCH_INODE_GENERATION: { diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index 53c70617..8ebb6fb6 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -1,6 +1,8 @@ #ifndef _BCACHEFS_INODE_H #define _BCACHEFS_INODE_H +#include "opts.h" + #include extern const struct bkey_ops bch2_bkey_inode_ops; @@ -28,7 +30,8 @@ void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *) int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, - uid_t, gid_t, umode_t, dev_t); + uid_t, gid_t, umode_t, dev_t, + struct bch_inode_unpacked *); int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *, u64, u64, u64 *); int bch2_inode_truncate(struct bch_fs *, u64, u64, @@ -55,6 +58,45 @@ static inline u64 timespec_to_bch2_time(struct bch_fs *c, struct timespec ts) return div_s64(ns, c->sb.time_precision); } +static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) +{ + struct bch_io_opts ret = { 0 }; + +#define BCH_INODE_OPT(_name, _bits) \ + if (inode->bi_##_name) \ + opt_set(ret, _name, inode->bi_##_name - 1); + BCH_INODE_OPTS() +#undef BCH_INODE_OPT + return ret; +} + +static inline void __bch2_inode_opt_set(struct bch_inode_unpacked *inode, + enum bch_opt_id id, u64 v) +{ + switch (id) { +#define BCH_INODE_OPT(_name, ...) 
\ + case Opt_##_name: \ + inode->bi_##_name = v; \ + break; + BCH_INODE_OPTS() +#undef BCH_INODE_OPT + default: + BUG(); + } +} + +static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, + enum bch_opt_id id, u64 v) +{ + return __bch2_inode_opt_set(inode, id, v + 1); +} + +static inline void bch2_inode_opt_clear(struct bch_inode_unpacked *inode, + enum bch_opt_id id) +{ + return __bch2_inode_opt_set(inode, id, 0); +} + #ifdef CONFIG_BCACHEFS_DEBUG void bch2_inode_pack_test(void); #else diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 0c41e411..3369a2ff 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -20,6 +20,7 @@ #include "journal.h" #include "keylist.h" #include "move.h" +#include "super.h" #include "super-io.h" #include @@ -139,7 +140,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, const struct bch_extent_ptr *ptr; struct bch_write_bio *n; struct bch_dev *ca; - unsigned ptr_idx = 0; BUG_ON(c->opts.nochanges); @@ -147,7 +147,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || !c->devs[ptr->dev]); - ca = c->devs[ptr->dev]; + ca = bch_dev_bkey_exists(c, ptr->dev); if (ptr + 1 < &extent_entry_last(e)->ptr) { n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO, @@ -168,7 +168,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->c = c; n->ca = ca; - n->ptr_idx = ptr_idx++; n->submit_time_us = local_clock_us(); n->bio.bi_iter.bi_sector = ptr->offset; @@ -184,7 +183,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, submit_bio(&n->bio); } else { n->have_io_ref = false; - bcache_io_error(c, &n->bio, "device has been removed"); + n->bio.bi_status = BLK_STS_REMOVED; bio_endio(&n->bio); } } @@ -201,9 +200,12 @@ static void bch2_write_done(struct closure *cl) if (!op->error && (op->flags & BCH_WRITE_FLUSH)) op->error = bch2_journal_error(&op->c->journal); - bch2_disk_reservation_put(op->c, &op->res); + if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) + bch2_disk_reservation_put(op->c, &op->res); percpu_ref_put(&op->c->writes); bch2_keylist_free(&op->insert_keys, op->inline_keys); + op->flags &= ~(BCH_WRITE_DONE|BCH_WRITE_LOOPED); + closure_return(cl); } @@ -244,9 +246,37 @@ static void bch2_write_index(struct closure *cl) struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; struct keylist *keys = &op->insert_keys; + struct bkey_s_extent e; + struct bch_extent_ptr *ptr; + struct bkey_i *src, *dst = keys->keys, *n; + int ret; op->flags |= BCH_WRITE_LOOPED; + for (src = keys->keys; src != keys->top; src = n) { + n = bkey_next(src); + bkey_copy(dst, src); + + e = bkey_i_to_s_extent(dst); + extent_for_each_ptr_backwards(e, ptr) + if (test_bit(ptr->dev, op->failed.d)) + bch2_extent_drop_ptr(e, ptr); + + ret = bch2_extent_nr_ptrs(e.c) + ? 
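
The accessors above encode per-inode options with a one-based shift: a stored bi_<opt> of 0 means "unset, inherit the filesystem default", and a real value v is stored as v + 1 (which is also why bch2_inode_invalid() in the earlier hunk validates against BCH_CSUM_OPT_NR + 1 and BCH_COMPRESSION_OPT_NR + 1). A standalone model of the encoding, with illustrative names:

#include <stdbool.h>
#include <stdint.h>

/* Standalone model of the v+1 option encoding (names are illustrative). */
struct inode_opt { uint64_t raw; };	/* raw == 0 means "unset" */

static void opt_set_val(struct inode_opt *o, uint64_t v)  { o->raw = v + 1; }
static void opt_clear_val(struct inode_opt *o)            { o->raw = 0; }
static bool opt_is_set(const struct inode_opt *o)         { return o->raw != 0; }
static uint64_t opt_get_val(const struct inode_opt *o)    { return o->raw - 1; }
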
bch2_check_mark_super(c, e.c, BCH_DATA_USER) + : -EIO; + if (ret) { + keys->top = keys->keys; + op->error = ret; + op->flags |= BCH_WRITE_DONE; + goto err; + } + + dst = bkey_next(dst); + } + + keys->top = dst; + if (!bch2_keylist_empty(keys)) { u64 sectors_start = keylist_sectors(keys); int ret = op->index_update_fn(op); @@ -260,7 +290,7 @@ static void bch2_write_index(struct closure *cl) op->error = ret; } } - +err: bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets); if (!(op->flags & BCH_WRITE_DONE)) @@ -276,43 +306,6 @@ static void bch2_write_index(struct closure *cl) } } -static void bch2_write_io_error(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct keylist *keys = &op->insert_keys; - struct bch_fs *c = op->c; - struct bch_extent_ptr *ptr; - struct bkey_i *k; - int ret; - - for_each_keylist_key(keys, k) { - struct bkey_i *n = bkey_next(k); - struct bkey_s_extent e = bkey_i_to_s_extent(k); - - extent_for_each_ptr_backwards(e, ptr) - if (test_bit(ptr->dev, op->failed.d)) - bch2_extent_drop_ptr(e, ptr); - - memmove(bkey_next(k), n, (void *) keys->top - (void *) n); - keys->top_p -= (u64 *) n - (u64 *) bkey_next(k); - - ret = bch2_extent_nr_ptrs(e.c) - ? bch2_check_mark_super(c, e.c, BCH_DATA_USER) - : -EIO; - if (ret) { - keys->top = keys->keys; - op->error = ret; - op->flags |= BCH_WRITE_DONE; - break; - } - } - - memset(&op->failed, 0, sizeof(op->failed)); - - bch2_write_index(cl); - return; -} - static void bch2_write_endio(struct bio *bio) { struct closure *cl = bio->bi_private; @@ -324,10 +317,8 @@ static void bch2_write_endio(struct bio *bio) bch2_latency_acct(ca, wbio->submit_time_us, WRITE); - if (bch2_dev_io_err_on(bio->bi_error, ca, "data write")) { + if (bch2_dev_io_err_on(bio->bi_status, ca, "data write")) set_bit(ca->dev_idx, op->failed.d); - set_closure_fn(cl, bch2_write_io_error, index_update_wq(op)); - } if (wbio->have_io_ref) percpu_ref_put(&ca->io_ref); @@ -706,11 +697,6 @@ do_write: key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); - ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write), - BCH_DATA_USER); - if (ret) - goto err; - dst->bi_end_io = bch2_write_endio; dst->bi_private = &op->cl; bio_set_op_attrs(dst, REQ_OP_WRITE, 0); @@ -870,7 +856,8 @@ void bch2_write(struct closure *cl) !percpu_ref_tryget(&c->writes)) { __bcache_io_error(c, "read only"); op->error = -EROFS; - bch2_disk_reservation_put(c, &op->res); + if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) + bch2_disk_reservation_put(c, &op->res); closure_return(cl); } @@ -916,7 +903,10 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) swap(bio->bi_vcnt, rbio->bio.bi_vcnt); rbio->promote = NULL; - __bch2_write_op_init(&op->write.op, c); + bch2_write_op_init(&op->write.op, c); + op->write.op.csum_type = bch2_data_checksum_type(c, rbio->opts.data_checksum); + op->write.op.compression_type = + bch2_compression_opt_to_type(rbio->opts.compression); op->write.move_dev = -1; op->write.op.devs = c->fastest_devs; @@ -1060,7 +1050,7 @@ static void bch2_rbio_retry(struct work_struct *work) if (rbio->split) rbio = bch2_rbio_free(rbio); else - rbio->bio.bi_error = 0; + rbio->bio.bi_status = 0; if (!(flags & BCH_READ_NODECODE)) flags |= BCH_READ_MUST_CLONE; @@ -1073,7 +1063,8 @@ static void bch2_rbio_retry(struct work_struct *work) __bch2_read(c, rbio, iter, inode, &avoid, flags); } -static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error) +static void 
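
The reworked loop at the top of bch2_write_index() above folds the old bch2_write_io_error() path into the common case: each key is copied forward over the keylist while pointers to devices that failed the write are dropped, and a key left with no pointers turns into -EIO for the whole op. Iterating the pointers backwards matters, since dropping a pointer shifts the remainder down and a forward walk would skip one. The same pass as a self-contained sketch (the function name is illustrative):

/* Sketch of the filter-and-compact pass in bch2_write_index() above. */
static int drop_failed_ptrs(struct bch_write_op *op, struct keylist *keys)
{
	struct bkey_i *src, *n, *dst = keys->keys;

	for (src = keys->keys; src != keys->top; src = n) {
		struct bkey_s_extent e;
		struct bch_extent_ptr *ptr;

		n = bkey_next(src);
		bkey_copy(dst, src);		/* compact surviving keys */

		e = bkey_i_to_s_extent(dst);
		extent_for_each_ptr_backwards(e, ptr)	/* backwards: see above */
			if (test_bit(ptr->dev, op->failed.d))
				bch2_extent_drop_ptr(e, ptr);

		if (!bch2_extent_nr_ptrs(e.c))
			return -EIO;		/* every replica failed */

		dst = bkey_next(dst);
	}

	keys->top = dst;
	return 0;
}
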
bch2_rbio_error(struct bch_read_bio *rbio, int retry, + blk_status_t error) { rbio->retry = retry; @@ -1081,7 +1072,7 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error) return; if (retry == READ_ERR) { - bch2_rbio_parent(rbio)->bio.bi_error = error; + bch2_rbio_parent(rbio)->bio.bi_status = error; bch2_rbio_done(rbio); } else { bch2_rbio_punt(rbio, bch2_rbio_retry, @@ -1236,7 +1227,7 @@ csum_err: */ if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { rbio->flags |= BCH_READ_MUST_BOUNCE; - bch2_rbio_error(rbio, READ_RETRY, -EIO); + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); return; } @@ -1245,13 +1236,13 @@ csum_err: rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, csum.hi, csum.lo, crc.csum_type); - bch2_rbio_error(rbio, READ_RETRY_AVOID, -EIO); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); return; decompression_err: __bcache_io_error(c, "decompression error, inode %llu offset %llu", rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector); - bch2_rbio_error(rbio, READ_ERR, -EIO); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); return; } @@ -1270,8 +1261,8 @@ static void bch2_read_endio(struct bio *bio) if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - if (bch2_dev_io_err_on(bio->bi_error, rbio->pick.ca, "data read")) { - bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_error); + if (bch2_dev_io_err_on(bio->bi_status, rbio->pick.ca, "data read")) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); return; } @@ -1281,9 +1272,9 @@ static void bch2_read_endio(struct bio *bio) atomic_long_inc(&c->read_realloc_races); if (rbio->flags & BCH_READ_RETRY_IF_STALE) - bch2_rbio_error(rbio, READ_RETRY, -EINTR); + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); else - bch2_rbio_error(rbio, READ_ERR, -EINTR); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); return; } @@ -1360,7 +1351,8 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, rbio = rbio_init(bio_alloc_bioset(GFP_NOIO, DIV_ROUND_UP(sectors, PAGE_SECTORS), - &c->bio_read_split)); + &c->bio_read_split), + orig->opts); bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); split = true; @@ -1374,7 +1366,8 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, * lose the error) */ rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO, - &c->bio_read_split)); + &c->bio_read_split), + orig->opts); rbio->bio.bi_iter = iter; split = true; } else { @@ -1428,6 +1421,8 @@ noclone: bch2_read_endio(&rbio->bio); ret = rbio->retry; + if (rbio->split) + rbio = bch2_rbio_free(rbio); if (!ret) bch2_rbio_done(rbio); } @@ -1503,7 +1498,7 @@ err: * possibly bigger than the memory that was * originally allocated) */ - rbio->bio.bi_error = -EINTR; + rbio->bio.bi_status = BLK_STS_AGAIN; bio_endio(&rbio->bio); return; } @@ -1561,6 +1556,7 @@ retry: case READ_RETRY: goto retry; case READ_ERR: + rbio->bio.bi_status = BLK_STS_IOERR; bio_endio(&rbio->bio); return; }; diff --git a/libbcachefs/io.h b/libbcachefs/io.h index bd0d7c43..0c145eb6 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -21,6 +21,8 @@ void bch2_latency_acct(struct bch_dev *, unsigned, int); void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *); +#define BLK_STS_REMOVED ((__force blk_status_t)128) + enum bch_write_flags { BCH_WRITE_ALLOC_NOWAIT = (1 << 0), BCH_WRITE_CACHED = (1 << 1), @@ -29,11 +31,12 @@ enum bch_write_flags { BCH_WRITE_PAGES_STABLE = (1 << 4), BCH_WRITE_PAGES_OWNED = (1 
<< 5), BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), + BCH_WRITE_NOPUT_RESERVATION = (1 << 7), /* Internal: */ - BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 7), - BCH_WRITE_DONE = (1 << 8), - BCH_WRITE_LOOPED = (1 << 9), + BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 8), + BCH_WRITE_DONE = (1 << 9), + BCH_WRITE_LOOPED = (1 << 10), }; static inline u64 *op_journal_seq(struct bch_write_op *op) @@ -42,6 +45,12 @@ static inline u64 *op_journal_seq(struct bch_write_op *op) ? op->journal_seq_p : &op->journal_seq; } +static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq) +{ + op->journal_seq_p = journal_seq; + op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; +} + static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { return op->alloc_reserve == RESERVE_MOVINGGC @@ -51,14 +60,14 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) int bch2_write_index_default(struct bch_write_op *); -static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c) +static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c) { op->c = c; op->io_wq = index_update_wq(op); op->flags = 0; op->written = 0; op->error = 0; - op->csum_type = bch2_data_checksum_type(c); + op->csum_type = bch2_data_checksum_type(c, c->opts.data_checksum); op->compression_type = bch2_compression_opt_to_type(c->opts.compression); op->nr_replicas = 0; @@ -75,27 +84,6 @@ static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs * op->index_update_fn = bch2_write_index_default; } -static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, - struct disk_reservation res, - struct bch_devs_mask *devs, - struct write_point_specifier write_point, - struct bpos pos, - u64 *journal_seq, unsigned flags) -{ - __bch2_write_op_init(op, c); - op->flags = flags; - op->nr_replicas = res.nr_replicas; - op->pos = pos; - op->res = res; - op->devs = devs; - op->write_point = write_point; - - if (journal_seq) { - op->journal_seq_p = journal_seq; - op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; - } -} - void bch2_write(struct closure *); static inline struct bch_write_bio *wbio_init(struct bio *bio) @@ -134,25 +122,27 @@ static inline void bch2_read_extent(struct bch_fs *c, struct extent_pick_ptr *pick, unsigned flags) { - rbio->_state = 0; __bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags); } static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) { - rbio->_state = 0; + BUG_ON(rbio->_state); __bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL, BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE| BCH_READ_USER_MAPPED); } -static inline struct bch_read_bio *rbio_init(struct bio *bio) +static inline struct bch_read_bio *rbio_init(struct bio *bio, + struct bch_io_opts opts) { struct bch_read_bio *rbio = to_rbio(bio); - rbio->_state = 0; + rbio->_state = 0; + rbio->promote = NULL; + rbio->opts = opts; return rbio; } diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index ed9a4bbe..ff18fdc9 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -6,6 +6,7 @@ #include "buckets_types.h" #include "extents_types.h" #include "keylist_types.h" +#include "opts.h" #include "super_types.h" #include @@ -56,6 +57,8 @@ struct bch_read_bio { struct promote_op *promote; + struct bch_io_opts opts; + struct work_struct work; struct bio bio; @@ -69,8 +72,7 @@ struct bch_write_bio { struct closure *cl; }; - u8 ptr_idx; - u8 replicas_failed; + struct bch_devs_list failed; u8 order; unsigned split:1, @@ -90,8 
diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h
index ed9a4bbe..ff18fdc9 100644
--- a/libbcachefs/io_types.h
+++ b/libbcachefs/io_types.h
@@ -6,6 +6,7 @@
 #include "buckets_types.h"
 #include "extents_types.h"
 #include "keylist_types.h"
+#include "opts.h"
 #include "super_types.h"
 
 #include
@@ -56,6 +57,8 @@ struct bch_read_bio {
 
 	struct promote_op	*promote;
 
+	struct bch_io_opts	opts;
+
 	struct work_struct	work;
 
 	struct bio		bio;
@@ -69,8 +72,7 @@ struct bch_write_bio {
 		struct closure		*cl;
 	};
 
-	u8			ptr_idx;
-	u8			replicas_failed;
+	struct bch_devs_list	failed;
 	u8			order;
 
 	unsigned		split:1,
@@ -90,8 +92,8 @@ struct bch_write_op {
 	struct bch_fs		*c;
 	struct workqueue_struct	*io_wq;
+	unsigned		written; /* sectors */
 	u16			flags;
-	u16			written; /* sectors */
 	s8			error;
 
 	unsigned		csum_type:4;
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 5d9a298d..b4e149ac 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -338,8 +338,8 @@ struct journal_list {
  * Given a journal entry we just read, add it to the list of journal entries to
  * be replayed:
  */
-static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
-			     struct jset *j)
+static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
+			     struct journal_list *jlist, struct jset *j)
 {
 	struct journal_replay *i, *pos;
 	struct list_head *where;
@@ -347,8 +347,6 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
 	__le64 last_seq;
 	int ret;
 
-	mutex_lock(&jlist->lock);
-
 	last_seq = !list_empty(jlist->head)
 		? list_last_entry(jlist->head, struct journal_replay,
 				  list)->j.last_seq
@@ -376,9 +374,7 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
 			     memcmp(j, &i->j, bytes), c,
 			     "found duplicate but non identical journal entries (seq %llu)",
 			     le64_to_cpu(j->seq));
-
-		ret = JOURNAL_ENTRY_ADD_OK;
-		goto out;
+		goto found;
 	}
 
 	if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
@@ -395,12 +391,16 @@ add:
 		goto out;
 	}
 
-	memcpy(&i->j, j, bytes);
 	list_add(&i->list, where);
+	i->devs.nr = 0;
+	memcpy(&i->j, j, bytes);
+found:
+	if (!fsck_err_on(bch2_dev_list_has_dev(i->devs, ca->dev_idx),
+			 c, "duplicate journal entries on same device"))
+		bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
 	ret = JOURNAL_ENTRY_ADD_OK;
 out:
 fsck_err:
-	mutex_unlock(&jlist->lock);
 	return ret;
 }
 
@@ -496,8 +496,8 @@ fsck_err:
 #define journal_entry_err_on(cond, c, msg, ...)				\
 	((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
 
-static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
-				    int write)
+static int journal_entry_validate_entries(struct bch_fs *c, struct jset *j,
+					  int write)
 {
 	struct jset_entry *entry;
 	int ret = 0;
@@ -508,7 +508,7 @@ static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
 		if (journal_entry_err_on(vstruct_next(entry) >
 					 vstruct_last(j), c,
 				"journal entry extends past end of jset")) {
-			j->u64s = cpu_to_le64((u64 *) entry - j->_data);
+			j->u64s = cpu_to_le32((u64 *) entry - j->_data);
 			break;
 		}
 
@@ -614,7 +614,7 @@ static int journal_entry_validate(struct bch_fs *c,
 			"invalid journal entry: last_seq > seq"))
 		j->last_seq = j->seq;
 
-	return __journal_entry_validate(c, j, write);
+	return 0;
 fsck_err:
 	return ret;
 }
@@ -722,7 +722,10 @@ reread:		sectors_read = min_t(unsigned,
 
 		ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
 
-		ret = journal_entry_add(c, jlist, j);
+		mutex_lock(&jlist->lock);
+		ret = journal_entry_add(c, ca, jlist, j);
+		mutex_unlock(&jlist->lock);
+
 		switch (ret) {
 		case JOURNAL_ENTRY_ADD_OK:
 			*entries_found = true;
@@ -916,7 +919,9 @@ static int journal_seq_blacklist_read(struct journal *j,
 
 	for_each_jset_entry_type(entry, &i->j,
 			JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) {
-		seq = le64_to_cpu(entry->_data[0]);
+		struct jset_entry_blacklist *bl_entry =
+			container_of(entry, struct jset_entry_blacklist, entry);
+		seq = le64_to_cpu(bl_entry->seq);
 
 		bch_verbose(c, "blacklisting existing journal seq %llu", seq);
 
@@ -982,6 +987,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 	fsck_err_on(c->sb.clean && journal_has_keys(list), c,
 		    "filesystem marked clean but journal has keys to replay");
 
+	list_for_each_entry(i, list, list) {
+		ret = journal_entry_validate_entries(c, &i->j, READ);
+		if (ret)
+			goto fsck_err;
+	}
+
 	i = list_last_entry(list, struct journal_replay, list);
 
 	unfixable_fsck_err_on(le64_to_cpu(i->j.seq) -
@@ -1002,6 +1013,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 		INIT_LIST_HEAD(&p->list);
 		INIT_LIST_HEAD(&p->flushed);
 		atomic_set(&p->count, 0);
+		p->devs.nr = 0;
 	}
 
 	mutex_lock(&j->blacklist_lock);
@@ -1010,6 +1022,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 		p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
 
 		atomic_set(&p->count, 1);
+		p->devs = i->devs;
 
 		if (journal_seq_blacklist_read(j, i, p)) {
 			mutex_unlock(&j->blacklist_lock);
@@ -1090,7 +1103,7 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
 {
 	struct journal_buf *w = journal_prev_buf(j);
 
-	atomic_dec_bug(&journal_seq_pin(j, w->data->seq)->count);
+	atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count);
 
 	if (!need_write_just_set &&
 	    test_bit(JOURNAL_NEED_WRITE, &j->flags))
@@ -1122,6 +1135,7 @@ static void __journal_entry_new(struct journal *j, int count)
 	INIT_LIST_HEAD(&p->list);
 	INIT_LIST_HEAD(&p->flushed);
 	atomic_set(&p->count, count);
+	p->devs.nr = 0;
 }
 
 static void __bch2_journal_next_entry(struct journal *j)
@@ -1851,6 +1865,21 @@ void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
 			   bch2_journal_error(j));
 }
 
+int bch2_journal_flush_all_pins(struct journal *j)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	bool flush;
+
+	bch2_journal_flush_pins(j, U64_MAX);
+
+	spin_lock(&j->lock);
+	flush = last_seq(j) != j->last_seq_ondisk ||
+		c->btree_roots_dirty;
+	spin_unlock(&j->lock);
+
+	return flush ? bch2_journal_meta(j) : 0;
+}
+
 static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
 {
 	bool ret;
@@ -2002,7 +2031,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
 	 * i.e. whichever device was limiting the current journal entry size.
 	 */
 	extent_for_each_ptr_backwards(e, ptr) {
-		ca = c->devs[ptr->dev];
+		ca = bch_dev_bkey_exists(c, ptr->dev);
 
 		if (ca->mi.state != BCH_MEMBER_STATE_RW ||
 		    ca->journal.sectors_free <= sectors)
@@ -2197,7 +2226,7 @@ static void journal_write_endio(struct bio *bio)
 	struct bch_dev *ca = bio->bi_private;
 	struct journal *j = &ca->fs->journal;
 
-	if (bch2_dev_io_err_on(bio->bi_error, ca, "journal write") ||
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") ||
 	    bch2_meta_write_fault("journal")) {
 		/* Was this a flush or an actual journal write? */
 		if (ca->journal.ptr_idx != U8_MAX) {
@@ -2233,6 +2262,7 @@ static void journal_write(struct closure *cl)
 		if (r->alive)
 			bch2_journal_add_btree_root(w, i, &r->key, r->level);
 	}
+	c->btree_roots_dirty = false;
 	mutex_unlock(&c->btree_root_lock);
 
 	journal_write_compact(jset);
@@ -2246,7 +2276,7 @@ static void journal_write(struct closure *cl)
 	SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
 
 	if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
-	    __journal_entry_validate(c, jset, WRITE))
+	    journal_entry_validate_entries(c, jset, WRITE))
 		goto err;
 
 	bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
@@ -2257,7 +2287,7 @@ static void journal_write(struct closure *cl)
 		    journal_nonce(jset), jset);
 
 	if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
-	    __journal_entry_validate(c, jset, WRITE))
+	    journal_entry_validate_entries(c, jset, WRITE))
 		goto err;
 
 	sectors = vstruct_sectors(jset, c->block_bits);
@@ -2277,6 +2307,9 @@ static void journal_write(struct closure *cl)
 				 BCH_DATA_JOURNAL))
 		goto err;
 
+	journal_seq_pin(j, le64_to_cpu(jset->seq))->devs =
+		bch2_extent_devs(bkey_i_to_s_c_extent(&j->key));
+
 	/*
	 * XXX: we really should just disable the entire journal in nochanges
	 * mode
@@ -2285,7 +2318,7 @@ static void journal_write(struct closure *cl)
 		goto no_io;
 
 	extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) {
-		ca = c->devs[ptr->dev];
+		ca = bch_dev_bkey_exists(c, ptr->dev);
 		if (!percpu_ref_tryget(&ca->io_ref)) {
 			/* XXX: fix this */
 			bch_err(c, "missing device for journal write\n");
@@ -2693,6 +2726,46 @@ int bch2_journal_flush(struct journal *j)
 	return bch2_journal_flush_seq(j, seq);
 }
 
+int bch2_journal_flush_device(struct journal *j, unsigned dev_idx)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct journal_entry_pin_list *p;
+	struct bch_devs_list devs;
+	u64 seq = 0;
+	unsigned iter;
+	int ret = 0;
+
+	spin_lock(&j->lock);
+	fifo_for_each_entry_ptr(p, &j->pin, iter)
+		if (bch2_dev_list_has_dev(p->devs, dev_idx))
+			seq = journal_pin_seq(j, p);
+	spin_unlock(&j->lock);
+
+	bch2_journal_flush_pins(j, seq);
+
+	mutex_lock(&c->replicas_gc_lock);
+	bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
+
+	seq = 0;
+
+	spin_lock(&j->lock);
+	while (!ret && seq < atomic64_read(&j->seq)) {
+		seq = max(seq, last_seq(j));
+		devs = journal_seq_pin(j, seq)->devs;
+		seq++;
+
+		spin_unlock(&j->lock);
+		ret = bch2_check_mark_super_devlist(c, &devs, BCH_DATA_JOURNAL);
+		spin_lock(&j->lock);
+	}
+	spin_unlock(&j->lock);
+
+	bch2_replicas_gc_end(c, ret);
+	mutex_unlock(&c->replicas_gc_lock);
+
+	return ret;
+}
+
 ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
@@ -2862,9 +2935,7 @@ void bch2_fs_journal_stop(struct journal *j)
 	 * journal entries, then force a brand new empty journal entry to be
 	 * written:
 	 */
-	bch2_journal_flush_pins(j, U64_MAX);
-	bch2_journal_flush_async(j, NULL);
-	bch2_journal_meta(j);
+	bch2_journal_flush_all_pins(j);
 
 	cancel_delayed_work_sync(&j->write_work);
 	cancel_delayed_work_sync(&j->reclaim_work);
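journal_entry_add() and journal_write() now record which devices hold each journal entry in a struct bch_devs_list, which is what bch2_journal_flush_device() walks above. The list helpers themselves are not part of this diff; a sketch of the assumed semantics (a small fixed-size array of device indices):

	static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
						 unsigned dev)
	{
		unsigned i;

		for (i = 0; i < devs.nr; i++)
			if (devs.devs[i] == dev)
				return true;
		return false;
	}

	static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
						 unsigned dev)
	{
		BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs));
		devs->devs[devs->nr++] = dev;
	}

Resetting a list is just devs.nr = 0, which is why the pin-list initialization above clears only the counter.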
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 9d6c79c6..5f3ece08 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -118,6 +118,8 @@
  */
 struct journal_replay {
 	struct list_head	list;
+	struct bch_devs_list	devs;
+
 	/* must be last: */
 	struct jset		j;
 };
@@ -164,6 +166,7 @@ void bch2_journal_pin_add_if_older(struct journal *,
 				   struct journal_entry_pin *,
 				   journal_pin_flush_fn);
 void bch2_journal_flush_pins(struct journal *, u64);
+int bch2_journal_flush_all_pins(struct journal *);
 
 struct closure;
 struct bch_fs;
@@ -356,6 +359,7 @@ void bch2_journal_meta_async(struct journal *, struct closure *);
 int bch2_journal_flush_seq(struct journal *, u64);
 int bch2_journal_flush(struct journal *);
 int bch2_journal_meta(struct journal *);
+int bch2_journal_flush_device(struct journal *, unsigned);
 
 void bch2_journal_halt(struct journal *);
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 55b41c56..87f378a6 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -34,6 +34,7 @@ struct journal_entry_pin_list {
 	struct list_head	list;
 	struct list_head	flushed;
 	atomic_t		count;
+	struct bch_devs_list	devs;
 };
 
 struct journal;
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 8d1c0ee0..e11ee953 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -27,23 +27,9 @@ static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
 
 #define MAX_DATA_OFF_ITER	10
 
-/*
- * This moves only the data off, leaving the meta-data (if any) in place.
- * It walks the key space, and for any key with a valid pointer to the
- * relevant device, it copies it elsewhere, updating the key to point to
- * the copy.
- * The meta-data is moved off by bch_move_meta_data_off_device.
- *
- * Note: If the number of data replicas desired is > 1, ideally, any
- * new copies would not be made in the same device that already have a
- * copy (if there are enough devices).
- * This is _not_ currently implemented. The multiple replicas can
- * land in the same device even if there are others available.
- */
-
-int bch2_move_data_off_device(struct bch_dev *ca)
+static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca,
+				    int flags)
 {
-	struct bch_fs *c = ca->fs;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	u64 keys_moved, sectors_moved;
@@ -113,10 +99,6 @@ int bch2_move_data_off_device(struct bch_dev *ca)
 	return ret;
 }
 
-/*
- * This walks the btree, and for any node on the relevant device it moves the
- * node elsewhere.
- */
 static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
 			       enum btree_id id)
 {
@@ -200,9 +182,9 @@ static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
  * is written.
 */
 
-int bch2_move_metadata_off_device(struct bch_dev *ca)
+static int bch2_dev_metadata_migrate(struct bch_fs *c, struct bch_dev *ca,
+				     int flags)
 {
-	struct bch_fs *c = ca->fs;
 	unsigned i;
 	int ret = 0;
 
@@ -240,37 +222,31 @@ err:
 	return ret;
 }
 
-/*
- * Flagging data bad when forcibly removing a device after failing to
- * migrate the data off the device.
- */
-
-static int bch2_flag_key_bad(struct btree_iter *iter,
-			     struct bch_dev *ca,
-			     struct bkey_s_c_extent orig)
+int bch2_dev_data_migrate(struct bch_fs *c, struct bch_dev *ca, int flags)
 {
-	BKEY_PADDED(key) tmp;
-	struct bkey_s_extent e;
-	struct bch_extent_ptr *ptr;
-	struct bch_fs *c = ca->fs;
+	return bch2_dev_usrdata_migrate(c, ca, flags) ?:
+		bch2_dev_metadata_migrate(c, ca, flags);
+}
 
-	bkey_reassemble(&tmp.key, orig.s_c);
-	e = bkey_i_to_s_extent(&tmp.key);
+static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
+			 unsigned dev_idx, int flags, bool metadata)
+{
+	struct bch_extent_ptr *ptr;
+	unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
+	unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
+	unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
+	unsigned nr_good;
 
 	extent_for_each_ptr_backwards(e, ptr)
-		if (ptr->dev == ca->dev_idx)
+		if (ptr->dev == dev_idx)
 			bch2_extent_drop_ptr(e, ptr);
 
-	/*
-	 * If the new extent no longer has any pointers, bch2_extent_normalize()
-	 * will do the appropriate thing with it (turning it into a
-	 * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
-	 */
-	bch2_extent_normalize(c, e.s);
+	nr_good = bch2_extent_nr_good_ptrs(c, e.c);
+	if ((!nr_good && !(flags & lost)) ||
+	    (nr_good < replicas && !(flags & degraded)))
+		return -EINVAL;
 
-	return bch2_btree_insert_at(c, NULL, NULL, NULL,
-				    BTREE_INSERT_ATOMIC,
-				    BTREE_INSERT_ENTRY(iter, &tmp.key));
+	return 0;
 }
 
 /*
@@ -284,11 +260,11 @@ static int bch2_flag_key_bad(struct btree_iter *iter,
  * that we've already tried to move the data MAX_DATA_OFF_ITER times and
 * are not likely to succeed if we try again.
 */
-int bch2_flag_data_bad(struct bch_dev *ca)
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 {
-	struct bch_fs *c = ca->fs;
 	struct bkey_s_c k;
-	struct bkey_s_c_extent e;
+	struct bkey_s_extent e;
+	BKEY_PADDED(key) tmp;
 	struct btree_iter iter;
 	int ret = 0;
 
@@ -303,11 +279,33 @@ int bch2_flag_data_bad(struct bch_dev *ca)
 		if (!bkey_extent_is_data(k.k))
 			goto advance;
 
-		e = bkey_s_c_to_extent(k);
-		if (!bch2_extent_has_device(e, ca->dev_idx))
+		if (!bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx))
 			goto advance;
 
-		ret = bch2_flag_key_bad(&iter, ca, e);
+		bkey_reassemble(&tmp.key, k);
+		e = bkey_i_to_s_extent(&tmp.key);
+
+		ret = drop_dev_ptrs(c, e, dev_idx, flags, false);
+		if (ret)
+			break;
+
+		/*
+		 * If the new extent no longer has any pointers, bch2_extent_normalize()
+		 * will do the appropriate thing with it (turning it into a
+		 * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
+		 */
+		bch2_extent_normalize(c, e.s);
+
+		if (bkey_extent_is_data(e.k) &&
+		    (ret = bch2_check_mark_super(c, e.c, BCH_DATA_USER)))
+			break;
+
+		iter.pos = bkey_start_pos(&tmp.key.k);
+
+		ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
+					   BTREE_INSERT_ATOMIC|
+					   BTREE_INSERT_NOFAIL,
+					   BTREE_INSERT_ENTRY(&iter, &tmp.key));
 
 		/*
		 * don't want to leave ret == -EINTR, since if we raced and
@@ -319,26 +317,6 @@ int bch2_flag_data_bad(struct bch_dev *ca)
 		if (ret)
 			break;
 
-		/*
-		 * If the replica we're dropping was dirty and there is an
-		 * additional cached replica, the cached replica will now be
-		 * considered dirty - upon inserting the new version of the key,
-		 * the bucket accounting will be updated to reflect the fact
-		 * that the cached data is now dirty and everything works out as
-		 * if by magic without us having to do anything.
-		 *
-		 * The one thing we need to be concerned with here is there's a
-		 * race between when we drop any stale pointers from the key
-		 * we're about to insert, and when the key actually gets
-		 * inserted and the cached data is marked as dirty - we could
-		 * end up trying to insert a key with a pointer that should be
-		 * dirty, but points to stale data.
-		 *
-		 * If that happens the insert code just bails out and doesn't do
-		 * the insert - however, it doesn't return an error. Hence we
-		 * need to always recheck the current key before advancing to
-		 * the next:
-		 */
 		continue;
advance:
 		if (bkey_extent_is_data(k.k)) {
@@ -357,3 +335,80 @@ advance:
 
 	return ret;
 }
+
+static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+	struct btree_iter iter;
+	struct closure cl;
+	struct btree *b;
+	unsigned id;
+	int ret;
+
+	/* don't handle this yet: */
+	if (flags & BCH_FORCE_IF_METADATA_LOST)
+		return -EINVAL;
+
+	closure_init_stack(&cl);
+
+	mutex_lock(&c->replicas_gc_lock);
+	bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
+
+	for (id = 0; id < BTREE_ID_NR; id++) {
+		for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
+			__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+			struct bkey_i_extent *new_key;
+retry:
+			if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key),
+						    dev_idx)) {
+				bch2_btree_iter_set_locks_want(&iter, 0);
+
+				ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key),
+							    BCH_DATA_BTREE);
+				if (ret)
+					goto err;
+			} else {
+				bkey_copy(&tmp.k, &b->key);
+				new_key = bkey_i_to_extent(&tmp.k);
+
+				ret = drop_dev_ptrs(c, extent_i_to_s(new_key),
+						    dev_idx, flags, true);
+				if (ret)
+					goto err;
+
+				if (!bch2_btree_iter_set_locks_want(&iter, U8_MAX)) {
+					b = bch2_btree_iter_peek_node(&iter);
+					goto retry;
+				}
+
+				ret = bch2_btree_node_update_key(c, &iter, b, new_key);
+				if (ret == -EINTR) {
+					b = bch2_btree_iter_peek_node(&iter);
+					goto retry;
+				}
+				if (ret)
+					goto err;
+			}
+		}
+		bch2_btree_iter_unlock(&iter);
+
+		/* btree root */
+		mutex_lock(&c->btree_root_lock);
+		mutex_unlock(&c->btree_root_lock);
+	}
+
+	ret = 0;
+out:
+	bch2_replicas_gc_end(c, ret);
+	mutex_unlock(&c->replicas_gc_lock);
+
+	return ret;
+err:
+	bch2_btree_iter_unlock(&iter);
+	goto out;
+}
+
+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+	return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
+		bch2_dev_metadata_drop(c, dev_idx, flags);
+}
diff --git a/libbcachefs/migrate.h b/libbcachefs/migrate.h
index 9bdaa792..6db7b911 100644
--- a/libbcachefs/migrate.h
+++ b/libbcachefs/migrate.h
@@ -1,8 +1,7 @@
 #ifndef _BCACHEFS_MIGRATE_H
 #define _BCACHEFS_MIGRATE_H
 
-int bch2_move_data_off_device(struct bch_dev *);
-int bch2_move_metadata_off_device(struct bch_dev *);
-int bch2_flag_data_bad(struct bch_dev *);
+int bch2_dev_data_migrate(struct bch_fs *, struct bch_dev *, int);
+int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
 
 #endif /* _BCACHEFS_MIGRATE_H */
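drop_dev_ptrs() is where the BCH_FORCE_IF_* policy now lives: with the target device's pointer removed, a key must either still have nr_good >= replicas copies, or the caller must have explicitly allowed degraded data (or, if no copies remain at all, data loss). A sketch of what an ioctl-level caller might pass (flag names are from this patch; the caller itself is hypothetical):

	/* allow the fs to go degraded, but refuse to drop a last replica: */
	int ret = bch2_dev_data_drop(c, ca->dev_idx,
				     BCH_FORCE_IF_DATA_DEGRADED|
				     BCH_FORCE_IF_METADATA_DEGRADED);
	if (ret == -EINVAL)
		bch_err(ca, "some extent would lose its last copy; need FORCE_IF_..._LOST");

Note that metadata loss is rejected outright in bch2_dev_metadata_drop() above, regardless of flags.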
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 5eaf0cf8..8ce63d66 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -3,6 +3,7 @@
 #include "btree_gc.h"
 #include "btree_update.h"
 #include "buckets.h"
+#include "inode.h"
 #include "io.h"
 #include "move.h"
 #include "super-io.h"
@@ -206,7 +207,7 @@ static void move_write(struct closure *cl)
 {
 	struct moving_io *io = container_of(cl, struct moving_io, cl);
 
-	if (likely(!io->rbio.bio.bi_error)) {
+	if (likely(!io->rbio.bio.bi_status)) {
 		bch2_migrate_write_init(&io->write, &io->rbio);
 		closure_call(&io->write.op.cl, bch2_write, NULL, cl);
 	}
@@ -240,6 +241,7 @@ static int bch2_move_extent(struct bch_fs *c,
 			    struct write_point_specifier wp,
 			    int btree_insert_flags,
 			    int move_device,
+			    struct bch_io_opts opts,
 			    struct bkey_s_c k)
 {
 	struct extent_pick_ptr pick;
@@ -276,6 +278,7 @@ static int bch2_move_extent(struct bch_fs *c,
 		goto err;
 	}
 
+	io->rbio.opts = opts;
 	bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
 	bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
 	io->rbio.bio.bi_iter.bi_size = sectors << 9;
@@ -284,9 +287,13 @@ static int bch2_move_extent(struct bch_fs *c,
 	io->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(k.k);
 	io->rbio.bio.bi_end_io		= move_read_endio;
 
-	__bch2_write_op_init(&io->write.op, c);
 	io->write.btree_insert_flags	= btree_insert_flags;
 	io->write.move_dev		= move_device;
+
+	bch2_write_op_init(&io->write.op, c);
+	io->write.op.csum_type = bch2_data_checksum_type(c, opts.data_checksum);
+	io->write.op.compression_type =
+		bch2_compression_opt_to_type(opts.compression);
 	io->write.op.devs	= devs;
 	io->write.op.write_point = wp;
 
@@ -371,9 +378,11 @@ int bch2_move_data(struct bch_fs *c,
 {
 	bool kthread = (current->flags & PF_KTHREAD) != 0;
 	struct moving_context ctxt;
+	struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
 	struct btree_iter iter;
 	BKEY_PADDED(k) tmp;
 	struct bkey_s_c k;
+	u64 cur_inum = U64_MAX;
 	int ret = 0;
 
 	bch2_move_ctxt_init(&ctxt);
@@ -396,7 +405,7 @@ int bch2_move_data(struct bch_fs *c,
 		    (bch2_btree_iter_unlock(&iter),
 		     (ret = bch2_ratelimit_wait_freezable_stoppable(rate))))
 			break;
-
+peek:
 		k = bch2_btree_iter_peek(&iter);
 		if (!k.k)
 			break;
@@ -404,8 +413,23 @@ int bch2_move_data(struct bch_fs *c,
 		if (ret)
 			break;
 
-		if (!bkey_extent_is_data(k.k) ||
-		    !pred(arg, bkey_s_c_to_extent(k)))
+		if (!bkey_extent_is_data(k.k))
+			goto next;
+
+		if (cur_inum != k.k->p.inode) {
+			struct bch_inode_unpacked inode;
+
+			/* don't hold btree locks while looking up inode: */
+			bch2_btree_iter_unlock(&iter);
+
+			opts = bch2_opts_to_inode_opts(c->opts);
+			if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
+				bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode));
+			cur_inum = k.k->p.inode;
+			goto peek;
+		}
+
+		if (!pred(arg, bkey_s_c_to_extent(k)))
 			goto next;
 
 		/* unlock before doing IO: */
@@ -415,7 +439,7 @@ int bch2_move_data(struct bch_fs *c,
 
 		if (bch2_move_extent(c, &ctxt, devs, wp,
 				     btree_insert_flags,
-				     move_device, k)) {
+				     move_device, opts, k)) {
 			/* memory allocation failure, wait for some IO to finish */
 			bch2_move_ctxt_wait_for_io(&ctxt);
 			continue;
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index c9482151..28e40e41 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -76,16 +76,27 @@ void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
 #undef BCH_OPT
 }
 
+bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
+{
+	switch (id) {
+#define BCH_OPT(_name, ...)						\
+	case Opt_##_name:						\
+		return opt_defined(*opts, _name);
+	BCH_OPTS()
+#undef BCH_OPT
+	default:
+		BUG();
+	}
+}
+
 u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
 {
 	switch (id) {
 #define BCH_OPT(_name, ...)						\
 	case Opt_##_name:						\
-		return opts->_name;					\
-
+		return opts->_name;
 	BCH_OPTS()
 #undef BCH_OPT
-
 	default:
 		BUG();
 	}
@@ -98,10 +109,8 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
 	case Opt_##_name:						\
 		opt_set(*opts, _name, v);				\
 		break;
-
 	BCH_OPTS()
 #undef BCH_OPT
-
 	default:
 		BUG();
 	}
@@ -118,7 +127,6 @@ struct bch_opts bch2_opts_from_sb(struct bch_sb *sb)
 #define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default)		\
 	if (_sb_opt != NO_SB_OPT)					\
 		opt_set(opts, _name, _sb_opt(sb));
-
 	BCH_OPTS()
 #undef BCH_OPT
 
@@ -145,7 +153,7 @@ const struct bch_option bch2_opt_table[] = {
 #undef BCH_OPT
 };
 
-static int bch2_opt_lookup(const char *name)
+int bch2_opt_lookup(const char *name)
 {
 	const struct bch_option *i;
 
@@ -247,3 +255,52 @@ no_val:
 	pr_err("Mount option %s requires a value", name);
 	return -1;
 }
+
+/* io opts: */
+
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
+{
+	struct bch_io_opts ret = { 0 };
+#define BCH_INODE_OPT(_name, _bits)					\
+	if (opt_defined(src, _name))					\
+		opt_set(ret, _name, src._name);
+	BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+	return ret;
+}
+
+struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src)
+{
+	struct bch_opts ret = { 0 };
+#define BCH_INODE_OPT(_name, _bits)					\
+	if (opt_defined(src, _name))					\
+		opt_set(ret, _name, src._name);
+	BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+	return ret;
+}
+
+void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src)
+{
+#define BCH_INODE_OPT(_name, _bits)					\
+	if (opt_defined(src, _name))					\
+		opt_set(*dst, _name, src._name);
+	BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+}
+
+bool bch2_opt_is_inode_opt(enum bch_opt_id id)
+{
+	static const enum bch_opt_id inode_opt_list[] = {
+#define BCH_INODE_OPT(_name, _bits)	Opt_##_name,
+	BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+	};
+	unsigned i;
+
+	for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++)
+		if (inode_opt_list[i] == id)
+			return true;
+
+	return false;
+}
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 33e3a2c8..126056e6 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -181,10 +181,7 @@ do {									\
 
 static inline struct bch_opts bch2_opts_empty(void)
 {
-	struct bch_opts opts;
-
-	memset(&opts, 0, sizeof(opts));
-	return opts;
+	return (struct bch_opts) { 0 };
 }
 
 void bch2_opts_apply(struct bch_opts *, struct bch_opts);
@@ -215,12 +212,35 @@ struct bch_option {
 
 extern const struct bch_option bch2_opt_table[];
 
+bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
 u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
 void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
 
 struct bch_opts bch2_opts_from_sb(struct bch_sb *);
 
+int bch2_opt_lookup(const char *);
 int bch2_opt_parse(const struct bch_option *, const char *, u64 *);
 int bch2_parse_mount_opts(struct bch_opts *, char *);
 
+/* inode opts: */
+
+#define BCH_INODE_OPTS()						\
+	BCH_INODE_OPT(data_checksum,			8)		\
+	BCH_INODE_OPT(compression,			8)
+
+struct bch_io_opts {
+#define BCH_INODE_OPT(_name, _bits)	unsigned _name##_defined:1;
+	BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+
+#define BCH_INODE_OPT(_name, _bits)	u##_bits _name;
+	BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+};
+
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
+struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts);
+void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts);
+bool bch2_opt_is_inode_opt(enum bch_opt_id);
+
 #endif /* _BCACHEFS_OPTS_H */
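The new bch_io_opts machinery lets per-inode settings override filesystem-wide ones field by field: start from bch2_opts_to_inode_opts(c->opts), then layer the inode's own options on top with bch2_io_opts_apply(). This is exactly the dance bch2_move_data() does above; as a standalone sketch (bch2_inode_opts_get() is assumed to return the inode's bch_io_opts, as in the move.c hunk):

	struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
	struct bch_inode_unpacked inode;

	if (!bch2_inode_find_by_inum(c, inum, &inode))
		bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode));

	/* opts.data_checksum / opts.compression now reflect inode overrides */

Because opt_defined() is tracked per field, an inode that sets only compression leaves the filesystem's checksum choice untouched.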
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index a3ecfb92..3f55c244 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -12,6 +12,8 @@
 #include
 
 static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
+					    struct bch_replicas_cpu *);
 static const char *bch2_sb_validate_replicas(struct bch_sb *);
 
 static inline void __bch2_sb_layout_size_assert(void)
@@ -157,7 +159,7 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
 		return NULL;
 
 	f = __bch2_sb_field_resize(sb->sb, f, u64s);
-	f->type = type;
+	f->type = cpu_to_le32(type);
 	return f;
 }
 
@@ -188,7 +190,7 @@ struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *c,
 	}
 
 	f = __bch2_sb_field_resize(c->disk_sb, f, u64s);
-	f->type = type;
+	f->type = cpu_to_le32(type);
 	return f;
 }
 
@@ -354,7 +356,16 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
 	if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
 	    BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
-		return "Invalid number of metadata replicas";
+		return "Invalid number of data replicas";
+
+	if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
+		return "Invalid metadata checksum type";
+
+	if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
+		return "Invalid data checksum type";
+
+	if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR)
+		return "Invalid compression type";
 
 	if (!BCH_SB_BTREE_NODE_SIZE(sb))
 		return "Btree node size not set";
@@ -507,7 +518,7 @@ static void __copy_super(struct bch_sb *dst, struct bch_sb *src)
 		if (src_f->type == BCH_SB_FIELD_journal)
 			continue;
 
-		dst_f = bch2_sb_field_get(dst, src_f->type);
+		dst_f = bch2_sb_field_get(dst, le32_to_cpu(src_f->type));
 		dst_f = __bch2_sb_field_resize(dst, dst_f,
 				le32_to_cpu(src_f->u64s));
 
@@ -601,7 +612,7 @@ reread:
 	/* XXX: verify MACs */
 	csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
-			    (struct nonce) { 0 }, sb->sb);
+			    null_nonce(), sb->sb);
 
 	if (bch2_crc_cmp(csum, sb->sb->csum))
 		return "bad checksum reading superblock";
@@ -688,9 +699,9 @@ const char *bch2_read_super(const char *path,
got_super:
 	pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
 		 le64_to_cpu(ret->sb->version),
-		 le64_to_cpu(ret->sb->flags),
+		 le64_to_cpu(ret->sb->flags[0]),
 		 le64_to_cpu(ret->sb->seq),
-		 le16_to_cpu(ret->sb->u64s));
+		 le32_to_cpu(ret->sb->u64s));
 
 	err = "Superblock block size smaller than device block size";
 	if (le16_to_cpu(ret->sb->block_size) << 9 <
@@ -711,7 +722,7 @@ static void write_super_endio(struct bio *bio)
 
 	/* XXX: return errors directly */
 
-	if (bch2_dev_io_err_on(bio->bi_error, ca, "superblock write"))
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write"))
 		ca->sb_write_error = 1;
 
 	closure_put(&ca->fs->sb_write);
@@ -727,7 +738,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
 	SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
 
 	sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
-				(struct nonce) { 0 }, sb);
+				null_nonce(), sb);
 
 	bio_reset(bio);
 	bio->bi_bdev		= ca->disk_sb.bdev;
@@ -830,7 +841,12 @@ out:
 	bch2_sb_update(c);
 }
 
-/* replica information: */
+/* Replicas tracking - in memory: */
+
+#define for_each_cpu_replicas_entry(_r, _i)				\
+	for (_i = (_r)->entries;					\
+	     (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
+	     _i = (void *) (_i) + (_r)->entry_size)
 
 static inline struct bch_replicas_cpu_entry *
 cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
@@ -838,6 +854,11 @@ cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
 	return (void *) r->entries + r->entry_size * i;
 }
 
+static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
+{
+	eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
+}
+
 static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
 				     unsigned dev)
 {
@@ -856,6 +877,246 @@ static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
 		offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
 }
 
+static unsigned bkey_to_replicas(struct bkey_s_c_extent e,
+				 enum bch_data_type data_type,
+				 struct bch_replicas_cpu_entry *r,
+				 unsigned *max_dev)
+{
+	const struct bch_extent_ptr *ptr;
+	unsigned nr = 0;
+
+	BUG_ON(!data_type ||
+	       data_type == BCH_DATA_SB ||
+	       data_type >= BCH_DATA_NR);
+
+	memset(r, 0, sizeof(*r));
+	r->data_type = data_type;
+
+	*max_dev = 0;
+
+	extent_for_each_ptr(e, ptr)
+		if (!ptr->cached) {
+			*max_dev = max_t(unsigned, *max_dev, ptr->dev);
+			replicas_set_dev(r, ptr->dev);
+			nr++;
+		}
+	return nr;
+}
+
+static struct bch_replicas_cpu *
+cpu_replicas_add_entry(struct bch_replicas_cpu *old,
+		       struct bch_replicas_cpu_entry new_entry,
+		       unsigned max_dev)
+{
+	struct bch_replicas_cpu *new;
+	unsigned i, nr, entry_size;
+
+	entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
+		DIV_ROUND_UP(max_dev + 1, 8);
+	entry_size = max(entry_size, old->entry_size);
+	nr = old->nr + 1;
+
+	new = kzalloc(sizeof(struct bch_replicas_cpu) +
+		      nr * entry_size, GFP_NOIO);
+	if (!new)
+		return NULL;
+
+	new->nr		= nr;
+	new->entry_size	= entry_size;
+
+	for (i = 0; i < old->nr; i++)
+		memcpy(cpu_replicas_entry(new, i),
+		       cpu_replicas_entry(old, i),
+		       min(new->entry_size, old->entry_size));
+
+	memcpy(cpu_replicas_entry(new, old->nr),
+	       &new_entry,
+	       new->entry_size);
+
+	bch2_cpu_replicas_sort(new);
+	return new;
+}
+
+static bool replicas_has_entry(struct bch_replicas_cpu *r,
+			       struct bch_replicas_cpu_entry search,
+			       unsigned max_dev)
+{
+	return max_dev < replicas_dev_slots(r) &&
+		eytzinger0_find(r->entries, r->nr,
+				r->entry_size,
+				memcmp, &search) < r->nr;
+}
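A bch_replicas_cpu_entry is a data-type byte followed by a device bitmap, sized by entry_size; that is why bkey_to_replicas() has to report max_dev, since adding a pointer to a high-numbered device can force every entry to grow. The bit helpers' bodies aren't shown in this hunk's context; presumably the usual byte/bit split:

	static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
					     unsigned dev)
	{
		return e->devs[dev >> 3] & (1 << (dev & 7));
	}

	static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
					    unsigned dev)
	{
		e->devs[dev >> 3] |= 1 << (dev & 7);
	}

Fixed-size entries ordered by memcmp are what allow replicas_has_entry() above to use eytzinger0_find() without any per-entry parsing.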
+
+noinline
+static int bch2_check_mark_super_slowpath(struct bch_fs *c,
+				struct bch_replicas_cpu_entry new_entry,
+				unsigned max_dev)
+{
+	struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r;
+	int ret = -ENOMEM;
+
+	mutex_lock(&c->sb_lock);
+
+	old_gc = rcu_dereference_protected(c->replicas_gc,
+					   lockdep_is_held(&c->sb_lock));
+	if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
+		new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
+		if (!new_gc)
+			goto err;
+	}
+
+	old_r = rcu_dereference_protected(c->replicas,
+					  lockdep_is_held(&c->sb_lock));
+	/* recheck, might have raced */
+	if (replicas_has_entry(old_r, new_entry, max_dev))
+		goto out;
+
+	new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
+	if (!new_r)
+		goto err;
+
+	ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
+	if (ret)
+		goto err;
+
+	if (new_gc) {
+		rcu_assign_pointer(c->replicas_gc, new_gc);
+		kfree_rcu(old_gc, rcu);
+	}
+
+	rcu_assign_pointer(c->replicas, new_r);
+	kfree_rcu(old_r, rcu);
+
+	bch2_write_super(c);
+out:
+	ret = 0;
+err:
+	mutex_unlock(&c->sb_lock);
+	return ret;
+}
+
+static inline int __bch2_check_mark_super(struct bch_fs *c,
+				struct bch_replicas_cpu_entry search,
+				unsigned max_dev)
+{
+	struct bch_replicas_cpu *r, *gc_r;
+	bool marked;
+
+	rcu_read_lock();
+	r = rcu_dereference(c->replicas);
+	gc_r = rcu_dereference(c->replicas_gc);
+	marked = replicas_has_entry(r, search, max_dev) &&
+		(!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
+	rcu_read_unlock();
+
+	return likely(marked) ? 0
+		: bch2_check_mark_super_slowpath(c, search, max_dev);
+}
+
+int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
+			  enum bch_data_type data_type)
+{
+	struct bch_replicas_cpu_entry search;
+	unsigned max_dev;
+
+	if (!bkey_to_replicas(e, data_type, &search, &max_dev))
+		return 0;
+
+	return __bch2_check_mark_super(c, search, max_dev);
+}
+
+int bch2_check_mark_super_devlist(struct bch_fs *c,
+				  struct bch_devs_list *devs,
+				  enum bch_data_type data_type)
+{
+	struct bch_replicas_cpu_entry search = { .data_type = data_type };
+	unsigned i, max_dev = 0;
+
+	if (!devs->nr)
+		return 0;
+
+	for (i = 0; i < devs->nr; i++) {
+		max_dev = max_t(unsigned, max_dev, devs->devs[i]);
+		replicas_set_dev(&search, devs->devs[i]);
+	}
+
+	return __bch2_check_mark_super(c, search, max_dev);
+}
+
+int bch2_replicas_gc_end(struct bch_fs *c, int err)
+{
+	struct bch_replicas_cpu *new_r, *old_r;
+	int ret = 0;
+
+	lockdep_assert_held(&c->replicas_gc_lock);
+
+	mutex_lock(&c->sb_lock);
+
+	new_r = rcu_dereference_protected(c->replicas_gc,
+					  lockdep_is_held(&c->sb_lock));
+
+	if (err) {
+		rcu_assign_pointer(c->replicas_gc, NULL);
+		kfree_rcu(new_r, rcu);
+		goto err;
+	}
+
+	if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
+		ret = -ENOSPC;
+		goto err;
+	}
+
+	old_r = rcu_dereference_protected(c->replicas,
+					  lockdep_is_held(&c->sb_lock));
+
+	rcu_assign_pointer(c->replicas, new_r);
+	rcu_assign_pointer(c->replicas_gc, NULL);
+	kfree_rcu(old_r, rcu);
+
+	bch2_write_super(c);
+err:
+	mutex_unlock(&c->sb_lock);
+	return ret;
+}
+
+int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
+{
+	struct bch_replicas_cpu *dst, *src;
+	struct bch_replicas_cpu_entry *e;
+
+	lockdep_assert_held(&c->replicas_gc_lock);
+
+	mutex_lock(&c->sb_lock);
+	BUG_ON(c->replicas_gc);
+
+	src = rcu_dereference_protected(c->replicas,
+					lockdep_is_held(&c->sb_lock));
+
+	dst = kzalloc(sizeof(struct bch_replicas_cpu) +
+		      src->nr * src->entry_size, GFP_NOIO);
+	if (!dst) {
+		mutex_unlock(&c->sb_lock);
+		return -ENOMEM;
+	}
+
+	dst->nr		= 0;
+	dst->entry_size	= src->entry_size;
+
+	for_each_cpu_replicas_entry(src, e)
+		if (!((1 << e->data_type) & typemask))
+			memcpy(cpu_replicas_entry(dst, dst->nr++),
+			       e, dst->entry_size);
+
+	bch2_cpu_replicas_sort(dst);
+
+	rcu_assign_pointer(c->replicas_gc, dst);
+	mutex_unlock(&c->sb_lock);
+
+	return 0;
+}
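bch2_replicas_gc_start() snapshots the table minus the types being walked, markers re-add entries to both the live and gc copies, and bch2_replicas_gc_end() either promotes the gc copy or discards it on error. The caller-side bracket, as used by the journal and migrate code in this patch (the walk itself is a hypothetical placeholder):

	mutex_lock(&c->replicas_gc_lock);
	bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);

	ret = walk_and_mark_btree_ptrs(c);	/* hypothetical: calls bch2_check_mark_super() */

	bch2_replicas_gc_end(c, ret);		/* nonzero ret throws the gc set away */
	mutex_unlock(&c->replicas_gc_lock);

Note that bch2_replicas_gc_end() is called unconditionally, matching the error path in bch2_dev_metadata_drop() above.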
+
+/* Replicas tracking - superblock: */
+
 static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
 					unsigned *nr,
 					unsigned *bytes,
@@ -914,10 +1175,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
 		}
 	}
 
-	eytzinger0_sort(cpu_r->entries,
-			cpu_r->nr,
-			cpu_r->entry_size,
-			memcmp, NULL);
+	bch2_cpu_replicas_sort(cpu_r);
 	return cpu_r;
 }
 
@@ -926,14 +1184,12 @@ static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
 	struct bch_sb_field_replicas *sb_r;
 	struct bch_replicas_cpu *cpu_r, *old_r;
 
-	lockdep_assert_held(&c->sb_lock);
-
 	sb_r	= bch2_sb_get_replicas(c->disk_sb);
 	cpu_r	= __bch2_sb_replicas_to_cpu_replicas(sb_r);
 	if (!cpu_r)
 		return -ENOMEM;
 
-	old_r = c->replicas;
+	old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
 	rcu_assign_pointer(c->replicas, cpu_r);
 	if (old_r)
 		kfree_rcu(old_r, rcu);
@@ -941,192 +1197,133 @@ static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
 	return 0;
 }
 
-static void bkey_to_replicas(struct bkey_s_c_extent e,
-			     enum bch_data_type data_type,
-			     struct bch_replicas_cpu_entry *r,
-			     unsigned *max_dev)
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
+					    struct bch_replicas_cpu *r)
 {
-	const struct bch_extent_ptr *ptr;
+	struct bch_sb_field_replicas *sb_r;
+	struct bch_replicas_entry *sb_e;
+	struct bch_replicas_cpu_entry *e;
+	size_t i, bytes;
 
-	BUG_ON(!data_type ||
-	       data_type == BCH_DATA_SB ||
-	       data_type >= BCH_DATA_NR);
+	bytes = sizeof(struct bch_sb_field_replicas);
 
-	memset(r, 0, sizeof(*r));
-	r->data_type = data_type;
+	for_each_cpu_replicas_entry(r, e) {
+		bytes += sizeof(struct bch_replicas_entry);
+		for (i = 0; i < r->entry_size - 1; i++)
+			bytes += hweight8(e->devs[i]);
+	}
 
-	*max_dev = 0;
+	sb_r = bch2_fs_sb_resize_replicas(c,
+			DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
+	if (!sb_r)
+		return -ENOSPC;
 
-	extent_for_each_ptr(e, ptr)
-		if (!ptr->cached) {
-			*max_dev = max_t(unsigned, *max_dev, ptr->dev);
-			replicas_set_dev(r, ptr->dev);
-		}
-}
+	memset(&sb_r->entries, 0,
+	       vstruct_end(&sb_r->field) -
+	       (void *) &sb_r->entries);
 
-/*
- * for when gc of replica information is in progress:
- */
-static int bch2_update_gc_replicas(struct bch_fs *c,
-				   struct bch_replicas_cpu *gc_r,
-				   struct bkey_s_c_extent e,
-				   enum bch_data_type data_type)
-{
-	struct bch_replicas_cpu_entry new_e;
-	struct bch_replicas_cpu *new;
-	unsigned i, nr, entry_size, max_dev;
+	sb_e = sb_r->entries;
+	for_each_cpu_replicas_entry(r, e) {
+		sb_e->data_type = e->data_type;
 
-	bkey_to_replicas(e, data_type, &new_e, &max_dev);
+		for (i = 0; i < replicas_dev_slots(r); i++)
+			if (replicas_test_dev(e, i))
+				sb_e->devs[sb_e->nr++] = i;
 
-	entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
-		DIV_ROUND_UP(max_dev + 1, 8);
-	entry_size = max(entry_size, gc_r->entry_size);
-	nr = gc_r->nr + 1;
+		sb_e = replicas_entry_next(sb_e);
 
-	new = kzalloc(sizeof(struct bch_replicas_cpu) +
-		      nr * entry_size, GFP_NOIO);
-	if (!new)
-		return -ENOMEM;
+		BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
+	}
 
-	new->nr		= nr;
-	new->entry_size	= entry_size;
-
-	for (i = 0; i < gc_r->nr; i++)
-		memcpy(cpu_replicas_entry(new, i),
-		       cpu_replicas_entry(gc_r, i),
-		       gc_r->entry_size);
-
-	memcpy(cpu_replicas_entry(new, nr - 1),
-	       &new_e,
-	       new->entry_size);
-
-	eytzinger0_sort(new->entries,
-			new->nr,
-			new->entry_size,
-			memcmp, NULL);
-
-	rcu_assign_pointer(c->replicas_gc, new);
-	kfree_rcu(gc_r, rcu);
 	return 0;
 }
 
-static bool replicas_has_extent(struct bch_replicas_cpu *r,
-				struct bkey_s_c_extent e,
-				enum bch_data_type data_type)
+static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
 {
-	struct bch_replicas_cpu_entry search;
-	unsigned max_dev;
+	struct bch_sb_field_members *mi;
+	struct bch_sb_field_replicas *sb_r;
+	struct bch_replicas_cpu *cpu_r = NULL;
+	struct bch_replicas_entry *e;
+	const char *err;
+	unsigned i;
 
-	bkey_to_replicas(e, data_type, &search, &max_dev);
+	mi = bch2_sb_get_members(sb);
+	sb_r = bch2_sb_get_replicas(sb);
+	if (!sb_r)
+		return NULL;
 
-	return max_dev < replicas_dev_slots(r) &&
-		eytzinger0_find(r->entries, r->nr,
-				r->entry_size,
-				memcmp, &search) < r->nr;
+	for_each_replicas_entry(sb_r, e) {
+		err = "invalid replicas entry: invalid data type";
+		if (e->data_type >= BCH_DATA_NR)
+			goto err;
+
+		err = "invalid replicas entry: no devices";
+		if (!e->nr)
+			goto err;
+
+		err = "invalid replicas entry: too many devices";
+		if (e->nr >= BCH_REPLICAS_MAX)
+			goto err;
+
+		err = "invalid replicas entry: invalid device";
+		for (i = 0; i < e->nr; i++)
+			if (!bch2_dev_exists(sb, mi, e->devs[i]))
+				goto err;
+	}
+
+	err = "cannot allocate memory";
+	cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
+	if (!cpu_r)
+		goto err;
+
+	sort_cmp_size(cpu_r->entries,
+		      cpu_r->nr,
+		      cpu_r->entry_size,
+		      memcmp, NULL);
+
+	for (i = 0; i + 1 < cpu_r->nr; i++) {
+		struct bch_replicas_cpu_entry *l =
+			cpu_replicas_entry(cpu_r, i);
+		struct bch_replicas_cpu_entry *r =
+			cpu_replicas_entry(cpu_r, i + 1);
+
+		BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
+
+		err = "duplicate replicas entry";
+		if (!memcmp(l, r, cpu_r->entry_size))
+			goto err;
+	}
+
+	err = NULL;
err:
+	kfree(cpu_r);
+	return err;
 }
 
+/* Query replicas: */
+
 bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
 			  enum bch_data_type data_type)
 {
+	struct bch_replicas_cpu_entry search;
+	unsigned max_dev;
 	bool ret;
 
+	if (!bkey_to_replicas(e, data_type, &search, &max_dev))
+		return true;
+
 	rcu_read_lock();
-	ret = replicas_has_extent(rcu_dereference(c->replicas),
-				  e, data_type);
+	ret = replicas_has_entry(rcu_dereference(c->replicas),
+				 search, max_dev);
 	rcu_read_unlock();
 
 	return ret;
 }
 
-noinline
-static int bch2_check_mark_super_slowpath(struct bch_fs *c,
-					  struct bkey_s_c_extent e,
-					  enum bch_data_type data_type)
-{
-	struct bch_replicas_cpu *gc_r;
-	const struct bch_extent_ptr *ptr;
-	struct bch_sb_field_replicas *sb_r;
-	struct bch_replicas_entry *new_entry;
-	unsigned new_entry_bytes, new_u64s, nr, bytes, max_dev;
-	int ret = 0;
-
-	mutex_lock(&c->sb_lock);
-
-	gc_r = rcu_dereference_protected(c->replicas_gc,
-					 lockdep_is_held(&c->sb_lock));
-	if (gc_r &&
-	    !replicas_has_extent(gc_r, e, data_type)) {
-		ret = bch2_update_gc_replicas(c, gc_r, e, data_type);
-		if (ret)
-			goto err;
-	}
-
-	/* recheck, might have raced */
-	if (bch2_sb_has_replicas(c, e, data_type)) {
-		mutex_unlock(&c->sb_lock);
-		return 0;
-	}
-
-	new_entry_bytes = sizeof(struct bch_replicas_entry) +
-		bch2_extent_nr_dirty_ptrs(e.s_c);
-
-	sb_r = bch2_sb_get_replicas(c->disk_sb);
-
-	bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
-
-	new_u64s = DIV_ROUND_UP(bytes + new_entry_bytes, sizeof(u64));
-
-	sb_r = bch2_fs_sb_resize_replicas(c,
-			DIV_ROUND_UP(sizeof(*sb_r) + bytes + new_entry_bytes,
-				     sizeof(u64)));
-	if (!sb_r) {
-		ret = -ENOSPC;
-		goto err;
-	}
-
-	new_entry = (void *) sb_r + bytes;
-	new_entry->data_type = data_type;
-	new_entry->nr = 0;
-
-	extent_for_each_ptr(e, ptr)
-		if (!ptr->cached)
-			new_entry->devs[new_entry->nr++] = ptr->dev;
-
-	ret = bch2_sb_replicas_to_cpu_replicas(c);
-	if (ret) {
-		memset(new_entry, 0,
-		       vstruct_end(&sb_r->field) - (void *) new_entry);
-		goto err;
-	}
-
-	bch2_write_super(c);
-err:
-	mutex_unlock(&c->sb_lock);
-	return ret;
-}
-
-int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
-			  enum bch_data_type data_type)
-{
-	struct bch_replicas_cpu *gc_r;
-	bool marked;
-
-	rcu_read_lock();
-	marked = replicas_has_extent(rcu_dereference(c->replicas),
-				     e, data_type) &&
-		(!(gc_r = rcu_dereference(c->replicas_gc)) ||
-		 replicas_has_extent(gc_r, e, data_type));
-	rcu_read_unlock();
-
-	if (marked)
-		return 0;
-
-	return bch2_check_mark_super_slowpath(c, e, data_type);
-}
-
 struct replicas_status __bch2_replicas_status(struct bch_fs *c,
-					struct bch_devs_mask online_devs)
+					      struct bch_devs_mask online_devs)
 {
+	struct bch_sb_field_members *mi;
 	struct bch_replicas_cpu_entry *e;
 	struct bch_replicas_cpu *r;
 	unsigned i, dev, dev_slots, nr_online, nr_offline;
@@ -1137,14 +1334,15 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c,
 	for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
 		ret.replicas[i].nr_online = UINT_MAX;
 
+	mi = bch2_sb_get_members(c->disk_sb);
 	rcu_read_lock();
+
 	r = rcu_dereference(c->replicas);
-	dev_slots = min_t(unsigned, replicas_dev_slots(r), c->sb.nr_devices);
+	dev_slots = replicas_dev_slots(r);
 
-	for (i = 0; i < r->nr; i++) {
-		e = cpu_replicas_entry(r, i);
-
-		BUG_ON(e->data_type >= ARRAY_SIZE(ret.replicas));
+	for_each_cpu_replicas_entry(r, e) {
+		if (e->data_type >= ARRAY_SIZE(ret.replicas))
+			panic("e %p data_type %u\n", e, e->data_type);
 
 		nr_online = nr_offline = 0;
 
@@ -1152,6 +1350,8 @@
 			if (!replicas_test_dev(e, dev))
 				continue;
 
+			BUG_ON(!bch2_dev_exists(c->disk_sb, mi, dev));
+
 			if (test_bit(dev, online_devs.d))
 				nr_online++;
 			else
@@ -1216,7 +1416,7 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
 {
 	struct bch_replicas_cpu_entry *e;
 	struct bch_replicas_cpu *r;
-	unsigned i, ret = 0;
+	unsigned ret = 0;
 
 	rcu_read_lock();
 	r = rcu_dereference(c->replicas);
@@ -1224,191 +1424,13 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
 	if (ca->dev_idx >= replicas_dev_slots(r))
 		goto out;
 
-	for (i = 0; i < r->nr; i++) {
-		e = cpu_replicas_entry(r, i);
-
+	for_each_cpu_replicas_entry(r, e)
 		if (replicas_test_dev(e, ca->dev_idx)) {
 			ret |= 1 << e->data_type;
 			break;
 		}
-	}
out:
 	rcu_read_unlock();
 
 	return ret;
 }
-
-static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
-{
-	struct bch_sb_field_members *mi;
-	struct bch_sb_field_replicas *sb_r;
-	struct bch_replicas_cpu *cpu_r = NULL;
-	struct bch_replicas_entry *e;
-	const char *err;
-	unsigned i;
-
-	mi = bch2_sb_get_members(sb);
-	sb_r = bch2_sb_get_replicas(sb);
-	if (!sb_r)
-		return NULL;
-
-	for_each_replicas_entry(sb_r, e) {
-		err = "invalid replicas entry: invalid data type";
-		if (e->data_type >= BCH_DATA_NR)
-			goto err;
-
-		err = "invalid replicas entry: too many devices";
-		if (e->nr >= BCH_REPLICAS_MAX)
-			goto err;
-
-		err = "invalid replicas entry: invalid device";
-		for (i = 0; i < e->nr; i++)
-			if (!bch2_dev_exists(sb, mi, e->devs[i]))
-				goto err;
-	}
-
-	err = "cannot allocate memory";
-	cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
-	if (!cpu_r)
-		goto err;
-
-	sort_cmp_size(cpu_r->entries,
-		      cpu_r->nr,
-		      cpu_r->entry_size,
-		      memcmp, NULL);
-
-	for (i = 0; i + 1 < cpu_r->nr; i++) {
-		struct bch_replicas_cpu_entry *l =
-			cpu_replicas_entry(cpu_r, i);
-		struct bch_replicas_cpu_entry *r =
-			cpu_replicas_entry(cpu_r, i + 1);
-
-		BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
-
-		err = "duplicate replicas entry";
-		if (!memcmp(l, r, cpu_r->entry_size))
-			goto err;
-	}
-
-	err = NULL;
-err:
-	kfree(cpu_r);
-	return err;
-}
-
-int bch2_replicas_gc_end(struct bch_fs *c, int err)
-{
-	struct bch_sb_field_replicas *sb_r;
-	struct bch_replicas_cpu *r, *old_r;
-	struct bch_replicas_entry *dst_e;
-	size_t i, j, bytes, dev_slots;
-	int ret = 0;
-
-	lockdep_assert_held(&c->replicas_gc_lock);
-
-	mutex_lock(&c->sb_lock);
-
-	r = rcu_dereference_protected(c->replicas_gc,
-				      lockdep_is_held(&c->sb_lock));
-
-	if (err) {
-		rcu_assign_pointer(c->replicas_gc, NULL);
-		kfree_rcu(r, rcu);
-		goto err;
-	}
-
-	dev_slots = replicas_dev_slots(r);
-
-	bytes = sizeof(struct bch_sb_field_replicas);
-
-	for (i = 0; i < r->nr; i++) {
-		struct bch_replicas_cpu_entry *e =
-			cpu_replicas_entry(r, i);
-
-		bytes += sizeof(struct bch_replicas_entry);
-		for (j = 0; j < r->entry_size - 1; j++)
-			bytes += hweight8(e->devs[j]);
-	}
-
-	sb_r = bch2_fs_sb_resize_replicas(c,
-			DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
-	if (!sb_r) {
-		ret = -ENOSPC;
-		goto err;
-	}
-
-	memset(&sb_r->entries, 0,
-	       vstruct_end(&sb_r->field) -
-	       (void *) &sb_r->entries);
-
-	dst_e = sb_r->entries;
-	for (i = 0; i < r->nr; i++) {
-		struct bch_replicas_cpu_entry *src_e =
-			cpu_replicas_entry(r, i);
-
-		dst_e->data_type = src_e->data_type;
-
-		for (j = 0; j < dev_slots; j++)
-			if (replicas_test_dev(src_e, j))
-				dst_e->devs[dst_e->nr++] = j;
-
-		dst_e = replicas_entry_next(dst_e);
-	}
-
-	old_r = rcu_dereference_protected(c->replicas,
-					  lockdep_is_held(&c->sb_lock));
-	rcu_assign_pointer(c->replicas, r);
-	rcu_assign_pointer(c->replicas_gc, NULL);
-	kfree_rcu(old_r, rcu);
-
-	bch2_write_super(c);
-err:
-	mutex_unlock(&c->sb_lock);
-	return ret;
-}
-
-int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
-{
-	struct bch_replicas_cpu *r, *src;
-	unsigned i;
-
-	lockdep_assert_held(&c->replicas_gc_lock);
-
-	mutex_lock(&c->sb_lock);
-	BUG_ON(c->replicas_gc);
-
-	src = rcu_dereference_protected(c->replicas,
-					lockdep_is_held(&c->sb_lock));
-
-	r = kzalloc(sizeof(struct bch_replicas_cpu) +
-		    src->nr * src->entry_size, GFP_NOIO);
-	if (!r) {
-		mutex_unlock(&c->sb_lock);
-		return -ENOMEM;
-	}
-
-	r->entry_size = src->entry_size;
-	r->nr = 0;
-
-	for (i = 0; i < src->nr; i++) {
-		struct bch_replicas_cpu_entry *dst_e =
-			cpu_replicas_entry(r, r->nr);
-		struct bch_replicas_cpu_entry *src_e =
-			cpu_replicas_entry(src, i);
-
-		if (!(src_e->data_type & typemask)) {
-			memcpy(dst_e, src_e, r->entry_size);
-			r->nr++;
-		}
-	}
-
-	eytzinger0_sort(r->entries,
-			r->nr,
-			r->entry_size,
-			memcmp, NULL);
-
-	rcu_assign_pointer(c->replicas_gc, r);
-	mutex_unlock(&c->sb_lock);
-
-	return 0;
-}
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index 8cafb301..4096efb2 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -125,23 +125,12 @@ void bch2_write_super(struct bch_fs *);
 
 /* replicas: */
 
-/* iterate over bch_sb_field_replicas: */
-
-static inline struct bch_replicas_entry *
-replicas_entry_next(struct bch_replicas_entry *i)
-{
-	return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
-}
-
-#define for_each_replicas_entry(_r, _i)					\
-	for (_i = (_r)->entries;					\
-	     (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
-	     (_i) = replicas_entry_next(_i))
-
 bool bch2_sb_has_replicas(struct bch_fs *, struct bkey_s_c_extent,
 			  enum bch_data_type);
 int bch2_check_mark_super(struct bch_fs *, struct bkey_s_c_extent,
 			  enum bch_data_type);
+int bch2_check_mark_super_devlist(struct bch_fs *, struct bch_devs_list *,
+				  enum bch_data_type);
 
 struct replicas_status {
 	struct {
@@ -161,4 +150,17 @@ unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
 int bch2_replicas_gc_end(struct bch_fs *, int);
 int bch2_replicas_gc_start(struct bch_fs *, unsigned);
 
+/* iterate over superblock replicas - used by userspace tools: */
+
+static inline struct bch_replicas_entry *
+replicas_entry_next(struct bch_replicas_entry *i)
+{
+	return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
+}
+
+#define for_each_replicas_entry(_r, _i)					\
+	for (_i = (_r)->entries;					\
+	     (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
+	     (_i) = replicas_entry_next(_i))
+
 #endif /* _BCACHEFS_SUPER_IO_H */
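Moving replicas_entry_next() and for_each_replicas_entry() to the bottom of the header keeps them exported for the userspace tools, which want to walk the raw superblock field; entries are variable length, hence the iterator. A sketch of such a walk (hypothetical userspace diagnostic, using only names from this patch):

	struct bch_sb_field_replicas *r = bch2_sb_get_replicas(sb);
	struct bch_replicas_entry *e;
	unsigned i;

	if (r)
		for_each_replicas_entry(r, e) {
			printf("type %u:", e->data_type);
			for (i = 0; i < e->nr; i++)
				printf(" dev %u", e->devs[i]);
			printf("\n");
		}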
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 4e8b0a51..60a2d83e 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -140,8 +140,9 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
 	return c;
 }
 
-int bch2_congested(struct bch_fs *c, int bdi_bits)
+int bch2_congested(void *data, int bdi_bits)
 {
+	struct bch_fs *c = data;
 	struct backing_dev_info *bdi;
 	struct bch_dev *ca;
 	unsigned i;
@@ -178,13 +179,6 @@ int bch2_congested(struct bch_fs *c, int bdi_bits)
 	return ret;
 }
 
-static int bch2_congested_fn(void *data, int bdi_bits)
-{
-	struct bch_fs *c = data;
-
-	return bch2_congested(c, bdi_bits);
-}
-
 /* Filesystem RO/RW: */
 
 /*
@@ -218,7 +212,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
 	 * Flush journal before stopping allocators, because flushing journal
 	 * blacklist entries involves allocating new btree nodes:
 	 */
-	bch2_journal_flush_pins(&c->journal, U64_MAX);
+	bch2_journal_flush_all_pins(&c->journal);
 
 	if (!bch2_journal_error(&c->journal))
 		bch2_btree_verify_flushed(c);
@@ -379,8 +373,6 @@ static void bch2_fs_free(struct bch_fs *c)
 	bch2_io_clock_exit(&c->io_clock[WRITE]);
 	bch2_io_clock_exit(&c->io_clock[READ]);
 	bch2_fs_compress_exit(c);
-	if (c->bdi.bdi_list.next)
-		bdi_destroy(&c->bdi);
 	lg_lock_free(&c->usage_lock);
 	free_percpu(c->usage_percpu);
 	mempool_exit(&c->btree_bounce_pool);
@@ -393,7 +385,7 @@ static void bch2_fs_free(struct bch_fs *c)
 	mempool_exit(&c->btree_reserve_pool);
 	mempool_exit(&c->fill_iter);
 	percpu_ref_exit(&c->writes);
-	kfree(c->replicas);
+	kfree(rcu_dereference_protected(c->replicas, 1));
 
 	if (c->copygc_wq)
 		destroy_workqueue(c->copygc_wq);
@@ -414,7 +406,7 @@ static void bch2_fs_exit(struct bch_fs *c)
 
 	for (i = 0; i < c->sb.nr_devices; i++)
 		if (c->devs[i])
-			bch2_dev_free(c->devs[i]);
+			bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
 
 	closure_debug_destroy(&c->cl);
 	kobject_put(&c->kobj);
@@ -576,10 +568,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 			sizeof(struct btree_update)) ||
 	    mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
 	    bioset_init(&c->btree_read_bio, 1,
-			offsetof(struct btree_read_bio, bio)) ||
-	    bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) ||
-	    bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) ||
-	    bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) ||
+			offsetof(struct btree_read_bio, bio),
+			BIOSET_NEED_BVECS) ||
+	    bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
+			BIOSET_NEED_BVECS) ||
+	    bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+			BIOSET_NEED_BVECS) ||
+	    bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
+			BIOSET_NEED_BVECS) ||
 	    mempool_init_page_pool(&c->bio_bounce_pages,
 				   max_t(unsigned,
 					 c->opts.btree_node_size,
@@ -588,7 +584,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	    !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
 	    lg_lock_init(&c->usage_lock) ||
 	    mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) ||
-	    bdi_setup_and_register(&c->bdi, "bcachefs") ||
 	    bch2_io_clock_init(&c->io_clock[READ]) ||
 	    bch2_io_clock_init(&c->io_clock[WRITE]) ||
 	    bch2_fs_journal_init(&c->journal) ||
@@ -599,10 +594,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	    bch2_fs_fsio_init(c))
 		goto err;
 
-	c->bdi.ra_pages		= VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
-	c->bdi.congested_fn	= bch2_congested_fn;
-	c->bdi.congested_data	= c;
-
 	mi = bch2_sb_get_members(c->disk_sb);
 	for (i = 0; i < c->sb.nr_devices; i++)
 		if (bch2_dev_exists(c->disk_sb, mi, i) &&
@@ -729,8 +720,12 @@ static const char *__bch2_fs_start(struct bch_fs *c)
 			continue;
 
 		err = "error reading btree root";
-		if (bch2_btree_root_read(c, i, k, level))
-			goto err;
+		if (bch2_btree_root_read(c, i, k, level)) {
+			if (i != BTREE_ID_ALLOC)
+				goto err;
+
+			mustfix_fsck_err(c, "error reading btree root");
+		}
 	}
 
 	err = "error reading allocation information";
@@ -830,7 +825,7 @@ static const char *__bch2_fs_start(struct bch_fs *c)
 		closure_sync(&cl);
 
 		bch2_inode_init(c, &inode, 0, 0,
-				S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
+				S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
 		inode.bi_inum = BCACHEFS_ROOT_INO;
 
 		bch2_inode_pack(&packed_inode, &inode);
@@ -877,6 +872,7 @@ out:
 	bch2_journal_entries_free(&journal);
 	return err;
err:
+fsck_err:
 	closure_sync(&cl);
 
 	switch (ret) {
@@ -995,24 +991,20 @@ static void bch2_dev_free(struct bch_dev *ca)
 	kobject_put(&ca->kobj);
 }
 
-static void bch2_dev_io_ref_release(struct percpu_ref *ref)
-{
-	struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
-
-	complete(&ca->offline_complete);
-}
-
 static void __bch2_dev_offline(struct bch_dev *ca)
 {
 	struct bch_fs *c = ca->fs;
 
 	lockdep_assert_held(&c->state_lock);
 
+	if (percpu_ref_is_zero(&ca->io_ref))
+		return;
+
 	__bch2_dev_read_only(c, ca);
 
-	reinit_completion(&ca->offline_complete);
+	reinit_completion(&ca->io_ref_completion);
 	percpu_ref_kill(&ca->io_ref);
-	wait_for_completion(&ca->offline_complete);
+	wait_for_completion(&ca->io_ref_completion);
 
 	if (ca->kobj.state_in_sysfs) {
 		struct kobject *block =
@@ -1026,27 +1018,18 @@ static void __bch2_dev_offline(struct bch_dev *ca)
 	bch2_dev_journal_exit(ca);
 }
 
-static void bch2_dev_ref_release(struct percpu_ref *ref)
+static void bch2_dev_ref_complete(struct percpu_ref *ref)
 {
 	struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
 
-	complete(&ca->stop_complete);
+	complete(&ca->ref_completion);
 }
 
-static void bch2_dev_stop(struct bch_dev *ca)
+static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
 {
-	struct bch_fs *c = ca->fs;
+	struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
 
-	lockdep_assert_held(&c->state_lock);
-
-	BUG_ON(rcu_access_pointer(c->devs[ca->dev_idx]) != ca);
-	rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
-
-	synchronize_rcu();
-
-	reinit_completion(&ca->stop_complete);
-	percpu_ref_kill(&ca->ref);
-	wait_for_completion(&ca->stop_complete);
+	complete(&ca->io_ref_completion);
 }
 
 static int bch2_dev_sysfs_online(struct bch_dev *ca)
@@ -1095,8 +1078,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
 		return -ENOMEM;
 
 	kobject_init(&ca->kobj, &bch2_dev_ktype);
-	init_completion(&ca->stop_complete);
-	init_completion(&ca->offline_complete);
+	init_completion(&ca->ref_completion);
+	init_completion(&ca->io_ref_completion);
 
 	ca->dev_idx = dev_idx;
 	__set_bit(ca->dev_idx, ca->self.d);
@@ -1132,9 +1115,9 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
 	    DIV_ROUND_UP(BTREE_NODE_RESERVE,
 			 ca->mi.bucket_size / c->opts.btree_node_size);
 
-	if (percpu_ref_init(&ca->ref, bch2_dev_ref_release,
+	if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
 			    0, GFP_KERNEL) ||
-	    percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_release,
+	    percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
 			    PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
 	    !init_fifo(&ca->free[RESERVE_BTREE], btree_node_reserve_buckets,
 		       GFP_KERNEL) ||
@@ -1155,7 +1138,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
 			      GFP_KERNEL|__GFP_ZERO)) ||
 	    !(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
 	    bioset_init(&ca->replica_set, 4,
-			offsetof(struct bch_write_bio, bio)) ||
+			offsetof(struct bch_write_bio, bio), 0) ||
 	    !(ca->io_done = alloc_percpu(*ca->io_done)))
 		goto err;
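The rename from *_release to *_complete reflects what the callbacks actually do: they only signal a completion, and the kill/wait pair now lives with whoever is tearing the ref down. io_ref also starts out dead (PERCPU_REF_INIT_DEAD above) until the device is brought online. The resulting lifecycle, schematically (the online-side percpu_ref_reinit() call is an assumption; it is outside the hunks shown here):

	/* offline -> online: */
	percpu_ref_reinit(&ca->io_ref);

	/* online -> offline, as in __bch2_dev_offline() above: */
	reinit_completion(&ca->io_ref_completion);
	percpu_ref_kill(&ca->io_ref);
	wait_for_completion(&ca->io_ref_completion);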
bch_fs *c, struct bch_sb_handle *sb) BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || !c->devs[sb->sb->dev_idx]); - ca = c->devs[sb->sb->dev_idx]; + ca = bch_dev_locked(c, sb->sb->dev_idx); if (ca->disk_sb.bdev) { bch_err(c, "already have device online in slot %u", sb->sb->dev_idx); return -EINVAL; } + BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); + ret = bch2_dev_journal_init(ca, sb->sb); if (ret) return ret; @@ -1222,7 +1205,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb) if (bch2_dev_sysfs_online(ca)) pr_warn("error creating sysfs objects"); - bch2_mark_dev_superblock(c, ca, 0); + bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); if (ca->mi.state == BCH_MEMBER_STATE_RW) bch2_dev_allocator_add(c, ca); @@ -1293,6 +1276,7 @@ static bool bch2_fs_may_start(struct bch_fs *c) { struct replicas_status s; struct bch_sb_field_members *mi; + struct bch_dev *ca; unsigned i, flags = c->opts.degraded ? BCH_FORCE_IF_DEGRADED : 0; @@ -1301,14 +1285,19 @@ static bool bch2_fs_may_start(struct bch_fs *c) mutex_lock(&c->sb_lock); mi = bch2_sb_get_members(c->disk_sb); - for (i = 0; i < c->disk_sb->nr_devices; i++) - if (bch2_dev_exists(c->disk_sb, mi, i) && - !bch2_dev_is_online(c->devs[i]) && - (c->devs[i]->mi.state == BCH_MEMBER_STATE_RW || - c->devs[i]->mi.state == BCH_MEMBER_STATE_RO)) { + for (i = 0; i < c->disk_sb->nr_devices; i++) { + if (!bch2_dev_exists(c->disk_sb, mi, i)) + continue; + + ca = bch_dev_locked(c, i); + + if (!bch2_dev_is_online(ca) && + (ca->mi.state == BCH_MEMBER_STATE_RW || + ca->mi.state == BCH_MEMBER_STATE_RO)) { mutex_unlock(&c->sb_lock); return false; } + } mutex_unlock(&c->sb_lock); } @@ -1419,22 +1408,59 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) * * flag_data_bad() does not check btree pointers */ - ret = bch2_flag_data_bad(ca); + ret = bch2_dev_data_drop(c, ca->dev_idx, flags); if (ret) { - bch_err(ca, "Remove failed"); + bch_err(ca, "Remove failed: error %i dropping data", ret); + goto err; + } + + ret = bch2_journal_flush_device(&c->journal, ca->dev_idx); + if (ret) { + bch_err(ca, "Remove failed: error %i flushing journal", ret); goto err; } data = bch2_dev_has_data(c, ca); if (data) { - bch_err(ca, "Remove failed, still has data (%x)", data); + char data_has_str[100]; + bch2_scnprint_flag_list(data_has_str, + sizeof(data_has_str), + bch2_data_types, + data); + bch_err(ca, "Remove failed, still has data (%s)", data_has_str); + ret = -EBUSY; goto err; } - bch2_journal_meta(&c->journal); + ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC, + POS(ca->dev_idx, 0), + POS(ca->dev_idx + 1, 0), + ZERO_VERSION, + NULL, NULL, NULL); + if (ret) { + bch_err(ca, "Remove failed, error deleting alloc info"); + goto err; + } + + /* + * must flush all existing journal entries, they might have + * (overwritten) keys that point to the device we're removing: + */ + ret = bch2_journal_flush_all_pins(&c->journal); + if (ret) { + bch_err(ca, "Remove failed, journal error"); + goto err; + } __bch2_dev_offline(ca); - bch2_dev_stop(ca); + + mutex_lock(&c->sb_lock); + rcu_assign_pointer(c->devs[ca->dev_idx], NULL); + mutex_unlock(&c->sb_lock); + + percpu_ref_kill(&ca->ref); + wait_for_completion(&ca->ref_completion); + bch2_dev_free(ca); /* @@ -1542,7 +1568,7 @@ have_slot: bch2_write_super(c); mutex_unlock(&c->sb_lock); - ca = c->devs[dev_idx]; + ca = bch_dev_locked(c, dev_idx); if (ca->mi.state == BCH_MEMBER_STATE_RW) { err = "journal alloc failed"; if (bch2_dev_journal_alloc(ca)) @@ -1568,7 +1594,7 @@ err: /* Hot add existing 
device to running filesystem: */ int bch2_dev_online(struct bch_fs *c, const char *path) { - struct bch_sb_handle sb = { 0 }; + struct bch_sb_handle sb = { NULL }; struct bch_dev *ca; unsigned dev_idx; const char *err; @@ -1593,7 +1619,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) } mutex_unlock(&c->sb_lock); - ca = c->devs[dev_idx]; + ca = bch_dev_locked(c, dev_idx); if (ca->mi.state == BCH_MEMBER_STATE_RW) { err = __bch2_dev_read_write(c, ca); if (err) @@ -1619,7 +1645,6 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) return -EINVAL; } - __bch2_dev_read_only(c, ca); __bch2_dev_offline(ca); mutex_unlock(&c->state_lock); @@ -1629,37 +1654,31 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca) { unsigned data; - int ret; + int ret = 0; mutex_lock(&c->state_lock); if (ca->mi.state == BCH_MEMBER_STATE_RW) { bch_err(ca, "Cannot migrate data off RW device"); - mutex_unlock(&c->state_lock); - return -EINVAL; + ret = -EINVAL; + goto err; } - mutex_unlock(&c->state_lock); - - ret = bch2_move_data_off_device(ca); + ret = bch2_dev_data_migrate(c, ca, 0); if (ret) { bch_err(ca, "Error migrating data: %i", ret); - return ret; - } - - ret = bch2_move_metadata_off_device(ca); - if (ret) { - bch_err(ca, "Error migrating metadata: %i", ret); - return ret; + goto err; } data = bch2_dev_has_data(c, ca); if (data) { bch_err(ca, "Migrate error: data still present (%x)", data); - return -EINVAL; + ret = -EINVAL; + goto err; } - - return 0; +err: + mutex_unlock(&c->state_lock); + return ret; } /* Filesystem open: */ diff --git a/libbcachefs/super.h b/libbcachefs/super.h index eb1d2f3d..7ebe5981 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -59,6 +59,14 @@ static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, } } +static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, + unsigned dev) +{ + BUG_ON(bch2_dev_list_has_dev(*devs, dev)); + BUG_ON(devs->nr >= BCH_REPLICAS_MAX); + devs->devs[devs->nr++] = dev; +} + static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, struct bch_devs_mask *mask) { @@ -131,6 +139,26 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, __for_each_online_member(ca, c, iter, \ (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO)) +/* + * If a key exists that references a device, the device won't be going away and + * we can omit rcu_read_lock(): + */ +static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) +{ + EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + + return rcu_dereference_check(c->devs[idx], 1); +} + +static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) +{ + EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + + return rcu_dereference_protected(c->devs[idx], + lockdep_is_held(&c->sb_lock) || + lockdep_is_held(&c->state_lock)); +} + /* XXX kill, move to struct bch_fs */ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) { @@ -146,7 +174,7 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) struct bch_fs *bch2_bdev_to_fs(struct block_device *); struct bch_fs *bch2_uuid_to_fs(uuid_le); -int bch2_congested(struct bch_fs *, int); +int bch2_congested(void *, int); bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, enum bch_member_state, int); diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 35f1e561..3197a2e4 100644 --- a/libbcachefs/sysfs.c +++ 
b/libbcachefs/sysfs.c @@ -739,7 +739,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) c->open_buckets_wait.list.first ? "waiting" : "empty"); } -const char * const bch2_rw[] = { +static const char * const bch2_rw[] = { "read", "write", NULL diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c index 2e29f741..f5007864 100644 --- a/libbcachefs/tier.c +++ b/libbcachefs/tier.c @@ -6,7 +6,6 @@ #include "clock.h" #include "extents.h" #include "io.h" -#include "keylist.h" #include "move.h" #include "super-io.h" #include "tier.h" @@ -28,7 +27,7 @@ static bool tiering_pred(void *arg, struct bkey_s_c_extent e) return false; extent_for_each_ptr(e, ptr) - if (c->devs[ptr->dev]->mi.tier >= tier->idx) + if (bch_dev_bkey_exists(c, ptr->dev)->mi.tier >= tier->idx) replicas++; return replicas < c->opts.data_replicas; diff --git a/libbcachefs/util.h b/libbcachefs/util.h index a251bf9c..6e97e831 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -34,8 +34,12 @@ struct closure; #define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) #define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0) -#define memcpy(_dst, _src, _len) \ +#define memcpy(dst, src, len) \ ({ \ + void *_dst = (dst); \ + const void *_src = (src); \ + size_t _len = (len); \ + \ BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \ (void *) (_dst) + (_len) <= (void *) (_src))); \ memcpy(_dst, _src, _len); \ diff --git a/libbcachefs/vstructs.h b/libbcachefs/vstructs.h index ce2cece0..79566442 100644 --- a/libbcachefs/vstructs.h +++ b/libbcachefs/vstructs.h @@ -9,10 +9,10 @@ */ #define __vstruct_u64s(_s) \ ({ \ - ( type_is((_s)->u64s, u64) ? le64_to_cpu((_s)->u64s) \ - : type_is((_s)->u64s, u32) ? le32_to_cpu((_s)->u64s) \ - : type_is((_s)->u64s, u16) ? le16_to_cpu((_s)->u64s) \ - : ((_s)->u64s)); \ + ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \ + : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \ + : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s) \ + : ((__force u8) ((_s)->u64s))); \ }) #define __vstruct_bytes(_type, _u64s) \ diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index 3a49d728..1d6cbe72 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "bkey_methods.h" #include "btree_update.h" +#include "compress.h" #include "extents.h" #include "fs.h" #include "str_hash.h" @@ -358,6 +359,129 @@ static const struct xattr_handler bch_xattr_security_handler = { .flags = BCH_XATTR_INDEX_SECURITY, }; +#ifndef NO_BCACHEFS_FS + +static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *vinode, + const char *name, void *buffer, size_t size) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_opts opts = + bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); + const struct bch_option *opt; + int ret, id; + u64 v; + + id = bch2_opt_lookup(name); + if (id < 0 || !bch2_opt_is_inode_opt(id)) + return -EINVAL; + + opt = bch2_opt_table + id; + + if (!bch2_opt_defined_by_id(&opts, id)) + return -ENODATA; + + v = bch2_opt_get_by_id(&opts, id); + + if (opt->type == BCH_OPT_STR) + ret = snprintf(buffer, size, "%s", opt->choices[v]); + else + ret = snprintf(buffer, size, "%llu", v); + + return ret <= size || !buffer ? 
ret : -ERANGE; +} + +struct inode_opt_set { + int id; + u64 v; + bool defined; +}; + +static int inode_opt_set_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct inode_opt_set *s = p; + + if (s->defined) + bch2_inode_opt_set(bi, s->id, s->v); + else + bch2_inode_opt_clear(bi, s->id); + return 0; +} + +static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *vinode, + const char *name, const void *value, + size_t size, int flags) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + const struct bch_option *opt; + char *buf; + struct inode_opt_set s; + int ret; + + s.id = bch2_opt_lookup(name); + if (s.id < 0 || !bch2_opt_is_inode_opt(s.id)) + return -EINVAL; + + opt = bch2_opt_table + s.id; + + if (value) { + buf = kmalloc(size + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + memcpy(buf, value, size); + buf[size] = '\0'; + + ret = bch2_opt_parse(opt, buf, &s.v); + kfree(buf); + + if (ret < 0) + return ret; + + if (s.id == Opt_compression) { + mutex_lock(&c->sb_lock); + ret = bch2_check_set_has_compressed_data(c, s.v); + mutex_unlock(&c->sb_lock); + + if (ret) + return ret; + } + + s.defined = true; + } else { + s.defined = false; + } + + mutex_lock(&inode->ei_update_lock); + ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s); + mutex_unlock(&inode->ei_update_lock); + + return ret; +} + +static const struct xattr_handler bch_xattr_bcachefs_handler = { + .prefix = "bcachefs.", + .get = bch2_xattr_bcachefs_get, + .set = bch2_xattr_bcachefs_set, +}; + +#endif /* NO_BCACHEFS_FS */ + +const struct xattr_handler *bch2_xattr_handlers[] = { + &bch_xattr_user_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, + &bch_xattr_trusted_handler, + &bch_xattr_security_handler, +#ifndef NO_BCACHEFS_FS + &bch_xattr_bcachefs_handler, +#endif + NULL +}; + static const struct xattr_handler *bch_xattr_handler_map[] = { [BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler, [BCH_XATTR_INDEX_POSIX_ACL_ACCESS] = @@ -368,15 +492,6 @@ static const struct xattr_handler *bch_xattr_handler_map[] = { [BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, }; -const struct xattr_handler *bch2_xattr_handlers[] = { - &bch_xattr_user_handler, - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, - &bch_xattr_trusted_handler, - &bch_xattr_security_handler, - NULL -}; - static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type) { return type < ARRAY_SIZE(bch_xattr_handler_map) diff --git a/linux/bio.c b/linux/bio.c index f4356699..d8256989 100644 --- a/linux/bio.c +++ b/linux/bio.c @@ -19,7 +19,38 @@ #include #include #include -#include + +static const struct { + int err; + const char *name; +} blk_errors[] = { + [BLK_STS_OK] = { 0, "" }, + [BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" }, + [BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" }, + [BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" }, + [BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" }, + [BLK_STS_TARGET] = { -EREMOTEIO, "critical target" }, + [BLK_STS_NEXUS] = { -EBADE, "critical nexus" }, + [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" }, + [BLK_STS_PROTECTION] = { -EILSEQ, "protection" }, + [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, + [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, + + /* device mapper special case, should not leak out: */ + [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" }, + + /* 
everything else not covered above: */ + [BLK_STS_IOERR] = { -EIO, "I/O" }, +}; + +int blk_status_to_errno(blk_status_t status) +{ + int idx = (__force int)status; + + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) + return -EIO; + return blk_errors[idx].err; +} void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter) @@ -199,8 +230,8 @@ static struct bio *__bio_chain_endio(struct bio *bio) { struct bio *parent = bio->bi_private; - if (!parent->bi_error) - parent->bi_error = bio->bi_error; + if (!parent->bi_status) + parent->bi_status = bio->bi_status; bio_put(bio); return parent; } @@ -233,27 +264,6 @@ again: bio->bi_end_io(bio); } -void bio_endio_nodec(struct bio *bio) -{ - goto nodec; - - while (bio) { - if (unlikely(!bio_remaining_done(bio))) - break; -nodec: - if (bio->bi_end_io == bio_chain_endio) { - struct bio *parent = bio->bi_private; - parent->bi_error = bio->bi_error; - bio_put(bio); - bio = parent; - } else { - if (bio->bi_end_io) - bio->bi_end_io(bio); - bio = NULL; - } - } -} - void bio_reset(struct bio *bio) { unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); diff --git a/linux/blkdev.c b/linux/blkdev.c index ea7db40b..156d5353 100644 --- a/linux/blkdev.c +++ b/linux/blkdev.c @@ -32,7 +32,7 @@ void generic_make_request(struct bio *bio) ret = fdatasync(bio->bi_bdev->bd_fd); if (ret) { fprintf(stderr, "fsync error: %m\n"); - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return; } @@ -106,7 +106,7 @@ int submit_bio_wait(struct bio *bio) submit_bio(bio); wait_for_completion(&done); - return bio->bi_error; + return blk_status_to_errno(bio->bi_status); } int blkdev_issue_discard(struct block_device *bdev, @@ -235,10 +235,8 @@ static int aio_completion_thread(void *arg) for (ev = events; ev < events + ret; ev++) { struct bio *bio = (struct bio *) ev->data; - if (ev->res < 0) - bio->bi_error = ev->res; - else if (ev->res != bio->bi_iter.bi_size) - bio->bi_error = -EIO; + if (ev->res != bio->bi_iter.bi_size) + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); }
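
For reference: the blk_errors[] table added to linux/bio.c above pairs each
blk_status_t with the negative errno it is translated back to at the boundary
to generic code, and blk_status_to_errno() is the lookup with an -EIO
fallback for out-of-range values. Below is a self-contained userspace sketch
of that same table-lookup pattern; it is illustrative only, not part of the
patch, and all my_* names are invented:

	/* cc -o status_demo status_demo.c */
	#include <errno.h>
	#include <stdio.h>

	typedef unsigned char my_status_t;	/* stand-in for blk_status_t */

	enum { MY_STS_OK, MY_STS_NOTSUPP, MY_STS_TIMEOUT, MY_STS_IOERR };

	static const struct {
		int		err;
		const char	*name;
	} my_errors[] = {
		[MY_STS_OK]	 = { 0,		  "" },
		[MY_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" },
		[MY_STS_TIMEOUT] = { -ETIMEDOUT,  "timeout" },
		[MY_STS_IOERR]	 = { -EIO,	  "I/O" },
	};

	#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

	static int my_status_to_errno(my_status_t status)
	{
		/* unknown statuses decay to -EIO, as in the kernel version */
		if (status >= ARRAY_SIZE(my_errors))
			return -EIO;
		return my_errors[status].err;
	}

	int main(void)
	{
		printf("%d (%s)\n", my_status_to_errno(MY_STS_TIMEOUT),
		       my_errors[MY_STS_TIMEOUT].name);
		return 0;
	}

Keeping the status namespace small and dense means the conversion is a single
bounds check plus an array index, rather than a switch over sparse errnos.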