Update bcachefs sources to 14ce2a2031 bcachefs: fixes for building in userspace

Kent Overstreet 2017-12-21 18:00:30 -05:00
parent 8acc54456e
commit 1cf4d51dc4
61 changed files with 2074 additions and 1442 deletions

View File

@ -1 +1 @@
e57b5958cf4e8530d26f7c36a6e1427fb284cc70 14ce2a2031f3761a4b957aa2e5aac446ce18b87c

View File

@ -293,11 +293,11 @@ int cmd_list(int argc, char *argv[])
list_modes, "list mode"); list_modes, "list mode");
break; break;
case 'f': case 'f':
opts.fix_errors = FSCK_ERR_YES; opt_set(opts, fix_errors, FSCK_OPT_YES);
opts.norecovery = false; opt_set(opts, norecovery, false);
break; break;
case 'v': case 'v':
opts.verbose_recovery = true; opt_set(opts, verbose_recovery, true);
break; break;
case 'h': case 'h':
list_keys_usage(); list_keys_usage();

View File

@ -28,18 +28,19 @@ int cmd_fsck(int argc, char *argv[])
int opt; int opt;
opt_set(opts, degraded, true); opt_set(opts, degraded, true);
opt_set(opts, fix_errors, FSCK_OPT_ASK);
while ((opt = getopt(argc, argv, "pynfvh")) != -1) while ((opt = getopt(argc, argv, "pynfvh")) != -1)
switch (opt) { switch (opt) {
case 'p': case 'p':
opt_set(opts, fix_errors, FSCK_ERR_YES); opt_set(opts, fix_errors, FSCK_OPT_YES);
break; break;
case 'y': case 'y':
opt_set(opts, fix_errors, FSCK_ERR_YES); opt_set(opts, fix_errors, FSCK_OPT_YES);
break; break;
case 'n': case 'n':
opt_set(opts, nochanges, true); opt_set(opts, nochanges, true);
opt_set(opts, fix_errors, FSCK_ERR_NO); opt_set(opts, fix_errors, FSCK_OPT_NO);
break; break;
case 'f': case 'f':
/* force check, even if filesystem marked clean: */ /* force check, even if filesystem marked clean: */

View File

@ -164,7 +164,7 @@ static struct bch_inode_unpacked create_file(struct bch_fs *c,
struct bch_inode_unpacked new_inode; struct bch_inode_unpacked new_inode;
int ret; int ret;
bch2_inode_init(c, &new_inode, uid, gid, mode, rdev); bch2_inode_init(c, &new_inode, uid, gid, mode, rdev, parent);
ret = bch2_inode_create(c, &new_inode, BLOCKDEV_INODE_MAX, 0, ret = bch2_inode_create(c, &new_inode, BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint); &c->unused_inode_hint);
@ -247,7 +247,6 @@ static void write_data(struct bch_fs *c,
struct bch_inode_unpacked *dst_inode, struct bch_inode_unpacked *dst_inode,
u64 dst_offset, void *buf, size_t len) u64 dst_offset, void *buf, size_t len)
{ {
struct disk_reservation res;
struct bch_write_op op; struct bch_write_op op;
struct bio_vec bv; struct bio_vec bv;
struct closure cl; struct closure cl;
@ -261,12 +260,15 @@ static void write_data(struct bch_fs *c,
op.wbio.bio.bi_iter.bi_size = len; op.wbio.bio.bi_iter.bi_size = len;
bch2_bio_map(&op.wbio.bio, buf); bch2_bio_map(&op.wbio.bio, buf);
int ret = bch2_disk_reservation_get(c, &res, len >> 9, 0); bch2_write_op_init(&op, c);
op.write_point = writepoint_hashed(0);
op.pos = POS(dst_inode->bi_inum, dst_offset >> 9);
int ret = bch2_disk_reservation_get(c, &op.res, len >> 9, 0);
if (ret) if (ret)
die("error reserving space in new filesystem: %s", strerror(-ret)); die("error reserving space in new filesystem: %s", strerror(-ret));
bch2_write_op_init(&op, c, res, NULL, writepoint_hashed(0),
POS(dst_inode->bi_inum, dst_offset >> 9), NULL, 0);
closure_call(&op.cl, bch2_write, NULL, &cl); closure_call(&op.cl, bch2_write, NULL, &cl);
closure_sync(&cl); closure_sync(&cl);

View File

@ -243,7 +243,8 @@ static inline void bioset_free(struct bio_set *bs)
static inline int bioset_init(struct bio_set *bs, static inline int bioset_init(struct bio_set *bs,
unsigned pool_size, unsigned pool_size,
unsigned front_pad) unsigned front_pad,
int flags)
{ {
bs->front_pad = front_pad; bs->front_pad = front_pad;
return 0; return 0;
@ -251,6 +252,10 @@ static inline int bioset_init(struct bio_set *bs,
extern struct bio_set *bioset_create(unsigned int, unsigned int); extern struct bio_set *bioset_create(unsigned int, unsigned int);
extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int); extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int);
enum {
BIOSET_NEED_BVECS = 1 << 0,
BIOSET_NEED_RESCUER = 1 << 1,
};
extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
extern void bio_put(struct bio *); extern void bio_put(struct bio *);
@ -271,13 +276,6 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
} }
extern void bio_endio(struct bio *); extern void bio_endio(struct bio *);
extern void bio_endio_nodec(struct bio *);
static inline void bio_io_error(struct bio *bio)
{
bio->bi_error = -EIO;
bio_endio(bio);
}
extern void bio_advance(struct bio *, unsigned); extern void bio_advance(struct bio *, unsigned);
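
The shim's bioset_init() now takes a flags argument to mirror the current kernel API (the BIOSET_NEED_* values added above); in this userspace version only front_pad is actually used. A minimal caller sketch, assuming this header is included -- the bio_set name and sizes below are illustrative:

static struct bio_set example_bs;

static int example_bioset_setup(void)
{
	/* pool_size is ignored by the shim; the flags argument exists for
	 * API compatibility with kernel callers passing BIOSET_NEED_BVECS */
	return bioset_init(&example_bs, 4, 0, BIOSET_NEED_BVECS);
}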

View File

@ -13,7 +13,27 @@ struct bio_set;
struct bio; struct bio;
struct block_device; struct block_device;
typedef void (bio_end_io_t) (struct bio *); typedef void (bio_end_io_t) (struct bio *);
typedef void (bio_destructor_t) (struct bio *);
/*
* Block error status values. See block/blk-core:blk_errors for the details.
*/
typedef u8 __bitwise blk_status_t;
#define BLK_STS_OK 0
#define BLK_STS_NOTSUPP ((__force blk_status_t)1)
#define BLK_STS_TIMEOUT ((__force blk_status_t)2)
#define BLK_STS_NOSPC ((__force blk_status_t)3)
#define BLK_STS_TRANSPORT ((__force blk_status_t)4)
#define BLK_STS_TARGET ((__force blk_status_t)5)
#define BLK_STS_NEXUS ((__force blk_status_t)6)
#define BLK_STS_MEDIUM ((__force blk_status_t)7)
#define BLK_STS_PROTECTION ((__force blk_status_t)8)
#define BLK_STS_RESOURCE ((__force blk_status_t)9)
#define BLK_STS_IOERR ((__force blk_status_t)10)
/* hack for device mapper, don't use elsewhere: */
#define BLK_STS_DM_REQUEUE ((__force blk_status_t)11)
#define BLK_STS_AGAIN ((__force blk_status_t)12)
/* /*
* main unit of I/O for the block layer and lower layers (ie drivers and * main unit of I/O for the block layer and lower layers (ie drivers and
@ -22,7 +42,7 @@ typedef void (bio_destructor_t) (struct bio *);
struct bio { struct bio {
struct bio *bi_next; /* request queue link */ struct bio *bi_next; /* request queue link */
struct block_device *bi_bdev; struct block_device *bi_bdev;
int bi_error; blk_status_t bi_status;
unsigned int bi_opf; /* bottom bits req flags, unsigned int bi_opf; /* bottom bits req flags,
* top bits REQ_OP. Use * top bits REQ_OP. Use
* accessors. * accessors.

View File

@ -197,5 +197,8 @@ static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx)
#define capable(cap) true #define capable(cap) true
int blk_status_to_errno(blk_status_t status);
blk_status_t errno_to_blk_status(int errno);
#endif /* __TOOLS_LINUX_BLKDEV_H */ #endif /* __TOOLS_LINUX_BLKDEV_H */
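
blk_status_to_errno()/errno_to_blk_status() are only declared here; their definitions live elsewhere in the tools tree. A hedged sketch of the intended mapping (not the actual implementation; needs <errno.h>):

int blk_status_to_errno(blk_status_t status)
{
	switch (status) {
	case BLK_STS_OK:	return 0;
	case BLK_STS_NOSPC:	return -ENOSPC;
	case BLK_STS_NOTSUPP:	return -EOPNOTSUPP;
	case BLK_STS_TIMEOUT:	return -ETIMEDOUT;
	default:		return -EIO;
	}
}

blk_status_t errno_to_blk_status(int err)
{
	switch (err) {
	case 0:			return BLK_STS_OK;
	case -ENOSPC:		return BLK_STS_NOSPC;
	case -EOPNOTSUPP:	return BLK_STS_NOTSUPP;
	default:		return BLK_STS_IOERR;
	}
}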

View File

@ -14,7 +14,7 @@
#define BUG() do { assert(0); unreachable(); } while (0) #define BUG() do { assert(0); unreachable(); } while (0)
#define BUG_ON(cond) assert(!(cond)) #define BUG_ON(cond) assert(!(cond))
#define WARN_ON_ONCE(cond) assert(!(cond)) #define WARN_ON_ONCE(cond) ({ bool _r = (cond); if (_r) assert(0); _r; })
#define WARN_ONCE(cond, msg) ({ bool _r = (cond); if (_r) assert(0); _r; }) #define WARN_ONCE(cond, msg) ({ bool _r = (cond); if (_r) assert(0); _r; })
#define __WARN() assert(0) #define __WARN() assert(0)
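
The point of the new WARN_ON_ONCE() definition is that it evaluates to the condition, matching kernel semantics, so it can be used directly inside an if. A small illustration (the function below is made up; -EINVAL needs <errno.h>):

static int example_check_len(unsigned len, unsigned max)
{
	/* relies on WARN_ON_ONCE() returning the condition's value,
	 * which the previous assert-only definition did not do */
	if (WARN_ON_ONCE(len > max))
		return -EINVAL;

	return 0;
}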

View File

@ -204,4 +204,19 @@ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns)
extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
const struct timespec64 rhs); const struct timespec64 rhs);
static inline struct timespec timespec_trunc(struct timespec t, unsigned gran)
{
/* Avoid division in the common cases 1 ns and 1 s. */
if (gran == 1) {
/* nothing */
} else if (gran == NSEC_PER_SEC) {
t.tv_nsec = 0;
} else if (gran > 1 && gran < NSEC_PER_SEC) {
t.tv_nsec -= t.tv_nsec % gran;
} else {
WARN(1, "illegal file time granularity: %u", gran);
}
return t;
}
#endif /* _LINUX_TIME64_H */ #endif /* _LINUX_TIME64_H */
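
timespec_trunc() is the helper fs code uses to round a timestamp down to the filesystem's stated granularity (the kernel's sb->s_time_gran); a short usage sketch assuming one-second granularity:

static struct timespec example_trunc_to_seconds(struct timespec ts)
{
	/* gran == NSEC_PER_SEC: keep whole seconds, zero the nanoseconds */
	return timespec_trunc(ts, NSEC_PER_SEC);
}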

View File

@ -193,8 +193,7 @@ int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
if (ret < 0) if (ret < 0)
return ret; return ret;
else { else {
inode->v.i_ctime = inode->v.i_ctime = current_time(&inode->v);
current_fs_time(inode->v.i_sb);
mark_inode_dirty(&inode->v); mark_inode_dirty(&inode->v);
if (ret == 0) if (ret == 0)
acl = NULL; acl = NULL;

View File

@ -257,7 +257,7 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
return; return;
a = bkey_s_c_to_alloc(k); a = bkey_s_c_to_alloc(k);
ca = c->devs[a.k->p.inode]; ca = bch_dev_bkey_exists(c, a.k->p.inode);
if (a.k->p.offset >= ca->mi.nbuckets) if (a.k->p.offset >= ca->mi.nbuckets)
return; return;
@ -305,10 +305,12 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
bch2_alloc_read_key(c, bkey_i_to_s_c(k)); bch2_alloc_read_key(c, bkey_i_to_s_c(k));
} }
mutex_lock(&c->bucket_lock);
for_each_member_device(ca, c, i) { for_each_member_device(ca, c, i) {
bch2_recalc_min_prio(c, ca, READ); bch2_recalc_min_prio(c, ca, READ);
bch2_recalc_min_prio(c, ca, WRITE); bch2_recalc_min_prio(c, ca, WRITE);
} }
mutex_unlock(&c->bucket_lock);
return 0; return 0;
} }
@ -368,7 +370,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode]) if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
return 0; return 0;
ca = c->devs[pos.inode]; ca = bch_dev_bkey_exists(c, pos.inode);
if (pos.offset >= ca->mi.nbuckets) if (pos.offset >= ca->mi.nbuckets)
return 0; return 0;
@ -461,7 +463,7 @@ static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
/* Bucket heap / gen */ /* Bucket heap / gen */
void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw) static void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
{ {
struct prio_clock *clock = &c->prio_clock[rw]; struct prio_clock *clock = &c->prio_clock[rw];
struct bucket *g; struct bucket *g;
@ -975,7 +977,7 @@ static int bch2_allocator_thread(void *arg)
void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{ {
struct bch_dev *ca = c->devs[ob->ptr.dev]; struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
spin_lock(&ob->lock); spin_lock(&ob->lock);
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), false, bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), false,
@ -1303,7 +1305,7 @@ static void writepoint_drop_ptrs(struct bch_fs *c,
for (i = wp->nr_ptrs - 1; i >= 0; --i) { for (i = wp->nr_ptrs - 1; i >= 0; --i) {
struct open_bucket *ob = wp->ptrs[i]; struct open_bucket *ob = wp->ptrs[i];
struct bch_dev *ca = c->devs[ob->ptr.dev]; struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
if (nr_ptrs_dislike && !test_bit(ob->ptr.dev, devs->d)) { if (nr_ptrs_dislike && !test_bit(ob->ptr.dev, devs->d)) {
BUG_ON(ca->open_buckets_partial_nr >= BUG_ON(ca->open_buckets_partial_nr >=
@ -1331,7 +1333,7 @@ static void verify_not_stale(struct bch_fs *c, const struct write_point *wp)
unsigned i; unsigned i;
writepoint_for_each_ptr(wp, ob, i) { writepoint_for_each_ptr(wp, ob, i) {
struct bch_dev *ca = c->devs[ob->ptr.dev]; struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
BUG_ON(ptr_stale(ca, &ob->ptr)); BUG_ON(ptr_stale(ca, &ob->ptr));
} }
@ -1537,7 +1539,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
for (i = 0; i < wp->nr_ptrs_can_use; i++) { for (i = 0; i < wp->nr_ptrs_can_use; i++) {
struct open_bucket *ob = wp->ptrs[i]; struct open_bucket *ob = wp->ptrs[i];
struct bch_dev *ca = c->devs[ob->ptr.dev]; struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
struct bch_extent_ptr tmp = ob->ptr; struct bch_extent_ptr tmp = ob->ptr;
EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptr.dev)); EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptr.dev));
@ -1589,7 +1591,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
ra_pages += bdi->ra_pages; ra_pages += bdi->ra_pages;
} }
c->bdi.ra_pages = ra_pages; bch2_set_ra_pages(c, ra_pages);
/* Find fastest, slowest tiers with devices: */ /* Find fastest, slowest tiers with devices: */
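
Throughout this file (and the rest of the commit) raw c->devs[...] lookups are replaced with bch_dev_bkey_exists(). The helper itself is not part of this diff; conceptually it is a checked accessor along these lines (sketch only -- the name example_dev_bkey_exists and its body are illustrative, not the real definition):

static inline struct bch_dev *example_dev_bkey_exists(struct bch_fs *c,
						       unsigned dev)
{
	/* a key referencing this device index implies the device exists */
	EBUG_ON(dev >= c->sb.nr_devices || !c->devs[dev]);

	return c->devs[dev];
}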

View File

@ -326,9 +326,9 @@ struct io_count {
struct bch_dev { struct bch_dev {
struct kobject kobj; struct kobject kobj;
struct percpu_ref ref; struct percpu_ref ref;
struct completion ref_completion;
struct percpu_ref io_ref; struct percpu_ref io_ref;
struct completion stop_complete; struct completion io_ref_completion;
struct completion offline_complete;
struct bch_fs *fs; struct bch_fs *fs;
@ -515,12 +515,11 @@ struct bch_fs {
struct closure sb_write; struct closure sb_write;
struct mutex sb_lock; struct mutex sb_lock;
struct backing_dev_info bdi;
/* BTREE CACHE */ /* BTREE CACHE */
struct bio_set btree_read_bio; struct bio_set btree_read_bio;
struct btree_root btree_roots[BTREE_ID_NR]; struct btree_root btree_roots[BTREE_ID_NR];
bool btree_roots_dirty;
struct mutex btree_root_lock; struct mutex btree_root_lock;
struct btree_cache btree_cache; struct btree_cache btree_cache;
@ -710,6 +709,14 @@ struct bch_fs {
#undef BCH_TIME_STAT #undef BCH_TIME_STAT
}; };
static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
{
#ifndef NO_BCACHEFS_FS
if (c->vfs_sb)
c->vfs_sb->s_bdi->ra_pages = ra_pages;
#endif
}
static inline bool bch2_fs_running(struct bch_fs *c) static inline bool bch2_fs_running(struct bch_fs *c)
{ {
return c->state == BCH_FS_RO || c->state == BCH_FS_RW; return c->state == BCH_FS_RO || c->state == BCH_FS_RW;

View File

@ -593,18 +593,24 @@ struct bch_inode_generation {
} __attribute__((packed, aligned(8))); } __attribute__((packed, aligned(8)));
BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION); BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION);
#define BCH_INODE_FIELDS() \ #define BCH_INODE_FIELDS() \
BCH_INODE_FIELD(bi_atime, 64) \ BCH_INODE_FIELD(bi_atime, 64) \
BCH_INODE_FIELD(bi_ctime, 64) \ BCH_INODE_FIELD(bi_ctime, 64) \
BCH_INODE_FIELD(bi_mtime, 64) \ BCH_INODE_FIELD(bi_mtime, 64) \
BCH_INODE_FIELD(bi_otime, 64) \ BCH_INODE_FIELD(bi_otime, 64) \
BCH_INODE_FIELD(bi_size, 64) \ BCH_INODE_FIELD(bi_size, 64) \
BCH_INODE_FIELD(bi_sectors, 64) \ BCH_INODE_FIELD(bi_sectors, 64) \
BCH_INODE_FIELD(bi_uid, 32) \ BCH_INODE_FIELD(bi_uid, 32) \
BCH_INODE_FIELD(bi_gid, 32) \ BCH_INODE_FIELD(bi_gid, 32) \
BCH_INODE_FIELD(bi_nlink, 32) \ BCH_INODE_FIELD(bi_nlink, 32) \
BCH_INODE_FIELD(bi_generation, 32) \ BCH_INODE_FIELD(bi_generation, 32) \
BCH_INODE_FIELD(bi_dev, 32) BCH_INODE_FIELD(bi_dev, 32) \
BCH_INODE_FIELD(bi_data_checksum, 8) \
BCH_INODE_FIELD(bi_compression, 8)
#define BCH_INODE_FIELDS_INHERIT() \
BCH_INODE_FIELD(bi_data_checksum) \
BCH_INODE_FIELD(bi_compression)
enum { enum {
/* /*
@ -794,7 +800,7 @@ struct bch_sb_layout {
__u8 sb_max_size_bits; /* base 2 of 512 byte sectors */ __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */
__u8 nr_superblocks; __u8 nr_superblocks;
__u8 pad[5]; __u8 pad[5];
__u64 sb_offset[61]; __le64 sb_offset[61];
} __attribute__((packed, aligned(8))); } __attribute__((packed, aligned(8)));
#define BCH_SB_LAYOUT_SECTOR 7 #define BCH_SB_LAYOUT_SECTOR 7
@ -1089,6 +1095,11 @@ struct jset_entry {
}; };
}; };
struct jset_entry_blacklist {
struct jset_entry entry;
__le64 seq;
};
#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) #define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
enum { enum {
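
BCH_INODE_FIELDS() and the new BCH_INODE_FIELDS_INHERIT() above are x-macro lists, and the inherit list pairs with bch2_inode_init() gaining a parent argument earlier in this commit. The struct and helper below are an illustrative sketch of how such lists are typically consumed (not the actual bcachefs code; assumes u8/u32/u64 typedefs):

struct example_inode_unpacked {
#define BCH_INODE_FIELD(_name, _bits)	u##_bits _name;
	BCH_INODE_FIELDS()
#undef BCH_INODE_FIELD
};

/* copy the inheritable options (checksum, compression) from a parent
 * directory to a newly created child inode */
static void example_inode_inherit(struct example_inode_unpacked *child,
				  const struct example_inode_unpacked *parent)
{
#define BCH_INODE_FIELD(_name)	child->_name = parent->_name;
	BCH_INODE_FIELDS_INHERIT()
#undef BCH_INODE_FIELD
}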

View File

@ -1,6 +1,7 @@
#include "bcachefs.h" #include "bcachefs.h"
#include "bkey.h" #include "bkey.h"
#include "bkey_methods.h"
#include "bset.h" #include "bset.h"
#include "util.h" #include "util.h"
@ -80,37 +81,6 @@ static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
const struct bkey_format *format) {} const struct bkey_format *format) {}
#endif #endif
int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
{
char *out = buf, *end = buf + size;
#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
p("u64s %u type %u %llu:%llu snap %u len %u ver %llu",
k->u64s, k->type, k->p.inode, k->p.offset,
k->p.snapshot, k->size, k->version.lo);
BUG_ON(bkey_packed(k));
switch (k->type) {
case KEY_TYPE_DELETED:
p(" deleted");
break;
case KEY_TYPE_DISCARD:
p(" discard");
break;
case KEY_TYPE_ERROR:
p(" error");
break;
case KEY_TYPE_COOKIE:
p(" cookie");
break;
}
#undef p
return out - buf;
}
struct pack_state { struct pack_state {
const struct bkey_format *format; const struct bkey_format *format;
unsigned bits; /* bits remaining in current word */ unsigned bits; /* bits remaining in current word */
@ -336,7 +306,8 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
* Extents - we have to guarantee that if an extent is packed, a trimmed * Extents - we have to guarantee that if an extent is packed, a trimmed
* version will also pack: * version will also pack:
*/ */
if (bkey_start_offset(in) < format->field_offset[BKEY_FIELD_OFFSET]) if (bkey_start_offset(in) <
le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET]))
return false; return false;
pack_state_finish(&state, out); pack_state_finish(&state, out);
@ -800,7 +771,7 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
bool *eax_zeroed) bool *eax_zeroed)
{ {
unsigned bits = format->bits_per_field[field]; unsigned bits = format->bits_per_field[field];
u64 offset = format->field_offset[field]; u64 offset = le64_to_cpu(format->field_offset[field]);
unsigned i, byte, bit_offset, align, shl, shr; unsigned i, byte, bit_offset, align, shl, shr;
if (!bits && !offset) { if (!bits && !offset) {

View File

@ -8,7 +8,6 @@
#include "vstructs.h" #include "vstructs.h"
void bch2_to_binary(char *, const u64 *, unsigned); void bch2_to_binary(char *, const u64 *, unsigned);
int bch2_bkey_to_text(char *, size_t, const struct bkey *);
#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) #define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
@ -377,7 +376,8 @@ static inline u64 bkey_field_max(const struct bkey_format *f,
enum bch_bkey_fields nr) enum bch_bkey_fields nr)
{ {
return f->bits_per_field[nr] < 64 return f->bits_per_field[nr] < 64
? f->field_offset[nr] + ~(~0ULL << f->bits_per_field[nr]) ? (le64_to_cpu(f->field_offset[nr]) +
~(~0ULL << f->bits_per_field[nr]))
: U64_MAX; : U64_MAX;
} }

View File

@ -18,28 +18,11 @@ const struct bkey_ops *bch2_bkey_ops[] = {
[BKEY_TYPE_BTREE] = &bch2_bkey_btree_ops, [BKEY_TYPE_BTREE] = &bch2_bkey_btree_ops,
}; };
/* Returns string indicating reason for being invalid, or NULL if valid: */ const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type, struct bkey_s_c k)
struct bkey_s_c k)
{ {
const struct bkey_ops *ops = bch2_bkey_ops[type]; const struct bkey_ops *ops = bch2_bkey_ops[type];
if (k.k->u64s < BKEY_U64s)
return "u64s too small";
if (!ops->is_extents) {
if (k.k->size)
return "nonzero size field";
} else {
if ((k.k->size == 0) != bkey_deleted(k.k))
return "bad size field";
}
if (ops->is_extents &&
!k.k->size &&
!bkey_deleted(k.k))
return "zero size field";
switch (k.k->type) { switch (k.k->type) {
case KEY_TYPE_DELETED: case KEY_TYPE_DELETED:
case KEY_TYPE_DISCARD: case KEY_TYPE_DISCARD:
@ -63,8 +46,41 @@ const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
} }
} }
const char *bch2_btree_bkey_invalid(struct bch_fs *c, struct btree *b, const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k) struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
if (k.k->u64s < BKEY_U64s)
return "u64s too small";
if (!ops->is_extents) {
if (k.k->size)
return "nonzero size field";
} else {
if ((k.k->size == 0) != bkey_deleted(k.k))
return "bad size field";
}
if (ops->is_extents &&
!k.k->size &&
!bkey_deleted(k.k))
return "zero size field";
if (k.k->p.snapshot)
return "nonzero snapshot";
return NULL;
}
const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
{
return __bch2_bkey_invalid(c, type, k) ?:
bch2_bkey_val_invalid(c, type, k);
}
const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
{ {
if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0) if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
return "key before start of btree node"; return "key before start of btree node";
@ -72,10 +88,7 @@ const char *bch2_btree_bkey_invalid(struct bch_fs *c, struct btree *b,
if (bkey_cmp(k.k->p, b->data->max_key) > 0) if (bkey_cmp(k.k->p, b->data->max_key) > 0)
return "key past end of btree node"; return "key past end of btree node";
if (k.k->p.snapshot) return NULL;
return "nonzero snapshot";
return bch2_bkey_invalid(c, btree_node_type(b), k);
} }
void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
@ -86,7 +99,8 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
BUG_ON(!k.k->u64s); BUG_ON(!k.k->u64s);
invalid = bch2_btree_bkey_invalid(c, b, k); invalid = bch2_bkey_invalid(c, type, k) ?:
bch2_bkey_in_btree_node(b, k);
if (invalid) { if (invalid) {
char buf[160]; char buf[160];
@ -100,33 +114,62 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
ops->key_debugcheck(c, b, k); ops->key_debugcheck(c, b, k);
} }
char *bch2_val_to_text(struct bch_fs *c, enum bkey_type type, #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
char *buf, size_t size, struct bkey_s_c k)
int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
{ {
const struct bkey_ops *ops = bch2_bkey_ops[type]; char *out = buf, *end = buf + size;
if (k.k->type >= KEY_TYPE_GENERIC_NR && p("u64s %u type %u ", k->u64s, k->type);
ops->val_to_text)
ops->val_to_text(c, buf, size, k);
return buf; if (bkey_cmp(k->p, POS_MAX))
p("%llu:%llu", k->p.inode, k->p.offset);
else
p("POS_MAX");
p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo);
return out - buf;
} }
char *bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type, int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
char *buf, size_t size, struct bkey_s_c k) char *buf, size_t size, struct bkey_s_c k)
{ {
const struct bkey_ops *ops = bch2_bkey_ops[type]; const struct bkey_ops *ops = bch2_bkey_ops[type];
char *out = buf, *end = buf + size; char *out = buf, *end = buf + size;
out += bch2_bkey_to_text(out, end - out, k.k); switch (k.k->type) {
case KEY_TYPE_DELETED:
if (k.k->type >= KEY_TYPE_GENERIC_NR && p(" deleted");
ops->val_to_text) { break;
out += scnprintf(out, end - out, ": "); case KEY_TYPE_DISCARD:
ops->val_to_text(c, out, end - out, k); p(" discard");
break;
case KEY_TYPE_ERROR:
p(" error");
break;
case KEY_TYPE_COOKIE:
p(" cookie");
break;
default:
if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text)
ops->val_to_text(c, buf, size, k);
break;
} }
return buf; return out - buf;
}
int bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
char *buf, size_t size, struct bkey_s_c k)
{
char *out = buf, *end = buf + size;
out += bch2_bkey_to_text(out, end - out, k.k);
out += scnprintf(out, end - out, ": ");
out += bch2_val_to_text(c, type, out, end - out, k);
return out - buf;
} }
void bch2_bkey_swab(enum bkey_type type, void bch2_bkey_swab(enum bkey_type type,

View File

@ -64,15 +64,19 @@ struct bkey_ops {
bool is_extents; bool is_extents;
}; };
const char *bch2_bkey_val_invalid(struct bch_fs *, enum bkey_type,
struct bkey_s_c);
const char *__bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
const char *bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c); const char *bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
const char *bch2_btree_bkey_invalid(struct bch_fs *, struct btree *, const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
struct bkey_s_c);
void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
char *bch2_val_to_text(struct bch_fs *, enum bkey_type,
char *, size_t, struct bkey_s_c); int bch2_bkey_to_text(char *, size_t, const struct bkey *);
char *bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type, int bch2_val_to_text(struct bch_fs *, enum bkey_type,
char *, size_t, struct bkey_s_c); char *, size_t, struct bkey_s_c);
int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
char *, size_t, struct bkey_s_c);
void bch2_bkey_swab(enum bkey_type, const struct bkey_format *, void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
struct bkey_packed *); struct bkey_packed *);
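
The to_text helpers now return int (bytes written) and are split into key-only, value-only, and combined variants. A usage sketch matching how btree_io.c calls them in this commit (buffer size arbitrary; the wrapper function is made up):

static void example_report_bad_key(struct bch_fs *c, enum bkey_type type,
				   struct bkey_s_c k, const char *reason)
{
	char buf[160];

	/* formats "u64s ... type ... pos ...: <value>" into buf */
	bch2_bkey_val_to_text(c, type, buf, sizeof(buf), k);
	bch_err(c, "invalid bkey %s: %s", buf, reason);
}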

View File

@ -96,7 +96,7 @@ u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
struct bkey_s_c_extent e = bkey_s_c_to_extent(k); struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
extent_for_each_ptr(e, ptr) { extent_for_each_ptr(e, ptr) {
struct bch_dev *ca = c->devs[ptr->dev]; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
size_t b = PTR_BUCKET_NR(ca, ptr); size_t b = PTR_BUCKET_NR(ca, ptr);
if (gen_after(ca->oldest_gens[b], ptr->gen)) if (gen_after(ca->oldest_gens[b], ptr->gen))
@ -159,14 +159,15 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
(!c->opts.nofsck && (!c->opts.nofsck &&
fsck_err_on(!bch2_sb_has_replicas(c, e, data_type), c, fsck_err_on(!bch2_sb_has_replicas(c, e, data_type), c,
"superblock not marked as containing replicas"))) { "superblock not marked as containing replicas (type %u)",
data_type))) {
ret = bch2_check_mark_super(c, e, data_type); ret = bch2_check_mark_super(c, e, data_type);
if (ret) if (ret)
return ret; return ret;
} }
extent_for_each_ptr(e, ptr) { extent_for_each_ptr(e, ptr) {
struct bch_dev *ca = c->devs[ptr->dev]; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = PTR_BUCKET(ca, ptr); struct bucket *g = PTR_BUCKET(ca, ptr);
if (mustfix_fsck_err_on(!g->mark.gen_valid, c, if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
@ -315,14 +316,14 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
lockdep_assert_held(&c->sb_lock); lockdep_assert_held(&c->sb_lock);
for (i = 0; i < layout->nr_superblocks; i++) { for (i = 0; i < layout->nr_superblocks; i++) {
if (layout->sb_offset[i] == BCH_SB_SECTOR) u64 offset = le64_to_cpu(layout->sb_offset[i]);
if (offset == BCH_SB_SECTOR)
mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
BUCKET_SB, flags); BUCKET_SB, flags);
mark_metadata_sectors(c, ca, mark_metadata_sectors(c, ca, offset,
layout->sb_offset[i], offset + (1 << layout->sb_max_size_bits),
layout->sb_offset[i] +
(1 << layout->sb_max_size_bits),
BUCKET_SB, flags); BUCKET_SB, flags);
} }
@ -414,7 +415,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
spin_lock(&ob->lock); spin_lock(&ob->lock);
if (ob->valid) { if (ob->valid) {
gc_pos_set(c, gc_pos_alloc(c, ob)); gc_pos_set(c, gc_pos_alloc(c, ob));
ca = c->devs[ob->ptr.dev]; ca = bch_dev_bkey_exists(c, ob->ptr.dev);
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), true, bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), true,
gc_pos_alloc(c, ob), gc_pos_alloc(c, ob),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
@ -424,7 +425,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
} }
} }
void bch2_gc_start(struct bch_fs *c) static void bch2_gc_start(struct bch_fs *c)
{ {
struct bch_dev *ca; struct bch_dev *ca;
struct bucket *g; struct bucket *g;

View File

@ -556,7 +556,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
struct bset_tree *t; struct bset_tree *t;
struct bset *start_bset = bset(b, &b->set[start_idx]); struct bset *start_bset = bset(b, &b->set[start_idx]);
bool used_mempool = false; bool used_mempool = false;
u64 start_time; u64 start_time, seq = 0;
unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1; unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
bool sorting_entire_node = start_idx == 0 && bool sorting_entire_node = start_idx == 0 &&
end_idx == b->nsets; end_idx == b->nsets;
@ -595,12 +595,9 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
bch2_time_stats_update(&c->btree_sort_time, start_time); bch2_time_stats_update(&c->btree_sort_time, start_time);
/* Make sure we preserve bset journal_seq: */ /* Make sure we preserve bset journal_seq: */
for (t = b->set + start_idx + 1; for (t = b->set + start_idx; t < b->set + end_idx; t++)
t < b->set + end_idx; seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
t++) start_bset->journal_seq = cpu_to_le64(seq);
start_bset->journal_seq =
max(start_bset->journal_seq,
bset(b, t)->journal_seq);
if (sorting_entire_node) { if (sorting_entire_node) {
unsigned u64s = le16_to_cpu(out->keys.u64s); unsigned u64s = le16_to_cpu(out->keys.u64s);
@ -958,6 +955,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
{ {
struct bkey_packed *k, *prev = NULL; struct bkey_packed *k, *prev = NULL;
struct bpos prev_pos = POS_MIN; struct bpos prev_pos = POS_MIN;
enum bkey_type type = btree_node_type(b);
bool seen_non_whiteout = false; bool seen_non_whiteout = false;
const char *err; const char *err;
int ret = 0; int ret = 0;
@ -1025,7 +1023,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
if (!BSET_SEPARATE_WHITEOUTS(i)) { if (!BSET_SEPARATE_WHITEOUTS(i)) {
seen_non_whiteout = true; seen_non_whiteout = true;
whiteout_u64s = 0; *whiteout_u64s = 0;
} }
for (k = i->start; for (k = i->start;
@ -1059,16 +1057,17 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
} }
if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
bch2_bkey_swab(btree_node_type(b), &b->format, k); bch2_bkey_swab(type, &b->format, k);
u = bkey_disassemble(b, k, &tmp); u = bkey_disassemble(b, k, &tmp);
invalid = bch2_btree_bkey_invalid(c, b, u); invalid = __bch2_bkey_invalid(c, type, u) ?:
bch2_bkey_in_btree_node(b, u) ?:
(write ? bch2_bkey_val_invalid(c, type, u) : NULL);
if (invalid) { if (invalid) {
char buf[160]; char buf[160];
bch2_bkey_val_to_text(c, btree_node_type(b), bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
buf, sizeof(buf), u);
btree_err(BTREE_ERR_FIXABLE, c, b, i, btree_err(BTREE_ERR_FIXABLE, c, b, i,
"invalid bkey %s: %s", buf, invalid); "invalid bkey %s: %s", buf, invalid);
@ -1114,6 +1113,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
struct btree_node_entry *bne; struct btree_node_entry *bne;
struct btree_node_iter *iter; struct btree_node_iter *iter;
struct btree_node *sorted; struct btree_node *sorted;
struct bkey_packed *k;
struct bset *i;
bool used_mempool; bool used_mempool;
unsigned u64s; unsigned u64s;
int ret, retry_read = 0, write = READ; int ret, retry_read = 0, write = READ;
@ -1137,7 +1138,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
unsigned sectors, whiteout_u64s = 0; unsigned sectors, whiteout_u64s = 0;
struct nonce nonce; struct nonce nonce;
struct bch_csum csum; struct bch_csum csum;
struct bset *i;
if (!b->written) { if (!b->written) {
i = &b->data->keys; i = &b->data->keys;
@ -1238,6 +1238,31 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
btree_bounce_free(c, btree_page_order(c), used_mempool, sorted); btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
i = &b->data->keys;
for (k = i->start; k != vstruct_last(i);) {
enum bkey_type type = btree_node_type(b);
struct bkey tmp;
struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
const char *invalid = bch2_bkey_val_invalid(c, type, u);
if (invalid) {
char buf[160];
bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
btree_err(BTREE_ERR_FIXABLE, c, b, i,
"invalid bkey %s: %s", buf, invalid);
btree_keys_account_key_drop(&b->nr, 0, k);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
continue;
}
k = bkey_next(k);
}
bch2_bset_build_aux_tree(b, b->set, false); bch2_bset_build_aux_tree(b, b->set, false);
set_needs_whiteout(btree_bset_first(b)); set_needs_whiteout(btree_bset_first(b));
@ -1278,13 +1303,13 @@ static void btree_node_read_work(struct work_struct *work)
bio->bi_iter.bi_size = btree_bytes(c); bio->bi_iter.bi_size = btree_bytes(c);
submit_bio_wait(bio); submit_bio_wait(bio);
start: start:
bch2_dev_io_err_on(bio->bi_error, rb->pick.ca, "btree read"); bch2_dev_io_err_on(bio->bi_status, rb->pick.ca, "btree read");
percpu_ref_put(&rb->pick.ca->io_ref); percpu_ref_put(&rb->pick.ca->io_ref);
__set_bit(rb->pick.ca->dev_idx, avoid.d); __set_bit(rb->pick.ca->dev_idx, avoid.d);
rb->pick = bch2_btree_pick_ptr(c, b, &avoid); rb->pick = bch2_btree_pick_ptr(c, b, &avoid);
if (!bio->bi_error && if (!bio->bi_status &&
!bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca))) !bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca)))
goto out; goto out;
} while (!IS_ERR_OR_NULL(rb->pick.ca)); } while (!IS_ERR_OR_NULL(rb->pick.ca));
@ -1377,17 +1402,24 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
bch2_btree_node_read(c, b, true); bch2_btree_node_read(c, b, true);
six_unlock_write(&b->lock);
if (btree_node_read_error(b)) { if (btree_node_read_error(b)) {
six_unlock_intent(&b->lock); bch2_btree_node_hash_remove(&c->btree_cache, b);
return -EIO;
mutex_lock(&c->btree_cache.lock);
list_move(&b->list, &c->btree_cache.freeable);
mutex_unlock(&c->btree_cache.lock);
ret = -EIO;
goto err;
} }
bch2_btree_set_root_for_read(c, b); bch2_btree_set_root_for_read(c, b);
err:
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock); six_unlock_intent(&b->lock);
return 0; return ret;
} }
void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
@ -1412,35 +1444,57 @@ static void bch2_btree_node_write_error(struct bch_fs *c,
struct closure *cl = wbio->cl; struct closure *cl = wbio->cl;
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
struct bkey_i_extent *new_key; struct bkey_i_extent *new_key;
struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
struct btree_iter iter;
int ret;
six_lock_read(&b->lock); __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
bkey_copy(&tmp.k, &b->key); BTREE_MAX_DEPTH,
six_unlock_read(&b->lock); b->level, 0);
retry:
ret = bch2_btree_iter_traverse(&iter);
if (ret)
goto err;
if (!bkey_extent_is_data(&tmp.k.k) || !PTR_HASH(&tmp.k)) { /* has node been freed? */
/* Node has been freed: */ if (iter.nodes[b->level] != b) {
/* node has been freed: */
if (!btree_node_dying(b))
panic("foo4\n");
goto out; goto out;
} }
if (!btree_node_hashed(b))
panic("foo5\n");
bkey_copy(&tmp.k, &b->key);
new_key = bkey_i_to_extent(&tmp.k); new_key = bkey_i_to_extent(&tmp.k);
e = extent_i_to_s(new_key);
extent_for_each_ptr_backwards(e, ptr)
if (bch2_dev_list_has_dev(wbio->failed, ptr->dev))
bch2_extent_drop_ptr(e, ptr);
while (wbio->replicas_failed) { if (!bch2_extent_nr_ptrs(e.c))
unsigned idx = __fls(wbio->replicas_failed); goto err;
bch2_extent_drop_ptr_idx(extent_i_to_s(new_key), idx); ret = bch2_btree_node_update_key(c, &iter, b, new_key);
wbio->replicas_failed ^= 1 << idx; if (ret == -EINTR)
} goto retry;
if (ret)
if (!bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)) || goto err;
bch2_btree_node_update_key(c, b, new_key)) {
set_btree_node_noevict(b);
bch2_fatal_error(c);
}
out: out:
bch2_btree_iter_unlock(&iter);
bio_put(&wbio->bio); bio_put(&wbio->bio);
btree_node_write_done(c, b); btree_node_write_done(c, b);
if (cl) if (cl)
closure_put(cl); closure_put(cl);
return;
err:
set_btree_node_noevict(b);
bch2_fs_fatal_error(c, "fatal error writing btree node");
goto out;
} }
void bch2_btree_write_error_work(struct work_struct *work) void bch2_btree_write_error_work(struct work_struct *work)
@ -1470,12 +1524,17 @@ static void btree_node_write_endio(struct bio *bio)
struct closure *cl = !wbio->split ? wbio->cl : NULL; struct closure *cl = !wbio->split ? wbio->cl : NULL;
struct bch_fs *c = wbio->c; struct bch_fs *c = wbio->c;
struct bch_dev *ca = wbio->ca; struct bch_dev *ca = wbio->ca;
unsigned long flags;
bch2_latency_acct(ca, wbio->submit_time_us, WRITE); bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
if (bch2_dev_io_err_on(bio->bi_error, ca, "btree write") || if (bio->bi_status == BLK_STS_REMOVED ||
bch2_meta_write_fault("btree")) bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed); bch2_meta_write_fault("btree")) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bch2_dev_list_add_dev(&orig->failed, ca->dev_idx);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
}
if (wbio->have_io_ref) if (wbio->have_io_ref)
percpu_ref_put(&ca->io_ref); percpu_ref_put(&ca->io_ref);
@ -1491,12 +1550,11 @@ static void btree_node_write_endio(struct bio *bio)
wbio->used_mempool, wbio->used_mempool,
wbio->data); wbio->data);
if (wbio->replicas_failed) { if (wbio->failed.nr) {
unsigned long flags;
spin_lock_irqsave(&c->btree_write_error_lock, flags); spin_lock_irqsave(&c->btree_write_error_lock, flags);
bio_list_add(&c->btree_write_error_list, &wbio->bio); bio_list_add(&c->btree_write_error_list, &wbio->bio);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags); spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
queue_work(c->wq, &c->btree_write_error_work); queue_work(c->wq, &c->btree_write_error_work);
return; return;
} }
@ -1707,6 +1765,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write)); wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write));
wbio->cl = parent; wbio->cl = parent;
wbio->failed.nr = 0;
wbio->order = order; wbio->order = order;
wbio->used_mempool = used_mempool; wbio->used_mempool = used_mempool;
wbio->data = data; wbio->data = data;

View File

@ -75,8 +75,8 @@ bool bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
{ {
struct btree_iter *linked; struct btree_iter *linked;
struct btree *b = iter->nodes[level]; struct btree *b = iter->nodes[level];
enum btree_node_locked_type want = btree_lock_want(iter, level); int want = btree_lock_want(iter, level);
enum btree_node_locked_type have = btree_node_locked_type(iter, level); int have = btree_node_locked_type(iter, level);
if (want == have) if (want == have)
return true; return true;
@ -108,6 +108,17 @@ success:
return true; return true;
} }
bool bch2_btree_iter_relock(struct btree_iter *iter)
{
unsigned l;
for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
if (!bch2_btree_node_relock(iter, l))
return false;
return true;
}
/* Slowpath: */ /* Slowpath: */
bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
unsigned level, unsigned level,
@ -214,7 +225,6 @@ bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
unsigned new_locks_want) unsigned new_locks_want)
{ {
struct btree_iter *linked; struct btree_iter *linked;
unsigned l;
/* Drop locks we don't want anymore: */ /* Drop locks we don't want anymore: */
if (new_locks_want < iter->locks_want) if (new_locks_want < iter->locks_want)
@ -228,12 +238,9 @@ bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
iter->locks_want = new_locks_want; iter->locks_want = new_locks_want;
btree_iter_drop_extra_locks(iter); btree_iter_drop_extra_locks(iter);
for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++) if (bch2_btree_iter_relock(iter))
if (!bch2_btree_node_relock(iter, l)) return true;
goto fail;
return true;
fail:
/* /*
* Just an optimization: ancestor nodes must be locked before child * Just an optimization: ancestor nodes must be locked before child
* nodes, so set locks_want on iterators that might lock ancestors * nodes, so set locks_want on iterators that might lock ancestors

View File

@ -75,7 +75,7 @@ static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
mark_btree_node_locked(iter, level, SIX_LOCK_intent); mark_btree_node_locked(iter, level, SIX_LOCK_intent);
} }
static inline int btree_lock_want(struct btree_iter *iter, int level) static inline enum six_lock_type btree_lock_want(struct btree_iter *iter, int level)
{ {
return level < iter->locks_want return level < iter->locks_want
? SIX_LOCK_intent ? SIX_LOCK_intent
@ -111,6 +111,7 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos,
} }
bool bch2_btree_node_relock(struct btree_iter *, unsigned); bool bch2_btree_node_relock(struct btree_iter *, unsigned);
bool bch2_btree_iter_relock(struct btree_iter *);
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
void bch2_btree_node_lock_write(struct btree *, struct btree_iter *); void bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
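
bch2_btree_iter_relock() packages the "retake every lock from iter->level up to locks_want" loop, so callers can drop btree node locks around a blocking operation and then either continue or restart. A sketch of the pattern, mirroring how bch2_btree_node_update_key() uses it later in this commit (the wrapper is illustrative):

static int example_take_gc_lock(struct bch_fs *c, struct btree_iter *iter)
{
	if (down_read_trylock(&c->gc_lock))
		return 0;

	/* about to block: drop btree node locks first rather than holding
	 * them across the sleep, then try to retake them afterwards */
	bch2_btree_iter_unlock(iter);
	down_read(&c->gc_lock);

	if (!bch2_btree_iter_relock(iter)) {
		up_read(&c->gc_lock);
		return -EINTR;	/* caller restarts the operation */
	}

	return 0;
}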

View File

@ -196,6 +196,7 @@ enum btree_flags {
BTREE_NODE_accessed, BTREE_NODE_accessed,
BTREE_NODE_write_in_flight, BTREE_NODE_write_in_flight,
BTREE_NODE_just_written, BTREE_NODE_just_written,
BTREE_NODE_dying,
}; };
BTREE_FLAG(read_in_flight); BTREE_FLAG(read_in_flight);
@ -207,6 +208,7 @@ BTREE_FLAG(write_idx);
BTREE_FLAG(accessed); BTREE_FLAG(accessed);
BTREE_FLAG(write_in_flight); BTREE_FLAG(write_in_flight);
BTREE_FLAG(just_written); BTREE_FLAG(just_written);
BTREE_FLAG(dying);
static inline struct btree_write *btree_current_write(struct btree *b) static inline struct btree_write *btree_current_write(struct btree *b)
{ {
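
BTREE_FLAG() generates the usual bit helpers, so BTREE_NODE_dying plus BTREE_FLAG(dying) is what makes the btree_node_dying(b) and set_btree_node_dying(b) calls used elsewhere in this commit available. Roughly (approximate expansion, not copied from the real macro):

static inline bool btree_node_dying(struct btree *b)
{
	return test_bit(BTREE_NODE_dying, &b->flags);
}

static inline void set_btree_node_dying(struct btree *b)
{
	set_bit(BTREE_NODE_dying, &b->flags);
}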

View File

@ -130,7 +130,7 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
__le64, unsigned); __le64, unsigned);
int bch2_btree_node_update_key(struct bch_fs *, struct btree *, int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
struct bkey_i_extent *); struct btree *, struct bkey_i_extent *);
#endif /* _BCACHEFS_BTREE_UPDATE_H */ #endif /* _BCACHEFS_BTREE_UPDATE_H */

View File

@ -21,7 +21,7 @@
static void btree_node_will_make_reachable(struct btree_update *, static void btree_node_will_make_reachable(struct btree_update *,
struct btree *); struct btree *);
static void btree_update_drop_new_node(struct bch_fs *, struct btree *); static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *); static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
/* Debug code: */ /* Debug code: */
@ -686,7 +686,7 @@ retry:
BUG_ON(c->btree_roots[b->btree_id].as != as); BUG_ON(c->btree_roots[b->btree_id].as != as);
c->btree_roots[b->btree_id].as = NULL; c->btree_roots[b->btree_id].as = NULL;
bch2_btree_set_root_ondisk(c, b); bch2_btree_set_root_ondisk(c, b, WRITE);
/* /*
* We don't have to wait anything anything here (before * We don't have to wait anything anything here (before
@ -914,6 +914,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
struct btree_write *w; struct btree_write *w;
struct bset_tree *t; struct bset_tree *t;
set_btree_node_dying(b);
btree_interior_update_add_node_reference(as, b); btree_interior_update_add_node_reference(as, b);
/* /*
@ -925,7 +926,8 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
* in with keys that aren't in the journal anymore: * in with keys that aren't in the journal anymore:
*/ */
for_each_bset(b, t) for_each_bset(b, t)
as->journal_seq = max(as->journal_seq, bset(b, t)->journal_seq); as->journal_seq = max(as->journal_seq,
le64_to_cpu(bset(b, t)->journal_seq));
mutex_lock(&c->btree_interior_update_lock); mutex_lock(&c->btree_interior_update_lock);
@ -1027,6 +1029,10 @@ static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
mutex_unlock(&c->btree_cache.lock); mutex_unlock(&c->btree_cache.lock);
mutex_lock(&c->btree_root_lock); mutex_lock(&c->btree_root_lock);
BUG_ON(btree_node_root(c, b) &&
(b->level < btree_node_root(c, b)->level ||
!btree_node_dying(btree_node_root(c, b))));
btree_node_root(c, b) = b; btree_node_root(c, b) = b;
mutex_unlock(&c->btree_root_lock); mutex_unlock(&c->btree_root_lock);
@ -1054,7 +1060,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
gc_pos_btree_root(b->btree_id)); gc_pos_btree_root(b->btree_id));
} }
static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b) static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
{ {
struct btree_root *r = &c->btree_roots[b->btree_id]; struct btree_root *r = &c->btree_roots[b->btree_id];
@ -1064,6 +1070,8 @@ static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b)
bkey_copy(&r->key, &b->key); bkey_copy(&r->key, &b->key);
r->level = b->level; r->level = b->level;
r->alive = true; r->alive = true;
if (rw == WRITE)
c->btree_roots_dirty = true;
mutex_unlock(&c->btree_root_lock); mutex_unlock(&c->btree_root_lock);
} }
@ -1787,64 +1795,16 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
return ret; return ret;
} }
int bch2_btree_node_update_key(struct bch_fs *c, struct btree *b, static void __bch2_btree_node_update_key(struct bch_fs *c,
struct bkey_i_extent *new_key) struct btree_update *as,
struct btree_iter *iter,
struct btree *b, struct btree *new_hash,
struct bkey_i_extent *new_key)
{ {
struct btree_update *as = NULL; struct btree *parent;
struct btree *parent, *new_hash = NULL;
struct btree_iter iter;
struct closure cl;
bool must_rewrite_parent = false; bool must_rewrite_parent = false;
int ret; int ret;
__bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
BTREE_MAX_DEPTH,
b->level, 0);
closure_init_stack(&cl);
ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
if (ret)
return ret;
retry:
down_read(&c->gc_lock);
ret = bch2_btree_iter_traverse(&iter);
if (ret)
goto err;
/* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
if (!new_hash &&
PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
/* bch2_btree_reserve_get will unlock */
do {
ret = bch2_btree_cache_cannibalize_lock(c, &cl);
closure_sync(&cl);
} while (ret == -EAGAIN);
BUG_ON(ret);
new_hash = bch2_btree_node_mem_alloc(c);
}
as = bch2_btree_update_start(c, iter.btree_id,
btree_update_reserve_required(c, b),
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE,
&cl);
if (IS_ERR(as)) {
ret = PTR_ERR(as);
if (ret == -EAGAIN || ret == -EINTR) {
bch2_btree_iter_unlock(&iter);
up_read(&c->gc_lock);
closure_sync(&cl);
goto retry;
}
goto err;
}
mutex_lock(&c->btree_interior_update_lock);
/* /*
* Two corner cases that need to be thought about here: * Two corner cases that need to be thought about here:
* *
@ -1869,22 +1829,12 @@ retry:
if (b->will_make_reachable) if (b->will_make_reachable)
must_rewrite_parent = true; must_rewrite_parent = true;
/* other case: btree node being freed */
if (iter.nodes[b->level] != b) {
/* node has been freed: */
BUG_ON(btree_node_hashed(b));
mutex_unlock(&c->btree_interior_update_lock);
goto err;
}
mutex_unlock(&c->btree_interior_update_lock);
if (must_rewrite_parent) if (must_rewrite_parent)
as->flags |= BTREE_INTERIOR_UPDATE_MUST_REWRITE; as->flags |= BTREE_INTERIOR_UPDATE_MUST_REWRITE;
btree_interior_update_add_node_reference(as, b); btree_interior_update_add_node_reference(as, b);
parent = iter.nodes[b->level + 1]; parent = iter->nodes[b->level + 1];
if (parent) { if (parent) {
if (new_hash) { if (new_hash) {
bkey_copy(&new_hash->key, &new_key->k_i); bkey_copy(&new_hash->key, &new_key->k_i);
@ -1893,8 +1843,8 @@ retry:
BUG_ON(ret); BUG_ON(ret);
} }
bch2_btree_insert_node(as, parent, &iter, bch2_keylist_add(&as->parent_keys, &new_key->k_i);
&keylist_single(&new_key->k_i)); bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
if (new_hash) { if (new_hash) {
mutex_lock(&c->btree_cache.lock); mutex_lock(&c->btree_cache.lock);
@ -1914,7 +1864,7 @@ retry:
BUG_ON(btree_node_root(c, b) != b); BUG_ON(btree_node_root(c, b) != b);
bch2_btree_node_lock_write(b, &iter); bch2_btree_node_lock_write(b, iter);
bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i), bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i),
c->opts.btree_node_size, true, c->opts.btree_node_size, true,
@ -1925,14 +1875,94 @@ retry:
&stats); &stats);
bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
gc_pos_btree_root(b->btree_id)); gc_pos_btree_root(b->btree_id));
bkey_copy(&b->key, &new_key->k_i);
if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
mutex_lock(&c->btree_cache.lock);
bch2_btree_node_hash_remove(&c->btree_cache, b);
bkey_copy(&b->key, &new_key->k_i);
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
BUG_ON(ret);
mutex_unlock(&c->btree_cache.lock);
} else {
bkey_copy(&b->key, &new_key->k_i);
}
btree_update_updated_root(as); btree_update_updated_root(as);
bch2_btree_node_unlock_write(b, &iter); bch2_btree_node_unlock_write(b, iter);
} }
bch2_btree_update_done(as); bch2_btree_update_done(as);
out: }
int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
struct btree *b, struct bkey_i_extent *new_key)
{
struct btree_update *as = NULL;
struct btree *new_hash = NULL;
struct closure cl;
int ret;
closure_init_stack(&cl);
if (!down_read_trylock(&c->gc_lock)) {
bch2_btree_iter_unlock(iter);
down_read(&c->gc_lock);
if (!bch2_btree_iter_relock(iter)) {
ret = -EINTR;
goto err;
}
}
/* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
/* bch2_btree_reserve_get will unlock */
ret = bch2_btree_cache_cannibalize_lock(c, &cl);
if (ret) {
ret = -EINTR;
bch2_btree_iter_unlock(iter);
up_read(&c->gc_lock);
closure_sync(&cl);
down_read(&c->gc_lock);
if (!bch2_btree_iter_relock(iter))
goto err;
}
new_hash = bch2_btree_node_mem_alloc(c);
}
as = bch2_btree_update_start(c, iter->btree_id,
btree_update_reserve_required(c, b),
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE,
&cl);
if (IS_ERR(as)) {
ret = PTR_ERR(as);
if (ret == -EAGAIN)
ret = -EINTR;
if (ret != -EINTR)
goto err;
bch2_btree_iter_unlock(iter);
up_read(&c->gc_lock);
closure_sync(&cl);
down_read(&c->gc_lock);
if (!bch2_btree_iter_relock(iter))
goto err;
}
ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
if (ret)
goto err_free_update;
__bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
err:
if (new_hash) { if (new_hash) {
mutex_lock(&c->btree_cache.lock); mutex_lock(&c->btree_cache.lock);
list_move(&new_hash->list, &c->btree_cache.freeable); list_move(&new_hash->list, &c->btree_cache.freeable);
@ -1941,14 +1971,12 @@ out:
six_unlock_write(&new_hash->lock); six_unlock_write(&new_hash->lock);
six_unlock_intent(&new_hash->lock); six_unlock_intent(&new_hash->lock);
} }
bch2_btree_iter_unlock(&iter);
up_read(&c->gc_lock); up_read(&c->gc_lock);
closure_sync(&cl); closure_sync(&cl);
return ret; return ret;
err: err_free_update:
if (as) bch2_btree_update_free(as);
bch2_btree_update_free(as); goto err;
goto out;
} }
/* Init code: */ /* Init code: */
@ -1962,7 +1990,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
BUG_ON(btree_node_root(c, b)); BUG_ON(btree_node_root(c, b));
__bch2_btree_set_root_inmem(c, b); __bch2_btree_set_root_inmem(c, b);
bch2_btree_set_root_ondisk(c, b); bch2_btree_set_root_ondisk(c, b, READ);
} }
int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id, int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
@ -1998,7 +2026,7 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
BUG_ON(btree_node_root(c, b)); BUG_ON(btree_node_root(c, b));
bch2_btree_set_root_inmem(as, b); bch2_btree_set_root_inmem(as, b);
bch2_btree_set_root_ondisk(c, b); bch2_btree_set_root_ondisk(c, b, WRITE);
bch2_btree_open_bucket_put(c, b); bch2_btree_open_bucket_put(c, b);
six_unlock_intent(&b->lock); six_unlock_intent(&b->lock);

View File

@ -174,9 +174,11 @@ do { \
#define bch2_usage_read_raw(_stats) \ #define bch2_usage_read_raw(_stats) \
({ \ ({ \
typeof(*this_cpu_ptr(_stats)) _acc = { 0 }; \ typeof(*this_cpu_ptr(_stats)) _acc; \
int cpu; \ int cpu; \
\ \
memset(&_acc, 0, sizeof(_acc)); \
\
for_each_possible_cpu(cpu) \ for_each_possible_cpu(cpu) \
bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu)); \ bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu)); \
\ \
@ -479,7 +481,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
{ {
struct bucket_mark old, new; struct bucket_mark old, new;
unsigned saturated; unsigned saturated;
struct bch_dev *ca = c->devs[ptr->dev]; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr); struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr);
unsigned data_type = type == S_META unsigned data_type = type == S_META
? BUCKET_BTREE : BUCKET_DATA; ? BUCKET_BTREE : BUCKET_DATA;

View File

@ -68,16 +68,14 @@ struct bch_dev_usage {
struct bch_fs_usage { struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */ /* all fields are in units of 512 byte sectors: */
/* _uncompressed_ sectors: */ /* _uncompressed_ sectors: */
u64 online_reserved;
u64 available_cache;
struct { struct {
u64 data[S_ALLOC_NR]; u64 data[S_ALLOC_NR];
u64 persistent_reserved; u64 persistent_reserved;
} s[BCH_REPLICAS_MAX]; } s[BCH_REPLICAS_MAX];
u64 online_reserved;
u64 available_cache;
}; };
/* /*

View File

@ -2,6 +2,7 @@
#include "bcachefs.h" #include "bcachefs.h"
#include "bcachefs_ioctl.h" #include "bcachefs_ioctl.h"
#include "chardev.h"
#include "super.h" #include "super.h"
#include "super-io.h" #include "super-io.h"
@ -25,7 +26,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
rcu_read_lock(); rcu_read_lock();
ca = c->devs[dev]; ca = rcu_dereference(c->devs[dev]);
if (ca) if (ca)
percpu_ref_get(&ca->ref); percpu_ref_get(&ca->ref);
rcu_read_unlock(); rcu_read_unlock();
@ -80,7 +81,7 @@ static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
if (copy_from_user(user_devs, arg.devs, if (copy_from_user(user_devs, user_arg->devs,
sizeof(u64) * arg.nr_devs)) sizeof(u64) * arg.nr_devs))
goto err; goto err;

View File

@ -72,14 +72,15 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
} }
} }
static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c) static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
unsigned opt)
{ {
if (c->sb.encryption_type) if (c->sb.encryption_type)
return c->opts.wide_macs return c->opts.wide_macs
? BCH_CSUM_CHACHA20_POLY1305_128 ? BCH_CSUM_CHACHA20_POLY1305_128
: BCH_CSUM_CHACHA20_POLY1305_80; : BCH_CSUM_CHACHA20_POLY1305_80;
return bch2_csum_opt_to_type(c->opts.data_checksum, true); return bch2_csum_opt_to_type(opt, true);
} }
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
@ -143,6 +144,14 @@ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
return nonce; return nonce;
} }
static inline struct nonce null_nonce(void)
{
struct nonce ret;
memset(&ret, 0, sizeof(ret));
return ret;
}
static inline struct nonce extent_nonce(struct bversion version, static inline struct nonce extent_nonce(struct bversion version,
struct bch_extent_crc_unpacked crc) struct bch_extent_crc_unpacked crc)
{ {
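
bch2_data_checksum_type() now takes the checksum option as a parameter instead of always reading c->opts.data_checksum, which is what allows a per-inode setting (the new bi_data_checksum field earlier in this commit) to override the filesystem default. A hedged caller sketch; the "0 means unset, fall back to the fs-wide option" convention is an assumption here, not code from this diff:

static enum bch_csum_type example_csum_type(struct bch_fs *c,
					    unsigned inode_opt)
{
	/* assumed convention: 0 = not set on the inode */
	unsigned opt = inode_opt ?: c->opts.data_checksum;

	return bch2_data_checksum_type(c, opt);
}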

View File

@ -95,11 +95,17 @@ print:
vscnprintf(buf, sizeof(_buf), fmt, args); vscnprintf(buf, sizeof(_buf), fmt, args);
va_end(args); va_end(args);
if (c->opts.fix_errors == FSCK_OPT_EXIT) {
bch_err(c, "%s, exiting", buf);
mutex_unlock(&c->fsck_error_lock);
return FSCK_ERR_EXIT;
}
if (flags & FSCK_CAN_FIX) { if (flags & FSCK_CAN_FIX) {
if (c->opts.fix_errors == FSCK_ERR_ASK) { if (c->opts.fix_errors == FSCK_OPT_ASK) {
printk(KERN_ERR "%s: fix?", buf); printk(KERN_ERR "%s: fix?", buf);
fix = ask_yn(); fix = ask_yn();
} else if (c->opts.fix_errors == FSCK_ERR_YES || } else if (c->opts.fix_errors == FSCK_OPT_YES ||
(c->opts.nochanges && (c->opts.nochanges &&
!(flags & FSCK_CAN_IGNORE))) { !(flags & FSCK_CAN_IGNORE))) {
if (print) if (print)

View File

@ -96,9 +96,10 @@ enum {
}; };
enum fsck_err_opts { enum fsck_err_opts {
FSCK_ERR_NO, FSCK_OPT_EXIT,
FSCK_ERR_YES, FSCK_OPT_YES,
FSCK_ERR_ASK, FSCK_OPT_NO,
FSCK_OPT_ASK,
}; };
enum fsck_err_ret { enum fsck_err_ret {
@ -217,7 +218,7 @@ do { \
#define bcache_io_error(c, bio, fmt, ...) \ #define bcache_io_error(c, bio, fmt, ...) \
do { \ do { \
__bcache_io_error(c, fmt, ##__VA_ARGS__); \ __bcache_io_error(c, fmt, ##__VA_ARGS__); \
(bio)->bi_error = -EIO; \ (bio)->bi_status = BLK_STS_IOERR; \
} while (0) } while (0)
#endif /* _BCACHEFS_ERROR_H */ #endif /* _BCACHEFS_ERROR_H */

View File

@ -18,6 +18,7 @@
#include "extents.h" #include "extents.h"
#include "inode.h" #include "inode.h"
#include "journal.h" #include "journal.h"
#include "super.h"
#include "super-io.h" #include "super-io.h"
#include "util.h" #include "util.h"
#include "xattr.h" #include "xattr.h"
@ -156,6 +157,19 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
return nr_ptrs; return nr_ptrs;
} }
unsigned bch2_extent_nr_good_ptrs(struct bch_fs *c, struct bkey_s_c_extent e)
{
const struct bch_extent_ptr *ptr;
unsigned nr_ptrs = 0;
extent_for_each_ptr(e, ptr)
nr_ptrs += (!ptr->cached &&
bch_dev_bkey_exists(c, ptr->dev)->mi.state !=
BCH_MEMBER_STATE_FAILED);
return nr_ptrs;
}
unsigned bch2_extent_is_compressed(struct bkey_s_c k) unsigned bch2_extent_is_compressed(struct bkey_s_c k)
{ {
struct bkey_s_c_extent e; struct bkey_s_c_extent e;
@ -362,7 +376,7 @@ static bool should_drop_ptr(const struct bch_fs *c,
struct bkey_s_c_extent e, struct bkey_s_c_extent e,
const struct bch_extent_ptr *ptr) const struct bch_extent_ptr *ptr)
{ {
return ptr->cached && ptr_stale(c->devs[ptr->dev], ptr); return ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr);
} }
static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e) static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
@ -411,8 +425,10 @@ static void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
break; break;
case BCH_EXTENT_ENTRY_crc128: case BCH_EXTENT_ENTRY_crc128:
entry->crc128.csum.hi = swab64(entry->crc64.csum_hi); entry->crc128.csum.hi = (__force __le64)
entry->crc128.csum.lo = swab64(entry->crc64.csum_lo); swab64((__force u64) entry->crc128.csum.hi);
entry->crc128.csum.lo = (__force __le64)
swab64((__force u64) entry->crc128.csum.lo);
break; break;
case BCH_EXTENT_ENTRY_ptr: case BCH_EXTENT_ENTRY_ptr:
break; break;
@ -432,10 +448,11 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
const struct bch_extent_ptr *ptr2; const struct bch_extent_ptr *ptr2;
struct bch_dev *ca; struct bch_dev *ca;
if (ptr->dev >= c->sb.nr_devices) if (ptr->dev >= c->sb.nr_devices ||
!c->devs[ptr->dev])
return "pointer to invalid device"; return "pointer to invalid device";
ca = c->devs[ptr->dev]; ca = bch_dev_bkey_exists(c, ptr->dev);
if (!ca) if (!ca)
return "pointer to invalid device"; return "pointer to invalid device";
@ -487,7 +504,9 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
break; break;
case BCH_EXTENT_ENTRY_ptr: case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry); ptr = entry_to_ptr(entry);
ca = c->devs[ptr->dev]; ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
? bch_dev_bkey_exists(c, ptr->dev)
: NULL;
p("ptr: %u:%llu gen %u%s", ptr->dev, p("ptr: %u:%llu gen %u%s", ptr->dev,
(u64) ptr->offset, ptr->gen, (u64) ptr->offset, ptr->gen,
@ -528,7 +547,7 @@ static void extent_pick_read_device(struct bch_fs *c,
struct bch_extent_crc_unpacked crc; struct bch_extent_crc_unpacked crc;
extent_for_each_ptr_crc(e, ptr, crc) { extent_for_each_ptr_crc(e, ptr, crc) {
struct bch_dev *ca = c->devs[ptr->dev]; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (ptr->cached && ptr_stale(ca, ptr)) if (ptr->cached && ptr_stale(ca, ptr))
continue; continue;
@ -621,7 +640,7 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
bool bad; bool bad;
extent_for_each_ptr(e, ptr) { extent_for_each_ptr(e, ptr) {
ca = c->devs[ptr->dev]; ca = bch_dev_bkey_exists(c, ptr->dev);
g = PTR_BUCKET(ca, ptr); g = PTR_BUCKET(ca, ptr);
replicas++; replicas++;
@ -1730,7 +1749,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier)); memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier));
extent_for_each_ptr(e, ptr) { extent_for_each_ptr(e, ptr) {
ca = c->devs[ptr->dev]; ca = bch_dev_bkey_exists(c, ptr->dev);
g = PTR_BUCKET(ca, ptr); g = PTR_BUCKET(ca, ptr);
replicas++; replicas++;
ptrs_per_tier[ca->mi.tier]++; ptrs_per_tier[ca->mi.tier]++;
@ -1844,7 +1863,7 @@ static void bch2_extent_to_text(struct bch_fs *c, char *buf,
static unsigned PTR_TIER(struct bch_fs *c, static unsigned PTR_TIER(struct bch_fs *c,
const struct bch_extent_ptr *ptr) const struct bch_extent_ptr *ptr)
{ {
return c->devs[ptr->dev]->mi.tier; return bch_dev_bkey_exists(c, ptr->dev)->mi.tier;
} }
static void bch2_extent_crc_init(union bch_extent_crc *crc, static void bch2_extent_crc_init(union bch_extent_crc *crc,
@ -1971,14 +1990,10 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
struct bkey_s_extent e) struct bkey_s_extent e)
{ {
struct bch_extent_ptr *ptr; struct bch_extent_ptr *ptr;
unsigned tier = 0, nr_cached = 0, nr_good = 0; unsigned tier = 0, nr_cached = 0;
unsigned nr_good = bch2_extent_nr_good_ptrs(c, e.c);
bool have_higher_tier; bool have_higher_tier;
extent_for_each_ptr(e, ptr)
if (!ptr->cached &&
c->devs[ptr->dev]->mi.state != BCH_MEMBER_STATE_FAILED)
nr_good++;
if (nr_good <= c->opts.data_replicas) if (nr_good <= c->opts.data_replicas)
return; return;
@ -2103,7 +2118,7 @@ static enum merge_result bch2_extent_merge(struct bch_fs *c,
return BCH_MERGE_NOMERGE; return BCH_MERGE_NOMERGE;
/* We don't allow extents to straddle buckets: */ /* We don't allow extents to straddle buckets: */
ca = c->devs[lp->dev]; ca = bch_dev_bkey_exists(c, lp->dev);
if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
return BCH_MERGE_NOMERGE; return BCH_MERGE_NOMERGE;
@ -2347,6 +2362,30 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
} }
} }
int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
{
struct btree_iter iter;
struct bpos end = pos;
struct bkey_s_c k;
int ret = 0;
end.offset += size;
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
BTREE_ITER_WITH_HOLES, k) {
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
if (!bch2_extent_is_fully_allocated(k)) {
ret = -ENOSPC;
break;
}
}
bch2_btree_iter_unlock(&iter);
return ret;
}
const struct bkey_ops bch2_bkey_extent_ops = { const struct bkey_ops bch2_bkey_extent_ops = {
.key_invalid = bch2_extent_invalid, .key_invalid = bch2_extent_invalid,
.key_debugcheck = bch2_extent_debugcheck, .key_debugcheck = bch2_extent_debugcheck,

View File

@ -45,6 +45,7 @@ bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent); unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c); unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
unsigned bch2_extent_nr_good_ptrs(struct bch_fs *, struct bkey_s_c_extent);
unsigned bch2_extent_is_compressed(struct bkey_s_c); unsigned bch2_extent_is_compressed(struct bkey_s_c);
bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent, bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
@ -243,14 +244,14 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
case BCH_EXTENT_CRC32: case BCH_EXTENT_CRC32:
return (struct bch_extent_crc_unpacked) { return (struct bch_extent_crc_unpacked) {
common_fields(crc->crc32), common_fields(crc->crc32),
.csum.lo = crc->crc32.csum, .csum.lo = (__force __le64) crc->crc32.csum,
}; };
case BCH_EXTENT_CRC64: case BCH_EXTENT_CRC64:
return (struct bch_extent_crc_unpacked) { return (struct bch_extent_crc_unpacked) {
common_fields(crc->crc64), common_fields(crc->crc64),
.nonce = crc->crc64.nonce, .nonce = crc->crc64.nonce,
.csum.lo = crc->crc64.csum_lo, .csum.lo = (__force __le64) crc->crc64.csum_lo,
.csum.hi = crc->crc64.csum_hi, .csum.hi = (__force __le64) crc->crc64.csum_hi,
}; };
case BCH_EXTENT_CRC128: case BCH_EXTENT_CRC128:
return (struct bch_extent_crc_unpacked) { return (struct bch_extent_crc_unpacked) {
@ -425,4 +426,6 @@ bool bch2_cut_front(struct bpos, struct bkey_i *);
bool bch2_cut_back(struct bpos, struct bkey *); bool bch2_cut_back(struct bpos, struct bkey *);
void bch2_key_resize(struct bkey *, unsigned); void bch2_key_resize(struct bkey *, unsigned);
int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);
#endif /* _BCACHEFS_EXTENTS_H */ #endif /* _BCACHEFS_EXTENTS_H */

File diff suppressed because it is too large

View File

@ -75,7 +75,7 @@ do { \
/* Set VFS inode flags from bcachefs inode: */ /* Set VFS inode flags from bcachefs inode: */
void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
{ {
set_flags(bch_flags_to_vfs, inode->ei_flags, inode->v.i_flags); set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
} }
static int bch2_inode_flags_set(struct bch_inode_info *inode, static int bch2_inode_flags_set(struct bch_inode_info *inode,
@ -99,13 +99,13 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode,
return -EINVAL; return -EINVAL;
bi->bi_flags = newflags; bi->bi_flags = newflags;
inode->v.i_ctime = current_fs_time(inode->v.i_sb); inode->v.i_ctime = current_time(&inode->v);
return 0; return 0;
} }
static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg) static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
{ {
unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_flags); unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);
return put_user(flags, arg); return put_user(flags, arg);
} }
@ -153,7 +153,7 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
{ {
struct fsxattr fa = { 0 }; struct fsxattr fa = { 0 };
fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_flags); fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
return copy_to_user(arg, &fa, sizeof(fa)); return copy_to_user(arg, &fa, sizeof(fa));
} }

View File

@ -12,6 +12,7 @@
#include "fs-ioctl.h" #include "fs-ioctl.h"
#include "fsck.h" #include "fsck.h"
#include "inode.h" #include "inode.h"
#include "io.h"
#include "journal.h" #include "journal.h"
#include "keylist.h" #include "keylist.h"
#include "super.h" #include "super.h"
@ -130,10 +131,8 @@ int __must_check __bch2_write_inode(struct bch_fs *c,
BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i)); BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i));
} while (ret == -EINTR); } while (ret == -EINTR);
if (!ret) { if (!ret)
inode->ei_size = inode_u.bi_size; inode->ei_inode = inode_u;
inode->ei_flags = inode_u.bi_flags;
}
out: out:
bch2_btree_iter_unlock(&iter); bch2_btree_iter_unlock(&iter);
@ -146,7 +145,7 @@ int __must_check bch2_write_inode(struct bch_fs *c,
return __bch2_write_inode(c, inode, NULL, NULL); return __bch2_write_inode(c, inode, NULL, NULL);
} }
int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode) static int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
{ {
int ret; int ret;
@ -158,7 +157,7 @@ int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
return ret; return ret;
} }
int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode) static int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode)
{ {
int ret = 0; int ret = 0;
@ -223,7 +222,9 @@ static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c,
bch2_inode_init(c, &inode_u, bch2_inode_init(c, &inode_u,
i_uid_read(&inode->v), i_uid_read(&inode->v),
i_gid_read(&inode->v), i_gid_read(&inode->v),
inode->v.i_mode, rdev); inode->v.i_mode, rdev,
&dir->ei_inode);
ret = bch2_inode_create(c, &inode_u, ret = bch2_inode_create(c, &inode_u,
BLOCKDEV_INODE_MAX, 0, BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint); &c->unused_inode_hint);
@ -277,7 +278,7 @@ static int bch2_vfs_dirent_create(struct bch_fs *c,
if (unlikely(ret)) if (unlikely(ret))
return ret; return ret;
dir->v.i_mtime = dir->v.i_ctime = current_fs_time(c->vfs_sb); dir->v.i_mtime = dir->v.i_ctime = current_time(&dir->v);
mark_inode_dirty_sync(&dir->v); mark_inode_dirty_sync(&dir->v);
return 0; return 0;
} }
@ -344,7 +345,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
lockdep_assert_held(&inode->v.i_rwsem); lockdep_assert_held(&inode->v.i_rwsem);
inode->v.i_ctime = current_fs_time(dir->v.i_sb); inode->v.i_ctime = current_time(&dir->v);
ret = bch2_inc_nlink(c, inode); ret = bch2_inc_nlink(c, inode);
if (ret) if (ret)
@ -473,7 +474,7 @@ static int bch2_rename(struct bch_fs *c,
{ {
struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode); struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode); struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
struct timespec now = current_fs_time(old_dir->v.i_sb); struct timespec now = current_time(&old_dir->v);
int ret; int ret;
lockdep_assert_held(&old_dir->v.i_rwsem); lockdep_assert_held(&old_dir->v.i_rwsem);
@ -551,7 +552,7 @@ static int bch2_rename_exchange(struct bch_fs *c,
{ {
struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode); struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode); struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
struct timespec now = current_fs_time(old_dir->v.i_sb); struct timespec now = current_time(&old_dir->v);
int ret; int ret;
ret = bch2_dirent_rename(c, ret = bch2_dirent_rename(c,
@ -909,10 +910,8 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);
inode->ei_journal_seq = 0; inode->ei_journal_seq = 0;
inode->ei_size = bi->bi_size;
inode->ei_flags = bi->bi_flags;
atomic64_set(&inode->ei_sectors, bi->bi_sectors);
inode->ei_str_hash = bch2_hash_info_init(c, bi); inode->ei_str_hash = bch2_hash_info_init(c, bi);
inode->ei_inode = *bi;
bch2_inode_flags_to_vfs(inode); bch2_inode_flags_to_vfs(inode);
@ -949,8 +948,6 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
inode_init_once(&inode->v); inode_init_once(&inode->v);
mutex_init(&inode->ei_update_lock); mutex_init(&inode->ei_update_lock);
inode->ei_journal_seq = 0; inode->ei_journal_seq = 0;
atomic_long_set(&inode->ei_size_dirty_count, 0);
atomic_long_set(&inode->ei_sectors_dirty_count, 0);
return &inode->v; return &inode->v;
} }
@ -995,12 +992,6 @@ static void bch2_evict_inode(struct inode *vinode)
truncate_inode_pages_final(&inode->v.i_data); truncate_inode_pages_final(&inode->v.i_data);
if (!bch2_journal_error(&c->journal) && !is_bad_inode(&inode->v)) {
/* XXX - we want to check this stuff iff there weren't IO errors: */
BUG_ON(atomic_long_read(&inode->ei_sectors_dirty_count));
BUG_ON(atomic64_read(&inode->ei_sectors) != inode->v.i_blocks);
}
clear_inode(&inode->v); clear_inode(&inode->v);
if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
@ -1272,9 +1263,16 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
sb->s_magic = BCACHEFS_STATFS_MAGIC; sb->s_magic = BCACHEFS_STATFS_MAGIC;
sb->s_time_gran = c->sb.time_precision; sb->s_time_gran = c->sb.time_precision;
c->vfs_sb = sb; c->vfs_sb = sb;
sb->s_bdi = &c->bdi;
strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
ret = super_setup_bdi(sb);
if (ret)
goto err_put_super;
sb->s_bdi->congested_fn = bch2_congested;
sb->s_bdi->congested_data = c;
sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
for_each_online_member(ca, c, i) { for_each_online_member(ca, c, i) {
struct block_device *bdev = ca->disk_sb.bdev; struct block_device *bdev = ca->disk_sb.bdev;

View File

@ -1,6 +1,7 @@
#ifndef _BCACHEFS_FS_H #ifndef _BCACHEFS_FS_H
#define _BCACHEFS_FS_H #define _BCACHEFS_FS_H
#include "opts.h"
#include "str_hash.h" #include "str_hash.h"
#include <linux/seqlock.h> #include <linux/seqlock.h>
@ -11,22 +12,12 @@ struct bch_inode_info {
struct mutex ei_update_lock; struct mutex ei_update_lock;
u64 ei_journal_seq; u64 ei_journal_seq;
unsigned long ei_last_dirtied;
atomic_long_t ei_size_dirty_count;
/*
* these are updated whenever we update the inode in the btree - for
* e.g. fsync
*/
u64 ei_size;
u32 ei_flags;
atomic_long_t ei_sectors_dirty_count;
atomic64_t ei_sectors;
struct bch_hash_info ei_str_hash; struct bch_hash_info ei_str_hash;
unsigned long ei_last_dirtied; /* copy of inode in btree: */
struct bch_inode_unpacked ei_inode;
}; };
#define to_bch_ei(_inode) \ #define to_bch_ei(_inode) \

View File

@ -204,7 +204,7 @@ static int hash_check_key(const struct bch_hash_desc desc,
"hash table key at wrong offset: %llu, " "hash table key at wrong offset: %llu, "
"hashed to %llu chain starts at %llu\n%s", "hashed to %llu chain starts at %llu\n%s",
k.k->p.offset, hashed, h->chain.pos.offset, k.k->p.offset, hashed, h->chain.pos.offset,
bch2_bkey_val_to_text(c, desc.btree_id, bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
buf, sizeof(buf), k))) { buf, sizeof(buf), k))) {
ret = hash_redo_key(desc, h, c, k_iter, k, hashed); ret = hash_redo_key(desc, h, c, k_iter, k, hashed);
if (ret) { if (ret) {
@ -224,7 +224,7 @@ static int hash_check_key(const struct bch_hash_desc desc,
if (fsck_err_on(k2.k->type == desc.key_type && if (fsck_err_on(k2.k->type == desc.key_type &&
!desc.cmp_bkey(k, k2), c, !desc.cmp_bkey(k, k2), c,
"duplicate hash table keys:\n%s", "duplicate hash table keys:\n%s",
bch2_bkey_val_to_text(c, desc.btree_id, bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
buf, sizeof(buf), k))) { buf, sizeof(buf), k))) {
ret = bch2_hash_delete_at(desc, &h->info, &h->iter, NULL); ret = bch2_hash_delete_at(desc, &h->info, &h->iter, NULL);
if (ret) if (ret)
@ -397,9 +397,9 @@ static int check_dirents(struct bch_fs *c)
if (fsck_err_on(have_target && if (fsck_err_on(have_target &&
d.v->d_type != d.v->d_type !=
mode_to_type(le16_to_cpu(target.bi_mode)), c, mode_to_type(target.bi_mode), c,
"incorrect d_type: should be %u:\n%s", "incorrect d_type: should be %u:\n%s",
mode_to_type(le16_to_cpu(target.bi_mode)), mode_to_type(target.bi_mode),
bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS, bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
buf, sizeof(buf), k))) { buf, sizeof(buf), k))) {
struct bkey_i_dirent *n; struct bkey_i_dirent *n;
@ -411,7 +411,7 @@ static int check_dirents(struct bch_fs *c)
} }
bkey_reassemble(&n->k_i, d.s_c); bkey_reassemble(&n->k_i, d.s_c);
n->v.d_type = mode_to_type(le16_to_cpu(target.bi_mode)); n->v.d_type = mode_to_type(target.bi_mode);
ret = bch2_btree_insert_at(c, NULL, NULL, NULL, ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
BTREE_INSERT_NOFAIL, BTREE_INSERT_NOFAIL,
@ -493,7 +493,8 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
fsck_err: fsck_err:
return ret; return ret;
create_root: create_root:
bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0); bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO,
0, NULL);
root_inode->bi_inum = BCACHEFS_ROOT_INO; root_inode->bi_inum = BCACHEFS_ROOT_INO;
bch2_inode_pack(&packed, root_inode); bch2_inode_pack(&packed, root_inode);
@ -545,7 +546,8 @@ create_lostfound:
if (ret) if (ret)
return ret; return ret;
bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0); bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO,
0, root_inode);
ret = bch2_inode_create(c, lostfound_inode, BLOCKDEV_INODE_MAX, 0, ret = bch2_inode_create(c, lostfound_inode, BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint); &c->unused_inode_hint);

View File

@ -198,6 +198,12 @@ static const char *bch2_inode_invalid(const struct bch_fs *c,
if (bch2_inode_unpack(inode, &unpacked)) if (bch2_inode_unpack(inode, &unpacked))
return "invalid variable length fields"; return "invalid variable length fields";
if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
return "invalid data checksum type";
if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
return "invalid data checksum type";
return NULL; return NULL;
} }
case BCH_INODE_BLOCKDEV: case BCH_INODE_BLOCKDEV:
@ -221,6 +227,7 @@ static const char *bch2_inode_invalid(const struct bch_fs *c,
static void bch2_inode_to_text(struct bch_fs *c, char *buf, static void bch2_inode_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k) size_t size, struct bkey_s_c k)
{ {
char *out = buf, *end = out + size;
struct bkey_s_c_inode inode; struct bkey_s_c_inode inode;
struct bch_inode_unpacked unpacked; struct bch_inode_unpacked unpacked;
@ -228,11 +235,14 @@ static void bch2_inode_to_text(struct bch_fs *c, char *buf,
case BCH_INODE_FS: case BCH_INODE_FS:
inode = bkey_s_c_to_inode(k); inode = bkey_s_c_to_inode(k);
if (bch2_inode_unpack(inode, &unpacked)) { if (bch2_inode_unpack(inode, &unpacked)) {
scnprintf(buf, size, "(unpack error)"); out += scnprintf(out, end - out, "(unpack error)");
break; break;
} }
scnprintf(buf, size, "i_size %llu", unpacked.bi_size); #define BCH_INODE_FIELD(_name, _bits) \
out += scnprintf(out, end - out, #_name ": %llu ", (u64) unpacked._name);
BCH_INODE_FIELDS()
#undef BCH_INODE_FIELD
break; break;
} }
} }
@ -243,9 +253,12 @@ const struct bkey_ops bch2_bkey_inode_ops = {
}; };
void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
uid_t uid, gid_t gid, umode_t mode, dev_t rdev) uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
struct bch_inode_unpacked *parent)
{ {
s64 now = timespec_to_bch2_time(c, CURRENT_TIME); s64 now = timespec_to_bch2_time(c,
timespec_trunc(current_kernel_time(),
c->sb.time_precision));
memset(inode_u, 0, sizeof(*inode_u)); memset(inode_u, 0, sizeof(*inode_u));
@ -261,6 +274,12 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
inode_u->bi_mtime = now; inode_u->bi_mtime = now;
inode_u->bi_ctime = now; inode_u->bi_ctime = now;
inode_u->bi_otime = now; inode_u->bi_otime = now;
if (parent) {
#define BCH_INODE_FIELD(_name) inode_u->_name = parent->_name;
BCH_INODE_FIELDS_INHERIT()
#undef BCH_INODE_FIELD
}
} }
int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u, int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
@ -416,7 +435,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
struct bch_inode_unpacked inode_u; struct bch_inode_unpacked inode_u;
if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
bi_generation = cpu_to_le32(inode_u.bi_generation) + 1; bi_generation = inode_u.bi_generation + 1;
break; break;
} }
case BCH_INODE_GENERATION: { case BCH_INODE_GENERATION: {

View File

@ -1,6 +1,8 @@
#ifndef _BCACHEFS_INODE_H #ifndef _BCACHEFS_INODE_H
#define _BCACHEFS_INODE_H #define _BCACHEFS_INODE_H
#include "opts.h"
#include <linux/math64.h> #include <linux/math64.h>
extern const struct bkey_ops bch2_bkey_inode_ops; extern const struct bkey_ops bch2_bkey_inode_ops;
@ -28,7 +30,8 @@ void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *)
int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
uid_t, gid_t, umode_t, dev_t); uid_t, gid_t, umode_t, dev_t,
struct bch_inode_unpacked *);
int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *, int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *,
u64, u64, u64 *); u64, u64, u64 *);
int bch2_inode_truncate(struct bch_fs *, u64, u64, int bch2_inode_truncate(struct bch_fs *, u64, u64,
@ -55,6 +58,45 @@ static inline u64 timespec_to_bch2_time(struct bch_fs *c, struct timespec ts)
return div_s64(ns, c->sb.time_precision); return div_s64(ns, c->sb.time_precision);
} }
static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
{
struct bch_io_opts ret = { 0 };
#define BCH_INODE_OPT(_name, _bits) \
if (inode->bi_##_name) \
opt_set(ret, _name, inode->bi_##_name - 1);
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
return ret;
}
static inline void __bch2_inode_opt_set(struct bch_inode_unpacked *inode,
enum bch_opt_id id, u64 v)
{
switch (id) {
#define BCH_INODE_OPT(_name, ...) \
case Opt_##_name: \
inode->bi_##_name = v; \
break;
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
default:
BUG();
}
}
static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
enum bch_opt_id id, u64 v)
{
return __bch2_inode_opt_set(inode, id, v + 1);
}
static inline void bch2_inode_opt_clear(struct bch_inode_unpacked *inode,
enum bch_opt_id id)
{
return __bch2_inode_opt_set(inode, id, 0);
}
#ifdef CONFIG_BCACHEFS_DEBUG #ifdef CONFIG_BCACHEFS_DEBUG
void bch2_inode_pack_test(void); void bch2_inode_pack_test(void);
#else #else

View File

@ -20,6 +20,7 @@
#include "journal.h" #include "journal.h"
#include "keylist.h" #include "keylist.h"
#include "move.h" #include "move.h"
#include "super.h"
#include "super-io.h" #include "super-io.h"
#include <linux/blkdev.h> #include <linux/blkdev.h>
@ -139,7 +140,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
const struct bch_extent_ptr *ptr; const struct bch_extent_ptr *ptr;
struct bch_write_bio *n; struct bch_write_bio *n;
struct bch_dev *ca; struct bch_dev *ca;
unsigned ptr_idx = 0;
BUG_ON(c->opts.nochanges); BUG_ON(c->opts.nochanges);
@ -147,7 +147,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
!c->devs[ptr->dev]); !c->devs[ptr->dev]);
ca = c->devs[ptr->dev]; ca = bch_dev_bkey_exists(c, ptr->dev);
if (ptr + 1 < &extent_entry_last(e)->ptr) { if (ptr + 1 < &extent_entry_last(e)->ptr) {
n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO, n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
@ -168,7 +168,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
n->c = c; n->c = c;
n->ca = ca; n->ca = ca;
n->ptr_idx = ptr_idx++;
n->submit_time_us = local_clock_us(); n->submit_time_us = local_clock_us();
n->bio.bi_iter.bi_sector = ptr->offset; n->bio.bi_iter.bi_sector = ptr->offset;
@ -184,7 +183,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
submit_bio(&n->bio); submit_bio(&n->bio);
} else { } else {
n->have_io_ref = false; n->have_io_ref = false;
bcache_io_error(c, &n->bio, "device has been removed"); n->bio.bi_status = BLK_STS_REMOVED;
bio_endio(&n->bio); bio_endio(&n->bio);
} }
} }
@ -201,9 +200,12 @@ static void bch2_write_done(struct closure *cl)
if (!op->error && (op->flags & BCH_WRITE_FLUSH)) if (!op->error && (op->flags & BCH_WRITE_FLUSH))
op->error = bch2_journal_error(&op->c->journal); op->error = bch2_journal_error(&op->c->journal);
bch2_disk_reservation_put(op->c, &op->res); if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
bch2_disk_reservation_put(op->c, &op->res);
percpu_ref_put(&op->c->writes); percpu_ref_put(&op->c->writes);
bch2_keylist_free(&op->insert_keys, op->inline_keys); bch2_keylist_free(&op->insert_keys, op->inline_keys);
op->flags &= ~(BCH_WRITE_DONE|BCH_WRITE_LOOPED);
closure_return(cl); closure_return(cl);
} }
@ -244,9 +246,37 @@ static void bch2_write_index(struct closure *cl)
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c; struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys; struct keylist *keys = &op->insert_keys;
struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
struct bkey_i *src, *dst = keys->keys, *n;
int ret;
op->flags |= BCH_WRITE_LOOPED; op->flags |= BCH_WRITE_LOOPED;
for (src = keys->keys; src != keys->top; src = n) {
n = bkey_next(src);
bkey_copy(dst, src);
e = bkey_i_to_s_extent(dst);
extent_for_each_ptr_backwards(e, ptr)
if (test_bit(ptr->dev, op->failed.d))
bch2_extent_drop_ptr(e, ptr);
ret = bch2_extent_nr_ptrs(e.c)
? bch2_check_mark_super(c, e.c, BCH_DATA_USER)
: -EIO;
if (ret) {
keys->top = keys->keys;
op->error = ret;
op->flags |= BCH_WRITE_DONE;
goto err;
}
dst = bkey_next(dst);
}
keys->top = dst;
if (!bch2_keylist_empty(keys)) { if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys); u64 sectors_start = keylist_sectors(keys);
int ret = op->index_update_fn(op); int ret = op->index_update_fn(op);
@ -260,7 +290,7 @@ static void bch2_write_index(struct closure *cl)
op->error = ret; op->error = ret;
} }
} }
err:
bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets); bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets);
if (!(op->flags & BCH_WRITE_DONE)) if (!(op->flags & BCH_WRITE_DONE))
@ -276,43 +306,6 @@ static void bch2_write_index(struct closure *cl)
} }
} }
static void bch2_write_io_error(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct keylist *keys = &op->insert_keys;
struct bch_fs *c = op->c;
struct bch_extent_ptr *ptr;
struct bkey_i *k;
int ret;
for_each_keylist_key(keys, k) {
struct bkey_i *n = bkey_next(k);
struct bkey_s_extent e = bkey_i_to_s_extent(k);
extent_for_each_ptr_backwards(e, ptr)
if (test_bit(ptr->dev, op->failed.d))
bch2_extent_drop_ptr(e, ptr);
memmove(bkey_next(k), n, (void *) keys->top - (void *) n);
keys->top_p -= (u64 *) n - (u64 *) bkey_next(k);
ret = bch2_extent_nr_ptrs(e.c)
? bch2_check_mark_super(c, e.c, BCH_DATA_USER)
: -EIO;
if (ret) {
keys->top = keys->keys;
op->error = ret;
op->flags |= BCH_WRITE_DONE;
break;
}
}
memset(&op->failed, 0, sizeof(op->failed));
bch2_write_index(cl);
return;
}
static void bch2_write_endio(struct bio *bio) static void bch2_write_endio(struct bio *bio)
{ {
struct closure *cl = bio->bi_private; struct closure *cl = bio->bi_private;
@ -324,10 +317,8 @@ static void bch2_write_endio(struct bio *bio)
bch2_latency_acct(ca, wbio->submit_time_us, WRITE); bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
if (bch2_dev_io_err_on(bio->bi_error, ca, "data write")) { if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
set_bit(ca->dev_idx, op->failed.d); set_bit(ca->dev_idx, op->failed.d);
set_closure_fn(cl, bch2_write_io_error, index_update_wq(op));
}
if (wbio->have_io_ref) if (wbio->have_io_ref)
percpu_ref_put(&ca->io_ref); percpu_ref_put(&ca->io_ref);
@ -706,11 +697,6 @@ do_write:
key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write),
BCH_DATA_USER);
if (ret)
goto err;
dst->bi_end_io = bch2_write_endio; dst->bi_end_io = bch2_write_endio;
dst->bi_private = &op->cl; dst->bi_private = &op->cl;
bio_set_op_attrs(dst, REQ_OP_WRITE, 0); bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
@ -870,7 +856,8 @@ void bch2_write(struct closure *cl)
!percpu_ref_tryget(&c->writes)) { !percpu_ref_tryget(&c->writes)) {
__bcache_io_error(c, "read only"); __bcache_io_error(c, "read only");
op->error = -EROFS; op->error = -EROFS;
bch2_disk_reservation_put(c, &op->res); if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
bch2_disk_reservation_put(c, &op->res);
closure_return(cl); closure_return(cl);
} }
@ -916,7 +903,10 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
swap(bio->bi_vcnt, rbio->bio.bi_vcnt); swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
rbio->promote = NULL; rbio->promote = NULL;
__bch2_write_op_init(&op->write.op, c); bch2_write_op_init(&op->write.op, c);
op->write.op.csum_type = bch2_data_checksum_type(c, rbio->opts.data_checksum);
op->write.op.compression_type =
bch2_compression_opt_to_type(rbio->opts.compression);
op->write.move_dev = -1; op->write.move_dev = -1;
op->write.op.devs = c->fastest_devs; op->write.op.devs = c->fastest_devs;
@ -1060,7 +1050,7 @@ static void bch2_rbio_retry(struct work_struct *work)
if (rbio->split) if (rbio->split)
rbio = bch2_rbio_free(rbio); rbio = bch2_rbio_free(rbio);
else else
rbio->bio.bi_error = 0; rbio->bio.bi_status = 0;
if (!(flags & BCH_READ_NODECODE)) if (!(flags & BCH_READ_NODECODE))
flags |= BCH_READ_MUST_CLONE; flags |= BCH_READ_MUST_CLONE;
@ -1073,7 +1063,8 @@ static void bch2_rbio_retry(struct work_struct *work)
__bch2_read(c, rbio, iter, inode, &avoid, flags); __bch2_read(c, rbio, iter, inode, &avoid, flags);
} }
static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error) static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
blk_status_t error)
{ {
rbio->retry = retry; rbio->retry = retry;
@ -1081,7 +1072,7 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error)
return; return;
if (retry == READ_ERR) { if (retry == READ_ERR) {
bch2_rbio_parent(rbio)->bio.bi_error = error; bch2_rbio_parent(rbio)->bio.bi_status = error;
bch2_rbio_done(rbio); bch2_rbio_done(rbio);
} else { } else {
bch2_rbio_punt(rbio, bch2_rbio_retry, bch2_rbio_punt(rbio, bch2_rbio_retry,
@ -1236,7 +1227,7 @@ csum_err:
*/ */
if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
rbio->flags |= BCH_READ_MUST_BOUNCE; rbio->flags |= BCH_READ_MUST_BOUNCE;
bch2_rbio_error(rbio, READ_RETRY, -EIO); bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
return; return;
} }
@ -1245,13 +1236,13 @@ csum_err:
rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
csum.hi, csum.lo, crc.csum_type); csum.hi, csum.lo, crc.csum_type);
bch2_rbio_error(rbio, READ_RETRY_AVOID, -EIO); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
return; return;
decompression_err: decompression_err:
__bcache_io_error(c, "decompression error, inode %llu offset %llu", __bcache_io_error(c, "decompression error, inode %llu offset %llu",
rbio->pos.inode, rbio->pos.inode,
(u64) rbio->bvec_iter.bi_sector); (u64) rbio->bvec_iter.bi_sector);
bch2_rbio_error(rbio, READ_ERR, -EIO); bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
return; return;
} }
@ -1270,8 +1261,8 @@ static void bch2_read_endio(struct bio *bio)
if (!rbio->split) if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io; rbio->bio.bi_end_io = rbio->end_io;
if (bch2_dev_io_err_on(bio->bi_error, rbio->pick.ca, "data read")) { if (bch2_dev_io_err_on(bio->bi_status, rbio->pick.ca, "data read")) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_error); bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
return; return;
} }
@ -1281,9 +1272,9 @@ static void bch2_read_endio(struct bio *bio)
atomic_long_inc(&c->read_realloc_races); atomic_long_inc(&c->read_realloc_races);
if (rbio->flags & BCH_READ_RETRY_IF_STALE) if (rbio->flags & BCH_READ_RETRY_IF_STALE)
bch2_rbio_error(rbio, READ_RETRY, -EINTR); bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
else else
bch2_rbio_error(rbio, READ_ERR, -EINTR); bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
return; return;
} }
@ -1360,7 +1351,8 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
rbio = rbio_init(bio_alloc_bioset(GFP_NOIO, rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
DIV_ROUND_UP(sectors, PAGE_SECTORS), DIV_ROUND_UP(sectors, PAGE_SECTORS),
&c->bio_read_split)); &c->bio_read_split),
orig->opts);
bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
split = true; split = true;
@ -1374,7 +1366,8 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
* lose the error) * lose the error)
*/ */
rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO, rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
&c->bio_read_split)); &c->bio_read_split),
orig->opts);
rbio->bio.bi_iter = iter; rbio->bio.bi_iter = iter;
split = true; split = true;
} else { } else {
@ -1428,6 +1421,8 @@ noclone:
bch2_read_endio(&rbio->bio); bch2_read_endio(&rbio->bio);
ret = rbio->retry; ret = rbio->retry;
if (rbio->split)
rbio = bch2_rbio_free(rbio);
if (!ret) if (!ret)
bch2_rbio_done(rbio); bch2_rbio_done(rbio);
} }
@ -1503,7 +1498,7 @@ err:
* possibly bigger than the memory that was * possibly bigger than the memory that was
* originally allocated) * originally allocated)
*/ */
rbio->bio.bi_error = -EINTR; rbio->bio.bi_status = BLK_STS_AGAIN;
bio_endio(&rbio->bio); bio_endio(&rbio->bio);
return; return;
} }
@ -1561,6 +1556,7 @@ retry:
case READ_RETRY: case READ_RETRY:
goto retry; goto retry;
case READ_ERR: case READ_ERR:
rbio->bio.bi_status = BLK_STS_IOERR;
bio_endio(&rbio->bio); bio_endio(&rbio->bio);
return; return;
}; };

View File

@ -21,6 +21,8 @@ void bch2_latency_acct(struct bch_dev *, unsigned, int);
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *); enum bch_data_type, const struct bkey_i *);
#define BLK_STS_REMOVED ((__force blk_status_t)128)
enum bch_write_flags { enum bch_write_flags {
BCH_WRITE_ALLOC_NOWAIT = (1 << 0), BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
BCH_WRITE_CACHED = (1 << 1), BCH_WRITE_CACHED = (1 << 1),
@ -29,11 +31,12 @@ enum bch_write_flags {
BCH_WRITE_PAGES_STABLE = (1 << 4), BCH_WRITE_PAGES_STABLE = (1 << 4),
BCH_WRITE_PAGES_OWNED = (1 << 5), BCH_WRITE_PAGES_OWNED = (1 << 5),
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6),
BCH_WRITE_NOPUT_RESERVATION = (1 << 7),
/* Internal: */ /* Internal: */
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 7), BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 8),
BCH_WRITE_DONE = (1 << 8), BCH_WRITE_DONE = (1 << 9),
BCH_WRITE_LOOPED = (1 << 9), BCH_WRITE_LOOPED = (1 << 10),
}; };
static inline u64 *op_journal_seq(struct bch_write_op *op) static inline u64 *op_journal_seq(struct bch_write_op *op)
@ -42,6 +45,12 @@ static inline u64 *op_journal_seq(struct bch_write_op *op)
? op->journal_seq_p : &op->journal_seq; ? op->journal_seq_p : &op->journal_seq;
} }
static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq)
{
op->journal_seq_p = journal_seq;
op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
}
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{ {
return op->alloc_reserve == RESERVE_MOVINGGC return op->alloc_reserve == RESERVE_MOVINGGC
@ -51,14 +60,14 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
int bch2_write_index_default(struct bch_write_op *); int bch2_write_index_default(struct bch_write_op *);
static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c) static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
{ {
op->c = c; op->c = c;
op->io_wq = index_update_wq(op); op->io_wq = index_update_wq(op);
op->flags = 0; op->flags = 0;
op->written = 0; op->written = 0;
op->error = 0; op->error = 0;
op->csum_type = bch2_data_checksum_type(c); op->csum_type = bch2_data_checksum_type(c, c->opts.data_checksum);
op->compression_type = op->compression_type =
bch2_compression_opt_to_type(c->opts.compression); bch2_compression_opt_to_type(c->opts.compression);
op->nr_replicas = 0; op->nr_replicas = 0;
@ -75,27 +84,6 @@ static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *
op->index_update_fn = bch2_write_index_default; op->index_update_fn = bch2_write_index_default;
} }
static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
struct disk_reservation res,
struct bch_devs_mask *devs,
struct write_point_specifier write_point,
struct bpos pos,
u64 *journal_seq, unsigned flags)
{
__bch2_write_op_init(op, c);
op->flags = flags;
op->nr_replicas = res.nr_replicas;
op->pos = pos;
op->res = res;
op->devs = devs;
op->write_point = write_point;
if (journal_seq) {
op->journal_seq_p = journal_seq;
op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
}
}
void bch2_write(struct closure *); void bch2_write(struct closure *);
static inline struct bch_write_bio *wbio_init(struct bio *bio) static inline struct bch_write_bio *wbio_init(struct bio *bio)
@ -134,25 +122,27 @@ static inline void bch2_read_extent(struct bch_fs *c,
struct extent_pick_ptr *pick, struct extent_pick_ptr *pick,
unsigned flags) unsigned flags)
{ {
rbio->_state = 0;
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags); __bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags);
} }
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
u64 inode) u64 inode)
{ {
rbio->_state = 0; BUG_ON(rbio->_state);
__bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL, __bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL,
BCH_READ_RETRY_IF_STALE| BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE| BCH_READ_MAY_PROMOTE|
BCH_READ_USER_MAPPED); BCH_READ_USER_MAPPED);
} }
static inline struct bch_read_bio *rbio_init(struct bio *bio) static inline struct bch_read_bio *rbio_init(struct bio *bio,
struct bch_io_opts opts)
{ {
struct bch_read_bio *rbio = to_rbio(bio); struct bch_read_bio *rbio = to_rbio(bio);
rbio->_state = 0; rbio->_state = 0;
rbio->promote = NULL;
rbio->opts = opts;
return rbio; return rbio;
} }

View File

@ -6,6 +6,7 @@
#include "buckets_types.h" #include "buckets_types.h"
#include "extents_types.h" #include "extents_types.h"
#include "keylist_types.h" #include "keylist_types.h"
#include "opts.h"
#include "super_types.h" #include "super_types.h"
#include <linux/llist.h> #include <linux/llist.h>
@ -56,6 +57,8 @@ struct bch_read_bio {
struct promote_op *promote; struct promote_op *promote;
struct bch_io_opts opts;
struct work_struct work; struct work_struct work;
struct bio bio; struct bio bio;
@ -69,8 +72,7 @@ struct bch_write_bio {
struct closure *cl; struct closure *cl;
}; };
u8 ptr_idx; struct bch_devs_list failed;
u8 replicas_failed;
u8 order; u8 order;
unsigned split:1, unsigned split:1,
@ -90,8 +92,8 @@ struct bch_write_op {
struct bch_fs *c; struct bch_fs *c;
struct workqueue_struct *io_wq; struct workqueue_struct *io_wq;
unsigned written; /* sectors */
u16 flags; u16 flags;
u16 written; /* sectors */
s8 error; s8 error;
unsigned csum_type:4; unsigned csum_type:4;

View File

@ -338,8 +338,8 @@ struct journal_list {
* Given a journal entry we just read, add it to the list of journal entries to * Given a journal entry we just read, add it to the list of journal entries to
* be replayed: * be replayed:
*/ */
static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist, static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
struct jset *j) struct journal_list *jlist, struct jset *j)
{ {
struct journal_replay *i, *pos; struct journal_replay *i, *pos;
struct list_head *where; struct list_head *where;
@ -347,8 +347,6 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
__le64 last_seq; __le64 last_seq;
int ret; int ret;
mutex_lock(&jlist->lock);
last_seq = !list_empty(jlist->head) last_seq = !list_empty(jlist->head)
? list_last_entry(jlist->head, struct journal_replay, ? list_last_entry(jlist->head, struct journal_replay,
list)->j.last_seq list)->j.last_seq
@ -376,9 +374,7 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
memcmp(j, &i->j, bytes), c, memcmp(j, &i->j, bytes), c,
"found duplicate but non identical journal entries (seq %llu)", "found duplicate but non identical journal entries (seq %llu)",
le64_to_cpu(j->seq)); le64_to_cpu(j->seq));
goto found;
ret = JOURNAL_ENTRY_ADD_OK;
goto out;
} }
if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
@ -395,12 +391,16 @@ add:
goto out; goto out;
} }
memcpy(&i->j, j, bytes);
list_add(&i->list, where); list_add(&i->list, where);
i->devs.nr = 0;
memcpy(&i->j, j, bytes);
found:
if (!fsck_err_on(bch2_dev_list_has_dev(i->devs, ca->dev_idx),
c, "duplicate journal entries on same device"))
bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
ret = JOURNAL_ENTRY_ADD_OK; ret = JOURNAL_ENTRY_ADD_OK;
out: out:
fsck_err: fsck_err:
mutex_unlock(&jlist->lock);
return ret; return ret;
} }
@ -496,8 +496,8 @@ fsck_err:
#define journal_entry_err_on(cond, c, msg, ...) \ #define journal_entry_err_on(cond, c, msg, ...) \
((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false) ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
static int __journal_entry_validate(struct bch_fs *c, struct jset *j, static int journal_entry_validate_entries(struct bch_fs *c, struct jset *j,
int write) int write)
{ {
struct jset_entry *entry; struct jset_entry *entry;
int ret = 0; int ret = 0;
@ -508,7 +508,7 @@ static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
if (journal_entry_err_on(vstruct_next(entry) > if (journal_entry_err_on(vstruct_next(entry) >
vstruct_last(j), c, vstruct_last(j), c,
"journal entry extends past end of jset")) { "journal entry extends past end of jset")) {
j->u64s = cpu_to_le64((u64 *) entry - j->_data); j->u64s = cpu_to_le32((u64 *) entry - j->_data);
break; break;
} }
@ -614,7 +614,7 @@ static int journal_entry_validate(struct bch_fs *c,
"invalid journal entry: last_seq > seq")) "invalid journal entry: last_seq > seq"))
j->last_seq = j->seq; j->last_seq = j->seq;
return __journal_entry_validate(c, j, write); return 0;
fsck_err: fsck_err:
return ret; return ret;
} }
@ -722,7 +722,10 @@ reread: sectors_read = min_t(unsigned,
ja->bucket_seq[bucket] = le64_to_cpu(j->seq); ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
ret = journal_entry_add(c, jlist, j); mutex_lock(&jlist->lock);
ret = journal_entry_add(c, ca, jlist, j);
mutex_unlock(&jlist->lock);
switch (ret) { switch (ret) {
case JOURNAL_ENTRY_ADD_OK: case JOURNAL_ENTRY_ADD_OK:
*entries_found = true; *entries_found = true;
@ -916,7 +919,9 @@ static int journal_seq_blacklist_read(struct journal *j,
for_each_jset_entry_type(entry, &i->j, for_each_jset_entry_type(entry, &i->j,
JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) { JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) {
seq = le64_to_cpu(entry->_data[0]); struct jset_entry_blacklist *bl_entry =
container_of(entry, struct jset_entry_blacklist, entry);
seq = le64_to_cpu(bl_entry->seq);
bch_verbose(c, "blacklisting existing journal seq %llu", seq); bch_verbose(c, "blacklisting existing journal seq %llu", seq);
@ -982,6 +987,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
fsck_err_on(c->sb.clean && journal_has_keys(list), c, fsck_err_on(c->sb.clean && journal_has_keys(list), c,
"filesystem marked clean but journal has keys to replay"); "filesystem marked clean but journal has keys to replay");
list_for_each_entry(i, list, list) {
ret = journal_entry_validate_entries(c, &i->j, READ);
if (ret)
goto fsck_err;
}
i = list_last_entry(list, struct journal_replay, list); i = list_last_entry(list, struct journal_replay, list);
unfixable_fsck_err_on(le64_to_cpu(i->j.seq) - unfixable_fsck_err_on(le64_to_cpu(i->j.seq) -
@ -1002,6 +1013,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
INIT_LIST_HEAD(&p->list); INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed); INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, 0); atomic_set(&p->count, 0);
p->devs.nr = 0;
} }
mutex_lock(&j->blacklist_lock); mutex_lock(&j->blacklist_lock);
@ -1010,6 +1022,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
p = journal_seq_pin(j, le64_to_cpu(i->j.seq)); p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
atomic_set(&p->count, 1); atomic_set(&p->count, 1);
p->devs = i->devs;
if (journal_seq_blacklist_read(j, i, p)) { if (journal_seq_blacklist_read(j, i, p)) {
mutex_unlock(&j->blacklist_lock); mutex_unlock(&j->blacklist_lock);
@ -1090,7 +1103,7 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
{ {
struct journal_buf *w = journal_prev_buf(j); struct journal_buf *w = journal_prev_buf(j);
atomic_dec_bug(&journal_seq_pin(j, w->data->seq)->count); atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count);
if (!need_write_just_set && if (!need_write_just_set &&
test_bit(JOURNAL_NEED_WRITE, &j->flags)) test_bit(JOURNAL_NEED_WRITE, &j->flags))
@ -1122,6 +1135,7 @@ static void __journal_entry_new(struct journal *j, int count)
INIT_LIST_HEAD(&p->list); INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed); INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, count); atomic_set(&p->count, count);
p->devs.nr = 0;
} }
static void __bch2_journal_next_entry(struct journal *j) static void __bch2_journal_next_entry(struct journal *j)
@ -1851,6 +1865,21 @@ void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
bch2_journal_error(j)); bch2_journal_error(j));
} }
int bch2_journal_flush_all_pins(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bool flush;
bch2_journal_flush_pins(j, U64_MAX);
spin_lock(&j->lock);
flush = last_seq(j) != j->last_seq_ondisk ||
c->btree_roots_dirty;
spin_unlock(&j->lock);
return flush ? bch2_journal_meta(j) : 0;
}
static bool should_discard_bucket(struct journal *j, struct journal_device *ja) static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{ {
bool ret; bool ret;
@ -2002,7 +2031,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
* i.e. whichever device was limiting the current journal entry size. * i.e. whichever device was limiting the current journal entry size.
*/ */
extent_for_each_ptr_backwards(e, ptr) { extent_for_each_ptr_backwards(e, ptr) {
ca = c->devs[ptr->dev]; ca = bch_dev_bkey_exists(c, ptr->dev);
if (ca->mi.state != BCH_MEMBER_STATE_RW || if (ca->mi.state != BCH_MEMBER_STATE_RW ||
ca->journal.sectors_free <= sectors) ca->journal.sectors_free <= sectors)
@ -2197,7 +2226,7 @@ static void journal_write_endio(struct bio *bio)
struct bch_dev *ca = bio->bi_private; struct bch_dev *ca = bio->bi_private;
struct journal *j = &ca->fs->journal; struct journal *j = &ca->fs->journal;
if (bch2_dev_io_err_on(bio->bi_error, ca, "journal write") || if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") ||
bch2_meta_write_fault("journal")) { bch2_meta_write_fault("journal")) {
/* Was this a flush or an actual journal write? */ /* Was this a flush or an actual journal write? */
if (ca->journal.ptr_idx != U8_MAX) { if (ca->journal.ptr_idx != U8_MAX) {
@ -2233,6 +2262,7 @@ static void journal_write(struct closure *cl)
if (r->alive) if (r->alive)
bch2_journal_add_btree_root(w, i, &r->key, r->level); bch2_journal_add_btree_root(w, i, &r->key, r->level);
} }
c->btree_roots_dirty = false;
mutex_unlock(&c->btree_root_lock); mutex_unlock(&c->btree_root_lock);
journal_write_compact(jset); journal_write_compact(jset);
@ -2246,7 +2276,7 @@ static void journal_write(struct closure *cl)
SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) && if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
__journal_entry_validate(c, jset, WRITE)) journal_entry_validate_entries(c, jset, WRITE))
goto err; goto err;
bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
@ -2257,7 +2287,7 @@ static void journal_write(struct closure *cl)
journal_nonce(jset), jset); journal_nonce(jset), jset);
if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) && if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
__journal_entry_validate(c, jset, WRITE)) journal_entry_validate_entries(c, jset, WRITE))
goto err; goto err;
sectors = vstruct_sectors(jset, c->block_bits); sectors = vstruct_sectors(jset, c->block_bits);
@ -2277,6 +2307,9 @@ static void journal_write(struct closure *cl)
BCH_DATA_JOURNAL)) BCH_DATA_JOURNAL))
goto err; goto err;
journal_seq_pin(j, le64_to_cpu(jset->seq))->devs =
bch2_extent_devs(bkey_i_to_s_c_extent(&j->key));
/* /*
* XXX: we really should just disable the entire journal in nochanges * XXX: we really should just disable the entire journal in nochanges
* mode * mode
@ -2285,7 +2318,7 @@ static void journal_write(struct closure *cl)
goto no_io; goto no_io;
extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) { extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) {
ca = c->devs[ptr->dev]; ca = bch_dev_bkey_exists(c, ptr->dev);
if (!percpu_ref_tryget(&ca->io_ref)) { if (!percpu_ref_tryget(&ca->io_ref)) {
/* XXX: fix this */ /* XXX: fix this */
bch_err(c, "missing device for journal write\n"); bch_err(c, "missing device for journal write\n");
@ -2693,6 +2726,46 @@ int bch2_journal_flush(struct journal *j)
return bch2_journal_flush_seq(j, seq); return bch2_journal_flush_seq(j, seq);
} }
int bch2_journal_flush_device(struct journal *j, unsigned dev_idx)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin_list *p;
struct bch_devs_list devs;
u64 seq = 0;
unsigned iter;
int ret = 0;
spin_lock(&j->lock);
fifo_for_each_entry_ptr(p, &j->pin, iter)
if (bch2_dev_list_has_dev(p->devs, dev_idx))
seq = journal_pin_seq(j, p);
spin_unlock(&j->lock);
bch2_journal_flush_pins(j, seq);
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
seq = 0;
spin_lock(&j->lock);
while (!ret && seq < atomic64_read(&j->seq)) {
seq = max(seq, last_seq(j));
devs = journal_seq_pin(j, seq)->devs;
seq++;
spin_unlock(&j->lock);
ret = bch2_check_mark_super_devlist(c, &devs, BCH_DATA_JOURNAL);
spin_lock(&j->lock);
}
spin_unlock(&j->lock);
bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
}
ssize_t bch2_journal_print_debug(struct journal *j, char *buf) ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
{ {
struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_fs *c = container_of(j, struct bch_fs, journal);
@ -2862,9 +2935,7 @@ void bch2_fs_journal_stop(struct journal *j)
* journal entries, then force a brand new empty journal entry to be * journal entries, then force a brand new empty journal entry to be
* written: * written:
*/ */
bch2_journal_flush_pins(j, U64_MAX); bch2_journal_flush_all_pins(j);
bch2_journal_flush_async(j, NULL);
bch2_journal_meta(j);
cancel_delayed_work_sync(&j->write_work); cancel_delayed_work_sync(&j->write_work);
cancel_delayed_work_sync(&j->reclaim_work); cancel_delayed_work_sync(&j->reclaim_work);

View File

@ -118,6 +118,8 @@
*/ */
struct journal_replay { struct journal_replay {
struct list_head list; struct list_head list;
struct bch_devs_list devs;
/* must be last: */
struct jset j; struct jset j;
}; };
@ -164,6 +166,7 @@ void bch2_journal_pin_add_if_older(struct journal *,
struct journal_entry_pin *, struct journal_entry_pin *,
journal_pin_flush_fn); journal_pin_flush_fn);
void bch2_journal_flush_pins(struct journal *, u64); void bch2_journal_flush_pins(struct journal *, u64);
int bch2_journal_flush_all_pins(struct journal *);
struct closure; struct closure;
struct bch_fs; struct bch_fs;
@ -356,6 +359,7 @@ void bch2_journal_meta_async(struct journal *, struct closure *);
int bch2_journal_flush_seq(struct journal *, u64); int bch2_journal_flush_seq(struct journal *, u64);
int bch2_journal_flush(struct journal *); int bch2_journal_flush(struct journal *);
int bch2_journal_meta(struct journal *); int bch2_journal_meta(struct journal *);
int bch2_journal_flush_device(struct journal *, unsigned);
void bch2_journal_halt(struct journal *); void bch2_journal_halt(struct journal *);

View File

@ -34,6 +34,7 @@ struct journal_entry_pin_list {
struct list_head list; struct list_head list;
struct list_head flushed; struct list_head flushed;
atomic_t count; atomic_t count;
struct bch_devs_list devs;
}; };
struct journal; struct journal;

View File

@ -27,23 +27,9 @@ static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
#define MAX_DATA_OFF_ITER 10
/*
* This moves only the data off, leaving the meta-data (if any) in place.
* It walks the key space, and for any key with a valid pointer to the
* relevant device, it copies it elsewhere, updating the key to point to
* the copy.
* The meta-data is moved off by bch_move_meta_data_off_device.
*
* Note: If the number of data replicas desired is > 1, ideally, any
* new copies would not be made in the same device that already have a
* copy (if there are enough devices).
* This is _not_ currently implemented. The multiple replicas can
* land in the same device even if there are others available.
*/
int bch2_move_data_off_device(struct bch_dev *ca)
static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca,
int flags)
{
struct bch_fs *c = ca->fs;
struct btree_iter iter;
struct bkey_s_c k;
u64 keys_moved, sectors_moved;
@ -113,10 +99,6 @@ int bch2_move_data_off_device(struct bch_dev *ca)
return ret;
}
/*
* This walks the btree, and for any node on the relevant device it moves the
* node elsewhere.
*/
static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
enum btree_id id)
{
@ -200,9 +182,9 @@ static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
* is written.
*/
int bch2_move_metadata_off_device(struct bch_dev *ca)
static int bch2_dev_metadata_migrate(struct bch_fs *c, struct bch_dev *ca,
int flags)
{
struct bch_fs *c = ca->fs;
unsigned i;
int ret = 0;
@ -240,37 +222,31 @@ err:
return ret;
}
/*
* Flagging data bad when forcibly removing a device after failing to
* migrate the data off the device.
*/
static int bch2_flag_key_bad(struct btree_iter *iter,
struct bch_dev *ca,
struct bkey_s_c_extent orig)
{
BKEY_PADDED(key) tmp;
struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
struct bch_fs *c = ca->fs;
bkey_reassemble(&tmp.key, orig.s_c);
e = bkey_i_to_s_extent(&tmp.key);
extent_for_each_ptr_backwards(e, ptr)
if (ptr->dev == ca->dev_idx)
bch2_extent_drop_ptr(e, ptr);
/*
* If the new extent no longer has any pointers, bch2_extent_normalize()
* will do the appropriate thing with it (turning it into a
* KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
*/
bch2_extent_normalize(c, e.s);
return bch2_btree_insert_at(c, NULL, NULL, NULL,
BTREE_INSERT_ATOMIC,
BTREE_INSERT_ENTRY(iter, &tmp.key));
}
int bch2_dev_data_migrate(struct bch_fs *c, struct bch_dev *ca, int flags)
{
return bch2_dev_usrdata_migrate(c, ca, flags) ?:
bch2_dev_metadata_migrate(c, ca, flags);
}
static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
unsigned dev_idx, int flags, bool metadata)
{
struct bch_extent_ptr *ptr;
unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
unsigned nr_good;
extent_for_each_ptr_backwards(e, ptr)
if (ptr->dev == dev_idx)
bch2_extent_drop_ptr(e, ptr);
nr_good = bch2_extent_nr_good_ptrs(c, e.c);
if ((!nr_good && !(flags & lost)) ||
(nr_good < replicas && !(flags & degraded)))
return -EINVAL;
return 0;
}
/*
@ -284,11 +260,11 @@ static int bch2_flag_key_bad(struct btree_iter *iter,
* that we've already tried to move the data MAX_DATA_OFF_ITER times and
* are not likely to succeed if we try again.
*/
int bch2_flag_data_bad(struct bch_dev *ca)
static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
struct bch_fs *c = ca->fs;
struct bkey_s_c k;
struct bkey_s_c_extent e;
struct bkey_s_extent e;
BKEY_PADDED(key) tmp;
struct btree_iter iter;
int ret = 0;
@ -303,11 +279,33 @@ int bch2_flag_data_bad(struct bch_dev *ca)
if (!bkey_extent_is_data(k.k))
goto advance;
e = bkey_s_c_to_extent(k);
if (!bch2_extent_has_device(e, ca->dev_idx))
if (!bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx))
goto advance;
ret = bch2_flag_key_bad(&iter, ca, e);
bkey_reassemble(&tmp.key, k);
e = bkey_i_to_s_extent(&tmp.key);
ret = drop_dev_ptrs(c, e, dev_idx, flags, false);
if (ret)
break;
/*
* If the new extent no longer has any pointers, bch2_extent_normalize()
* will do the appropriate thing with it (turning it into a
* KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
*/
bch2_extent_normalize(c, e.s);
if (bkey_extent_is_data(e.k) &&
(ret = bch2_check_mark_super(c, e.c, BCH_DATA_USER)))
break;
iter.pos = bkey_start_pos(&tmp.key.k);
ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL,
BTREE_INSERT_ENTRY(&iter, &tmp.key));
/*
* don't want to leave ret == -EINTR, since if we raced and
@ -319,26 +317,6 @@ int bch2_flag_data_bad(struct bch_dev *ca)
if (ret)
break;
/*
* If the replica we're dropping was dirty and there is an
* additional cached replica, the cached replica will now be
* considered dirty - upon inserting the new version of the key,
* the bucket accounting will be updated to reflect the fact
* that the cached data is now dirty and everything works out as
* if by magic without us having to do anything.
*
* The one thing we need to be concerned with here is there's a
* race between when we drop any stale pointers from the key
* we're about to insert, and when the key actually gets
* inserted and the cached data is marked as dirty - we could
* end up trying to insert a key with a pointer that should be
* dirty, but points to stale data.
*
* If that happens the insert code just bails out and doesn't do
* the insert - however, it doesn't return an error. Hence we
* need to always recheck the current key before advancing to
* the next:
*/
continue;
advance:
if (bkey_extent_is_data(k.k)) {
@ -357,3 +335,80 @@ advance:
return ret;
}
static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
struct btree_iter iter;
struct closure cl;
struct btree *b;
unsigned id;
int ret;
/* don't handle this yet: */
if (flags & BCH_FORCE_IF_METADATA_LOST)
return -EINVAL;
closure_init_stack(&cl);
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
for (id = 0; id < BTREE_ID_NR; id++) {
for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
struct bkey_i_extent *new_key;
retry:
if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key),
dev_idx)) {
bch2_btree_iter_set_locks_want(&iter, 0);
ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key),
BCH_DATA_BTREE);
if (ret)
goto err;
} else {
bkey_copy(&tmp.k, &b->key);
new_key = bkey_i_to_extent(&tmp.k);
ret = drop_dev_ptrs(c, extent_i_to_s(new_key),
dev_idx, flags, true);
if (ret)
goto err;
if (!bch2_btree_iter_set_locks_want(&iter, U8_MAX)) {
b = bch2_btree_iter_peek_node(&iter);
goto retry;
}
ret = bch2_btree_node_update_key(c, &iter, b, new_key);
if (ret == -EINTR) {
b = bch2_btree_iter_peek_node(&iter);
goto retry;
}
if (ret)
goto err;
}
}
bch2_btree_iter_unlock(&iter);
/* btree root */
mutex_lock(&c->btree_root_lock);
mutex_unlock(&c->btree_root_lock);
}
ret = 0;
out:
bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
err:
bch2_btree_iter_unlock(&iter);
goto out;
}
int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
bch2_dev_metadata_drop(c, dev_idx, flags);
}
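For context, the force-flag policy that drop_dev_ptrs() applies can be summarized by a small hypothetical helper (user-data case; nr_good meaning the non-cached pointers remaining after the drop):

/* sketch only - not part of this commit */
static int may_drop_last_ptrs(struct bch_fs *c, unsigned nr_good, int flags)
{
	if (!nr_good && !(flags & BCH_FORCE_IF_DATA_LOST))
		return -EINVAL;		/* refusing: would lose the data outright */

	if (nr_good < c->opts.data_replicas &&
	    !(flags & BCH_FORCE_IF_DATA_DEGRADED))
		return -EINVAL;		/* refusing: would leave the data degraded */

	return 0;
}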


@ -1,8 +1,7 @@
#ifndef _BCACHEFS_MIGRATE_H
#define _BCACHEFS_MIGRATE_H
int bch2_move_data_off_device(struct bch_dev *);
int bch2_move_metadata_off_device(struct bch_dev *);
int bch2_flag_data_bad(struct bch_dev *);
int bch2_dev_data_migrate(struct bch_fs *, struct bch_dev *, int);
int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
#endif /* _BCACHEFS_MIGRATE_H */


@ -3,6 +3,7 @@
#include "btree_gc.h" #include "btree_gc.h"
#include "btree_update.h" #include "btree_update.h"
#include "buckets.h" #include "buckets.h"
#include "inode.h"
#include "io.h" #include "io.h"
#include "move.h" #include "move.h"
#include "super-io.h" #include "super-io.h"
@ -206,7 +207,7 @@ static void move_write(struct closure *cl)
{ {
struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_io *io = container_of(cl, struct moving_io, cl);
if (likely(!io->rbio.bio.bi_error)) { if (likely(!io->rbio.bio.bi_status)) {
bch2_migrate_write_init(&io->write, &io->rbio); bch2_migrate_write_init(&io->write, &io->rbio);
closure_call(&io->write.op.cl, bch2_write, NULL, cl); closure_call(&io->write.op.cl, bch2_write, NULL, cl);
} }
@ -240,6 +241,7 @@ static int bch2_move_extent(struct bch_fs *c,
struct write_point_specifier wp, struct write_point_specifier wp,
int btree_insert_flags, int btree_insert_flags,
int move_device, int move_device,
struct bch_io_opts opts,
struct bkey_s_c k) struct bkey_s_c k)
{ {
struct extent_pick_ptr pick; struct extent_pick_ptr pick;
@ -276,6 +278,7 @@ static int bch2_move_extent(struct bch_fs *c,
goto err; goto err;
} }
io->rbio.opts = opts;
bio_init(&io->rbio.bio, io->bi_inline_vecs, pages); bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->rbio.bio.bi_iter.bi_size = sectors << 9; io->rbio.bio.bi_iter.bi_size = sectors << 9;
@ -284,9 +287,13 @@ static int bch2_move_extent(struct bch_fs *c,
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
io->rbio.bio.bi_end_io = move_read_endio; io->rbio.bio.bi_end_io = move_read_endio;
__bch2_write_op_init(&io->write.op, c);
io->write.btree_insert_flags = btree_insert_flags; io->write.btree_insert_flags = btree_insert_flags;
io->write.move_dev = move_device; io->write.move_dev = move_device;
bch2_write_op_init(&io->write.op, c);
io->write.op.csum_type = bch2_data_checksum_type(c, opts.data_checksum);
io->write.op.compression_type =
bch2_compression_opt_to_type(opts.compression);
io->write.op.devs = devs; io->write.op.devs = devs;
io->write.op.write_point = wp; io->write.op.write_point = wp;
@ -371,9 +378,11 @@ int bch2_move_data(struct bch_fs *c,
{ {
bool kthread = (current->flags & PF_KTHREAD) != 0; bool kthread = (current->flags & PF_KTHREAD) != 0;
struct moving_context ctxt; struct moving_context ctxt;
struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
struct btree_iter iter; struct btree_iter iter;
BKEY_PADDED(k) tmp; BKEY_PADDED(k) tmp;
struct bkey_s_c k; struct bkey_s_c k;
u64 cur_inum = U64_MAX;
int ret = 0; int ret = 0;
bch2_move_ctxt_init(&ctxt); bch2_move_ctxt_init(&ctxt);
@ -396,7 +405,7 @@ int bch2_move_data(struct bch_fs *c,
(bch2_btree_iter_unlock(&iter), (bch2_btree_iter_unlock(&iter),
(ret = bch2_ratelimit_wait_freezable_stoppable(rate)))) (ret = bch2_ratelimit_wait_freezable_stoppable(rate))))
break; break;
peek:
k = bch2_btree_iter_peek(&iter); k = bch2_btree_iter_peek(&iter);
if (!k.k) if (!k.k)
break; break;
@ -404,8 +413,23 @@ int bch2_move_data(struct bch_fs *c,
if (ret) if (ret)
break; break;
if (!bkey_extent_is_data(k.k) || if (!bkey_extent_is_data(k.k))
!pred(arg, bkey_s_c_to_extent(k))) goto next;
if (cur_inum != k.k->p.inode) {
struct bch_inode_unpacked inode;
/* don't hold btree locks while looking up inode: */
bch2_btree_iter_unlock(&iter);
opts = bch2_opts_to_inode_opts(c->opts);
if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode));
cur_inum = k.k->p.inode;
goto peek;
}
if (!pred(arg, bkey_s_c_to_extent(k)))
goto next; goto next;
/* unlock before doing IO: */ /* unlock before doing IO: */
@ -415,7 +439,7 @@ int bch2_move_data(struct bch_fs *c,
if (bch2_move_extent(c, &ctxt, devs, wp, if (bch2_move_extent(c, &ctxt, devs, wp,
btree_insert_flags, btree_insert_flags,
move_device, k)) { move_device, opts, k)) {
/* memory allocation failure, wait for some IO to finish */ /* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(&ctxt); bch2_move_ctxt_wait_for_io(&ctxt);
continue; continue;


@ -76,16 +76,27 @@ void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
#undef BCH_OPT
}
bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
{
switch (id) {
#define BCH_OPT(_name, ...) \
case Opt_##_name: \
return opt_defined(*opts, _name);
BCH_OPTS()
#undef BCH_OPT
default:
BUG();
}
}
u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
{
switch (id) {
#define BCH_OPT(_name, ...) \
case Opt_##_name: \
return opts->_name; \
return opts->_name;
BCH_OPTS()
#undef BCH_OPT
default:
BUG();
}
@ -98,10 +109,8 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
case Opt_##_name: \
opt_set(*opts, _name, v); \
break;
BCH_OPTS()
#undef BCH_OPT
default:
BUG();
}
@ -118,7 +127,6 @@ struct bch_opts bch2_opts_from_sb(struct bch_sb *sb)
#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \
if (_sb_opt != NO_SB_OPT) \
opt_set(opts, _name, _sb_opt(sb));
BCH_OPTS()
#undef BCH_OPT
@ -145,7 +153,7 @@ const struct bch_option bch2_opt_table[] = {
#undef BCH_OPT
};
static int bch2_opt_lookup(const char *name)
int bch2_opt_lookup(const char *name)
{
const struct bch_option *i;
@ -247,3 +255,52 @@ no_val:
pr_err("Mount option %s requires a value", name);
return -1;
}
/* io opts: */
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
{
struct bch_io_opts ret = { 0 };
#define BCH_INODE_OPT(_name, _bits) \
if (opt_defined(src, _name)) \
opt_set(ret, _name, src._name);
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
return ret;
}
struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src)
{
struct bch_opts ret = { 0 };
#define BCH_INODE_OPT(_name, _bits) \
if (opt_defined(src, _name)) \
opt_set(ret, _name, src._name);
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
return ret;
}
void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src)
{
#define BCH_INODE_OPT(_name, _bits) \
if (opt_defined(src, _name)) \
opt_set(*dst, _name, src._name);
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
}
bool bch2_opt_is_inode_opt(enum bch_opt_id id)
{
static const enum bch_opt_id inode_opt_list[] = {
#define BCH_INODE_OPT(_name, _bits) Opt_##_name,
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
};
unsigned i;
for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++)
if (inode_opt_list[i] == id)
return true;
return false;
}


@ -181,10 +181,7 @@ do { \
static inline struct bch_opts bch2_opts_empty(void)
{
struct bch_opts opts;
memset(&opts, 0, sizeof(opts));
return opts;
return (struct bch_opts) { 0 };
}
void bch2_opts_apply(struct bch_opts *, struct bch_opts);
@ -215,12 +212,35 @@ struct bch_option {
extern const struct bch_option bch2_opt_table[];
bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
struct bch_opts bch2_opts_from_sb(struct bch_sb *);
int bch2_opt_lookup(const char *);
int bch2_opt_parse(const struct bch_option *, const char *, u64 *);
int bch2_parse_mount_opts(struct bch_opts *, char *);
/* inode opts: */
#define BCH_INODE_OPTS() \
BCH_INODE_OPT(data_checksum, 8) \
BCH_INODE_OPT(compression, 8)
struct bch_io_opts {
#define BCH_INODE_OPT(_name, _bits) unsigned _name##_defined:1;
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
#define BCH_INODE_OPT(_name, _bits) u##_bits _name;
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
};
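As a rough illustration only (not the generated code), the two BCH_INODE_OPT entries above expand struct bch_io_opts to approximately:

struct bch_io_opts_expanded {		/* hypothetical name, for illustration */
	unsigned	data_checksum_defined:1;
	unsigned	compression_defined:1;
	u8		data_checksum;
	u8		compression;
};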
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts);
void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts);
bool bch2_opt_is_inode_opt(enum bch_opt_id);
#endif /* _BCACHEFS_OPTS_H */ #endif /* _BCACHEFS_OPTS_H */


@ -12,6 +12,8 @@
#include <linux/sort.h>
static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
static const char *bch2_sb_validate_replicas(struct bch_sb *);
static inline void __bch2_sb_layout_size_assert(void)
@ -157,7 +159,7 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
return NULL;
f = __bch2_sb_field_resize(sb->sb, f, u64s);
f->type = type;
f->type = cpu_to_le32(type);
return f;
}
@ -188,7 +190,7 @@ struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *c,
}
f = __bch2_sb_field_resize(c->disk_sb, f, u64s);
f->type = type;
f->type = cpu_to_le32(type);
return f;
}
@ -354,7 +356,16 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
return "Invalid number of metadata replicas";
return "Invalid number of data replicas";
if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
return "Invalid metadata checksum type";
if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
return "Invalid metadata checksum type";
if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR)
return "Invalid compression type";
if (!BCH_SB_BTREE_NODE_SIZE(sb)) if (!BCH_SB_BTREE_NODE_SIZE(sb))
return "Btree node size not set"; return "Btree node size not set";
@ -507,7 +518,7 @@ static void __copy_super(struct bch_sb *dst, struct bch_sb *src)
if (src_f->type == BCH_SB_FIELD_journal) if (src_f->type == BCH_SB_FIELD_journal)
continue; continue;
dst_f = bch2_sb_field_get(dst, src_f->type); dst_f = bch2_sb_field_get(dst, le32_to_cpu(src_f->type));
dst_f = __bch2_sb_field_resize(dst, dst_f, dst_f = __bch2_sb_field_resize(dst, dst_f,
le32_to_cpu(src_f->u64s)); le32_to_cpu(src_f->u64s));
@ -601,7 +612,7 @@ reread:
/* XXX: verify MACs */ /* XXX: verify MACs */
csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
(struct nonce) { 0 }, sb->sb); null_nonce(), sb->sb);
if (bch2_crc_cmp(csum, sb->sb->csum)) if (bch2_crc_cmp(csum, sb->sb->csum))
return "bad checksum reading superblock"; return "bad checksum reading superblock";
@ -688,9 +699,9 @@ const char *bch2_read_super(const char *path,
got_super: got_super:
pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u", pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
le64_to_cpu(ret->sb->version), le64_to_cpu(ret->sb->version),
le64_to_cpu(ret->sb->flags), le64_to_cpu(ret->sb->flags[0]),
le64_to_cpu(ret->sb->seq), le64_to_cpu(ret->sb->seq),
le16_to_cpu(ret->sb->u64s)); le32_to_cpu(ret->sb->u64s));
err = "Superblock block size smaller than device block size"; err = "Superblock block size smaller than device block size";
if (le16_to_cpu(ret->sb->block_size) << 9 < if (le16_to_cpu(ret->sb->block_size) << 9 <
@ -711,7 +722,7 @@ static void write_super_endio(struct bio *bio)
/* XXX: return errors directly */ /* XXX: return errors directly */
if (bch2_dev_io_err_on(bio->bi_error, ca, "superblock write")) if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write"))
ca->sb_write_error = 1; ca->sb_write_error = 1;
closure_put(&ca->fs->sb_write); closure_put(&ca->fs->sb_write);
@ -727,7 +738,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum); SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
(struct nonce) { 0 }, sb); null_nonce(), sb);
bio_reset(bio); bio_reset(bio);
bio->bi_bdev = ca->disk_sb.bdev; bio->bi_bdev = ca->disk_sb.bdev;
@ -830,7 +841,12 @@ out:
bch2_sb_update(c); bch2_sb_update(c);
} }
/* replica information: */ /* Replicas tracking - in memory: */
#define for_each_cpu_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
_i = (void *) (_i) + (_r)->entry_size)
static inline struct bch_replicas_cpu_entry * static inline struct bch_replicas_cpu_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
@ -838,6 +854,11 @@ cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
return (void *) r->entries + r->entry_size * i; return (void *) r->entries + r->entry_size * i;
} }
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}
static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e, static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
unsigned dev) unsigned dev)
{ {
@ -856,6 +877,246 @@ static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
offsetof(struct bch_replicas_cpu_entry, devs)) * 8; offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
} }
static unsigned bkey_to_replicas(struct bkey_s_c_extent e,
enum bch_data_type data_type,
struct bch_replicas_cpu_entry *r,
unsigned *max_dev)
{
const struct bch_extent_ptr *ptr;
unsigned nr = 0;
BUG_ON(!data_type ||
data_type == BCH_DATA_SB ||
data_type >= BCH_DATA_NR);
memset(r, 0, sizeof(*r));
r->data_type = data_type;
*max_dev = 0;
extent_for_each_ptr(e, ptr)
if (!ptr->cached) {
*max_dev = max_t(unsigned, *max_dev, ptr->dev);
replicas_set_dev(r, ptr->dev);
nr++;
}
return nr;
}
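The entry produced here is just a data type plus a device bitmap; assuming the usual byte/bit split for replicas_set_dev() (its definition is not part of this hunk), marking a device amounts to roughly:

/* sketch, assuming devs[] is a plain bitmap indexed by device number */
static inline void replicas_set_dev_sketch(struct bch_replicas_cpu_entry *e,
					   unsigned dev)
{
	e->devs[dev >> 3] |= 1 << (dev & 7);
}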
static struct bch_replicas_cpu *
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
struct bch_replicas_cpu_entry new_entry,
unsigned max_dev)
{
struct bch_replicas_cpu *new;
unsigned i, nr, entry_size;
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
DIV_ROUND_UP(max_dev + 1, 8);
entry_size = max(entry_size, old->entry_size);
nr = old->nr + 1;
new = kzalloc(sizeof(struct bch_replicas_cpu) +
nr * entry_size, GFP_NOIO);
if (!new)
return NULL;
new->nr = nr;
new->entry_size = entry_size;
for (i = 0; i < old->nr; i++)
memcpy(cpu_replicas_entry(new, i),
cpu_replicas_entry(old, i),
min(new->entry_size, old->entry_size));
memcpy(cpu_replicas_entry(new, old->nr),
&new_entry,
new->entry_size);
bch2_cpu_replicas_sort(new);
return new;
}
static bool replicas_has_entry(struct bch_replicas_cpu *r,
struct bch_replicas_cpu_entry search,
unsigned max_dev)
{
return max_dev < replicas_dev_slots(r) &&
eytzinger0_find(r->entries, r->nr,
r->entry_size,
memcmp, &search) < r->nr;
}
noinline
static int bch2_check_mark_super_slowpath(struct bch_fs *c,
struct bch_replicas_cpu_entry new_entry,
unsigned max_dev)
{
struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r;
int ret = -ENOMEM;
mutex_lock(&c->sb_lock);
old_gc = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
if (!new_gc)
goto err;
}
old_r = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
/* recheck, might have raced */
if (replicas_has_entry(old_r, new_entry, max_dev))
goto out;
new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
if (!new_r)
goto err;
ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
if (ret)
goto err;
if (new_gc) {
rcu_assign_pointer(c->replicas_gc, new_gc);
kfree_rcu(old_gc, rcu);
}
rcu_assign_pointer(c->replicas, new_r);
kfree_rcu(old_r, rcu);
bch2_write_super(c);
out:
ret = 0;
err:
mutex_unlock(&c->sb_lock);
return ret;
}
static inline int __bch2_check_mark_super(struct bch_fs *c,
struct bch_replicas_cpu_entry search,
unsigned max_dev)
{
struct bch_replicas_cpu *r, *gc_r;
bool marked;
rcu_read_lock();
r = rcu_dereference(c->replicas);
gc_r = rcu_dereference(c->replicas_gc);
marked = replicas_has_entry(r, search, max_dev) &&
(!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
rcu_read_unlock();
return likely(marked) ? 0
: bch2_check_mark_super_slowpath(c, search, max_dev);
}
int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
enum bch_data_type data_type)
{
struct bch_replicas_cpu_entry search;
unsigned max_dev;
if (!bkey_to_replicas(e, data_type, &search, &max_dev))
return 0;
return __bch2_check_mark_super(c, search, max_dev);
}
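A short usage note: callers mark an extent's replica set before (or while) inserting it, as the usrdata drop path earlier in this commit does:

/* usage sketch, mirroring bch2_dev_usrdata_drop() above */
if (bkey_extent_is_data(e.k) &&
    (ret = bch2_check_mark_super(c, e.c, BCH_DATA_USER)))
	return ret;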
int bch2_check_mark_super_devlist(struct bch_fs *c,
struct bch_devs_list *devs,
enum bch_data_type data_type)
{
struct bch_replicas_cpu_entry search = { .data_type = data_type };
unsigned i, max_dev = 0;
if (!devs->nr)
return 0;
for (i = 0; i < devs->nr; i++) {
max_dev = max_t(unsigned, max_dev, devs->devs[i]);
replicas_set_dev(&search, devs->devs[i]);
}
return __bch2_check_mark_super(c, search, max_dev);
}
int bch2_replicas_gc_end(struct bch_fs *c, int err)
{
struct bch_replicas_cpu *new_r, *old_r;
int ret = 0;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
new_r = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
if (err) {
rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(new_r, rcu);
goto err;
}
if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
ret = -ENOSPC;
goto err;
}
old_r = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, new_r);
rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(old_r, rcu);
bch2_write_super(c);
err:
mutex_unlock(&c->sb_lock);
return ret;
}
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
struct bch_replicas_cpu *dst, *src;
struct bch_replicas_cpu_entry *e;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
BUG_ON(c->replicas_gc);
src = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
dst = kzalloc(sizeof(struct bch_replicas_cpu) +
src->nr * src->entry_size, GFP_NOIO);
if (!dst) {
mutex_unlock(&c->sb_lock);
return -ENOMEM;
}
dst->nr = 0;
dst->entry_size = src->entry_size;
for_each_cpu_replicas_entry(src, e)
if (!((1 << e->data_type) & typemask))
memcpy(cpu_replicas_entry(dst, dst->nr++),
e, dst->entry_size);
bch2_cpu_replicas_sort(dst);
rcu_assign_pointer(c->replicas_gc, dst);
mutex_unlock(&c->sb_lock);
return 0;
}
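The start/end pair above is meant to bracket a walk that re-marks every live reference of the chosen types; a minimal sketch of the pattern (as used by the journal-flush and metadata-drop paths in this commit):

/* sketch of the replicas GC pattern */
mutex_lock(&c->replicas_gc_lock);
ret = bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);

/* ... walk live extents, calling bch2_check_mark_super() on each ... */

ret = bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);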
/* Replicas tracking - superblock: */
static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r, static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
unsigned *nr, unsigned *nr,
unsigned *bytes, unsigned *bytes,
@ -914,10 +1175,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
} }
} }
eytzinger0_sort(cpu_r->entries, bch2_cpu_replicas_sort(cpu_r);
cpu_r->nr,
cpu_r->entry_size,
memcmp, NULL);
return cpu_r; return cpu_r;
} }
@ -926,14 +1184,12 @@ static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
struct bch_sb_field_replicas *sb_r; struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *cpu_r, *old_r; struct bch_replicas_cpu *cpu_r, *old_r;
lockdep_assert_held(&c->sb_lock);
sb_r = bch2_sb_get_replicas(c->disk_sb); sb_r = bch2_sb_get_replicas(c->disk_sb);
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r); cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
if (!cpu_r) if (!cpu_r)
return -ENOMEM; return -ENOMEM;
old_r = c->replicas; old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, cpu_r); rcu_assign_pointer(c->replicas, cpu_r);
if (old_r) if (old_r)
kfree_rcu(old_r, rcu); kfree_rcu(old_r, rcu);
@ -941,192 +1197,133 @@ static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
return 0; return 0;
} }
static void bkey_to_replicas(struct bkey_s_c_extent e, static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
enum bch_data_type data_type, struct bch_replicas_cpu *r)
struct bch_replicas_cpu_entry *r,
unsigned *max_dev)
{ {
const struct bch_extent_ptr *ptr; struct bch_sb_field_replicas *sb_r;
struct bch_replicas_entry *sb_e;
struct bch_replicas_cpu_entry *e;
size_t i, bytes;
BUG_ON(!data_type || bytes = sizeof(struct bch_sb_field_replicas);
data_type == BCH_DATA_SB ||
data_type >= BCH_DATA_NR);
memset(r, 0, sizeof(*r)); for_each_cpu_replicas_entry(r, e) {
r->data_type = data_type; bytes += sizeof(struct bch_replicas_entry);
for (i = 0; i < r->entry_size - 1; i++)
bytes += hweight8(e->devs[i]);
}
*max_dev = 0; sb_r = bch2_fs_sb_resize_replicas(c,
DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
if (!sb_r)
return -ENOSPC;
extent_for_each_ptr(e, ptr) memset(&sb_r->entries, 0,
if (!ptr->cached) { vstruct_end(&sb_r->field) -
*max_dev = max_t(unsigned, *max_dev, ptr->dev); (void *) &sb_r->entries);
replicas_set_dev(r, ptr->dev);
}
}
/* sb_e = sb_r->entries;
* for when gc of replica information is in progress: for_each_cpu_replicas_entry(r, e) {
*/ sb_e->data_type = e->data_type;
static int bch2_update_gc_replicas(struct bch_fs *c,
struct bch_replicas_cpu *gc_r,
struct bkey_s_c_extent e,
enum bch_data_type data_type)
{
struct bch_replicas_cpu_entry new_e;
struct bch_replicas_cpu *new;
unsigned i, nr, entry_size, max_dev;
bkey_to_replicas(e, data_type, &new_e, &max_dev); for (i = 0; i < replicas_dev_slots(r); i++)
if (replicas_test_dev(e, i))
sb_e->devs[sb_e->nr++] = i;
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) + sb_e = replicas_entry_next(sb_e);
DIV_ROUND_UP(max_dev + 1, 8);
entry_size = max(entry_size, gc_r->entry_size);
nr = gc_r->nr + 1;
new = kzalloc(sizeof(struct bch_replicas_cpu) + BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
nr * entry_size, GFP_NOIO); }
if (!new)
return -ENOMEM;
new->nr = nr;
new->entry_size = entry_size;
for (i = 0; i < gc_r->nr; i++)
memcpy(cpu_replicas_entry(new, i),
cpu_replicas_entry(gc_r, i),
gc_r->entry_size);
memcpy(cpu_replicas_entry(new, nr - 1),
&new_e,
new->entry_size);
eytzinger0_sort(new->entries,
new->nr,
new->entry_size,
memcmp, NULL);
rcu_assign_pointer(c->replicas_gc, new);
kfree_rcu(gc_r, rcu);
return 0; return 0;
} }
static bool replicas_has_extent(struct bch_replicas_cpu *r, static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
struct bkey_s_c_extent e,
enum bch_data_type data_type)
{ {
struct bch_replicas_cpu_entry search; struct bch_sb_field_members *mi;
unsigned max_dev; struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *cpu_r = NULL;
struct bch_replicas_entry *e;
const char *err;
unsigned i;
bkey_to_replicas(e, data_type, &search, &max_dev); mi = bch2_sb_get_members(sb);
sb_r = bch2_sb_get_replicas(sb);
if (!sb_r)
return NULL;
return max_dev < replicas_dev_slots(r) && for_each_replicas_entry(sb_r, e) {
eytzinger0_find(r->entries, r->nr, err = "invalid replicas entry: invalid data type";
r->entry_size, if (e->data_type >= BCH_DATA_NR)
memcmp, &search) < r->nr; goto err;
err = "invalid replicas entry: no devices";
if (!e->nr)
goto err;
err = "invalid replicas entry: too many devices";
if (e->nr >= BCH_REPLICAS_MAX)
goto err;
err = "invalid replicas entry: invalid device";
for (i = 0; i < e->nr; i++)
if (!bch2_dev_exists(sb, mi, e->devs[i]))
goto err;
}
err = "cannot allocate memory";
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
if (!cpu_r)
goto err;
sort_cmp_size(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
memcmp, NULL);
for (i = 0; i + 1 < cpu_r->nr; i++) {
struct bch_replicas_cpu_entry *l =
cpu_replicas_entry(cpu_r, i);
struct bch_replicas_cpu_entry *r =
cpu_replicas_entry(cpu_r, i + 1);
BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
err = "duplicate replicas entry";
if (!memcmp(l, r, cpu_r->entry_size))
goto err;
}
err = NULL;
err:
kfree(cpu_r);
return err;
} }
/* Query replicas: */
bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e, bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
enum bch_data_type data_type) enum bch_data_type data_type)
{ {
struct bch_replicas_cpu_entry search;
unsigned max_dev;
bool ret; bool ret;
if (!bkey_to_replicas(e, data_type, &search, &max_dev))
return true;
rcu_read_lock(); rcu_read_lock();
ret = replicas_has_extent(rcu_dereference(c->replicas), ret = replicas_has_entry(rcu_dereference(c->replicas),
e, data_type); search, max_dev);
rcu_read_unlock(); rcu_read_unlock();
return ret; return ret;
} }
noinline
static int bch2_check_mark_super_slowpath(struct bch_fs *c,
struct bkey_s_c_extent e,
enum bch_data_type data_type)
{
struct bch_replicas_cpu *gc_r;
const struct bch_extent_ptr *ptr;
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_entry *new_entry;
unsigned new_entry_bytes, new_u64s, nr, bytes, max_dev;
int ret = 0;
mutex_lock(&c->sb_lock);
gc_r = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
if (gc_r &&
!replicas_has_extent(gc_r, e, data_type)) {
ret = bch2_update_gc_replicas(c, gc_r, e, data_type);
if (ret)
goto err;
}
/* recheck, might have raced */
if (bch2_sb_has_replicas(c, e, data_type)) {
mutex_unlock(&c->sb_lock);
return 0;
}
new_entry_bytes = sizeof(struct bch_replicas_entry) +
bch2_extent_nr_dirty_ptrs(e.s_c);
sb_r = bch2_sb_get_replicas(c->disk_sb);
bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
new_u64s = DIV_ROUND_UP(bytes + new_entry_bytes, sizeof(u64));
sb_r = bch2_fs_sb_resize_replicas(c,
DIV_ROUND_UP(sizeof(*sb_r) + bytes + new_entry_bytes,
sizeof(u64)));
if (!sb_r) {
ret = -ENOSPC;
goto err;
}
new_entry = (void *) sb_r + bytes;
new_entry->data_type = data_type;
new_entry->nr = 0;
extent_for_each_ptr(e, ptr)
if (!ptr->cached)
new_entry->devs[new_entry->nr++] = ptr->dev;
ret = bch2_sb_replicas_to_cpu_replicas(c);
if (ret) {
memset(new_entry, 0,
vstruct_end(&sb_r->field) - (void *) new_entry);
goto err;
}
bch2_write_super(c);
err:
mutex_unlock(&c->sb_lock);
return ret;
}
int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
enum bch_data_type data_type)
{
struct bch_replicas_cpu *gc_r;
bool marked;
rcu_read_lock();
marked = replicas_has_extent(rcu_dereference(c->replicas),
e, data_type) &&
(!(gc_r = rcu_dereference(c->replicas_gc)) ||
replicas_has_extent(gc_r, e, data_type));
rcu_read_unlock();
if (marked)
return 0;
return bch2_check_mark_super_slowpath(c, e, data_type);
}
struct replicas_status __bch2_replicas_status(struct bch_fs *c, struct replicas_status __bch2_replicas_status(struct bch_fs *c,
struct bch_devs_mask online_devs) struct bch_devs_mask online_devs)
{ {
struct bch_sb_field_members *mi;
struct bch_replicas_cpu_entry *e; struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r; struct bch_replicas_cpu *r;
unsigned i, dev, dev_slots, nr_online, nr_offline; unsigned i, dev, dev_slots, nr_online, nr_offline;
@ -1137,14 +1334,15 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c,
for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
ret.replicas[i].nr_online = UINT_MAX; ret.replicas[i].nr_online = UINT_MAX;
mi = bch2_sb_get_members(c->disk_sb);
rcu_read_lock(); rcu_read_lock();
r = rcu_dereference(c->replicas); r = rcu_dereference(c->replicas);
dev_slots = min_t(unsigned, replicas_dev_slots(r), c->sb.nr_devices); dev_slots = replicas_dev_slots(r);
for (i = 0; i < r->nr; i++) { for_each_cpu_replicas_entry(r, e) {
e = cpu_replicas_entry(r, i); if (e->data_type >= ARRAY_SIZE(ret.replicas))
panic("e %p data_type %u\n", e, e->data_type);
BUG_ON(e->data_type >= ARRAY_SIZE(ret.replicas));
nr_online = nr_offline = 0; nr_online = nr_offline = 0;
@ -1152,6 +1350,8 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c,
if (!replicas_test_dev(e, dev)) if (!replicas_test_dev(e, dev))
continue; continue;
BUG_ON(!bch2_dev_exists(c->disk_sb, mi, dev));
if (test_bit(dev, online_devs.d)) if (test_bit(dev, online_devs.d))
nr_online++; nr_online++;
else else
@ -1216,7 +1416,7 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{ {
struct bch_replicas_cpu_entry *e; struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r; struct bch_replicas_cpu *r;
unsigned i, ret = 0; unsigned ret = 0;
rcu_read_lock(); rcu_read_lock();
r = rcu_dereference(c->replicas); r = rcu_dereference(c->replicas);
@ -1224,191 +1424,13 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
if (ca->dev_idx >= replicas_dev_slots(r)) if (ca->dev_idx >= replicas_dev_slots(r))
goto out; goto out;
for (i = 0; i < r->nr; i++) { for_each_cpu_replicas_entry(r, e)
e = cpu_replicas_entry(r, i);
if (replicas_test_dev(e, ca->dev_idx)) { if (replicas_test_dev(e, ca->dev_idx)) {
ret |= 1 << e->data_type; ret |= 1 << e->data_type;
break; break;
} }
}
out: out:
rcu_read_unlock(); rcu_read_unlock();
return ret; return ret;
} }
static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
{
struct bch_sb_field_members *mi;
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *cpu_r = NULL;
struct bch_replicas_entry *e;
const char *err;
unsigned i;
mi = bch2_sb_get_members(sb);
sb_r = bch2_sb_get_replicas(sb);
if (!sb_r)
return NULL;
for_each_replicas_entry(sb_r, e) {
err = "invalid replicas entry: invalid data type";
if (e->data_type >= BCH_DATA_NR)
goto err;
err = "invalid replicas entry: too many devices";
if (e->nr >= BCH_REPLICAS_MAX)
goto err;
err = "invalid replicas entry: invalid device";
for (i = 0; i < e->nr; i++)
if (!bch2_dev_exists(sb, mi, e->devs[i]))
goto err;
}
err = "cannot allocate memory";
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
if (!cpu_r)
goto err;
sort_cmp_size(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
memcmp, NULL);
for (i = 0; i + 1 < cpu_r->nr; i++) {
struct bch_replicas_cpu_entry *l =
cpu_replicas_entry(cpu_r, i);
struct bch_replicas_cpu_entry *r =
cpu_replicas_entry(cpu_r, i + 1);
BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
err = "duplicate replicas entry";
if (!memcmp(l, r, cpu_r->entry_size))
goto err;
}
err = NULL;
err:
kfree(cpu_r);
return err;
}
int bch2_replicas_gc_end(struct bch_fs *c, int err)
{
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *r, *old_r;
struct bch_replicas_entry *dst_e;
size_t i, j, bytes, dev_slots;
int ret = 0;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
r = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
if (err) {
rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(r, rcu);
goto err;
}
dev_slots = replicas_dev_slots(r);
bytes = sizeof(struct bch_sb_field_replicas);
for (i = 0; i < r->nr; i++) {
struct bch_replicas_cpu_entry *e =
cpu_replicas_entry(r, i);
bytes += sizeof(struct bch_replicas_entry);
for (j = 0; j < r->entry_size - 1; j++)
bytes += hweight8(e->devs[j]);
}
sb_r = bch2_fs_sb_resize_replicas(c,
DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
if (!sb_r) {
ret = -ENOSPC;
goto err;
}
memset(&sb_r->entries, 0,
vstruct_end(&sb_r->field) -
(void *) &sb_r->entries);
dst_e = sb_r->entries;
for (i = 0; i < r->nr; i++) {
struct bch_replicas_cpu_entry *src_e =
cpu_replicas_entry(r, i);
dst_e->data_type = src_e->data_type;
for (j = 0; j < dev_slots; j++)
if (replicas_test_dev(src_e, j))
dst_e->devs[dst_e->nr++] = j;
dst_e = replicas_entry_next(dst_e);
}
old_r = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, r);
rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(old_r, rcu);
bch2_write_super(c);
err:
mutex_unlock(&c->sb_lock);
return ret;
}
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
struct bch_replicas_cpu *r, *src;
unsigned i;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
BUG_ON(c->replicas_gc);
src = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
r = kzalloc(sizeof(struct bch_replicas_cpu) +
src->nr * src->entry_size, GFP_NOIO);
if (!r) {
mutex_unlock(&c->sb_lock);
return -ENOMEM;
}
r->entry_size = src->entry_size;
r->nr = 0;
for (i = 0; i < src->nr; i++) {
struct bch_replicas_cpu_entry *dst_e =
cpu_replicas_entry(r, r->nr);
struct bch_replicas_cpu_entry *src_e =
cpu_replicas_entry(src, i);
if (!(src_e->data_type & typemask)) {
memcpy(dst_e, src_e, r->entry_size);
r->nr++;
}
}
eytzinger0_sort(r->entries,
r->nr,
r->entry_size,
memcmp, NULL);
rcu_assign_pointer(c->replicas_gc, r);
mutex_unlock(&c->sb_lock);
return 0;
}


@ -125,23 +125,12 @@ void bch2_write_super(struct bch_fs *);
/* replicas: */
/* iterate over bch_sb_field_replicas: */
static inline struct bch_replicas_entry *
replicas_entry_next(struct bch_replicas_entry *i)
{
return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
}
#define for_each_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
(_i) = replicas_entry_next(_i))
bool bch2_sb_has_replicas(struct bch_fs *, struct bkey_s_c_extent,
enum bch_data_type);
int bch2_check_mark_super(struct bch_fs *, struct bkey_s_c_extent,
enum bch_data_type);
int bch2_check_mark_super_devlist(struct bch_fs *, struct bch_devs_list *,
enum bch_data_type);
struct replicas_status {
struct {
@ -161,4 +150,17 @@ unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
int bch2_replicas_gc_end(struct bch_fs *, int);
int bch2_replicas_gc_start(struct bch_fs *, unsigned);
/* iterate over superblock replicas - used by userspace tools: */
static inline struct bch_replicas_entry *
replicas_entry_next(struct bch_replicas_entry *i)
{
return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
}
#define for_each_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
(_i) = replicas_entry_next(_i))
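A hedged example of how a userspace tool might walk these entries (printf-based, purely illustrative):

/* sketch: dump the superblock replicas section from userspace */
struct bch_sb_field_replicas *r = bch2_sb_get_replicas(sb);
struct bch_replicas_entry *e;
unsigned i;

if (r)
	for_each_replicas_entry(r, e) {
		printf("%u:", e->data_type);
		for (i = 0; i < e->nr; i++)
			printf(" dev %u", e->devs[i]);
		printf("\n");
	}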
#endif /* _BCACHEFS_SUPER_IO_H */


@ -140,8 +140,9 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
return c; return c;
} }
int bch2_congested(struct bch_fs *c, int bdi_bits) int bch2_congested(void *data, int bdi_bits)
{ {
struct bch_fs *c = data;
struct backing_dev_info *bdi; struct backing_dev_info *bdi;
struct bch_dev *ca; struct bch_dev *ca;
unsigned i; unsigned i;
@ -178,13 +179,6 @@ int bch2_congested(struct bch_fs *c, int bdi_bits)
return ret; return ret;
} }
static int bch2_congested_fn(void *data, int bdi_bits)
{
struct bch_fs *c = data;
return bch2_congested(c, bdi_bits);
}
/* Filesystem RO/RW: */ /* Filesystem RO/RW: */
/* /*
@ -218,7 +212,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
* Flush journal before stopping allocators, because flushing journal * Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes: * blacklist entries involves allocating new btree nodes:
*/ */
bch2_journal_flush_pins(&c->journal, U64_MAX); bch2_journal_flush_all_pins(&c->journal);
if (!bch2_journal_error(&c->journal)) if (!bch2_journal_error(&c->journal))
bch2_btree_verify_flushed(c); bch2_btree_verify_flushed(c);
@ -379,8 +373,6 @@ static void bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[WRITE]); bch2_io_clock_exit(&c->io_clock[WRITE]);
bch2_io_clock_exit(&c->io_clock[READ]); bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c); bch2_fs_compress_exit(c);
if (c->bdi.bdi_list.next)
bdi_destroy(&c->bdi);
lg_lock_free(&c->usage_lock); lg_lock_free(&c->usage_lock);
free_percpu(c->usage_percpu); free_percpu(c->usage_percpu);
mempool_exit(&c->btree_bounce_pool); mempool_exit(&c->btree_bounce_pool);
@ -393,7 +385,7 @@ static void bch2_fs_free(struct bch_fs *c)
mempool_exit(&c->btree_reserve_pool); mempool_exit(&c->btree_reserve_pool);
mempool_exit(&c->fill_iter); mempool_exit(&c->fill_iter);
percpu_ref_exit(&c->writes); percpu_ref_exit(&c->writes);
kfree(c->replicas); kfree(rcu_dereference_protected(c->replicas, 1));
if (c->copygc_wq) if (c->copygc_wq)
destroy_workqueue(c->copygc_wq); destroy_workqueue(c->copygc_wq);
@ -414,7 +406,7 @@ static void bch2_fs_exit(struct bch_fs *c)
for (i = 0; i < c->sb.nr_devices; i++) for (i = 0; i < c->sb.nr_devices; i++)
if (c->devs[i]) if (c->devs[i])
bch2_dev_free(c->devs[i]); bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
closure_debug_destroy(&c->cl); closure_debug_destroy(&c->cl);
kobject_put(&c->kobj); kobject_put(&c->kobj);
@ -576,10 +568,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
sizeof(struct btree_update)) || sizeof(struct btree_update)) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
bioset_init(&c->btree_read_bio, 1, bioset_init(&c->btree_read_bio, 1,
offsetof(struct btree_read_bio, bio)) || offsetof(struct btree_read_bio, bio),
bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) || BIOSET_NEED_BVECS) ||
bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) || bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) || BIOSET_NEED_BVECS) ||
bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
BIOSET_NEED_BVECS) ||
bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
BIOSET_NEED_BVECS) ||
mempool_init_page_pool(&c->bio_bounce_pages, mempool_init_page_pool(&c->bio_bounce_pages,
max_t(unsigned, max_t(unsigned,
c->opts.btree_node_size, c->opts.btree_node_size,
@ -588,7 +584,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) || !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
lg_lock_init(&c->usage_lock) || lg_lock_init(&c->usage_lock) ||
mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) ||
bdi_setup_and_register(&c->bdi, "bcachefs") ||
bch2_io_clock_init(&c->io_clock[READ]) || bch2_io_clock_init(&c->io_clock[READ]) ||
bch2_io_clock_init(&c->io_clock[WRITE]) || bch2_io_clock_init(&c->io_clock[WRITE]) ||
bch2_fs_journal_init(&c->journal) || bch2_fs_journal_init(&c->journal) ||
@ -599,10 +594,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_fsio_init(c)) bch2_fs_fsio_init(c))
goto err; goto err;
c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
c->bdi.congested_fn = bch2_congested_fn;
c->bdi.congested_data = c;
mi = bch2_sb_get_members(c->disk_sb); mi = bch2_sb_get_members(c->disk_sb);
for (i = 0; i < c->sb.nr_devices; i++) for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb, mi, i) && if (bch2_dev_exists(c->disk_sb, mi, i) &&
@ -729,8 +720,12 @@ static const char *__bch2_fs_start(struct bch_fs *c)
continue; continue;
err = "error reading btree root"; err = "error reading btree root";
if (bch2_btree_root_read(c, i, k, level)) if (bch2_btree_root_read(c, i, k, level)) {
goto err; if (i != BTREE_ID_ALLOC)
goto err;
mustfix_fsck_err(c, "error reading btree root");
}
} }
err = "error reading allocation information"; err = "error reading allocation information";
@ -830,7 +825,7 @@ static const char *__bch2_fs_start(struct bch_fs *c)
closure_sync(&cl); closure_sync(&cl);
bch2_inode_init(c, &inode, 0, 0, bch2_inode_init(c, &inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0); S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
inode.bi_inum = BCACHEFS_ROOT_INO; inode.bi_inum = BCACHEFS_ROOT_INO;
bch2_inode_pack(&packed_inode, &inode); bch2_inode_pack(&packed_inode, &inode);
@ -877,6 +872,7 @@ out:
bch2_journal_entries_free(&journal); bch2_journal_entries_free(&journal);
return err; return err;
err: err:
fsck_err:
closure_sync(&cl); closure_sync(&cl);
switch (ret) { switch (ret) {
@ -995,24 +991,20 @@ static void bch2_dev_free(struct bch_dev *ca)
kobject_put(&ca->kobj); kobject_put(&ca->kobj);
} }
static void bch2_dev_io_ref_release(struct percpu_ref *ref)
{
struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
complete(&ca->offline_complete);
}
static void __bch2_dev_offline(struct bch_dev *ca) static void __bch2_dev_offline(struct bch_dev *ca)
{ {
struct bch_fs *c = ca->fs; struct bch_fs *c = ca->fs;
lockdep_assert_held(&c->state_lock); lockdep_assert_held(&c->state_lock);
if (percpu_ref_is_zero(&ca->io_ref))
return;
__bch2_dev_read_only(c, ca); __bch2_dev_read_only(c, ca);
reinit_completion(&ca->offline_complete); reinit_completion(&ca->io_ref_completion);
percpu_ref_kill(&ca->io_ref); percpu_ref_kill(&ca->io_ref);
wait_for_completion(&ca->offline_complete); wait_for_completion(&ca->io_ref_completion);
if (ca->kobj.state_in_sysfs) { if (ca->kobj.state_in_sysfs) {
struct kobject *block = struct kobject *block =
@ -1026,27 +1018,18 @@ static void __bch2_dev_offline(struct bch_dev *ca)
bch2_dev_journal_exit(ca); bch2_dev_journal_exit(ca);
} }
static void bch2_dev_ref_release(struct percpu_ref *ref) static void bch2_dev_ref_complete(struct percpu_ref *ref)
{ {
struct bch_dev *ca = container_of(ref, struct bch_dev, ref); struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
complete(&ca->stop_complete); complete(&ca->ref_completion);
} }
static void bch2_dev_stop(struct bch_dev *ca) static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
{ {
struct bch_fs *c = ca->fs; struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
lockdep_assert_held(&c->state_lock); complete(&ca->io_ref_completion);
BUG_ON(rcu_access_pointer(c->devs[ca->dev_idx]) != ca);
rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
synchronize_rcu();
reinit_completion(&ca->stop_complete);
percpu_ref_kill(&ca->ref);
wait_for_completion(&ca->stop_complete);
} }
static int bch2_dev_sysfs_online(struct bch_dev *ca) static int bch2_dev_sysfs_online(struct bch_dev *ca)
@ -1095,8 +1078,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
return -ENOMEM; return -ENOMEM;
kobject_init(&ca->kobj, &bch2_dev_ktype); kobject_init(&ca->kobj, &bch2_dev_ktype);
init_completion(&ca->stop_complete); init_completion(&ca->ref_completion);
init_completion(&ca->offline_complete); init_completion(&ca->io_ref_completion);
ca->dev_idx = dev_idx; ca->dev_idx = dev_idx;
__set_bit(ca->dev_idx, ca->self.d); __set_bit(ca->dev_idx, ca->self.d);
@ -1132,9 +1115,9 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
DIV_ROUND_UP(BTREE_NODE_RESERVE, DIV_ROUND_UP(BTREE_NODE_RESERVE,
ca->mi.bucket_size / c->opts.btree_node_size); ca->mi.bucket_size / c->opts.btree_node_size);
if (percpu_ref_init(&ca->ref, bch2_dev_ref_release, if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
0, GFP_KERNEL) || 0, GFP_KERNEL) ||
percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_release, percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) || PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_BTREE], btree_node_reserve_buckets, !init_fifo(&ca->free[RESERVE_BTREE], btree_node_reserve_buckets,
GFP_KERNEL) || GFP_KERNEL) ||
@ -1155,7 +1138,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
GFP_KERNEL|__GFP_ZERO)) || GFP_KERNEL|__GFP_ZERO)) ||
!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) || !(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
bioset_init(&ca->replica_set, 4, bioset_init(&ca->replica_set, 4,
offsetof(struct bch_write_bio, bio)) || offsetof(struct bch_write_bio, bio), 0) ||
!(ca->io_done = alloc_percpu(*ca->io_done))) !(ca->io_done = alloc_percpu(*ca->io_done)))
goto err; goto err;
@ -1180,8 +1163,6 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
struct bch_dev *ca; struct bch_dev *ca;
int ret; int ret;
lockdep_assert_held(&c->sb_lock);
if (le64_to_cpu(sb->sb->seq) > if (le64_to_cpu(sb->sb->seq) >
le64_to_cpu(c->disk_sb->seq)) le64_to_cpu(c->disk_sb->seq))
bch2_sb_to_fs(c, sb->sb); bch2_sb_to_fs(c, sb->sb);
@ -1189,13 +1170,15 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
!c->devs[sb->sb->dev_idx]); !c->devs[sb->sb->dev_idx]);
ca = c->devs[sb->sb->dev_idx]; ca = bch_dev_locked(c, sb->sb->dev_idx);
if (ca->disk_sb.bdev) { if (ca->disk_sb.bdev) {
bch_err(c, "already have device online in slot %u", bch_err(c, "already have device online in slot %u",
sb->sb->dev_idx); sb->sb->dev_idx);
return -EINVAL; return -EINVAL;
} }
BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
ret = bch2_dev_journal_init(ca, sb->sb); ret = bch2_dev_journal_init(ca, sb->sb);
if (ret) if (ret)
return ret; return ret;
@ -1222,7 +1205,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
if (bch2_dev_sysfs_online(ca)) if (bch2_dev_sysfs_online(ca))
pr_warn("error creating sysfs objects"); pr_warn("error creating sysfs objects");
bch2_mark_dev_superblock(c, ca, 0); bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
if (ca->mi.state == BCH_MEMBER_STATE_RW) if (ca->mi.state == BCH_MEMBER_STATE_RW)
bch2_dev_allocator_add(c, ca); bch2_dev_allocator_add(c, ca);
@ -1293,6 +1276,7 @@ static bool bch2_fs_may_start(struct bch_fs *c)
{ {
struct replicas_status s; struct replicas_status s;
struct bch_sb_field_members *mi; struct bch_sb_field_members *mi;
struct bch_dev *ca;
unsigned i, flags = c->opts.degraded unsigned i, flags = c->opts.degraded
? BCH_FORCE_IF_DEGRADED ? BCH_FORCE_IF_DEGRADED
: 0; : 0;
@ -1301,14 +1285,19 @@ static bool bch2_fs_may_start(struct bch_fs *c)
mutex_lock(&c->sb_lock); mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb); mi = bch2_sb_get_members(c->disk_sb);
for (i = 0; i < c->disk_sb->nr_devices; i++) for (i = 0; i < c->disk_sb->nr_devices; i++) {
if (bch2_dev_exists(c->disk_sb, mi, i) && if (!bch2_dev_exists(c->disk_sb, mi, i))
!bch2_dev_is_online(c->devs[i]) && continue;
(c->devs[i]->mi.state == BCH_MEMBER_STATE_RW ||
c->devs[i]->mi.state == BCH_MEMBER_STATE_RO)) { ca = bch_dev_locked(c, i);
if (!bch2_dev_is_online(ca) &&
(ca->mi.state == BCH_MEMBER_STATE_RW ||
ca->mi.state == BCH_MEMBER_STATE_RO)) {
mutex_unlock(&c->sb_lock); mutex_unlock(&c->sb_lock);
return false; return false;
} }
}
mutex_unlock(&c->sb_lock); mutex_unlock(&c->sb_lock);
} }
@ -1419,22 +1408,59 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
* *
* flag_data_bad() does not check btree pointers * flag_data_bad() does not check btree pointers
*/ */
ret = bch2_flag_data_bad(ca); ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
if (ret) { if (ret) {
bch_err(ca, "Remove failed"); bch_err(ca, "Remove failed: error %i dropping data", ret);
goto err;
}
ret = bch2_journal_flush_device(&c->journal, ca->dev_idx);
if (ret) {
bch_err(ca, "Remove failed: error %i flushing journal", ret);
goto err; goto err;
} }
data = bch2_dev_has_data(c, ca); data = bch2_dev_has_data(c, ca);
if (data) { if (data) {
bch_err(ca, "Remove failed, still has data (%x)", data); char data_has_str[100];
bch2_scnprint_flag_list(data_has_str,
sizeof(data_has_str),
bch2_data_types,
data);
bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
ret = -EBUSY;
goto err; goto err;
} }
bch2_journal_meta(&c->journal); ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC,
POS(ca->dev_idx, 0),
POS(ca->dev_idx + 1, 0),
ZERO_VERSION,
NULL, NULL, NULL);
if (ret) {
bch_err(ca, "Remove failed, error deleting alloc info");
goto err;
}
/*
* must flush all existing journal entries, they might have
* (overwritten) keys that point to the device we're removing:
*/
ret = bch2_journal_flush_all_pins(&c->journal);
if (ret) {
bch_err(ca, "Remove failed, journal error");
goto err;
}
__bch2_dev_offline(ca); __bch2_dev_offline(ca);
bch2_dev_stop(ca);
mutex_lock(&c->sb_lock);
rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
mutex_unlock(&c->sb_lock);
percpu_ref_kill(&ca->ref);
wait_for_completion(&ca->ref_completion);
bch2_dev_free(ca); bch2_dev_free(ca);
/* /*
@ -1542,7 +1568,7 @@ have_slot:
bch2_write_super(c); bch2_write_super(c);
mutex_unlock(&c->sb_lock); mutex_unlock(&c->sb_lock);
ca = c->devs[dev_idx]; ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) { if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = "journal alloc failed"; err = "journal alloc failed";
if (bch2_dev_journal_alloc(ca)) if (bch2_dev_journal_alloc(ca))
@ -1568,7 +1594,7 @@ err:
/* Hot add existing device to running filesystem: */ /* Hot add existing device to running filesystem: */
int bch2_dev_online(struct bch_fs *c, const char *path) int bch2_dev_online(struct bch_fs *c, const char *path)
{ {
struct bch_sb_handle sb = { 0 }; struct bch_sb_handle sb = { NULL };
struct bch_dev *ca; struct bch_dev *ca;
unsigned dev_idx; unsigned dev_idx;
const char *err; const char *err;
@ -1593,7 +1619,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
} }
mutex_unlock(&c->sb_lock); mutex_unlock(&c->sb_lock);
ca = c->devs[dev_idx]; ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) { if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = __bch2_dev_read_write(c, ca); err = __bch2_dev_read_write(c, ca);
if (err) if (err)
@ -1619,7 +1645,6 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
return -EINVAL; return -EINVAL;
} }
__bch2_dev_read_only(c, ca);
__bch2_dev_offline(ca); __bch2_dev_offline(ca);
mutex_unlock(&c->state_lock); mutex_unlock(&c->state_lock);
@ -1629,37 +1654,31 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca) int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
{ {
unsigned data; unsigned data;
int ret; int ret = 0;
mutex_lock(&c->state_lock); mutex_lock(&c->state_lock);
if (ca->mi.state == BCH_MEMBER_STATE_RW) { if (ca->mi.state == BCH_MEMBER_STATE_RW) {
bch_err(ca, "Cannot migrate data off RW device"); bch_err(ca, "Cannot migrate data off RW device");
mutex_unlock(&c->state_lock); ret = -EINVAL;
return -EINVAL; goto err;
} }
mutex_unlock(&c->state_lock); ret = bch2_dev_data_migrate(c, ca, 0);
ret = bch2_move_data_off_device(ca);
if (ret) { if (ret) {
bch_err(ca, "Error migrating data: %i", ret); bch_err(ca, "Error migrating data: %i", ret);
return ret; goto err;
}
ret = bch2_move_metadata_off_device(ca);
if (ret) {
bch_err(ca, "Error migrating metadata: %i", ret);
return ret;
} }
data = bch2_dev_has_data(c, ca); data = bch2_dev_has_data(c, ca);
if (data) { if (data) {
bch_err(ca, "Migrate error: data still present (%x)", data); bch_err(ca, "Migrate error: data still present (%x)", data);
return -EINVAL; ret = -EINVAL;
goto err;
} }
err:
return 0; mutex_unlock(&c->state_lock);
return ret;
} }
/* Filesystem open: */ /* Filesystem open: */
@ -59,6 +59,14 @@ static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
} }
} }
static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
unsigned dev)
{
BUG_ON(bch2_dev_list_has_dev(*devs, dev));
BUG_ON(devs->nr >= BCH_REPLICAS_MAX);
devs->devs[devs->nr++] = dev;
}
static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
struct bch_devs_mask *mask) struct bch_devs_mask *mask)
{ {
@ -131,6 +139,26 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
__for_each_online_member(ca, c, iter, \ __for_each_online_member(ca, c, iter, \
(1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO)) (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO))
/*
* If a key exists that references a device, the device won't be going away and
* we can omit rcu_read_lock():
*/
static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
{
EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
return rcu_dereference_check(c->devs[idx], 1);
}
static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
{
EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
return rcu_dereference_protected(c->devs[idx],
lockdep_is_held(&c->sb_lock) ||
lockdep_is_held(&c->state_lock));
}
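For illustration only (not part of this commit), a minimal sketch of when each accessor above applies; the function name and the extent-pointer argument are assumptions:

static void accessor_example(struct bch_fs *c, const struct bch_extent_ptr *ptr)
{
	struct bch_dev *ca;

	/* a key referencing ptr->dev pins the device, so no rcu_read_lock() needed: */
	ca = bch_dev_bkey_exists(c, ptr->dev);
	pr_debug("dev %u tier %u", ptr->dev, ca->mi.tier);

	/* otherwise, hold one of the locks the lockdep check accepts: */
	mutex_lock(&c->state_lock);
	ca = bch_dev_locked(c, ptr->dev);
	pr_debug("dev %u state %u", ptr->dev, ca->mi.state);
	mutex_unlock(&c->state_lock);
}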
/* XXX kill, move to struct bch_fs */ /* XXX kill, move to struct bch_fs */
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
{ {
@ -146,7 +174,7 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
struct bch_fs *bch2_bdev_to_fs(struct block_device *); struct bch_fs *bch2_bdev_to_fs(struct block_device *);
struct bch_fs *bch2_uuid_to_fs(uuid_le); struct bch_fs *bch2_uuid_to_fs(uuid_le);
int bch2_congested(struct bch_fs *, int); int bch2_congested(void *, int);
bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
enum bch_member_state, int); enum bch_member_state, int);
@ -739,7 +739,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
c->open_buckets_wait.list.first ? "waiting" : "empty"); c->open_buckets_wait.list.first ? "waiting" : "empty");
} }
const char * const bch2_rw[] = { static const char * const bch2_rw[] = {
"read", "read",
"write", "write",
NULL NULL
@ -6,7 +6,6 @@
#include "clock.h" #include "clock.h"
#include "extents.h" #include "extents.h"
#include "io.h" #include "io.h"
#include "keylist.h"
#include "move.h" #include "move.h"
#include "super-io.h" #include "super-io.h"
#include "tier.h" #include "tier.h"
@ -28,7 +27,7 @@ static bool tiering_pred(void *arg, struct bkey_s_c_extent e)
return false; return false;
extent_for_each_ptr(e, ptr) extent_for_each_ptr(e, ptr)
if (c->devs[ptr->dev]->mi.tier >= tier->idx) if (bch_dev_bkey_exists(c, ptr->dev)->mi.tier >= tier->idx)
replicas++; replicas++;
return replicas < c->opts.data_replicas; return replicas < c->opts.data_replicas;
@ -34,8 +34,12 @@ struct closure;
#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) #define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0)
#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0) #define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0)
#define memcpy(_dst, _src, _len) \ #define memcpy(dst, src, len) \
({ \ ({ \
void *_dst = (dst); \
const void *_src = (src); \
size_t _len = (len); \
\
BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \ BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \
(void *) (_dst) + (_len) <= (void *) (_src))); \ (void *) (_dst) + (_len) <= (void *) (_src))); \
memcpy(_dst, _src, _len); \ memcpy(_dst, _src, _len); \
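A small, hypothetical example (not in the commit) of what the reworked macro above guards against; it only illustrates the single-evaluation and overlap checks:

static void memcpy_macro_example(void)
{
	char dst[8], src[8] = "abcdefg";
	char *p = src;

	/*
	 * _dst/_src/_len are now evaluated exactly once, so an argument with
	 * side effects (p++) is no longer incremented several times:
	 */
	memcpy(dst, p++, 4);

	/* overlapping source and destination would trip the BUG_ON: */
	/* memcpy(src + 1, src, 4); */
}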
@ -9,10 +9,10 @@
*/ */
#define __vstruct_u64s(_s) \ #define __vstruct_u64s(_s) \
({ \ ({ \
( type_is((_s)->u64s, u64) ? le64_to_cpu((_s)->u64s) \ ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \
: type_is((_s)->u64s, u32) ? le32_to_cpu((_s)->u64s) \ : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \
: type_is((_s)->u64s, u16) ? le16_to_cpu((_s)->u64s) \ : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s) \
: ((_s)->u64s)); \ : ((__force u8) ((_s)->u64s))); \
}) })
#define __vstruct_bytes(_type, _u64s) \ #define __vstruct_bytes(_type, _u64s) \
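As a rough sketch of the callers this macro serves (the struct below is hypothetical, not from the commit): a variable-length on-disk structure stores its size, in u64 units, in a little-endian field, and __vstruct_u64s() picks the matching leXX_to_cpu() at compile time; the new __force casts only silence sparse's bitwise-type warnings:

struct example_vstruct {
	__le16	u64s;	/* total size of the struct, in units of u64 */
	__u8	data[];
};

static size_t example_vstruct_bytes(const struct example_vstruct *s)
{
	return __vstruct_u64s(s) * sizeof(u64);
}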
@ -2,6 +2,7 @@
#include "bcachefs.h" #include "bcachefs.h"
#include "bkey_methods.h" #include "bkey_methods.h"
#include "btree_update.h" #include "btree_update.h"
#include "compress.h"
#include "extents.h" #include "extents.h"
#include "fs.h" #include "fs.h"
#include "str_hash.h" #include "str_hash.h"
@ -358,6 +359,129 @@ static const struct xattr_handler bch_xattr_security_handler = {
.flags = BCH_XATTR_INDEX_SECURITY, .flags = BCH_XATTR_INDEX_SECURITY,
}; };
#ifndef NO_BCACHEFS_FS
static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *vinode,
const char *name, void *buffer, size_t size)
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_opts opts =
bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode));
const struct bch_option *opt;
int ret, id;
u64 v;
id = bch2_opt_lookup(name);
if (id < 0 || !bch2_opt_is_inode_opt(id))
return -EINVAL;
opt = bch2_opt_table + id;
if (!bch2_opt_defined_by_id(&opts, id))
return -ENODATA;
v = bch2_opt_get_by_id(&opts, id);
if (opt->type == BCH_OPT_STR)
ret = snprintf(buffer, size, "%s", opt->choices[v]);
else
ret = snprintf(buffer, size, "%llu", v);
return ret <= size || !buffer ? ret : -ERANGE;
}
struct inode_opt_set {
int id;
u64 v;
bool defined;
};
static int inode_opt_set_fn(struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
struct inode_opt_set *s = p;
if (s->defined)
bch2_inode_opt_set(bi, s->id, s->v);
else
bch2_inode_opt_clear(bi, s->id);
return 0;
}
static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *vinode,
const char *name, const void *value,
size_t size, int flags)
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
const struct bch_option *opt;
char *buf;
struct inode_opt_set s;
int ret;
s.id = bch2_opt_lookup(name);
if (s.id < 0 || !bch2_opt_is_inode_opt(s.id))
return -EINVAL;
opt = bch2_opt_table + s.id;
if (value) {
buf = kmalloc(size + 1, GFP_KERNEL);
if (!buf)
return -ENOMEM;
memcpy(buf, value, size);
buf[size] = '\0';
ret = bch2_opt_parse(opt, buf, &s.v);
kfree(buf);
if (ret < 0)
return ret;
if (s.id == Opt_compression) {
mutex_lock(&c->sb_lock);
ret = bch2_check_set_has_compressed_data(c, s.v);
mutex_unlock(&c->sb_lock);
if (ret)
return ret;
}
s.defined = true;
} else {
s.defined = false;
}
mutex_lock(&inode->ei_update_lock);
ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s);
mutex_unlock(&inode->ei_update_lock);
return ret;
}
static const struct xattr_handler bch_xattr_bcachefs_handler = {
.prefix = "bcachefs.",
.get = bch2_xattr_bcachefs_get,
.set = bch2_xattr_bcachefs_set,
};
#endif /* NO_BCACHEFS_FS */
const struct xattr_handler *bch2_xattr_handlers[] = {
&bch_xattr_user_handler,
&posix_acl_access_xattr_handler,
&posix_acl_default_xattr_handler,
&bch_xattr_trusted_handler,
&bch_xattr_security_handler,
#ifndef NO_BCACHEFS_FS
&bch_xattr_bcachefs_handler,
#endif
NULL
};
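For context, a hedged userspace sketch of how the new "bcachefs." handler is typically exercised; the mount path and the "compression" option value are assumptions, not taken from this commit:

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/bcachefs/file";	/* assumed mount point */
	char buf[64];
	ssize_t ret;

	/* set the per-inode compression option through the xattr interface: */
	if (setxattr(path, "bcachefs.compression", "lz4", strlen("lz4"), 0))
		perror("setxattr");

	/* read it back; the get handler formats the option as a string: */
	ret = getxattr(path, "bcachefs.compression", buf, sizeof(buf));
	if (ret < 0)
		perror("getxattr");
	else
		printf("compression: %.*s\n", (int) ret, buf);

	return 0;
}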
static const struct xattr_handler *bch_xattr_handler_map[] = { static const struct xattr_handler *bch_xattr_handler_map[] = {
[BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler, [BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler,
[BCH_XATTR_INDEX_POSIX_ACL_ACCESS] = [BCH_XATTR_INDEX_POSIX_ACL_ACCESS] =
@ -368,15 +492,6 @@ static const struct xattr_handler *bch_xattr_handler_map[] = {
[BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, [BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
}; };
const struct xattr_handler *bch2_xattr_handlers[] = {
&bch_xattr_user_handler,
&posix_acl_access_xattr_handler,
&posix_acl_default_xattr_handler,
&bch_xattr_trusted_handler,
&bch_xattr_security_handler,
NULL
};
static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type) static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type)
{ {
return type < ARRAY_SIZE(bch_xattr_handler_map) return type < ARRAY_SIZE(bch_xattr_handler_map)
@ -19,7 +19,38 @@
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/export.h>
static const struct {
int err;
const char *name;
} blk_errors[] = {
[BLK_STS_OK] = { 0, "" },
[BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" },
[BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" },
[BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" },
[BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" },
[BLK_STS_TARGET] = { -EREMOTEIO, "critical target" },
[BLK_STS_NEXUS] = { -EBADE, "critical nexus" },
[BLK_STS_MEDIUM] = { -ENODATA, "critical medium" },
[BLK_STS_PROTECTION] = { -EILSEQ, "protection" },
[BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" },
[BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" },
/* device mapper special case, should not leak out: */
[BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" },
/* everything else not covered above: */
[BLK_STS_IOERR] = { -EIO, "I/O" },
};
int blk_status_to_errno(blk_status_t status)
{
int idx = (__force int)status;
if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
return -EIO;
return blk_errors[idx].err;
}
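A short, hypothetical caller (not from the commit) showing how the table is consumed: completion handlers look at bio->bi_status, and code that needs a classic errno converts it with blk_status_to_errno():

static void example_endio(struct bio *bio)
{
	if (bio->bi_status) {
		int err = blk_status_to_errno(bio->bi_status);

		fprintf(stderr, "I/O error: %s\n", strerror(-err));
	}

	bio_put(bio);
}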
void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter) struct bio *src, struct bvec_iter *src_iter)
@ -199,8 +230,8 @@ static struct bio *__bio_chain_endio(struct bio *bio)
{ {
struct bio *parent = bio->bi_private; struct bio *parent = bio->bi_private;
if (!parent->bi_error) if (!parent->bi_status)
parent->bi_error = bio->bi_error; parent->bi_status = bio->bi_status;
bio_put(bio); bio_put(bio);
return parent; return parent;
} }
@ -233,27 +264,6 @@ again:
bio->bi_end_io(bio); bio->bi_end_io(bio);
} }
void bio_endio_nodec(struct bio *bio)
{
goto nodec;
while (bio) {
if (unlikely(!bio_remaining_done(bio)))
break;
nodec:
if (bio->bi_end_io == bio_chain_endio) {
struct bio *parent = bio->bi_private;
parent->bi_error = bio->bi_error;
bio_put(bio);
bio = parent;
} else {
if (bio->bi_end_io)
bio->bi_end_io(bio);
bio = NULL;
}
}
}
void bio_reset(struct bio *bio) void bio_reset(struct bio *bio)
{ {
unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
@ -32,7 +32,7 @@ void generic_make_request(struct bio *bio)
ret = fdatasync(bio->bi_bdev->bd_fd); ret = fdatasync(bio->bi_bdev->bd_fd);
if (ret) { if (ret) {
fprintf(stderr, "fsync error: %m\n"); fprintf(stderr, "fsync error: %m\n");
bio->bi_error = -EIO; bio->bi_status = BLK_STS_IOERR;
bio_endio(bio); bio_endio(bio);
return; return;
} }
@ -106,7 +106,7 @@ int submit_bio_wait(struct bio *bio)
submit_bio(bio); submit_bio(bio);
wait_for_completion(&done); wait_for_completion(&done);
return bio->bi_error; return blk_status_to_errno(bio->bi_status);
} }
int blkdev_issue_discard(struct block_device *bdev, int blkdev_issue_discard(struct block_device *bdev,
@ -235,10 +235,8 @@ static int aio_completion_thread(void *arg)
for (ev = events; ev < events + ret; ev++) { for (ev = events; ev < events + ret; ev++) {
struct bio *bio = (struct bio *) ev->data; struct bio *bio = (struct bio *) ev->data;
if (ev->res < 0) if (ev->res != bio->bi_iter.bi_size)
bio->bi_error = ev->res; bio->bi_status = BLK_STS_IOERR;
else if (ev->res != bio->bi_iter.bi_size)
bio->bi_error = -EIO;
bio_endio(bio); bio_endio(bio);
} }