Update bcachefs sources to 14ce2a2031 bcachefs: fixes for building in userspace

Kent Overstreet 2017-12-21 18:00:30 -05:00
parent 8acc54456e
commit 1cf4d51dc4
61 changed files with 2074 additions and 1442 deletions


@ -1 +1 @@
e57b5958cf4e8530d26f7c36a6e1427fb284cc70
14ce2a2031f3761a4b957aa2e5aac446ce18b87c


@ -293,11 +293,11 @@ int cmd_list(int argc, char *argv[])
list_modes, "list mode");
break;
case 'f':
opts.fix_errors = FSCK_ERR_YES;
opts.norecovery = false;
opt_set(opts, fix_errors, FSCK_OPT_YES);
opt_set(opts, norecovery, false);
break;
case 'v':
opts.verbose_recovery = true;
opt_set(opts, verbose_recovery, true);
break;
case 'h':
list_keys_usage();


@ -28,18 +28,19 @@ int cmd_fsck(int argc, char *argv[])
int opt;
opt_set(opts, degraded, true);
opt_set(opts, fix_errors, FSCK_OPT_ASK);
while ((opt = getopt(argc, argv, "pynfvh")) != -1)
switch (opt) {
case 'p':
opt_set(opts, fix_errors, FSCK_ERR_YES);
opt_set(opts, fix_errors, FSCK_OPT_YES);
break;
case 'y':
opt_set(opts, fix_errors, FSCK_ERR_YES);
opt_set(opts, fix_errors, FSCK_OPT_YES);
break;
case 'n':
opt_set(opts, nochanges, true);
opt_set(opts, fix_errors, FSCK_ERR_NO);
opt_set(opts, fix_errors, FSCK_OPT_NO);
break;
case 'f':
/* force check, even if filesystem marked clean: */


@ -164,7 +164,7 @@ static struct bch_inode_unpacked create_file(struct bch_fs *c,
struct bch_inode_unpacked new_inode;
int ret;
bch2_inode_init(c, &new_inode, uid, gid, mode, rdev);
bch2_inode_init(c, &new_inode, uid, gid, mode, rdev, parent);
ret = bch2_inode_create(c, &new_inode, BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint);
@ -247,7 +247,6 @@ static void write_data(struct bch_fs *c,
struct bch_inode_unpacked *dst_inode,
u64 dst_offset, void *buf, size_t len)
{
struct disk_reservation res;
struct bch_write_op op;
struct bio_vec bv;
struct closure cl;
@ -261,12 +260,15 @@ static void write_data(struct bch_fs *c,
op.wbio.bio.bi_iter.bi_size = len;
bch2_bio_map(&op.wbio.bio, buf);
int ret = bch2_disk_reservation_get(c, &res, len >> 9, 0);
bch2_write_op_init(&op, c);
op.write_point = writepoint_hashed(0);
op.pos = POS(dst_inode->bi_inum, dst_offset >> 9);
int ret = bch2_disk_reservation_get(c, &op.res, len >> 9, 0);
if (ret)
die("error reserving space in new filesystem: %s", strerror(-ret));
bch2_write_op_init(&op, c, res, NULL, writepoint_hashed(0),
POS(dst_inode->bi_inum, dst_offset >> 9), NULL, 0);
closure_call(&op.cl, bch2_write, NULL, &cl);
closure_sync(&cl);


@ -243,7 +243,8 @@ static inline void bioset_free(struct bio_set *bs)
static inline int bioset_init(struct bio_set *bs,
unsigned pool_size,
unsigned front_pad)
unsigned front_pad,
int flags)
{
bs->front_pad = front_pad;
return 0;
@ -251,6 +252,10 @@ static inline int bioset_init(struct bio_set *bs,
extern struct bio_set *bioset_create(unsigned int, unsigned int);
extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int);
enum {
BIOSET_NEED_BVECS = 1 << 0,
BIOSET_NEED_RESCUER = 1 << 1,
};
extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
extern void bio_put(struct bio *);
@ -271,13 +276,6 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
}
extern void bio_endio(struct bio *);
extern void bio_endio_nodec(struct bio *);
static inline void bio_io_error(struct bio *bio)
{
bio->bi_error = -EIO;
bio_endio(bio);
}
extern void bio_advance(struct bio *, unsigned);
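For illustration, a minimal caller sketch (not part of this commit) against the updated userspace shim, assuming the four-argument bioset_init() and the BIOSET_NEED_BVECS flag shown above:

static struct bio_set example_bio_set;

static int example_bioset_setup(void)
{
	/* pool of 128 bios, no front padding, bvec pools requested: */
	return bioset_init(&example_bio_set, 128, 0, BIOSET_NEED_BVECS);
}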


@ -13,7 +13,27 @@ struct bio_set;
struct bio;
struct block_device;
typedef void (bio_end_io_t) (struct bio *);
typedef void (bio_destructor_t) (struct bio *);
/*
* Block error status values. See block/blk-core:blk_errors for the details.
*/
typedef u8 __bitwise blk_status_t;
#define BLK_STS_OK 0
#define BLK_STS_NOTSUPP ((__force blk_status_t)1)
#define BLK_STS_TIMEOUT ((__force blk_status_t)2)
#define BLK_STS_NOSPC ((__force blk_status_t)3)
#define BLK_STS_TRANSPORT ((__force blk_status_t)4)
#define BLK_STS_TARGET ((__force blk_status_t)5)
#define BLK_STS_NEXUS ((__force blk_status_t)6)
#define BLK_STS_MEDIUM ((__force blk_status_t)7)
#define BLK_STS_PROTECTION ((__force blk_status_t)8)
#define BLK_STS_RESOURCE ((__force blk_status_t)9)
#define BLK_STS_IOERR ((__force blk_status_t)10)
/* hack for device mapper, don't use elsewhere: */
#define BLK_STS_DM_REQUEUE ((__force blk_status_t)11)
#define BLK_STS_AGAIN ((__force blk_status_t)12)
/*
* main unit of I/O for the block layer and lower layers (ie drivers and
@ -22,7 +42,7 @@ typedef void (bio_destructor_t) (struct bio *);
struct bio {
struct bio *bi_next; /* request queue link */
struct block_device *bi_bdev;
int bi_error;
blk_status_t bi_status;
unsigned int bi_opf; /* bottom bits req flags,
* top bits REQ_OP. Use
* accessors.


@ -197,5 +197,8 @@ static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx)
#define capable(cap) true
int blk_status_to_errno(blk_status_t status);
blk_status_t errno_to_blk_status(int errno);
#endif /* __TOOLS_LINUX_BLKDEV_H */
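As a usage sketch (assumption, not code from this diff), a completion handler built against these shims would translate the new blk_status_t field back into an errno:

static void example_read_endio(struct bio *bio)
{
	int err = blk_status_to_errno(bio->bi_status);

	if (err) {
		/* e.g. -EIO for BLK_STS_IOERR; handle_read_error() is hypothetical */
		handle_read_error(err);
	}

	bio_put(bio);
}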


@ -14,7 +14,7 @@
#define BUG() do { assert(0); unreachable(); } while (0)
#define BUG_ON(cond) assert(!(cond))
#define WARN_ON_ONCE(cond) assert(!(cond))
#define WARN_ON_ONCE(cond) ({ bool _r = (cond); if (_r) assert(0); _r; })
#define WARN_ONCE(cond, msg) ({ bool _r = (cond); if (_r) assert(0); _r; })
#define __WARN() assert(0)
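A hypothetical caller, showing why WARN_ON_ONCE() now evaluates to the condition rather than void — kernel-style code routinely branches on its result:

static int example_check_len(size_t len)
{
	/* asserts in the userspace build, and still returns the condition: */
	if (WARN_ON_ONCE(len > (1U << 20)))
		return -EINVAL;

	return 0;
}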


@ -204,4 +204,19 @@ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns)
extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
const struct timespec64 rhs);
static inline struct timespec timespec_trunc(struct timespec t, unsigned gran)
{
/* Avoid division in the common cases 1 ns and 1 s. */
if (gran == 1) {
/* nothing */
} else if (gran == NSEC_PER_SEC) {
t.tv_nsec = 0;
} else if (gran > 1 && gran < NSEC_PER_SEC) {
t.tv_nsec -= t.tv_nsec % gran;
} else {
WARN(1, "illegal file time granularity: %u", gran);
}
return t;
}
#endif /* _LINUX_TIME64_H */
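A small usage sketch (assumption), mirroring how bch2_inode_init() later in this commit truncates timestamps to the filesystem's time precision:

static struct timespec example_fs_now(unsigned time_precision_ns)
{
	/* gran == 1 leaves the value untouched; NSEC_PER_SEC drops tv_nsec */
	return timespec_trunc(current_kernel_time(), time_precision_ns);
}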


@ -193,8 +193,7 @@ int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
if (ret < 0)
return ret;
else {
inode->v.i_ctime =
current_fs_time(inode->v.i_sb);
inode->v.i_ctime = current_time(&inode->v);
mark_inode_dirty(&inode->v);
if (ret == 0)
acl = NULL;


@ -257,7 +257,7 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
return;
a = bkey_s_c_to_alloc(k);
ca = c->devs[a.k->p.inode];
ca = bch_dev_bkey_exists(c, a.k->p.inode);
if (a.k->p.offset >= ca->mi.nbuckets)
return;
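Throughout this commit, direct c->devs[...] dereferences are replaced with bch_dev_bkey_exists(). The helper is defined in a header that is not part of this diff; conceptually it is a device lookup that asserts the device referenced by an existing key must be present — roughly along these lines (sketch, assumption):

static inline struct bch_dev *example_dev_bkey_exists(struct bch_fs *c, unsigned idx)
{
	/* a device index stored in an on-disk key must refer to a live device: */
	BUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);

	return c->devs[idx];
}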
@ -305,10 +305,12 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
bch2_alloc_read_key(c, bkey_i_to_s_c(k));
}
mutex_lock(&c->bucket_lock);
for_each_member_device(ca, c, i) {
bch2_recalc_min_prio(c, ca, READ);
bch2_recalc_min_prio(c, ca, WRITE);
}
mutex_unlock(&c->bucket_lock);
return 0;
}
@ -368,7 +370,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
return 0;
ca = c->devs[pos.inode];
ca = bch_dev_bkey_exists(c, pos.inode);
if (pos.offset >= ca->mi.nbuckets)
return 0;
@ -461,7 +463,7 @@ static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
/* Bucket heap / gen */
void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
static void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
{
struct prio_clock *clock = &c->prio_clock[rw];
struct bucket *g;
@ -975,7 +977,7 @@ static int bch2_allocator_thread(void *arg)
void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
struct bch_dev *ca = c->devs[ob->ptr.dev];
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
spin_lock(&ob->lock);
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), false,
@ -1303,7 +1305,7 @@ static void writepoint_drop_ptrs(struct bch_fs *c,
for (i = wp->nr_ptrs - 1; i >= 0; --i) {
struct open_bucket *ob = wp->ptrs[i];
struct bch_dev *ca = c->devs[ob->ptr.dev];
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
if (nr_ptrs_dislike && !test_bit(ob->ptr.dev, devs->d)) {
BUG_ON(ca->open_buckets_partial_nr >=
@ -1331,7 +1333,7 @@ static void verify_not_stale(struct bch_fs *c, const struct write_point *wp)
unsigned i;
writepoint_for_each_ptr(wp, ob, i) {
struct bch_dev *ca = c->devs[ob->ptr.dev];
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
BUG_ON(ptr_stale(ca, &ob->ptr));
}
@ -1537,7 +1539,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
for (i = 0; i < wp->nr_ptrs_can_use; i++) {
struct open_bucket *ob = wp->ptrs[i];
struct bch_dev *ca = c->devs[ob->ptr.dev];
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
struct bch_extent_ptr tmp = ob->ptr;
EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptr.dev));
@ -1589,7 +1591,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
ra_pages += bdi->ra_pages;
}
c->bdi.ra_pages = ra_pages;
bch2_set_ra_pages(c, ra_pages);
/* Find fastest, slowest tiers with devices: */


@ -326,9 +326,9 @@ struct io_count {
struct bch_dev {
struct kobject kobj;
struct percpu_ref ref;
struct completion ref_completion;
struct percpu_ref io_ref;
struct completion stop_complete;
struct completion offline_complete;
struct completion io_ref_completion;
struct bch_fs *fs;
@ -515,12 +515,11 @@ struct bch_fs {
struct closure sb_write;
struct mutex sb_lock;
struct backing_dev_info bdi;
/* BTREE CACHE */
struct bio_set btree_read_bio;
struct btree_root btree_roots[BTREE_ID_NR];
bool btree_roots_dirty;
struct mutex btree_root_lock;
struct btree_cache btree_cache;
@ -710,6 +709,14 @@ struct bch_fs {
#undef BCH_TIME_STAT
};
static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
{
#ifndef NO_BCACHEFS_FS
if (c->vfs_sb)
c->vfs_sb->s_bdi->ra_pages = ra_pages;
#endif
}
static inline bool bch2_fs_running(struct bch_fs *c)
{
return c->state == BCH_FS_RO || c->state == BCH_FS_RW;


@ -593,18 +593,24 @@ struct bch_inode_generation {
} __attribute__((packed, aligned(8)));
BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION);
#define BCH_INODE_FIELDS() \
BCH_INODE_FIELD(bi_atime, 64) \
BCH_INODE_FIELD(bi_ctime, 64) \
BCH_INODE_FIELD(bi_mtime, 64) \
BCH_INODE_FIELD(bi_otime, 64) \
BCH_INODE_FIELD(bi_size, 64) \
BCH_INODE_FIELD(bi_sectors, 64) \
BCH_INODE_FIELD(bi_uid, 32) \
BCH_INODE_FIELD(bi_gid, 32) \
BCH_INODE_FIELD(bi_nlink, 32) \
BCH_INODE_FIELD(bi_generation, 32) \
BCH_INODE_FIELD(bi_dev, 32)
#define BCH_INODE_FIELDS() \
BCH_INODE_FIELD(bi_atime, 64) \
BCH_INODE_FIELD(bi_ctime, 64) \
BCH_INODE_FIELD(bi_mtime, 64) \
BCH_INODE_FIELD(bi_otime, 64) \
BCH_INODE_FIELD(bi_size, 64) \
BCH_INODE_FIELD(bi_sectors, 64) \
BCH_INODE_FIELD(bi_uid, 32) \
BCH_INODE_FIELD(bi_gid, 32) \
BCH_INODE_FIELD(bi_nlink, 32) \
BCH_INODE_FIELD(bi_generation, 32) \
BCH_INODE_FIELD(bi_dev, 32) \
BCH_INODE_FIELD(bi_data_checksum, 8) \
BCH_INODE_FIELD(bi_compression, 8)
#define BCH_INODE_FIELDS_INHERIT() \
BCH_INODE_FIELD(bi_data_checksum) \
BCH_INODE_FIELD(bi_compression)
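These x-macro lists are consumed by defining BCH_INODE_FIELD and expanding the list; for instance, bch2_inode_init() later in this commit uses the inherit list to copy per-inode options from the parent directory:

#define BCH_INODE_FIELD(_name)	inode_u->_name = parent->_name;
	BCH_INODE_FIELDS_INHERIT()
#undef BCH_INODE_FIELD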
enum {
/*
@ -794,7 +800,7 @@ struct bch_sb_layout {
__u8 sb_max_size_bits; /* base 2 of 512 byte sectors */
__u8 nr_superblocks;
__u8 pad[5];
__u64 sb_offset[61];
__le64 sb_offset[61];
} __attribute__((packed, aligned(8)));
#define BCH_SB_LAYOUT_SECTOR 7
@ -1089,6 +1095,11 @@ struct jset_entry {
};
};
struct jset_entry_blacklist {
struct jset_entry entry;
__le64 seq;
};
#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
enum {


@ -1,6 +1,7 @@
#include "bcachefs.h"
#include "bkey.h"
#include "bkey_methods.h"
#include "bset.h"
#include "util.h"
@ -80,37 +81,6 @@ static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
const struct bkey_format *format) {}
#endif
int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
{
char *out = buf, *end = buf + size;
#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
p("u64s %u type %u %llu:%llu snap %u len %u ver %llu",
k->u64s, k->type, k->p.inode, k->p.offset,
k->p.snapshot, k->size, k->version.lo);
BUG_ON(bkey_packed(k));
switch (k->type) {
case KEY_TYPE_DELETED:
p(" deleted");
break;
case KEY_TYPE_DISCARD:
p(" discard");
break;
case KEY_TYPE_ERROR:
p(" error");
break;
case KEY_TYPE_COOKIE:
p(" cookie");
break;
}
#undef p
return out - buf;
}
struct pack_state {
const struct bkey_format *format;
unsigned bits; /* bits remaining in current word */
@ -336,7 +306,8 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
* Extents - we have to guarantee that if an extent is packed, a trimmed
* version will also pack:
*/
if (bkey_start_offset(in) < format->field_offset[BKEY_FIELD_OFFSET])
if (bkey_start_offset(in) <
le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET]))
return false;
pack_state_finish(&state, out);
@ -800,7 +771,7 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
bool *eax_zeroed)
{
unsigned bits = format->bits_per_field[field];
u64 offset = format->field_offset[field];
u64 offset = le64_to_cpu(format->field_offset[field]);
unsigned i, byte, bit_offset, align, shl, shr;
if (!bits && !offset) {


@ -8,7 +8,6 @@
#include "vstructs.h"
void bch2_to_binary(char *, const u64 *, unsigned);
int bch2_bkey_to_text(char *, size_t, const struct bkey *);
#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
@ -377,7 +376,8 @@ static inline u64 bkey_field_max(const struct bkey_format *f,
enum bch_bkey_fields nr)
{
return f->bits_per_field[nr] < 64
? f->field_offset[nr] + ~(~0ULL << f->bits_per_field[nr])
? (le64_to_cpu(f->field_offset[nr]) +
~(~0ULL << f->bits_per_field[nr]))
: U64_MAX;
}


@ -18,28 +18,11 @@ const struct bkey_ops *bch2_bkey_ops[] = {
[BKEY_TYPE_BTREE] = &bch2_bkey_btree_ops,
};
/* Returns string indicating reason for being invalid, or NULL if valid: */
const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
if (k.k->u64s < BKEY_U64s)
return "u64s too small";
if (!ops->is_extents) {
if (k.k->size)
return "nonzero size field";
} else {
if ((k.k->size == 0) != bkey_deleted(k.k))
return "bad size field";
}
if (ops->is_extents &&
!k.k->size &&
!bkey_deleted(k.k))
return "zero size field";
switch (k.k->type) {
case KEY_TYPE_DELETED:
case KEY_TYPE_DISCARD:
@ -63,8 +46,41 @@ const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
}
}
const char *bch2_btree_bkey_invalid(struct bch_fs *c, struct btree *b,
struct bkey_s_c k)
const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
if (k.k->u64s < BKEY_U64s)
return "u64s too small";
if (!ops->is_extents) {
if (k.k->size)
return "nonzero size field";
} else {
if ((k.k->size == 0) != bkey_deleted(k.k))
return "bad size field";
}
if (ops->is_extents &&
!k.k->size &&
!bkey_deleted(k.k))
return "zero size field";
if (k.k->p.snapshot)
return "nonzero snapshot";
return NULL;
}
const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
{
return __bch2_bkey_invalid(c, type, k) ?:
bch2_bkey_val_invalid(c, type, k);
}
const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
{
if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
return "key before start of btree node";
@ -72,10 +88,7 @@ const char *bch2_btree_bkey_invalid(struct bch_fs *c, struct btree *b,
if (bkey_cmp(k.k->p, b->data->max_key) > 0)
return "key past end of btree node";
if (k.k->p.snapshot)
return "nonzero snapshot";
return bch2_bkey_invalid(c, btree_node_type(b), k);
return NULL;
}
void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
@ -86,7 +99,8 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
BUG_ON(!k.k->u64s);
invalid = bch2_btree_bkey_invalid(c, b, k);
invalid = bch2_bkey_invalid(c, type, k) ?:
bch2_bkey_in_btree_node(b, k);
if (invalid) {
char buf[160];
@ -100,33 +114,62 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
ops->key_debugcheck(c, b, k);
}
char *bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
char *buf, size_t size, struct bkey_s_c k)
#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
char *out = buf, *end = buf + size;
if (k.k->type >= KEY_TYPE_GENERIC_NR &&
ops->val_to_text)
ops->val_to_text(c, buf, size, k);
p("u64s %u type %u ", k->u64s, k->type);
return buf;
if (bkey_cmp(k->p, POS_MAX))
p("%llu:%llu", k->p.inode, k->p.offset);
else
p("POS_MAX");
p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo);
return out - buf;
}
char *bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
char *buf, size_t size, struct bkey_s_c k)
int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
char *buf, size_t size, struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
char *out = buf, *end = buf + size;
out += bch2_bkey_to_text(out, end - out, k.k);
if (k.k->type >= KEY_TYPE_GENERIC_NR &&
ops->val_to_text) {
out += scnprintf(out, end - out, ": ");
ops->val_to_text(c, out, end - out, k);
switch (k.k->type) {
case KEY_TYPE_DELETED:
p(" deleted");
break;
case KEY_TYPE_DISCARD:
p(" discard");
break;
case KEY_TYPE_ERROR:
p(" error");
break;
case KEY_TYPE_COOKIE:
p(" cookie");
break;
default:
if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text)
ops->val_to_text(c, buf, size, k);
break;
}
return buf;
return out - buf;
}
int bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
char *buf, size_t size, struct bkey_s_c k)
{
char *out = buf, *end = buf + size;
out += bch2_bkey_to_text(out, end - out, k.k);
out += scnprintf(out, end - out, ": ");
out += bch2_val_to_text(c, type, out, end - out, k);
return out - buf;
}
void bch2_bkey_swab(enum bkey_type type,


@ -64,15 +64,19 @@ struct bkey_ops {
bool is_extents;
};
const char *bch2_bkey_val_invalid(struct bch_fs *, enum bkey_type,
struct bkey_s_c);
const char *__bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
const char *bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
const char *bch2_btree_bkey_invalid(struct bch_fs *, struct btree *,
struct bkey_s_c);
const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
char *bch2_val_to_text(struct bch_fs *, enum bkey_type,
char *, size_t, struct bkey_s_c);
char *bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
char *, size_t, struct bkey_s_c);
int bch2_bkey_to_text(char *, size_t, const struct bkey *);
int bch2_val_to_text(struct bch_fs *, enum bkey_type,
char *, size_t, struct bkey_s_c);
int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
char *, size_t, struct bkey_s_c);
void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
struct bkey_packed *);


@ -96,7 +96,7 @@ u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
extent_for_each_ptr(e, ptr) {
struct bch_dev *ca = c->devs[ptr->dev];
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
size_t b = PTR_BUCKET_NR(ca, ptr);
if (gen_after(ca->oldest_gens[b], ptr->gen))
@ -159,14 +159,15 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
(!c->opts.nofsck &&
fsck_err_on(!bch2_sb_has_replicas(c, e, data_type), c,
"superblock not marked as containing replicas"))) {
"superblock not marked as containing replicas (type %u)",
data_type))) {
ret = bch2_check_mark_super(c, e, data_type);
if (ret)
return ret;
}
extent_for_each_ptr(e, ptr) {
struct bch_dev *ca = c->devs[ptr->dev];
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = PTR_BUCKET(ca, ptr);
if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
@ -315,14 +316,14 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
lockdep_assert_held(&c->sb_lock);
for (i = 0; i < layout->nr_superblocks; i++) {
if (layout->sb_offset[i] == BCH_SB_SECTOR)
u64 offset = le64_to_cpu(layout->sb_offset[i]);
if (offset == BCH_SB_SECTOR)
mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
BUCKET_SB, flags);
mark_metadata_sectors(c, ca,
layout->sb_offset[i],
layout->sb_offset[i] +
(1 << layout->sb_max_size_bits),
mark_metadata_sectors(c, ca, offset,
offset + (1 << layout->sb_max_size_bits),
BUCKET_SB, flags);
}
@ -414,7 +415,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
spin_lock(&ob->lock);
if (ob->valid) {
gc_pos_set(c, gc_pos_alloc(c, ob));
ca = c->devs[ob->ptr.dev];
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), true,
gc_pos_alloc(c, ob),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
@ -424,7 +425,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
}
}
void bch2_gc_start(struct bch_fs *c)
static void bch2_gc_start(struct bch_fs *c)
{
struct bch_dev *ca;
struct bucket *g;


@ -556,7 +556,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
struct bset_tree *t;
struct bset *start_bset = bset(b, &b->set[start_idx]);
bool used_mempool = false;
u64 start_time;
u64 start_time, seq = 0;
unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
bool sorting_entire_node = start_idx == 0 &&
end_idx == b->nsets;
@ -595,12 +595,9 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
bch2_time_stats_update(&c->btree_sort_time, start_time);
/* Make sure we preserve bset journal_seq: */
for (t = b->set + start_idx + 1;
t < b->set + end_idx;
t++)
start_bset->journal_seq =
max(start_bset->journal_seq,
bset(b, t)->journal_seq);
for (t = b->set + start_idx; t < b->set + end_idx; t++)
seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
start_bset->journal_seq = cpu_to_le64(seq);
if (sorting_entire_node) {
unsigned u64s = le16_to_cpu(out->keys.u64s);
@ -958,6 +955,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
{
struct bkey_packed *k, *prev = NULL;
struct bpos prev_pos = POS_MIN;
enum bkey_type type = btree_node_type(b);
bool seen_non_whiteout = false;
const char *err;
int ret = 0;
@ -1025,7 +1023,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
if (!BSET_SEPARATE_WHITEOUTS(i)) {
seen_non_whiteout = true;
whiteout_u64s = 0;
*whiteout_u64s = 0;
}
for (k = i->start;
@ -1059,16 +1057,17 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
}
if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
bch2_bkey_swab(btree_node_type(b), &b->format, k);
bch2_bkey_swab(type, &b->format, k);
u = bkey_disassemble(b, k, &tmp);
invalid = bch2_btree_bkey_invalid(c, b, u);
invalid = __bch2_bkey_invalid(c, type, u) ?:
bch2_bkey_in_btree_node(b, u) ?:
(write ? bch2_bkey_val_invalid(c, type, u) : NULL);
if (invalid) {
char buf[160];
bch2_bkey_val_to_text(c, btree_node_type(b),
buf, sizeof(buf), u);
bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
btree_err(BTREE_ERR_FIXABLE, c, b, i,
"invalid bkey %s: %s", buf, invalid);
@ -1114,6 +1113,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
struct btree_node_entry *bne;
struct btree_node_iter *iter;
struct btree_node *sorted;
struct bkey_packed *k;
struct bset *i;
bool used_mempool;
unsigned u64s;
int ret, retry_read = 0, write = READ;
@ -1137,7 +1138,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
unsigned sectors, whiteout_u64s = 0;
struct nonce nonce;
struct bch_csum csum;
struct bset *i;
if (!b->written) {
i = &b->data->keys;
@ -1238,6 +1238,31 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
i = &b->data->keys;
for (k = i->start; k != vstruct_last(i);) {
enum bkey_type type = btree_node_type(b);
struct bkey tmp;
struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
const char *invalid = bch2_bkey_val_invalid(c, type, u);
if (invalid) {
char buf[160];
bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
btree_err(BTREE_ERR_FIXABLE, c, b, i,
"invalid bkey %s: %s", buf, invalid);
btree_keys_account_key_drop(&b->nr, 0, k);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
continue;
}
k = bkey_next(k);
}
bch2_bset_build_aux_tree(b, b->set, false);
set_needs_whiteout(btree_bset_first(b));
@ -1278,13 +1303,13 @@ static void btree_node_read_work(struct work_struct *work)
bio->bi_iter.bi_size = btree_bytes(c);
submit_bio_wait(bio);
start:
bch2_dev_io_err_on(bio->bi_error, rb->pick.ca, "btree read");
bch2_dev_io_err_on(bio->bi_status, rb->pick.ca, "btree read");
percpu_ref_put(&rb->pick.ca->io_ref);
__set_bit(rb->pick.ca->dev_idx, avoid.d);
rb->pick = bch2_btree_pick_ptr(c, b, &avoid);
if (!bio->bi_error &&
if (!bio->bi_status &&
!bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca)))
goto out;
} while (!IS_ERR_OR_NULL(rb->pick.ca));
@ -1377,17 +1402,24 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
bch2_btree_node_read(c, b, true);
six_unlock_write(&b->lock);
if (btree_node_read_error(b)) {
six_unlock_intent(&b->lock);
return -EIO;
bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_lock(&c->btree_cache.lock);
list_move(&b->list, &c->btree_cache.freeable);
mutex_unlock(&c->btree_cache.lock);
ret = -EIO;
goto err;
}
bch2_btree_set_root_for_read(c, b);
err:
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
return 0;
return ret;
}
void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
@ -1412,35 +1444,57 @@ static void bch2_btree_node_write_error(struct bch_fs *c,
struct closure *cl = wbio->cl;
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
struct bkey_i_extent *new_key;
struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
struct btree_iter iter;
int ret;
six_lock_read(&b->lock);
bkey_copy(&tmp.k, &b->key);
six_unlock_read(&b->lock);
__bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
BTREE_MAX_DEPTH,
b->level, 0);
retry:
ret = bch2_btree_iter_traverse(&iter);
if (ret)
goto err;
if (!bkey_extent_is_data(&tmp.k.k) || !PTR_HASH(&tmp.k)) {
/* Node has been freed: */
/* has node been freed? */
if (iter.nodes[b->level] != b) {
/* node has been freed: */
if (!btree_node_dying(b))
panic("foo4\n");
goto out;
}
if (!btree_node_hashed(b))
panic("foo5\n");
bkey_copy(&tmp.k, &b->key);
new_key = bkey_i_to_extent(&tmp.k);
e = extent_i_to_s(new_key);
extent_for_each_ptr_backwards(e, ptr)
if (bch2_dev_list_has_dev(wbio->failed, ptr->dev))
bch2_extent_drop_ptr(e, ptr);
while (wbio->replicas_failed) {
unsigned idx = __fls(wbio->replicas_failed);
if (!bch2_extent_nr_ptrs(e.c))
goto err;
bch2_extent_drop_ptr_idx(extent_i_to_s(new_key), idx);
wbio->replicas_failed ^= 1 << idx;
}
if (!bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)) ||
bch2_btree_node_update_key(c, b, new_key)) {
set_btree_node_noevict(b);
bch2_fatal_error(c);
}
ret = bch2_btree_node_update_key(c, &iter, b, new_key);
if (ret == -EINTR)
goto retry;
if (ret)
goto err;
out:
bch2_btree_iter_unlock(&iter);
bio_put(&wbio->bio);
btree_node_write_done(c, b);
if (cl)
closure_put(cl);
return;
err:
set_btree_node_noevict(b);
bch2_fs_fatal_error(c, "fatal error writing btree node");
goto out;
}
void bch2_btree_write_error_work(struct work_struct *work)
@ -1470,12 +1524,17 @@ static void btree_node_write_endio(struct bio *bio)
struct closure *cl = !wbio->split ? wbio->cl : NULL;
struct bch_fs *c = wbio->c;
struct bch_dev *ca = wbio->ca;
unsigned long flags;
bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
if (bch2_dev_io_err_on(bio->bi_error, ca, "btree write") ||
bch2_meta_write_fault("btree"))
set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);
if (bio->bi_status == BLK_STS_REMOVED ||
bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
bch2_meta_write_fault("btree")) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bch2_dev_list_add_dev(&orig->failed, ca->dev_idx);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
}
if (wbio->have_io_ref)
percpu_ref_put(&ca->io_ref);
@ -1491,12 +1550,11 @@ static void btree_node_write_endio(struct bio *bio)
wbio->used_mempool,
wbio->data);
if (wbio->replicas_failed) {
unsigned long flags;
if (wbio->failed.nr) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bio_list_add(&c->btree_write_error_list, &wbio->bio);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
queue_work(c->wq, &c->btree_write_error_work);
return;
}
@ -1707,6 +1765,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write));
wbio->cl = parent;
wbio->failed.nr = 0;
wbio->order = order;
wbio->used_mempool = used_mempool;
wbio->data = data;


@ -75,8 +75,8 @@ bool bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
{
struct btree_iter *linked;
struct btree *b = iter->nodes[level];
enum btree_node_locked_type want = btree_lock_want(iter, level);
enum btree_node_locked_type have = btree_node_locked_type(iter, level);
int want = btree_lock_want(iter, level);
int have = btree_node_locked_type(iter, level);
if (want == have)
return true;
@ -108,6 +108,17 @@ success:
return true;
}
bool bch2_btree_iter_relock(struct btree_iter *iter)
{
unsigned l;
for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
if (!bch2_btree_node_relock(iter, l))
return false;
return true;
}
/* Slowpath: */
bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
unsigned level,
@ -214,7 +225,6 @@ bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
unsigned new_locks_want)
{
struct btree_iter *linked;
unsigned l;
/* Drop locks we don't want anymore: */
if (new_locks_want < iter->locks_want)
@ -228,12 +238,9 @@ bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
iter->locks_want = new_locks_want;
btree_iter_drop_extra_locks(iter);
for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
if (!bch2_btree_node_relock(iter, l))
goto fail;
if (bch2_btree_iter_relock(iter))
return true;
return true;
fail:
/*
* Just an optimization: ancestor nodes must be locked before child
* nodes, so set locks_want on iterators that might lock ancestors


@ -75,7 +75,7 @@ static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
mark_btree_node_locked(iter, level, SIX_LOCK_intent);
}
static inline int btree_lock_want(struct btree_iter *iter, int level)
static inline enum six_lock_type btree_lock_want(struct btree_iter *iter, int level)
{
return level < iter->locks_want
? SIX_LOCK_intent
@ -111,6 +111,7 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos,
}
bool bch2_btree_node_relock(struct btree_iter *, unsigned);
bool bch2_btree_iter_relock(struct btree_iter *);
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
void bch2_btree_node_lock_write(struct btree *, struct btree_iter *);


@ -196,6 +196,7 @@ enum btree_flags {
BTREE_NODE_accessed,
BTREE_NODE_write_in_flight,
BTREE_NODE_just_written,
BTREE_NODE_dying,
};
BTREE_FLAG(read_in_flight);
@ -207,6 +208,7 @@ BTREE_FLAG(write_idx);
BTREE_FLAG(accessed);
BTREE_FLAG(write_in_flight);
BTREE_FLAG(just_written);
BTREE_FLAG(dying);
static inline struct btree_write *btree_current_write(struct btree *b)
{


@ -130,7 +130,7 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
__le64, unsigned);
int bch2_btree_node_update_key(struct bch_fs *, struct btree *,
struct bkey_i_extent *);
int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
struct btree *, struct bkey_i_extent *);
#endif /* _BCACHEFS_BTREE_UPDATE_H */


@ -21,7 +21,7 @@
static void btree_node_will_make_reachable(struct btree_update *,
struct btree *);
static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *);
static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
/* Debug code: */
@ -686,7 +686,7 @@ retry:
BUG_ON(c->btree_roots[b->btree_id].as != as);
c->btree_roots[b->btree_id].as = NULL;
bch2_btree_set_root_ondisk(c, b);
bch2_btree_set_root_ondisk(c, b, WRITE);
/*
* We don't have to wait anything anything here (before
@ -914,6 +914,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
struct btree_write *w;
struct bset_tree *t;
set_btree_node_dying(b);
btree_interior_update_add_node_reference(as, b);
/*
@ -925,7 +926,8 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
* in with keys that aren't in the journal anymore:
*/
for_each_bset(b, t)
as->journal_seq = max(as->journal_seq, bset(b, t)->journal_seq);
as->journal_seq = max(as->journal_seq,
le64_to_cpu(bset(b, t)->journal_seq));
mutex_lock(&c->btree_interior_update_lock);
@ -1027,6 +1029,10 @@ static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
mutex_unlock(&c->btree_cache.lock);
mutex_lock(&c->btree_root_lock);
BUG_ON(btree_node_root(c, b) &&
(b->level < btree_node_root(c, b)->level ||
!btree_node_dying(btree_node_root(c, b))));
btree_node_root(c, b) = b;
mutex_unlock(&c->btree_root_lock);
@ -1054,7 +1060,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
gc_pos_btree_root(b->btree_id));
}
static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b)
static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
{
struct btree_root *r = &c->btree_roots[b->btree_id];
@ -1064,6 +1070,8 @@ static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b)
bkey_copy(&r->key, &b->key);
r->level = b->level;
r->alive = true;
if (rw == WRITE)
c->btree_roots_dirty = true;
mutex_unlock(&c->btree_root_lock);
}
@ -1787,64 +1795,16 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
return ret;
}
int bch2_btree_node_update_key(struct bch_fs *c, struct btree *b,
struct bkey_i_extent *new_key)
static void __bch2_btree_node_update_key(struct bch_fs *c,
struct btree_update *as,
struct btree_iter *iter,
struct btree *b, struct btree *new_hash,
struct bkey_i_extent *new_key)
{
struct btree_update *as = NULL;
struct btree *parent, *new_hash = NULL;
struct btree_iter iter;
struct closure cl;
struct btree *parent;
bool must_rewrite_parent = false;
int ret;
__bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
BTREE_MAX_DEPTH,
b->level, 0);
closure_init_stack(&cl);
ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
if (ret)
return ret;
retry:
down_read(&c->gc_lock);
ret = bch2_btree_iter_traverse(&iter);
if (ret)
goto err;
/* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
if (!new_hash &&
PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
/* bch2_btree_reserve_get will unlock */
do {
ret = bch2_btree_cache_cannibalize_lock(c, &cl);
closure_sync(&cl);
} while (ret == -EAGAIN);
BUG_ON(ret);
new_hash = bch2_btree_node_mem_alloc(c);
}
as = bch2_btree_update_start(c, iter.btree_id,
btree_update_reserve_required(c, b),
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE,
&cl);
if (IS_ERR(as)) {
ret = PTR_ERR(as);
if (ret == -EAGAIN || ret == -EINTR) {
bch2_btree_iter_unlock(&iter);
up_read(&c->gc_lock);
closure_sync(&cl);
goto retry;
}
goto err;
}
mutex_lock(&c->btree_interior_update_lock);
/*
* Two corner cases that need to be thought about here:
*
@ -1869,22 +1829,12 @@ retry:
if (b->will_make_reachable)
must_rewrite_parent = true;
/* other case: btree node being freed */
if (iter.nodes[b->level] != b) {
/* node has been freed: */
BUG_ON(btree_node_hashed(b));
mutex_unlock(&c->btree_interior_update_lock);
goto err;
}
mutex_unlock(&c->btree_interior_update_lock);
if (must_rewrite_parent)
as->flags |= BTREE_INTERIOR_UPDATE_MUST_REWRITE;
btree_interior_update_add_node_reference(as, b);
parent = iter.nodes[b->level + 1];
parent = iter->nodes[b->level + 1];
if (parent) {
if (new_hash) {
bkey_copy(&new_hash->key, &new_key->k_i);
@ -1893,8 +1843,8 @@ retry:
BUG_ON(ret);
}
bch2_btree_insert_node(as, parent, &iter,
&keylist_single(&new_key->k_i));
bch2_keylist_add(&as->parent_keys, &new_key->k_i);
bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
@ -1914,7 +1864,7 @@ retry:
BUG_ON(btree_node_root(c, b) != b);
bch2_btree_node_lock_write(b, &iter);
bch2_btree_node_lock_write(b, iter);
bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i),
c->opts.btree_node_size, true,
@ -1925,14 +1875,94 @@ retry:
&stats);
bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
gc_pos_btree_root(b->btree_id));
bkey_copy(&b->key, &new_key->k_i);
if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
mutex_lock(&c->btree_cache.lock);
bch2_btree_node_hash_remove(&c->btree_cache, b);
bkey_copy(&b->key, &new_key->k_i);
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
BUG_ON(ret);
mutex_unlock(&c->btree_cache.lock);
} else {
bkey_copy(&b->key, &new_key->k_i);
}
btree_update_updated_root(as);
bch2_btree_node_unlock_write(b, &iter);
bch2_btree_node_unlock_write(b, iter);
}
bch2_btree_update_done(as);
out:
}
int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
struct btree *b, struct bkey_i_extent *new_key)
{
struct btree_update *as = NULL;
struct btree *new_hash = NULL;
struct closure cl;
int ret;
closure_init_stack(&cl);
if (!down_read_trylock(&c->gc_lock)) {
bch2_btree_iter_unlock(iter);
down_read(&c->gc_lock);
if (!bch2_btree_iter_relock(iter)) {
ret = -EINTR;
goto err;
}
}
/* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
/* bch2_btree_reserve_get will unlock */
ret = bch2_btree_cache_cannibalize_lock(c, &cl);
if (ret) {
ret = -EINTR;
bch2_btree_iter_unlock(iter);
up_read(&c->gc_lock);
closure_sync(&cl);
down_read(&c->gc_lock);
if (!bch2_btree_iter_relock(iter))
goto err;
}
new_hash = bch2_btree_node_mem_alloc(c);
}
as = bch2_btree_update_start(c, iter->btree_id,
btree_update_reserve_required(c, b),
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE,
&cl);
if (IS_ERR(as)) {
ret = PTR_ERR(as);
if (ret == -EAGAIN)
ret = -EINTR;
if (ret != -EINTR)
goto err;
bch2_btree_iter_unlock(iter);
up_read(&c->gc_lock);
closure_sync(&cl);
down_read(&c->gc_lock);
if (!bch2_btree_iter_relock(iter))
goto err;
}
ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
if (ret)
goto err_free_update;
__bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
err:
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
list_move(&new_hash->list, &c->btree_cache.freeable);
@ -1941,14 +1971,12 @@ out:
six_unlock_write(&new_hash->lock);
six_unlock_intent(&new_hash->lock);
}
bch2_btree_iter_unlock(&iter);
up_read(&c->gc_lock);
closure_sync(&cl);
return ret;
err:
if (as)
bch2_btree_update_free(as);
goto out;
err_free_update:
bch2_btree_update_free(as);
goto err;
}
/* Init code: */
@ -1962,7 +1990,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
BUG_ON(btree_node_root(c, b));
__bch2_btree_set_root_inmem(c, b);
bch2_btree_set_root_ondisk(c, b);
bch2_btree_set_root_ondisk(c, b, READ);
}
int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
@ -1998,7 +2026,7 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
BUG_ON(btree_node_root(c, b));
bch2_btree_set_root_inmem(as, b);
bch2_btree_set_root_ondisk(c, b);
bch2_btree_set_root_ondisk(c, b, WRITE);
bch2_btree_open_bucket_put(c, b);
six_unlock_intent(&b->lock);


@ -174,9 +174,11 @@ do { \
#define bch2_usage_read_raw(_stats) \
({ \
typeof(*this_cpu_ptr(_stats)) _acc = { 0 }; \
typeof(*this_cpu_ptr(_stats)) _acc; \
int cpu; \
\
memset(&_acc, 0, sizeof(_acc)); \
\
for_each_possible_cpu(cpu) \
bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu)); \
\
@ -479,7 +481,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
{
struct bucket_mark old, new;
unsigned saturated;
struct bch_dev *ca = c->devs[ptr->dev];
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr);
unsigned data_type = type == S_META
? BUCKET_BTREE : BUCKET_DATA;


@ -68,16 +68,14 @@ struct bch_dev_usage {
struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */
/* _uncompressed_ sectors: */
u64 online_reserved;
u64 available_cache;
struct {
u64 data[S_ALLOC_NR];
u64 persistent_reserved;
} s[BCH_REPLICAS_MAX];
u64 online_reserved;
u64 available_cache;
};
/*


@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "bcachefs_ioctl.h"
#include "chardev.h"
#include "super.h"
#include "super-io.h"
@ -25,7 +26,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
return ERR_PTR(-EINVAL);
rcu_read_lock();
ca = c->devs[dev];
ca = rcu_dereference(c->devs[dev]);
if (ca)
percpu_ref_get(&ca->ref);
rcu_read_unlock();
@ -80,7 +81,7 @@ static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
if (copy_from_user(user_devs, arg.devs,
if (copy_from_user(user_devs, user_arg->devs,
sizeof(u64) * arg.nr_devs))
goto err;


@ -72,14 +72,15 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
}
}
static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c)
static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
unsigned opt)
{
if (c->sb.encryption_type)
return c->opts.wide_macs
? BCH_CSUM_CHACHA20_POLY1305_128
: BCH_CSUM_CHACHA20_POLY1305_80;
return bch2_csum_opt_to_type(c->opts.data_checksum, true);
return bch2_csum_opt_to_type(opt, true);
}
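A hypothetical call site (not in this diff) showing the point of the new argument — write paths can pass a per-inode checksum option instead of always using the filesystem-wide c->opts.data_checksum:

	/* io_opts here is an assumed per-inode options struct: */
	op->csum_type = bch2_data_checksum_type(c, io_opts.data_checksum);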
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
@ -143,6 +144,14 @@ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
return nonce;
}
static inline struct nonce null_nonce(void)
{
struct nonce ret;
memset(&ret, 0, sizeof(ret));
return ret;
}
static inline struct nonce extent_nonce(struct bversion version,
struct bch_extent_crc_unpacked crc)
{


@ -95,11 +95,17 @@ print:
vscnprintf(buf, sizeof(_buf), fmt, args);
va_end(args);
if (c->opts.fix_errors == FSCK_OPT_EXIT) {
bch_err(c, "%s, exiting", buf);
mutex_unlock(&c->fsck_error_lock);
return FSCK_ERR_EXIT;
}
if (flags & FSCK_CAN_FIX) {
if (c->opts.fix_errors == FSCK_ERR_ASK) {
if (c->opts.fix_errors == FSCK_OPT_ASK) {
printk(KERN_ERR "%s: fix?", buf);
fix = ask_yn();
} else if (c->opts.fix_errors == FSCK_ERR_YES ||
} else if (c->opts.fix_errors == FSCK_OPT_YES ||
(c->opts.nochanges &&
!(flags & FSCK_CAN_IGNORE))) {
if (print)


@ -96,9 +96,10 @@ enum {
};
enum fsck_err_opts {
FSCK_ERR_NO,
FSCK_ERR_YES,
FSCK_ERR_ASK,
FSCK_OPT_EXIT,
FSCK_OPT_YES,
FSCK_OPT_NO,
FSCK_OPT_ASK,
};
enum fsck_err_ret {
@ -217,7 +218,7 @@ do { \
#define bcache_io_error(c, bio, fmt, ...) \
do { \
__bcache_io_error(c, fmt, ##__VA_ARGS__); \
(bio)->bi_error = -EIO; \
(bio)->bi_status = BLK_STS_IOERR; \
} while (0)
#endif /* _BCACHEFS_ERROR_H */


@ -18,6 +18,7 @@
#include "extents.h"
#include "inode.h"
#include "journal.h"
#include "super.h"
#include "super-io.h"
#include "util.h"
#include "xattr.h"
@ -156,6 +157,19 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
return nr_ptrs;
}
unsigned bch2_extent_nr_good_ptrs(struct bch_fs *c, struct bkey_s_c_extent e)
{
const struct bch_extent_ptr *ptr;
unsigned nr_ptrs = 0;
extent_for_each_ptr(e, ptr)
nr_ptrs += (!ptr->cached &&
bch_dev_bkey_exists(c, ptr->dev)->mi.state !=
BCH_MEMBER_STATE_FAILED);
return nr_ptrs;
}
unsigned bch2_extent_is_compressed(struct bkey_s_c k)
{
struct bkey_s_c_extent e;
@ -362,7 +376,7 @@ static bool should_drop_ptr(const struct bch_fs *c,
struct bkey_s_c_extent e,
const struct bch_extent_ptr *ptr)
{
return ptr->cached && ptr_stale(c->devs[ptr->dev], ptr);
return ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr);
}
static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
@ -411,8 +425,10 @@ static void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
break;
case BCH_EXTENT_ENTRY_crc128:
entry->crc128.csum.hi = swab64(entry->crc64.csum_hi);
entry->crc128.csum.lo = swab64(entry->crc64.csum_lo);
entry->crc128.csum.hi = (__force __le64)
swab64((__force u64) entry->crc128.csum.hi);
entry->crc128.csum.lo = (__force __le64)
swab64((__force u64) entry->crc128.csum.lo);
break;
case BCH_EXTENT_ENTRY_ptr:
break;
@ -432,10 +448,11 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
const struct bch_extent_ptr *ptr2;
struct bch_dev *ca;
if (ptr->dev >= c->sb.nr_devices)
if (ptr->dev >= c->sb.nr_devices ||
!c->devs[ptr->dev])
return "pointer to invalid device";
ca = c->devs[ptr->dev];
ca = bch_dev_bkey_exists(c, ptr->dev);
if (!ca)
return "pointer to invalid device";
@ -487,7 +504,9 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
break;
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
ca = c->devs[ptr->dev];
ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
? bch_dev_bkey_exists(c, ptr->dev)
: NULL;
p("ptr: %u:%llu gen %u%s", ptr->dev,
(u64) ptr->offset, ptr->gen,
@ -528,7 +547,7 @@ static void extent_pick_read_device(struct bch_fs *c,
struct bch_extent_crc_unpacked crc;
extent_for_each_ptr_crc(e, ptr, crc) {
struct bch_dev *ca = c->devs[ptr->dev];
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (ptr->cached && ptr_stale(ca, ptr))
continue;
@ -621,7 +640,7 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
bool bad;
extent_for_each_ptr(e, ptr) {
ca = c->devs[ptr->dev];
ca = bch_dev_bkey_exists(c, ptr->dev);
g = PTR_BUCKET(ca, ptr);
replicas++;
@ -1730,7 +1749,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier));
extent_for_each_ptr(e, ptr) {
ca = c->devs[ptr->dev];
ca = bch_dev_bkey_exists(c, ptr->dev);
g = PTR_BUCKET(ca, ptr);
replicas++;
ptrs_per_tier[ca->mi.tier]++;
@ -1844,7 +1863,7 @@ static void bch2_extent_to_text(struct bch_fs *c, char *buf,
static unsigned PTR_TIER(struct bch_fs *c,
const struct bch_extent_ptr *ptr)
{
return c->devs[ptr->dev]->mi.tier;
return bch_dev_bkey_exists(c, ptr->dev)->mi.tier;
}
static void bch2_extent_crc_init(union bch_extent_crc *crc,
@ -1971,14 +1990,10 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
struct bkey_s_extent e)
{
struct bch_extent_ptr *ptr;
unsigned tier = 0, nr_cached = 0, nr_good = 0;
unsigned tier = 0, nr_cached = 0;
unsigned nr_good = bch2_extent_nr_good_ptrs(c, e.c);
bool have_higher_tier;
extent_for_each_ptr(e, ptr)
if (!ptr->cached &&
c->devs[ptr->dev]->mi.state != BCH_MEMBER_STATE_FAILED)
nr_good++;
if (nr_good <= c->opts.data_replicas)
return;
@ -2103,7 +2118,7 @@ static enum merge_result bch2_extent_merge(struct bch_fs *c,
return BCH_MERGE_NOMERGE;
/* We don't allow extents to straddle buckets: */
ca = c->devs[lp->dev];
ca = bch_dev_bkey_exists(c, lp->dev);
if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
return BCH_MERGE_NOMERGE;
@ -2347,6 +2362,30 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
}
}
int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
{
struct btree_iter iter;
struct bpos end = pos;
struct bkey_s_c k;
int ret = 0;
end.offset += size;
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
BTREE_ITER_WITH_HOLES, k) {
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
if (!bch2_extent_is_fully_allocated(k)) {
ret = -ENOSPC;
break;
}
}
bch2_btree_iter_unlock(&iter);
return ret;
}
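A sketch of a likely caller (assumption — the fs-io.c diff that would use this is suppressed below): skip taking a disk reservation when the byte range being overwritten is already fully allocated on disk:

	/* returns 0 if fully allocated, -ENOSPC if any part is a hole: */
	ret = bch2_check_range_allocated(c, POS(inum, offset >> 9), size >> 9);
	if (ret)
		ret = bch2_disk_reservation_get(c, &res, size >> 9, 0);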
const struct bkey_ops bch2_bkey_extent_ops = {
.key_invalid = bch2_extent_invalid,
.key_debugcheck = bch2_extent_debugcheck,


@ -45,6 +45,7 @@ bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
unsigned bch2_extent_nr_good_ptrs(struct bch_fs *, struct bkey_s_c_extent);
unsigned bch2_extent_is_compressed(struct bkey_s_c);
bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
@ -243,14 +244,14 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
case BCH_EXTENT_CRC32:
return (struct bch_extent_crc_unpacked) {
common_fields(crc->crc32),
.csum.lo = crc->crc32.csum,
.csum.lo = (__force __le64) crc->crc32.csum,
};
case BCH_EXTENT_CRC64:
return (struct bch_extent_crc_unpacked) {
common_fields(crc->crc64),
.nonce = crc->crc64.nonce,
.csum.lo = crc->crc64.csum_lo,
.csum.hi = crc->crc64.csum_hi,
.csum.lo = (__force __le64) crc->crc64.csum_lo,
.csum.hi = (__force __le64) crc->crc64.csum_hi,
};
case BCH_EXTENT_CRC128:
return (struct bch_extent_crc_unpacked) {
@ -425,4 +426,6 @@ bool bch2_cut_front(struct bpos, struct bkey_i *);
bool bch2_cut_back(struct bpos, struct bkey *);
void bch2_key_resize(struct bkey *, unsigned);
int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);
#endif /* _BCACHEFS_EXTENTS_H */

File diff suppressed because it is too large


@ -75,7 +75,7 @@ do { \
/* Set VFS inode flags from bcachefs inode: */
void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
{
set_flags(bch_flags_to_vfs, inode->ei_flags, inode->v.i_flags);
set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
}
static int bch2_inode_flags_set(struct bch_inode_info *inode,
@ -99,13 +99,13 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode,
return -EINVAL;
bi->bi_flags = newflags;
inode->v.i_ctime = current_fs_time(inode->v.i_sb);
inode->v.i_ctime = current_time(&inode->v);
return 0;
}
static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
{
unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_flags);
unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);
return put_user(flags, arg);
}
@ -153,7 +153,7 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
{
struct fsxattr fa = { 0 };
fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_flags);
fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
return copy_to_user(arg, &fa, sizeof(fa));
}


@ -12,6 +12,7 @@
#include "fs-ioctl.h"
#include "fsck.h"
#include "inode.h"
#include "io.h"
#include "journal.h"
#include "keylist.h"
#include "super.h"
@ -130,10 +131,8 @@ int __must_check __bch2_write_inode(struct bch_fs *c,
BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i));
} while (ret == -EINTR);
if (!ret) {
inode->ei_size = inode_u.bi_size;
inode->ei_flags = inode_u.bi_flags;
}
if (!ret)
inode->ei_inode = inode_u;
out:
bch2_btree_iter_unlock(&iter);
@ -146,7 +145,7 @@ int __must_check bch2_write_inode(struct bch_fs *c,
return __bch2_write_inode(c, inode, NULL, NULL);
}
int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
static int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
{
int ret;
@ -158,7 +157,7 @@ int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
return ret;
}
int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode)
static int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode)
{
int ret = 0;
@ -223,7 +222,9 @@ static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c,
bch2_inode_init(c, &inode_u,
i_uid_read(&inode->v),
i_gid_read(&inode->v),
inode->v.i_mode, rdev);
inode->v.i_mode, rdev,
&dir->ei_inode);
ret = bch2_inode_create(c, &inode_u,
BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint);
@ -277,7 +278,7 @@ static int bch2_vfs_dirent_create(struct bch_fs *c,
if (unlikely(ret))
return ret;
dir->v.i_mtime = dir->v.i_ctime = current_fs_time(c->vfs_sb);
dir->v.i_mtime = dir->v.i_ctime = current_time(&dir->v);
mark_inode_dirty_sync(&dir->v);
return 0;
}
@ -344,7 +345,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
lockdep_assert_held(&inode->v.i_rwsem);
inode->v.i_ctime = current_fs_time(dir->v.i_sb);
inode->v.i_ctime = current_time(&dir->v);
ret = bch2_inc_nlink(c, inode);
if (ret)
@ -473,7 +474,7 @@ static int bch2_rename(struct bch_fs *c,
{
struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
struct timespec now = current_fs_time(old_dir->v.i_sb);
struct timespec now = current_time(&old_dir->v);
int ret;
lockdep_assert_held(&old_dir->v.i_rwsem);
@ -551,7 +552,7 @@ static int bch2_rename_exchange(struct bch_fs *c,
{
struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
struct timespec now = current_fs_time(old_dir->v.i_sb);
struct timespec now = current_time(&old_dir->v);
int ret;
ret = bch2_dirent_rename(c,
@ -909,10 +910,8 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);
inode->ei_journal_seq = 0;
inode->ei_size = bi->bi_size;
inode->ei_flags = bi->bi_flags;
atomic64_set(&inode->ei_sectors, bi->bi_sectors);
inode->ei_str_hash = bch2_hash_info_init(c, bi);
inode->ei_inode = *bi;
bch2_inode_flags_to_vfs(inode);
@ -949,8 +948,6 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
inode_init_once(&inode->v);
mutex_init(&inode->ei_update_lock);
inode->ei_journal_seq = 0;
atomic_long_set(&inode->ei_size_dirty_count, 0);
atomic_long_set(&inode->ei_sectors_dirty_count, 0);
return &inode->v;
}
@ -995,12 +992,6 @@ static void bch2_evict_inode(struct inode *vinode)
truncate_inode_pages_final(&inode->v.i_data);
if (!bch2_journal_error(&c->journal) && !is_bad_inode(&inode->v)) {
/* XXX - we want to check this stuff iff there weren't IO errors: */
BUG_ON(atomic_long_read(&inode->ei_sectors_dirty_count));
BUG_ON(atomic64_read(&inode->ei_sectors) != inode->v.i_blocks);
}
clear_inode(&inode->v);
if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
@ -1272,9 +1263,16 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
sb->s_magic = BCACHEFS_STATFS_MAGIC;
sb->s_time_gran = c->sb.time_precision;
c->vfs_sb = sb;
sb->s_bdi = &c->bdi;
strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
ret = super_setup_bdi(sb);
if (ret)
goto err_put_super;
sb->s_bdi->congested_fn = bch2_congested;
sb->s_bdi->congested_data = c;
sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
for_each_online_member(ca, c, i) {
struct block_device *bdev = ca->disk_sb.bdev;


@ -1,6 +1,7 @@
#ifndef _BCACHEFS_FS_H
#define _BCACHEFS_FS_H
#include "opts.h"
#include "str_hash.h"
#include <linux/seqlock.h>
@ -11,22 +12,12 @@ struct bch_inode_info {
struct mutex ei_update_lock;
u64 ei_journal_seq;
atomic_long_t ei_size_dirty_count;
/*
* these are updated whenever we update the inode in the btree - for
* e.g. fsync
*/
u64 ei_size;
u32 ei_flags;
atomic_long_t ei_sectors_dirty_count;
atomic64_t ei_sectors;
unsigned long ei_last_dirtied;
struct bch_hash_info ei_str_hash;
unsigned long ei_last_dirtied;
/* copy of inode in btree: */
struct bch_inode_unpacked ei_inode;
};
#define to_bch_ei(_inode) \


@ -204,7 +204,7 @@ static int hash_check_key(const struct bch_hash_desc desc,
"hash table key at wrong offset: %llu, "
"hashed to %llu chain starts at %llu\n%s",
k.k->p.offset, hashed, h->chain.pos.offset,
bch2_bkey_val_to_text(c, desc.btree_id,
bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
buf, sizeof(buf), k))) {
ret = hash_redo_key(desc, h, c, k_iter, k, hashed);
if (ret) {
@ -224,7 +224,7 @@ static int hash_check_key(const struct bch_hash_desc desc,
if (fsck_err_on(k2.k->type == desc.key_type &&
!desc.cmp_bkey(k, k2), c,
"duplicate hash table keys:\n%s",
bch2_bkey_val_to_text(c, desc.btree_id,
bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
buf, sizeof(buf), k))) {
ret = bch2_hash_delete_at(desc, &h->info, &h->iter, NULL);
if (ret)
@ -397,9 +397,9 @@ static int check_dirents(struct bch_fs *c)
if (fsck_err_on(have_target &&
d.v->d_type !=
mode_to_type(le16_to_cpu(target.bi_mode)), c,
mode_to_type(target.bi_mode), c,
"incorrect d_type: should be %u:\n%s",
mode_to_type(le16_to_cpu(target.bi_mode)),
mode_to_type(target.bi_mode),
bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
buf, sizeof(buf), k))) {
struct bkey_i_dirent *n;
@ -411,7 +411,7 @@ static int check_dirents(struct bch_fs *c)
}
bkey_reassemble(&n->k_i, d.s_c);
n->v.d_type = mode_to_type(le16_to_cpu(target.bi_mode));
n->v.d_type = mode_to_type(target.bi_mode);
ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
BTREE_INSERT_NOFAIL,
@ -493,7 +493,8 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
fsck_err:
return ret;
create_root:
bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO,
0, NULL);
root_inode->bi_inum = BCACHEFS_ROOT_INO;
bch2_inode_pack(&packed, root_inode);
@ -545,7 +546,8 @@ create_lostfound:
if (ret)
return ret;
bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO,
0, root_inode);
ret = bch2_inode_create(c, lostfound_inode, BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint);

View File

@ -198,6 +198,12 @@ static const char *bch2_inode_invalid(const struct bch_fs *c,
if (bch2_inode_unpack(inode, &unpacked))
return "invalid variable length fields";
if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
return "invalid data checksum type";
if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
return "invalid data checksum type";
return NULL;
}
case BCH_INODE_BLOCKDEV:
@ -221,6 +227,7 @@ static const char *bch2_inode_invalid(const struct bch_fs *c,
static void bch2_inode_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
char *out = buf, *end = out + size;
struct bkey_s_c_inode inode;
struct bch_inode_unpacked unpacked;
@ -228,11 +235,14 @@ static void bch2_inode_to_text(struct bch_fs *c, char *buf,
case BCH_INODE_FS:
inode = bkey_s_c_to_inode(k);
if (bch2_inode_unpack(inode, &unpacked)) {
scnprintf(buf, size, "(unpack error)");
out += scnprintf(out, end - out, "(unpack error)");
break;
}
scnprintf(buf, size, "i_size %llu", unpacked.bi_size);
#define BCH_INODE_FIELD(_name, _bits) \
out += scnprintf(out, end - out, #_name ": %llu ", (u64) unpacked._name);
BCH_INODE_FIELDS()
#undef BCH_INODE_FIELD
break;
}
}
@ -243,9 +253,12 @@ const struct bkey_ops bch2_bkey_inode_ops = {
};
void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
uid_t uid, gid_t gid, umode_t mode, dev_t rdev)
uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
struct bch_inode_unpacked *parent)
{
s64 now = timespec_to_bch2_time(c, CURRENT_TIME);
s64 now = timespec_to_bch2_time(c,
timespec_trunc(current_kernel_time(),
c->sb.time_precision));
memset(inode_u, 0, sizeof(*inode_u));
@ -261,6 +274,12 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
inode_u->bi_mtime = now;
inode_u->bi_ctime = now;
inode_u->bi_otime = now;
if (parent) {
#define BCH_INODE_FIELD(_name) inode_u->_name = parent->_name;
BCH_INODE_FIELDS_INHERIT()
#undef BCH_INODE_FIELD
}
}
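With the new parent argument, a freshly created inode inherits its per-inode IO options from the parent directory via the field list above. A minimal sketch of what the expansion amounts to, assuming BCH_INODE_FIELDS_INHERIT() lists bi_data_checksum and bi_compression (the two options defined by BCH_INODE_OPTS() elsewhere in this commit):
/* hypothetical expansion of the if (parent) block above: */
if (parent) {
	inode_u->bi_data_checksum	= parent->bi_data_checksum;
	inode_u->bi_compression		= parent->bi_compression;
}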
int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
@ -416,7 +435,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
struct bch_inode_unpacked inode_u;
if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
bi_generation = cpu_to_le32(inode_u.bi_generation) + 1;
bi_generation = inode_u.bi_generation + 1;
break;
}
case BCH_INODE_GENERATION: {

View File

@ -1,6 +1,8 @@
#ifndef _BCACHEFS_INODE_H
#define _BCACHEFS_INODE_H
#include "opts.h"
#include <linux/math64.h>
extern const struct bkey_ops bch2_bkey_inode_ops;
@ -28,7 +30,8 @@ void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *)
int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
uid_t, gid_t, umode_t, dev_t);
uid_t, gid_t, umode_t, dev_t,
struct bch_inode_unpacked *);
int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *,
u64, u64, u64 *);
int bch2_inode_truncate(struct bch_fs *, u64, u64,
@ -55,6 +58,45 @@ static inline u64 timespec_to_bch2_time(struct bch_fs *c, struct timespec ts)
return div_s64(ns, c->sb.time_precision);
}
static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
{
struct bch_io_opts ret = { 0 };
#define BCH_INODE_OPT(_name, _bits) \
if (inode->bi_##_name) \
opt_set(ret, _name, inode->bi_##_name - 1);
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
return ret;
}
static inline void __bch2_inode_opt_set(struct bch_inode_unpacked *inode,
enum bch_opt_id id, u64 v)
{
switch (id) {
#define BCH_INODE_OPT(_name, ...) \
case Opt_##_name: \
inode->bi_##_name = v; \
break;
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
default:
BUG();
}
}
static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
enum bch_opt_id id, u64 v)
{
return __bch2_inode_opt_set(inode, id, v + 1);
}
static inline void bch2_inode_opt_clear(struct bch_inode_unpacked *inode,
enum bch_opt_id id)
{
return __bch2_inode_opt_set(inode, id, 0);
}
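These helpers rely on a small encoding convention: in struct bch_inode_unpacked, a per-inode option field of 0 means "unset, fall back to the filesystem-wide option", and a set option is stored as its value plus one. A minimal sketch of the round trip (illustrative only, using the compression option):
struct bch_inode_unpacked inode = { 0 };
struct bch_io_opts io;

bch2_inode_opt_set(&inode, Opt_compression, 2);	/* stores 3 in inode.bi_compression */
io = bch2_inode_opts_get(&inode);		/* io.compression == 2 again */
bch2_inode_opt_clear(&inode, Opt_compression);	/* back to 0, i.e. unset */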
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_inode_pack_test(void);
#else

View File

@ -20,6 +20,7 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "super.h"
#include "super-io.h"
#include <linux/blkdev.h>
@ -139,7 +140,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
const struct bch_extent_ptr *ptr;
struct bch_write_bio *n;
struct bch_dev *ca;
unsigned ptr_idx = 0;
BUG_ON(c->opts.nochanges);
@ -147,7 +147,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
!c->devs[ptr->dev]);
ca = c->devs[ptr->dev];
ca = bch_dev_bkey_exists(c, ptr->dev);
if (ptr + 1 < &extent_entry_last(e)->ptr) {
n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
@ -168,7 +168,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
n->c = c;
n->ca = ca;
n->ptr_idx = ptr_idx++;
n->submit_time_us = local_clock_us();
n->bio.bi_iter.bi_sector = ptr->offset;
@ -184,7 +183,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
submit_bio(&n->bio);
} else {
n->have_io_ref = false;
bcache_io_error(c, &n->bio, "device has been removed");
n->bio.bi_status = BLK_STS_REMOVED;
bio_endio(&n->bio);
}
}
@ -201,9 +200,12 @@ static void bch2_write_done(struct closure *cl)
if (!op->error && (op->flags & BCH_WRITE_FLUSH))
op->error = bch2_journal_error(&op->c->journal);
bch2_disk_reservation_put(op->c, &op->res);
if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
bch2_disk_reservation_put(op->c, &op->res);
percpu_ref_put(&op->c->writes);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
op->flags &= ~(BCH_WRITE_DONE|BCH_WRITE_LOOPED);
closure_return(cl);
}
@ -244,9 +246,37 @@ static void bch2_write_index(struct closure *cl)
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
struct bkey_i *src, *dst = keys->keys, *n;
int ret;
op->flags |= BCH_WRITE_LOOPED;
for (src = keys->keys; src != keys->top; src = n) {
n = bkey_next(src);
bkey_copy(dst, src);
e = bkey_i_to_s_extent(dst);
extent_for_each_ptr_backwards(e, ptr)
if (test_bit(ptr->dev, op->failed.d))
bch2_extent_drop_ptr(e, ptr);
ret = bch2_extent_nr_ptrs(e.c)
? bch2_check_mark_super(c, e.c, BCH_DATA_USER)
: -EIO;
if (ret) {
keys->top = keys->keys;
op->error = ret;
op->flags |= BCH_WRITE_DONE;
goto err;
}
dst = bkey_next(dst);
}
keys->top = dst;
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);
int ret = op->index_update_fn(op);
@ -260,7 +290,7 @@ static void bch2_write_index(struct closure *cl)
op->error = ret;
}
}
err:
bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets);
if (!(op->flags & BCH_WRITE_DONE))
@ -276,43 +306,6 @@ static void bch2_write_index(struct closure *cl)
}
}
static void bch2_write_io_error(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct keylist *keys = &op->insert_keys;
struct bch_fs *c = op->c;
struct bch_extent_ptr *ptr;
struct bkey_i *k;
int ret;
for_each_keylist_key(keys, k) {
struct bkey_i *n = bkey_next(k);
struct bkey_s_extent e = bkey_i_to_s_extent(k);
extent_for_each_ptr_backwards(e, ptr)
if (test_bit(ptr->dev, op->failed.d))
bch2_extent_drop_ptr(e, ptr);
memmove(bkey_next(k), n, (void *) keys->top - (void *) n);
keys->top_p -= (u64 *) n - (u64 *) bkey_next(k);
ret = bch2_extent_nr_ptrs(e.c)
? bch2_check_mark_super(c, e.c, BCH_DATA_USER)
: -EIO;
if (ret) {
keys->top = keys->keys;
op->error = ret;
op->flags |= BCH_WRITE_DONE;
break;
}
}
memset(&op->failed, 0, sizeof(op->failed));
bch2_write_index(cl);
return;
}
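With bch2_write_io_error() removed, write error handling is split across two existing paths: bch2_write_endio() only records the failing device in op->failed, and the loop added to bch2_write_index() above drops the matching pointers from each key before the index update, failing the write only if no pointers survive. A condensed sketch of that flow, mirroring the hunks in this commit:
/* endio: remember which device failed */
if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
	set_bit(ca->dev_idx, op->failed.d);

/* index update: drop pointers to failed devices before inserting */
extent_for_each_ptr_backwards(e, ptr)
	if (test_bit(ptr->dev, op->failed.d))
		bch2_extent_drop_ptr(e, ptr);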
static void bch2_write_endio(struct bio *bio)
{
struct closure *cl = bio->bi_private;
@ -324,10 +317,8 @@ static void bch2_write_endio(struct bio *bio)
bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
if (bch2_dev_io_err_on(bio->bi_error, ca, "data write")) {
if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
set_bit(ca->dev_idx, op->failed.d);
set_closure_fn(cl, bch2_write_io_error, index_update_wq(op));
}
if (wbio->have_io_ref)
percpu_ref_put(&ca->io_ref);
@ -706,11 +697,6 @@ do_write:
key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write),
BCH_DATA_USER);
if (ret)
goto err;
dst->bi_end_io = bch2_write_endio;
dst->bi_private = &op->cl;
bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
@ -870,7 +856,8 @@ void bch2_write(struct closure *cl)
!percpu_ref_tryget(&c->writes)) {
__bcache_io_error(c, "read only");
op->error = -EROFS;
bch2_disk_reservation_put(c, &op->res);
if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
bch2_disk_reservation_put(c, &op->res);
closure_return(cl);
}
@ -916,7 +903,10 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
rbio->promote = NULL;
__bch2_write_op_init(&op->write.op, c);
bch2_write_op_init(&op->write.op, c);
op->write.op.csum_type = bch2_data_checksum_type(c, rbio->opts.data_checksum);
op->write.op.compression_type =
bch2_compression_opt_to_type(rbio->opts.compression);
op->write.move_dev = -1;
op->write.op.devs = c->fastest_devs;
@ -1060,7 +1050,7 @@ static void bch2_rbio_retry(struct work_struct *work)
if (rbio->split)
rbio = bch2_rbio_free(rbio);
else
rbio->bio.bi_error = 0;
rbio->bio.bi_status = 0;
if (!(flags & BCH_READ_NODECODE))
flags |= BCH_READ_MUST_CLONE;
@ -1073,7 +1063,8 @@ static void bch2_rbio_retry(struct work_struct *work)
__bch2_read(c, rbio, iter, inode, &avoid, flags);
}
static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error)
static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
blk_status_t error)
{
rbio->retry = retry;
@ -1081,7 +1072,7 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error)
return;
if (retry == READ_ERR) {
bch2_rbio_parent(rbio)->bio.bi_error = error;
bch2_rbio_parent(rbio)->bio.bi_status = error;
bch2_rbio_done(rbio);
} else {
bch2_rbio_punt(rbio, bch2_rbio_retry,
@ -1236,7 +1227,7 @@ csum_err:
*/
if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
rbio->flags |= BCH_READ_MUST_BOUNCE;
bch2_rbio_error(rbio, READ_RETRY, -EIO);
bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
return;
}
@ -1245,13 +1236,13 @@ csum_err:
rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
csum.hi, csum.lo, crc.csum_type);
bch2_rbio_error(rbio, READ_RETRY_AVOID, -EIO);
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
return;
decompression_err:
__bcache_io_error(c, "decompression error, inode %llu offset %llu",
rbio->pos.inode,
(u64) rbio->bvec_iter.bi_sector);
bch2_rbio_error(rbio, READ_ERR, -EIO);
bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
return;
}
@ -1270,8 +1261,8 @@ static void bch2_read_endio(struct bio *bio)
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
if (bch2_dev_io_err_on(bio->bi_error, rbio->pick.ca, "data read")) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_error);
if (bch2_dev_io_err_on(bio->bi_status, rbio->pick.ca, "data read")) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
return;
}
@ -1281,9 +1272,9 @@ static void bch2_read_endio(struct bio *bio)
atomic_long_inc(&c->read_realloc_races);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
bch2_rbio_error(rbio, READ_RETRY, -EINTR);
bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
else
bch2_rbio_error(rbio, READ_ERR, -EINTR);
bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
return;
}
@ -1360,7 +1351,8 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
DIV_ROUND_UP(sectors, PAGE_SECTORS),
&c->bio_read_split));
&c->bio_read_split),
orig->opts);
bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
split = true;
@ -1374,7 +1366,8 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
* lose the error)
*/
rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
&c->bio_read_split));
&c->bio_read_split),
orig->opts);
rbio->bio.bi_iter = iter;
split = true;
} else {
@ -1428,6 +1421,8 @@ noclone:
bch2_read_endio(&rbio->bio);
ret = rbio->retry;
if (rbio->split)
rbio = bch2_rbio_free(rbio);
if (!ret)
bch2_rbio_done(rbio);
}
@ -1503,7 +1498,7 @@ err:
* possibly bigger than the memory that was
* originally allocated)
*/
rbio->bio.bi_error = -EINTR;
rbio->bio.bi_status = BLK_STS_AGAIN;
bio_endio(&rbio->bio);
return;
}
@ -1561,6 +1556,7 @@ retry:
case READ_RETRY:
goto retry;
case READ_ERR:
rbio->bio.bi_status = BLK_STS_IOERR;
bio_endio(&rbio->bio);
return;
};

View File

@ -21,6 +21,8 @@ void bch2_latency_acct(struct bch_dev *, unsigned, int);
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *);
#define BLK_STS_REMOVED ((__force blk_status_t)128)
enum bch_write_flags {
BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
BCH_WRITE_CACHED = (1 << 1),
@ -29,11 +31,12 @@ enum bch_write_flags {
BCH_WRITE_PAGES_STABLE = (1 << 4),
BCH_WRITE_PAGES_OWNED = (1 << 5),
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6),
BCH_WRITE_NOPUT_RESERVATION = (1 << 7),
/* Internal: */
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 7),
BCH_WRITE_DONE = (1 << 8),
BCH_WRITE_LOOPED = (1 << 9),
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 8),
BCH_WRITE_DONE = (1 << 9),
BCH_WRITE_LOOPED = (1 << 10),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)
@ -42,6 +45,12 @@ static inline u64 *op_journal_seq(struct bch_write_op *op)
? op->journal_seq_p : &op->journal_seq;
}
static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq)
{
op->journal_seq_p = journal_seq;
op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
}
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{
return op->alloc_reserve == RESERVE_MOVINGGC
@ -51,14 +60,14 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
int bch2_write_index_default(struct bch_write_op *);
static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
{
op->c = c;
op->io_wq = index_update_wq(op);
op->flags = 0;
op->written = 0;
op->error = 0;
op->csum_type = bch2_data_checksum_type(c);
op->csum_type = bch2_data_checksum_type(c, c->opts.data_checksum);
op->compression_type =
bch2_compression_opt_to_type(c->opts.compression);
op->nr_replicas = 0;
@ -75,27 +84,6 @@ static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *
op->index_update_fn = bch2_write_index_default;
}
static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
struct disk_reservation res,
struct bch_devs_mask *devs,
struct write_point_specifier write_point,
struct bpos pos,
u64 *journal_seq, unsigned flags)
{
__bch2_write_op_init(op, c);
op->flags = flags;
op->nr_replicas = res.nr_replicas;
op->pos = pos;
op->res = res;
op->devs = devs;
op->write_point = write_point;
if (journal_seq) {
op->journal_seq_p = journal_seq;
op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
}
}
void bch2_write(struct closure *);
static inline struct bch_write_bio *wbio_init(struct bio *bio)
@ -134,25 +122,27 @@ static inline void bch2_read_extent(struct bch_fs *c,
struct extent_pick_ptr *pick,
unsigned flags)
{
rbio->_state = 0;
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags);
}
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
u64 inode)
{
rbio->_state = 0;
BUG_ON(rbio->_state);
__bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL,
BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE|
BCH_READ_USER_MAPPED);
}
static inline struct bch_read_bio *rbio_init(struct bio *bio)
static inline struct bch_read_bio *rbio_init(struct bio *bio,
struct bch_io_opts opts)
{
struct bch_read_bio *rbio = to_rbio(bio);
rbio->_state = 0;
rbio->_state = 0;
rbio->promote = NULL;
rbio->opts = opts;
return rbio;
}

View File

@ -6,6 +6,7 @@
#include "buckets_types.h"
#include "extents_types.h"
#include "keylist_types.h"
#include "opts.h"
#include "super_types.h"
#include <linux/llist.h>
@ -56,6 +57,8 @@ struct bch_read_bio {
struct promote_op *promote;
struct bch_io_opts opts;
struct work_struct work;
struct bio bio;
@ -69,8 +72,7 @@ struct bch_write_bio {
struct closure *cl;
};
u8 ptr_idx;
u8 replicas_failed;
struct bch_devs_list failed;
u8 order;
unsigned split:1,
@ -90,8 +92,8 @@ struct bch_write_op {
struct bch_fs *c;
struct workqueue_struct *io_wq;
unsigned written; /* sectors */
u16 flags;
u16 written; /* sectors */
s8 error;
unsigned csum_type:4;

View File

@ -338,8 +338,8 @@ struct journal_list {
* Given a journal entry we just read, add it to the list of journal entries to
* be replayed:
*/
static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
struct jset *j)
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
struct journal_list *jlist, struct jset *j)
{
struct journal_replay *i, *pos;
struct list_head *where;
@ -347,8 +347,6 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
__le64 last_seq;
int ret;
mutex_lock(&jlist->lock);
last_seq = !list_empty(jlist->head)
? list_last_entry(jlist->head, struct journal_replay,
list)->j.last_seq
@ -376,9 +374,7 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
memcmp(j, &i->j, bytes), c,
"found duplicate but non identical journal entries (seq %llu)",
le64_to_cpu(j->seq));
ret = JOURNAL_ENTRY_ADD_OK;
goto out;
goto found;
}
if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
@ -395,12 +391,16 @@ add:
goto out;
}
memcpy(&i->j, j, bytes);
list_add(&i->list, where);
i->devs.nr = 0;
memcpy(&i->j, j, bytes);
found:
if (!fsck_err_on(bch2_dev_list_has_dev(i->devs, ca->dev_idx),
c, "duplicate journal entries on same device"))
bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
ret = JOURNAL_ENTRY_ADD_OK;
out:
fsck_err:
mutex_unlock(&jlist->lock);
return ret;
}
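journal_entry_add() now also records, per sequence number, which devices a copy of that entry was read from, and fsck flags a duplicate copy on a single device. The resulting device list is consumed further down in this commit when journal pins are set up and when new entries are written:
/* from later hunks in this commit: */
p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
atomic_set(&p->count, 1);
p->devs = i->devs;

journal_seq_pin(j, le64_to_cpu(jset->seq))->devs =
	bch2_extent_devs(bkey_i_to_s_c_extent(&j->key));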
@ -496,8 +496,8 @@ fsck_err:
#define journal_entry_err_on(cond, c, msg, ...) \
((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
int write)
static int journal_entry_validate_entries(struct bch_fs *c, struct jset *j,
int write)
{
struct jset_entry *entry;
int ret = 0;
@ -508,7 +508,7 @@ static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
if (journal_entry_err_on(vstruct_next(entry) >
vstruct_last(j), c,
"journal entry extends past end of jset")) {
j->u64s = cpu_to_le64((u64 *) entry - j->_data);
j->u64s = cpu_to_le32((u64 *) entry - j->_data);
break;
}
@ -614,7 +614,7 @@ static int journal_entry_validate(struct bch_fs *c,
"invalid journal entry: last_seq > seq"))
j->last_seq = j->seq;
return __journal_entry_validate(c, j, write);
return 0;
fsck_err:
return ret;
}
@ -722,7 +722,10 @@ reread: sectors_read = min_t(unsigned,
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
ret = journal_entry_add(c, jlist, j);
mutex_lock(&jlist->lock);
ret = journal_entry_add(c, ca, jlist, j);
mutex_unlock(&jlist->lock);
switch (ret) {
case JOURNAL_ENTRY_ADD_OK:
*entries_found = true;
@ -916,7 +919,9 @@ static int journal_seq_blacklist_read(struct journal *j,
for_each_jset_entry_type(entry, &i->j,
JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) {
seq = le64_to_cpu(entry->_data[0]);
struct jset_entry_blacklist *bl_entry =
container_of(entry, struct jset_entry_blacklist, entry);
seq = le64_to_cpu(bl_entry->seq);
bch_verbose(c, "blacklisting existing journal seq %llu", seq);
@ -982,6 +987,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
fsck_err_on(c->sb.clean && journal_has_keys(list), c,
"filesystem marked clean but journal has keys to replay");
list_for_each_entry(i, list, list) {
ret = journal_entry_validate_entries(c, &i->j, READ);
if (ret)
goto fsck_err;
}
i = list_last_entry(list, struct journal_replay, list);
unfixable_fsck_err_on(le64_to_cpu(i->j.seq) -
@ -1002,6 +1013,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, 0);
p->devs.nr = 0;
}
mutex_lock(&j->blacklist_lock);
@ -1010,6 +1022,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
atomic_set(&p->count, 1);
p->devs = i->devs;
if (journal_seq_blacklist_read(j, i, p)) {
mutex_unlock(&j->blacklist_lock);
@ -1090,7 +1103,7 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
{
struct journal_buf *w = journal_prev_buf(j);
atomic_dec_bug(&journal_seq_pin(j, w->data->seq)->count);
atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count);
if (!need_write_just_set &&
test_bit(JOURNAL_NEED_WRITE, &j->flags))
@ -1122,6 +1135,7 @@ static void __journal_entry_new(struct journal *j, int count)
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, count);
p->devs.nr = 0;
}
static void __bch2_journal_next_entry(struct journal *j)
@ -1851,6 +1865,21 @@ void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
bch2_journal_error(j));
}
int bch2_journal_flush_all_pins(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bool flush;
bch2_journal_flush_pins(j, U64_MAX);
spin_lock(&j->lock);
flush = last_seq(j) != j->last_seq_ondisk ||
c->btree_roots_dirty;
spin_unlock(&j->lock);
return flush ? bch2_journal_meta(j) : 0;
}
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
bool ret;
@ -2002,7 +2031,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
* i.e. whichever device was limiting the current journal entry size.
*/
extent_for_each_ptr_backwards(e, ptr) {
ca = c->devs[ptr->dev];
ca = bch_dev_bkey_exists(c, ptr->dev);
if (ca->mi.state != BCH_MEMBER_STATE_RW ||
ca->journal.sectors_free <= sectors)
@ -2197,7 +2226,7 @@ static void journal_write_endio(struct bio *bio)
struct bch_dev *ca = bio->bi_private;
struct journal *j = &ca->fs->journal;
if (bch2_dev_io_err_on(bio->bi_error, ca, "journal write") ||
if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") ||
bch2_meta_write_fault("journal")) {
/* Was this a flush or an actual journal write? */
if (ca->journal.ptr_idx != U8_MAX) {
@ -2233,6 +2262,7 @@ static void journal_write(struct closure *cl)
if (r->alive)
bch2_journal_add_btree_root(w, i, &r->key, r->level);
}
c->btree_roots_dirty = false;
mutex_unlock(&c->btree_root_lock);
journal_write_compact(jset);
@ -2246,7 +2276,7 @@ static void journal_write(struct closure *cl)
SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
__journal_entry_validate(c, jset, WRITE))
journal_entry_validate_entries(c, jset, WRITE))
goto err;
bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
@ -2257,7 +2287,7 @@ static void journal_write(struct closure *cl)
journal_nonce(jset), jset);
if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
__journal_entry_validate(c, jset, WRITE))
journal_entry_validate_entries(c, jset, WRITE))
goto err;
sectors = vstruct_sectors(jset, c->block_bits);
@ -2277,6 +2307,9 @@ static void journal_write(struct closure *cl)
BCH_DATA_JOURNAL))
goto err;
journal_seq_pin(j, le64_to_cpu(jset->seq))->devs =
bch2_extent_devs(bkey_i_to_s_c_extent(&j->key));
/*
* XXX: we really should just disable the entire journal in nochanges
* mode
@ -2285,7 +2318,7 @@ static void journal_write(struct closure *cl)
goto no_io;
extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) {
ca = c->devs[ptr->dev];
ca = bch_dev_bkey_exists(c, ptr->dev);
if (!percpu_ref_tryget(&ca->io_ref)) {
/* XXX: fix this */
bch_err(c, "missing device for journal write\n");
@ -2693,6 +2726,46 @@ int bch2_journal_flush(struct journal *j)
return bch2_journal_flush_seq(j, seq);
}
int bch2_journal_flush_device(struct journal *j, unsigned dev_idx)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin_list *p;
struct bch_devs_list devs;
u64 seq = 0;
unsigned iter;
int ret = 0;
spin_lock(&j->lock);
fifo_for_each_entry_ptr(p, &j->pin, iter)
if (bch2_dev_list_has_dev(p->devs, dev_idx))
seq = journal_pin_seq(j, p);
spin_unlock(&j->lock);
bch2_journal_flush_pins(j, seq);
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
seq = 0;
spin_lock(&j->lock);
while (!ret && seq < atomic64_read(&j->seq)) {
seq = max(seq, last_seq(j));
devs = journal_seq_pin(j, seq)->devs;
seq++;
spin_unlock(&j->lock);
ret = bch2_check_mark_super_devlist(c, &devs, BCH_DATA_JOURNAL);
spin_lock(&j->lock);
}
spin_unlock(&j->lock);
bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
}
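bch2_journal_flush_device() makes two passes: it first flushes every journal pin whose entry still has a copy on the target device, so nothing replayable depends on it, then it re-marks the device lists of the surviving pinned entries under replicas_gc so stale journal replicas entries for that device can be garbage collected. A hypothetical call site, for illustration only, would be device removal or evacuation:
/* hypothetical usage when evacuating a device: */
ret = bch2_journal_flush_device(&c->journal, ca->dev_idx);
if (ret)
	return ret;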
ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
@ -2862,9 +2935,7 @@ void bch2_fs_journal_stop(struct journal *j)
* journal entries, then force a brand new empty journal entry to be
* written:
*/
bch2_journal_flush_pins(j, U64_MAX);
bch2_journal_flush_async(j, NULL);
bch2_journal_meta(j);
bch2_journal_flush_all_pins(j);
cancel_delayed_work_sync(&j->write_work);
cancel_delayed_work_sync(&j->reclaim_work);

View File

@ -118,6 +118,8 @@
*/
struct journal_replay {
struct list_head list;
struct bch_devs_list devs;
/* must be last: */
struct jset j;
};
@ -164,6 +166,7 @@ void bch2_journal_pin_add_if_older(struct journal *,
struct journal_entry_pin *,
journal_pin_flush_fn);
void bch2_journal_flush_pins(struct journal *, u64);
int bch2_journal_flush_all_pins(struct journal *);
struct closure;
struct bch_fs;
@ -356,6 +359,7 @@ void bch2_journal_meta_async(struct journal *, struct closure *);
int bch2_journal_flush_seq(struct journal *, u64);
int bch2_journal_flush(struct journal *);
int bch2_journal_meta(struct journal *);
int bch2_journal_flush_device(struct journal *, unsigned);
void bch2_journal_halt(struct journal *);

View File

@ -34,6 +34,7 @@ struct journal_entry_pin_list {
struct list_head list;
struct list_head flushed;
atomic_t count;
struct bch_devs_list devs;
};
struct journal;

View File

@ -27,23 +27,9 @@ static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
#define MAX_DATA_OFF_ITER 10
/*
* This moves only the data off, leaving the meta-data (if any) in place.
* It walks the key space, and for any key with a valid pointer to the
* relevant device, it copies it elsewhere, updating the key to point to
* the copy.
* The meta-data is moved off by bch_move_meta_data_off_device.
*
* Note: If the number of data replicas desired is > 1, ideally, any
* new copies would not be made in the same device that already has a
* copy (if there are enough devices).
* This is _not_ currently implemented. The multiple replicas can
* land in the same device even if there are others available.
*/
int bch2_move_data_off_device(struct bch_dev *ca)
static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca,
int flags)
{
struct bch_fs *c = ca->fs;
struct btree_iter iter;
struct bkey_s_c k;
u64 keys_moved, sectors_moved;
@ -113,10 +99,6 @@ int bch2_move_data_off_device(struct bch_dev *ca)
return ret;
}
/*
* This walks the btree, and for any node on the relevant device it moves the
* node elsewhere.
*/
static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
enum btree_id id)
{
@ -200,9 +182,9 @@ static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
* is written.
*/
int bch2_move_metadata_off_device(struct bch_dev *ca)
static int bch2_dev_metadata_migrate(struct bch_fs *c, struct bch_dev *ca,
int flags)
{
struct bch_fs *c = ca->fs;
unsigned i;
int ret = 0;
@ -240,37 +222,31 @@ err:
return ret;
}
/*
* Flagging data bad when forcibly removing a device after failing to
* migrate the data off the device.
*/
static int bch2_flag_key_bad(struct btree_iter *iter,
struct bch_dev *ca,
struct bkey_s_c_extent orig)
int bch2_dev_data_migrate(struct bch_fs *c, struct bch_dev *ca, int flags)
{
BKEY_PADDED(key) tmp;
struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
struct bch_fs *c = ca->fs;
return bch2_dev_usrdata_migrate(c, ca, flags) ?:
bch2_dev_metadata_migrate(c, ca, flags);
}
bkey_reassemble(&tmp.key, orig.s_c);
e = bkey_i_to_s_extent(&tmp.key);
static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
unsigned dev_idx, int flags, bool metadata)
{
struct bch_extent_ptr *ptr;
unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
unsigned nr_good;
extent_for_each_ptr_backwards(e, ptr)
if (ptr->dev == ca->dev_idx)
if (ptr->dev == dev_idx)
bch2_extent_drop_ptr(e, ptr);
/*
* If the new extent no longer has any pointers, bch2_extent_normalize()
* will do the appropriate thing with it (turning it into a
* KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
*/
bch2_extent_normalize(c, e.s);
nr_good = bch2_extent_nr_good_ptrs(c, e.c);
if ((!nr_good && !(flags & lost)) ||
(nr_good < replicas && !(flags & degraded)))
return -EINVAL;
return bch2_btree_insert_at(c, NULL, NULL, NULL,
BTREE_INSERT_ATOMIC,
BTREE_INSERT_ENTRY(iter, &tmp.key));
return 0;
}
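drop_dev_ptrs() centralizes the force-removal policy: after dropping the target device's pointer, the extent must either still have the configured number of good replicas, or the caller must have passed the matching BCH_FORCE_IF_* flags. A worked example with data_replicas == 2 (illustrative values):
/*
 * nr_good after drop	flags required to proceed
 * ------------------	-------------------------
 * >= 2			none
 * 1			BCH_FORCE_IF_DATA_DEGRADED
 * 0			BCH_FORCE_IF_DATA_LOST | BCH_FORCE_IF_DATA_DEGRADED
 */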
/*
@ -284,11 +260,11 @@ static int bch2_flag_key_bad(struct btree_iter *iter,
* that we've already tried to move the data MAX_DATA_OFF_ITER times and
* are not likely to succeed if we try again.
*/
int bch2_flag_data_bad(struct bch_dev *ca)
static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
struct bch_fs *c = ca->fs;
struct bkey_s_c k;
struct bkey_s_c_extent e;
struct bkey_s_extent e;
BKEY_PADDED(key) tmp;
struct btree_iter iter;
int ret = 0;
@ -303,11 +279,33 @@ int bch2_flag_data_bad(struct bch_dev *ca)
if (!bkey_extent_is_data(k.k))
goto advance;
e = bkey_s_c_to_extent(k);
if (!bch2_extent_has_device(e, ca->dev_idx))
if (!bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx))
goto advance;
ret = bch2_flag_key_bad(&iter, ca, e);
bkey_reassemble(&tmp.key, k);
e = bkey_i_to_s_extent(&tmp.key);
ret = drop_dev_ptrs(c, e, dev_idx, flags, false);
if (ret)
break;
/*
* If the new extent no longer has any pointers, bch2_extent_normalize()
* will do the appropriate thing with it (turning it into a
* KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
*/
bch2_extent_normalize(c, e.s);
if (bkey_extent_is_data(e.k) &&
(ret = bch2_check_mark_super(c, e.c, BCH_DATA_USER)))
break;
iter.pos = bkey_start_pos(&tmp.key.k);
ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL,
BTREE_INSERT_ENTRY(&iter, &tmp.key));
/*
* don't want to leave ret == -EINTR, since if we raced and
@ -319,26 +317,6 @@ int bch2_flag_data_bad(struct bch_dev *ca)
if (ret)
break;
/*
* If the replica we're dropping was dirty and there is an
* additional cached replica, the cached replica will now be
* considered dirty - upon inserting the new version of the key,
* the bucket accounting will be updated to reflect the fact
* that the cached data is now dirty and everything works out as
* if by magic without us having to do anything.
*
* The one thing we need to be concerned with here is there's a
* race between when we drop any stale pointers from the key
* we're about to insert, and when the key actually gets
* inserted and the cached data is marked as dirty - we could
* end up trying to insert a key with a pointer that should be
* dirty, but points to stale data.
*
* If that happens the insert code just bails out and doesn't do
* the insert - however, it doesn't return an error. Hence we
* need to always recheck the current key before advancing to
* the next:
*/
continue;
advance:
if (bkey_extent_is_data(k.k)) {
@ -357,3 +335,80 @@ advance:
return ret;
}
static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
struct btree_iter iter;
struct closure cl;
struct btree *b;
unsigned id;
int ret;
/* don't handle this yet: */
if (flags & BCH_FORCE_IF_METADATA_LOST)
return -EINVAL;
closure_init_stack(&cl);
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
for (id = 0; id < BTREE_ID_NR; id++) {
for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
struct bkey_i_extent *new_key;
retry:
if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key),
dev_idx)) {
bch2_btree_iter_set_locks_want(&iter, 0);
ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key),
BCH_DATA_BTREE);
if (ret)
goto err;
} else {
bkey_copy(&tmp.k, &b->key);
new_key = bkey_i_to_extent(&tmp.k);
ret = drop_dev_ptrs(c, extent_i_to_s(new_key),
dev_idx, flags, true);
if (ret)
goto err;
if (!bch2_btree_iter_set_locks_want(&iter, U8_MAX)) {
b = bch2_btree_iter_peek_node(&iter);
goto retry;
}
ret = bch2_btree_node_update_key(c, &iter, b, new_key);
if (ret == -EINTR) {
b = bch2_btree_iter_peek_node(&iter);
goto retry;
}
if (ret)
goto err;
}
}
bch2_btree_iter_unlock(&iter);
/* btree root */
mutex_lock(&c->btree_root_lock);
mutex_unlock(&c->btree_root_lock);
}
ret = 0;
out:
bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
err:
bch2_btree_iter_unlock(&iter);
goto out;
}
int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
bch2_dev_metadata_drop(c, dev_idx, flags);
}

View File

@ -1,8 +1,7 @@
#ifndef _BCACHEFS_MIGRATE_H
#define _BCACHEFS_MIGRATE_H
int bch2_move_data_off_device(struct bch_dev *);
int bch2_move_metadata_off_device(struct bch_dev *);
int bch2_flag_data_bad(struct bch_dev *);
int bch2_dev_data_migrate(struct bch_fs *, struct bch_dev *, int);
int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
#endif /* _BCACHEFS_MIGRATE_H */

View File

@ -3,6 +3,7 @@
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
#include "inode.h"
#include "io.h"
#include "move.h"
#include "super-io.h"
@ -206,7 +207,7 @@ static void move_write(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
if (likely(!io->rbio.bio.bi_error)) {
if (likely(!io->rbio.bio.bi_status)) {
bch2_migrate_write_init(&io->write, &io->rbio);
closure_call(&io->write.op.cl, bch2_write, NULL, cl);
}
@ -240,6 +241,7 @@ static int bch2_move_extent(struct bch_fs *c,
struct write_point_specifier wp,
int btree_insert_flags,
int move_device,
struct bch_io_opts opts,
struct bkey_s_c k)
{
struct extent_pick_ptr pick;
@ -276,6 +278,7 @@ static int bch2_move_extent(struct bch_fs *c,
goto err;
}
io->rbio.opts = opts;
bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->rbio.bio.bi_iter.bi_size = sectors << 9;
@ -284,9 +287,13 @@ static int bch2_move_extent(struct bch_fs *c,
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
io->rbio.bio.bi_end_io = move_read_endio;
__bch2_write_op_init(&io->write.op, c);
io->write.btree_insert_flags = btree_insert_flags;
io->write.move_dev = move_device;
bch2_write_op_init(&io->write.op, c);
io->write.op.csum_type = bch2_data_checksum_type(c, opts.data_checksum);
io->write.op.compression_type =
bch2_compression_opt_to_type(opts.compression);
io->write.op.devs = devs;
io->write.op.write_point = wp;
@ -371,9 +378,11 @@ int bch2_move_data(struct bch_fs *c,
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct moving_context ctxt;
struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
struct btree_iter iter;
BKEY_PADDED(k) tmp;
struct bkey_s_c k;
u64 cur_inum = U64_MAX;
int ret = 0;
bch2_move_ctxt_init(&ctxt);
@ -396,7 +405,7 @@ int bch2_move_data(struct bch_fs *c,
(bch2_btree_iter_unlock(&iter),
(ret = bch2_ratelimit_wait_freezable_stoppable(rate))))
break;
peek:
k = bch2_btree_iter_peek(&iter);
if (!k.k)
break;
@ -404,8 +413,23 @@ int bch2_move_data(struct bch_fs *c,
if (ret)
break;
if (!bkey_extent_is_data(k.k) ||
!pred(arg, bkey_s_c_to_extent(k)))
if (!bkey_extent_is_data(k.k))
goto next;
if (cur_inum != k.k->p.inode) {
struct bch_inode_unpacked inode;
/* don't hold btree locks while looking up inode: */
bch2_btree_iter_unlock(&iter);
opts = bch2_opts_to_inode_opts(c->opts);
if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode));
cur_inum = k.k->p.inode;
goto peek;
}
if (!pred(arg, bkey_s_c_to_extent(k)))
goto next;
/* unlock before doing IO: */
@ -415,7 +439,7 @@ int bch2_move_data(struct bch_fs *c,
if (bch2_move_extent(c, &ctxt, devs, wp,
btree_insert_flags,
move_device, k)) {
move_device, opts, k)) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(&ctxt);
continue;

View File

@ -76,16 +76,27 @@ void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
#undef BCH_OPT
}
bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
{
switch (id) {
#define BCH_OPT(_name, ...) \
case Opt_##_name: \
return opt_defined(*opts, _name);
BCH_OPTS()
#undef BCH_OPT
default:
BUG();
}
}
u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
{
switch (id) {
#define BCH_OPT(_name, ...) \
case Opt_##_name: \
return opts->_name; \
return opts->_name;
BCH_OPTS()
#undef BCH_OPT
default:
BUG();
}
@ -98,10 +109,8 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
case Opt_##_name: \
opt_set(*opts, _name, v); \
break;
BCH_OPTS()
#undef BCH_OPT
default:
BUG();
}
@ -118,7 +127,6 @@ struct bch_opts bch2_opts_from_sb(struct bch_sb *sb)
#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \
if (_sb_opt != NO_SB_OPT) \
opt_set(opts, _name, _sb_opt(sb));
BCH_OPTS()
#undef BCH_OPT
@ -145,7 +153,7 @@ const struct bch_option bch2_opt_table[] = {
#undef BCH_OPT
};
static int bch2_opt_lookup(const char *name)
int bch2_opt_lookup(const char *name)
{
const struct bch_option *i;
@ -247,3 +255,52 @@ no_val:
pr_err("Mount option %s requires a value", name);
return -1;
}
/* io opts: */
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
{
struct bch_io_opts ret = { 0 };
#define BCH_INODE_OPT(_name, _bits) \
if (opt_defined(src, _name)) \
opt_set(ret, _name, src._name);
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
return ret;
}
struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src)
{
struct bch_opts ret = { 0 };
#define BCH_INODE_OPT(_name, _bits) \
if (opt_defined(src, _name)) \
opt_set(ret, _name, src._name);
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
return ret;
}
void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src)
{
#define BCH_INODE_OPT(_name, _bits) \
if (opt_defined(src, _name)) \
opt_set(*dst, _name, src._name);
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
}
bool bch2_opt_is_inode_opt(enum bch_opt_id id)
{
static const enum bch_opt_id inode_opt_list[] = {
#define BCH_INODE_OPT(_name, _bits) Opt_##_name,
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
};
unsigned i;
for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++)
if (inode_opt_list[i] == id)
return true;
return false;
}
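Together these helpers define the option layering used by the IO paths: filesystem-wide options are turned into a struct bch_io_opts baseline, then whatever the inode actually has set is applied on top. The bch2_move_data() hunk earlier in this commit uses exactly that pattern; a condensed sketch (inum is an illustrative variable, error handling elided):
struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
struct bch_inode_unpacked inode;

if (!bch2_inode_find_by_inum(c, inum, &inode))
	bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode));
/* opts: fs defaults, overridden by any per-inode settings */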

View File

@ -181,10 +181,7 @@ do { \
static inline struct bch_opts bch2_opts_empty(void)
{
struct bch_opts opts;
memset(&opts, 0, sizeof(opts));
return opts;
return (struct bch_opts) { 0 };
}
void bch2_opts_apply(struct bch_opts *, struct bch_opts);
@ -215,12 +212,35 @@ struct bch_option {
extern const struct bch_option bch2_opt_table[];
bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
struct bch_opts bch2_opts_from_sb(struct bch_sb *);
int bch2_opt_lookup(const char *);
int bch2_opt_parse(const struct bch_option *, const char *, u64 *);
int bch2_parse_mount_opts(struct bch_opts *, char *);
/* inode opts: */
#define BCH_INODE_OPTS() \
BCH_INODE_OPT(data_checksum, 8) \
BCH_INODE_OPT(compression, 8)
struct bch_io_opts {
#define BCH_INODE_OPT(_name, _bits) unsigned _name##_defined:1;
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
#define BCH_INODE_OPT(_name, _bits) u##_bits _name;
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
};
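For the two options listed above, the double expansion of BCH_INODE_OPTS() generates a defined bit plus a value field per option, so the struct is equivalent to:
struct bch_io_opts {
	unsigned	data_checksum_defined:1;
	unsigned	compression_defined:1;

	u8		data_checksum;
	u8		compression;
};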
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts);
void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts);
bool bch2_opt_is_inode_opt(enum bch_opt_id);
#endif /* _BCACHEFS_OPTS_H */

View File

@ -12,6 +12,8 @@
#include <linux/sort.h>
static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
static const char *bch2_sb_validate_replicas(struct bch_sb *);
static inline void __bch2_sb_layout_size_assert(void)
@ -157,7 +159,7 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
return NULL;
f = __bch2_sb_field_resize(sb->sb, f, u64s);
f->type = type;
f->type = cpu_to_le32(type);
return f;
}
@ -188,7 +190,7 @@ struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *c,
}
f = __bch2_sb_field_resize(c->disk_sb, f, u64s);
f->type = type;
f->type = cpu_to_le32(type);
return f;
}
@ -354,7 +356,16 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
return "Invalid number of metadata replicas";
return "Invalid number of data replicas";
if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
return "Invalid metadata checksum type";
if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
return "Invalid metadata checksum type";
if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR)
return "Invalid compression type";
if (!BCH_SB_BTREE_NODE_SIZE(sb))
return "Btree node size not set";
@ -507,7 +518,7 @@ static void __copy_super(struct bch_sb *dst, struct bch_sb *src)
if (src_f->type == BCH_SB_FIELD_journal)
continue;
dst_f = bch2_sb_field_get(dst, src_f->type);
dst_f = bch2_sb_field_get(dst, le32_to_cpu(src_f->type));
dst_f = __bch2_sb_field_resize(dst, dst_f,
le32_to_cpu(src_f->u64s));
@ -601,7 +612,7 @@ reread:
/* XXX: verify MACs */
csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
(struct nonce) { 0 }, sb->sb);
null_nonce(), sb->sb);
if (bch2_crc_cmp(csum, sb->sb->csum))
return "bad checksum reading superblock";
@ -688,9 +699,9 @@ const char *bch2_read_super(const char *path,
got_super:
pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
le64_to_cpu(ret->sb->version),
le64_to_cpu(ret->sb->flags),
le64_to_cpu(ret->sb->flags[0]),
le64_to_cpu(ret->sb->seq),
le16_to_cpu(ret->sb->u64s));
le32_to_cpu(ret->sb->u64s));
err = "Superblock block size smaller than device block size";
if (le16_to_cpu(ret->sb->block_size) << 9 <
@ -711,7 +722,7 @@ static void write_super_endio(struct bio *bio)
/* XXX: return errors directly */
if (bch2_dev_io_err_on(bio->bi_error, ca, "superblock write"))
if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write"))
ca->sb_write_error = 1;
closure_put(&ca->fs->sb_write);
@ -727,7 +738,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
(struct nonce) { 0 }, sb);
null_nonce(), sb);
bio_reset(bio);
bio->bi_bdev = ca->disk_sb.bdev;
@ -830,7 +841,12 @@ out:
bch2_sb_update(c);
}
/* replica information: */
/* Replicas tracking - in memory: */
#define for_each_cpu_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
_i = (void *) (_i) + (_r)->entry_size)
static inline struct bch_replicas_cpu_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
@ -838,6 +854,11 @@ cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
return (void *) r->entries + r->entry_size * i;
}
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}
static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
unsigned dev)
{
@ -856,6 +877,246 @@ static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
}
static unsigned bkey_to_replicas(struct bkey_s_c_extent e,
enum bch_data_type data_type,
struct bch_replicas_cpu_entry *r,
unsigned *max_dev)
{
const struct bch_extent_ptr *ptr;
unsigned nr = 0;
BUG_ON(!data_type ||
data_type == BCH_DATA_SB ||
data_type >= BCH_DATA_NR);
memset(r, 0, sizeof(*r));
r->data_type = data_type;
*max_dev = 0;
extent_for_each_ptr(e, ptr)
if (!ptr->cached) {
*max_dev = max_t(unsigned, *max_dev, ptr->dev);
replicas_set_dev(r, ptr->dev);
nr++;
}
return nr;
}
static struct bch_replicas_cpu *
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
struct bch_replicas_cpu_entry new_entry,
unsigned max_dev)
{
struct bch_replicas_cpu *new;
unsigned i, nr, entry_size;
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
DIV_ROUND_UP(max_dev + 1, 8);
entry_size = max(entry_size, old->entry_size);
nr = old->nr + 1;
new = kzalloc(sizeof(struct bch_replicas_cpu) +
nr * entry_size, GFP_NOIO);
if (!new)
return NULL;
new->nr = nr;
new->entry_size = entry_size;
for (i = 0; i < old->nr; i++)
memcpy(cpu_replicas_entry(new, i),
cpu_replicas_entry(old, i),
min(new->entry_size, old->entry_size));
memcpy(cpu_replicas_entry(new, old->nr),
&new_entry,
new->entry_size);
bch2_cpu_replicas_sort(new);
return new;
}
static bool replicas_has_entry(struct bch_replicas_cpu *r,
struct bch_replicas_cpu_entry search,
unsigned max_dev)
{
return max_dev < replicas_dev_slots(r) &&
eytzinger0_find(r->entries, r->nr,
r->entry_size,
memcmp, &search) < r->nr;
}
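The in-memory replicas table is a flat array of fixed-size entries, each a data_type byte followed by a device bitmap sized for the largest device index seen so far; the array is kept in eytzinger order so replicas_has_entry() can search it with a plain memcmp, and cpu_replicas_add_entry() re-sorts after appending. A sizing sketch, assuming devs[] immediately follows the single data_type byte:
/*
 * e.g. with max_dev == 10:
 *	entry_size = offsetof(struct bch_replicas_cpu_entry, devs)
 *		   + DIV_ROUND_UP(10 + 1, 8)
 *		   = 1 + 2 = 3 bytes
 * i.e. one data_type byte plus a two-byte bitmap covering devices 0..15.
 */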
noinline
static int bch2_check_mark_super_slowpath(struct bch_fs *c,
struct bch_replicas_cpu_entry new_entry,
unsigned max_dev)
{
struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r;
int ret = -ENOMEM;
mutex_lock(&c->sb_lock);
old_gc = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
if (!new_gc)
goto err;
}
old_r = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
/* recheck, might have raced */
if (replicas_has_entry(old_r, new_entry, max_dev))
goto out;
new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
if (!new_r)
goto err;
ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
if (ret)
goto err;
if (new_gc) {
rcu_assign_pointer(c->replicas_gc, new_gc);
kfree_rcu(old_gc, rcu);
}
rcu_assign_pointer(c->replicas, new_r);
kfree_rcu(old_r, rcu);
bch2_write_super(c);
out:
ret = 0;
err:
mutex_unlock(&c->sb_lock);
return ret;
}
static inline int __bch2_check_mark_super(struct bch_fs *c,
struct bch_replicas_cpu_entry search,
unsigned max_dev)
{
struct bch_replicas_cpu *r, *gc_r;
bool marked;
rcu_read_lock();
r = rcu_dereference(c->replicas);
gc_r = rcu_dereference(c->replicas_gc);
marked = replicas_has_entry(r, search, max_dev) &&
(!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
rcu_read_unlock();
return likely(marked) ? 0
: bch2_check_mark_super_slowpath(c, search, max_dev);
}
int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
enum bch_data_type data_type)
{
struct bch_replicas_cpu_entry search;
unsigned max_dev;
if (!bkey_to_replicas(e, data_type, &search, &max_dev))
return 0;
return __bch2_check_mark_super(c, search, max_dev);
}
int bch2_check_mark_super_devlist(struct bch_fs *c,
struct bch_devs_list *devs,
enum bch_data_type data_type)
{
struct bch_replicas_cpu_entry search = { .data_type = data_type };
unsigned i, max_dev = 0;
if (!devs->nr)
return 0;
for (i = 0; i < devs->nr; i++) {
max_dev = max_t(unsigned, max_dev, devs->devs[i]);
replicas_set_dev(&search, devs->devs[i]);
}
return __bch2_check_mark_super(c, search, max_dev);
}
int bch2_replicas_gc_end(struct bch_fs *c, int err)
{
struct bch_replicas_cpu *new_r, *old_r;
int ret = 0;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
new_r = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
if (err) {
rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(new_r, rcu);
goto err;
}
if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
ret = -ENOSPC;
goto err;
}
old_r = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, new_r);
rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(old_r, rcu);
bch2_write_super(c);
err:
mutex_unlock(&c->sb_lock);
return ret;
}
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
struct bch_replicas_cpu *dst, *src;
struct bch_replicas_cpu_entry *e;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
BUG_ON(c->replicas_gc);
src = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
dst = kzalloc(sizeof(struct bch_replicas_cpu) +
src->nr * src->entry_size, GFP_NOIO);
if (!dst) {
mutex_unlock(&c->sb_lock);
return -ENOMEM;
}
dst->nr = 0;
dst->entry_size = src->entry_size;
for_each_cpu_replicas_entry(src, e)
if (!((1 << e->data_type) & typemask))
memcpy(cpu_replicas_entry(dst, dst->nr++),
e, dst->entry_size);
bch2_cpu_replicas_sort(dst);
rcu_assign_pointer(c->replicas_gc, dst);
mutex_unlock(&c->sb_lock);
return 0;
}
/* Replicas tracking - superblock: */
static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
unsigned *nr,
unsigned *bytes,
@ -914,10 +1175,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
}
}
eytzinger0_sort(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
memcmp, NULL);
bch2_cpu_replicas_sort(cpu_r);
return cpu_r;
}
@ -926,14 +1184,12 @@ static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *cpu_r, *old_r;
lockdep_assert_held(&c->sb_lock);
sb_r = bch2_sb_get_replicas(c->disk_sb);
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
if (!cpu_r)
return -ENOMEM;
old_r = c->replicas;
old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, cpu_r);
if (old_r)
kfree_rcu(old_r, rcu);
@ -941,192 +1197,133 @@ static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
return 0;
}
static void bkey_to_replicas(struct bkey_s_c_extent e,
enum bch_data_type data_type,
struct bch_replicas_cpu_entry *r,
unsigned *max_dev)
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
struct bch_replicas_cpu *r)
{
const struct bch_extent_ptr *ptr;
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_entry *sb_e;
struct bch_replicas_cpu_entry *e;
size_t i, bytes;
BUG_ON(!data_type ||
data_type == BCH_DATA_SB ||
data_type >= BCH_DATA_NR);
bytes = sizeof(struct bch_sb_field_replicas);
memset(r, 0, sizeof(*r));
r->data_type = data_type;
for_each_cpu_replicas_entry(r, e) {
bytes += sizeof(struct bch_replicas_entry);
for (i = 0; i < r->entry_size - 1; i++)
bytes += hweight8(e->devs[i]);
}
*max_dev = 0;
sb_r = bch2_fs_sb_resize_replicas(c,
DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
if (!sb_r)
return -ENOSPC;
extent_for_each_ptr(e, ptr)
if (!ptr->cached) {
*max_dev = max_t(unsigned, *max_dev, ptr->dev);
replicas_set_dev(r, ptr->dev);
}
}
memset(&sb_r->entries, 0,
vstruct_end(&sb_r->field) -
(void *) &sb_r->entries);
/*
* for when gc of replica information is in progress:
*/
static int bch2_update_gc_replicas(struct bch_fs *c,
struct bch_replicas_cpu *gc_r,
struct bkey_s_c_extent e,
enum bch_data_type data_type)
{
struct bch_replicas_cpu_entry new_e;
struct bch_replicas_cpu *new;
unsigned i, nr, entry_size, max_dev;
sb_e = sb_r->entries;
for_each_cpu_replicas_entry(r, e) {
sb_e->data_type = e->data_type;
bkey_to_replicas(e, data_type, &new_e, &max_dev);
for (i = 0; i < replicas_dev_slots(r); i++)
if (replicas_test_dev(e, i))
sb_e->devs[sb_e->nr++] = i;
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
DIV_ROUND_UP(max_dev + 1, 8);
entry_size = max(entry_size, gc_r->entry_size);
nr = gc_r->nr + 1;
sb_e = replicas_entry_next(sb_e);
new = kzalloc(sizeof(struct bch_replicas_cpu) +
nr * entry_size, GFP_NOIO);
if (!new)
return -ENOMEM;
BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
}
new->nr = nr;
new->entry_size = entry_size;
for (i = 0; i < gc_r->nr; i++)
memcpy(cpu_replicas_entry(new, i),
cpu_replicas_entry(gc_r, i),
gc_r->entry_size);
memcpy(cpu_replicas_entry(new, nr - 1),
&new_e,
new->entry_size);
eytzinger0_sort(new->entries,
new->nr,
new->entry_size,
memcmp, NULL);
rcu_assign_pointer(c->replicas_gc, new);
kfree_rcu(gc_r, rcu);
return 0;
}
static bool replicas_has_extent(struct bch_replicas_cpu *r,
struct bkey_s_c_extent e,
enum bch_data_type data_type)
static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
{
struct bch_replicas_cpu_entry search;
unsigned max_dev;
struct bch_sb_field_members *mi;
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *cpu_r = NULL;
struct bch_replicas_entry *e;
const char *err;
unsigned i;
bkey_to_replicas(e, data_type, &search, &max_dev);
mi = bch2_sb_get_members(sb);
sb_r = bch2_sb_get_replicas(sb);
if (!sb_r)
return NULL;
return max_dev < replicas_dev_slots(r) &&
eytzinger0_find(r->entries, r->nr,
r->entry_size,
memcmp, &search) < r->nr;
for_each_replicas_entry(sb_r, e) {
err = "invalid replicas entry: invalid data type";
if (e->data_type >= BCH_DATA_NR)
goto err;
err = "invalid replicas entry: no devices";
if (!e->nr)
goto err;
err = "invalid replicas entry: too many devices";
if (e->nr >= BCH_REPLICAS_MAX)
goto err;
err = "invalid replicas entry: invalid device";
for (i = 0; i < e->nr; i++)
if (!bch2_dev_exists(sb, mi, e->devs[i]))
goto err;
}
err = "cannot allocate memory";
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
if (!cpu_r)
goto err;
sort_cmp_size(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
memcmp, NULL);
for (i = 0; i + 1 < cpu_r->nr; i++) {
struct bch_replicas_cpu_entry *l =
cpu_replicas_entry(cpu_r, i);
struct bch_replicas_cpu_entry *r =
cpu_replicas_entry(cpu_r, i + 1);
BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
err = "duplicate replicas entry";
if (!memcmp(l, r, cpu_r->entry_size))
goto err;
}
err = NULL;
err:
kfree(cpu_r);
return err;
}
/* Query replicas: */
bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
enum bch_data_type data_type)
{
struct bch_replicas_cpu_entry search;
unsigned max_dev;
bool ret;
if (!bkey_to_replicas(e, data_type, &search, &max_dev))
return true;
rcu_read_lock();
ret = replicas_has_extent(rcu_dereference(c->replicas),
e, data_type);
ret = replicas_has_entry(rcu_dereference(c->replicas),
search, max_dev);
rcu_read_unlock();
return ret;
}
noinline
static int bch2_check_mark_super_slowpath(struct bch_fs *c,
struct bkey_s_c_extent e,
enum bch_data_type data_type)
{
struct bch_replicas_cpu *gc_r;
const struct bch_extent_ptr *ptr;
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_entry *new_entry;
unsigned new_entry_bytes, new_u64s, nr, bytes, max_dev;
int ret = 0;
mutex_lock(&c->sb_lock);
gc_r = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
if (gc_r &&
!replicas_has_extent(gc_r, e, data_type)) {
ret = bch2_update_gc_replicas(c, gc_r, e, data_type);
if (ret)
goto err;
}
/* recheck, might have raced */
if (bch2_sb_has_replicas(c, e, data_type)) {
mutex_unlock(&c->sb_lock);
return 0;
}
new_entry_bytes = sizeof(struct bch_replicas_entry) +
bch2_extent_nr_dirty_ptrs(e.s_c);
sb_r = bch2_sb_get_replicas(c->disk_sb);
bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
new_u64s = DIV_ROUND_UP(bytes + new_entry_bytes, sizeof(u64));
sb_r = bch2_fs_sb_resize_replicas(c,
DIV_ROUND_UP(sizeof(*sb_r) + bytes + new_entry_bytes,
sizeof(u64)));
if (!sb_r) {
ret = -ENOSPC;
goto err;
}
new_entry = (void *) sb_r + bytes;
new_entry->data_type = data_type;
new_entry->nr = 0;
extent_for_each_ptr(e, ptr)
if (!ptr->cached)
new_entry->devs[new_entry->nr++] = ptr->dev;
ret = bch2_sb_replicas_to_cpu_replicas(c);
if (ret) {
memset(new_entry, 0,
vstruct_end(&sb_r->field) - (void *) new_entry);
goto err;
}
bch2_write_super(c);
err:
mutex_unlock(&c->sb_lock);
return ret;
}
int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
enum bch_data_type data_type)
{
struct bch_replicas_cpu *gc_r;
bool marked;
rcu_read_lock();
marked = replicas_has_extent(rcu_dereference(c->replicas),
e, data_type) &&
(!(gc_r = rcu_dereference(c->replicas_gc)) ||
replicas_has_extent(gc_r, e, data_type));
rcu_read_unlock();
if (marked)
return 0;
return bch2_check_mark_super_slowpath(c, e, data_type);
}
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
struct bch_devs_mask online_devs)
struct bch_devs_mask online_devs)
{
struct bch_sb_field_members *mi;
struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r;
unsigned i, dev, dev_slots, nr_online, nr_offline;
@ -1137,14 +1334,15 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c,
for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
ret.replicas[i].nr_online = UINT_MAX;
mi = bch2_sb_get_members(c->disk_sb);
rcu_read_lock();
r = rcu_dereference(c->replicas);
dev_slots = min_t(unsigned, replicas_dev_slots(r), c->sb.nr_devices);
dev_slots = replicas_dev_slots(r);
for (i = 0; i < r->nr; i++) {
e = cpu_replicas_entry(r, i);
BUG_ON(e->data_type >= ARRAY_SIZE(ret.replicas));
for_each_cpu_replicas_entry(r, e) {
if (e->data_type >= ARRAY_SIZE(ret.replicas))
panic("e %p data_type %u\n", e, e->data_type);
nr_online = nr_offline = 0;
@ -1152,6 +1350,8 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c,
if (!replicas_test_dev(e, dev))
continue;
BUG_ON(!bch2_dev_exists(c->disk_sb, mi, dev));
if (test_bit(dev, online_devs.d))
nr_online++;
else
@ -1216,7 +1416,7 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r;
unsigned i, ret = 0;
unsigned ret = 0;
rcu_read_lock();
r = rcu_dereference(c->replicas);
@ -1224,191 +1424,13 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
if (ca->dev_idx >= replicas_dev_slots(r))
goto out;
for (i = 0; i < r->nr; i++) {
e = cpu_replicas_entry(r, i);
for_each_cpu_replicas_entry(r, e)
if (replicas_test_dev(e, ca->dev_idx)) {
ret |= 1 << e->data_type;
break;
}
}
out:
rcu_read_unlock();
return ret;
}
static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
{
struct bch_sb_field_members *mi;
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *cpu_r = NULL;
struct bch_replicas_entry *e;
const char *err;
unsigned i;
mi = bch2_sb_get_members(sb);
sb_r = bch2_sb_get_replicas(sb);
if (!sb_r)
return NULL;
for_each_replicas_entry(sb_r, e) {
err = "invalid replicas entry: invalid data type";
if (e->data_type >= BCH_DATA_NR)
goto err;
err = "invalid replicas entry: too many devices";
if (e->nr >= BCH_REPLICAS_MAX)
goto err;
err = "invalid replicas entry: invalid device";
for (i = 0; i < e->nr; i++)
if (!bch2_dev_exists(sb, mi, e->devs[i]))
goto err;
}
err = "cannot allocate memory";
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
if (!cpu_r)
goto err;
sort_cmp_size(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
memcmp, NULL);
for (i = 0; i + 1 < cpu_r->nr; i++) {
struct bch_replicas_cpu_entry *l =
cpu_replicas_entry(cpu_r, i);
struct bch_replicas_cpu_entry *r =
cpu_replicas_entry(cpu_r, i + 1);
BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
err = "duplicate replicas entry";
if (!memcmp(l, r, cpu_r->entry_size))
goto err;
}
err = NULL;
err:
kfree(cpu_r);
return err;
}
int bch2_replicas_gc_end(struct bch_fs *c, int err)
{
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *r, *old_r;
struct bch_replicas_entry *dst_e;
size_t i, j, bytes, dev_slots;
int ret = 0;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
r = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
if (err) {
rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(r, rcu);
goto err;
}
dev_slots = replicas_dev_slots(r);
bytes = sizeof(struct bch_sb_field_replicas);
for (i = 0; i < r->nr; i++) {
struct bch_replicas_cpu_entry *e =
cpu_replicas_entry(r, i);
bytes += sizeof(struct bch_replicas_entry);
for (j = 0; j < r->entry_size - 1; j++)
bytes += hweight8(e->devs[j]);
}
sb_r = bch2_fs_sb_resize_replicas(c,
DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
if (!sb_r) {
ret = -ENOSPC;
goto err;
}
memset(&sb_r->entries, 0,
vstruct_end(&sb_r->field) -
(void *) &sb_r->entries);
dst_e = sb_r->entries;
for (i = 0; i < r->nr; i++) {
struct bch_replicas_cpu_entry *src_e =
cpu_replicas_entry(r, i);
dst_e->data_type = src_e->data_type;
for (j = 0; j < dev_slots; j++)
if (replicas_test_dev(src_e, j))
dst_e->devs[dst_e->nr++] = j;
dst_e = replicas_entry_next(dst_e);
}
old_r = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, r);
rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(old_r, rcu);
bch2_write_super(c);
err:
mutex_unlock(&c->sb_lock);
return ret;
}
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
struct bch_replicas_cpu *r, *src;
unsigned i;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
BUG_ON(c->replicas_gc);
src = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
r = kzalloc(sizeof(struct bch_replicas_cpu) +
src->nr * src->entry_size, GFP_NOIO);
if (!r) {
mutex_unlock(&c->sb_lock);
return -ENOMEM;
}
r->entry_size = src->entry_size;
r->nr = 0;
for (i = 0; i < src->nr; i++) {
struct bch_replicas_cpu_entry *dst_e =
cpu_replicas_entry(r, r->nr);
struct bch_replicas_cpu_entry *src_e =
cpu_replicas_entry(src, i);
if (!(src_e->data_type & typemask)) {
memcpy(dst_e, src_e, r->entry_size);
r->nr++;
}
}
eytzinger0_sort(r->entries,
r->nr,
r->entry_size,
memcmp, NULL);
rcu_assign_pointer(c->replicas_gc, r);
mutex_unlock(&c->sb_lock);
return 0;
}

View File

@ -125,23 +125,12 @@ void bch2_write_super(struct bch_fs *);
/* replicas: */
/* iterate over bch_sb_field_replicas: */
static inline struct bch_replicas_entry *
replicas_entry_next(struct bch_replicas_entry *i)
{
return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
}
#define for_each_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
(_i) = replicas_entry_next(_i))
bool bch2_sb_has_replicas(struct bch_fs *, struct bkey_s_c_extent,
enum bch_data_type);
int bch2_check_mark_super(struct bch_fs *, struct bkey_s_c_extent,
enum bch_data_type);
int bch2_check_mark_super_devlist(struct bch_fs *, struct bch_devs_list *,
enum bch_data_type);
struct replicas_status {
struct {
@ -161,4 +150,17 @@ unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
int bch2_replicas_gc_end(struct bch_fs *, int);
int bch2_replicas_gc_start(struct bch_fs *, unsigned);
/* iterate over superblock replicas - used by userspace tools: */
static inline struct bch_replicas_entry *
replicas_entry_next(struct bch_replicas_entry *i)
{
return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
}
#define for_each_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
(_i) = replicas_entry_next(_i))
#endif /* _BCACHEFS_SUPER_IO_H */
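The replicas_entry_next()/for_each_replicas_entry() helpers above walk the variable-length entries packed into the replicas superblock field. A minimal sketch of how a userspace tool might consume them, assuming the bcachefs headers for the struct definitions; the dump helper itself is illustrative, not part of this patch:
#include <stdio.h>
static void dump_replicas(struct bch_sb_field_replicas *r)
{
	struct bch_replicas_entry *e;
	unsigned i;
	/* each entry is data_type, nr, then nr device indexes */
	for_each_replicas_entry(r, e) {
		printf("type %u:", e->data_type);
		for (i = 0; i < e->nr; i++)
			printf(" dev %u", e->devs[i]);
		printf("\n");
	}
}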

View File

@ -140,8 +140,9 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
return c;
}
int bch2_congested(struct bch_fs *c, int bdi_bits)
int bch2_congested(void *data, int bdi_bits)
{
struct bch_fs *c = data;
struct backing_dev_info *bdi;
struct bch_dev *ca;
unsigned i;
@ -178,13 +179,6 @@ int bch2_congested(struct bch_fs *c, int bdi_bits)
return ret;
}
static int bch2_congested_fn(void *data, int bdi_bits)
{
struct bch_fs *c = data;
return bch2_congested(c, bdi_bits);
}
/* Filesystem RO/RW: */
/*
@ -218,7 +212,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
* Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes:
*/
bch2_journal_flush_pins(&c->journal, U64_MAX);
bch2_journal_flush_all_pins(&c->journal);
if (!bch2_journal_error(&c->journal))
bch2_btree_verify_flushed(c);
@ -379,8 +373,6 @@ static void bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[WRITE]);
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
if (c->bdi.bdi_list.next)
bdi_destroy(&c->bdi);
lg_lock_free(&c->usage_lock);
free_percpu(c->usage_percpu);
mempool_exit(&c->btree_bounce_pool);
@ -393,7 +385,7 @@ static void bch2_fs_free(struct bch_fs *c)
mempool_exit(&c->btree_reserve_pool);
mempool_exit(&c->fill_iter);
percpu_ref_exit(&c->writes);
kfree(c->replicas);
kfree(rcu_dereference_protected(c->replicas, 1));
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
@ -414,7 +406,7 @@ static void bch2_fs_exit(struct bch_fs *c)
for (i = 0; i < c->sb.nr_devices; i++)
if (c->devs[i])
bch2_dev_free(c->devs[i]);
bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
closure_debug_destroy(&c->cl);
kobject_put(&c->kobj);
@ -576,10 +568,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
sizeof(struct btree_update)) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
bioset_init(&c->btree_read_bio, 1,
offsetof(struct btree_read_bio, bio)) ||
bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) ||
bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) ||
bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) ||
offsetof(struct btree_read_bio, bio),
BIOSET_NEED_BVECS) ||
bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
BIOSET_NEED_BVECS) ||
bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
BIOSET_NEED_BVECS) ||
bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
BIOSET_NEED_BVECS) ||
mempool_init_page_pool(&c->bio_bounce_pages,
max_t(unsigned,
c->opts.btree_node_size,
@ -588,7 +584,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
lg_lock_init(&c->usage_lock) ||
mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) ||
bdi_setup_and_register(&c->bdi, "bcachefs") ||
bch2_io_clock_init(&c->io_clock[READ]) ||
bch2_io_clock_init(&c->io_clock[WRITE]) ||
bch2_fs_journal_init(&c->journal) ||
@ -599,10 +594,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_fsio_init(c))
goto err;
c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
c->bdi.congested_fn = bch2_congested_fn;
c->bdi.congested_data = c;
mi = bch2_sb_get_members(c->disk_sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb, mi, i) &&
@ -729,8 +720,12 @@ static const char *__bch2_fs_start(struct bch_fs *c)
continue;
err = "error reading btree root";
if (bch2_btree_root_read(c, i, k, level))
goto err;
if (bch2_btree_root_read(c, i, k, level)) {
if (i != BTREE_ID_ALLOC)
goto err;
mustfix_fsck_err(c, "error reading btree root");
}
}
err = "error reading allocation information";
@ -830,7 +825,7 @@ static const char *__bch2_fs_start(struct bch_fs *c)
closure_sync(&cl);
bch2_inode_init(c, &inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
inode.bi_inum = BCACHEFS_ROOT_INO;
bch2_inode_pack(&packed_inode, &inode);
@ -877,6 +872,7 @@ out:
bch2_journal_entries_free(&journal);
return err;
err:
fsck_err:
closure_sync(&cl);
switch (ret) {
@ -995,24 +991,20 @@ static void bch2_dev_free(struct bch_dev *ca)
kobject_put(&ca->kobj);
}
static void bch2_dev_io_ref_release(struct percpu_ref *ref)
{
struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
complete(&ca->offline_complete);
}
static void __bch2_dev_offline(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
lockdep_assert_held(&c->state_lock);
if (percpu_ref_is_zero(&ca->io_ref))
return;
__bch2_dev_read_only(c, ca);
reinit_completion(&ca->offline_complete);
reinit_completion(&ca->io_ref_completion);
percpu_ref_kill(&ca->io_ref);
wait_for_completion(&ca->offline_complete);
wait_for_completion(&ca->io_ref_completion);
if (ca->kobj.state_in_sysfs) {
struct kobject *block =
@ -1026,27 +1018,18 @@ static void __bch2_dev_offline(struct bch_dev *ca)
bch2_dev_journal_exit(ca);
}
static void bch2_dev_ref_release(struct percpu_ref *ref)
static void bch2_dev_ref_complete(struct percpu_ref *ref)
{
struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
complete(&ca->stop_complete);
complete(&ca->ref_completion);
}
static void bch2_dev_stop(struct bch_dev *ca)
static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
{
struct bch_fs *c = ca->fs;
struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
lockdep_assert_held(&c->state_lock);
BUG_ON(rcu_access_pointer(c->devs[ca->dev_idx]) != ca);
rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
synchronize_rcu();
reinit_completion(&ca->stop_complete);
percpu_ref_kill(&ca->ref);
wait_for_completion(&ca->stop_complete);
complete(&ca->io_ref_completion);
}
static int bch2_dev_sysfs_online(struct bch_dev *ca)
@ -1095,8 +1078,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
return -ENOMEM;
kobject_init(&ca->kobj, &bch2_dev_ktype);
init_completion(&ca->stop_complete);
init_completion(&ca->offline_complete);
init_completion(&ca->ref_completion);
init_completion(&ca->io_ref_completion);
ca->dev_idx = dev_idx;
__set_bit(ca->dev_idx, ca->self.d);
@ -1132,9 +1115,9 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
DIV_ROUND_UP(BTREE_NODE_RESERVE,
ca->mi.bucket_size / c->opts.btree_node_size);
if (percpu_ref_init(&ca->ref, bch2_dev_ref_release,
if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
0, GFP_KERNEL) ||
percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_release,
percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_BTREE], btree_node_reserve_buckets,
GFP_KERNEL) ||
@ -1155,7 +1138,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
GFP_KERNEL|__GFP_ZERO)) ||
!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
bioset_init(&ca->replica_set, 4,
offsetof(struct bch_write_bio, bio)) ||
offsetof(struct bch_write_bio, bio), 0) ||
!(ca->io_done = alloc_percpu(*ca->io_done)))
goto err;
@ -1180,8 +1163,6 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
struct bch_dev *ca;
int ret;
lockdep_assert_held(&c->sb_lock);
if (le64_to_cpu(sb->sb->seq) >
le64_to_cpu(c->disk_sb->seq))
bch2_sb_to_fs(c, sb->sb);
@ -1189,13 +1170,15 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
!c->devs[sb->sb->dev_idx]);
ca = c->devs[sb->sb->dev_idx];
ca = bch_dev_locked(c, sb->sb->dev_idx);
if (ca->disk_sb.bdev) {
bch_err(c, "already have device online in slot %u",
sb->sb->dev_idx);
return -EINVAL;
}
BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
ret = bch2_dev_journal_init(ca, sb->sb);
if (ret)
return ret;
@ -1222,7 +1205,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
if (bch2_dev_sysfs_online(ca))
pr_warn("error creating sysfs objects");
bch2_mark_dev_superblock(c, ca, 0);
bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
if (ca->mi.state == BCH_MEMBER_STATE_RW)
bch2_dev_allocator_add(c, ca);
@ -1293,6 +1276,7 @@ static bool bch2_fs_may_start(struct bch_fs *c)
{
struct replicas_status s;
struct bch_sb_field_members *mi;
struct bch_dev *ca;
unsigned i, flags = c->opts.degraded
? BCH_FORCE_IF_DEGRADED
: 0;
@ -1301,14 +1285,19 @@ static bool bch2_fs_may_start(struct bch_fs *c)
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb);
for (i = 0; i < c->disk_sb->nr_devices; i++)
if (bch2_dev_exists(c->disk_sb, mi, i) &&
!bch2_dev_is_online(c->devs[i]) &&
(c->devs[i]->mi.state == BCH_MEMBER_STATE_RW ||
c->devs[i]->mi.state == BCH_MEMBER_STATE_RO)) {
for (i = 0; i < c->disk_sb->nr_devices; i++) {
if (!bch2_dev_exists(c->disk_sb, mi, i))
continue;
ca = bch_dev_locked(c, i);
if (!bch2_dev_is_online(ca) &&
(ca->mi.state == BCH_MEMBER_STATE_RW ||
ca->mi.state == BCH_MEMBER_STATE_RO)) {
mutex_unlock(&c->sb_lock);
return false;
}
}
mutex_unlock(&c->sb_lock);
}
@ -1419,22 +1408,59 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
*
* flag_data_bad() does not check btree pointers
*/
ret = bch2_flag_data_bad(ca);
ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
if (ret) {
bch_err(ca, "Remove failed");
bch_err(ca, "Remove failed: error %i dropping data", ret);
goto err;
}
ret = bch2_journal_flush_device(&c->journal, ca->dev_idx);
if (ret) {
bch_err(ca, "Remove failed: error %i flushing journal", ret);
goto err;
}
data = bch2_dev_has_data(c, ca);
if (data) {
bch_err(ca, "Remove failed, still has data (%x)", data);
char data_has_str[100];
bch2_scnprint_flag_list(data_has_str,
sizeof(data_has_str),
bch2_data_types,
data);
bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
ret = -EBUSY;
goto err;
}
bch2_journal_meta(&c->journal);
ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC,
POS(ca->dev_idx, 0),
POS(ca->dev_idx + 1, 0),
ZERO_VERSION,
NULL, NULL, NULL);
if (ret) {
bch_err(ca, "Remove failed, error deleting alloc info");
goto err;
}
/*
* must flush all existing journal entries, they might have
* (overwritten) keys that point to the device we're removing:
*/
ret = bch2_journal_flush_all_pins(&c->journal);
if (ret) {
bch_err(ca, "Remove failed, journal error");
goto err;
}
__bch2_dev_offline(ca);
bch2_dev_stop(ca);
mutex_lock(&c->sb_lock);
rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
mutex_unlock(&c->sb_lock);
percpu_ref_kill(&ca->ref);
wait_for_completion(&ca->ref_completion);
bch2_dev_free(ca);
/*
@ -1542,7 +1568,7 @@ have_slot:
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
ca = c->devs[dev_idx];
ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = "journal alloc failed";
if (bch2_dev_journal_alloc(ca))
@ -1568,7 +1594,7 @@ err:
/* Hot add existing device to running filesystem: */
int bch2_dev_online(struct bch_fs *c, const char *path)
{
struct bch_sb_handle sb = { 0 };
struct bch_sb_handle sb = { NULL };
struct bch_dev *ca;
unsigned dev_idx;
const char *err;
@ -1593,7 +1619,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
}
mutex_unlock(&c->sb_lock);
ca = c->devs[dev_idx];
ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = __bch2_dev_read_write(c, ca);
if (err)
@ -1619,7 +1645,6 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
return -EINVAL;
}
__bch2_dev_read_only(c, ca);
__bch2_dev_offline(ca);
mutex_unlock(&c->state_lock);
@ -1629,37 +1654,31 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
{
unsigned data;
int ret;
int ret = 0;
mutex_lock(&c->state_lock);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
bch_err(ca, "Cannot migrate data off RW device");
mutex_unlock(&c->state_lock);
return -EINVAL;
ret = -EINVAL;
goto err;
}
mutex_unlock(&c->state_lock);
ret = bch2_move_data_off_device(ca);
ret = bch2_dev_data_migrate(c, ca, 0);
if (ret) {
bch_err(ca, "Error migrating data: %i", ret);
return ret;
}
ret = bch2_move_metadata_off_device(ca);
if (ret) {
bch_err(ca, "Error migrating metadata: %i", ret);
return ret;
goto err;
}
data = bch2_dev_has_data(c, ca);
if (data) {
bch_err(ca, "Migrate error: data still present (%x)", data);
return -EINVAL;
ret = -EINVAL;
goto err;
}
return 0;
err:
mutex_unlock(&c->state_lock);
return ret;
}
/* Filesystem open: */

View File

@ -59,6 +59,14 @@ static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
}
}
static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
unsigned dev)
{
BUG_ON(bch2_dev_list_has_dev(*devs, dev));
BUG_ON(devs->nr >= BCH_REPLICAS_MAX);
devs->devs[devs->nr++] = dev;
}
static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
struct bch_devs_mask *mask)
{
@ -131,6 +139,26 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
__for_each_online_member(ca, c, iter, \
(1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO))
/*
* If a key exists that references a device, the device won't be going away and
* we can omit rcu_read_lock():
*/
static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
{
EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
return rcu_dereference_check(c->devs[idx], 1);
}
static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
{
EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
return rcu_dereference_protected(c->devs[idx],
lockdep_is_held(&c->sb_lock) ||
lockdep_is_held(&c->state_lock));
}
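The two accessors above encode different lifetime guarantees: bch_dev_bkey_exists() relies on an existing key pinning the device, bch_dev_locked() on sb_lock or state_lock being held. A hedged sketch of the first pattern, using only names that appear in this patch (the helper itself is hypothetical):
static unsigned extent_max_tier(struct bch_fs *c, struct bkey_s_c_extent e)
{
	const struct bch_extent_ptr *ptr;
	unsigned max_tier = 0;
	/* the key's pointers pin the devices, so no rcu_read_lock() needed */
	extent_for_each_ptr(e, ptr)
		max_tier = max_t(unsigned, max_tier,
				 bch_dev_bkey_exists(c, ptr->dev)->mi.tier);
	return max_tier;
}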
/* XXX kill, move to struct bch_fs */
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
{
@ -146,7 +174,7 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
struct bch_fs *bch2_bdev_to_fs(struct block_device *);
struct bch_fs *bch2_uuid_to_fs(uuid_le);
int bch2_congested(struct bch_fs *, int);
int bch2_congested(void *, int);
bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
enum bch_member_state, int);

View File

@ -739,7 +739,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
c->open_buckets_wait.list.first ? "waiting" : "empty");
}
const char * const bch2_rw[] = {
static const char * const bch2_rw[] = {
"read",
"write",
NULL

View File

@ -6,7 +6,6 @@
#include "clock.h"
#include "extents.h"
#include "io.h"
#include "keylist.h"
#include "move.h"
#include "super-io.h"
#include "tier.h"
@ -28,7 +27,7 @@ static bool tiering_pred(void *arg, struct bkey_s_c_extent e)
return false;
extent_for_each_ptr(e, ptr)
if (c->devs[ptr->dev]->mi.tier >= tier->idx)
if (bch_dev_bkey_exists(c, ptr->dev)->mi.tier >= tier->idx)
replicas++;
return replicas < c->opts.data_replicas;

View File

@ -34,8 +34,12 @@ struct closure;
#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0)
#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0)
#define memcpy(_dst, _src, _len) \
#define memcpy(dst, src, len) \
({ \
void *_dst = (dst); \
const void *_src = (src); \
size_t _len = (len); \
\
BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \
(void *) (_dst) + (_len) <= (void *) (_src))); \
memcpy(_dst, _src, _len); \

View File

@ -9,10 +9,10 @@
*/
#define __vstruct_u64s(_s) \
({ \
( type_is((_s)->u64s, u64) ? le64_to_cpu((_s)->u64s) \
: type_is((_s)->u64s, u32) ? le32_to_cpu((_s)->u64s) \
: type_is((_s)->u64s, u16) ? le16_to_cpu((_s)->u64s) \
: ((_s)->u64s)); \
( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \
: type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \
: type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s) \
: ((__force u8) ((_s)->u64s))); \
})
#define __vstruct_bytes(_type, _u64s) \

View File

@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_update.h"
#include "compress.h"
#include "extents.h"
#include "fs.h"
#include "str_hash.h"
@ -358,6 +359,129 @@ static const struct xattr_handler bch_xattr_security_handler = {
.flags = BCH_XATTR_INDEX_SECURITY,
};
#ifndef NO_BCACHEFS_FS
static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *vinode,
const char *name, void *buffer, size_t size)
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_opts opts =
bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode));
const struct bch_option *opt;
int ret, id;
u64 v;
id = bch2_opt_lookup(name);
if (id < 0 || !bch2_opt_is_inode_opt(id))
return -EINVAL;
opt = bch2_opt_table + id;
if (!bch2_opt_defined_by_id(&opts, id))
return -ENODATA;
v = bch2_opt_get_by_id(&opts, id);
if (opt->type == BCH_OPT_STR)
ret = snprintf(buffer, size, "%s", opt->choices[v]);
else
ret = snprintf(buffer, size, "%llu", v);
return ret <= size || !buffer ? ret : -ERANGE;
}
struct inode_opt_set {
int id;
u64 v;
bool defined;
};
static int inode_opt_set_fn(struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
struct inode_opt_set *s = p;
if (s->defined)
bch2_inode_opt_set(bi, s->id, s->v);
else
bch2_inode_opt_clear(bi, s->id);
return 0;
}
static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *vinode,
const char *name, const void *value,
size_t size, int flags)
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
const struct bch_option *opt;
char *buf;
struct inode_opt_set s;
int ret;
s.id = bch2_opt_lookup(name);
if (s.id < 0 || !bch2_opt_is_inode_opt(s.id))
return -EINVAL;
opt = bch2_opt_table + s.id;
if (value) {
buf = kmalloc(size + 1, GFP_KERNEL);
if (!buf)
return -ENOMEM;
memcpy(buf, value, size);
buf[size] = '\0';
ret = bch2_opt_parse(opt, buf, &s.v);
kfree(buf);
if (ret < 0)
return ret;
if (s.id == Opt_compression) {
mutex_lock(&c->sb_lock);
ret = bch2_check_set_has_compressed_data(c, s.v);
mutex_unlock(&c->sb_lock);
if (ret)
return ret;
}
s.defined = true;
} else {
s.defined = false;
}
mutex_lock(&inode->ei_update_lock);
ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s);
mutex_unlock(&inode->ei_update_lock);
return ret;
}
static const struct xattr_handler bch_xattr_bcachefs_handler = {
.prefix = "bcachefs.",
.get = bch2_xattr_bcachefs_get,
.set = bch2_xattr_bcachefs_set,
};
#endif /* NO_BCACHEFS_FS */
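The "bcachefs." handler above exposes per-inode options through the normal xattr syscalls. A hypothetical userspace caller might look like the sketch below; the option name is an example (any name accepted by bch2_opt_lookup() works), and clearing an option is done with removexattr(), which reaches inode_opt_set_fn() with the option undefined:
#include <string.h>
#include <sys/xattr.h>
/* the value string is parsed by bch2_opt_parse() on the kernel side */
static int set_compression(const char *path, const char *algo)
{
	return setxattr(path, "bcachefs.compression", algo, strlen(algo), 0);
}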
const struct xattr_handler *bch2_xattr_handlers[] = {
&bch_xattr_user_handler,
&posix_acl_access_xattr_handler,
&posix_acl_default_xattr_handler,
&bch_xattr_trusted_handler,
&bch_xattr_security_handler,
#ifndef NO_BCACHEFS_FS
&bch_xattr_bcachefs_handler,
#endif
NULL
};
static const struct xattr_handler *bch_xattr_handler_map[] = {
[BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler,
[BCH_XATTR_INDEX_POSIX_ACL_ACCESS] =
@ -368,15 +492,6 @@ static const struct xattr_handler *bch_xattr_handler_map[] = {
[BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
};
const struct xattr_handler *bch2_xattr_handlers[] = {
&bch_xattr_user_handler,
&posix_acl_access_xattr_handler,
&posix_acl_default_xattr_handler,
&bch_xattr_trusted_handler,
&bch_xattr_security_handler,
NULL
};
static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type)
{
return type < ARRAY_SIZE(bch_xattr_handler_map)

View File

@ -19,7 +19,38 @@
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/export.h>
static const struct {
int err;
const char *name;
} blk_errors[] = {
[BLK_STS_OK] = { 0, "" },
[BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" },
[BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" },
[BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" },
[BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" },
[BLK_STS_TARGET] = { -EREMOTEIO, "critical target" },
[BLK_STS_NEXUS] = { -EBADE, "critical nexus" },
[BLK_STS_MEDIUM] = { -ENODATA, "critical medium" },
[BLK_STS_PROTECTION] = { -EILSEQ, "protection" },
[BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" },
[BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" },
/* device mapper special case, should not leak out: */
[BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" },
/* everything else not covered above: */
[BLK_STS_IOERR] = { -EIO, "I/O" },
};
int blk_status_to_errno(blk_status_t status)
{
int idx = (__force int)status;
if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
return -EIO;
return blk_errors[idx].err;
}
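A minimal sketch of a completion handler consuming the mapping above; the handler name and logging are hypothetical, but error reporting via strerror() mirrors what the userspace shim does elsewhere:
#include <stdio.h>
#include <string.h>
static void example_endio(struct bio *bio)
{
	int err = blk_status_to_errno(bio->bi_status);
	if (err)
		fprintf(stderr, "I/O error: %s\n", strerror(-err));
	bio_put(bio);
}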
void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter)
@ -199,8 +230,8 @@ static struct bio *__bio_chain_endio(struct bio *bio)
{
struct bio *parent = bio->bi_private;
if (!parent->bi_error)
parent->bi_error = bio->bi_error;
if (!parent->bi_status)
parent->bi_status = bio->bi_status;
bio_put(bio);
return parent;
}
@ -233,27 +264,6 @@ again:
bio->bi_end_io(bio);
}
void bio_endio_nodec(struct bio *bio)
{
goto nodec;
while (bio) {
if (unlikely(!bio_remaining_done(bio)))
break;
nodec:
if (bio->bi_end_io == bio_chain_endio) {
struct bio *parent = bio->bi_private;
parent->bi_error = bio->bi_error;
bio_put(bio);
bio = parent;
} else {
if (bio->bi_end_io)
bio->bi_end_io(bio);
bio = NULL;
}
}
}
void bio_reset(struct bio *bio)
{
unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);

View File

@ -32,7 +32,7 @@ void generic_make_request(struct bio *bio)
ret = fdatasync(bio->bi_bdev->bd_fd);
if (ret) {
fprintf(stderr, "fsync error: %m\n");
bio->bi_error = -EIO;
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
return;
}
@ -106,7 +106,7 @@ int submit_bio_wait(struct bio *bio)
submit_bio(bio);
wait_for_completion(&done);
return bio->bi_error;
return blk_status_to_errno(bio->bi_status);
}
int blkdev_issue_discard(struct block_device *bdev,
@ -235,10 +235,8 @@ static int aio_completion_thread(void *arg)
for (ev = events; ev < events + ret; ev++) {
struct bio *bio = (struct bio *) ev->data;
if (ev->res < 0)
bio->bi_error = ev->res;
else if (ev->res != bio->bi_iter.bi_size)
bio->bi_error = -EIO;
if (ev->res != bio->bi_iter.bi_size)
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
}