Update bcachefs sources to 8c94740b1bf8 bcachefs: Add missing validation for jset_entry_data_usage

Kent Overstreet 2023-11-25 21:51:30 -05:00
parent 138397d892
commit 3a0cc86e76
36 changed files with 609 additions and 425 deletions
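
The headline change is the extra validation performed when parsing jset_entry_data_usage journal entries: the embedded bch_replicas_entry_v1 is now run through the new bch2_replicas_entry_validate() before the entry is accepted (see the journal_io.c and replicas.c hunks below). The following standalone C sketch only mirrors the shape of those checks; the struct layout and the dev_exists() helper are simplified stand-ins, not the real bcachefs types.

/*
 * Simplified, self-contained sketch of the replicas-entry checks this
 * update introduces (cf. bch2_replicas_entry_validate() in the diff below).
 * The types and dev_exists() here are illustrative stand-ins only.
 */
#include <stdbool.h>

struct replicas_entry {
	unsigned nr_devs;	/* number of devices listed in the entry */
	unsigned nr_required;	/* replicas required for reads to succeed */
	unsigned devs[16];	/* device indices */
};

static bool dev_exists(unsigned dev)
{
	return dev < 4;		/* stand-in for bch2_dev_exists(sb, dev) */
}

static int replicas_entry_validate(const struct replicas_entry *r)
{
	if (!r->nr_devs)
		return -1;				/* "no devices in entry" */

	if (r->nr_required > 1 && r->nr_required >= r->nr_devs)
		return -1;				/* "bad nr_required in entry" */

	for (unsigned i = 0; i < r->nr_devs; i++)
		if (!dev_exists(r->devs[i]))
			return -1;			/* "invalid device %u in entry" */

	return 0;
}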


@ -1 +1 @@
783085c3cc440183ba5e987b1aa7791cc1ca42ba 8c94740b1bf8645d3398170f41c9c88b78332252


@ -261,10 +261,8 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
case BCH_DATA_free: case BCH_DATA_free:
case BCH_DATA_need_gc_gens: case BCH_DATA_need_gc_gens:
case BCH_DATA_need_discard: case BCH_DATA_need_discard:
bkey_fsck_err_on(a.v->dirty_sectors || bkey_fsck_err_on(bch2_bucket_sectors(*a.v) || a.v->stripe,
a.v->cached_sectors || c, err, alloc_key_empty_but_have_data,
a.v->stripe, c, err,
alloc_key_empty_but_have_data,
"empty data type free but have data"); "empty data type free but have data");
break; break;
case BCH_DATA_sb: case BCH_DATA_sb:
@ -272,22 +270,21 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
case BCH_DATA_btree: case BCH_DATA_btree:
case BCH_DATA_user: case BCH_DATA_user:
case BCH_DATA_parity: case BCH_DATA_parity:
bkey_fsck_err_on(!a.v->dirty_sectors, c, err, bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v),
alloc_key_dirty_sectors_0, c, err, alloc_key_dirty_sectors_0,
"data_type %s but dirty_sectors==0", "data_type %s but dirty_sectors==0",
bch2_data_types[a.v->data_type]); bch2_data_types[a.v->data_type]);
break; break;
case BCH_DATA_cached: case BCH_DATA_cached:
bkey_fsck_err_on(!a.v->cached_sectors || bkey_fsck_err_on(!a.v->cached_sectors ||
a.v->dirty_sectors || bch2_bucket_sectors_dirty(*a.v) ||
a.v->stripe, c, err, a.v->stripe,
alloc_key_cached_inconsistency, c, err, alloc_key_cached_inconsistency,
"data type inconsistency"); "data type inconsistency");
bkey_fsck_err_on(!a.v->io_time[READ] && bkey_fsck_err_on(!a.v->io_time[READ] &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
c, err, c, err, alloc_key_cached_but_read_time_zero,
alloc_key_cached_but_read_time_zero,
"cached bucket with read_time == 0"); "cached bucket with read_time == 0");
break; break;
case BCH_DATA_stripe: case BCH_DATA_stripe:
@ -790,8 +787,7 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
new_a->data_type = alloc_data_type(*new_a, new_a->data_type); new_a->data_type = alloc_data_type(*new_a, new_a->data_type);
if (new_a->dirty_sectors > old_a->dirty_sectors || if (bch2_bucket_sectors(*new_a) > bch2_bucket_sectors(*old_a)) {
new_a->cached_sectors > old_a->cached_sectors) {
new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
@ -1509,6 +1505,27 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
if (a->data_type != BCH_DATA_cached) if (a->data_type != BCH_DATA_cached)
return 0; return 0;
if (fsck_err_on(!a->io_time[READ], c,
alloc_key_cached_but_read_time_zero,
"cached bucket with read_time 0\n"
" %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
struct bkey_i_alloc_v4 *a_mut =
bch2_alloc_to_v4_mut(trans, alloc_k);
ret = PTR_ERR_OR_ZERO(a_mut);
if (ret)
goto err;
a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
ret = bch2_trans_update(trans, alloc_iter,
&a_mut->k_i, BTREE_TRIGGER_NORUN);
if (ret)
goto err;
a = &a_mut->v;
}
lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
lru_pos(alloc_k.k->p.inode, lru_pos(alloc_k.k->p.inode,
bucket_to_u64(alloc_k.k->p), bucket_to_u64(alloc_k.k->p),
@ -1517,41 +1534,18 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
if (ret) if (ret)
return ret; return ret;
if (fsck_err_on(!a->io_time[READ], c, if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
alloc_key_cached_but_read_time_zero,
"cached bucket with read_time 0\n"
" %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
alloc_key_to_missing_lru_entry, alloc_key_to_missing_lru_entry,
"missing lru entry\n" "missing lru entry\n"
" %s", " %s",
(printbuf_reset(&buf), (printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
u64 read_time = a->io_time[READ] ?:
atomic64_read(&c->io_clock[READ].now);
ret = bch2_lru_set(trans, ret = bch2_lru_set(trans,
alloc_k.k->p.inode, alloc_k.k->p.inode,
bucket_to_u64(alloc_k.k->p), bucket_to_u64(alloc_k.k->p),
read_time); a->io_time[READ]);
if (ret) if (ret)
goto err; goto err;
if (a->io_time[READ] != read_time) {
struct bkey_i_alloc_v4 *a_mut =
bch2_alloc_to_v4_mut(trans, alloc_k);
ret = PTR_ERR_OR_ZERO(a_mut);
if (ret)
goto err;
a_mut->v.io_time[READ] = read_time;
ret = bch2_trans_update(trans, alloc_iter,
&a_mut->k_i, BTREE_TRIGGER_NORUN);
if (ret)
goto err;
}
} }
err: err:
fsck_err: fsck_err:
@ -1564,15 +1558,13 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
{ {
struct btree_iter iter; struct btree_iter iter;
struct bkey_s_c k; struct bkey_s_c k;
int ret = 0;
ret = bch2_trans_run(c, int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
POS_MIN, BTREE_ITER_PREFETCH, k, POS_MIN, BTREE_ITER_PREFETCH, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
bch2_check_alloc_to_lru_ref(trans, &iter))); bch2_check_alloc_to_lru_ref(trans, &iter)));
if (ret) bch_err_fn(c, ret);
bch_err_fn(c, ret);
return ret; return ret;
} }
@ -1734,28 +1726,25 @@ void bch2_do_discards(struct bch_fs *c)
static int invalidate_one_bucket(struct btree_trans *trans, static int invalidate_one_bucket(struct btree_trans *trans,
struct btree_iter *lru_iter, struct btree_iter *lru_iter,
struct bkey_s_c lru_k, struct bkey_s_c lru_k,
struct bpos *last_flushed_pos,
s64 *nr_to_invalidate) s64 *nr_to_invalidate)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct btree_iter alloc_iter = { NULL };
struct bkey_i_alloc_v4 *a = NULL;
struct printbuf buf = PRINTBUF;
struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
unsigned cached_sectors;
int ret = 0; int ret = 0;
if (*nr_to_invalidate <= 0) if (*nr_to_invalidate <= 0)
return 1; return 1;
if (!bch2_dev_bucket_exists(c, bucket)) { ret = bch2_check_lru_key(trans, lru_iter, lru_k, last_flushed_pos);
prt_str(&buf, "lru entry points to invalid bucket"); if (ret)
goto err; return ret < 0 ? ret : 0;
}
struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
return 0; return 0;
a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); struct btree_iter alloc_iter;
struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket);
ret = PTR_ERR_OR_ZERO(a); ret = PTR_ERR_OR_ZERO(a);
if (ret) if (ret)
goto out; goto out;
@ -1769,7 +1758,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
if (!a->v.cached_sectors) if (!a->v.cached_sectors)
bch_err(c, "invalidating empty bucket, confused"); bch_err(c, "invalidating empty bucket, confused");
cached_sectors = a->v.cached_sectors; unsigned cached_sectors = a->v.cached_sectors;
SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
a->v.gen++; a->v.gen++;
@ -1791,28 +1780,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
--*nr_to_invalidate; --*nr_to_invalidate;
out: out:
bch2_trans_iter_exit(trans, &alloc_iter); bch2_trans_iter_exit(trans, &alloc_iter);
printbuf_exit(&buf);
return ret; return ret;
err:
prt_str(&buf, "\n lru key: ");
bch2_bkey_val_to_text(&buf, c, lru_k);
prt_str(&buf, "\n lru entry: ");
bch2_lru_pos_to_text(&buf, lru_iter->pos);
prt_str(&buf, "\n alloc key: ");
if (!a)
bch2_bpos_to_text(&buf, bucket);
else
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
bch_err(c, "%s", buf.buf);
if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) {
bch2_inconsistent_error(c);
ret = -EINVAL;
}
goto out;
} }
static void bch2_do_invalidates_work(struct work_struct *work) static void bch2_do_invalidates_work(struct work_struct *work)
@ -1822,6 +1790,7 @@ static void bch2_do_invalidates_work(struct work_struct *work)
struct btree_trans *trans = bch2_trans_get(c); struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter; struct btree_iter iter;
struct bkey_s_c k; struct bkey_s_c k;
struct bpos last_flushed_pos = POS_MIN;
unsigned i; unsigned i;
int ret = 0; int ret = 0;
@ -1837,7 +1806,8 @@ static void bch2_do_invalidates_work(struct work_struct *work)
lru_pos(ca->dev_idx, 0, 0), lru_pos(ca->dev_idx, 0, 0),
lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
BTREE_ITER_INTENT, k, BTREE_ITER_INTENT, k,
invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate)); invalidate_one_bucket(trans, &iter, k, &last_flushed_pos,
&nr_to_invalidate));
if (ret < 0) { if (ret < 0) {
percpu_ref_put(&ca->ref); percpu_ref_put(&ca->ref);


@ -71,6 +71,24 @@ static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type)
return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type; return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type;
} }
static inline unsigned bch2_bucket_sectors(struct bch_alloc_v4 a)
{
return a.dirty_sectors + a.cached_sectors;
}
static inline unsigned bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
{
return a.dirty_sectors;
}
static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca,
struct bch_alloc_v4 a)
{
unsigned d = bch2_bucket_sectors_dirty(a);
return d ? max(0U, ca->mi.bucket_size - d) : 0;
}
static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
{ {
return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0; return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0;
@ -90,10 +108,11 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
struct bch_dev *ca) struct bch_dev *ca)
{ {
if (!data_type_movable(a.data_type) || if (!data_type_movable(a.data_type) ||
a.dirty_sectors >= ca->mi.bucket_size) !bch2_bucket_sectors_fragmented(ca, a))
return 0; return 0;
return div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size); u64 d = bch2_bucket_sectors_dirty(a);
return div_u64(d * (1ULL << 31), ca->mi.bucket_size);
} }
static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)


@ -1345,6 +1345,9 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
int ret; int ret;
int i; int i;
if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
erasure_code = false;
BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
BUG_ON(!nr_replicas || !nr_replicas_required); BUG_ON(!nr_replicas || !nr_replicas_required);


@ -935,7 +935,7 @@ struct bch_fs {
mempool_t compression_bounce[2]; mempool_t compression_bounce[2];
mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR];
mempool_t decompress_workspace; mempool_t decompress_workspace;
ZSTD_parameters zstd_params; size_t zstd_workspace_size;
struct crypto_shash *sha256; struct crypto_shash *sha256;
struct crypto_sync_skcipher *chacha20; struct crypto_sync_skcipher *chacha20;


@ -151,7 +151,11 @@ struct bpos {
#else #else
#error edit for your odd byteorder. #error edit for your odd byteorder.
#endif #endif
} __packed __aligned(4); } __packed
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__aligned(4)
#endif
;
#define KEY_INODE_MAX ((__u64)~0ULL) #define KEY_INODE_MAX ((__u64)~0ULL)
#define KEY_OFFSET_MAX ((__u64)~0ULL) #define KEY_OFFSET_MAX ((__u64)~0ULL)
@ -2203,8 +2207,8 @@ struct jset_entry_dev_usage {
__le32 dev; __le32 dev;
__u32 pad; __u32 pad;
__le64 buckets_ec; __le64 _buckets_ec; /* No longer used */
__le64 _buckets_unavailable; /* No longer used */ __le64 _buckets_unavailable; /* No longer used */
struct jset_entry_dev_usage_type d[]; struct jset_entry_dev_usage_type d[];
}; };


@ -81,6 +81,8 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume) #define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)
#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume) #define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume)
#define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2)
/* ioctl below act on a particular file, not the filesystem as a whole: */ /* ioctl below act on a particular file, not the filesystem as a whole: */
#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) #define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *)
@ -298,7 +300,20 @@ struct bch_ioctl_dev_usage {
__u64 buckets; __u64 buckets;
__u64 sectors; __u64 sectors;
__u64 fragmented; __u64 fragmented;
} d[BCH_DATA_NR]; } d[10];
};
struct bch_ioctl_dev_usage_v2 {
__u64 dev;
__u32 flags;
__u8 state;
__u8 nr_data_types;
__u8 pad[6];
__u32 bucket_size;
__u64 nr_buckets;
struct bch_ioctl_dev_usage_type d[0];
}; };
/* /*


@ -1254,9 +1254,6 @@ static int bch2_gc_done(struct bch_fs *c,
copy_dev_field(dev_usage_fragmented_wrong, copy_dev_field(dev_usage_fragmented_wrong,
d[i].fragmented, "%s fragmented", bch2_data_types[i]); d[i].fragmented, "%s fragmented", bch2_data_types[i]);
} }
copy_dev_field(dev_usage_buckets_ec_wrong,
buckets_ec, "buckets_ec");
} }
{ {


@ -361,7 +361,6 @@ noinline static int
btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
struct btree_path *path, unsigned new_u64s) struct btree_path *path, unsigned new_u64s)
{ {
struct bch_fs *c = trans->c;
struct btree_insert_entry *i; struct btree_insert_entry *i;
struct bkey_cached *ck = (void *) path->l[0].b; struct bkey_cached *ck = (void *) path->l[0].b;
struct bkey_i *new_k; struct bkey_i *new_k;
@ -372,7 +371,7 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
if (!new_k) { if (!new_k) {
bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
bch2_btree_id_str(path->btree_id), new_u64s); bch2_btree_id_str(path->btree_id), new_u64s);
return -BCH_ERR_ENOMEM_btree_key_cache_insert; return -BCH_ERR_ENOMEM_btree_key_cache_insert;
} }


@ -29,14 +29,12 @@ static inline bool wb_key_cmp(const struct wb_key_ref *l, const struct wb_key_re
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
int cmp; int cmp;
asm(".intel_syntax noprefix;" asm("mov (%[l]), %%rax;"
"mov rax, [%[l]];" "sub (%[r]), %%rax;"
"sub rax, [%[r]];" "mov 8(%[l]), %%rax;"
"mov rax, [%[l] + 8];" "sbb 8(%[r]), %%rax;"
"sbb rax, [%[r] + 8];" "mov 16(%[l]), %%rax;"
"mov rax, [%[l] + 16];" "sbb 16(%[r]), %%rax;"
"sbb rax, [%[r] + 16];"
".att_syntax prefix;"
: "=@ccae" (cmp) : "=@ccae" (cmp)
: [l] "r" (l), [r] "r" (r) : [l] "r" (l), [r] "r" (r)
: "rax", "cc"); : "rax", "cc");
@ -297,7 +295,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx]; struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
skipped++; skipped++;
n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);; n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
k->journal_seq = 0; k->journal_seq = 0;
continue; continue;
} }


@ -277,12 +277,28 @@ void bch2_dev_usage_init(struct bch_dev *ca)
ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket; ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket;
} }
static inline int bucket_sectors_fragmented(struct bch_dev *ca, void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage)
struct bch_alloc_v4 a)
{ {
return a.dirty_sectors prt_tab(out);
? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors) prt_str(out, "buckets");
: 0; prt_tab_rjust(out);
prt_str(out, "sectors");
prt_tab_rjust(out);
prt_str(out, "fragmented");
prt_tab_rjust(out);
prt_newline(out);
for (unsigned i = 0; i < BCH_DATA_NR; i++) {
prt_str(out, bch2_data_types[i]);
prt_tab(out);
prt_u64(out, usage->d[i].buckets);
prt_tab_rjust(out);
prt_u64(out, usage->d[i].sectors);
prt_tab_rjust(out);
prt_u64(out, usage->d[i].fragmented);
prt_tab_rjust(out);
prt_newline(out);
}
} }
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
@ -306,41 +322,37 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
u->d[old.data_type].buckets--; u->d[old.data_type].buckets--;
u->d[new.data_type].buckets++; u->d[new.data_type].buckets++;
u->buckets_ec -= (int) !!old.stripe; u->d[old.data_type].sectors -= bch2_bucket_sectors_dirty(old);
u->buckets_ec += (int) !!new.stripe; u->d[new.data_type].sectors += bch2_bucket_sectors_dirty(new);
u->d[old.data_type].sectors -= old.dirty_sectors;
u->d[new.data_type].sectors += new.dirty_sectors;
u->d[BCH_DATA_cached].sectors += new.cached_sectors; u->d[BCH_DATA_cached].sectors += new.cached_sectors;
u->d[BCH_DATA_cached].sectors -= old.cached_sectors; u->d[BCH_DATA_cached].sectors -= old.cached_sectors;
u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old); u->d[old.data_type].fragmented -= bch2_bucket_sectors_fragmented(ca, old);
u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); u->d[new.data_type].fragmented += bch2_bucket_sectors_fragmented(ca, new);
preempt_enable(); preempt_enable();
} }
struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b)
{
return (struct bch_alloc_v4) {
.gen = b.gen,
.data_type = b.data_type,
.dirty_sectors = b.dirty_sectors,
.cached_sectors = b.cached_sectors,
.stripe = b.stripe,
};
}
static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
struct bucket old, struct bucket new, struct bucket old, struct bucket new,
u64 journal_seq, bool gc) u64 journal_seq, bool gc)
{ {
struct bch_alloc_v4 old_a = { bch2_dev_usage_update(c, ca,
.gen = old.gen, bucket_m_to_alloc(old),
.data_type = old.data_type, bucket_m_to_alloc(new),
.dirty_sectors = old.dirty_sectors, journal_seq, gc);
.cached_sectors = old.cached_sectors,
.stripe = old.stripe,
};
struct bch_alloc_v4 new_a = {
.gen = new.gen,
.data_type = new.data_type,
.dirty_sectors = new.dirty_sectors,
.cached_sectors = new.cached_sectors,
.stripe = new.stripe,
};
bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc);
} }
static inline int __update_replicas(struct bch_fs *c, static inline int __update_replicas(struct bch_fs *c,
@ -640,7 +652,6 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
goto err; goto err;
} }
g->data_type = data_type; g->data_type = data_type;
g->dirty_sectors += sectors; g->dirty_sectors += sectors;
new = *g; new = *g;
@ -657,14 +668,11 @@ static int check_bucket_ref(struct btree_trans *trans,
const struct bch_extent_ptr *ptr, const struct bch_extent_ptr *ptr,
s64 sectors, enum bch_data_type ptr_data_type, s64 sectors, enum bch_data_type ptr_data_type,
u8 b_gen, u8 bucket_data_type, u8 b_gen, u8 bucket_data_type,
u32 dirty_sectors, u32 cached_sectors) u32 bucket_sectors)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
u32 bucket_sectors = !ptr->cached
? dirty_sectors
: cached_sectors;
struct printbuf buf = PRINTBUF; struct printbuf buf = PRINTBUF;
int ret = 0; int ret = 0;
@ -799,7 +807,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
ret = check_bucket_ref(trans, k, ptr, sectors, data_type, ret = check_bucket_ref(trans, k, ptr, sectors, data_type,
g->gen, g->data_type, g->gen, g->data_type,
g->dirty_sectors, g->cached_sectors); g->dirty_sectors);
if (ret) if (ret)
goto err; goto err;
@ -829,8 +837,7 @@ static int __mark_pointer(struct btree_trans *trans,
? dirty_sectors ? dirty_sectors
: cached_sectors; : cached_sectors;
int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type, int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type,
bucket_gen, *bucket_data_type, bucket_gen, *bucket_data_type, *dst_sectors);
*dirty_sectors, *cached_sectors);
if (ret) if (ret)
return ret; return ret;
@ -1559,7 +1566,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type, ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type,
a->v.gen, a->v.data_type, a->v.gen, a->v.data_type,
a->v.dirty_sectors, a->v.cached_sectors); a->v.dirty_sectors);
if (ret) if (ret)
goto err; goto err;
@ -2073,8 +2080,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bucket_gens->first_bucket = ca->mi.first_bucket; bucket_gens->first_bucket = ca->mi.first_bucket;
bucket_gens->nbuckets = nbuckets; bucket_gens->nbuckets = nbuckets;
bch2_copygc_stop(c);
if (resize) { if (resize) {
down_write(&c->gc_lock); down_write(&c->gc_lock);
down_write(&ca->bucket_lock); down_write(&ca->bucket_lock);


@ -203,6 +203,7 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
} }
void bch2_dev_usage_init(struct bch_dev *); void bch2_dev_usage_init(struct bch_dev *);
void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev_usage *);
static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
{ {


@ -33,8 +33,6 @@ struct bucket_gens {
}; };
struct bch_dev_usage { struct bch_dev_usage {
u64 buckets_ec;
struct { struct {
u64 buckets; u64 buckets;
u64 sectors; /* _compressed_ sectors: */ u64 sectors; /* _compressed_ sectors: */


@ -23,6 +23,12 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/uaccess.h> #include <linux/uaccess.h>
__must_check
static int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
{
return copy_to_user(to, from, n) ? -EFAULT : 0;
}
/* returns with ref on ca->ref */ /* returns with ref on ca->ref */
static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
unsigned flags) unsigned flags)
@ -149,10 +155,8 @@ static long bch2_global_ioctl(unsigned cmd, void __user *arg)
static long bch2_ioctl_query_uuid(struct bch_fs *c, static long bch2_ioctl_query_uuid(struct bch_fs *c,
struct bch_ioctl_query_uuid __user *user_arg) struct bch_ioctl_query_uuid __user *user_arg)
{ {
if (copy_to_user(&user_arg->uuid, &c->sb.user_uuid, return copy_to_user_errcode(&user_arg->uuid, &c->sb.user_uuid,
sizeof(c->sb.user_uuid))) sizeof(c->sb.user_uuid));
return -EFAULT;
return 0;
} }
#if 0 #if 0
@ -341,10 +345,7 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
if (len < sizeof(e)) if (len < sizeof(e))
return -EINVAL; return -EINVAL;
if (copy_to_user(buf, &e, sizeof(e))) return copy_to_user_errcode(buf, &e, sizeof(e)) ?: sizeof(e);
return -EFAULT;
return sizeof(e);
} }
static const struct file_operations bcachefs_data_ops = { static const struct file_operations bcachefs_data_ops = {
@ -474,14 +475,15 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
if (ret) if (ret)
goto err; goto err;
if (copy_to_user(user_arg, arg,
sizeof(*arg) + arg->replica_entries_bytes)) ret = copy_to_user_errcode(user_arg, arg,
ret = -EFAULT; sizeof(*arg) + arg->replica_entries_bytes);
err: err:
kfree(arg); kfree(arg);
return ret; return ret;
} }
/* obsolete, didn't allow for new data types: */
static long bch2_ioctl_dev_usage(struct bch_fs *c, static long bch2_ioctl_dev_usage(struct bch_fs *c,
struct bch_ioctl_dev_usage __user *user_arg) struct bch_ioctl_dev_usage __user *user_arg)
{ {
@ -511,7 +513,6 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
arg.state = ca->mi.state; arg.state = ca->mi.state;
arg.bucket_size = ca->mi.bucket_size; arg.bucket_size = ca->mi.bucket_size;
arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
arg.buckets_ec = src.buckets_ec;
for (i = 0; i < BCH_DATA_NR; i++) { for (i = 0; i < BCH_DATA_NR; i++) {
arg.d[i].buckets = src.d[i].buckets; arg.d[i].buckets = src.d[i].buckets;
@ -521,10 +522,58 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
percpu_ref_put(&ca->ref); percpu_ref_put(&ca->ref);
if (copy_to_user(user_arg, &arg, sizeof(arg))) return copy_to_user_errcode(user_arg, &arg, sizeof(arg));
}
static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
struct bch_ioctl_dev_usage_v2 __user *user_arg)
{
struct bch_ioctl_dev_usage_v2 arg;
struct bch_dev_usage src;
struct bch_dev *ca;
int ret = 0;
if (!test_bit(BCH_FS_STARTED, &c->flags))
return -EINVAL;
if (copy_from_user(&arg, user_arg, sizeof(arg)))
return -EFAULT; return -EFAULT;
return 0; if ((arg.flags & ~BCH_BY_INDEX) ||
arg.pad[0] ||
arg.pad[1] ||
arg.pad[2])
return -EINVAL;
ca = bch2_device_lookup(c, arg.dev, arg.flags);
if (IS_ERR(ca))
return PTR_ERR(ca);
src = bch2_dev_usage_read(ca);
arg.state = ca->mi.state;
arg.bucket_size = ca->mi.bucket_size;
arg.nr_data_types = min(arg.nr_data_types, BCH_DATA_NR);
arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
if (ret)
goto err;
for (unsigned i = 0; i < arg.nr_data_types; i++) {
struct bch_ioctl_dev_usage_type t = {
.buckets = src.d[i].buckets,
.sectors = src.d[i].sectors,
.fragmented = src.d[i].fragmented,
};
ret = copy_to_user_errcode(&user_arg->d[i], &t, sizeof(t));
if (ret)
goto err;
}
err:
percpu_ref_put(&ca->ref);
return ret;
} }
static long bch2_ioctl_read_super(struct bch_fs *c, static long bch2_ioctl_read_super(struct bch_fs *c,
@ -561,9 +610,8 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
goto err; goto err;
} }
if (copy_to_user((void __user *)(unsigned long)arg.sb, sb, ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb,
vstruct_bytes(sb))) vstruct_bytes(sb));
ret = -EFAULT;
err: err:
if (!IS_ERR_OR_NULL(ca)) if (!IS_ERR_OR_NULL(ca))
percpu_ref_put(&ca->ref); percpu_ref_put(&ca->ref);
@ -663,6 +711,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
return bch2_ioctl_fs_usage(c, arg); return bch2_ioctl_fs_usage(c, arg);
case BCH_IOCTL_DEV_USAGE: case BCH_IOCTL_DEV_USAGE:
return bch2_ioctl_dev_usage(c, arg); return bch2_ioctl_dev_usage(c, arg);
case BCH_IOCTL_DEV_USAGE_V2:
return bch2_ioctl_dev_usage_v2(c, arg);
#if 0 #if 0
case BCH_IOCTL_START: case BCH_IOCTL_START:
BCH_IOCTL(start, struct bch_ioctl_start); BCH_IOCTL(start, struct bch_ioctl_start);


@ -354,8 +354,7 @@ static int attempt_compress(struct bch_fs *c,
*/ */
unsigned level = min((compression.level * 3) / 2, zstd_max_clevel()); unsigned level = min((compression.level * 3) / 2, zstd_max_clevel());
ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max); ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max);
ZSTD_CCtx *ctx = zstd_init_cctx(workspace, ZSTD_CCtx *ctx = zstd_init_cctx(workspace, c->zstd_workspace_size);
zstd_cctx_workspace_bound(&params.cParams));
/* /*
* ZSTD requires that when we decompress we pass in the exact * ZSTD requires that when we decompress we pass in the exact
@ -371,7 +370,7 @@ static int attempt_compress(struct bch_fs *c,
size_t len = zstd_compress_cctx(ctx, size_t len = zstd_compress_cctx(ctx,
dst + 4, dst_len - 4 - 7, dst + 4, dst_len - 4 - 7,
src, src_len, src, src_len,
&c->zstd_params); &params);
if (zstd_is_error(len)) if (zstd_is_error(len))
return 0; return 0;
@ -572,6 +571,13 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
size_t decompress_workspace_size = 0; size_t decompress_workspace_size = 0;
ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
c->opts.encoded_extent_max); c->opts.encoded_extent_max);
/*
* ZSTD is lying: if we allocate the size of the workspace it says it
* requires, it returns memory allocation errors
*/
c->zstd_workspace_size = zstd_cctx_workspace_bound(&params.cParams) * 2;
struct { struct {
unsigned feature; unsigned feature;
enum bch_compression_type type; enum bch_compression_type type;
@ -585,13 +591,11 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
zlib_inflate_workspacesize(), }, zlib_inflate_workspacesize(), },
{ BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd,
zstd_cctx_workspace_bound(&params.cParams), c->zstd_workspace_size,
zstd_dctx_workspace_bound() }, zstd_dctx_workspace_bound() },
}, *i; }, *i;
bool have_compressed = false; bool have_compressed = false;
c->zstd_params = params;
for (i = compression_types; for (i = compression_types;
i < compression_types + ARRAY_SIZE(compression_types); i < compression_types + ARRAY_SIZE(compression_types);
i++) i++)


@ -267,6 +267,20 @@ restart_drop_extra_replicas:
goto out; goto out;
} }
if (trace_data_update_enabled()) {
struct printbuf buf = PRINTBUF;
prt_str(&buf, "\nold: ");
bch2_bkey_val_to_text(&buf, c, old);
prt_str(&buf, "\nk: ");
bch2_bkey_val_to_text(&buf, c, k);
prt_str(&buf, "\nnew: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
trace_data_update(c, buf.buf);
printbuf_exit(&buf);
}
ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?: k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id, bch2_insert_snapshot_whiteouts(trans, m->btree_id,
@ -356,7 +370,7 @@ void bch2_data_update_exit(struct data_update *update)
bch2_bio_free_pages_pool(c, &update->op.wbio.bio); bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
} }
void bch2_update_unwritten_extent(struct btree_trans *trans, static void bch2_update_unwritten_extent(struct btree_trans *trans,
struct data_update *update) struct data_update *update)
{ {
struct bch_fs *c = update->op.c; struct bch_fs *c = update->op.c;
@ -436,7 +450,51 @@ void bch2_update_unwritten_extent(struct btree_trans *trans,
} }
} }
int bch2_extent_drop_ptrs(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
struct data_update_opts data_opts)
{
struct bch_fs *c = trans->c;
struct bkey_i *n;
int ret;
n = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(n);
if (ret)
return ret;
while (data_opts.kill_ptrs) {
unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
struct bch_extent_ptr *ptr;
bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
data_opts.kill_ptrs ^= 1U << drop;
}
/*
* If the new extent no longer has any pointers, bch2_extent_normalize()
* will do the appropriate thing with it (turning it into a
* KEY_TYPE_error key, or just a discard if it was a cached extent)
*/
bch2_extent_normalize(c, bkey_i_to_s(n));
/*
* Since we're not inserting through an extent iterator
* (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
* we aren't using the extent overwrite path to delete, we're
* just using the normal key deletion path:
*/
if (bkey_deleted(&n->k))
n->k.size = 0;
return bch2_trans_relock(trans) ?:
bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
int bch2_data_update_init(struct btree_trans *trans, int bch2_data_update_init(struct btree_trans *trans,
struct btree_iter *iter,
struct moving_context *ctxt, struct moving_context *ctxt,
struct data_update *m, struct data_update *m,
struct write_point_specifier wp, struct write_point_specifier wp,
@ -452,7 +510,7 @@ int bch2_data_update_init(struct btree_trans *trans,
const struct bch_extent_ptr *ptr; const struct bch_extent_ptr *ptr;
unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
unsigned ptrs_locked = 0; unsigned ptrs_locked = 0;
int ret; int ret = 0;
bch2_bkey_buf_init(&m->k); bch2_bkey_buf_init(&m->k);
bch2_bkey_buf_reassemble(&m->k, c, k); bch2_bkey_buf_reassemble(&m->k, c, k);
@ -478,6 +536,8 @@ int bch2_data_update_init(struct btree_trans *trans,
bkey_for_each_ptr(ptrs, ptr) bkey_for_each_ptr(ptrs, ptr)
percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref); percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref);
unsigned durability_have = 0, durability_removing = 0;
i = 0; i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
bool locked; bool locked;
@ -489,8 +549,11 @@ int bch2_data_update_init(struct btree_trans *trans,
reserve_sectors += k.k->size; reserve_sectors += k.k->size;
m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
} else if (!p.ptr.cached) { durability_removing += bch2_extent_ptr_desired_durability(c, &p);
} else if (!p.ptr.cached &&
!((1U << i) & m->data_opts.kill_ptrs)) {
bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
durability_have += bch2_extent_ptr_durability(c, &p);
} }
/* /*
@ -529,6 +592,29 @@ int bch2_data_update_init(struct btree_trans *trans,
i++; i++;
} }
/*
* If current extent durability is less than io_opts.data_replicas,
* we're not trying to rereplicate the extent up to data_replicas here -
* unless extra_replicas was specified
*
* Increasing replication is an explicit operation triggered by
* rereplicate, currently, so that users don't get an unexpected -ENOSPC
*/
if (durability_have >= io_opts.data_replicas) {
m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
m->data_opts.rewrite_ptrs = 0;
/* if iter == NULL, it's just a promote */
if (iter)
ret = bch2_extent_drop_ptrs(trans, iter, k, data_opts);
goto done;
}
m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) +
m->data_opts.extra_replicas;
m->op.nr_replicas_required = m->op.nr_replicas;
BUG_ON(!m->op.nr_replicas);
if (reserve_sectors) { if (reserve_sectors) {
ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
m->data_opts.extra_replicas m->data_opts.extra_replicas
@ -538,14 +624,11 @@ int bch2_data_update_init(struct btree_trans *trans,
goto err; goto err;
} }
m->op.nr_replicas += m->data_opts.extra_replicas; if (bkey_extent_is_unwritten(k)) {
m->op.nr_replicas_required = m->op.nr_replicas; bch2_update_unwritten_extent(trans, m);
goto done;
}
BUG_ON(!m->op.nr_replicas);
/* Special handling required: */
if (bkey_extent_is_unwritten(k))
return -BCH_ERR_unwritten_extent_update;
return 0; return 0;
err: err:
i = 0; i = 0;
@ -560,6 +643,9 @@ err:
bch2_bkey_buf_exit(&m->k, c); bch2_bkey_buf_exit(&m->k, c);
bch2_bio_free_pages_pool(c, &m->op.wbio.bio); bch2_bio_free_pages_pool(c, &m->op.wbio.bio);
return ret; return ret;
done:
bch2_data_update_exit(m);
return ret ?: -BCH_ERR_data_update_done;
} }
void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)


@ -32,9 +32,14 @@ int bch2_data_update_index_update(struct bch_write_op *);
void bch2_data_update_read_done(struct data_update *, void bch2_data_update_read_done(struct data_update *,
struct bch_extent_crc_unpacked); struct bch_extent_crc_unpacked);
int bch2_extent_drop_ptrs(struct btree_trans *,
struct btree_iter *,
struct bkey_s_c,
struct data_update_opts);
void bch2_data_update_exit(struct data_update *); void bch2_data_update_exit(struct data_update *);
void bch2_update_unwritten_extent(struct btree_trans *, struct data_update *); int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
int bch2_data_update_init(struct btree_trans *, struct moving_context *, struct moving_context *,
struct data_update *, struct data_update *,
struct write_point_specifier, struct write_point_specifier,
struct bch_io_opts, struct data_update_opts, struct bch_io_opts, struct data_update_opts,


@ -160,7 +160,7 @@
x(BCH_ERR_fsck, fsck_repair_unimplemented) \ x(BCH_ERR_fsck, fsck_repair_unimplemented) \
x(BCH_ERR_fsck, fsck_repair_impossible) \ x(BCH_ERR_fsck, fsck_repair_impossible) \
x(0, restart_recovery) \ x(0, restart_recovery) \
x(0, unwritten_extent_update) \ x(0, data_update_done) \
x(EINVAL, device_state_not_allowed) \ x(EINVAL, device_state_not_allowed) \
x(EINVAL, member_info_missing) \ x(EINVAL, member_info_missing) \
x(EINVAL, mismatched_block_size) \ x(EINVAL, mismatched_block_size) \
@ -208,6 +208,7 @@
x(BCH_ERR_invalid_sb, invalid_sb_members) \ x(BCH_ERR_invalid_sb, invalid_sb_members) \
x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \ x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \
x(BCH_ERR_invalid_sb, invalid_sb_replicas) \ x(BCH_ERR_invalid_sb, invalid_sb_replicas) \
x(BCH_ERR_invalid_sb, invalid_replicas_entry) \
x(BCH_ERR_invalid_sb, invalid_sb_journal) \ x(BCH_ERR_invalid_sb, invalid_sb_journal) \
x(BCH_ERR_invalid_sb, invalid_sb_journal_seq_blacklist) \ x(BCH_ERR_invalid_sb, invalid_sb_journal_seq_blacklist) \
x(BCH_ERR_invalid_sb, invalid_sb_crypt) \ x(BCH_ERR_invalid_sb, invalid_sb_crypt) \


@ -649,37 +649,31 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
return replicas; return replicas;
} }
unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p) static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p)
{ {
struct bch_dev *ca;
if (p->ptr.cached) if (p->ptr.cached)
return 0; return 0;
ca = bch_dev_bkey_exists(c, p->ptr.dev); return p->has_ec
? p->ec.redundancy + 1
: ca->mi.durability;
}
return ca->mi.durability + unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
(p->has_ec {
? p->ec.redundancy struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev);
: 0);
return __extent_ptr_durability(ca, p);
} }
unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
{ {
struct bch_dev *ca; struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev);
if (p->ptr.cached)
return 0;
ca = bch_dev_bkey_exists(c, p->ptr.dev);
if (ca->mi.state == BCH_MEMBER_STATE_failed) if (ca->mi.state == BCH_MEMBER_STATE_failed)
return 0; return 0;
return ca->mi.durability + return __extent_ptr_durability(ca, p);
(p->has_ec
? p->ec.redundancy
: 0);
} }
unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)


@ -209,7 +209,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
bio = &op->write.op.wbio.bio; bio = &op->write.op.wbio.bio;
bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
ret = bch2_data_update_init(trans, NULL, &op->write, ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
writepoint_hashed((unsigned long) current), writepoint_hashed((unsigned long) current),
opts, opts,
(struct data_update_opts) { (struct data_update_opts) {


@ -548,6 +548,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c,
struct jset_entry_data_usage *u = struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry); container_of(entry, struct jset_entry_data_usage, entry);
unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
struct printbuf err = PRINTBUF;
int ret = 0; int ret = 0;
if (journal_entry_err_on(bytes < sizeof(*u) || if (journal_entry_err_on(bytes < sizeof(*u) ||
@ -556,10 +557,19 @@ static int journal_entry_data_usage_validate(struct bch_fs *c,
journal_entry_data_usage_bad_size, journal_entry_data_usage_bad_size,
"invalid journal entry usage: bad size")) { "invalid journal entry usage: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry)); journal_entry_null_range(entry, vstruct_next(entry));
return ret; goto out;
} }
if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err),
c, version, jset, entry,
journal_entry_data_usage_bad_size,
"invalid journal entry usage: %s", err.buf)) {
journal_entry_null_range(entry, vstruct_next(entry));
goto out;
}
out:
fsck_err: fsck_err:
printbuf_exit(&err);
return ret; return ret;
} }
@ -676,8 +686,6 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs
le64_to_cpu(u->d[i].sectors), le64_to_cpu(u->d[i].sectors),
le64_to_cpu(u->d[i].fragmented)); le64_to_cpu(u->d[i].fragmented));
} }
prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec));
} }
static int journal_entry_log_validate(struct bch_fs *c, static int journal_entry_log_validate(struct bch_fs *c,


@ -40,8 +40,8 @@ void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru)
u64_to_bucket(lru.offset).offset); u64_to_bucket(lru.offset).offset);
} }
static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, static inline int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
u64 dev_bucket, u64 time, bool set) u64 dev_bucket, u64 time, bool set)
{ {
return time return time
? bch2_btree_bit_mod(trans, BTREE_ID_lru, ? bch2_btree_bit_mod(trans, BTREE_ID_lru,
@ -51,12 +51,12 @@ static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
{ {
return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted); return __bch2_lru_set(trans, lru_id, dev_bucket, time, false);
} }
int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
{ {
return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set); return __bch2_lru_set(trans, lru_id, dev_bucket, time, true);
} }
int bch2_lru_change(struct btree_trans *trans, int bch2_lru_change(struct btree_trans *trans,
@ -66,8 +66,8 @@ int bch2_lru_change(struct btree_trans *trans,
if (old_time == new_time) if (old_time == new_time)
return 0; return 0;
return bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?: return __bch2_lru_set(trans, lru_id, dev_bucket, old_time, false) ?:
bch2_lru_set(trans, lru_id, dev_bucket, new_time); __bch2_lru_set(trans, lru_id, dev_bucket, new_time, true);
} }
static const char * const bch2_lru_types[] = { static const char * const bch2_lru_types[] = {
@ -77,10 +77,11 @@ static const char * const bch2_lru_types[] = {
NULL NULL
}; };
static int bch2_check_lru_key(struct btree_trans *trans, /* Returns 1 if key has been deleted */
struct btree_iter *lru_iter, int bch2_check_lru_key(struct btree_trans *trans,
struct bkey_s_c lru_k, struct btree_iter *lru_iter,
struct bpos *last_flushed_pos) struct bkey_s_c lru_k,
struct bpos *last_flushed_pos)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct btree_iter iter; struct btree_iter iter;
@ -89,7 +90,6 @@ static int bch2_check_lru_key(struct btree_trans *trans,
const struct bch_alloc_v4 *a; const struct bch_alloc_v4 *a;
struct printbuf buf1 = PRINTBUF; struct printbuf buf1 = PRINTBUF;
struct printbuf buf2 = PRINTBUF; struct printbuf buf2 = PRINTBUF;
enum bch_lru_type type = lru_type(lru_k);
struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset); struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset);
u64 idx; u64 idx;
int ret; int ret;
@ -98,7 +98,7 @@ static int bch2_check_lru_key(struct btree_trans *trans,
lru_entry_to_invalid_bucket, lru_entry_to_invalid_bucket,
"lru key points to nonexistent device:bucket %llu:%llu", "lru key points to nonexistent device:bucket %llu:%llu",
alloc_pos.inode, alloc_pos.offset)) alloc_pos.inode, alloc_pos.offset))
return bch2_btree_delete_at(trans, lru_iter, 0); goto delete;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0); k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0);
ret = bkey_err(k); ret = bkey_err(k);
@ -107,6 +107,7 @@ static int bch2_check_lru_key(struct btree_trans *trans,
a = bch2_alloc_to_v4(k, &a_convert); a = bch2_alloc_to_v4(k, &a_convert);
enum bch_lru_type type = lru_type(lru_k);
switch (type) { switch (type) {
case BCH_LRU_read: case BCH_LRU_read:
idx = alloc_lru_idx_read(*a); idx = alloc_lru_idx_read(*a);
@ -114,6 +115,9 @@ static int bch2_check_lru_key(struct btree_trans *trans,
case BCH_LRU_fragmentation: case BCH_LRU_fragmentation:
idx = a->fragmentation_lru; idx = a->fragmentation_lru;
break; break;
default:
/* unknown LRU type, don't check: */
goto out;
} }
if (lru_k.k->type != KEY_TYPE_set || if (lru_k.k->type != KEY_TYPE_set ||
@ -125,16 +129,18 @@ static int bch2_check_lru_key(struct btree_trans *trans,
goto out; goto out;
} }
if (c->opts.reconstruct_alloc || if ((c->opts.reconstruct_alloc &&
c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_lrus) ||
fsck_err(c, lru_entry_bad, fsck_err(c, lru_entry_bad,
"incorrect lru entry: lru %s time %llu\n" "incorrect lru entry: lru %s time %llu\n"
" %s\n" " %s\n"
" for %s", "for\n"
" %s",
bch2_lru_types[type], bch2_lru_types[type],
lru_pos_time(lru_k.k->p), lru_pos_time(lru_k.k->p),
(bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
(bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf)))
ret = bch2_btree_delete_at(trans, lru_iter, 0); goto delete;
} }
out: out:
err: err:
@ -143,6 +149,14 @@ fsck_err:
printbuf_exit(&buf2); printbuf_exit(&buf2);
printbuf_exit(&buf1); printbuf_exit(&buf1);
return ret; return ret;
delete:
ret = bch2_btree_delete_at(trans, lru_iter, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree|
BCH_TRANS_COMMIT_lazy_rw|
BCH_TRANS_COMMIT_no_enospc) ?:
1;
goto out;
} }
int bch2_check_lrus(struct bch_fs *c) int bch2_check_lrus(struct bch_fs *c)
@ -150,15 +164,14 @@ int bch2_check_lrus(struct bch_fs *c)
struct btree_iter iter; struct btree_iter iter;
struct bkey_s_c k; struct bkey_s_c k;
struct bpos last_flushed_pos = POS_MIN; struct bpos last_flushed_pos = POS_MIN;
int ret = 0;
ret = bch2_trans_run(c, int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, for_each_btree_key2(trans, iter,
BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, ({
NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, int ret2 = bch2_check_lru_key(trans, &iter, k, &last_flushed_pos);
bch2_check_lru_key(trans, &iter, k, &last_flushed_pos)));
if (ret) ret2 < 0 ? ret2 : 0;
bch_err_fn(c, ret); })));
bch_err_fn(c, ret);
return ret; return ret;
} }


@ -64,6 +64,8 @@ int bch2_lru_del(struct btree_trans *, u16, u64, u64);
int bch2_lru_set(struct btree_trans *, u16, u64, u64); int bch2_lru_set(struct btree_trans *, u16, u64, u64);
int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
int bch2_check_lru_key(struct btree_trans *, struct btree_iter *,
struct bkey_s_c, struct bpos *);
int bch2_check_lrus(struct bch_fs *); int bch2_check_lrus(struct bch_fs *);
#endif /* _BCACHEFS_LRU_H */ #endif /* _BCACHEFS_LRU_H */


@ -173,6 +173,7 @@ void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
static void bch2_moving_ctxt_flush_all(struct moving_context *ctxt) static void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
{ {
move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
bch2_trans_unlock_long(ctxt->trans);
closure_sync(&ctxt->cl); closure_sync(&ctxt->cl);
} }
@ -235,49 +236,6 @@ void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
scnprintf(stats->name, sizeof(stats->name), "%s", name); scnprintf(stats->name, sizeof(stats->name), "%s", name);
} }
static int bch2_extent_drop_ptrs(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
struct data_update_opts data_opts)
{
struct bch_fs *c = trans->c;
struct bkey_i *n;
int ret;
n = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(n);
if (ret)
return ret;
while (data_opts.kill_ptrs) {
unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
struct bch_extent_ptr *ptr;
bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
data_opts.kill_ptrs ^= 1U << drop;
}
/*
* If the new extent no longer has any pointers, bch2_extent_normalize()
* will do the appropriate thing with it (turning it into a
* KEY_TYPE_error key, or just a discard if it was a cached extent)
*/
bch2_extent_normalize(c, bkey_i_to_s(n));
/*
* Since we're not inserting through an extent iterator
* (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
* we aren't using the extent overwrite path to delete, we're
* just using the normal key deletion path:
*/
if (bkey_deleted(&n->k))
n->k.size = 0;
return bch2_trans_relock(trans) ?:
bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
int bch2_move_extent(struct moving_context *ctxt, int bch2_move_extent(struct moving_context *ctxt,
struct move_bucket_in_flight *bucket_in_flight, struct move_bucket_in_flight *bucket_in_flight,
struct btree_iter *iter, struct btree_iter *iter,
@ -347,19 +305,11 @@ int bch2_move_extent(struct moving_context *ctxt,
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
io->rbio.bio.bi_end_io = move_read_endio; io->rbio.bio.bi_end_io = move_read_endio;
ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp, ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
io_opts, data_opts, iter->btree_id, k); io_opts, data_opts, iter->btree_id, k);
if (ret && ret != -BCH_ERR_unwritten_extent_update) if (ret)
goto err_free_pages; goto err_free_pages;
if (ret == -BCH_ERR_unwritten_extent_update) {
bch2_update_unwritten_extent(trans, &io->write);
move_free(io);
return 0;
}
BUG_ON(ret);
io->write.op.end_io = move_write_done; io->write.op.end_io = move_write_done;
if (ctxt->rate) if (ctxt->rate)
@ -403,6 +353,9 @@ err_free_pages:
err_free: err_free:
kfree(io); kfree(io);
err: err:
if (ret == -BCH_ERR_data_update_done)
return 0;
this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]); this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]);
trace_move_extent_alloc_mem_fail2(c, k); trace_move_extent_alloc_mem_fail2(c, k);
return ret; return ret;
@ -506,22 +459,13 @@ int bch2_move_ratelimit(struct moving_context *ctxt)
do { do {
delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0; delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
if (kthread_should_stop())
if (delay) {
if (delay > HZ / 10)
bch2_trans_unlock_long(ctxt->trans);
else
bch2_trans_unlock(ctxt->trans);
set_current_state(TASK_INTERRUPTIBLE);
}
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
return 1; return 1;
}
if (delay) if (delay)
schedule_timeout(delay); move_ctxt_wait_event_timeout(ctxt,
freezing(current) || kthread_should_stop(),
delay);
if (unlikely(freezing(current))) { if (unlikely(freezing(current))) {
bch2_moving_ctxt_flush_all(ctxt); bch2_moving_ctxt_flush_all(ctxt);
@ -729,7 +673,7 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
} }
a = bch2_alloc_to_v4(k, &a_convert); a = bch2_alloc_to_v4(k, &a_convert);
dirty_sectors = a->dirty_sectors; dirty_sectors = bch2_bucket_sectors_dirty(*a);
bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size; bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
fragmentation = a->fragmentation_lru; fragmentation = a->fragmentation_lru;


@ -38,6 +38,25 @@ struct moving_context {
wait_queue_head_t wait; wait_queue_head_t wait;
}; };
#define move_ctxt_wait_event_timeout(_ctxt, _cond, _timeout) \
({ \
int _ret = 0; \
while (true) { \
bool cond_finished = false; \
bch2_moving_ctxt_do_pending_writes(_ctxt); \
\
if (_cond) \
break; \
bch2_trans_unlock_long((_ctxt)->trans); \
_ret = __wait_event_timeout((_ctxt)->wait, \
bch2_moving_ctxt_next_pending_write(_ctxt) || \
(cond_finished = (_cond)), _timeout); \
if (_ret || ( cond_finished)) \
break; \
} \
_ret; \
})
#define move_ctxt_wait_event(_ctxt, _cond) \ #define move_ctxt_wait_event(_ctxt, _cond) \
do { \ do { \
bool cond_finished = false; \ bool cond_finished = false; \


@ -91,7 +91,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
a = bch2_alloc_to_v4(k, &_a); a = bch2_alloc_to_v4(k, &_a);
b->k.gen = a->gen; b->k.gen = a->gen;
b->sectors = a->dirty_sectors; b->sectors = bch2_bucket_sectors_dirty(*a);
ret = data_type_movable(a->data_type) && ret = data_type_movable(a->data_type) &&
a->fragmentation_lru && a->fragmentation_lru &&
@ -149,6 +149,7 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
struct bkey_s_c k; struct bkey_s_c k;
size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4); size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0; size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
struct bpos last_flushed_pos = POS_MIN;
int ret; int ret;
move_buckets_wait(ctxt, buckets_in_flight, false); move_buckets_wait(ctxt, buckets_in_flight, false);
@ -165,11 +166,16 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0),
lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
0, k, ({ 0, k, ({
struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; int ret2 = bch2_check_lru_key(trans, &iter, k, &last_flushed_pos);
int ret2 = 0; if (ret2) {
ret2 = ret2 < 0 ? ret2 : 0;
goto next;
}
saw++; saw++;
struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
if (!bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p))) if (!bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p)))
not_movable++; not_movable++;
else if (bucket_in_flight(buckets_in_flight, b.k)) else if (bucket_in_flight(buckets_in_flight, b.k))
@ -179,6 +185,7 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
if (ret2 >= 0) if (ret2 >= 0)
sectors += b.sectors; sectors += b.sectors;
} }
next:
ret2; ret2;
})); }));


@ -171,6 +171,20 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
return bkey_s_c_null; return bkey_s_c_null;
} }
if (trace_rebalance_extent_enabled()) {
struct printbuf buf = PRINTBUF;
prt_str(&buf, "target=");
bch2_target_to_text(&buf, c, r->target);
prt_str(&buf, " compression=");
prt_str(&buf, bch2_compression_opts[r->compression]);
prt_str(&buf, " ");
bch2_bkey_val_to_text(&buf, c, k);
trace_rebalance_extent(c, buf.buf);
printbuf_exit(&buf);
}
return k; return k;
} }


@ -302,8 +302,6 @@ static int journal_replay_entry_early(struct bch_fs *c,
struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);


@ -68,6 +68,33 @@ void bch2_replicas_entry_to_text(struct printbuf *out,
prt_printf(out, "]"); prt_printf(out, "]");
} }
int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
struct bch_sb *sb,
struct printbuf *err)
{
if (!r->nr_devs) {
prt_printf(err, "no devices in entry ");
goto bad;
}
if (r->nr_required > 1 &&
r->nr_required >= r->nr_devs) {
prt_printf(err, "bad nr_required in entry ");
goto bad;
}
for (unsigned i = 0; i < r->nr_devs; i++)
if (!bch2_dev_exists(sb, r->devs[i])) {
prt_printf(err, "invalid device %u in entry ", r->devs[i]);
goto bad;
}
return 0;
bad:
bch2_replicas_entry_to_text(err, r);
return -BCH_ERR_invalid_replicas_entry;
}
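The checks factored out into bch2_replicas_entry_validate() are small enough to model in isolation. A hedged, userspace-only sketch of the same three rules follows; the entry type is simplified and dev_exists() is a made-up stand-in for the real superblock member lookup:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified model of a replicas entry -- not the on-disk layout. */
struct replicas_entry_model {
	uint8_t		nr_devs;
	uint8_t		nr_required;
	uint8_t		devs[8];
};

/* Stand-in for bch2_dev_exists(): pretend devices 0..3 are present. */
static bool dev_exists(unsigned dev)
{
	return dev < 4;
}

/* Mirrors the hunk above: entry must not be empty, nr_required must be
 * sane, and every listed device must exist. */
static int validate_entry(const struct replicas_entry_model *r)
{
	if (!r->nr_devs)
		return -1;
	if (r->nr_required > 1 && r->nr_required >= r->nr_devs)
		return -1;
	for (unsigned i = 0; i < r->nr_devs; i++)
		if (!dev_exists(r->devs[i]))
			return -1;
	return 0;
}

int main(void)
{
	struct replicas_entry_model ok  = { .nr_devs = 2, .nr_required = 1, .devs = { 0, 1 } };
	struct replicas_entry_model bad = { .nr_devs = 2, .nr_required = 2, .devs = { 0, 1 } };

	printf("ok: %d, bad: %d\n", validate_entry(&ok), validate_entry(&bad));
	return 0;
}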
void bch2_cpu_replicas_to_text(struct printbuf *out, void bch2_cpu_replicas_to_text(struct printbuf *out,
struct bch_replicas_cpu *r) struct bch_replicas_cpu *r)
{ {
@ -163,7 +190,8 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
} }
static struct bch_replicas_cpu static struct bch_replicas_cpu
cpu_replicas_add_entry(struct bch_replicas_cpu *old, cpu_replicas_add_entry(struct bch_fs *c,
struct bch_replicas_cpu *old,
struct bch_replicas_entry_v1 *new_entry) struct bch_replicas_entry_v1 *new_entry)
{ {
unsigned i; unsigned i;
@ -173,6 +201,9 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old,
replicas_entry_bytes(new_entry)), replicas_entry_bytes(new_entry)),
}; };
for (i = 0; i < new_entry->nr_devs; i++)
BUG_ON(!bch2_dev_exists2(c, new_entry->devs[i]));
BUG_ON(!new_entry->data_type); BUG_ON(!new_entry->data_type);
verify_replicas_entry(new_entry); verify_replicas_entry(new_entry);
@ -382,7 +413,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
if (c->replicas_gc.entries && if (c->replicas_gc.entries &&
!__replicas_has_entry(&c->replicas_gc, new_entry)) { !__replicas_has_entry(&c->replicas_gc, new_entry)) {
new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry); new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
if (!new_gc.entries) { if (!new_gc.entries) {
ret = -BCH_ERR_ENOMEM_cpu_replicas; ret = -BCH_ERR_ENOMEM_cpu_replicas;
goto err; goto err;
@ -390,7 +421,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
} }
if (!__replicas_has_entry(&c->replicas, new_entry)) { if (!__replicas_has_entry(&c->replicas, new_entry)) {
new_r = cpu_replicas_add_entry(&c->replicas, new_entry); new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
if (!new_r.entries) { if (!new_r.entries) {
ret = -BCH_ERR_ENOMEM_cpu_replicas; ret = -BCH_ERR_ENOMEM_cpu_replicas;
goto err; goto err;
@ -598,7 +629,7 @@ int bch2_replicas_set_usage(struct bch_fs *c,
if (idx < 0) { if (idx < 0) {
struct bch_replicas_cpu n; struct bch_replicas_cpu n;
n = cpu_replicas_add_entry(&c->replicas, r); n = cpu_replicas_add_entry(c, &c->replicas, r);
if (!n.entries) if (!n.entries)
return -BCH_ERR_ENOMEM_cpu_replicas; return -BCH_ERR_ENOMEM_cpu_replicas;
@ -797,7 +828,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
struct bch_sb *sb, struct bch_sb *sb,
struct printbuf *err) struct printbuf *err)
{ {
unsigned i, j; unsigned i;
sort_cmp_size(cpu_r->entries, sort_cmp_size(cpu_r->entries,
cpu_r->nr, cpu_r->nr,
@ -808,31 +839,9 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
struct bch_replicas_entry_v1 *e = struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(cpu_r, i); cpu_replicas_entry(cpu_r, i);
if (e->data_type >= BCH_DATA_NR) { int ret = bch2_replicas_entry_validate(e, sb, err);
prt_printf(err, "invalid data type in entry "); if (ret)
bch2_replicas_entry_to_text(err, e); return ret;
return -BCH_ERR_invalid_sb_replicas;
}
if (!e->nr_devs) {
prt_printf(err, "no devices in entry ");
bch2_replicas_entry_to_text(err, e);
return -BCH_ERR_invalid_sb_replicas;
}
if (e->nr_required > 1 &&
e->nr_required >= e->nr_devs) {
prt_printf(err, "bad nr_required in entry ");
bch2_replicas_entry_to_text(err, e);
return -BCH_ERR_invalid_sb_replicas;
}
for (j = 0; j < e->nr_devs; j++)
if (!bch2_dev_exists(sb, e->devs[j])) {
prt_printf(err, "invalid device %u in entry ", e->devs[j]);
bch2_replicas_entry_to_text(err, e);
return -BCH_ERR_invalid_sb_replicas;
}
if (i + 1 < cpu_r->nr) { if (i + 1 < cpu_r->nr) {
struct bch_replicas_entry_v1 *n = struct bch_replicas_entry_v1 *n =


@ -9,6 +9,8 @@
void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *); void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
void bch2_replicas_entry_to_text(struct printbuf *, void bch2_replicas_entry_to_text(struct printbuf *,
struct bch_replicas_entry_v1 *); struct bch_replicas_entry_v1 *);
int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *,
struct bch_sb *, struct printbuf *);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
static inline struct bch_replicas_entry_v1 * static inline struct bch_replicas_entry_v1 *


@ -256,7 +256,6 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
u->entry.type = BCH_JSET_ENTRY_dev_usage; u->entry.type = BCH_JSET_ENTRY_dev_usage;
u->dev = cpu_to_le32(dev); u->dev = cpu_to_le32(dev);
u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec);
for (i = 0; i < BCH_DATA_NR; i++) { for (i = 0; i < BCH_DATA_NR; i++) {
u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);


@ -259,6 +259,11 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "(none)"); prt_printf(out, "(none)");
prt_newline(out); prt_newline(out);
prt_str(out, "Durability:");
prt_tab(out);
prt_printf(out, "%llu", BCH_MEMBER_DURABILITY(&m));
prt_newline(out);
prt_printf(out, "Discard:"); prt_printf(out, "Discard:");
prt_tab(out); prt_tab(out);
prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m)); prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m));


@ -658,7 +658,7 @@ reread:
return 0; return 0;
} }
int __bch2_read_super(const char *path, struct bch_opts *opts, static int __bch2_read_super(const char *path, struct bch_opts *opts,
struct bch_sb_handle *sb, bool ignore_notbchfs_msg) struct bch_sb_handle *sb, bool ignore_notbchfs_msg)
{ {
u64 offset = opt_get(*opts, sb); u64 offset = opt_get(*opts, sb);


@ -270,6 +270,8 @@ void bch2_fs_read_only(struct bch_fs *c)
BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
bch_verbose(c, "going read-only");
/* /*
* Block new foreground-end write operations from starting - any new * Block new foreground-end write operations from starting - any new
* writes will return -EROFS: * writes will return -EROFS:
@ -297,13 +299,21 @@ void bch2_fs_read_only(struct bch_fs *c)
test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
bool writes_disabled = test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
if (writes_disabled)
bch_verbose(c, "finished waiting for writes to stop");
__bch2_fs_read_only(c); __bch2_fs_read_only(c);
wait_event(bch2_read_only_wait, wait_event(bch2_read_only_wait,
test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
if (!writes_disabled)
bch_verbose(c, "finished waiting for writes to stop");
clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
clear_bit(BCH_FS_GOING_RO, &c->flags); clear_bit(BCH_FS_GOING_RO, &c->flags);
clear_bit(BCH_FS_RW, &c->flags);
if (!bch2_journal_error(&c->journal) && if (!bch2_journal_error(&c->journal) &&
!test_bit(BCH_FS_ERROR, &c->flags) && !test_bit(BCH_FS_ERROR, &c->flags) &&
@ -319,9 +329,9 @@ void bch2_fs_read_only(struct bch_fs *c)
bch_verbose(c, "marking filesystem clean"); bch_verbose(c, "marking filesystem clean");
bch2_fs_mark_clean(c); bch2_fs_mark_clean(c);
} else {
bch_verbose(c, "done going read-only, filesystem not clean");
} }
clear_bit(BCH_FS_RW, &c->flags);
} }
static void bch2_fs_read_only_work(struct work_struct *work) static void bch2_fs_read_only_work(struct work_struct *work)
@ -424,6 +434,18 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_dev_allocator_add(c, ca); bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c); bch2_recalc_capacity(c);
set_bit(BCH_FS_RW, &c->flags);
set_bit(BCH_FS_WAS_RW, &c->flags);
#ifndef BCH_WRITE_REF_DEBUG
percpu_ref_reinit(&c->writes);
#else
for (i = 0; i < BCH_WRITE_REF_NR; i++) {
BUG_ON(atomic_long_read(&c->writes[i]));
atomic_long_inc(&c->writes[i]);
}
#endif
ret = bch2_gc_thread_start(c); ret = bch2_gc_thread_start(c);
if (ret) { if (ret) {
bch_err(c, "error starting gc thread"); bch_err(c, "error starting gc thread");
@ -440,24 +462,16 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
goto err; goto err;
} }
#ifndef BCH_WRITE_REF_DEBUG
percpu_ref_reinit(&c->writes);
#else
for (i = 0; i < BCH_WRITE_REF_NR; i++) {
BUG_ON(atomic_long_read(&c->writes[i]));
atomic_long_inc(&c->writes[i]);
}
#endif
set_bit(BCH_FS_RW, &c->flags);
set_bit(BCH_FS_WAS_RW, &c->flags);
bch2_do_discards(c); bch2_do_discards(c);
bch2_do_invalidates(c); bch2_do_invalidates(c);
bch2_do_stripe_deletes(c); bch2_do_stripe_deletes(c);
bch2_do_pending_node_rewrites(c); bch2_do_pending_node_rewrites(c);
return 0; return 0;
err: err:
__bch2_fs_read_only(c); if (test_bit(BCH_FS_RW, &c->flags))
bch2_fs_read_only(c);
else
__bch2_fs_read_only(c);
return ret; return ret;
} }
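The reordering above takes the write refs and sets the RW flags before the background threads start, and the error path now goes through bch2_fs_read_only() once RW is set. As a loose model of the BCH_WRITE_REF_DEBUG side of this -- one counter per writer class instead of a single percpu ref, so a leaked ref can be attributed -- with made-up ref names:

#include <stdatomic.h>
#include <stdio.h>

/* Invented writer classes; the real enum lives in the bcachefs headers. */
enum { WRITE_REF_trans, WRITE_REF_journal, WRITE_REF_NR };

static atomic_long write_refs[WRITE_REF_NR];

static void writes_reinit(void)
{
	/* Mirrors the loop in the hunk: every class takes one base ref,
	 * and a leftover ref from a previous RW period would trip here. */
	for (int i = 0; i < WRITE_REF_NR; i++) {
		if (atomic_load(&write_refs[i]) != 0) {
			fprintf(stderr, "leaked write ref %d\n", i);
			return;
		}
		atomic_fetch_add(&write_refs[i], 1);
	}
}

static void write_ref_get(int i) { atomic_fetch_add(&write_refs[i], 1); }
static void write_ref_put(int i) { atomic_fetch_sub(&write_refs[i], 1); }

int main(void)
{
	writes_reinit();
	write_ref_get(WRITE_REF_trans);
	write_ref_put(WRITE_REF_trans);
	printf("trans refs now: %ld\n", atomic_load(&write_refs[WRITE_REF_trans]));
	return 0;
}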


@ -258,15 +258,16 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
struct btree_iter iter; struct btree_iter iter;
struct bkey_s_c k; struct bkey_s_c k;
enum btree_id id; enum btree_id id;
u64 nr_uncompressed_extents = 0, struct compression_type_stats {
nr_compressed_extents = 0, u64 nr_extents;
nr_incompressible_extents = 0, u64 sectors_compressed;
uncompressed_sectors = 0, u64 sectors_uncompressed;
incompressible_sectors = 0, } s[BCH_COMPRESSION_TYPE_NR];
compressed_sectors_compressed = 0, u64 compressed_incompressible = 0;
compressed_sectors_uncompressed = 0;
int ret = 0; int ret = 0;
memset(s, 0, sizeof(s));
if (!test_bit(BCH_FS_STARTED, &c->flags)) if (!test_bit(BCH_FS_STARTED, &c->flags))
return -EPERM; return -EPERM;
@ -279,36 +280,30 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
for_each_btree_key(trans, iter, id, POS_MIN, for_each_btree_key(trans, iter, id, POS_MIN,
BTREE_ITER_ALL_SNAPSHOTS, k, ret) { BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct bch_extent_crc_unpacked crc;
const union bch_extent_entry *entry; const union bch_extent_entry *entry;
struct extent_ptr_decoded p; bool compressed = false, incompressible = false;
bool compressed = false, uncompressed = false, incompressible = false;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { bkey_for_each_crc(k.k, ptrs, crc, entry) {
switch (p.crc.compression_type) { incompressible |= crc.compression_type == BCH_COMPRESSION_TYPE_incompressible;
case BCH_COMPRESSION_TYPE_none: compressed |= crc_is_compressed(crc);
uncompressed = true;
uncompressed_sectors += k.k->size; if (crc_is_compressed(crc)) {
break; s[crc.compression_type].nr_extents++;
case BCH_COMPRESSION_TYPE_incompressible: s[crc.compression_type].sectors_compressed += crc.compressed_size;
incompressible = true; s[crc.compression_type].sectors_uncompressed += crc.uncompressed_size;
incompressible_sectors += k.k->size;
break;
default:
compressed_sectors_compressed +=
p.crc.compressed_size;
compressed_sectors_uncompressed +=
p.crc.uncompressed_size;
compressed = true;
break;
} }
} }
if (incompressible) compressed_incompressible += compressed && incompressible;
nr_incompressible_extents++;
else if (uncompressed) if (!compressed) {
nr_uncompressed_extents++; unsigned t = incompressible ? BCH_COMPRESSION_TYPE_incompressible : 0;
else if (compressed)
nr_compressed_extents++; s[t].nr_extents++;
s[t].sectors_compressed += k.k->size;
s[t].sectors_uncompressed += k.k->size;
}
} }
bch2_trans_iter_exit(trans, &iter); bch2_trans_iter_exit(trans, &iter);
} }
@ -318,26 +313,45 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
if (ret) if (ret)
return ret; return ret;
prt_printf(out, "uncompressed:\n"); prt_str(out, "type");
prt_printf(out, " nr extents: %llu\n", nr_uncompressed_extents); printbuf_tabstop_push(out, 12);
prt_printf(out, " size: "); prt_tab(out);
prt_human_readable_u64(out, uncompressed_sectors << 9);
prt_printf(out, "\n");
prt_printf(out, "compressed:\n"); prt_str(out, "compressed");
prt_printf(out, " nr extents: %llu\n", nr_compressed_extents); printbuf_tabstop_push(out, 16);
prt_printf(out, " compressed size: "); prt_tab_rjust(out);
prt_human_readable_u64(out, compressed_sectors_compressed << 9);
prt_printf(out, "\n"); prt_str(out, "uncompressed");
prt_printf(out, " uncompressed size: "); printbuf_tabstop_push(out, 16);
prt_human_readable_u64(out, compressed_sectors_uncompressed << 9); prt_tab_rjust(out);
prt_printf(out, "\n");
prt_str(out, "average extent size");
printbuf_tabstop_push(out, 24);
prt_tab_rjust(out);
prt_newline(out);
for (unsigned i = 0; i < ARRAY_SIZE(s); i++) {
prt_str(out, bch2_compression_types[i]);
prt_tab(out);
prt_human_readable_u64(out, s[i].sectors_compressed << 9);
prt_tab_rjust(out);
prt_human_readable_u64(out, s[i].sectors_uncompressed << 9);
prt_tab_rjust(out);
prt_human_readable_u64(out, s[i].nr_extents
? div_u64(s[i].sectors_uncompressed << 9, s[i].nr_extents)
: 0);
prt_tab_rjust(out);
prt_newline(out);
}
if (compressed_incompressible) {
prt_printf(out, "%llu compressed & incompressible extents", compressed_incompressible);
prt_newline(out);
}
prt_printf(out, "incompressible:\n");
prt_printf(out, " nr extents: %llu\n", nr_incompressible_extents);
prt_printf(out, " size: ");
prt_human_readable_u64(out, incompressible_sectors << 9);
prt_printf(out, "\n");
return 0; return 0;
} }
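The reworked stats keep one accumulator per compression type and derive an average extent size from sectors_uncompressed and nr_extents. A self-contained sketch of that accumulation (the type names and sample numbers here are invented, only the arithmetic mirrors the hunk above):

#include <stdint.h>
#include <stdio.h>

/* Illustrative compression types -- not the real BCH_COMPRESSION_TYPE enum. */
enum { COMP_none, COMP_lz4, COMP_incompressible, COMP_NR };
static const char *comp_names[COMP_NR] = { "none", "lz4", "incompressible" };

struct comp_stats {
	uint64_t	nr_extents;
	uint64_t	sectors_compressed;
	uint64_t	sectors_uncompressed;
};

int main(void)
{
	struct comp_stats s[COMP_NR] = { 0 };

	/* Pretend we walked two extents: one lz4-compressed, one plain. */
	s[COMP_lz4].nr_extents++;
	s[COMP_lz4].sectors_compressed   += 48;	/* crc.compressed_size   */
	s[COMP_lz4].sectors_uncompressed += 128;	/* crc.uncompressed_size */

	s[COMP_none].nr_extents++;
	s[COMP_none].sectors_compressed   += 64;	/* k.k->size */
	s[COMP_none].sectors_uncompressed += 64;

	for (int i = 0; i < COMP_NR; i++) {
		uint64_t avg = s[i].nr_extents
			? (s[i].sectors_uncompressed << 9) / s[i].nr_extents
			: 0;

		printf("%-16s %10llu %10llu %10llu bytes avg\n", comp_names[i],
		       (unsigned long long)(s[i].sectors_compressed << 9),
		       (unsigned long long)(s[i].sectors_uncompressed << 9),
		       (unsigned long long)avg);
	}
	return 0;
}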
@ -786,32 +800,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 16);
printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 16);
prt_tab(out); bch2_dev_usage_to_text(out, &stats);
prt_str(out, "buckets");
prt_tab_rjust(out);
prt_str(out, "sectors");
prt_tab_rjust(out);
prt_str(out, "fragmented");
prt_tab_rjust(out);
prt_newline(out);
for (i = 0; i < BCH_DATA_NR; i++) {
prt_str(out, bch2_data_types[i]);
prt_tab(out);
prt_u64(out, stats.d[i].buckets);
prt_tab_rjust(out);
prt_u64(out, stats.d[i].sectors);
prt_tab_rjust(out);
prt_u64(out, stats.d[i].fragmented);
prt_tab_rjust(out);
prt_newline(out);
}
prt_str(out, "ec");
prt_tab(out);
prt_u64(out, stats.buckets_ec);
prt_tab_rjust(out);
prt_newline(out);
prt_newline(out); prt_newline(out);


@ -32,19 +32,21 @@ DECLARE_EVENT_CLASS(bpos,
TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot) TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot)
); );
DECLARE_EVENT_CLASS(bkey, DECLARE_EVENT_CLASS(str,
TP_PROTO(struct bch_fs *c, const char *k), TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, k), TP_ARGS(c, str),
TP_STRUCT__entry( TP_STRUCT__entry(
__string(k, k ) __field(dev_t, dev )
__string(str, str )
), ),
TP_fast_assign( TP_fast_assign(
__assign_str(k, k); __entry->dev = c->dev;
__assign_str(str, str);
), ),
TP_printk("%s", __get_str(k)) TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
); );
DECLARE_EVENT_CLASS(btree_node, DECLARE_EVENT_CLASS(btree_node,
@ -736,22 +738,22 @@ TRACE_EVENT(bucket_evacuate,
__entry->dev_idx, __entry->bucket) __entry->dev_idx, __entry->bucket)
); );
DEFINE_EVENT(bkey, move_extent, DEFINE_EVENT(str, move_extent,
TP_PROTO(struct bch_fs *c, const char *k), TP_PROTO(struct bch_fs *c, const char *k),
TP_ARGS(c, k) TP_ARGS(c, k)
); );
DEFINE_EVENT(bkey, move_extent_read, DEFINE_EVENT(str, move_extent_read,
TP_PROTO(struct bch_fs *c, const char *k), TP_PROTO(struct bch_fs *c, const char *k),
TP_ARGS(c, k) TP_ARGS(c, k)
); );
DEFINE_EVENT(bkey, move_extent_write, DEFINE_EVENT(str, move_extent_write,
TP_PROTO(struct bch_fs *c, const char *k), TP_PROTO(struct bch_fs *c, const char *k),
TP_ARGS(c, k) TP_ARGS(c, k)
); );
DEFINE_EVENT(bkey, move_extent_finish, DEFINE_EVENT(str, move_extent_finish,
TP_PROTO(struct bch_fs *c, const char *k), TP_PROTO(struct bch_fs *c, const char *k),
TP_ARGS(c, k) TP_ARGS(c, k)
); );
@ -773,7 +775,7 @@ TRACE_EVENT(move_extent_fail,
TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg)) TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg))
); );
DEFINE_EVENT(bkey, move_extent_alloc_mem_fail, DEFINE_EVENT(str, move_extent_alloc_mem_fail,
TP_PROTO(struct bch_fs *c, const char *k), TP_PROTO(struct bch_fs *c, const char *k),
TP_ARGS(c, k) TP_ARGS(c, k)
); );
@ -1366,6 +1368,16 @@ TRACE_EVENT(write_buffer_flush_slowpath,
TP_printk("%zu/%zu", __entry->slowpath, __entry->total) TP_printk("%zu/%zu", __entry->slowpath, __entry->total)
); );
DEFINE_EVENT(str, rebalance_extent,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
DEFINE_EVENT(str, data_update,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
#endif /* _TRACE_BCACHEFS_H */ #endif /* _TRACE_BCACHEFS_H */
/* This part must be outside protection */ /* This part must be outside protection */