Update bcachefs sources to 0d63ed13ea3d closures: Fix race in closure_sync()

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet 2023-10-25 02:09:44 -04:00
parent bd9e015334
commit 9799b119c3
52 changed files with 1439 additions and 818 deletions


@ -1 +1 @@
f70a3402188ea797a38fa9f5b729fb6fbe5f5b83
0d63ed13ea3d867055ae5752e2e0514a227d1dcb


@ -47,6 +47,7 @@ typedef struct {
#define smp_rmb() cmm_smp_rmb()
#define smp_mb() cmm_smp_mb()
#define smp_read_barrier_depends() cmm_smp_read_barrier_depends()
#define smp_acquire__after_ctrl_dep() cmm_smp_mb()
#else /* C11_ATOMICS */
@ -205,6 +206,11 @@ static inline i_type a_type##_dec_return(a_type##_t *v) \
return __ATOMIC_DEC_RETURN(&v->counter); \
} \
\
static inline i_type a_type##_dec_return_release(a_type##_t *v) \
{ \
return __ATOMIC_SUB_RETURN_RELEASE(1, &v->counter); \
} \
\
static inline void a_type##_inc(a_type##_t *v) \
{ \
__ATOMIC_INC(&v->counter); \


@ -154,6 +154,7 @@ struct closure {
struct closure *parent;
atomic_t remaining;
bool closure_get_happened;
#ifdef CONFIG_DEBUG_CLOSURES
#define CLOSURE_MAGIC_DEAD 0xc054dead
@ -185,7 +186,11 @@ static inline unsigned closure_nr_remaining(struct closure *cl)
*/
static inline void closure_sync(struct closure *cl)
{
if (closure_nr_remaining(cl) != 1)
#ifdef CONFIG_DEBUG_CLOSURES
BUG_ON(closure_nr_remaining(cl) != 1 && !cl->closure_get_happened);
#endif
if (cl->closure_get_happened)
__closure_sync(cl);
}
@ -233,8 +238,6 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
closure_set_ip(cl);
cl->fn = fn;
cl->wq = wq;
/* between atomic_dec() in closure_put() */
smp_mb__before_atomic();
}
static inline void closure_queue(struct closure *cl)
@ -259,6 +262,8 @@ static inline void closure_queue(struct closure *cl)
*/
static inline void closure_get(struct closure *cl)
{
cl->closure_get_happened = true;
#ifdef CONFIG_DEBUG_CLOSURES
BUG_ON((atomic_inc_return(&cl->remaining) &
CLOSURE_REMAINING_MASK) <= 1);
@ -281,6 +286,7 @@ static inline void closure_init(struct closure *cl, struct closure *parent)
closure_get(parent);
atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
cl->closure_get_happened = false;
closure_debug_create(cl);
closure_set_ip(cl);
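
Note (illustration, not part of the diff): the closure.h hunks above fix the closure_sync() race by tracking whether closure_get() was ever called. Below is a minimal, self-contained sketch of that logic using C11 atomics in place of the kernel's closure machinery; the names closure_model/model_* are invented for this sketch, and a spin loop stands in for the waitqueue used by the real __closure_sync().

#include <stdatomic.h>
#include <stdbool.h>

struct closure_model {
	atomic_int	remaining;		/* stands in for cl->remaining */
	bool		closure_get_happened;	/* the new field added above */
};

static void model_init(struct closure_model *cl)
{
	atomic_store(&cl->remaining, 1);
	cl->closure_get_happened = false;	/* cleared in closure_init() */
}

static void model_get(struct closure_model *cl)
{
	cl->closure_get_happened = true;	/* set in closure_get() */
	atomic_fetch_add(&cl->remaining, 1);
}

static void model_put(struct closure_model *cl)
{
	/* release ordering, mirroring the new atomic_dec_return_release() */
	atomic_fetch_sub_explicit(&cl->remaining, 1, memory_order_release);
}

static void model_sync(struct closure_model *cl)
{
	/*
	 * The old check "remaining != 1" could observe 1 while another thread
	 * was still inside closure_put(); the fix is to skip waiting only when
	 * no closure_get() ever happened, so no other thread can hold a ref.
	 */
	if (!cl->closure_get_happened)
		return;

	/* the real __closure_sync() sleeps on a waitqueue instead of spinning */
	while (atomic_load_explicit(&cl->remaining, memory_order_acquire) != 1)
		;
}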


@ -151,6 +151,14 @@ static inline u64 ktime_get_seconds(void)
return ts.tv_sec;
}
static inline u64 ktime_get_real_ns(void)
{
struct timespec ts;
clock_gettime(CLOCK_REALTIME, &ts);
return timespec_to_ns(&ts);
}
static inline u64 ktime_get_real_seconds(void)
{
struct timespec ts;


@ -2,22 +2,10 @@
#ifndef _BCACHEFS_BBPOS_H
#define _BCACHEFS_BBPOS_H
#include "bbpos_types.h"
#include "bkey_methods.h"
#include "btree_cache.h"
struct bbpos {
enum btree_id btree;
struct bpos pos;
};
static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
{
return (struct bbpos) { btree, pos };
}
#define BBPOS_MIN BBPOS(0, POS_MIN)
#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX)
static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
{
return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos);

libbcachefs/bbpos_types.h (new file, 18 lines)

@ -0,0 +1,18 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BBPOS_TYPES_H
#define _BCACHEFS_BBPOS_TYPES_H
struct bbpos {
enum btree_id btree;
struct bpos pos;
};
static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
{
return (struct bbpos) { btree, pos };
}
#define BBPOS_MIN BBPOS(0, POS_MIN)
#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX)
#endif /* _BCACHEFS_BBPOS_TYPES_H */


@ -418,6 +418,7 @@ enum bch_time_stats {
#include "buckets_types.h"
#include "buckets_waiting_for_journal_types.h"
#include "clock_types.h"
#include "disk_groups_types.h"
#include "ec_types.h"
#include "journal_types.h"
#include "keylist_types.h"
@ -463,6 +464,7 @@ enum gc_phase {
GC_PHASE_BTREE_snapshot_trees,
GC_PHASE_BTREE_deleted_inodes,
GC_PHASE_BTREE_logged_ops,
GC_PHASE_BTREE_rebalance_work,
GC_PHASE_PENDING_DELETE,
};
@ -938,9 +940,6 @@ struct bch_fs {
struct list_head moving_context_list;
struct mutex moving_context_lock;
struct list_head data_progress_list;
struct mutex data_progress_lock;
/* REBALANCE */
struct bch_fs_rebalance rebalance;


@ -613,31 +613,17 @@ struct bch_extent_stripe_ptr {
#endif
};
struct bch_extent_reservation {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:6,
unused:22,
replicas:4,
generation:32;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 generation:32,
replicas:4,
unused:22,
type:6;
#endif
};
struct bch_extent_rebalance {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:7,
unused:33,
compression:8,
__u64 type:6,
unused:34,
compression:8, /* enum bch_compression_opt */
target:16;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 target:16,
compression:8,
unused:33,
type:7;
unused:34,
type:6;
#endif
};
@ -1682,7 +1668,9 @@ struct bch_sb_field_journal_seq_blacklist {
x(snapshot_skiplists, BCH_VERSION(1, 1), \
BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \
x(deleted_inodes, BCH_VERSION(1, 2), \
BIT_ULL(BCH_RECOVERY_PASS_check_inodes))
BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \
x(rebalance_work, BCH_VERSION(1, 3), \
BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@ -1693,7 +1681,7 @@ enum bcachefs_metadata_version {
};
static const __maybe_unused
unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor;
unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
@ -2306,7 +2294,9 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_set)) \
x(logged_ops, 17, 0, \
BIT_ULL(KEY_TYPE_logged_op_truncate)| \
BIT_ULL(KEY_TYPE_logged_op_finsert))
BIT_ULL(KEY_TYPE_logged_op_finsert)) \
x(rebalance_work, 18, BTREE_ID_SNAPSHOTS, \
BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))
enum btree_id {
#define x(name, nr, ...) BTREE_ID_##name = nr,


@ -119,16 +119,6 @@ enum btree_update_flags {
#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \
((1U << KEY_TYPE_alloc)| \
(1U << KEY_TYPE_alloc_v2)| \
(1U << KEY_TYPE_alloc_v3)| \
(1U << KEY_TYPE_alloc_v4)| \
(1U << KEY_TYPE_stripe)| \
(1U << KEY_TYPE_inode)| \
(1U << KEY_TYPE_inode_v2)| \
(1U << KEY_TYPE_snapshot))
static inline int bch2_trans_mark_key(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_i *new,


@ -382,8 +382,7 @@ static int run_one_mem_trigger(struct btree_trans *trans,
if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)))
return 0;
if (old_ops->atomic_trigger == new_ops->atomic_trigger &&
((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
if (old_ops->atomic_trigger == new_ops->atomic_trigger) {
ret = bch2_mark_key(trans, i->btree_id, i->level,
old, bkey_i_to_s_c(new),
BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
@ -425,8 +424,7 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
if (!i->insert_trigger_run &&
!i->overwrite_trigger_run &&
old_ops->trans_trigger == new_ops->trans_trigger &&
((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
old_ops->trans_trigger == new_ops->trans_trigger) {
i->overwrite_trigger_run = true;
i->insert_trigger_run = true;
return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k,


@ -935,14 +935,12 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans,
return 0;
}
int bch2_mark_extent(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
static int __mark_extent(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, unsigned flags)
{
u64 journal_seq = trans->journal_res.seq;
struct bch_fs *c = trans->c;
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
@ -1018,6 +1016,14 @@ int bch2_mark_extent(struct btree_trans *trans,
return 0;
}
int bch2_mark_extent(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags);
}
int bch2_mark_stripe(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
@ -1124,13 +1130,11 @@ int bch2_mark_stripe(struct btree_trans *trans,
return 0;
}
int bch2_mark_reservation(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
static int __mark_reservation(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, unsigned flags)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bch_fs_usage *fs_usage;
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
s64 sectors = (s64) k.k->size;
@ -1157,6 +1161,14 @@ int bch2_mark_reservation(struct btree_trans *trans,
return 0;
}
int bch2_mark_reservation(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags);
}
static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
struct bkey_s_c_reflink_p p,
u64 start, u64 end,
@ -1211,13 +1223,11 @@ fsck_err:
return ret;
}
int bch2_mark_reflink_p(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
static int __mark_reflink_p(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, unsigned flags)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
struct reflink_gc *ref;
size_t l, r, m;
@ -1251,6 +1261,14 @@ int bch2_mark_reflink_p(struct btree_trans *trans,
return ret;
}
int bch2_mark_reflink_p(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags);
}
void bch2_trans_fs_usage_revert(struct btree_trans *trans,
struct replicas_delta_list *deltas)
{
@ -1452,15 +1470,11 @@ err:
return ret;
}
int bch2_trans_mark_extent(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_i *new,
unsigned flags)
static int __trans_mark_extent(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, unsigned flags)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
? old
: bkey_i_to_s_c(new);
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
@ -1517,6 +1531,24 @@ int bch2_trans_mark_extent(struct btree_trans *trans,
return ret;
}
int bch2_trans_mark_extent(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_i *new,
unsigned flags)
{
struct bch_fs *c = trans->c;
int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) -
(int) bch2_bkey_needs_rebalance(c, old);
if (mod) {
int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0);
if (ret)
return ret;
}
return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags);
}
static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
struct bkey_s_c_stripe s,
unsigned idx, bool deleting)
@ -1670,15 +1702,10 @@ int bch2_trans_mark_stripe(struct btree_trans *trans,
return ret;
}
int bch2_trans_mark_reservation(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old,
struct bkey_i *new,
unsigned flags)
static int __trans_mark_reservation(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, unsigned flags)
{
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
? old
: bkey_i_to_s_c(new);
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
s64 sectors = (s64) k.k->size;
struct replicas_delta_list *d;
@ -1700,7 +1727,16 @@ int bch2_trans_mark_reservation(struct btree_trans *trans,
return 0;
}
static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
int bch2_trans_mark_reservation(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old,
struct bkey_i *new,
unsigned flags)
{
return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags);
}
static int trans_mark_reflink_p_segment(struct btree_trans *trans,
struct bkey_s_c_reflink_p p,
u64 *idx, unsigned flags)
{
@ -1767,35 +1803,38 @@ err:
return ret;
}
int bch2_trans_mark_reflink_p(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old,
struct bkey_i *new,
unsigned flags)
static int __trans_mark_reflink_p(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, unsigned flags)
{
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
? old
: bkey_i_to_s_c(new);
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
u64 idx, end_idx;
int ret = 0;
if (flags & BTREE_TRIGGER_INSERT) {
struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
v->front_pad = v->back_pad = 0;
}
idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
end_idx = le64_to_cpu(p.v->idx) + p.k->size +
le32_to_cpu(p.v->back_pad);
while (idx < end_idx && !ret)
ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags);
ret = trans_mark_reflink_p_segment(trans, p, &idx, flags);
return ret;
}
int bch2_trans_mark_reflink_p(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old,
struct bkey_i *new,
unsigned flags)
{
if (flags & BTREE_TRIGGER_INSERT) {
struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v;
v->front_pad = v->back_pad = 0;
}
return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags);
}
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_dev *ca, size_t b,
enum bch_data_type type,
@ -1825,16 +1864,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
bch2_data_types[type],
bch2_data_types[type]);
ret = -EIO;
goto out;
goto err;
}
a->v.data_type = type;
a->v.dirty_sectors = sectors;
ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
if (ret)
goto out;
out:
if (a->v.data_type != type ||
a->v.dirty_sectors != sectors) {
a->v.data_type = type;
a->v.dirty_sectors = sectors;
ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
}
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@ -1929,6 +1968,22 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
return ret;
}
int bch2_trans_mark_dev_sbs(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
for_each_online_member(ca, c, i) {
int ret = bch2_trans_mark_dev_sb(c, ca);
if (ret) {
percpu_ref_put(&ca->ref);
return ret;
}
}
return 0;
}
/* Disk reservations: */
#define SECTORS_CACHE 1024


@ -339,12 +339,27 @@ int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct
int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
#define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
({ \
int ret = 0; \
\
if (_old.k->type) \
ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \
if (!ret && _new.k->type) \
ret = _fn(_trans, _btree_id, _level, _new, _flags & ~BTREE_TRIGGER_OVERWRITE); \
ret; \
})
#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags) \
mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, bkey_i_to_s_c(_new), _flags)
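
Note (illustration, not part of the diff): the two macros above replace the BTREE_TRIGGER_WANTS_OLD_AND_NEW type mask that this commit removes. A sketch of the resulting pattern, using a hypothetical __mark_foo()/bch2_mark_foo() pair; the real conversions (__mark_extent(), __mark_reservation(), __mark_reflink_p()) are shown in the buckets.c hunks above.

/* hypothetical single-key trigger: sees either the overwritten or the inserted key */
static int __mark_foo(struct btree_trans *trans,
		      enum btree_id btree_id, unsigned level,
		      struct bkey_s_c k, unsigned flags)
{
	/* account k according to flags (BTREE_TRIGGER_OVERWRITE or _INSERT) */
	return 0;
}

static int bch2_mark_foo(struct btree_trans *trans,
			 enum btree_id btree_id, unsigned level,
			 struct bkey_s_c old, struct bkey_s_c new,
			 unsigned flags)
{
	/*
	 * Runs __mark_foo() on the old key (when present) with
	 * BTREE_TRIGGER_INSERT cleared, then on the new key (when present)
	 * with BTREE_TRIGGER_OVERWRITE cleared, instead of gating a combined
	 * old+new call on a per-type bitmask.
	 */
	return mem_trigger_run_overwrite_then_insert(__mark_foo, trans,
						     btree_id, level, old, new, flags);
}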
void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
size_t, enum bch_data_type, unsigned);
int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
int bch2_trans_mark_dev_sbs(struct bch_fs *);
static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
{


@ -332,8 +332,8 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
struct bch_ioctl_data_event e = {
.type = BCH_DATA_EVENT_PROGRESS,
.p.data_type = ctx->stats.data_type,
.p.btree_id = ctx->stats.btree_id,
.p.pos = ctx->stats.pos,
.p.btree_id = ctx->stats.pos.btree,
.p.pos = ctx->stats.pos.pos,
.p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
.p.sectors_total = bch2_fs_usage_read_short(c).used,
};


@ -697,14 +697,32 @@ err:
return ret;
}
void bch2_compression_opt_to_text(struct printbuf *out, u64 v)
{
struct bch_compression_opt opt = bch2_compression_decode(v);
if (opt.type < BCH_COMPRESSION_OPT_NR)
prt_str(out, bch2_compression_opts[opt.type]);
else
prt_printf(out, "(unknown compression opt %u)", opt.type);
if (opt.level)
prt_printf(out, ":%u", opt.level);
}
void bch2_opt_compression_to_text(struct printbuf *out,
struct bch_fs *c,
struct bch_sb *sb,
u64 v)
{
struct bch_compression_opt opt = bch2_compression_decode(v);
prt_str(out, bch2_compression_opts[opt.type]);
if (opt.level)
prt_printf(out, ":%u", opt.level);
return bch2_compression_opt_to_text(out, v);
}
int bch2_opt_compression_validate(u64 v, struct printbuf *err)
{
if (!bch2_compression_opt_valid(v)) {
prt_printf(err, "invalid compression opt %llu", v);
return -BCH_ERR_invalid_sb_opt_compression;
}
return 0;
}


@ -4,12 +4,18 @@
#include "extents_types.h"
static const unsigned __bch2_compression_opt_to_type[] = {
#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
BCH_COMPRESSION_OPTS()
#undef x
};
struct bch_compression_opt {
u8 type:4,
level:4;
};
static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
static inline struct bch_compression_opt __bch2_compression_decode(unsigned v)
{
return (struct bch_compression_opt) {
.type = v & 15,
@ -17,17 +23,25 @@ static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
};
}
static inline bool bch2_compression_opt_valid(unsigned v)
{
struct bch_compression_opt opt = __bch2_compression_decode(v);
return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level);
}
static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
{
return bch2_compression_opt_valid(v)
? __bch2_compression_decode(v)
: (struct bch_compression_opt) { 0 };
}
static inline unsigned bch2_compression_encode(struct bch_compression_opt opt)
{
return opt.type|(opt.level << 4);
}
static const unsigned __bch2_compression_opt_to_type[] = {
#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
BCH_COMPRESSION_OPTS()
#undef x
};
static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
{
return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
@ -44,12 +58,16 @@ int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
void bch2_fs_compress_exit(struct bch_fs *);
int bch2_fs_compress_init(struct bch_fs *);
void bch2_compression_opt_to_text(struct printbuf *, u64);
int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
int bch2_opt_compression_validate(u64, struct printbuf *);
#define bch2_opt_compression (struct bch_opt_fn) { \
.parse = bch2_opt_compression_parse, \
.to_text = bch2_opt_compression_to_text, \
.parse = bch2_opt_compression_parse, \
.to_text = bch2_opt_compression_to_text, \
.validate = bch2_opt_compression_validate, \
}
#endif /* _BCACHEFS_COMPRESS_H */
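
Note (illustration, not part of the diff): a quick sketch of how the 4-bit type / 4-bit level packing above round-trips through the new validity helpers, assuming this header is included; the assert()s are purely illustrative.

#include <assert.h>

static void compression_opt_example(void)
{
	struct bch_compression_opt opt = {
		.type	= BCH_COMPRESSION_OPT_zstd,	/* low 4 bits of the encoded value */
		.level	= 7,				/* high 4 bits */
	};
	unsigned v = bch2_compression_encode(opt);

	assert(bch2_compression_opt_valid(v));
	assert(bch2_compression_decode(v).type  == BCH_COMPRESSION_OPT_zstd);
	assert(bch2_compression_decode(v).level == 7);

	/* out-of-range values now decode to "none" instead of garbage: */
	assert(bch2_compression_decode(0xf).type == BCH_COMPRESSION_OPT_none);
}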


@ -13,6 +13,7 @@
#include "keylist.h"
#include "move.h"
#include "nocow_locking.h"
#include "rebalance.h"
#include "subvolume.h"
#include "trace.h"
@ -251,11 +252,11 @@ restart_drop_extra_replicas:
ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, insert->k.p);
if (ret)
goto err;
ret = bch2_trans_update(trans, &iter, insert,
k.k->p, insert->k.p) ?:
bch2_bkey_set_needs_rebalance(c, insert,
op->opts.background_target,
op->opts.background_compression) ?:
bch2_trans_update(trans, &iter, insert,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, &op->res,
NULL,
@ -281,11 +282,11 @@ next:
}
continue;
nowork:
if (m->ctxt && m->ctxt->stats) {
if (m->stats && m->stats) {
BUG_ON(k.k->p.offset <= iter.pos.offset);
atomic64_inc(&m->ctxt->stats->keys_raced);
atomic64_inc(&m->stats->keys_raced);
atomic64_add(k.k->p.offset - iter.pos.offset,
&m->ctxt->stats->sectors_raced);
&m->stats->sectors_raced);
}
this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]);
@ -439,6 +440,8 @@ int bch2_data_update_init(struct btree_trans *trans,
bch2_bkey_buf_reassemble(&m->k, c, k);
m->btree_id = btree_id;
m->data_opts = data_opts;
m->ctxt = ctxt;
m->stats = ctxt ? ctxt->stats : NULL;
bch2_write_op_init(&m->op, c, io_opts);
m->op.pos = bkey_start_pos(k.k);
@ -487,7 +490,7 @@ int bch2_data_update_init(struct btree_trans *trans,
if (c->opts.nocow_enabled) {
if (ctxt) {
move_ctxt_wait_event(ctxt, trans,
move_ctxt_wait_event(ctxt,
(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
PTR_BUCKET_POS(c, &p.ptr), 0)) ||
!atomic_read(&ctxt->read_sectors));


@ -23,6 +23,7 @@ struct data_update {
struct bkey_buf k;
struct data_update_opts data_opts;
struct moving_context *ctxt;
struct bch_move_stats *stats;
struct bch_write_op op;
};


@ -175,6 +175,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
dst->deleted = BCH_GROUP_DELETED(src);
dst->parent = BCH_GROUP_PARENT(src);
memcpy(dst->label, src->label, sizeof(dst->label));
}
for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
@ -382,7 +383,57 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
return v;
}
void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v)
void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
{
struct bch_disk_groups_cpu *groups;
struct bch_disk_group_cpu *g;
unsigned nr = 0;
u16 path[32];
out->atomic++;
rcu_read_lock();
groups = rcu_dereference(c->disk_groups);
if (!groups)
goto invalid;
while (1) {
if (nr == ARRAY_SIZE(path))
goto invalid;
if (v >= groups->nr)
goto invalid;
g = groups->entries + v;
if (g->deleted)
goto invalid;
path[nr++] = v;
if (!g->parent)
break;
v = g->parent - 1;
}
while (nr) {
v = path[--nr];
g = groups->entries + v;
prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
if (nr)
prt_printf(out, ".");
}
out:
rcu_read_unlock();
out->atomic--;
return;
invalid:
prt_printf(out, "invalid label %u", v);
goto out;
}
void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
{
struct bch_sb_field_disk_groups *groups =
bch2_sb_field_get(sb, disk_groups);
@ -493,10 +544,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
return -EINVAL;
}
void bch2_opt_target_to_text(struct printbuf *out,
struct bch_fs *c,
struct bch_sb *sb,
u64 v)
void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
{
struct target t = target_decode(v);
@ -504,47 +552,69 @@ void bch2_opt_target_to_text(struct printbuf *out,
case TARGET_NULL:
prt_printf(out, "none");
break;
case TARGET_DEV:
if (c) {
struct bch_dev *ca;
case TARGET_DEV: {
struct bch_dev *ca;
rcu_read_lock();
ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
: NULL;
rcu_read_lock();
ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
: NULL;
if (ca && percpu_ref_tryget(&ca->io_ref)) {
prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
percpu_ref_put(&ca->io_ref);
} else if (ca) {
prt_printf(out, "offline device %u", t.dev);
} else {
prt_printf(out, "invalid device %u", t.dev);
}
rcu_read_unlock();
if (ca && percpu_ref_tryget(&ca->io_ref)) {
prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
percpu_ref_put(&ca->io_ref);
} else if (ca) {
prt_printf(out, "offline device %u", t.dev);
} else {
struct bch_member m = bch2_sb_member_get(sb, t.dev);
if (bch2_dev_exists(sb, t.dev)) {
prt_printf(out, "Device ");
pr_uuid(out, m.uuid.b);
prt_printf(out, " (%u)", t.dev);
} else {
prt_printf(out, "Bad device %u", t.dev);
}
prt_printf(out, "invalid device %u", t.dev);
}
rcu_read_unlock();
break;
}
case TARGET_GROUP:
if (c) {
mutex_lock(&c->sb_lock);
bch2_disk_path_to_text(out, c->disk_sb.sb, t.group);
mutex_unlock(&c->sb_lock);
} else {
bch2_disk_path_to_text(out, sb, t.group);
}
bch2_disk_path_to_text(out, c, t.group);
break;
default:
BUG();
}
}
void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
{
struct target t = target_decode(v);
switch (t.type) {
case TARGET_NULL:
prt_printf(out, "none");
break;
case TARGET_DEV: {
struct bch_member m = bch2_sb_member_get(sb, t.dev);
if (bch2_dev_exists(sb, t.dev)) {
prt_printf(out, "Device ");
pr_uuid(out, m.uuid.b);
prt_printf(out, " (%u)", t.dev);
} else {
prt_printf(out, "Bad device %u", t.dev);
}
break;
}
case TARGET_GROUP:
bch2_disk_path_to_text_sb(out, sb, t.group);
break;
default:
BUG();
}
}
void bch2_opt_target_to_text(struct printbuf *out,
struct bch_fs *c,
struct bch_sb *sb,
u64 v)
{
if (c)
bch2_target_to_text(out, c, v);
else
bch2_target_to_text_sb(out, sb, v);
}


@ -2,6 +2,8 @@
#ifndef _BCACHEFS_DISK_GROUPS_H
#define _BCACHEFS_DISK_GROUPS_H
#include "disk_groups_types.h"
extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
@ -83,7 +85,10 @@ int bch2_disk_path_find(struct bch_sb_handle *, const char *);
/* Exported for userspace bcachefs-tools: */
int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned);
void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned);
void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned);
void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned);
int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);


@ -0,0 +1,18 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H
#define _BCACHEFS_DISK_GROUPS_TYPES_H
struct bch_disk_group_cpu {
bool deleted;
u16 parent;
u8 label[BCH_SB_LABEL_SIZE];
struct bch_devs_mask devs;
};
struct bch_disk_groups_cpu {
struct rcu_head rcu;
unsigned nr;
struct bch_disk_group_cpu entries[] __counted_by(nr);
};
#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */


@ -213,6 +213,7 @@
x(BCH_ERR_invalid_sb, invalid_sb_crypt) \
x(BCH_ERR_invalid_sb, invalid_sb_clean) \
x(BCH_ERR_invalid_sb, invalid_sb_quota) \
x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \
x(BCH_ERR_invalid, invalid_bkey) \
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
x(EIO, btree_node_read_err) \


@ -13,6 +13,7 @@
#include "btree_iter.h"
#include "buckets.h"
#include "checksum.h"
#include "compress.h"
#include "debug.h"
#include "disk_groups.h"
#include "error.h"
@ -757,18 +758,6 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
return i;
}
static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
{
union bch_extent_entry *next = extent_entry_next(entry);
/* stripes have ptrs, but their layout doesn't work with this code */
BUG_ON(k.k->type == KEY_TYPE_stripe);
memmove_u64s_down(entry, next,
(u64 *) bkey_val_end(k) - (u64 *) next);
k.k->u64s -= (u64 *) next - (u64 *) entry;
}
/*
* Returns pointer to the next entry after the one being dropped:
*/
@ -992,10 +981,6 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct bch_extent_crc_unpacked crc;
const struct bch_extent_ptr *ptr;
const struct bch_extent_stripe_ptr *ec;
struct bch_dev *ca;
bool first = true;
if (c)
@ -1006,9 +991,9 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " ");
switch (__extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
case BCH_EXTENT_ENTRY_ptr: {
const struct bch_extent_ptr *ptr = entry_to_ptr(entry);
struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
? bch_dev_bkey_exists(c, ptr->dev)
: NULL;
@ -1030,10 +1015,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " stale");
}
break;
}
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128:
crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
case BCH_EXTENT_ENTRY_crc128: {
struct bch_extent_crc_unpacked crc =
bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
crc.compressed_size,
@ -1042,12 +1029,26 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
bch2_csum_types[crc.csum_type],
bch2_compression_types[crc.compression_type]);
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
ec = &entry->stripe_ptr;
}
case BCH_EXTENT_ENTRY_stripe_ptr: {
const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;
prt_printf(out, "ec: idx %llu block %u",
(u64) ec->idx, ec->block);
break;
}
case BCH_EXTENT_ENTRY_rebalance: {
const struct bch_extent_rebalance *r = &entry->rebalance;
prt_str(out, "rebalance: target ");
if (c)
bch2_target_to_text(out, c, r->target);
else
prt_printf(out, "%u", r->target);
prt_str(out, " compression ");
bch2_compression_opt_to_text(out, r->compression);
break;
}
default:
prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
return;
@ -1207,6 +1208,14 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
return -BCH_ERR_invalid_bkey;
}
crc_since_last_ptr = true;
if (crc_is_encoded(crc) &&
(crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
(flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT))) {
prt_printf(err, "too large encoded extent");
return -BCH_ERR_invalid_bkey;
}
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
if (have_ec) {
@ -1215,9 +1224,18 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
}
have_ec = true;
break;
case BCH_EXTENT_ENTRY_rebalance:
case BCH_EXTENT_ENTRY_rebalance: {
const struct bch_extent_rebalance *r = &entry->rebalance;
if (!bch2_compression_opt_valid(r->compression)) {
struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
prt_printf(err, "invalid compression opt %u:%u",
opt.type, opt.level);
return -BCH_ERR_invalid_bkey;
}
break;
}
}
}
if (!nr_ptrs) {
@ -1281,6 +1299,125 @@ void bch2_ptr_swab(struct bkey_s k)
}
}
const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
bkey_extent_entry_for_each(ptrs, entry)
if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
return &entry->rebalance;
return NULL;
}
unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
unsigned target, unsigned compression)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
unsigned rewrite_ptrs = 0;
if (compression) {
unsigned compression_type = bch2_compression_opt_to_type(compression);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
unsigned i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) {
rewrite_ptrs = 0;
goto incompressible;
}
if (!p.ptr.cached && p.crc.compression_type != compression_type)
rewrite_ptrs |= 1U << i;
i++;
}
}
incompressible:
if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
const struct bch_extent_ptr *ptr;
unsigned i = 0;
bkey_for_each_ptr(ptrs, ptr) {
if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target))
rewrite_ptrs |= 1U << i;
i++;
}
}
return rewrite_ptrs;
}
bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
{
const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
/*
* If it's an indirect extent, we don't delete the rebalance entry when
* done so that we know what options were applied - check if it still
* needs work done:
*/
if (r &&
k.k->type == KEY_TYPE_reflink_v &&
!bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression))
r = NULL;
return r != NULL;
}
int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
unsigned target, unsigned compression)
{
struct bkey_s k = bkey_i_to_s(_k);
struct bch_extent_rebalance *r;
bool needs_rebalance;
if (!bkey_extent_is_direct_data(k.k))
return 0;
/* get existing rebalance entry: */
r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
if (r) {
if (k.k->type == KEY_TYPE_reflink_v) {
/*
* indirect extents: existing options take precedence,
* so that we don't move extents back and forth if
* they're referenced by different inodes with different
* options:
*/
if (r->target)
target = r->target;
if (r->compression)
compression = r->compression;
}
r->target = target;
r->compression = compression;
}
needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression);
if (needs_rebalance && !r) {
union bch_extent_entry *new = bkey_val_end(k);
new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance;
new->rebalance.compression = compression;
new->rebalance.target = target;
new->rebalance.unused = 0;
k.k->u64s += extent_entry_u64s(new);
} else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) {
/*
* For indirect extents, don't delete the rebalance entry when
* we're finished so that we know we specifically moved it or
* compressed it to its current location/compression type
*/
extent_entry_drop(k, (union bch_extent_entry *) r);
}
return 0;
}
/* Generic extent code: */
int bch2_cut_front_s(struct bpos where, struct bkey_s k)


@ -89,6 +89,18 @@ static inline void __extent_entry_insert(struct bkey_i *k,
memcpy_u64s_small(dst, new, extent_entry_u64s(new));
}
static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
{
union bch_extent_entry *next = extent_entry_next(entry);
/* stripes have ptrs, but their layout doesn't work with this code */
BUG_ON(k.k->type == KEY_TYPE_stripe);
memmove_u64s_down(entry, next,
(u64 *) bkey_val_end(k) - (u64 *) next);
k.k->u64s -= (u64 *) next - (u64 *) entry;
}
static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
{
return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
@ -190,6 +202,11 @@ static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
}
static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc)
{
return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc);
}
/* bkey_ptrs: generically over any key type that has ptrs */
struct bkey_ptrs_c {
@ -693,6 +710,14 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c,
void bch2_ptr_swab(struct bkey_s);
const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
unsigned, unsigned);
bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
unsigned, unsigned);
/* Generic extent code: */
enum bch_extent_overlap {
@ -737,22 +762,4 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
k->size = new_size;
}
/*
* In extent_sort_fix_overlapping(), insert_fixup_extent(),
* extent_merge_inline() - we're modifying keys in place that are packed. To do
* that we have to unpack the key, modify the unpacked key - then this
* copies/repacks the unpacked to the original as necessary.
*/
static inline void extent_save(struct btree *b, struct bkey_packed *dst,
struct bkey *src)
{
struct bkey_format *f = &b->format;
struct bkey_i *dst_unpacked;
if ((dst_unpacked = packed_to_bkey(dst)))
dst_unpacked->k = *src;
else
BUG_ON(!bch2_bkey_pack_key(dst, src, f));
}
#endif /* _BCACHEFS_EXTENTS_H */


@ -113,6 +113,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
} else {
atomic_set(&dio->cl.remaining,
CLOSURE_REMAINING_INITIALIZER + 1);
dio->cl.closure_get_happened = true;
}
dio->req = req;


@ -1299,6 +1299,28 @@ err:
return ret;
}
static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct bch_extent_crc_unpacked crc;
const union bch_extent_entry *i;
unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
bkey_for_each_crc(k.k, ptrs, crc, i)
if (crc_is_encoded(crc) &&
crc.uncompressed_size > encoded_extent_max_sectors) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf);
printbuf_exit(&buf);
}
return 0;
}
static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k,
struct inode_walker *inode,
@ -1434,7 +1456,8 @@ int bch2_check_extents(struct bch_fs *c)
&res, NULL,
BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
bch2_disk_reservation_put(c, &res);
check_extent(trans, &iter, k, &w, &s, &extent_ends);
check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
check_extent_overbig(trans, &iter, k);
})) ?:
check_i_sectors(trans, &w);
@ -1448,6 +1471,30 @@ int bch2_check_extents(struct bch_fs *c)
return ret;
}
int bch2_check_indirect_extents(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
struct disk_reservation res = { 0 };
int ret = 0;
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
POS_MIN,
BTREE_ITER_PREFETCH, k,
&res, NULL,
BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
bch2_disk_reservation_put(c, &res);
check_extent_overbig(trans, &iter, k);
}));
bch2_disk_reservation_put(c, &res);
bch2_trans_put(trans);
bch_err_fn(c, ret);
return ret;
}
static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;


@ -4,6 +4,7 @@
int bch2_check_inodes(struct bch_fs *);
int bch2_check_extents(struct bch_fs *);
int bch2_check_indirect_extents(struct bch_fs *);
int bch2_check_dirents(struct bch_fs *);
int bch2_check_xattrs(struct bch_fs *);
int bch2_check_root(struct bch_fs *);


@ -6,6 +6,7 @@
#include "bkey_methods.h"
#include "btree_update.h"
#include "buckets.h"
#include "compress.h"
#include "error.h"
#include "extents.h"
#include "extent_update.h"
@ -422,9 +423,10 @@ static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err)
return -BCH_ERR_invalid_bkey;
}
if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) {
prt_printf(err, "invalid data checksum type (%u >= %u)",
unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1);
if (unpacked.bi_compression &&
!bch2_compression_opt_valid(unpacked.bi_compression - 1)) {
prt_printf(err, "invalid compression opt %u",
unpacked.bi_compression - 1);
return -BCH_ERR_invalid_bkey;
}
@ -979,6 +981,18 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
}
int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
{
struct bch_inode_unpacked inode;
int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
if (ret)
return ret;
bch2_inode_opts_get(opts, trans->c, &inode);
return 0;
}
int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
struct bch_fs *c = trans->c;


@ -200,6 +200,7 @@ void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
struct bch_inode_unpacked *);
int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *);
int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
int bch2_delete_dead_inodes(struct bch_fs *);


@ -16,6 +16,7 @@
#include "io_misc.h"
#include "io_write.h"
#include "logged_ops.h"
#include "rebalance.h"
#include "subvolume.h"
/* Overwrites whatever was present with zeroes: */
@ -355,6 +356,7 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
struct btree_iter iter;
struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
struct bch_io_opts opts;
u64 dst_offset = le64_to_cpu(op->v.dst_offset);
u64 src_offset = le64_to_cpu(op->v.src_offset);
s64 shift = dst_offset - src_offset;
@ -363,6 +365,10 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
bool insert = shift > 0;
int ret = 0;
ret = bch2_inum_opts_get(trans, inum, &opts);
if (ret)
return ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(inum.inum, 0),
BTREE_ITER_INTENT);
@ -443,7 +449,10 @@ case LOGGED_OP_FINSERT_shift_extents:
op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
ret = bch2_bkey_set_needs_rebalance(c, copy,
opts.background_target,
opts.background_compression) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
bch2_logged_op_update(trans, &op->k_i) ?:
bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL);


@ -351,10 +351,13 @@ static int bch2_write_index_default(struct bch_write_op *op)
bkey_start_pos(&sk.k->k),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
ret = bch2_extent_update(trans, inum, &iter, sk.k,
&op->res,
op->new_i_size, &op->i_sectors_delta,
op->flags & BCH_WRITE_CHECK_ENOSPC);
ret = bch2_bkey_set_needs_rebalance(c, sk.k,
op->opts.background_target,
op->opts.background_compression) ?:
bch2_extent_update(trans, inum, &iter, sk.k,
&op->res,
op->new_i_size, &op->i_sectors_delta,
op->flags & BCH_WRITE_CHECK_ENOSPC);
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@ -495,7 +498,6 @@ static void __bch2_write_index(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
struct bkey_i *k;
unsigned dev;
int ret = 0;
@ -505,14 +507,6 @@ static void __bch2_write_index(struct bch_write_op *op)
goto err;
}
/*
* probably not the ideal place to hook this in, but I don't
* particularly want to plumb io_opts all the way through the btree
* update stack right now
*/
for_each_keylist_key(keys, k)
bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);
@ -816,6 +810,7 @@ static enum prep_encoded_ret {
/* Can we just write the entire extent as is? */
if (op->crc.uncompressed_size == op->crc.live_size &&
op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 &&
op->crc.compressed_size <= wp->sectors_free &&
(op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
op->incompressible)) {
@ -1091,9 +1086,7 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op,
e = bkey_s_c_to_extent(k);
extent_for_each_ptr_decode(e, p, entry) {
if (p.crc.csum_type ||
crc_is_compressed(p.crc) ||
p.has_ec)
if (crc_is_encoded(p.crc) || p.has_ec)
return false;
replicas += bch2_extent_ptr_durability(c, &p);


@ -1019,6 +1019,25 @@ err:
return ret;
}
int bch2_fs_journal_alloc(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
for_each_online_member(ca, c, i) {
if (ca->journal.nr)
continue;
int ret = bch2_dev_journal_alloc(ca);
if (ret) {
percpu_ref_put(&ca->io_ref);
return ret;
}
}
return 0;
}
/* startup/shutdown: */
static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)


@ -534,6 +534,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
unsigned nr);
int bch2_dev_journal_alloc(struct bch_dev *);
int bch2_fs_journal_alloc(struct bch_fs *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);


@ -20,6 +20,7 @@
#include "keylist.h"
#include "move.h"
#include "replicas.h"
#include "snapshot.h"
#include "super-io.h"
#include "trace.h"
@ -59,20 +60,6 @@ static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c
}
}
static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats)
{
mutex_lock(&c->data_progress_lock);
list_add(&stats->list, &c->data_progress_list);
mutex_unlock(&c->data_progress_lock);
}
static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
{
mutex_lock(&c->data_progress_lock);
list_del(&stats->list);
mutex_unlock(&c->data_progress_lock);
}
struct moving_io {
struct list_head read_list;
struct list_head io_list;
@ -156,13 +143,11 @@ static void move_read_endio(struct bio *bio)
closure_put(&ctxt->cl);
}
void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
struct btree_trans *trans)
void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
{
struct moving_io *io;
if (trans)
bch2_trans_unlock(trans);
bch2_trans_unlock(ctxt->trans);
while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
list_del(&io->read_list);
@ -170,21 +155,20 @@ void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
}
}
static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
struct btree_trans *trans)
void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
move_ctxt_wait_event(ctxt, trans,
move_ctxt_wait_event(ctxt,
!atomic_read(&ctxt->write_sectors) ||
atomic_read(&ctxt->write_sectors) != sectors_pending);
}
void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
struct bch_fs *c = ctxt->c;
struct bch_fs *c = ctxt->trans->c;
move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
closure_sync(&ctxt->cl);
EBUG_ON(atomic_read(&ctxt->write_sectors));
@ -192,16 +176,12 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt)
EBUG_ON(atomic_read(&ctxt->read_sectors));
EBUG_ON(atomic_read(&ctxt->read_ios));
if (ctxt->stats) {
progress_list_del(c, ctxt->stats);
trace_move_data(c,
atomic64_read(&ctxt->stats->sectors_moved),
atomic64_read(&ctxt->stats->keys_moved));
}
mutex_lock(&c->moving_context_lock);
list_del(&ctxt->list);
mutex_unlock(&c->moving_context_lock);
bch2_trans_put(ctxt->trans);
memset(ctxt, 0, sizeof(*ctxt));
}
void bch2_moving_ctxt_init(struct moving_context *ctxt,
@ -213,7 +193,7 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt,
{
memset(ctxt, 0, sizeof(*ctxt));
ctxt->c = c;
ctxt->trans = bch2_trans_get(c);
ctxt->fn = (void *) _RET_IP_;
ctxt->rate = rate;
ctxt->stats = stats;
@ -230,16 +210,17 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt,
mutex_lock(&c->moving_context_lock);
list_add(&ctxt->list, &c->moving_context_list);
mutex_unlock(&c->moving_context_lock);
}
if (stats) {
progress_list_add(c, stats);
stats->data_type = BCH_DATA_user;
}
void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
{
trace_move_data(c, stats);
}
void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
{
memset(stats, 0, sizeof(*stats));
stats->data_type = BCH_DATA_user;
scnprintf(stats->name, sizeof(stats->name), "%s", name);
}
@ -286,15 +267,14 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans,
bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
}
static int bch2_move_extent(struct btree_trans *trans,
struct btree_iter *iter,
struct moving_context *ctxt,
struct move_bucket_in_flight *bucket_in_flight,
struct bch_io_opts io_opts,
enum btree_id btree_id,
struct bkey_s_c k,
struct data_update_opts data_opts)
int bch2_move_extent(struct moving_context *ctxt,
struct move_bucket_in_flight *bucket_in_flight,
struct btree_iter *iter,
struct bkey_s_c k,
struct bch_io_opts io_opts,
struct data_update_opts data_opts)
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct moving_io *io;
@ -303,6 +283,8 @@ static int bch2_move_extent(struct btree_trans *trans,
unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
if (ctxt->stats)
ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
trace_move_extent2(c, k);
bch2_data_update_opts_normalize(k, &data_opts);
@ -355,7 +337,7 @@ static int bch2_move_extent(struct btree_trans *trans,
io->rbio.bio.bi_end_io = move_read_endio;
ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
io_opts, data_opts, btree_id, k);
io_opts, data_opts, iter->btree_id, k);
if (ret && ret != -BCH_ERR_unwritten_extent_update)
goto err_free_pages;
@ -367,9 +349,11 @@ static int bch2_move_extent(struct btree_trans *trans,
BUG_ON(ret);
io->write.ctxt = ctxt;
io->write.op.end_io = move_write_done;
if (ctxt->rate)
bch2_ratelimit_increment(ctxt->rate, k.k->size);
if (ctxt->stats) {
atomic64_inc(&ctxt->stats->keys_moved);
atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
@ -399,7 +383,7 @@ static int bch2_move_extent(struct btree_trans *trans,
closure_get(&ctxt->cl);
bch2_read_extent(trans, &io->rbio,
bkey_start_pos(k.k),
btree_id, k, 0,
iter->btree_id, k, 0,
BCH_READ_NODECODE|
BCH_READ_LAST_FRAGMENT);
return 0;
@ -413,45 +397,96 @@ err:
return ret;
}
static int lookup_inode(struct btree_trans *trans, struct bpos pos,
struct bch_inode_unpacked *inode)
struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
struct per_snapshot_io_opts *io_opts,
struct bkey_s_c extent_k)
{
struct bch_fs *c = trans->c;
u32 restart_count = trans->restart_count;
int ret = 0;
if (io_opts->cur_inum != extent_k.k->p.inode) {
struct btree_iter iter;
struct bkey_s_c k;
io_opts->d.nr = 0;
for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
if (k.k->p.offset != extent_k.k->p.inode)
break;
if (!bkey_is_inode(k.k))
continue;
struct bch_inode_unpacked inode;
BUG_ON(bch2_inode_unpack(k, &inode));
struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
ret = darray_push(&io_opts->d, e);
if (ret)
break;
}
bch2_trans_iter_exit(trans, &iter);
io_opts->cur_inum = extent_k.k->p.inode;
}
ret = ret ?: trans_was_restarted(trans, restart_count);
if (ret)
return ERR_PTR(ret);
if (extent_k.k->p.snapshot) {
struct snapshot_io_opts_entry *i;
darray_for_each(io_opts->d, i)
if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
return &i->io_opts;
}
return &io_opts->fs_io_opts;
}
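
Note (illustration, not part of the diff): struct per_snapshot_io_opts and struct snapshot_io_opts_entry are defined in move.h, which is not included in this excerpt. From the way bch2_move_get_io_opts() uses them above, their shape is approximately:

struct snapshot_io_opts_entry {
	u32			snapshot;
	struct bch_io_opts	io_opts;
};

struct per_snapshot_io_opts {
	u64			cur_inum;	/* inode number the entries below were built for */
	struct bch_io_opts	fs_io_opts;	/* fallback: filesystem-wide options */
	DARRAY(struct snapshot_io_opts_entry) d;
};

bch2_move_get_io_opts() rebuilds d whenever cur_inum changes, then returns the entry whose snapshot is an ancestor of the extent's snapshot, falling back to fs_io_opts.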
int bch2_move_get_io_opts_one(struct btree_trans *trans,
struct bch_io_opts *io_opts,
struct bkey_s_c extent_k)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
BTREE_ITER_ALL_SNAPSHOTS);
k = bch2_btree_iter_peek(&iter);
ret = bkey_err(k);
if (ret)
goto err;
if (!k.k || !bkey_eq(k.k->p, pos)) {
ret = -BCH_ERR_ENOENT_inode;
goto err;
/* reflink btree? */
if (!extent_k.k->p.inode) {
*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
return 0;
}
ret = bkey_is_inode(k.k) ? 0 : -EIO;
if (ret)
goto err;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
BTREE_ITER_CACHED);
ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
if (!ret && bkey_is_inode(k.k)) {
struct bch_inode_unpacked inode;
bch2_inode_unpack(k, &inode);
bch2_inode_opts_get(io_opts, trans->c, &inode);
} else {
*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
}
ret = bch2_inode_unpack(k, inode);
if (ret)
goto err;
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
return 0;
}
static int move_ratelimit(struct btree_trans *trans,
struct moving_context *ctxt)
int bch2_move_ratelimit(struct moving_context *ctxt)
{
struct bch_fs *c = trans->c;
struct bch_fs *c = ctxt->trans->c;
u64 delay;
if (ctxt->wait_on_copygc) {
bch2_trans_unlock(trans);
bch2_trans_unlock(ctxt->trans);
wait_event_killable(c->copygc_running_wq,
!c->copygc_running ||
kthread_should_stop());
@ -461,7 +496,7 @@ static int move_ratelimit(struct btree_trans *trans,
delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
if (delay) {
bch2_trans_unlock(trans);
bch2_trans_unlock(ctxt->trans);
set_current_state(TASK_INTERRUPTIBLE);
}
@ -474,7 +509,7 @@ static int move_ratelimit(struct btree_trans *trans,
schedule_timeout(delay);
if (unlikely(freezing(current))) {
move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads));
move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
try_to_freeze();
}
} while (delay);
@ -483,7 +518,7 @@ static int move_ratelimit(struct btree_trans *trans,
* XXX: these limits really ought to be per device, SSDs and hard drives
* will want different limits
*/
move_ctxt_wait_event(ctxt, trans,
move_ctxt_wait_event(ctxt,
atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
@ -492,52 +527,28 @@ static int move_ratelimit(struct btree_trans *trans,
return 0;
}
static int move_get_io_opts(struct btree_trans *trans,
struct bch_io_opts *io_opts,
struct bkey_s_c k, u64 *cur_inum)
static int bch2_move_data_btree(struct moving_context *ctxt,
struct bpos start,
struct bpos end,
move_pred_fn pred, void *arg,
enum btree_id btree_id)
{
struct bch_inode_unpacked inode;
int ret;
if (*cur_inum == k.k->p.inode)
return 0;
ret = lookup_inode(trans,
SPOS(0, k.k->p.inode, k.k->p.snapshot),
&inode);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
if (!ret)
bch2_inode_opts_get(io_opts, trans->c, &inode);
else
*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
*cur_inum = k.k->p.inode;
return 0;
}
static int __bch2_move_data(struct moving_context *ctxt,
struct bpos start,
struct bpos end,
move_pred_fn pred, void *arg,
enum btree_id btree_id)
{
struct bch_fs *c = ctxt->c;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct per_snapshot_io_opts snapshot_io_opts;
struct bch_io_opts *io_opts;
struct bkey_buf sk;
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
struct data_update_opts data_opts;
u64 cur_inum = U64_MAX;
int ret = 0, ret2;
per_snapshot_io_opts_init(&snapshot_io_opts, c);
bch2_bkey_buf_init(&sk);
if (ctxt->stats) {
ctxt->stats->data_type = BCH_DATA_user;
ctxt->stats->btree_id = btree_id;
ctxt->stats->pos = start;
ctxt->stats->pos = BBPOS(btree_id, start);
}
bch2_trans_iter_init(trans, &iter, btree_id, start,
@ -547,7 +558,7 @@ static int __bch2_move_data(struct moving_context *ctxt,
if (ctxt->rate)
bch2_ratelimit_reset(ctxt->rate);
while (!move_ratelimit(trans, ctxt)) {
while (!bch2_move_ratelimit(ctxt)) {
bch2_trans_begin(trans);
k = bch2_btree_iter_peek(&iter);
@ -564,17 +575,18 @@ static int __bch2_move_data(struct moving_context *ctxt,
break;
if (ctxt->stats)
ctxt->stats->pos = iter.pos;
ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
if (!bkey_extent_is_direct_data(k.k))
goto next_nondata;
ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
ret = PTR_ERR_OR_ZERO(io_opts);
if (ret)
continue;
memset(&data_opts, 0, sizeof(data_opts));
if (!pred(c, arg, k, &io_opts, &data_opts))
if (!pred(c, arg, k, io_opts, &data_opts))
goto next;
/*
@ -584,24 +596,20 @@ static int __bch2_move_data(struct moving_context *ctxt,
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
ret2 = bch2_move_extent(trans, &iter, ctxt, NULL,
io_opts, btree_id, k, data_opts);
ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
if (ret2) {
if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
continue;
if (ret2 == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(ctxt, trans);
bch2_move_ctxt_wait_for_io(ctxt);
continue;
}
/* XXX signal failure */
goto next;
}
if (ctxt->rate)
bch2_ratelimit_increment(ctxt->rate, k.k->size);
next:
if (ctxt->stats)
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
@ -610,59 +618,68 @@ next_nondata:
}
bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
bch2_bkey_buf_exit(&sk, c);
per_snapshot_io_opts_exit(&snapshot_io_opts);
return ret;
}
int __bch2_move_data(struct moving_context *ctxt,
struct bbpos start,
struct bbpos end,
move_pred_fn pred, void *arg)
{
struct bch_fs *c = ctxt->trans->c;
enum btree_id id;
int ret = 0;
for (id = start.btree;
id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
id++) {
ctxt->stats->pos = BBPOS(id, POS_MIN);
if (!btree_type_has_ptrs(id) ||
!bch2_btree_id_root(c, id)->b)
continue;
ret = bch2_move_data_btree(ctxt,
id == start.btree ? start.pos : POS_MIN,
id == end.btree ? end.pos : POS_MAX,
pred, arg, id);
if (ret)
break;
}
return ret;
}
int bch2_move_data(struct bch_fs *c,
enum btree_id start_btree_id, struct bpos start_pos,
enum btree_id end_btree_id, struct bpos end_pos,
struct bbpos start,
struct bbpos end,
struct bch_ratelimit *rate,
struct bch_move_stats *stats,
struct write_point_specifier wp,
bool wait_on_copygc,
move_pred_fn pred, void *arg)
{
struct moving_context ctxt;
enum btree_id id;
int ret = 0;
int ret;
bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
for (id = start_btree_id;
id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
id++) {
stats->btree_id = id;
if (id != BTREE_ID_extents &&
id != BTREE_ID_reflink)
continue;
if (!bch2_btree_id_root(c, id)->b)
continue;
ret = __bch2_move_data(&ctxt,
id == start_btree_id ? start_pos : POS_MIN,
id == end_btree_id ? end_pos : POS_MAX,
pred, arg, id);
if (ret)
break;
}
ret = __bch2_move_data(&ctxt, start, end, pred, arg);
bch2_moving_ctxt_exit(&ctxt);
return ret;
}
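
bch2_move_data() now takes its range as two struct bbpos values and gets its btree_trans from the moving_context rather than from the caller. A minimal sketch of a caller against the new interface; example_pred() and example_move_all() are hypothetical names, everything else comes from the declarations in this diff:

#include "move.h"

static bool example_pred(struct bch_fs *c, void *arg, struct bkey_s_c k,
			 struct bch_io_opts *io_opts,
			 struct data_update_opts *data_opts)
{
	/* hypothetical policy: rewrite the first pointer of every extent */
	data_opts->rewrite_ptrs = 1;
	return true;
}

static int example_move_all(struct bch_fs *c, struct bch_move_stats *stats)
{
	return bch2_move_data(c,
			      BBPOS_MIN, BBPOS_MAX,
			      NULL,	/* no rate limit */
			      stats,
			      writepoint_hashed((unsigned long) current),
			      true,	/* wait_on_copygc */
			      example_pred, NULL);
}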
int __bch2_evacuate_bucket(struct btree_trans *trans,
struct moving_context *ctxt,
int __bch2_evacuate_bucket(struct moving_context *ctxt,
struct move_bucket_in_flight *bucket_in_flight,
struct bpos bucket, int gen,
struct data_update_opts _data_opts)
{
struct bch_fs *c = ctxt->c;
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct btree_iter iter;
struct bkey_buf sk;
@ -673,7 +690,6 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
struct data_update_opts data_opts;
unsigned dirty_sectors, bucket_size;
u64 fragmentation;
u64 cur_inum = U64_MAX;
struct bpos bp_pos = POS_MIN;
int ret = 0;
@ -708,7 +724,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
goto err;
}
while (!(ret = move_ratelimit(trans, ctxt))) {
while (!(ret = bch2_move_ratelimit(ctxt))) {
bch2_trans_begin(trans);
ret = bch2_get_next_backpointer(trans, bucket, gen,
@ -737,7 +753,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
if (ret) {
bch2_trans_iter_exit(trans, &iter);
continue;
@ -758,23 +774,20 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
i++;
}
ret = bch2_move_extent(trans, &iter, ctxt,
bucket_in_flight,
io_opts, bp.btree_id, k, data_opts);
ret = bch2_move_extent(ctxt, bucket_in_flight,
&iter, k, io_opts, data_opts);
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(ctxt, trans);
bch2_move_ctxt_wait_for_io(ctxt);
continue;
}
if (ret)
goto err;
if (ctxt->rate)
bch2_ratelimit_increment(ctxt->rate, k.k->size);
if (ctxt->stats)
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
} else {
@ -825,14 +838,12 @@ int bch2_evacuate_bucket(struct bch_fs *c,
struct write_point_specifier wp,
bool wait_on_copygc)
{
struct btree_trans *trans = bch2_trans_get(c);
struct moving_context ctxt;
int ret;
bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
ret = __bch2_evacuate_bucket(trans, &ctxt, NULL, bucket, gen, data_opts);
ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts);
bch2_moving_ctxt_exit(&ctxt);
bch2_trans_put(trans);
return ret;
}
@ -849,21 +860,25 @@ static int bch2_move_btree(struct bch_fs *c,
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct btree_trans *trans = bch2_trans_get(c);
struct moving_context ctxt;
struct btree_trans *trans;
struct btree_iter iter;
struct btree *b;
enum btree_id id;
struct data_update_opts data_opts;
int ret = 0;
progress_list_add(c, stats);
bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
writepoint_ptr(&c->btree_write_point),
true);
trans = ctxt.trans;
stats->data_type = BCH_DATA_btree;
for (id = start_btree_id;
id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
id++) {
stats->btree_id = id;
stats->pos = BBPOS(id, POS_MIN);
if (!bch2_btree_id_root(c, id)->b)
continue;
@ -882,7 +897,7 @@ retry:
bpos_cmp(b->key.k.p, end_pos)) > 0)
break;
stats->pos = iter.pos;
stats->pos = BBPOS(iter.btree_id, iter.pos);
if (!pred(c, arg, b, &io_opts, &data_opts))
goto next;
@ -904,14 +919,10 @@ next:
break;
}
bch2_trans_put(trans);
if (ret)
bch_err_fn(c, ret);
bch_err_fn(c, ret);
bch2_moving_ctxt_exit(&ctxt);
bch2_btree_interior_updates_flush(c);
progress_list_del(c, stats);
return ret;
}
@ -1032,8 +1043,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
mutex_unlock(&c->sb_lock);
}
if (ret)
bch_err_fn(c, ret);
bch_err_fn(c, ret);
return ret;
}
@ -1056,14 +1066,16 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_replicas_gc2(c) ?: ret;
ret = bch2_move_data(c,
op.start_btree, op.start_pos,
op.end_btree, op.end_pos,
(struct bbpos) { op.start_btree, op.start_pos },
(struct bbpos) { op.end_btree, op.end_pos },
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
rereplicate_pred, c) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
bch2_move_stats_exit(stats, c);
break;
case BCH_DATA_OP_MIGRATE:
if (op.migrate.dev >= c->sb.nr_devices)
@ -1080,18 +1092,21 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_replicas_gc2(c) ?: ret;
ret = bch2_move_data(c,
op.start_btree, op.start_pos,
op.end_btree, op.end_pos,
(struct bbpos) { op.start_btree, op.start_pos },
(struct bbpos) { op.end_btree, op.end_pos },
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
migrate_pred, &op) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
bch2_move_stats_exit(stats, c);
break;
case BCH_DATA_OP_REWRITE_OLD_NODES:
bch2_move_stats_init(stats, "rewrite_old_nodes");
ret = bch2_scan_old_btree_nodes(c, stats);
bch2_move_stats_exit(stats, c);
break;
default:
ret = -EINVAL;
@ -1100,19 +1115,43 @@ int bch2_data_job(struct bch_fs *c,
return ret;
}
void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
prt_printf(out, "%s: data type=%s pos=",
stats->name,
bch2_data_types[stats->data_type]);
bch2_bbpos_to_text(out, stats->pos);
prt_newline(out);
printbuf_indent_add(out, 2);
prt_str(out, "keys moved: ");
prt_u64(out, atomic64_read(&stats->keys_moved));
prt_newline(out);
prt_str(out, "keys raced: ");
prt_u64(out, atomic64_read(&stats->keys_raced));
prt_newline(out);
prt_str(out, "bytes seen: ");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
prt_newline(out);
prt_str(out, "bytes moved: ");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
prt_newline(out);
prt_str(out, "bytes raced: ");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
prt_newline(out);
printbuf_indent_sub(out, 2);
}
static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{
struct bch_move_stats *stats = ctxt->stats;
struct moving_io *io;
prt_printf(out, "%s (%ps):", stats->name, ctxt->fn);
prt_newline(out);
prt_printf(out, " data type %s btree_id %s position: ",
bch2_data_types[stats->data_type],
bch2_btree_id_str(stats->btree_id));
bch2_bpos_to_text(out, stats->pos);
prt_newline(out);
bch2_move_stats_to_text(out, ctxt->stats);
printbuf_indent_add(out, 2);
prt_printf(out, "reads: ios %u/%u sectors %u/%u",
@ -1153,7 +1192,4 @@ void bch2_fs_move_init(struct bch_fs *c)
{
INIT_LIST_HEAD(&c->moving_context_list);
mutex_init(&c->moving_context_lock);
INIT_LIST_HEAD(&c->data_progress_list);
mutex_init(&c->data_progress_lock);
}

View File

@ -2,6 +2,7 @@
#ifndef _BCACHEFS_MOVE_H
#define _BCACHEFS_MOVE_H
#include "bbpos.h"
#include "bcachefs_ioctl.h"
#include "btree_iter.h"
#include "buckets.h"
@ -11,7 +12,7 @@
struct bch_read_bio;
struct moving_context {
struct bch_fs *c;
struct btree_trans *trans;
struct list_head list;
void *fn;
@ -37,10 +38,10 @@ struct moving_context {
wait_queue_head_t wait;
};
#define move_ctxt_wait_event(_ctxt, _trans, _cond) \
#define move_ctxt_wait_event(_ctxt, _cond) \
do { \
bool cond_finished = false; \
bch2_moving_ctxt_do_pending_writes(_ctxt, _trans); \
bch2_moving_ctxt_do_pending_writes(_ctxt); \
\
if (_cond) \
break; \
@ -59,22 +60,60 @@ void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
struct bch_ratelimit *, struct bch_move_stats *,
struct write_point_specifier, bool);
struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
void bch2_moving_ctxt_do_pending_writes(struct moving_context *,
struct btree_trans *);
void bch2_moving_ctxt_do_pending_writes(struct moving_context *);
void bch2_move_ctxt_wait_for_io(struct moving_context *);
int bch2_move_ratelimit(struct moving_context *);
/* Inodes in different snapshots may have different IO options: */
struct snapshot_io_opts_entry {
u32 snapshot;
struct bch_io_opts io_opts;
};
struct per_snapshot_io_opts {
u64 cur_inum;
struct bch_io_opts fs_io_opts;
DARRAY(struct snapshot_io_opts_entry) d;
};
static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c)
{
memset(io_opts, 0, sizeof(*io_opts));
io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts);
}
static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts)
{
darray_exit(&io_opts->d);
}
struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
struct per_snapshot_io_opts *, struct bkey_s_c);
int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c);
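
The body of bch2_move_get_io_opts() is not shown in this hunk; as an assumption, a per-snapshot cache like the struct above could be consulted roughly as follows (lookup_snapshot_io_opts() is a hypothetical helper, darray_for_each() is the existing bcachefs darray iterator):

static inline struct bch_io_opts *
lookup_snapshot_io_opts(struct per_snapshot_io_opts *io_opts, u32 snapshot)
{
	struct snapshot_io_opts_entry *i;

	darray_for_each(io_opts->d, i)
		if (i->snapshot == snapshot)
			return &i->io_opts;

	return NULL;	/* caller falls back to io_opts->fs_io_opts */
}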
int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
int bch2_move_extent(struct moving_context *,
struct move_bucket_in_flight *,
struct btree_iter *,
struct bkey_s_c,
struct bch_io_opts,
struct data_update_opts);
int __bch2_move_data(struct moving_context *,
struct bbpos,
struct bbpos,
move_pred_fn, void *);
int bch2_move_data(struct bch_fs *,
enum btree_id, struct bpos,
enum btree_id, struct bpos,
struct bbpos start,
struct bbpos end,
struct bch_ratelimit *,
struct bch_move_stats *,
struct write_point_specifier,
bool,
move_pred_fn, void *);
int __bch2_evacuate_bucket(struct btree_trans *,
struct moving_context *,
int __bch2_evacuate_bucket(struct moving_context *,
struct move_bucket_in_flight *,
struct bpos, int,
struct data_update_opts);
@ -88,7 +127,10 @@ int bch2_data_job(struct bch_fs *,
struct bch_move_stats *,
struct bch_ioctl_data);
void bch2_move_stats_init(struct bch_move_stats *stats, char *name);
void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
void bch2_move_stats_init(struct bch_move_stats *, char *);
void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
void bch2_fs_move_init(struct bch_fs *);

View File

@ -2,17 +2,17 @@
#ifndef _BCACHEFS_MOVE_TYPES_H
#define _BCACHEFS_MOVE_TYPES_H
#include "bbpos_types.h"
struct bch_move_stats {
enum bch_data_type data_type;
enum btree_id btree_id;
struct bpos pos;
struct list_head list;
struct bbpos pos;
char name[32];
atomic64_t keys_moved;
atomic64_t keys_raced;
atomic64_t sectors_moved;
atomic64_t sectors_seen;
atomic64_t sectors_moved;
atomic64_t sectors_raced;
};

View File

@ -101,8 +101,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
return ret;
}
static void move_buckets_wait(struct btree_trans *trans,
struct moving_context *ctxt,
static void move_buckets_wait(struct moving_context *ctxt,
struct buckets_in_flight *list,
bool flush)
{
@ -111,7 +110,7 @@ static void move_buckets_wait(struct btree_trans *trans,
while ((i = list->first)) {
if (flush)
move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count));
move_ctxt_wait_event(ctxt, !atomic_read(&i->count));
if (atomic_read(&i->count))
break;
@ -129,7 +128,7 @@ static void move_buckets_wait(struct btree_trans *trans,
kfree(i);
}
bch2_trans_unlock(trans);
bch2_trans_unlock(ctxt->trans);
}
static bool bucket_in_flight(struct buckets_in_flight *list,
@ -140,11 +139,11 @@ static bool bucket_in_flight(struct buckets_in_flight *list,
typedef DARRAY(struct move_bucket) move_buckets;
static int bch2_copygc_get_buckets(struct btree_trans *trans,
struct moving_context *ctxt,
static int bch2_copygc_get_buckets(struct moving_context *ctxt,
struct buckets_in_flight *buckets_in_flight,
move_buckets *buckets)
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
@ -152,7 +151,7 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
int ret;
move_buckets_wait(trans, ctxt, buckets_in_flight, false);
move_buckets_wait(ctxt, buckets_in_flight, false);
ret = bch2_btree_write_buffer_flush(trans);
if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
@ -188,10 +187,10 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
}
noinline
static int bch2_copygc(struct btree_trans *trans,
struct moving_context *ctxt,
static int bch2_copygc(struct moving_context *ctxt,
struct buckets_in_flight *buckets_in_flight)
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct data_update_opts data_opts = {
.btree_insert_flags = BCH_WATERMARK_copygc,
@ -202,7 +201,7 @@ static int bch2_copygc(struct btree_trans *trans,
u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
int ret = 0;
ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets);
ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets);
if (ret)
goto err;
@ -221,7 +220,7 @@ static int bch2_copygc(struct btree_trans *trans,
break;
}
ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket.k.bucket,
ret = __bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
f->bucket.k.gen, data_opts);
if (ret)
goto err;
@ -300,7 +299,6 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
static int bch2_copygc_thread(void *arg)
{
struct bch_fs *c = arg;
struct btree_trans *trans;
struct moving_context ctxt;
struct bch_move_stats move_stats;
struct io_clock *clock = &c->io_clock[WRITE];
@ -317,7 +315,6 @@ static int bch2_copygc_thread(void *arg)
}
set_freezable();
trans = bch2_trans_get(c);
bch2_move_stats_init(&move_stats, "copygc");
bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
@ -325,16 +322,16 @@ static int bch2_copygc_thread(void *arg)
false);
while (!ret && !kthread_should_stop()) {
bch2_trans_unlock(trans);
bch2_trans_unlock(ctxt.trans);
cond_resched();
if (!c->copy_gc_enabled) {
move_buckets_wait(trans, &ctxt, &buckets, true);
move_buckets_wait(&ctxt, &buckets, true);
kthread_wait_freezable(c->copy_gc_enabled);
}
if (unlikely(freezing(current))) {
move_buckets_wait(trans, &ctxt, &buckets, true);
move_buckets_wait(&ctxt, &buckets, true);
__refrigerator(false);
continue;
}
@ -345,7 +342,7 @@ static int bch2_copygc_thread(void *arg)
if (wait > clock->max_slop) {
c->copygc_wait_at = last;
c->copygc_wait = last + wait;
move_buckets_wait(trans, &ctxt, &buckets, true);
move_buckets_wait(&ctxt, &buckets, true);
trace_and_count(c, copygc_wait, c, wait, last + wait);
bch2_kthread_io_clock_wait(clock, last + wait,
MAX_SCHEDULE_TIMEOUT);
@ -355,16 +352,16 @@ static int bch2_copygc_thread(void *arg)
c->copygc_wait = 0;
c->copygc_running = true;
ret = bch2_copygc(trans, &ctxt, &buckets);
ret = bch2_copygc(&ctxt, &buckets);
c->copygc_running = false;
wake_up(&c->copygc_running_wq);
}
move_buckets_wait(trans, &ctxt, &buckets, true);
move_buckets_wait(&ctxt, &buckets, true);
rhashtable_destroy(&buckets.table);
bch2_trans_put(trans);
bch2_moving_ctxt_exit(&ctxt);
bch2_move_stats_exit(&move_stats, c);
return 0;
}

View File

@ -294,6 +294,9 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
return -EINVAL;
}
if (opt->fn.validate)
return opt->fn.validate(v, err);
return 0;
}

View File

@ -74,6 +74,7 @@ enum opt_type {
struct bch_opt_fn {
int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *);
void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
int (*validate)(u64, struct printbuf *);
};
/**

View File

@ -415,11 +415,11 @@ void bch2_prt_bitflags(struct printbuf *out,
while (list[nr])
nr++;
while (flags && (bit = __ffs(flags)) < nr) {
while (flags && (bit = __ffs64(flags)) < nr) {
if (!first)
bch2_prt_printf(out, ",");
first = false;
bch2_prt_printf(out, "%s", list[bit]);
flags ^= 1 << bit;
flags ^= BIT_ULL(bit);
}
}

View File

@ -3,13 +3,18 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "clock.h"
#include "compress.h"
#include "disk_groups.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "move.h"
#include "rebalance.h"
#include "subvolume.h"
#include "super-io.h"
#include "trace.h"
@ -17,302 +22,398 @@
#include <linux/kthread.h>
#include <linux/sched/cputime.h>
/*
* Check if an extent should be moved:
* returns -1 if it should not be moved, or
* device of pointer that should be moved, if known, or INT_MAX if unknown
*/
#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
static const char * const bch2_rebalance_state_strs[] = {
#define x(t) #t,
BCH_REBALANCE_STATES()
NULL
#undef x
};
static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum)
{
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_i_cookie *cookie;
u64 v;
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
BTREE_ITER_INTENT);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
v = k.k->type == KEY_TYPE_cookie
? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
: 0;
cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
ret = PTR_ERR_OR_ZERO(cookie);
if (ret)
goto err;
bkey_cookie_init(&cookie->k_i);
cookie->k.p = iter.pos;
cookie->v.cookie = cpu_to_le64(v + 1);
ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
__bch2_set_rebalance_needs_scan(trans, inum));
rebalance_wakeup(c);
return ret;
}
int bch2_set_fs_needs_rebalance(struct bch_fs *c)
{
return bch2_set_rebalance_needs_scan(c, 0);
}
static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
{
struct btree_iter iter;
struct bkey_s_c k;
u64 v;
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
BTREE_ITER_INTENT);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
v = k.k->type == KEY_TYPE_cookie
? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
: 0;
if (v == cookie)
ret = bch2_btree_delete_at(trans, &iter, 0);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
struct btree_iter *work_iter)
{
return !kthread_should_stop()
? bch2_btree_iter_peek(work_iter)
: bkey_s_c_null;
}
static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
int ret = PTR_ERR_OR_ZERO(n);
if (ret)
return ret;
extent_entry_drop(bkey_i_to_s(n),
(void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
}
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
struct bpos work_pos,
struct btree_iter *extent_iter,
struct data_update_opts *data_opts)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k;
bch2_trans_iter_exit(trans, extent_iter);
bch2_trans_iter_init(trans, extent_iter,
work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
work_pos,
BTREE_ITER_ALL_SNAPSHOTS);
k = bch2_btree_iter_peek_slot(extent_iter);
if (bkey_err(k))
return k;
const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL;
if (!r) {
/* raced due to btree write buffer, nothing to do */
return bkey_s_c_null;
}
memset(data_opts, 0, sizeof(*data_opts));
data_opts->rewrite_ptrs =
bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression);
data_opts->target = r->target;
if (!data_opts->rewrite_ptrs) {
/*
* device we would want to write to offline? devices in target
* changed?
*
* We'll now need a full scan before this extent is picked up
* again:
*/
int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
if (ret)
return bkey_s_c_err(ret);
return bkey_s_c_null;
}
return k;
}
noinline_for_stack
static int do_rebalance_extent(struct moving_context *ctxt,
struct bpos work_pos,
struct btree_iter *extent_iter)
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct bch_fs_rebalance *r = &trans->c->rebalance;
struct data_update_opts data_opts;
struct bch_io_opts io_opts;
struct bkey_s_c k;
struct bkey_buf sk;
int ret;
ctxt->stats = &r->work_stats;
r->state = BCH_REBALANCE_working;
bch2_bkey_buf_init(&sk);
ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
extent_iter, &data_opts));
if (ret || !k.k)
goto out;
ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
if (ret)
goto out;
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
/*
* The iterator gets unlocked by __bch2_read_extent - need to
* save a copy of @k elsewhere:
*/
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
if (ret) {
if (bch2_err_matches(ret, ENOMEM)) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(ctxt);
ret = -BCH_ERR_transaction_restart_nested;
}
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto out;
/* skip it and continue, XXX signal failure */
ret = 0;
}
out:
bch2_bkey_buf_exit(&sk, c);
return ret;
}
static bool rebalance_pred(struct bch_fs *c, void *arg,
struct bkey_s_c k,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
unsigned i;
unsigned target, compression;
data_opts->rewrite_ptrs = 0;
data_opts->target = io_opts->background_target;
data_opts->extra_replicas = 0;
data_opts->btree_insert_flags = 0;
if (k.k->p.inode) {
target = io_opts->background_target;
compression = io_opts->background_compression ?: io_opts->compression;
} else {
const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
if (io_opts->background_compression &&
!bch2_bkey_is_incompressible(k)) {
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
if (!p.ptr.cached &&
p.crc.compression_type !=
bch2_compression_opt_to_type(io_opts->background_compression))
data_opts->rewrite_ptrs |= 1U << i;
i++;
}
}
if (io_opts->background_target) {
const struct bch_extent_ptr *ptr;
i = 0;
bkey_for_each_ptr(ptrs, ptr) {
if (!ptr->cached &&
!bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
bch2_target_accepts_data(c, BCH_DATA_user, io_opts->background_target))
data_opts->rewrite_ptrs |= 1U << i;
i++;
}
target = r ? r->target : io_opts->background_target;
compression = r ? r->compression :
(io_opts->background_compression ?: io_opts->compression);
}
data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
data_opts->target = target;
return data_opts->rewrite_ptrs != 0;
}
void bch2_rebalance_add_key(struct bch_fs *c,
struct bkey_s_c k,
struct bch_io_opts *io_opts)
static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
{
struct data_update_opts update_opts = { 0 };
struct bkey_ptrs_c ptrs;
const struct bch_extent_ptr *ptr;
unsigned i;
struct btree_trans *trans = ctxt->trans;
struct bch_fs_rebalance *r = &trans->c->rebalance;
int ret;
if (!rebalance_pred(c, NULL, k, io_opts, &update_opts))
return;
bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
ctxt->stats = &r->scan_stats;
i = 0;
ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr) {
if ((1U << i) && update_opts.rewrite_ptrs)
if (atomic64_add_return(k.k->size,
&bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) ==
k.k->size)
rebalance_wakeup(c);
i++;
}
}
void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
{
if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
sectors)
rebalance_wakeup(c);
}
struct rebalance_work {
int dev_most_full_idx;
unsigned dev_most_full_percent;
u64 dev_most_full_work;
u64 dev_most_full_capacity;
u64 total_work;
};
static void rebalance_work_accumulate(struct rebalance_work *w,
u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
{
unsigned percent_full;
u64 work = dev_work + unknown_dev;
/* avoid divide by 0 */
if (!capacity)
return;
if (work < dev_work || work < unknown_dev)
work = U64_MAX;
work = min(work, capacity);
percent_full = div64_u64(work * 100, capacity);
if (percent_full >= w->dev_most_full_percent) {
w->dev_most_full_idx = idx;
w->dev_most_full_percent = percent_full;
w->dev_most_full_work = work;
w->dev_most_full_capacity = capacity;
if (!inum) {
r->scan_start = BBPOS_MIN;
r->scan_end = BBPOS_MAX;
} else {
r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0));
r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
}
if (w->total_work + dev_work >= w->total_work &&
w->total_work + dev_work >= dev_work)
w->total_work += dev_work;
}
r->state = BCH_REBALANCE_scanning;
static struct rebalance_work rebalance_work(struct bch_fs *c)
{
struct bch_dev *ca;
struct rebalance_work ret = { .dev_most_full_idx = -1 };
u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
unsigned i;
for_each_online_member(ca, c, i)
rebalance_work_accumulate(&ret,
atomic64_read(&ca->rebalance_work),
unknown_dev,
bucket_to_sector(ca, ca->mi.nbuckets -
ca->mi.first_bucket),
i);
rebalance_work_accumulate(&ret,
unknown_dev, 0, c->capacity, -1);
ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
bch2_clear_rebalance_needs_scan(trans, inum, cookie));
bch2_move_stats_exit(&r->scan_stats, trans->c);
return ret;
}
static void rebalance_work_reset(struct bch_fs *c)
static void rebalance_wait(struct bch_fs *c)
{
struct bch_fs_rebalance *r = &c->rebalance;
struct bch_dev *ca;
struct io_clock *clock = &c->io_clock[WRITE];
u64 now = atomic64_read(&clock->now);
u64 min_member_capacity = 128 * 2048;
unsigned i;
for_each_online_member(ca, c, i)
atomic64_set(&ca->rebalance_work, 0);
for_each_rw_member(ca, c, i)
min_member_capacity = min(min_member_capacity,
ca->mi.nbuckets * ca->mi.bucket_size);
atomic64_set(&c->rebalance.work_unknown_dev, 0);
r->wait_iotime_end = now + (min_member_capacity >> 6);
if (r->state != BCH_REBALANCE_waiting) {
r->wait_iotime_start = now;
r->wait_wallclock_start = ktime_get_real_ns();
r->state = BCH_REBALANCE_waiting;
}
bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
}
static unsigned long curr_cputime(void)
static int do_rebalance(struct moving_context *ctxt)
{
u64 utime, stime;
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct bch_fs_rebalance *r = &c->rebalance;
struct btree_iter rebalance_work_iter, extent_iter = { NULL };
struct bkey_s_c k;
int ret = 0;
task_cputime_adjusted(current, &utime, &stime);
return nsecs_to_jiffies(utime + stime);
bch2_move_stats_init(&r->work_stats, "rebalance_work");
bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
bch2_trans_iter_init(trans, &rebalance_work_iter,
BTREE_ID_rebalance_work, POS_MIN,
BTREE_ITER_ALL_SNAPSHOTS);
while (!bch2_move_ratelimit(ctxt) &&
!kthread_wait_freezable(r->enabled)) {
bch2_trans_begin(trans);
ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret || !k.k)
break;
ret = k.k->type == KEY_TYPE_cookie
? do_rebalance_scan(ctxt, k.k->p.inode,
le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
: do_rebalance_extent(ctxt, k.k->p, &extent_iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
break;
bch2_btree_iter_advance(&rebalance_work_iter);
}
bch2_trans_iter_exit(trans, &extent_iter);
bch2_trans_iter_exit(trans, &rebalance_work_iter);
bch2_move_stats_exit(&r->scan_stats, c);
if (!ret &&
!kthread_should_stop() &&
!atomic64_read(&r->work_stats.sectors_seen) &&
!atomic64_read(&r->scan_stats.sectors_seen)) {
bch2_trans_unlock(trans);
rebalance_wait(c);
}
bch_err_fn(c, ret);
return ret;
}
static int bch2_rebalance_thread(void *arg)
{
struct bch_fs *c = arg;
struct bch_fs_rebalance *r = &c->rebalance;
struct io_clock *clock = &c->io_clock[WRITE];
struct rebalance_work w, p;
struct bch_move_stats move_stats;
unsigned long start, prev_start;
unsigned long prev_run_time, prev_run_cputime;
unsigned long cputime, prev_cputime;
u64 io_start;
long throttle;
struct moving_context ctxt;
int ret;
set_freezable();
io_start = atomic64_read(&clock->now);
p = rebalance_work(c);
prev_start = jiffies;
prev_cputime = curr_cputime();
bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
writepoint_ptr(&c->rebalance_write_point),
true);
bch2_move_stats_init(&move_stats, "rebalance");
while (!kthread_wait_freezable(r->enabled)) {
cond_resched();
while (!kthread_should_stop() &&
!(ret = do_rebalance(&ctxt)))
;
start = jiffies;
cputime = curr_cputime();
prev_run_time = start - prev_start;
prev_run_cputime = cputime - prev_cputime;
w = rebalance_work(c);
BUG_ON(!w.dev_most_full_capacity);
if (!w.total_work) {
r->state = REBALANCE_WAITING;
kthread_wait_freezable(rebalance_work(c).total_work);
continue;
}
/*
* If there isn't much work to do, throttle cpu usage:
*/
throttle = prev_run_cputime * 100 /
max(1U, w.dev_most_full_percent) -
prev_run_time;
if (w.dev_most_full_percent < 20 && throttle > 0) {
r->throttled_until_iotime = io_start +
div_u64(w.dev_most_full_capacity *
(20 - w.dev_most_full_percent),
50);
if (atomic64_read(&clock->now) + clock->max_slop <
r->throttled_until_iotime) {
r->throttled_until_cputime = start + throttle;
r->state = REBALANCE_THROTTLED;
bch2_kthread_io_clock_wait(clock,
r->throttled_until_iotime,
throttle);
continue;
}
}
/* minimum 1 mb/sec: */
r->pd.rate.rate =
max_t(u64, 1 << 11,
r->pd.rate.rate *
max(p.dev_most_full_percent, 1U) /
max(w.dev_most_full_percent, 1U));
io_start = atomic64_read(&clock->now);
p = w;
prev_start = start;
prev_cputime = cputime;
r->state = REBALANCE_RUNNING;
memset(&move_stats, 0, sizeof(move_stats));
rebalance_work_reset(c);
bch2_move_data(c,
0, POS_MIN,
BTREE_ID_NR, POS_MAX,
/* ratelimiting disabled for now */
NULL, /* &r->pd.rate, */
&move_stats,
writepoint_ptr(&c->rebalance_write_point),
true,
rebalance_pred, NULL);
}
bch2_moving_ctxt_exit(&ctxt);
return 0;
}
void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
{
struct bch_fs_rebalance *r = &c->rebalance;
struct rebalance_work w = rebalance_work(c);
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 20);
prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx);
prt_tab(out);
prt_human_readable_u64(out, w.dev_most_full_work << 9);
prt_printf(out, "/");
prt_human_readable_u64(out, w.dev_most_full_capacity << 9);
prt_newline(out);
prt_printf(out, "total work:");
prt_tab(out);
prt_human_readable_u64(out, w.total_work << 9);
prt_printf(out, "/");
prt_human_readable_u64(out, c->capacity << 9);
prt_newline(out);
prt_printf(out, "rate:");
prt_tab(out);
prt_printf(out, "%u", r->pd.rate.rate);
prt_str(out, bch2_rebalance_state_strs[r->state]);
prt_newline(out);
printbuf_indent_add(out, 2);
switch (r->state) {
case REBALANCE_WAITING:
prt_printf(out, "waiting");
case BCH_REBALANCE_waiting: {
u64 now = atomic64_read(&c->io_clock[WRITE].now);
prt_str(out, "io wait duration: ");
bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start);
prt_newline(out);
prt_str(out, "io wait remaining: ");
bch2_prt_human_readable_s64(out, r->wait_iotime_end - now);
prt_newline(out);
prt_str(out, "duration waited: ");
bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
prt_newline(out);
break;
case REBALANCE_THROTTLED:
prt_printf(out, "throttled for %lu sec or ",
(r->throttled_until_cputime - jiffies) / HZ);
prt_human_readable_u64(out,
(r->throttled_until_iotime -
atomic64_read(&c->io_clock[WRITE].now)) << 9);
prt_printf(out, " io");
}
case BCH_REBALANCE_working:
bch2_move_stats_to_text(out, &r->work_stats);
break;
case REBALANCE_RUNNING:
prt_printf(out, "running");
case BCH_REBALANCE_scanning:
bch2_move_stats_to_text(out, &r->scan_stats);
break;
}
prt_newline(out);
printbuf_indent_sub(out, 2);
}
void bch2_rebalance_stop(struct bch_fs *c)
@ -361,6 +462,4 @@ int bch2_rebalance_start(struct bch_fs *c)
void bch2_fs_rebalance_init(struct bch_fs *c)
{
bch2_pd_controller_init(&c->rebalance.pd);
atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
}

View File

@ -4,6 +4,9 @@
#include "rebalance_types.h"
int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
int bch2_set_fs_needs_rebalance(struct bch_fs *);
static inline void rebalance_wakeup(struct bch_fs *c)
{
struct task_struct *p;
@ -15,11 +18,7 @@ static inline void rebalance_wakeup(struct bch_fs *c)
rcu_read_unlock();
}
void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
struct bch_io_opts *);
void bch2_rebalance_add_work(struct bch_fs *, u64);
void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *);
void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
void bch2_rebalance_stop(struct bch_fs *);
int bch2_rebalance_start(struct bch_fs *);

View File

@ -2,25 +2,36 @@
#ifndef _BCACHEFS_REBALANCE_TYPES_H
#define _BCACHEFS_REBALANCE_TYPES_H
#include "bbpos_types.h"
#include "move_types.h"
enum rebalance_state {
REBALANCE_WAITING,
REBALANCE_THROTTLED,
REBALANCE_RUNNING,
#define BCH_REBALANCE_STATES() \
x(waiting) \
x(working) \
x(scanning)
enum bch_rebalance_states {
#define x(t) BCH_REBALANCE_##t,
BCH_REBALANCE_STATES()
#undef x
};
struct bch_fs_rebalance {
struct task_struct __rcu *thread;
struct task_struct __rcu *thread;
struct bch_pd_controller pd;
atomic64_t work_unknown_dev;
enum bch_rebalance_states state;
u64 wait_iotime_start;
u64 wait_iotime_end;
u64 wait_wallclock_start;
enum rebalance_state state;
u64 throttled_until_iotime;
unsigned long throttled_until_cputime;
struct bch_move_stats work_stats;
unsigned enabled:1;
struct bbpos scan_start;
struct bbpos scan_end;
struct bch_move_stats scan_stats;
unsigned enabled:1;
};
#endif /* _BCACHEFS_REBALANCE_TYPES_H */
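
For reference, the BCH_REBALANCE_STATES() x-macro above is what keeps this enum and bch2_rebalance_state_strs[] in rebalance.c in sync; expanded by hand, the two definitions are equivalent to:

enum bch_rebalance_states {
	BCH_REBALANCE_waiting,
	BCH_REBALANCE_working,
	BCH_REBALANCE_scanning,
};

static const char * const bch2_rebalance_state_strs[] = {
	"waiting", "working", "scanning", NULL
};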

View File

@ -23,6 +23,7 @@
#include "logged_ops.h"
#include "move.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
#include "replicas.h"
#include "sb-clean.h"
@ -946,16 +947,12 @@ int bch2_fs_initialize(struct bch_fs *c)
for (i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
for_each_online_member(ca, c, i)
for_each_member_device(ca, c, i)
bch2_dev_usage_init(ca);
for_each_online_member(ca, c, i) {
ret = bch2_dev_journal_alloc(ca);
if (ret) {
percpu_ref_put(&ca->io_ref);
goto err;
}
}
ret = bch2_fs_journal_alloc(c);
if (ret)
goto err;
/*
* journal_res_get() will crash if called before this has
@ -973,15 +970,13 @@ int bch2_fs_initialize(struct bch_fs *c)
* btree updates
*/
bch_verbose(c, "marking superblocks");
for_each_member_device(ca, c, i) {
ret = bch2_trans_mark_dev_sb(c, ca);
if (ret) {
percpu_ref_put(&ca->ref);
goto err;
}
ret = bch2_trans_mark_dev_sbs(c);
bch_err_msg(c, ret, "marking superblocks");
if (ret)
goto err;
for_each_online_member(ca, c, i)
ca->new_fs_bucket_idx = 0;
}
ret = bch2_fs_freespace_init(c);
if (ret)

View File

@ -14,6 +14,8 @@
x(snapshots_read, PASS_ALWAYS) \
x(check_topology, 0) \
x(check_allocations, PASS_FSCK) \
x(trans_mark_dev_sbs, PASS_ALWAYS|PASS_SILENT) \
x(fs_journal_alloc, PASS_ALWAYS|PASS_SILENT) \
x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \
x(journal_replay, PASS_ALWAYS) \
x(check_alloc_info, PASS_FSCK) \
@ -32,6 +34,7 @@
x(resume_logged_ops, PASS_ALWAYS) \
x(check_inodes, PASS_FSCK) \
x(check_extents, PASS_FSCK) \
x(check_indirect_extents, PASS_FSCK) \
x(check_dirents, PASS_FSCK) \
x(check_xattrs, PASS_FSCK) \
x(check_root, PASS_FSCK) \
@ -39,6 +42,7 @@
x(check_nlinks, PASS_FSCK) \
x(delete_dead_inodes, PASS_FSCK|PASS_UNCLEAN) \
x(fix_reflink_p, 0) \
x(set_fs_needs_rebalance, 0) \
enum bch_recovery_pass {
#define x(n, when) BCH_RECOVERY_PASS_##n,

View File

@ -7,6 +7,7 @@
#include "inode.h"
#include "io_misc.h"
#include "io_write.h"
#include "rebalance.h"
#include "reflink.h"
#include "subvolume.h"
#include "super-io.h"
@ -103,21 +104,22 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
}
#endif
static inline void check_indirect_extent_deleting(struct bkey_i *new, unsigned *flags)
{
if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) {
new->k.type = KEY_TYPE_deleted;
new->k.size = 0;
set_bkey_val_u64s(&new->k, 0);
*flags &= ~BTREE_TRIGGER_INSERT;
}
}
int bch2_trans_mark_reflink_v(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_i *new,
unsigned flags)
{
if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new);
if (!r->v.refcount) {
r->k.type = KEY_TYPE_deleted;
r->k.size = 0;
set_bkey_val_u64s(&r->k, 0);
return 0;
}
}
check_indirect_extent_deleting(new, &flags);
return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags);
}
@ -132,7 +134,7 @@ int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
}
void bch2_indirect_inline_data_to_text(struct printbuf *out,
struct bch_fs *c, struct bkey_s_c k)
struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
unsigned datalen = bkey_inline_data_bytes(k.k);
@ -147,16 +149,7 @@ int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_i *new,
unsigned flags)
{
if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
struct bkey_i_indirect_inline_data *r =
bkey_i_to_indirect_inline_data(new);
if (!r->v.refcount) {
r->k.type = KEY_TYPE_deleted;
r->k.size = 0;
set_bkey_val_u64s(&r->k, 0);
}
}
check_indirect_extent_deleting(new, &flags);
return 0;
}
@ -260,6 +253,7 @@ s64 bch2_remap_range(struct bch_fs *c,
struct bpos dst_start = POS(dst_inum.inum, dst_offset);
struct bpos src_start = POS(src_inum.inum, src_offset);
struct bpos dst_end = dst_start, src_end = src_start;
struct bch_io_opts opts;
struct bpos src_want;
u64 dst_done;
u32 dst_snapshot, src_snapshot;
@ -277,6 +271,10 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_bkey_buf_init(&new_src);
trans = bch2_trans_get(c);
ret = bch2_inum_opts_get(trans, src_inum, &opts);
if (ret)
goto err;
bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
BTREE_ITER_INTENT);
bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
@ -360,10 +358,13 @@ s64 bch2_remap_range(struct bch_fs *c,
min(src_k.k->p.offset - src_want.offset,
dst_end.offset - dst_iter.pos.offset));
ret = bch2_extent_update(trans, dst_inum, &dst_iter,
new_dst.k, &disk_res,
new_i_size, i_sectors_delta,
true);
ret = bch2_bkey_set_needs_rebalance(c, new_dst.k,
opts.background_target,
opts.background_compression) ?:
bch2_extent_update(trans, dst_inum, &dst_iter,
new_dst.k, &disk_res,
new_i_size, i_sectors_delta,
true);
bch2_disk_reservation_put(c, &disk_res);
}
bch2_trans_iter_exit(trans, &dst_iter);
@ -394,7 +395,7 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_trans_iter_exit(trans, &inode_iter);
} while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
err:
bch2_trans_put(trans);
bch2_bkey_buf_exit(&new_src, c);
bch2_bkey_buf_exit(&new_dst, c);

View File

@ -948,9 +948,6 @@ int bch2_fs_start(struct bch_fs *c)
goto err;
}
for_each_online_member(ca, c, i)
bch2_sb_from_fs(c, ca);
for_each_online_member(ca, c, i)
bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now);
@ -960,12 +957,6 @@ int bch2_fs_start(struct bch_fs *c)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
for (i = 0; i < BCH_TRANSACTIONS_NR; i++) {
mutex_lock(&c->btree_transaction_stats[i].lock);
bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times);
mutex_unlock(&c->btree_transaction_stats[i].lock);
}
ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
? bch2_fs_recovery(c)
: bch2_fs_initialize(c);
@ -1591,7 +1582,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
if (BCH_MEMBER_GROUP(&dev_mi)) {
bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
if (label.allocation_failure) {
ret = -ENOMEM;
goto err;
@ -1689,13 +1680,13 @@ have_slot:
ret = bch2_trans_mark_dev_sb(c, ca);
if (ret) {
bch_err_msg(c, ret, "marking new superblock");
bch_err_msg(ca, ret, "marking new superblock");
goto err_late;
}
ret = bch2_fs_freespace_init(c);
if (ret) {
bch_err_msg(c, ret, "initializing free space");
bch_err_msg(ca, ret, "initializing free space");
goto err_late;
}
@ -1763,19 +1754,26 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
if (ca->mi.state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
if (!ca->mi.freespace_initialized) {
ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
bch_err_msg(ca, ret, "initializing free space");
if (ret)
goto err;
}
if (!ca->journal.nr) {
ret = bch2_dev_journal_alloc(ca);
bch_err_msg(ca, ret, "allocating journal");
if (ret)
goto err;
}
mutex_lock(&c->sb_lock);
struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
m->last_mount =
bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
cpu_to_le64(ktime_get_real_seconds());
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
ret = bch2_fs_freespace_init(c);
if (ret)
bch_err_msg(c, ret, "initializing free space");
up_write(&c->state_lock);
return 0;
err:

View File

@ -37,16 +37,4 @@ struct bch_member_cpu {
u8 valid;
};
struct bch_disk_group_cpu {
bool deleted;
u16 parent;
struct bch_devs_mask devs;
};
struct bch_disk_groups_cpu {
struct rcu_head rcu;
unsigned nr;
struct bch_disk_group_cpu entries[] __counted_by(nr);
};
#endif /* _BCACHEFS_SUPER_TYPES_H */

View File

@ -212,7 +212,7 @@ read_attribute(copy_gc_wait);
rw_attribute(rebalance_enabled);
sysfs_pd_controller_attribute(rebalance);
read_attribute(rebalance_work);
read_attribute(rebalance_status);
rw_attribute(promote_whole_extents);
read_attribute(new_stripes);
@ -386,8 +386,8 @@ SHOW(bch2_fs)
if (attr == &sysfs_copy_gc_wait)
bch2_copygc_wait_to_text(out, c);
if (attr == &sysfs_rebalance_work)
bch2_rebalance_work_to_text(out, c);
if (attr == &sysfs_rebalance_status)
bch2_rebalance_status_to_text(out, c);
sysfs_print(promote_whole_extents, c->promote_whole_extents);
@ -646,7 +646,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_copy_gc_wait,
&sysfs_rebalance_enabled,
&sysfs_rebalance_work,
&sysfs_rebalance_status,
sysfs_pd_controller_files(rebalance),
&sysfs_moving_ctxts,
@ -707,10 +707,8 @@ STORE(bch2_fs_opts_dir)
bch2_opt_set_by_id(&c->opts, id, v);
if ((id == Opt_background_target ||
id == Opt_background_compression) && v) {
bch2_rebalance_add_work(c, S64_MAX);
rebalance_wakeup(c);
}
id == Opt_background_compression) && v)
bch2_set_rebalance_needs_scan(c, 0);
ret = size;
err:
@ -910,13 +908,8 @@ SHOW(bch2_dev)
sysfs_print(discard, ca->mi.discard);
if (attr == &sysfs_label) {
if (ca->mi.group) {
mutex_lock(&c->sb_lock);
bch2_disk_path_to_text(out, c->disk_sb.sb,
ca->mi.group - 1);
mutex_unlock(&c->sb_lock);
}
if (ca->mi.group)
bch2_disk_path_to_text(out, c, ca->mi.group - 1);
prt_char(out, '\n');
}

View File

@ -7,6 +7,7 @@
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "keylist.h"
#include "move_types.h"
#include "opts.h"
#include "six.h"

View File

@ -767,25 +767,36 @@ DEFINE_EVENT(bkey, move_extent_alloc_mem_fail,
);
TRACE_EVENT(move_data,
TP_PROTO(struct bch_fs *c, u64 sectors_moved,
u64 keys_moved),
TP_ARGS(c, sectors_moved, keys_moved),
TP_PROTO(struct bch_fs *c,
struct bch_move_stats *stats),
TP_ARGS(c, stats),
TP_STRUCT__entry(
__field(dev_t, dev )
__field(u64, sectors_moved )
__field(dev_t, dev )
__field(u64, keys_moved )
__field(u64, keys_raced )
__field(u64, sectors_seen )
__field(u64, sectors_moved )
__field(u64, sectors_raced )
),
TP_fast_assign(
__entry->dev = c->dev;
__entry->sectors_moved = sectors_moved;
__entry->keys_moved = keys_moved;
__entry->dev = c->dev;
__entry->keys_moved = atomic64_read(&stats->keys_moved);
__entry->keys_raced = atomic64_read(&stats->keys_raced);
__entry->sectors_seen = atomic64_read(&stats->sectors_seen);
__entry->sectors_moved = atomic64_read(&stats->sectors_moved);
__entry->sectors_raced = atomic64_read(&stats->sectors_raced);
),
TP_printk("%d,%d sectors_moved %llu keys_moved %llu",
TP_printk("%d,%d keys moved %llu raced %llu"
"sectors seen %llu moved %llu raced %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->sectors_moved, __entry->keys_moved)
__entry->keys_moved,
__entry->keys_raced,
__entry->sectors_seen,
__entry->sectors_moved,
__entry->sectors_raced)
);
TRACE_EVENT(evacuate_bucket,

View File

@ -590,7 +590,7 @@ err:
if (value &&
(opt_id == Opt_background_compression ||
opt_id == Opt_background_target))
bch2_rebalance_add_work(c, inode->v.i_blocks);
bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
return bch2_err_class(ret);
}

View File

@ -22,6 +22,10 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
panic("closure_put_after_sub: bogus flags %x remaining %i", flags, r);
if (!r) {
smp_acquire__after_ctrl_dep();
cl->closure_get_happened = false;
if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
atomic_set(&cl->remaining,
CLOSURE_REMAINING_INITIALIZER);
@ -44,7 +48,7 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
/* For clearing flags with the same atomic op as a put */
void closure_sub(struct closure *cl, int v)
{
closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
closure_put_after_sub(cl, atomic_sub_return_release(v, &cl->remaining));
}
EXPORT_SYMBOL(closure_sub);
@ -53,7 +57,7 @@ EXPORT_SYMBOL(closure_sub);
*/
void closure_put(struct closure *cl)
{
closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
closure_put_after_sub(cl, atomic_dec_return_release(&cl->remaining));
}
EXPORT_SYMBOL(closure_put);
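
The race fix here pairs release ordering on the final decrement (atomic_dec_return_release() / atomic_sub_return_release()) with smp_acquire__after_ctrl_dep() on the remaining == 0 path, so everything the last putter wrote is visible before the closure is reused or destroyed. A minimal userspace analogy of that pattern, assuming C11 atomics rather than the kernel primitives:

#include <stdatomic.h>
#include <stdbool.h>

struct ref {
	atomic_int	remaining;
	int		result;		/* written by the last putter before the drop */
};

static bool ref_put(struct ref *r)
{
	/* release: like atomic_dec_return_release() above */
	if (atomic_fetch_sub_explicit(&r->remaining, 1, memory_order_release) == 1) {
		/* acquire: like smp_acquire__after_ctrl_dep(), orders the
		 * zero observation against later reads of *r */
		atomic_thread_fence(memory_order_acquire);
		return true;	/* sole owner now; safe to read r->result and free */
	}
	return false;
}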
@ -91,6 +95,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
return false;
cl->closure_get_happened = true;
closure_set_waiting(cl, _RET_IP_);
atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
llist_add(&cl->list, &waitlist->list);