Update bcachefs sources to 1381a92a5d23 bcachefs: sysfs trigger_check_inconsistent_replicas

Kent Overstreet 2025-11-06 17:31:00 -05:00
parent 16ad30e11d
commit 12cf9df98b
48 changed files with 2494 additions and 736 deletions

View File

@ -1 +1 @@
d13053c5782b5c9993cf9cfba52dd19c0732091b 1381a92a5d23df9d9bbf6cae1ecf0f3eb39f4b0b

View File

@ -250,7 +250,9 @@ fsck_err:
return ret; return ret;
} }
void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k) void bch2_accounting_key_to_text(struct printbuf *out,
struct bch_fs *c,
struct disk_accounting_pos *k)
{ {
if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) { if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) {
prt_printf(out, "unknown type %u", k->type); prt_printf(out, "unknown type %u", k->type);
@ -283,6 +285,17 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po
prt_str(out, "btree="); prt_str(out, "btree=");
bch2_btree_id_to_text(out, k->btree.id); bch2_btree_id_to_text(out, k->btree.id);
break; break;
case BCH_DISK_ACCOUNTING_rebalance_work_v2:
bch2_prt_rebalance_accounting_type(out, k->rebalance_work_v2.type);
break;
case BCH_DISK_ACCOUNTING_dev_leaving: {
struct bch_dev *ca = c ? bch2_dev_rcu_noerror(c, k->dev_leaving.dev) : NULL;
if (ca)
prt_printf(out, "%s ", ca->name);
else
prt_printf(out, "%u ", k->dev_leaving.dev);
break;
}
} }
} }
@ -292,7 +305,7 @@ void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey
struct disk_accounting_pos acc_k; struct disk_accounting_pos acc_k;
bpos_to_disk_accounting_pos(&acc_k, k.k->p); bpos_to_disk_accounting_pos(&acc_k, k.k->p);
bch2_accounting_key_to_text(out, &acc_k); bch2_accounting_key_to_text(out, c, &acc_k);
for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++) for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++)
prt_printf(out, " %lli", acc.v->d[i]); prt_printf(out, " %lli", acc.v->d[i]);
@ -607,7 +620,7 @@ int bch2_gc_accounting_done(struct bch_fs *c)
if (memcmp(dst_v, src_v, nr * sizeof(u64))) { if (memcmp(dst_v, src_v, nr * sizeof(u64))) {
printbuf_reset(&buf); printbuf_reset(&buf);
prt_str(&buf, "accounting mismatch for "); prt_str(&buf, "accounting mismatch for ");
bch2_accounting_key_to_text(&buf, &acc_k); bch2_accounting_key_to_text(&buf, c, &acc_k);
prt_str(&buf, ":\n got"); prt_str(&buf, ":\n got");
for (unsigned j = 0; j < nr; j++) for (unsigned j = 0; j < nr; j++)
@ -672,7 +685,7 @@ static int disk_accounting_invalid_dev(struct btree_trans *trans,
unsigned dev) unsigned dev)
{ {
CLASS(printbuf, buf)(); CLASS(printbuf, buf)();
bch2_accounting_key_to_text(&buf, acc); bch2_accounting_key_to_text(&buf, trans->c, acc);
int ret = 0; int ret = 0;
if (fsck_err(trans, accounting_to_invalid_device, if (fsck_err(trans, accounting_to_invalid_device,
@ -719,7 +732,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
trans, accounting_replicas_not_marked, trans, accounting_replicas_not_marked,
"accounting not marked in superblock replicas\n%s", "accounting not marked in superblock replicas\n%s",
(printbuf_reset(&buf), (printbuf_reset(&buf),
bch2_accounting_key_to_text(&buf, acc), bch2_accounting_key_to_text(&buf, c, acc),
buf.buf))) buf.buf)))
try(bch2_mark_replicas(c, &r.e)); try(bch2_mark_replicas(c, &r.e));
break; break;
@ -849,7 +862,7 @@ static int accounting_read_mem_fixups(struct btree_trans *trans)
bch2_log_msg_start(c, &underflow_err); bch2_log_msg_start(c, &underflow_err);
prt_printf(&underflow_err, "Accounting underflow for\n"); prt_printf(&underflow_err, "Accounting underflow for\n");
} }
bch2_accounting_key_to_text(&underflow_err, &k); bch2_accounting_key_to_text(&underflow_err, c, &k);
for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++) for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++)
prt_printf(&underflow_err, " %lli", v[j]); prt_printf(&underflow_err, " %lli", v[j]);

View File

@ -124,7 +124,7 @@ int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool);
int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c,
struct bkey_validate_context); struct bkey_validate_context);
void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *); void bch2_accounting_key_to_text(struct printbuf *, struct bch_fs *, struct disk_accounting_pos *);
void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_accounting_swab(const struct bch_fs *, struct bkey_s); void bch2_accounting_swab(const struct bch_fs *, struct bkey_s);

View File

@ -110,7 +110,9 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
x(snapshot, 5, 1) \ x(snapshot, 5, 1) \
x(btree, 6, 3) \ x(btree, 6, 3) \
x(rebalance_work, 7, 1) \ x(rebalance_work, 7, 1) \
x(inum, 8, 3) x(inum, 8, 3) \
x(rebalance_work_v2, 9, 1) \
x(dev_leaving, 10, 1)
enum disk_accounting_type { enum disk_accounting_type {
#define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr, #define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr,
@ -210,6 +212,19 @@ struct bch_acct_inum {
struct bch_acct_rebalance_work { struct bch_acct_rebalance_work {
}; };
struct bch_acct_rebalance_work_v2 {
__u8 type;
};
struct bch_acct_dev_leaving {
__u32 dev;
};
/*
* XXX: need per-device counters for "how much data are we going to move off of
* this device"
*/
struct disk_accounting_pos { struct disk_accounting_pos {
union { union {
struct { struct {
@ -224,6 +239,8 @@ struct disk_accounting_pos {
struct bch_acct_btree btree; struct bch_acct_btree btree;
struct bch_acct_rebalance_work rebalance_work; struct bch_acct_rebalance_work rebalance_work;
struct bch_acct_inum inum; struct bch_acct_inum inum;
struct bch_acct_rebalance_work_v2 rebalance_work_v2;
struct bch_acct_dev_leaving dev_leaving;
} __packed; } __packed;
} __packed; } __packed;
struct bpos _pad; struct bpos _pad;

View File

@ -317,8 +317,7 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
if (do_update) { if (do_update) {
struct bkey_i *new = struct bkey_i *new =
errptr_try(bch2_trans_kmalloc(trans, bkey_bytes(k.k) + errptr_try(bch2_trans_kmalloc(trans, BKEY_EXTENT_U64s_MAX * sizeof(u64)));
sizeof(struct bch_extent_rebalance)));
bkey_reassemble(new, k); bkey_reassemble(new, k);
scoped_guard(rcu) scoped_guard(rcu)
@ -386,7 +385,7 @@ found:
struct bch_inode_opts opts; struct bch_inode_opts opts;
try(bch2_bkey_get_io_opts(trans, NULL, k, &opts)); try(bch2_bkey_get_io_opts(trans, NULL, k, &opts));
try(bch2_bkey_set_needs_rebalance(c, &opts, new, SET_NEEDS_REBALANCE_opt_change, 0)); try(bch2_bkey_set_needs_rebalance(trans, NULL, &opts, new, SET_NEEDS_REBALANCE_opt_change, 0));
if (!(flags & BTREE_TRIGGER_is_root)) { if (!(flags & BTREE_TRIGGER_is_root)) {
CLASS(btree_node_iter, iter)(trans, btree, new->k.p, 0, level, CLASS(btree_node_iter, iter)(trans, btree, new->k.p, 0, level,
@ -889,7 +888,7 @@ int bch2_trigger_extent(struct btree_trans *trans,
try(__trigger_extent(trans, btree, level, new.s_c, try(__trigger_extent(trans, btree, level, new.s_c,
flags & ~BTREE_TRIGGER_overwrite)); flags & ~BTREE_TRIGGER_overwrite));
try(bch2_trigger_extent_rebalance(trans, old, new.s_c, flags)); try(bch2_trigger_extent_rebalance(trans, btree, level, old, new, flags));
} }
return 0; return 0;

View File

@ -3,6 +3,8 @@
#include "alloc/disk_groups.h" #include "alloc/disk_groups.h"
#include "data/rebalance.h"
#include "init/dev.h" #include "init/dev.h"
#include "sb/members.h" #include "sb/members.h"
@ -469,9 +471,18 @@ int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
{ {
guard(mutex)(&c->sb_lock); struct rebalance_scan s = { .type = REBALANCE_SCAN_pending };
return __bch2_dev_group_set(c, ca, name) ?:
bch2_write_super(c); try(bch2_set_rebalance_needs_scan(c, s, false));
/* bch2_rebalance_wakeup_pending goes here */
scoped_guard(mutex, &c->sb_lock) {
try(__bch2_dev_group_set(c, ca, name));
try(bch2_write_super(c));
}
try(bch2_set_rebalance_needs_scan(c, s, true));
return 0;
} }
int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,

View File

@ -16,25 +16,40 @@ DEFINE_CLASS(bch_replicas_cpu, struct bch_replicas_cpu,
kfree(_T.entries), kfree(_T.entries),
(struct bch_replicas_cpu) {}, void) (struct bch_replicas_cpu) {}, void)
static inline struct bch_replicas_entry_v1 * static inline struct bch_replicas_entry_cpu *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{ {
return (void *) r->entries + r->entry_size * i; return (void *) r->entries + r->entry_size * i;
} }
static inline unsigned __cpu_replicas_entry_bytes(unsigned v1_bytes)
{
return offsetof(struct bch_replicas_entry_cpu, e) + v1_bytes;
}
static inline unsigned cpu_replicas_entry_bytes(struct bch_replicas_entry_cpu *e)
{
return __cpu_replicas_entry_bytes(replicas_entry_bytes(&e->e));
}
#define for_each_cpu_replicas_entry(_r, _i) \ #define for_each_cpu_replicas_entry(_r, _i) \
for (struct bch_replicas_entry_v1 *_i = (_r)->entries; \ for (struct bch_replicas_entry_cpu *_i = (_r)->entries; \
(void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size; \ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size; \
_i = (void *) (_i) + (_r)->entry_size) _i = (void *) (_i) + (_r)->entry_size)
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *); struct bch_replicas_cpu *);
/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */ static int cpu_replicas_entry_cmp(const struct bch_replicas_entry_cpu *l,
static int bch2_memcmp(const void *l, const void *r, const void *priv) const struct bch_replicas_entry_cpu *r,
size_t size)
{ {
size_t size = (size_t) priv; return memcmp(&l->e, &r->e, size - offsetof(struct bch_replicas_entry_cpu, e));
return memcmp(l, r, size); }
static int cpu_replicas_entry_cmp_r(const void *l, const void *r, const void *priv)
{
return cpu_replicas_entry_cmp(l, r, (size_t) priv);
} }
/* Replicas tracking - in memory: */ /* Replicas tracking - in memory: */
@ -60,7 +75,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{ {
eytzinger0_sort_r(r->entries, r->nr, r->entry_size, eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
bch2_memcmp, NULL, (void *)(size_t)r->entry_size); cpu_replicas_entry_cmp_r, NULL,
(void *)(size_t)r->entry_size);
} }
static void bch2_replicas_entry_v0_to_text(struct printbuf *out, static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
@ -85,6 +101,13 @@ void bch2_replicas_entry_to_text(struct printbuf *out,
prt_printf(out, "]"); prt_printf(out, "]");
} }
static void bch2_replicas_entry_cpu_to_text(struct printbuf *out,
struct bch_replicas_entry_cpu *e)
{
prt_printf(out, "ref=%u ", atomic_read(&e->ref));
bch2_replicas_entry_to_text(out, &e->e);
}
static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r, static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r,
struct bch_sb *sb, struct bch_sb *sb,
struct printbuf *err) struct printbuf *err)
@ -151,7 +174,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out,
prt_printf(out, " "); prt_printf(out, " ");
first = false; first = false;
bch2_replicas_entry_to_text(out, i); bch2_replicas_entry_cpu_to_text(out, i);
} }
} }
@ -232,6 +255,44 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
bch2_replicas_entry_sort(e); bch2_replicas_entry_sort(e);
} }
/* @l is bch_replicas_entry_v1, @r is bch_replicas_entry_cpu */
static int replicas_entry_search_cmp(const void *_l, const void *_r, const void *priv)
{
const struct bch_replicas_entry_v1 *l = _l;
const struct bch_replicas_entry_cpu *r = _r;
size_t size = (size_t) priv;
return memcmp(l, &r->e, size);
}
static inline struct bch_replicas_entry_cpu *
replicas_entry_search(struct bch_replicas_cpu *r,
struct bch_replicas_entry_v1 *search)
{
verify_replicas_entry(search);
size_t entry_size = replicas_entry_bytes(search);
int idx = likely(__cpu_replicas_entry_bytes(entry_size) <= r->entry_size)
? eytzinger0_find_r(r->entries, r->nr, r->entry_size,
replicas_entry_search_cmp,
(void *) entry_size, search)
: -1;
return idx >= 0 ? cpu_replicas_entry(r, idx) : NULL;
}
bool bch2_replicas_marked_locked(struct bch_fs *c,
struct bch_replicas_entry_v1 *search)
{
return !search->nr_devs || replicas_entry_search(&c->replicas, search);
}
bool bch2_replicas_marked(struct bch_fs *c,
struct bch_replicas_entry_v1 *search)
{
guard(percpu_read)(&c->mark_lock);
return bch2_replicas_marked_locked(c, search);
}
static struct bch_replicas_cpu static struct bch_replicas_cpu
cpu_replicas_add_entry(struct bch_fs *c, cpu_replicas_add_entry(struct bch_fs *c,
struct bch_replicas_cpu *old, struct bch_replicas_cpu *old,
@ -240,9 +301,12 @@ cpu_replicas_add_entry(struct bch_fs *c,
struct bch_replicas_cpu new = { struct bch_replicas_cpu new = {
.nr = old->nr + 1, .nr = old->nr + 1,
.entry_size = max_t(unsigned, old->entry_size, .entry_size = max_t(unsigned, old->entry_size,
replicas_entry_bytes(new_entry)), __cpu_replicas_entry_bytes(replicas_entry_bytes(new_entry))),
}; };
/* alignment */
new.entry_size = round_up(new.entry_size, sizeof(atomic_t));
new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL); new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
if (!new.entries) if (!new.entries)
return new; return new;
@ -252,7 +316,7 @@ cpu_replicas_add_entry(struct bch_fs *c,
cpu_replicas_entry(old, i), cpu_replicas_entry(old, i),
old->entry_size); old->entry_size);
memcpy(cpu_replicas_entry(&new, old->nr), memcpy(&cpu_replicas_entry(&new, old->nr)->e,
new_entry, new_entry,
replicas_entry_bytes(new_entry)); replicas_entry_bytes(new_entry));
@ -260,152 +324,56 @@ cpu_replicas_add_entry(struct bch_fs *c,
return new; return new;
} }
static inline struct bch_replicas_entry_v1 *
replicas_entry_search(struct bch_replicas_cpu *r,
struct bch_replicas_entry_v1 *search)
{
verify_replicas_entry(search);
size_t entry_size = replicas_entry_bytes(search);
int idx = likely(entry_size <= r->entry_size)
? eytzinger0_find_r(r->entries, r->nr, r->entry_size,
bch2_memcmp, (void *) entry_size, search)
: -1;
return idx >= 0 ? cpu_replicas_entry(r, idx) : NULL;
}
bool bch2_replicas_marked_locked(struct bch_fs *c,
struct bch_replicas_entry_v1 *search)
{
return !search->nr_devs ||
(replicas_entry_search(&c->replicas, search) &&
(likely((!c->replicas_gc.entries)) ||
replicas_entry_search(&c->replicas_gc, search)));
}
bool bch2_replicas_marked(struct bch_fs *c,
struct bch_replicas_entry_v1 *search)
{
guard(percpu_read)(&c->mark_lock);
return bch2_replicas_marked_locked(c, search);
}
noinline noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c, static int bch2_mark_replicas_slowpath(struct bch_fs *c,
struct bch_replicas_entry_v1 *new_entry) struct bch_replicas_entry_v1 *new_entry,
unsigned ref)
{ {
verify_replicas_entry(new_entry); verify_replicas_entry(new_entry);
CLASS(bch_replicas_cpu, new_r)();
CLASS(bch_replicas_cpu, new_gc)();
guard(mutex)(&c->sb_lock); guard(mutex)(&c->sb_lock);
bool write_sb = false;
if (c->replicas_gc.entries &&
!replicas_entry_search(&c->replicas_gc, new_entry)) {
new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
if (!new_gc.entries)
return bch_err_throw(c, ENOMEM_cpu_replicas);
}
if (!replicas_entry_search(&c->replicas, new_entry)) {
new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
if (!new_r.entries)
return bch_err_throw(c, ENOMEM_cpu_replicas);
try(bch2_cpu_replicas_to_sb_replicas(c, &new_r));
}
if (!new_r.entries &&
!new_gc.entries)
return 0;
/* allocations done, now commit: */
if (new_r.entries)
bch2_write_super(c);
/* don't update in memory replicas until changes are persistent */
scoped_guard(percpu_write, &c->mark_lock) { scoped_guard(percpu_write, &c->mark_lock) {
if (new_r.entries) if (!replicas_entry_search(&c->replicas, new_entry)) {
CLASS(bch_replicas_cpu, new_r)();
new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
if (!new_r.entries)
return bch_err_throw(c, ENOMEM_cpu_replicas);
try(bch2_cpu_replicas_to_sb_replicas(c, &new_r));
swap(c->replicas, new_r); swap(c->replicas, new_r);
if (new_gc.entries) write_sb = true;
swap(new_gc, c->replicas_gc); }
atomic_add(ref, &replicas_entry_search(&c->replicas, new_entry)->ref);
} }
/* After dropping mark_lock */
if (write_sb)
bch2_write_super(c);
return 0; return 0;
} }
int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r) int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{ {
return likely(bch2_replicas_marked(c, r)) return likely(bch2_replicas_marked(c, r))
? 0 : bch2_mark_replicas_slowpath(c, r); ? 0 : bch2_mark_replicas_slowpath(c, r, 0);
} }
/* static void __replicas_entry_kill(struct bch_fs *c, struct bch_replicas_entry_cpu *e)
* Old replicas_gc mechanism: only used for journal replicas entries now, should
* die at some point:
*/
int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{ {
lockdep_assert_held(&c->replicas_gc_lock); struct bch_replicas_cpu *r = &c->replicas;
guard(mutex)(&c->sb_lock); memcpy(e, cpu_replicas_entry(r, --r->nr), r->entry_size);
scoped_guard(percpu_write, &c->mark_lock) { bch2_cpu_replicas_sort(r);
ret = ret ?:
bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
if (!ret)
swap(c->replicas, c->replicas_gc);
kfree(c->replicas_gc.entries); int ret = bch2_cpu_replicas_to_sb_replicas(c, r);
c->replicas_gc.entries = NULL; if (WARN(ret, "bch2_cpu_replicas_to_sb_replicas() error: %s", bch2_err_str(ret)))
} return;
if (!ret)
bch2_write_super(c);
return ret;
}
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
lockdep_assert_held(&c->replicas_gc_lock);
guard(mutex)(&c->sb_lock);
BUG_ON(c->replicas_gc.entries);
c->replicas_gc.nr = 0;
c->replicas_gc.entry_size = 0;
for_each_cpu_replicas_entry(&c->replicas, e) {
/* Preserve unknown data types */
if (e->data_type >= BCH_DATA_NR ||
!(BIT(e->data_type) & typemask)) {
c->replicas_gc.nr++;
c->replicas_gc.entry_size =
max_t(unsigned, c->replicas_gc.entry_size,
replicas_entry_bytes(e));
}
}
c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
c->replicas_gc.entry_size,
GFP_KERNEL);
if (!c->replicas_gc.entries) {
bch_err(c, "error allocating c->replicas_gc");
return bch_err_throw(c, ENOMEM_replicas_gc);
}
unsigned i = 0;
for_each_cpu_replicas_entry(&c->replicas, e)
if (e->data_type >= BCH_DATA_NR ||
!(BIT(e->data_type) & typemask))
memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
e, c->replicas_gc.entry_size);
bch2_cpu_replicas_sort(&c->replicas_gc);
return 0;
} }
void bch2_replicas_entry_kill(struct bch_fs *c, struct bch_replicas_entry_v1 *kill) void bch2_replicas_entry_kill(struct bch_fs *c, struct bch_replicas_entry_v1 *kill)
@ -413,18 +381,95 @@ void bch2_replicas_entry_kill(struct bch_fs *c, struct bch_replicas_entry_v1 *ki
lockdep_assert_held(&c->mark_lock); lockdep_assert_held(&c->mark_lock);
lockdep_assert_held(&c->sb_lock); lockdep_assert_held(&c->sb_lock);
struct bch_replicas_cpu *r = &c->replicas; struct bch_replicas_entry_cpu *e = replicas_entry_search(&c->replicas, kill);
struct bch_replicas_entry_v1 *e = replicas_entry_search(&c->replicas, kill);
if (WARN(!e, "replicas entry not found in sb")) if (WARN(!e, "replicas entry not found in sb"))
return; return;
memcpy(e, cpu_replicas_entry(r, --r->nr), r->entry_size); __replicas_entry_kill(c, e);
bch2_cpu_replicas_sort(r); /* caller does write_super() after dropping mark_lock */
}
int ret = bch2_cpu_replicas_to_sb_replicas(c, r); void bch2_replicas_entry_put_many(struct bch_fs *c, struct bch_replicas_entry_v1 *r, unsigned nr)
WARN(ret, "bch2_cpu_replicas_to_sb_replicas() error: %s", bch2_err_str(ret)); {
if (!r->nr_devs)
return;
BUG_ON(r->data_type != BCH_DATA_journal);
verify_replicas_entry(r);
scoped_guard(percpu_read, &c->mark_lock) {
struct bch_replicas_entry_cpu *e = replicas_entry_search(&c->replicas, r);
int v = atomic_sub_return(nr, &e->ref);
BUG_ON(v < 0);
if (v)
return;
}
guard(mutex)(&c->sb_lock);
scoped_guard(percpu_write, &c->mark_lock) {
struct bch_replicas_entry_cpu *e = replicas_entry_search(&c->replicas, r);
if (e && !atomic_read(&e->ref))
__replicas_entry_kill(c, e);
}
bch2_write_super(c);
}
static inline bool bch2_replicas_entry_get_inmem(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
guard(percpu_read)(&c->mark_lock);
struct bch_replicas_entry_cpu *e = replicas_entry_search(&c->replicas, r);
if (e)
atomic_inc(&e->ref);
return e != NULL;
}
int bch2_replicas_entry_get(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
if (!r->nr_devs)
return 0;
BUG_ON(r->data_type != BCH_DATA_journal);
verify_replicas_entry(r);
return bch2_replicas_entry_get_inmem(c, r)
? 0
: bch2_mark_replicas_slowpath(c, r, 1);
}
int bch2_replicas_gc_reffed(struct bch_fs *c)
{
bool write_sb = false;
guard(mutex)(&c->sb_lock);
scoped_guard(percpu_write, &c->mark_lock) {
unsigned dst = 0;
for (unsigned i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry_cpu *e =
cpu_replicas_entry(&c->replicas, i);
if (e->e.data_type != BCH_DATA_journal ||
atomic_read(&e->ref))
memcpy(cpu_replicas_entry(&c->replicas, dst++),
e,
c->replicas.entry_size);
}
if (c->replicas.nr != dst) {
c->replicas.nr = dst;
bch2_cpu_replicas_sort(&c->replicas);
try(bch2_cpu_replicas_to_sb_replicas(c, &c->replicas));
}
}
if (write_sb)
bch2_write_super(c);
return 0;
} }
/* Replicas tracking - superblock: */ /* Replicas tracking - superblock: */
@ -441,6 +486,9 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
nr++; nr++;
} }
entry_size = __cpu_replicas_entry_bytes(entry_size);
entry_size = round_up(entry_size, sizeof(atomic_t));
cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
if (!cpu_r->entries) if (!cpu_r->entries)
return -BCH_ERR_ENOMEM_cpu_replicas; return -BCH_ERR_ENOMEM_cpu_replicas;
@ -448,10 +496,10 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
cpu_r->nr = nr; cpu_r->nr = nr;
cpu_r->entry_size = entry_size; cpu_r->entry_size = entry_size;
for_each_replicas_entry(sb_r, e) { for_each_replicas_entry(sb_r, src) {
struct bch_replicas_entry_v1 *dst = cpu_replicas_entry(cpu_r, idx++); struct bch_replicas_entry_cpu *dst = cpu_replicas_entry(cpu_r, idx++);
memcpy(dst, e, replicas_entry_bytes(e)); memcpy(&dst->e, src, replicas_entry_bytes(src));
bch2_replicas_entry_sort(dst); bch2_replicas_entry_sort(&dst->e);
} }
return 0; return 0;
@ -469,9 +517,13 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
nr++; nr++;
} }
entry_size = __cpu_replicas_entry_bytes(entry_size);
entry_size += sizeof(struct bch_replicas_entry_v1) - entry_size += sizeof(struct bch_replicas_entry_v1) -
sizeof(struct bch_replicas_entry_v0); sizeof(struct bch_replicas_entry_v0);
entry_size = round_up(entry_size, sizeof(atomic_t));
cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
if (!cpu_r->entries) if (!cpu_r->entries)
return -BCH_ERR_ENOMEM_cpu_replicas; return -BCH_ERR_ENOMEM_cpu_replicas;
@ -480,14 +532,14 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
cpu_r->entry_size = entry_size; cpu_r->entry_size = entry_size;
for_each_replicas_entry(sb_r, src) { for_each_replicas_entry(sb_r, src) {
struct bch_replicas_entry_v1 *dst = struct bch_replicas_entry_cpu *dst =
cpu_replicas_entry(cpu_r, idx++); cpu_replicas_entry(cpu_r, idx++);
dst->data_type = src->data_type; dst->e.data_type = src->data_type;
dst->nr_devs = src->nr_devs; dst->e.nr_devs = src->nr_devs;
dst->nr_required = 1; dst->e.nr_required = 1;
memcpy(dst->devs, src->devs, src->nr_devs); memcpy(dst->e.devs, src->devs, src->nr_devs);
bch2_replicas_entry_sort(dst); bch2_replicas_entry_sort(&dst->e);
} }
return 0; return 0;
@ -495,6 +547,12 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{ {
/*
* If called after fs is started (after journal read), we'll be blowing
* away refcounts
*/
BUG_ON(test_bit(BCH_FS_started, &c->flags));
struct bch_sb_field_replicas *sb_v1; struct bch_sb_field_replicas *sb_v1;
struct bch_sb_field_replicas_v0 *sb_v0; struct bch_sb_field_replicas_v0 *sb_v0;
CLASS(bch_replicas_cpu, new_r)(); CLASS(bch_replicas_cpu, new_r)();
@ -522,7 +580,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
bytes = sizeof(struct bch_sb_field_replicas); bytes = sizeof(struct bch_sb_field_replicas);
for_each_cpu_replicas_entry(r, src) for_each_cpu_replicas_entry(r, src)
bytes += replicas_entry_bytes(src) - 1; bytes += replicas_entry_bytes(&src->e) - 1;
sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0, sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
DIV_ROUND_UP(bytes, sizeof(u64))); DIV_ROUND_UP(bytes, sizeof(u64)));
@ -538,9 +596,9 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
dst = sb_r->entries; dst = sb_r->entries;
for_each_cpu_replicas_entry(r, src) { for_each_cpu_replicas_entry(r, src) {
dst->data_type = src->data_type; dst->data_type = src->e.data_type;
dst->nr_devs = src->nr_devs; dst->nr_devs = src->e.nr_devs;
memcpy(dst->devs, src->devs, src->nr_devs); memcpy(dst->devs, src->e.devs, src->e.nr_devs);
dst = replicas_entry_next(dst); dst = replicas_entry_next(dst);
@ -561,8 +619,8 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
bytes = sizeof(struct bch_sb_field_replicas); bytes = sizeof(struct bch_sb_field_replicas);
for_each_cpu_replicas_entry(r, src) { for_each_cpu_replicas_entry(r, src) {
bytes += replicas_entry_bytes(src); bytes += replicas_entry_bytes(&src->e);
if (src->nr_required != 1) if (src->e.nr_required != 1)
need_v1 = true; need_v1 = true;
} }
@ -583,7 +641,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
dst = sb_r->entries; dst = sb_r->entries;
for_each_cpu_replicas_entry(r, src) { for_each_cpu_replicas_entry(r, src) {
memcpy(dst, src, replicas_entry_bytes(src)); memcpy(dst, &src->e, replicas_entry_bytes(&src->e));
dst = replicas_entry_next(dst); dst = replicas_entry_next(dst);
@ -602,24 +660,26 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
sort_r(cpu_r->entries, sort_r(cpu_r->entries,
cpu_r->nr, cpu_r->nr,
cpu_r->entry_size, cpu_r->entry_size,
bch2_memcmp, NULL, cpu_replicas_entry_cmp_r, NULL,
(void *)(size_t)cpu_r->entry_size); (void *)(size_t)cpu_r->entry_size);
for (i = 0; i < cpu_r->nr; i++) { for (i = 0; i < cpu_r->nr; i++) {
struct bch_replicas_entry_v1 *e = struct bch_replicas_entry_cpu *e =
cpu_replicas_entry(cpu_r, i); cpu_replicas_entry(cpu_r, i);
try(bch2_replicas_entry_sb_validate(e, sb, err)); try(bch2_replicas_entry_sb_validate(&e->e, sb, err));
if (i + 1 < cpu_r->nr) { if (i + 1 < cpu_r->nr) {
struct bch_replicas_entry_v1 *n = struct bch_replicas_entry_cpu *n =
cpu_replicas_entry(cpu_r, i + 1); cpu_replicas_entry(cpu_r, i + 1);
BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); int cmp = cpu_replicas_entry_cmp(e, n, cpu_r->entry_size);
if (!memcmp(e, n, cpu_r->entry_size)) { BUG_ON(cmp > 0);
if (!cmp) {
prt_printf(err, "duplicate replicas entry "); prt_printf(err, "duplicate replicas entry ");
bch2_replicas_entry_to_text(err, e); bch2_replicas_entry_to_text(err, &e->e);
return -BCH_ERR_invalid_sb_replicas; return -BCH_ERR_invalid_sb_replicas;
} }
} }
@ -702,7 +762,9 @@ bool bch2_can_read_fs_with_devs(struct bch_fs *c, struct bch_devs_mask devs,
unsigned flags, struct printbuf *err) unsigned flags, struct printbuf *err)
{ {
guard(percpu_read)(&c->mark_lock); guard(percpu_read)(&c->mark_lock);
for_each_cpu_replicas_entry(&c->replicas, e) { for_each_cpu_replicas_entry(&c->replicas, i) {
struct bch_replicas_entry_v1 *e = &i->e;
unsigned nr_online = 0, nr_failed = 0, dflags = 0; unsigned nr_online = 0, nr_failed = 0, dflags = 0;
bool metadata = e->data_type < BCH_DATA_user; bool metadata = e->data_type < BCH_DATA_user;
@ -820,6 +882,25 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
return bch2_can_read_fs_with_devs(c, devs, flags, err); return bch2_can_read_fs_with_devs(c, devs, flags, err);
} }
bool bch2_sb_has_journal(struct bch_sb *sb)
{
struct bch_sb_field_replicas *replicas = bch2_sb_field_get(sb, replicas);
struct bch_sb_field_replicas_v0 *replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
if (replicas) {
for_each_replicas_entry(replicas, r)
if (r->data_type == BCH_DATA_journal)
return true;
} else if (replicas_v0) {
for_each_replicas_entry(replicas_v0, r)
if (r->data_type == BCH_DATA_journal)
return true;
}
return false;
}
unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
{ {
struct bch_sb_field_replicas *replicas; struct bch_sb_field_replicas *replicas;
@ -863,5 +944,4 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
void bch2_fs_replicas_exit(struct bch_fs *c) void bch2_fs_replicas_exit(struct bch_fs *c)
{ {
kfree(c->replicas.entries); kfree(c->replicas.entries);
kfree(c->replicas_gc.entries);
} }

View File

@ -39,13 +39,22 @@ bool bch2_can_read_fs_with_devs(struct bch_fs *, struct bch_devs_mask,
bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask,
unsigned, struct printbuf *, bool); unsigned, struct printbuf *, bool);
bool bch2_sb_has_journal(struct bch_sb *);
unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned); unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned);
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
int bch2_replicas_gc_end(struct bch_fs *, int); void bch2_replicas_entry_put_many(struct bch_fs *, struct bch_replicas_entry_v1 *, unsigned);
int bch2_replicas_gc_start(struct bch_fs *, unsigned); static inline void bch2_replicas_entry_put(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
bch2_replicas_entry_put_many(c, r, 1);
}
int bch2_replicas_entry_get(struct bch_fs *, struct bch_replicas_entry_v1 *);
void bch2_replicas_entry_kill(struct bch_fs *, struct bch_replicas_entry_v1 *); void bch2_replicas_entry_kill(struct bch_fs *, struct bch_replicas_entry_v1 *);
int bch2_replicas_gc_reffed(struct bch_fs *);
static inline bool bch2_replicas_entry_has_dev(struct bch_replicas_entry_v1 *r, unsigned dev) static inline bool bch2_replicas_entry_has_dev(struct bch_replicas_entry_v1 *r, unsigned dev)
{ {
for (unsigned i = 0; i < r->nr_devs; i++) for (unsigned i = 0; i < r->nr_devs; i++)
@ -54,6 +63,12 @@ static inline bool bch2_replicas_entry_has_dev(struct bch_replicas_entry_v1 *r,
return false; return false;
} }
static inline bool bch2_replicas_entry_eq(struct bch_replicas_entry_v1 *l,
struct bch_replicas_entry_v1 *r)
{
return l->nr_devs == r->nr_devs && !memcmp(l, r, replicas_entry_bytes(l));
}
/* iterate over superblock replicas - used by userspace tools: */ /* iterate over superblock replicas - used by userspace tools: */
#define replicas_entry_next(_i) \ #define replicas_entry_next(_i) \

View File

@ -2,10 +2,16 @@
#ifndef _BCACHEFS_REPLICAS_TYPES_H #ifndef _BCACHEFS_REPLICAS_TYPES_H
#define _BCACHEFS_REPLICAS_TYPES_H #define _BCACHEFS_REPLICAS_TYPES_H
/* unsized - bch_replicas_entry_v1 is variable length */
struct bch_replicas_entry_cpu {
atomic_t ref;
struct bch_replicas_entry_v1 e;
};
struct bch_replicas_cpu { struct bch_replicas_cpu {
unsigned nr; unsigned nr;
unsigned entry_size; unsigned entry_size;
struct bch_replicas_entry_v1 *entries; struct bch_replicas_entry_cpu *entries;
}; };
union bch_replicas_padded { union bch_replicas_padded {

View File

@ -811,8 +811,6 @@ struct bch_fs {
struct bch_accounting_mem accounting; struct bch_accounting_mem accounting;
struct bch_replicas_cpu replicas; struct bch_replicas_cpu replicas;
struct bch_replicas_cpu replicas_gc;
struct mutex replicas_gc_lock;
struct journal_entry_res btree_root_journal_res; struct journal_entry_res btree_root_journal_res;
struct journal_entry_res clock_journal_res; struct journal_entry_res clock_journal_res;
@ -1075,6 +1073,7 @@ struct bch_fs {
GENRADIX(struct gc_stripe) gc_stripes; GENRADIX(struct gc_stripe) gc_stripes;
struct hlist_head ec_stripes_new[32]; struct hlist_head ec_stripes_new[32];
struct hlist_head ec_stripes_new_buckets[64];
spinlock_t ec_stripes_new_lock; spinlock_t ec_stripes_new_lock;
/* ERASURE CODING */ /* ERASURE CODING */

View File

@ -711,7 +711,8 @@ struct bch_sb_field_ext {
x(extent_snapshot_whiteouts, BCH_VERSION(1, 29)) \ x(extent_snapshot_whiteouts, BCH_VERSION(1, 29)) \
x(31bit_dirent_offset, BCH_VERSION(1, 30)) \ x(31bit_dirent_offset, BCH_VERSION(1, 30)) \
x(btree_node_accounting, BCH_VERSION(1, 31)) \ x(btree_node_accounting, BCH_VERSION(1, 31)) \
x(sb_field_extent_type_u64s, BCH_VERSION(1, 32)) x(sb_field_extent_type_u64s, BCH_VERSION(1, 32)) \
x(rebalance_v2, BCH_VERSION(1, 33))
enum bcachefs_metadata_version { enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9, bcachefs_metadata_version_min = 9,
@ -1430,6 +1431,17 @@ enum btree_id_flags {
BTREE_IS_snapshot_field| \ BTREE_IS_snapshot_field| \
BTREE_IS_write_buffer, \ BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_accounting)) \ BIT_ULL(KEY_TYPE_accounting)) \
x(rebalance_hipri, 21, \
BTREE_IS_snapshot_field| \
BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_set)) \
x(rebalance_pending, 22, \
BTREE_IS_snapshot_field| \
BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_set)) \
x(rebalance_scan, 23, 0, \
BIT_ULL(KEY_TYPE_cookie)| \
BIT_ULL(KEY_TYPE_backpointer))
enum btree_id { enum btree_id {
#define x(name, nr, ...) BTREE_ID_##name = nr, #define x(name, nr, ...) BTREE_ID_##name = nr,

View File

@ -682,9 +682,11 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
try(bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k), try(bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
BTREE_TRIGGER_check_repair|flags)); BTREE_TRIGGER_check_repair|flags));
if (bch2_trans_has_updates(trans)) if (bch2_trans_has_updates(trans)) {
return bch2_trans_commit(trans, NULL, NULL, 0) ?: CLASS(disk_reservation, res)(c);
return bch2_trans_commit(trans, &res.r, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
-BCH_ERR_transaction_restart_nested; -BCH_ERR_transaction_restart_nested;
}
try(bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k), try(bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags)); BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags));

View File

@ -22,6 +22,7 @@
#include "data/extents.h" #include "data/extents.h"
#include "data/keylist.h" #include "data/keylist.h"
#include "data/rebalance.h"
#include "data/write.h" #include "data/write.h"
#include "init/error.h" #include "init/error.h"
@ -654,6 +655,35 @@ static void btree_update_new_nodes_mark_sb(struct btree_update *as)
bch2_write_super(c); bch2_write_super(c);
} }
static void bkey_strip_rebalance(const struct bch_fs *c, struct bkey_s k)
{
bool dropped;
do {
dropped = false;
struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
union bch_extent_entry *entry;
bkey_extent_entry_for_each(ptrs, entry)
if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance_v2 ||
extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance_bp) {
extent_entry_drop(c, k, entry);
dropped = true;
break;
}
} while (dropped);
}
static bool bkey_has_rebalance(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
bkey_extent_entry_for_each(ptrs, entry)
if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance_v2)
return true;
return false;
}
/* /*
* The transactional part of an interior btree node update, where we journal the * The transactional part of an interior btree node update, where we journal the
* update we did to the interior node and update alloc info: * update we did to the interior node and update alloc info:
@ -661,26 +691,70 @@ static void btree_update_new_nodes_mark_sb(struct btree_update *as)
static int btree_update_nodes_written_trans(struct btree_trans *trans, static int btree_update_nodes_written_trans(struct btree_trans *trans,
struct btree_update *as) struct btree_update *as)
{ {
struct bch_fs *c = trans->c;
struct bch_inode_opts opts;
bch2_inode_opts_get(as->c, &opts, true);
trans->journal_pin = &as->journal; trans->journal_pin = &as->journal;
darray_for_each(as->old_nodes, i) darray_for_each(as->old_nodes, i) {
try(bch2_key_trigger_old(trans, as->btree_id, i->level + 1, bkey_i_to_s_c(&i->key), try(bch2_key_trigger_old(trans, as->btree_id, i->level + 1, bkey_i_to_s_c(&i->key),
BTREE_TRIGGER_transactional)); BTREE_TRIGGER_transactional));
darray_for_each(as->new_nodes, i) {
try(bch2_key_trigger_new(trans, as->btree_id, i->level + 1, bkey_i_to_s(&i->key),
BTREE_TRIGGER_transactional));
journal_entry_set(errptr_try(bch2_trans_jset_entry_alloc(trans, journal_entry_set(errptr_try(bch2_trans_jset_entry_alloc(trans,
jset_u64s(i->key.k.u64s))), jset_u64s(i->key.k.u64s))),
i->root BCH_JSET_ENTRY_overwrite,
? BCH_JSET_ENTRY_btree_root
: BCH_JSET_ENTRY_btree_keys,
as->btree_id, as->btree_id,
i->root ? i->level : i->level + 1, i->level + 1,
&i->key, i->key.k.u64s); &i->key, i->key.k.u64s);
} }
darray_for_each(as->new_nodes, i) {
i->update_node_key = false;
bkey_strip_rebalance(c, bkey_i_to_s(&i->key));
try(bch2_bkey_set_needs_rebalance(trans, NULL, &opts, &i->key,
SET_NEEDS_REBALANCE_foreground, 0));
if (bkey_has_rebalance(c, bkey_i_to_s_c(&i->key))) {
CLASS(btree_iter_uninit, iter)(trans);
int ret = bch2_btree_node_get_iter(trans, &iter, i->b);
if (ret && ret != -BCH_ERR_btree_node_dying)
return ret;
if (!ret)
i->update_node_key = true;
else
bkey_strip_rebalance(c, bkey_i_to_s(&i->key));
}
try(bch2_key_trigger_new(trans, as->btree_id, i->level + 1, bkey_i_to_s(&i->key),
BTREE_TRIGGER_transactional));
if (!i->update_node_key || i->root) {
journal_entry_set(errptr_try(bch2_trans_jset_entry_alloc(trans,
jset_u64s(i->key.k.u64s))),
i->root
? BCH_JSET_ENTRY_btree_root
: BCH_JSET_ENTRY_btree_keys,
as->btree_id,
i->root ? i->level : i->level + 1,
&i->key, i->key.k.u64s);
} else {
CLASS(btree_node_iter, parent_iter)(trans,
as->btree_id,
i->key.k.p,
0,
i->level + 1,
BTREE_ITER_intent);
try(bch2_btree_iter_traverse(&parent_iter));
/*
* XXX: we shouldn't be logging overwrites here, need a
* flag for that
*/
try(bch2_trans_update(trans, &parent_iter, &i->key, BTREE_TRIGGER_norun));
}
}
return 0; return 0;
} }
@ -760,19 +834,23 @@ static void btree_update_nodes_written(struct btree_update *as)
BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_journal_reclaim, BCH_TRANS_COMMIT_journal_reclaim,
btree_update_nodes_written_trans(trans, as)); btree_update_nodes_written_trans(trans, as));
bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal),
"%s", bch2_err_str(ret)); c, "%s", bch2_err_str(ret));
/* /*
* Clear will_make_reachable while we still hold intent locks on * Clear will_make_reachable while we still hold intent locks on
* all our new nodes, to avoid racing with * all our new nodes, to avoid racing with
* btree_node_update_key(): * btree_node_update_key():
*/ */
darray_for_each(as->new_nodes, i) darray_for_each(as->new_nodes, i) {
if (i->update_node_key)
bkey_copy(&i->b->key, &i->key);
if (i->b) { if (i->b) {
BUG_ON(i->b->will_make_reachable != (unsigned long) as); BUG_ON(i->b->will_make_reachable != (unsigned long) as);
i->b->will_make_reachable = 0; i->b->will_make_reachable = 0;
clear_btree_node_will_make_reachable(i->b); clear_btree_node_will_make_reachable(i->b);
} }
}
} }
/* /*
@ -2422,7 +2500,8 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
*/ */
} }
try(bch2_trans_commit(trans, NULL, NULL, commit_flags)); CLASS(disk_reservation, res)(c);
try(bch2_trans_commit(trans, &res.r, NULL, commit_flags));
bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c); bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c);
bkey_copy(&b->key, new_key); bkey_copy(&b->key, new_key);

View File

@ -26,6 +26,7 @@ struct btree_update_node {
struct btree *b; struct btree *b;
unsigned level; unsigned level;
bool root; bool root;
bool update_node_key;
__le64 seq; __le64 seq;
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
}; };

View File

@ -560,7 +560,7 @@ struct btree_trans {
struct bch_fs_usage_base fs_usage_delta; struct bch_fs_usage_base fs_usage_delta;
unsigned journal_u64s; unsigned journal_u64s;
unsigned extra_disk_res; /* XXX kill */ u64 extra_disk_res;
__BKEY_PADDED(btree_path_down, BKEY_BTREE_PTR_VAL_U64s_MAX); __BKEY_PADDED(btree_path_down, BKEY_BTREE_PTR_VAL_U64s_MAX);

View File

@ -143,6 +143,17 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
return bch2_csum_opt_to_type(opts.data_checksum, true); return bch2_csum_opt_to_type(opts.data_checksum, true);
} }
static inline enum bch_csum_type bch2_data_checksum_type_rb(struct bch_fs *c,
struct bch_extent_rebalance_v2 opts)
{
if (c->sb.encryption_type)
return c->opts.wide_macs
? BCH_CSUM_chacha20_poly1305_128
: BCH_CSUM_chacha20_poly1305_80;
return bch2_csum_opt_to_type(opts.data_checksum, true);
}
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
{ {
if (c->sb.encryption_type) if (c->sb.encryption_type)

View File

@ -878,8 +878,60 @@ static int ec_stripe_mem_alloc(struct btree_trans *trans,
* Hash table of open stripes: * Hash table of open stripes:
* Stripes that are being created or modified are kept in a hash table, so that * Stripes that are being created or modified are kept in a hash table, so that
* stripe deletion can skip them. * stripe deletion can skip them.
*
* Additionally, we have a hash table for buckets that have stripes being
* created, to avoid racing with rebalance:
*/ */
static bool __bch2_bucket_has_new_stripe(struct bch_fs *c, u64 dev_bucket)
{
unsigned hash = hash_64(dev_bucket, ilog2(ARRAY_SIZE(c->ec_stripes_new_buckets)));
struct ec_stripe_new_bucket *s;
hlist_for_each_entry(s, &c->ec_stripes_new_buckets[hash], hash)
if (s->dev_bucket == dev_bucket)
return true;
return false;
}
bool bch2_bucket_has_new_stripe(struct bch_fs *c, u64 dev_bucket)
{
guard(spinlock)(&c->ec_stripes_new_lock);
return __bch2_bucket_has_new_stripe(c, dev_bucket);
}
static void stripe_new_bucket_add(struct bch_fs *c, struct ec_stripe_new_bucket *s, u64 dev_bucket)
{
s->dev_bucket = dev_bucket;
unsigned hash = hash_64(dev_bucket, ilog2(ARRAY_SIZE(c->ec_stripes_new_buckets)));
hlist_add_head(&s->hash, &c->ec_stripes_new_buckets[hash]);
}
static void stripe_new_buckets_add(struct bch_fs *c, struct ec_stripe_new *s)
{
unsigned nr_blocks = s->nr_data + s->nr_parity;
guard(spinlock)(&c->ec_stripes_new_lock);
for (unsigned i = 0; i < nr_blocks; i++) {
if (!s->blocks[i])
continue;
struct open_bucket *ob = c->open_buckets + s->blocks[i];
struct bpos bucket = POS(ob->dev, ob->bucket);
stripe_new_bucket_add(c, &s->buckets[i], bucket_to_u64(bucket));
}
}
static void stripe_new_buckets_del(struct bch_fs *c, struct ec_stripe_new *s)
{
struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
for (unsigned i = 0; i < v->nr_blocks; i++)
hlist_del_init(&s->buckets[i].hash);
}
static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx) static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
{ {
unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
@ -920,6 +972,8 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
hlist_del_init(&s->hash); hlist_del_init(&s->hash);
s->idx = 0; s->idx = 0;
stripe_new_buckets_del(c, s);
} }
/* stripe deletion */ /* stripe deletion */
@ -1045,6 +1099,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
struct ec_stripe_buf *s, struct ec_stripe_buf *s,
struct bkey_s_c_backpointer bp, struct bkey_s_c_backpointer bp,
struct stripe_update_bucket_stats *stats, struct stripe_update_bucket_stats *stats,
struct disk_reservation *res,
struct wb_maybe_flush *last_flushed) struct wb_maybe_flush *last_flushed)
{ {
struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
@ -1110,7 +1165,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
.idx = s->key.k.p.offset, .idx = s->key.k.p.offset,
}; };
struct bkey_i *n = errptr_try(bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr))); struct bkey_i *n = errptr_try(bch2_trans_kmalloc(trans, BKEY_EXTENT_U64s_MAX * sizeof(u64)));
bkey_reassemble(n, k); bkey_reassemble(n, k);
@ -1126,10 +1181,9 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
struct bch_inode_opts opts; struct bch_inode_opts opts;
try(bch2_bkey_get_io_opts(trans, NULL, bkey_i_to_s_c(n), &opts)); try(bch2_bkey_get_io_opts(trans, NULL, bkey_i_to_s_c(n), &opts));
try(bch2_bkey_set_needs_rebalance(trans->c, &opts, n, try(bch2_bkey_set_needs_rebalance(trans, NULL, &opts, n, SET_NEEDS_REBALANCE_other, 0));
SET_NEEDS_REBALANCE_other, 0));
try(bch2_trans_update(trans, &iter, n, 0)); try(bch2_trans_update(trans, &iter, n, 0));
try(bch2_trans_commit(trans, NULL, NULL, try(bch2_trans_commit(trans, res, NULL,
BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc)); BCH_TRANS_COMMIT_no_enospc));
@ -1159,6 +1213,8 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
struct stripe_update_bucket_stats stats = {}; struct stripe_update_bucket_stats stats = {};
CLASS(disk_reservation, res)(c);
try(for_each_btree_key_max(trans, bp_iter, BTREE_ID_backpointers, try(for_each_btree_key_max(trans, bp_iter, BTREE_ID_backpointers,
bucket_pos_to_bp_start(ca, bucket_pos), bucket_pos_to_bp_start(ca, bucket_pos),
bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k, ({ bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k, ({
@ -1174,7 +1230,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
wb_maybe_flush_inc(&last_flushed); wb_maybe_flush_inc(&last_flushed);
ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, bp, ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, bp,
&stats, &last_flushed); &stats, &res.r, &last_flushed);
}))); })));
if (trace_stripe_update_bucket_enabled()) { if (trace_stripe_update_bucket_enabled()) {
@ -2026,6 +2082,7 @@ allocate_buf:
if (ret) if (ret)
goto err; goto err;
stripe_new_buckets_add(c, s);
s->allocated = true; s->allocated = true;
allocated: allocated:
BUG_ON(!s->idx); BUG_ON(!s->idx);

View File

@ -191,6 +191,11 @@ enum ec_stripe_ref {
STRIPE_REF_NR STRIPE_REF_NR
}; };
struct ec_stripe_new_bucket {
struct hlist_node hash;
u64 dev_bucket;
};
struct ec_stripe_new { struct ec_stripe_new {
struct bch_fs *c; struct bch_fs *c;
struct ec_stripe_head *h; struct ec_stripe_head *h;
@ -217,6 +222,8 @@ struct ec_stripe_new {
open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX];
struct disk_reservation res; struct disk_reservation res;
struct ec_stripe_new_bucket buckets[BCH_BKEY_PTRS_MAX];
struct ec_stripe_buf new_stripe; struct ec_stripe_buf new_stripe;
struct ec_stripe_buf existing_stripe; struct ec_stripe_buf existing_stripe;
}; };
@ -248,6 +255,8 @@ struct ec_stripe_head {
int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c); int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c);
bool bch2_bucket_has_new_stripe(struct bch_fs *, u64);
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int); void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int);

View File

@ -796,7 +796,7 @@ void bch2_bkey_propagate_incompressible(const struct bch_fs *c, struct bkey_i *d
/* /*
* XXX: if some data actually is compressed, we want * XXX: if some data actually is compressed, we want
* bch_extent_rebalance.wont_recompress_smaller * bch_extent_rebalance_v2.wont_recompress_smaller
*/ */
bkey_extent_entry_for_each(ptrs, entry) { bkey_extent_entry_for_each(ptrs, entry) {
@ -884,6 +884,15 @@ static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
return durability; return durability;
} }
void bch2_bkey_extent_entry_drop_s(const struct bch_fs *c, struct bkey_s k, union bch_extent_entry *entry)
{
union bch_extent_entry *end = bkey_val_end(k);
union bch_extent_entry *next = extent_entry_next(c, entry);
memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
k.k->u64s -= extent_entry_u64s(c, entry);
}
void bch2_bkey_extent_entry_drop(const struct bch_fs *c, struct bkey_i *k, union bch_extent_entry *entry) void bch2_bkey_extent_entry_drop(const struct bch_fs *c, struct bkey_i *k, union bch_extent_entry *entry)
{ {
union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
@ -1378,14 +1387,22 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, "idx %llu block %u", (u64) ec->idx, ec->block); prt_printf(out, "idx %llu block %u", (u64) ec->idx, ec->block);
break; break;
} }
case BCH_EXTENT_ENTRY_rebalance: case BCH_EXTENT_ENTRY_rebalance_v1:
bch2_extent_rebalance_to_text(out, c, &entry->rebalance); bch2_extent_rebalance_v1_to_text(out, c, &entry->rebalance_v1);
break;
case BCH_EXTENT_ENTRY_rebalance_v2:
bch2_extent_rebalance_v2_to_text(out, c, &entry->rebalance_v2);
break; break;
case BCH_EXTENT_ENTRY_flags: case BCH_EXTENT_ENTRY_flags:
prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags); prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags);
break; break;
case BCH_EXTENT_ENTRY_rebalance_bp:
prt_printf(out, "idx %llu", (u64) entry->rebalance_bp.idx);
break;
default: default:
prt_printf(out, "(unknown extent entry %.16llx)", *((u64 *) entry)); prt_printf(out, "(unknown extent entry %.16llx)", *((u64 *) entry));
return; return;
@ -1439,6 +1456,18 @@ fsck_err:
return ret; return ret;
} }
static inline bool btree_ptr_entry_type_allowed(enum bch_extent_entry_type type)
{
switch (type) {
case BCH_EXTENT_ENTRY_ptr:
case BCH_EXTENT_ENTRY_rebalance_v2:
case BCH_EXTENT_ENTRY_rebalance_bp:
return true;
default:
return false;
};
}
int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
struct bkey_validate_context from) struct bkey_validate_context from)
{ {
@ -1449,23 +1478,27 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
unsigned nonce = UINT_MAX; unsigned nonce = UINT_MAX;
unsigned nr_ptrs = 0; unsigned nr_ptrs = 0;
bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false; bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
bool have_inval_dev_ptrs = false, have_non_inval_dev_ptrs = false;
int ret = 0; int ret = 0;
if (bkey_is_btree_ptr(k.k)) if (bkey_is_btree_ptr(k.k))
size_ondisk = btree_sectors(c); size_ondisk = btree_sectors(c);
bkey_extent_entry_for_each(ptrs, entry) { bkey_extent_entry_for_each(ptrs, entry) {
bkey_fsck_err_on(extent_entry_type(entry) >= c->extent_types_known, unsigned type = extent_entry_type(entry);
bkey_fsck_err_on(type >= c->extent_types_known,
c, extent_ptrs_invalid_entry, c, extent_ptrs_invalid_entry,
"invalid extent entry type (got %u, max %u)", "invalid extent entry type (got %u, max %u)",
extent_entry_type(entry), c->extent_types_known); type, c->extent_types_known);
bkey_fsck_err_on(bkey_is_btree_ptr(k.k) && bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
!extent_entry_is_ptr(entry), type < BCH_EXTENT_ENTRY_MAX &&
!btree_ptr_entry_type_allowed(type),
c, btree_ptr_has_non_ptr, c, btree_ptr_has_non_ptr,
"has non ptr field"); "has non allowed field");
switch (extent_entry_type(entry)) { switch (type) {
case BCH_EXTENT_ENTRY_ptr: case BCH_EXTENT_ENTRY_ptr:
try(extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false)); try(extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false));
@ -1480,6 +1513,12 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
have_ec = false; have_ec = false;
crc_since_last_ptr = false; crc_since_last_ptr = false;
if (entry->ptr.dev == BCH_SB_MEMBER_INVALID)
have_inval_dev_ptrs = true;
else
have_non_inval_dev_ptrs = true;
nr_ptrs++; nr_ptrs++;
break; break;
case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc32:
@ -1527,30 +1566,18 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
c, ptr_stripe_redundant, c, ptr_stripe_redundant,
"redundant stripe entry"); "redundant stripe entry");
have_ec = true; have_ec = true;
have_non_inval_dev_ptrs = true;
break; break;
case BCH_EXTENT_ENTRY_rebalance: { case BCH_EXTENT_ENTRY_rebalance_v2:
/* try(bch2_extent_rebalance_validate(c, k, from, &entry->rebalance_v2));
* this shouldn't be a fsck error, for forward
* compatibility; the rebalance code should just refetch
* the compression opt if it's unknown
*/
#if 0
const struct bch_extent_rebalance *r = &entry->rebalance;
if (!bch2_compression_opt_valid(r->compression)) {
union bch_compression_opt opt = { .value = r->compression };
prt_printf(err, "invalid compression opt %u:%u",
opt.type, opt.level);
return bch_err_throw(c, invalid_bkey);
}
#endif
break; break;
}
case BCH_EXTENT_ENTRY_flags: case BCH_EXTENT_ENTRY_flags:
bkey_fsck_err_on(entry != ptrs.start, bkey_fsck_err_on(entry != ptrs.start,
c, extent_flags_not_at_start, c, extent_flags_not_at_start,
"extent flags entry not at start"); "extent flags entry not at start");
break; break;
case BCH_EXTENT_ENTRY_rebalance_bp:
break;
} }
} }
@ -1572,6 +1599,9 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
bkey_fsck_err_on(have_ec, bkey_fsck_err_on(have_ec,
c, extent_ptrs_redundant_stripe, c, extent_ptrs_redundant_stripe,
"redundant stripe entry"); "redundant stripe entry");
bkey_fsck_err_on(have_inval_dev_ptrs && !have_non_inval_dev_ptrs,
c, extent_ptrs_all_invalid,
"extent ptrs all to BCH_SB_MEMBER_INVALID");
fsck_err: fsck_err:
return ret; return ret;
} }
@ -1608,7 +1638,8 @@ void bch2_ptr_swab(const struct bch_fs *c, struct bkey_s k)
break; break;
case BCH_EXTENT_ENTRY_stripe_ptr: case BCH_EXTENT_ENTRY_stripe_ptr:
break; break;
case BCH_EXTENT_ENTRY_rebalance: case BCH_EXTENT_ENTRY_rebalance_v1:
case BCH_EXTENT_ENTRY_rebalance_v2:
break; break;
default: default:
/* Bad entry type: will be caught by validate() */ /* Bad entry type: will be caught by validate() */
@ -1682,8 +1713,10 @@ int bch2_cut_front_s(const struct bch_fs *c, struct bpos where, struct bkey_s k)
entry->crc128.offset += sub; entry->crc128.offset += sub;
break; break;
case BCH_EXTENT_ENTRY_stripe_ptr: case BCH_EXTENT_ENTRY_stripe_ptr:
case BCH_EXTENT_ENTRY_rebalance: case BCH_EXTENT_ENTRY_rebalance_v1:
case BCH_EXTENT_ENTRY_rebalance_v2:
case BCH_EXTENT_ENTRY_flags: case BCH_EXTENT_ENTRY_flags:
case BCH_EXTENT_ENTRY_rebalance_bp:
break; break;
} }

View File

@ -596,6 +596,7 @@ bool bch2_bkey_devs_rw(struct bch_fs *, struct bkey_s_c);
bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
bool bch2_bkey_in_target(struct bch_fs *, struct bkey_s_c, unsigned); bool bch2_bkey_in_target(struct bch_fs *, struct bkey_s_c, unsigned);
void bch2_bkey_extent_entry_drop_s(const struct bch_fs *, struct bkey_s, union bch_extent_entry *);
void bch2_bkey_extent_entry_drop(const struct bch_fs *, struct bkey_i *, union bch_extent_entry *); void bch2_bkey_extent_entry_drop(const struct bch_fs *, struct bkey_i *, union bch_extent_entry *);
static inline void bch2_bkey_append_ptr(const struct bch_fs *c, struct bkey_i *k, struct bch_extent_ptr ptr) static inline void bch2_bkey_append_ptr(const struct bch_fs *c, struct bkey_i *k, struct bch_extent_ptr ptr)

View File

@ -79,9 +79,11 @@
x(crc64, 2) \ x(crc64, 2) \
x(crc128, 3) \ x(crc128, 3) \
x(stripe_ptr, 4) \ x(stripe_ptr, 4) \
x(rebalance, 5) \ x(rebalance_v1, 5) \
x(flags, 6) x(flags, 6) \
#define BCH_EXTENT_ENTRY_MAX 7 x(rebalance_v2, 7) \
x(rebalance_bp, 8)
#define BCH_EXTENT_ENTRY_MAX 9
enum bch_extent_entry_type { enum bch_extent_entry_type {
#define x(f, n) BCH_EXTENT_ENTRY_##f = n, #define x(f, n) BCH_EXTENT_ENTRY_##f = n,
@ -221,7 +223,7 @@ struct bch_extent_flags {
#endif #endif
}; };
/* bch_extent_rebalance: */ /* bch_extent_rebalance_v2: */
#include "rebalance_format.h" #include "rebalance_format.h"
union bch_extent_entry { union bch_extent_entry {
@ -270,13 +272,13 @@ struct bch_extent {
} __packed __aligned(8); } __packed __aligned(8);
/* Maximum size (in u64s) a single pointer could be: */ /* Maximum size (in u64s) a single pointer could be: */
#define BKEY_EXTENT_PTR_U64s_MAX\ #define BKEY_EXTENT_PTR_U64s_MAX \
((sizeof(struct bch_extent_crc128) + \ ((sizeof(struct bch_extent_crc128) + \
sizeof(struct bch_extent_ptr)) / sizeof(__u64)) sizeof(struct bch_extent_ptr)) / sizeof(__u64))
/* Maximum possible size of an entire extent value: */ /* Maximum possible size of an entire extent value: */
#define BKEY_EXTENT_VAL_U64s_MAX \ #define BKEY_EXTENT_VAL_U64s_MAX \
(1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) (5 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
/* * Maximum possible size of an entire extent, key + value: */ /* * Maximum possible size of an entire extent, key + value: */
#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) #define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
@ -284,7 +286,9 @@ struct bch_extent {
/* Btree pointers don't carry around checksums: */ /* Btree pointers don't carry around checksums: */
#define BKEY_BTREE_PTR_VAL_U64s_MAX \ #define BKEY_BTREE_PTR_VAL_U64s_MAX \
((sizeof(struct bch_btree_ptr_v2) + \ ((sizeof(struct bch_btree_ptr_v2) + \
sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX + \
sizeof(struct bch_extent_rebalance_v2) + \
sizeof(struct bch_extent_rebalance_bp)) / sizeof(__u64))
#define BKEY_BTREE_PTR_U64s_MAX \ #define BKEY_BTREE_PTR_U64s_MAX \
(BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)

View File

@ -75,14 +75,15 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
if (!bch2_bkey_has_device_c(c, k, dev_idx)) if (!bch2_bkey_has_device_c(c, k, dev_idx))
return 0; return 0;
struct bkey_i *n = /* blah */
errptr_try(bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_internal_snapshot_node)); struct bkey_i *n = errptr_try(bch2_trans_kmalloc(trans, BKEY_EXTENT_U64s_MAX * sizeof(u64)));
bkey_reassemble(n, k);
try(drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, err, false)); try(drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, err, false));
struct bch_inode_opts opts; struct bch_inode_opts opts;
try(bch2_bkey_get_io_opts(trans, NULL, k, &opts)); try(bch2_bkey_get_io_opts(trans, NULL, k, &opts));
try(bch2_bkey_set_needs_rebalance(c, &opts, n, SET_NEEDS_REBALANCE_opt_change, 0)); try(bch2_bkey_set_needs_rebalance(trans, NULL, &opts, n, SET_NEEDS_REBALANCE_opt_change, 0));
/* /*
* Since we're not inserting through an extent iterator * Since we're not inserting through an extent iterator
@ -92,7 +93,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
*/ */
if (bkey_deleted(&n->k)) if (bkey_deleted(&n->k))
n->k.size = 0; n->k.size = 0;
return 0; return bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node);
} }
static int bch2_dev_btree_drop_key(struct btree_trans *trans, static int bch2_dev_btree_drop_key(struct btree_trans *trans,
@ -116,6 +117,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c,
unsigned flags, struct printbuf *err) unsigned flags, struct printbuf *err)
{ {
CLASS(btree_trans, trans)(c); CLASS(btree_trans, trans)(c);
CLASS(disk_reservation, res)(c);
/* FIXME: this does not handle unknown btrees with data pointers */ /* FIXME: this does not handle unknown btrees with data pointers */
for (unsigned id = 0; id < BTREE_ID_NR; id++) { for (unsigned id = 0; id < BTREE_ID_NR; id++) {
@ -126,14 +128,13 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c,
if (id == BTREE_ID_stripes) if (id == BTREE_ID_stripes)
continue; continue;
int ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, try(for_each_btree_key_commit(trans, iter, id, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ &res.r, NULL, BCH_TRANS_COMMIT_no_enospc, ({
bch2_disk_reservation_put(c, &res.r);
bch2_progress_update_iter(trans, progress, &iter, "dropping user data") ?: bch2_progress_update_iter(trans, progress, &iter, "dropping user data") ?:
bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags, err); bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags, err);
})); })));
if (ret)
return ret;
} }
return 0; return 0;
@ -218,6 +219,7 @@ int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsig
struct printbuf *err) struct printbuf *err)
{ {
CLASS(btree_trans, trans)(c); CLASS(btree_trans, trans)(c);
CLASS(disk_reservation, res)(c);
struct wb_maybe_flush last_flushed __cleanup(wb_maybe_flush_exit); struct wb_maybe_flush last_flushed __cleanup(wb_maybe_flush_exit);
wb_maybe_flush_init(&last_flushed); wb_maybe_flush_init(&last_flushed);
@ -226,11 +228,12 @@ int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsig
for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers, for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
POS(dev_idx, 0), POS(dev_idx, 0),
POS(dev_idx, U64_MAX), 0, k, POS(dev_idx, U64_MAX), 0, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ &res.r, NULL, BCH_TRANS_COMMIT_no_enospc, ({
if (k.k->type != KEY_TYPE_backpointer) if (k.k->type != KEY_TYPE_backpointer)
continue; continue;
wb_maybe_flush_inc(&last_flushed); wb_maybe_flush_inc(&last_flushed);
bch2_disk_reservation_put(c, &res.r);
data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k), data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k),
&last_flushed, flags, err); &last_flushed, flags, err);

View File

@ -324,8 +324,11 @@ int bch2_move_extent(struct moving_context *ctxt,
struct bch_inode_opts opts; struct bch_inode_opts opts;
try(bch2_bkey_get_io_opts(trans, snapshot_io_opts, k, &opts)); try(bch2_bkey_get_io_opts(trans, snapshot_io_opts, k, &opts));
try(bch2_update_rebalance_opts(trans, &opts, iter, k, SET_NEEDS_REBALANCE_other)); try(bch2_update_rebalance_opts(trans, snapshot_io_opts, &opts, iter, level, k,
try(bch2_trans_commit_lazy(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc)); SET_NEEDS_REBALANCE_other));
CLASS(disk_reservation, res)(c);
try(bch2_trans_commit_lazy(trans, &res.r, NULL, BCH_TRANS_COMMIT_no_enospc));
struct data_update_opts data_opts = {}; struct data_update_opts data_opts = {};
int ret = pred(trans, arg, iter->btree_id, k, &opts, &data_opts); int ret = pred(trans, arg, iter->btree_id, k, &opts, &data_opts);

File diff suppressed because it is too large

View File

@ -6,33 +6,55 @@
#include "alloc/disk_groups.h" #include "alloc/disk_groups.h"
#include "rebalance_types.h" #include "rebalance_types.h"
static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c, int bch2_extent_rebalance_validate(struct bch_fs *, struct bkey_s_c,
struct bkey_validate_context,
const struct bch_extent_rebalance_v2 *);
static inline struct bch_extent_rebalance_v2 io_opts_to_rebalance_opts(struct bch_fs *c,
struct bch_inode_opts *opts) struct bch_inode_opts *opts)
{ {
struct bch_extent_rebalance r = { return (struct bch_extent_rebalance_v2) {
.type = BIT(BCH_EXTENT_ENTRY_rebalance), .type = BIT(BCH_EXTENT_ENTRY_rebalance_v2),
#define x(_name) \ #define x(_name) \
._name = opts->_name, \ ._name = opts->_name, \
._name##_from_inode = opts->_name##_from_inode, ._name##_from_inode = opts->_name##_from_inode,
BCH_REBALANCE_OPTS() BCH_REBALANCE_OPTS()
#undef x #undef x
}; };
if (r.background_target &&
!bch2_target_accepts_data(c, BCH_DATA_user, r.background_target))
r.background_target = 0;
return r;
}; };
void bch2_extent_rebalance_to_text(struct printbuf *, struct bch_fs *, void bch2_extent_rebalance_v1_to_text(struct printbuf *, struct bch_fs *,
const struct bch_extent_rebalance *); const struct bch_extent_rebalance_v1 *);
void bch2_extent_rebalance_v2_to_text(struct printbuf *, struct bch_fs *,
const struct bch_extent_rebalance_v2 *);
int bch2_trigger_extent_rebalance(struct btree_trans *, const struct bch_extent_rebalance_v2 *bch2_bkey_rebalance_opts(const struct bch_fs *, struct bkey_s_c);
struct bkey_s_c, struct bkey_s_c,
enum btree_iter_update_trigger_flags);
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); int __bch2_trigger_extent_rebalance(struct btree_trans *,
enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
const struct bch_extent_rebalance_v2 *,
const struct bch_extent_rebalance_v2 *,
enum btree_iter_update_trigger_flags);
static inline unsigned rb_needs_trigger(const struct bch_extent_rebalance_v2 *r)
{
return r ? r->need_rb|r->ptrs_moving : 0;
}
static inline int bch2_trigger_extent_rebalance(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s new,
enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
const struct bch_extent_rebalance_v2 *old_r = bch2_bkey_rebalance_opts(c, old);
const struct bch_extent_rebalance_v2 *new_r = bch2_bkey_rebalance_opts(c, new.s_c);
return rb_needs_trigger(old_r) || rb_needs_trigger(new_r)
? __bch2_trigger_extent_rebalance(trans, btree, level, old, new, old_r, new_r, flags)
: 0;
}
enum set_needs_rebalance_ctx { enum set_needs_rebalance_ctx {
SET_NEEDS_REBALANCE_opt_change, SET_NEEDS_REBALANCE_opt_change,
@ -41,9 +63,6 @@ enum set_needs_rebalance_ctx {
SET_NEEDS_REBALANCE_other, SET_NEEDS_REBALANCE_other,
}; };
int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_inode_opts *,
struct bkey_i *, enum set_needs_rebalance_ctx, u32);
/* Inodes in different snapshots may have different IO options: */ /* Inodes in different snapshots may have different IO options: */
struct snapshot_io_opts_entry { struct snapshot_io_opts_entry {
u32 snapshot; u32 snapshot;
@ -53,6 +72,9 @@ struct snapshot_io_opts_entry {
struct per_snapshot_io_opts { struct per_snapshot_io_opts {
u64 cur_inum; u64 cur_inum;
bool metadata; bool metadata;
bool fs_scan_cookie;
bool inum_scan_cookie;
struct bch_devs_mask dev_cookie;
struct bch_inode_opts fs_io_opts; struct bch_inode_opts fs_io_opts;
DARRAY(struct snapshot_io_opts_entry) d; DARRAY(struct snapshot_io_opts_entry) d;
@ -76,20 +98,27 @@ DEFINE_CLASS(per_snapshot_io_opts, struct per_snapshot_io_opts,
per_snapshot_io_opts_init(c), per_snapshot_io_opts_init(c),
struct bch_fs *c); struct bch_fs *c);
int bch2_update_rebalance_opts(struct btree_trans *,
struct bch_inode_opts *,
struct btree_iter *,
struct bkey_s_c,
enum set_needs_rebalance_ctx);
int bch2_bkey_get_io_opts(struct btree_trans *, int bch2_bkey_get_io_opts(struct btree_trans *,
struct per_snapshot_io_opts *, struct bkey_s_c, struct per_snapshot_io_opts *, struct bkey_s_c,
struct bch_inode_opts *opts); struct bch_inode_opts *opts);
int bch2_update_rebalance_opts(struct btree_trans *,
struct per_snapshot_io_opts *,
struct bch_inode_opts *,
struct btree_iter *,
unsigned level,
struct bkey_s_c,
enum set_needs_rebalance_ctx);
int bch2_bkey_set_needs_rebalance(struct btree_trans *,
struct per_snapshot_io_opts *, struct bch_inode_opts *,
struct bkey_i *, enum set_needs_rebalance_ctx, u32);
struct rebalance_scan { struct rebalance_scan {
enum rebalance_scan_type { enum rebalance_scan_type {
REBALANCE_SCAN_fs, REBALANCE_SCAN_fs,
REBALANCE_SCAN_metadata, REBALANCE_SCAN_metadata,
REBALANCE_SCAN_pending,
REBALANCE_SCAN_device, REBALANCE_SCAN_device,
REBALANCE_SCAN_inum, REBALANCE_SCAN_inum,
} type; } type;
@ -101,7 +130,7 @@ struct rebalance_scan {
}; };
int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, struct rebalance_scan); int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, struct rebalance_scan);
int bch2_set_rebalance_needs_scan(struct bch_fs *, struct rebalance_scan); int bch2_set_rebalance_needs_scan(struct bch_fs *, struct rebalance_scan, bool);
int bch2_set_fs_needs_rebalance(struct bch_fs *); int bch2_set_fs_needs_rebalance(struct bch_fs *);
static inline void bch2_rebalance_wakeup(struct bch_fs *c) static inline void bch2_rebalance_wakeup(struct bch_fs *c)
@ -114,6 +143,7 @@ static inline void bch2_rebalance_wakeup(struct bch_fs *c)
} }
void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *); void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
void bch2_rebalance_scan_pending_to_text(struct printbuf *, struct bch_fs *);
void bch2_rebalance_stop(struct bch_fs *); void bch2_rebalance_stop(struct bch_fs *);
int bch2_rebalance_start(struct bch_fs *); int bch2_rebalance_start(struct bch_fs *);

View File

@ -2,52 +2,177 @@
#ifndef _BCACHEFS_REBALANCE_FORMAT_H #ifndef _BCACHEFS_REBALANCE_FORMAT_H
#define _BCACHEFS_REBALANCE_FORMAT_H #define _BCACHEFS_REBALANCE_FORMAT_H
struct bch_extent_rebalance { /*
* rebalance on disk data structures:
*
* extents will contain a bch_extent_rebalance if they have background
* processing pending; additionally, indirect extents will always have a
* bch_extent_rebalance if they had any io path options set on the inode, since
* we don't (yet) have backpointers that would let us look up the "owning" inode
* of an indirect extent to recover the io path options.
*
* We also have 4 btrees for keeping track of pending rebalance work:
*
* BTREE_ID_rebalance_scan:
* Inum 0:
* Holds "scan cookies", which are created on option change to indicate that
* new options need to be propagated to each extent; this happens before the
* actual data processing.
*
* A scan cookie may be for the entire filesystem, a specific device, or a
* specific inode.
*
* Inum 1:
* Btree nodes that need background processing cannot be tracked by the
* other rebalance btrees; instead they have backpointers
* (KEY_TYPE_backpointer) created here.
*
* This has the added benefit that btree nodes will be processed before
* regular data, which helps if e.g. we're recovering from degraded data.
*
* BTREE_ID_rebalance_work:
* The main "pending rebalance work" btree: it's a simple bitset btree where
* a set bit indicates that an extent in BTREE_ID_extents or
* BTREE_ID_reflink needs to be processed.
*
* BTREE_ID_rebalance_hipri:
* If bch_extent_rebalance.hipri is set, the extent will be tracked here
* instead of BTREE_ID_rebalance_work and processed ahead of extents in
* BTREE_ID_rebalance_work; this is so that we can evacuate failed devices
* before other work.
*
* BTREE_ID_rebalance_pending:
* If we'd like to move an extent to a specific target, but can't because the
* target is full, we set bch_extent_rebalance.pending and switch to tracking
* it here; pending rebalance work is re-attempted on device resize, add, or
* label change.
*/
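A minimal sketch of how the flag bits defined below pick one of the work btrees described above (illustrative only, not part of this patch; rebalance_work_btree() is a hypothetical helper name, and the BTREE_ID_rebalance_* ids are those named in the comment):

static inline enum btree_id rebalance_work_btree(const struct bch_extent_rebalance_v2 *r)
{
	/* failed-device evacuation is processed ahead of all other work */
	if (r->hipri)
		return BTREE_ID_rebalance_hipri;

	/* destination full: re-attempted on device resize, add, or label change */
	if (r->pending)
		return BTREE_ID_rebalance_pending;

	/* ordinary background work, tracked as a bitset */
	return BTREE_ID_rebalance_work;
}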
struct bch_extent_rebalance_v1 {
#if defined(__LITTLE_ENDIAN_BITFIELD) #if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:6, __u64 type:6,
unused:3, unused:3,
promote_target_from_inode:1, promote_target_from_inode:1,
erasure_code_from_inode:1, erasure_code_from_inode:1,
data_checksum_from_inode:1, data_checksum_from_inode:1,
background_compression_from_inode:1, background_compression_from_inode:1,
data_replicas_from_inode:1, data_replicas_from_inode:1,
background_target_from_inode:1, background_target_from_inode:1,
promote_target:16, promote_target:16,
erasure_code:1, erasure_code:1,
data_checksum:4, data_checksum:4,
data_replicas:4, data_replicas:4,
background_compression:8, /* enum bch_compression_opt */ background_compression:8, /* enum bch_compression_opt */
background_target:16; background_target:16;
#elif defined (__BIG_ENDIAN_BITFIELD) #elif defined (__BIG_ENDIAN_BITFIELD)
__u64 background_target:16, __u64 background_target:16,
background_compression:8, background_compression:8,
data_replicas:4, data_replicas:4,
data_checksum:4, data_checksum:4,
erasure_code:1, erasure_code:1,
promote_target:16, promote_target:16,
background_target_from_inode:1, background_target_from_inode:1,
data_replicas_from_inode:1, data_replicas_from_inode:1,
background_compression_from_inode:1, background_compression_from_inode:1,
data_checksum_from_inode:1, data_checksum_from_inode:1,
erasure_code_from_inode:1, erasure_code_from_inode:1,
promote_target_from_inode:1, promote_target_from_inode:1,
unused:3, unused:3,
type:6; type:6;
#endif
};
struct bch_extent_rebalance_v2 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:8,
unused:2,
ptrs_moving:5,
hipri:1,
pending:1,
need_rb:5,
data_replicas_from_inode:1,
data_checksum_from_inode:1,
erasure_code_from_inode:1,
background_compression_from_inode:1,
background_target_from_inode:1,
promote_target_from_inode:1,
data_replicas:3,
data_checksum:4,
erasure_code:1,
background_compression:8, /* enum bch_compression_opt */
background_target:10,
promote_target:10;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 promote_target:10,
background_target:10,
background_compression:8,
erasure_code:1,
data_checksum:4,
data_replicas:3,
promote_target_from_inode:1,
background_target_from_inode:1,
background_compression_from_inode:1,
erasure_code_from_inode:1,
data_checksum_from_inode:1,
data_replicas_from_inode:1,
need_rb:5,
pending:1,
hipri:1,
ptrs_moving:5,
unused:2,
type:8;
#endif
};
struct bch_extent_rebalance_bp {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:9,
idx:55;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 idx:55,
type:9;
#endif #endif
}; };
/* subset of BCH_INODE_OPTS */ /* subset of BCH_INODE_OPTS */
#define BCH_REBALANCE_OPTS() \ #define BCH_REBALANCE_OPTS() \
x(data_checksum) \
x(background_compression) \
x(data_replicas) \ x(data_replicas) \
x(promote_target) \ x(data_checksum) \
x(erasure_code) \
x(background_compression) \
x(background_target) \ x(background_target) \
x(erasure_code) x(promote_target)
enum bch_rebalance_opts {
#define x(n) BCH_REBALANCE_##n,
BCH_REBALANCE_OPTS()
#undef x
};
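For reference, a sketch of what the x-macro above expands to after preprocessing (shown only to make the pattern explicit; the macro form above is the actual definition):

enum bch_rebalance_opts {
	BCH_REBALANCE_data_replicas,
	BCH_REBALANCE_data_checksum,
	BCH_REBALANCE_erasure_code,
	BCH_REBALANCE_background_compression,
	BCH_REBALANCE_background_target,
	BCH_REBALANCE_promote_target,
};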
#define BCH_REBALANCE_ACCOUNTING() \
x(replicas, 0) \
x(checksum, 1) \
x(erasure_code, 2) \
x(compression, 3) \
x(target, 4) \
x(high_priority, 5) \
x(pending, 6) \
enum bch_rebalance_accounting_type {
#define x(t, n) BCH_REBALANCE_ACCOUNTING_##t = n,
BCH_REBALANCE_ACCOUNTING()
#undef x
BCH_REBALANCE_ACCOUNTING_NR,
};
#endif /* _BCACHEFS_REBALANCE_FORMAT_H */ #endif /* _BCACHEFS_REBALANCE_FORMAT_H */

View File

@ -197,14 +197,14 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
insert = bch2_trans_kmalloc(trans, insert = bch2_trans_kmalloc(trans,
bkey_bytes(k.k) + bkey_bytes(k.k) +
bkey_val_bytes(&new->k) + bkey_val_bytes(&new->k) +
sizeof(struct bch_extent_rebalance)); sizeof(struct bch_extent_rebalance_v2));
ret = PTR_ERR_OR_ZERO(insert); ret = PTR_ERR_OR_ZERO(insert);
if (ret) if (ret)
goto err; goto err;
bkey_reassemble(insert, k); bkey_reassemble(insert, k);
new = bch2_trans_kmalloc(trans, bkey_bytes(&new->k)); new = bch2_trans_kmalloc(trans, bkey_bytes(&new->k) + sizeof(struct bch_extent_rebalance_v2));
ret = PTR_ERR_OR_ZERO(new); ret = PTR_ERR_OR_ZERO(new);
if (ret) if (ret)
goto err; goto err;
@ -327,7 +327,18 @@ restart_drop_extra_replicas:
bch2_insert_snapshot_whiteouts(trans, m->btree_id, bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, insert->k.p) ?: k.k->p, insert->k.p) ?:
bch2_bkey_get_io_opts(trans, NULL, k, &opts) ?: bch2_bkey_get_io_opts(trans, NULL, k, &opts) ?:
bch2_bkey_set_needs_rebalance(c, &opts, insert, /*
* this set_needs_rebalance call is only for verifying
* that the data we just wrote was written correctly,
* otherwise we could fail to flag incorrectly written
* data due to needs_rb already being set on the
* existing extent
*/
bch2_bkey_set_needs_rebalance(trans, NULL, &opts, &new->k_i,
SET_NEEDS_REBALANCE_foreground,
m->op.opts.change_cookie) ?:
/* this is the real set_needs_rebalance() call */
bch2_bkey_set_needs_rebalance(trans, NULL, &opts, insert,
SET_NEEDS_REBALANCE_foreground, SET_NEEDS_REBALANCE_foreground,
m->op.opts.change_cookie) ?: m->op.opts.change_cookie) ?:
bch2_trans_update(trans, &iter, insert, bch2_trans_update(trans, &iter, insert,
@ -451,7 +462,8 @@ static void data_update_trace(struct data_update *u, int ret)
trace_data_update_no_io(c, buf.buf); trace_data_update_no_io(c, buf.buf);
} }
count_event(c, data_update_no_io); count_event(c, data_update_no_io);
} else if (ret != -BCH_ERR_data_update_fail_no_rw_devs) { } else if (ret != -BCH_ERR_data_update_fail_no_rw_devs &&
ret != -BCH_ERR_insufficient_devices) {
if (trace_data_update_fail_enabled()) { if (trace_data_update_fail_enabled()) {
CLASS(printbuf, buf)(); CLASS(printbuf, buf)();
bch2_data_update_to_text(&buf, u); bch2_data_update_to_text(&buf, u);
@ -774,7 +786,13 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
prt_printf(&buf, "\nret:\t%s\n", bch2_err_str(-BCH_ERR_data_update_fail_no_rw_devs)); prt_printf(&buf, "\nret:\t%s\n", bch2_err_str(-BCH_ERR_data_update_fail_no_rw_devs));
trace_data_update_fail(c, buf.buf); trace_data_update_fail(c, buf.buf);
} }
count_event(c, data_update_fail);
/*
* It's not counted as a failure because it'll end up on
* the rebalance pending list
*
* count_event(c, data_update_fail);
*/
} }
return bch_err_throw(c, data_update_fail_no_rw_devs); return bch_err_throw(c, data_update_fail_no_rw_devs);

View File

@ -355,7 +355,7 @@ int bch2_extent_update(struct btree_trans *trans,
bch2_inode_opts_get_inode(c, &inode, &opts); bch2_inode_opts_get_inode(c, &inode, &opts);
try(bch2_bkey_set_needs_rebalance(c, &opts, k, try(bch2_bkey_set_needs_rebalance(trans, NULL, &opts, k,
SET_NEEDS_REBALANCE_foreground, SET_NEEDS_REBALANCE_foreground,
change_cookie)); change_cookie));
try(bch2_trans_update(trans, iter, k, 0)); try(bch2_trans_update(trans, iter, k, 0));
@ -390,6 +390,13 @@ static int bch2_write_index_default(struct bch_write_op *op)
bch2_trans_begin(trans); bch2_trans_begin(trans);
k = bch2_keylist_front(keys); k = bch2_keylist_front(keys);
/*
* If we did a degraded write, bch2_bkey_set_needs_rebalance() will add
* pointers to BCH_SB_MEMBER_INVALID so the extent is accounted as
* degraded
*/
bch2_bkey_buf_realloc(&sk, k->k.u64s + 1 + BCH_REPLICAS_MAX);
bch2_bkey_buf_copy(&sk, k); bch2_bkey_buf_copy(&sk, k);
int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &sk.k->k.p.snapshot); int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &sk.k->k.p.snapshot);
@ -1227,8 +1234,15 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
return 0; return 0;
} }
/*
* If we did a degraded write, bch2_bkey_set_needs_rebalance() will add
* pointers to BCH_SB_MEMBER_INVALID so the extent is accounted as
* degraded
*/
struct bkey_i *new = errptr_try(bch2_trans_kmalloc_nomemzero(trans, struct bkey_i *new = errptr_try(bch2_trans_kmalloc_nomemzero(trans,
bkey_bytes(k.k) + sizeof(struct bch_extent_rebalance))); bkey_bytes(k.k) +
sizeof(struct bch_extent_rebalance_v2) +
sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX));
bkey_reassemble(new, k); bkey_reassemble(new, k);
bch2_cut_front(c, bkey_start_pos(&orig->k), new); bch2_cut_front(c, bkey_start_pos(&orig->k), new);
@ -1256,7 +1270,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
return bch2_extent_update_i_size_sectors(trans, iter, return bch2_extent_update_i_size_sectors(trans, iter,
min(new->k.p.offset << 9, new_i_size), 0, &inode) ?: min(new->k.p.offset << 9, new_i_size), 0, &inode) ?:
(bch2_inode_opts_get_inode(c, &inode, &opts), (bch2_inode_opts_get_inode(c, &inode, &opts),
bch2_bkey_set_needs_rebalance(c, &opts, new, bch2_bkey_set_needs_rebalance(trans, NULL, &opts, new,
SET_NEEDS_REBALANCE_foreground, SET_NEEDS_REBALANCE_foreground,
op->opts.change_cookie)) ?: op->opts.change_cookie)) ?:
bch2_trans_update(trans, iter, new, bch2_trans_update(trans, iter, new,
@ -1273,7 +1287,8 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
bkey_start_pos(&orig->k), orig->k.p, bkey_start_pos(&orig->k), orig->k.p,
BTREE_ITER_intent, k, BTREE_ITER_intent, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ &op->res, NULL,
BCH_TRANS_COMMIT_no_enospc, ({
bch2_nocow_write_convert_one_unwritten(trans, &iter, op, orig, k, op->new_i_size); bch2_nocow_write_convert_one_unwritten(trans, &iter, op, orig, k, op->new_i_size);
})); }));
if (ret) if (ret)

View File

@ -165,6 +165,7 @@ write_attribute(trigger_freelist_wakeup);
write_attribute(trigger_recalc_capacity); write_attribute(trigger_recalc_capacity);
write_attribute(trigger_delete_dead_snapshots); write_attribute(trigger_delete_dead_snapshots);
write_attribute(trigger_emergency_read_only); write_attribute(trigger_emergency_read_only);
write_attribute(trigger_check_inconsistent_replicas);
read_attribute(gc_gens_pos); read_attribute(gc_gens_pos);
read_attribute(uuid); read_attribute(uuid);
@ -218,6 +219,7 @@ read_attribute(copy_gc_wait);
sysfs_pd_controller_attribute(rebalance); sysfs_pd_controller_attribute(rebalance);
read_attribute(rebalance_status); read_attribute(rebalance_status);
read_attribute(rebalance_scan_pending);
read_attribute(snapshot_delete_status); read_attribute(snapshot_delete_status);
read_attribute(recovery_status); read_attribute(recovery_status);
@ -314,6 +316,51 @@ static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c)
prt_printf(out, "reserved:\t\t%llu\n", b.reserved); prt_printf(out, "reserved:\t\t%llu\n", b.reserved);
} }
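/*
 * Report (via pr_info) extents whose replicas share a checksum type but carry
 * differing checksums, i.e. inconsistent copies of the same data; pointers
 * whose crc doesn't cover the full key are skipped.
 */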
static int bkey_check_inconsistent_replicas(struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p, prev;
bool have_prev = false, have_inconsistent = false;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
if (p.crc.live_size != k.k->size)
continue;
if (!have_prev) {
prev = p;
have_prev = true;
continue;
}
have_inconsistent |= prev.crc.csum_type == p.crc.csum_type &&
bch2_crc_cmp(prev.crc.csum, p.crc.csum);
}
if (have_inconsistent) {
CLASS(printbuf, buf)();
bch2_bkey_val_to_text(&buf, c, k);
pr_info("%s", buf.buf);
}
return 0;
}
static void bch2_check_inconsistent_replicas(struct bch_fs *c)
{
CLASS(btree_trans, trans)(c);
for_each_btree_key(trans, iter,
BTREE_ID_extents, POS_MIN,
BTREE_ITER_all_snapshots, k, ({
bkey_check_inconsistent_replicas(c, k);
}));
for_each_btree_key(trans, iter,
BTREE_ID_reflink, POS_MIN,
BTREE_ITER_all_snapshots, k, ({
bkey_check_inconsistent_replicas(c, k);
}));
}
SHOW(bch2_fs) SHOW(bch2_fs)
{ {
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@ -340,6 +387,9 @@ SHOW(bch2_fs)
if (attr == &sysfs_rebalance_status) if (attr == &sysfs_rebalance_status)
bch2_rebalance_status_to_text(out, c); bch2_rebalance_status_to_text(out, c);
if (attr == &sysfs_rebalance_scan_pending)
bch2_rebalance_scan_pending_to_text(out, c);
if (attr == &sysfs_snapshot_delete_status) if (attr == &sysfs_snapshot_delete_status)
bch2_snapshot_delete_status_to_text(out, c); bch2_snapshot_delete_status_to_text(out, c);
@ -487,6 +537,9 @@ STORE(bch2_fs)
printbuf_exit(&buf); printbuf_exit(&buf);
} }
if (attr == &sysfs_trigger_check_inconsistent_replicas)
bch2_check_inconsistent_replicas(c);
#ifdef CONFIG_BCACHEFS_TESTS #ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) { if (attr == &sysfs_perf_test) {
char *tmp __free(kfree) = kstrdup(buf, GFP_KERNEL), *p = tmp; char *tmp __free(kfree) = kstrdup(buf, GFP_KERNEL), *p = tmp;
@ -517,6 +570,7 @@ struct attribute *bch2_fs_files[] = {
&sysfs_btree_write_stats, &sysfs_btree_write_stats,
&sysfs_rebalance_status, &sysfs_rebalance_status,
&sysfs_rebalance_scan_pending,
&sysfs_snapshot_delete_status, &sysfs_snapshot_delete_status,
&sysfs_recovery_status, &sysfs_recovery_status,
@ -622,6 +676,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_recalc_capacity, &sysfs_trigger_recalc_capacity,
&sysfs_trigger_delete_dead_snapshots, &sysfs_trigger_delete_dead_snapshots,
&sysfs_trigger_emergency_read_only, &sysfs_trigger_emergency_read_only,
&sysfs_trigger_check_inconsistent_replicas,
&sysfs_gc_gens_pos, &sysfs_gc_gens_pos,

View File

@ -294,7 +294,7 @@ int bch2_inode_set_casefold(struct btree_trans *, subvol_inum,
#include "data/rebalance.h" #include "data/rebalance.h"
static inline struct bch_extent_rebalance static inline struct bch_extent_rebalance_v2
bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode) bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode)
{ {
struct bch_inode_opts io_opts; struct bch_inode_opts io_opts;

View File

@ -543,6 +543,17 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
bch_notice(ca, "%s", bch2_member_states[new_state]); bch_notice(ca, "%s", bch2_member_states[new_state]);
bool do_rebalance_scan =
new_state == BCH_MEMBER_STATE_rw ||
new_state == BCH_MEMBER_STATE_failed;
struct rebalance_scan s = new_state == BCH_MEMBER_STATE_rw
? (struct rebalance_scan) { .type = REBALANCE_SCAN_pending }
: (struct rebalance_scan) { .type = REBALANCE_SCAN_device, .dev = ca->dev_idx };
if (do_rebalance_scan)
try(bch2_set_rebalance_needs_scan(c, s, false));
scoped_guard(mutex, &c->sb_lock) { scoped_guard(mutex, &c->sb_lock) {
struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
SET_BCH_MEMBER_STATE(m, new_state); SET_BCH_MEMBER_STATE(m, new_state);
@ -552,7 +563,8 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
if (new_state == BCH_MEMBER_STATE_rw) if (new_state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca); __bch2_dev_read_write(c, ca);
bch2_rebalance_wakeup(c); if (do_rebalance_scan)
try(bch2_set_rebalance_needs_scan(c, s, true));
return ret; return ret;
} }
@ -740,6 +752,9 @@ int bch2_dev_add(struct bch_fs *c, const char *path, struct printbuf *err)
if (ret) if (ret)
goto err; goto err;
struct rebalance_scan s = { .type = REBALANCE_SCAN_pending };
try(bch2_set_rebalance_needs_scan(c, s, false));
scoped_guard(rwsem_write, &c->state_lock) { scoped_guard(rwsem_write, &c->state_lock) {
scoped_guard(mutex, &c->sb_lock) { scoped_guard(mutex, &c->sb_lock) {
SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true); SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true);
@ -824,6 +839,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path, struct printbuf *err)
}; };
kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp); kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp);
} }
try(bch2_set_rebalance_needs_scan(c, s, true));
out: out:
bch_err_fn(c, ret); bch_err_fn(c, ret);
return ret; return ret;
@ -936,6 +953,11 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets, struct p
return -EINVAL; return -EINVAL;
} }
bool wakeup_rebalance_pending = nbuckets > ca->mi.nbuckets;
struct rebalance_scan s = { .type = REBALANCE_SCAN_pending };
if (wakeup_rebalance_pending)
try(bch2_set_rebalance_needs_scan(c, s, false));
if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) { if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
prt_printf(err, "New device size too big (%llu greater than max %u)\n", prt_printf(err, "New device size too big (%llu greater than max %u)\n",
nbuckets, BCH_MEMBER_NBUCKETS_MAX); nbuckets, BCH_MEMBER_NBUCKETS_MAX);
@ -979,6 +1001,9 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets, struct p
} }
bch2_recalc_capacity(c); bch2_recalc_capacity(c);
if (wakeup_rebalance_pending)
try(bch2_set_rebalance_needs_scan(c, s, true));
return 0; return 0;
} }

View File

@ -370,14 +370,12 @@ void bch2_fs_read_only(struct bch_fs *c)
test_bit(BCH_FS_clean_shutdown, &c->flags) && test_bit(BCH_FS_clean_shutdown, &c->flags) &&
c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) { c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) {
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
BUG_ON(!c->sb.clean);
BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty)); BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
BUG_ON(c->btree_write_buffer.inc.keys.nr); BUG_ON(c->btree_write_buffer.inc.keys.nr);
BUG_ON(c->btree_write_buffer.flushing.keys.nr); BUG_ON(c->btree_write_buffer.flushing.keys.nr);
bch2_verify_accounting_clean(c); bch2_verify_accounting_clean(c);
bch_verbose(c, "marking filesystem clean");
bch2_fs_mark_clean(c);
} else { } else {
/* Make sure error counts/counters are persisted */ /* Make sure error counts/counters are persisted */
guard(mutex)(&c->sb_lock); guard(mutex)(&c->sb_lock);
@ -473,7 +471,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
try(bch2_fs_init_rw(c)); try(bch2_fs_init_rw(c));
try(bch2_sb_members_v2_init(c)); try(bch2_sb_members_v2_init(c));
try(bch2_fs_mark_dirty(c));
clear_bit(BCH_FS_clean_shutdown, &c->flags); clear_bit(BCH_FS_clean_shutdown, &c->flags);
@ -1052,7 +1049,6 @@ static int bch2_fs_init(struct bch_fs *c, struct bch_sb *sb,
init_rwsem(&c->state_lock); init_rwsem(&c->state_lock);
mutex_init(&c->sb_lock); mutex_init(&c->sb_lock);
mutex_init(&c->replicas_gc_lock);
mutex_init(&c->btree_root_lock); mutex_init(&c->btree_root_lock);
INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);

View File

@ -377,6 +377,7 @@ int bch2_fs_journal_start(struct journal *j, struct journal_start_info info)
struct journal_replay *i, **_i; struct journal_replay *i, **_i;
struct genradix_iter iter; struct genradix_iter iter;
bool had_entries = false; bool had_entries = false;
int ret = 0;
/* /*
* *
@ -445,12 +446,26 @@ int bch2_fs_journal_start(struct journal *j, struct journal_start_info info)
if (journal_entry_empty(&i->j)) if (journal_entry_empty(&i->j))
j->last_empty_seq = le64_to_cpu(i->j.seq); j->last_empty_seq = le64_to_cpu(i->j.seq);
struct bch_devs_list seq_devs = {}; if (!info.clean) {
darray_for_each(i->ptrs, ptr) struct bch_devs_list seq_devs = {};
seq_devs.data[seq_devs.nr++] = ptr->dev; darray_for_each(i->ptrs, ptr)
seq_devs.data[seq_devs.nr++] = ptr->dev;
p = journal_seq_pin(j, seq); p = journal_seq_pin(j, seq);
bch2_devlist_to_replicas(&p->devs.e, BCH_DATA_journal, seq_devs); bch2_devlist_to_replicas(&p->devs.e, BCH_DATA_journal, seq_devs);
CLASS(printbuf, buf)();
bch2_replicas_entry_to_text(&buf, &p->devs.e);
fsck_err_on(!test_bit(JOURNAL_degraded, &j->flags) &&
!bch2_replicas_marked(c, &p->devs.e),
c, journal_entry_replicas_not_marked,
"superblock not marked as containing replicas for journal entry %llu\n%s",
le64_to_cpu(i->j.seq), buf.buf);
if (bch2_replicas_entry_get(c, &p->devs.e))
p->devs.e.nr_devs = 0;
}
had_entries = true; had_entries = true;
} }
@ -464,7 +479,9 @@ int bch2_fs_journal_start(struct journal *j, struct journal_start_info info)
c->last_bucket_seq_cleanup = journal_cur_seq(j); c->last_bucket_seq_cleanup = journal_cur_seq(j);
} }
return 0; try(bch2_replicas_gc_reffed(c));
fsck_err:
return ret;
} }
void bch2_journal_set_replay_done(struct journal *j) void bch2_journal_set_replay_done(struct journal *j)

View File

@ -358,7 +358,6 @@ static int journal_entry_open(struct journal *j)
lockdep_assert_held(&j->lock); lockdep_assert_held(&j->lock);
BUG_ON(journal_entry_is_open(j)); BUG_ON(journal_entry_is_open(j));
BUG_ON(c->sb.clean);
if (j->blocked) if (j->blocked)
return bch_err_throw(c, journal_blocked); return bch_err_throw(c, journal_blocked);
@ -435,7 +434,8 @@ static int journal_entry_open(struct journal *j)
bkey_extent_init(&buf->key); bkey_extent_init(&buf->key);
buf->noflush = false; buf->noflush = false;
buf->must_flush = false; /* if filesystem is clean, the first journal write must be a flush */
buf->must_flush = c->sb.clean;
buf->separate_flush = false; buf->separate_flush = false;
buf->flush_time = 0; buf->flush_time = 0;
buf->need_flush_to_write_buffer = true; buf->need_flush_to_write_buffer = true;
@ -1097,6 +1097,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "last_seq:\t%llu\n", j->last_seq); prt_printf(out, "last_seq:\t%llu\n", j->last_seq);
prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
prt_printf(out, "last_empty_seq:\t%llu\n", j->last_empty_seq);
prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]); prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]);
prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);

View File

@ -1351,7 +1351,7 @@ int bch2_journal_read(struct bch_fs *c, struct journal_start_info *info)
struct journal_list jlist; struct journal_list jlist;
struct journal_replay *i, **_i; struct journal_replay *i, **_i;
struct genradix_iter radix_iter; struct genradix_iter radix_iter;
bool degraded = false, last_write_torn = false; bool last_write_torn = false;
u64 seq; u64 seq;
int ret = 0; int ret = 0;
@ -1376,7 +1376,7 @@ int bch2_journal_read(struct bch_fs *c, struct journal_start_info *info)
system_unbound_wq, system_unbound_wq,
&jlist.cl); &jlist.cl);
else else
degraded = true; set_bit(JOURNAL_degraded, &c->journal.flags);
} }
while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2)) while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2))
@ -1514,17 +1514,6 @@ int bch2_journal_read(struct bch_fs *c, struct journal_start_info *info)
replicas_entry_add_dev(&replicas.e, ptr->dev); replicas_entry_add_dev(&replicas.e, ptr->dev);
bch2_replicas_entry_sort(&replicas.e); bch2_replicas_entry_sort(&replicas.e);
CLASS(printbuf, buf)();
bch2_replicas_entry_to_text(&buf, &replicas.e);
if (!degraded &&
!bch2_replicas_marked(c, &replicas.e) &&
(le64_to_cpu(i->j.seq) == info->seq_read_start ||
fsck_err(c, journal_entry_replicas_not_marked,
"superblock not marked as containing replicas for journal entry %llu\n%s",
le64_to_cpu(i->j.seq), buf.buf)))
try(bch2_mark_replicas(c, &replicas.e));
} }
fsck_err: fsck_err:
return ret; return ret;

View File

@ -346,25 +346,47 @@ void bch2_journal_update_last_seq(struct journal *j)
} }
} }
void bch2_journal_update_last_seq_ondisk(struct journal *j, u64 last_seq_ondisk) void bch2_journal_update_last_seq_ondisk(struct journal *j, u64 last_seq_ondisk,
bool clean)
{ {
struct bch_fs *c = container_of(j, struct bch_fs, journal);
union bch_replicas_padded replicas;
unsigned nr_refs = 0;
size_t dirty_entry_bytes = 0; size_t dirty_entry_bytes = 0;
scoped_guard(mutex, &j->last_seq_ondisk_lock) scoped_guard(mutex, &j->last_seq_ondisk_lock) {
while (j->last_seq_ondisk < last_seq_ondisk) { for (u64 seq = j->last_seq_ondisk;
struct journal_entry_pin_list *pin_list = journal_seq_pin(j, j->last_seq_ondisk); seq < (clean ? j->pin.back : last_seq_ondisk);
seq++) {
struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
if (pin_list->devs.e.nr_devs) {
if (nr_refs &&
!bch2_replicas_entry_eq(&replicas.e, &pin_list->devs.e)) {
bch2_replicas_entry_put_many(c, &replicas.e, nr_refs);
nr_refs = 0;
}
memcpy(&replicas, &pin_list->devs, replicas_entry_bytes(&pin_list->devs.e));
pin_list->devs.e.nr_devs = 0;
nr_refs++;
}
dirty_entry_bytes += pin_list->bytes; dirty_entry_bytes += pin_list->bytes;
pin_list->bytes = 0; pin_list->bytes = 0;
j->last_seq_ondisk++;
} }
j->last_seq_ondisk = last_seq_ondisk;
}
scoped_guard(spinlock, &j->lock) { scoped_guard(spinlock, &j->lock) {
if (WARN_ON(j->dirty_entry_bytes < dirty_entry_bytes)) if (WARN_ON(j->dirty_entry_bytes < dirty_entry_bytes))
dirty_entry_bytes = j->dirty_entry_bytes; dirty_entry_bytes = j->dirty_entry_bytes;
j->dirty_entry_bytes -= dirty_entry_bytes; j->dirty_entry_bytes -= dirty_entry_bytes;
} }
if (nr_refs)
bch2_replicas_entry_put_many(c, &replicas.e, nr_refs);
} }
bool __bch2_journal_pin_put(struct journal *j, u64 seq) bool __bch2_journal_pin_put(struct journal *j, u64 seq)
@ -975,39 +997,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
try(bch2_journal_error(j)); try(bch2_journal_error(j));
guard(mutex)(&c->replicas_gc_lock); return 0;
bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);
/*
* Now that we've populated replicas_gc, write to the journal to mark
* active journal devices. This handles the case where the journal might
* be empty. Otherwise we could clear all journal replicas and
* temporarily put the fs into an unrecoverable state. Journal recovery
* expects to find devices marked for journal data on unclean mount.
*/
int ret = bch2_journal_meta(&c->journal);
if (ret)
goto err;
seq = 0;
scoped_guard(spinlock, &j->lock)
while (!ret) {
seq = max(seq, j->last_seq);
if (seq > j->seq_ondisk)
break;
union bch_replicas_padded replicas;
memcpy(&replicas, &journal_seq_pin(j, seq)->devs, sizeof(replicas));
seq++;
if (replicas.e.nr_devs) {
spin_unlock(&j->lock);
ret = bch2_mark_replicas(c, &replicas.e);
spin_lock(&j->lock);
}
}
err:
return bch2_replicas_gc_end(c, ret);
} }
bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)

View File

@ -44,7 +44,7 @@ journal_seq_pin(struct journal *j, u64 seq)
} }
void bch2_journal_update_last_seq(struct journal *); void bch2_journal_update_last_seq(struct journal *);
void bch2_journal_update_last_seq_ondisk(struct journal *, u64); void bch2_journal_update_last_seq_ondisk(struct journal *, u64, bool);
bool __bch2_journal_pin_put(struct journal *, u64); bool __bch2_journal_pin_put(struct journal *, u64);
void bch2_journal_pin_put(struct journal *, u64); void bch2_journal_pin_put(struct journal *, u64);

View File

@ -150,6 +150,7 @@ enum journal_space_from {
}; };
#define JOURNAL_FLAGS() \ #define JOURNAL_FLAGS() \
x(degraded) \
x(replay_done) \ x(replay_done) \
x(running) \ x(running) \
x(may_skip_flush) \ x(may_skip_flush) \

View File

@ -196,10 +196,17 @@ static CLOSURE_CALLBACK(journal_write_done)
? j->flush_write_time ? j->flush_write_time
: j->noflush_write_time, j->write_start_time); : j->noflush_write_time, j->write_start_time);
struct bch_replicas_entry_v1 *r = &journal_seq_pin(j, seq)->devs.e;
if (w->had_error) { if (w->had_error) {
struct bch_replicas_entry_v1 *r = &journal_seq_pin(j, seq)->devs.e; bch2_replicas_entry_put(c, r);
r->nr_devs = 0;
}
if (!r->nr_devs && !w->empty) {
bch2_devlist_to_replicas(r, BCH_DATA_journal, w->devs_written); bch2_devlist_to_replicas(r, BCH_DATA_journal, w->devs_written);
err = bch2_replicas_entry_get(c, r);
if (err)
r->nr_devs = 0;
} }
if (!w->devs_written.nr) if (!w->devs_written.nr)
@ -261,7 +268,7 @@ again:
* properly - when the flush completes replicas * properly - when the flush completes replicas
* refs need to have been dropped * refs need to have been dropped
* */ * */
bch2_journal_update_last_seq_ondisk(j, w->last_seq); bch2_journal_update_last_seq_ondisk(j, w->last_seq, w->empty);
last_seq_ondisk_updated = true; last_seq_ondisk_updated = true;
spin_lock(&j->lock); spin_lock(&j->lock);
goto again; goto again;
@ -657,7 +664,6 @@ CLOSURE_CALLBACK(bch2_journal_write)
unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]); unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]);
int ret; int ret;
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
BUG_ON(!w->write_started); BUG_ON(!w->write_started);
BUG_ON(w->write_allocated); BUG_ON(w->write_allocated);
BUG_ON(w->write_done); BUG_ON(w->write_done);
@ -718,15 +724,24 @@ CLOSURE_CALLBACK(bch2_journal_write)
w->devs_written = bch2_bkey_devs(c, bkey_i_to_s_c(&w->key)); w->devs_written = bch2_bkey_devs(c, bkey_i_to_s_c(&w->key));
/* if (!c->sb.clean) {
* Mark journal replicas before we submit the write to guarantee /*
* recovery will find the journal entries after a crash. * Mark journal replicas before we submit the write to guarantee
*/ * recovery will find the journal entries after a crash.
struct bch_replicas_entry_v1 *r = &journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs.e; *
bch2_devlist_to_replicas(r, BCH_DATA_journal, w->devs_written); * If the filesystem is clean, we have to defer this until after
ret = bch2_mark_replicas(c, r); * the write completes, so the filesystem isn't marked dirty
if (ret) * before anything is in the journal:
goto err; */
struct bch_replicas_entry_v1 *r = &journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs.e;
bch2_devlist_to_replicas(r, BCH_DATA_journal, w->devs_written);
ret = bch2_replicas_entry_get(c, r);
if (ret) {
r->nr_devs = 0;
goto err;
}
}
if (c->opts.nochanges) if (c->opts.nochanges)
goto no_io; goto no_io;

View File

@ -108,6 +108,11 @@ static const char * const __bch2_fs_usage_types[] = {
NULL NULL
}; };
const char * const __bch2_rebalance_accounting_types[] = {
BCH_REBALANCE_ACCOUNTING()
NULL
};
#undef x #undef x
static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[], static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[],
@ -132,6 +137,7 @@ PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt);
PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type);
PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type);
PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type); PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type);
PRT_STR_OPT_BOUNDSCHECKED(rebalance_accounting_type, enum bch_rebalance_accounting_type);
static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
struct printbuf *err) struct printbuf *err)
@ -525,7 +531,8 @@ void bch2_opts_to_text(struct printbuf *out,
} }
} }
static int opt_hook_io(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_opt_id id, bool post) static int opt_hook_io(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_opt_id id,
u64 v, bool post)
{ {
if (!test_bit(BCH_FS_started, &c->flags)) if (!test_bit(BCH_FS_started, &c->flags))
return 0; return 0;
@ -544,11 +551,23 @@ static int opt_hook_io(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_
.inum = inum, .inum = inum,
}; };
try(bch2_set_rebalance_needs_scan(c, s)); try(bch2_set_rebalance_needs_scan(c, s, post));
if (post)
bch2_rebalance_wakeup(c);
break; break;
} }
case Opt_metadata_target:
case Opt_metadata_checksum:
case Opt_metadata_replicas:
try(bch2_set_rebalance_needs_scan(c,
(struct rebalance_scan) { .type = REBALANCE_SCAN_metadata, .dev = inum }, post));
break;
case Opt_durability:
if (!post && v > ca->mi.durability)
try(bch2_set_rebalance_needs_scan(c,
(struct rebalance_scan) { .type = REBALANCE_SCAN_pending}, post));
try(bch2_set_rebalance_needs_scan(c,
(struct rebalance_scan) { .type = REBALANCE_SCAN_device, .dev = inum }, post));
break;
default: default:
break; break;
} }
@ -578,7 +597,7 @@ int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum b
} }
if (change) if (change)
try(opt_hook_io(c, ca, inum, id, false)); try(opt_hook_io(c, ca, inum, id, v, false));
return 0; return 0;
} }
@ -594,7 +613,7 @@ int bch2_opts_hooks_pre_set(struct bch_fs *c)
void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum,
enum bch_opt_id id, u64 v) enum bch_opt_id id, u64 v)
{ {
opt_hook_io(c, ca, inum, id, true); opt_hook_io(c, ca, inum, id, v, true);
switch (id) { switch (id) {
case Opt_rebalance_enabled: case Opt_rebalance_enabled:

View File

@ -25,6 +25,7 @@ extern const char * const __bch2_str_hash_types[];
extern const char * const bch2_str_hash_opts[]; extern const char * const bch2_str_hash_opts[];
extern const char * const __bch2_data_types[]; extern const char * const __bch2_data_types[];
extern const char * const bch2_member_states[]; extern const char * const bch2_member_states[];
extern const char * const __bch2_rebalance_accounting_types[];
extern const char * const bch2_d_types[]; extern const char * const bch2_d_types[];
void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type); void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type);
@ -34,6 +35,7 @@ void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt);
void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type);
void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type);
void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type); void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type);
void bch2_prt_rebalance_accounting_type(struct printbuf *, enum bch_rebalance_accounting_type);
static inline const char *bch2_d_type_str(unsigned d_type) static inline const char *bch2_d_type_str(unsigned d_type)
{ {

View File

@ -256,18 +256,10 @@ const struct bch_sb_field_ops bch_sb_field_ops_clean = {
.to_text = bch2_sb_clean_to_text, .to_text = bch2_sb_clean_to_text,
}; };
int bch2_fs_mark_dirty(struct bch_fs *c) void bch2_fs_mark_dirty(struct bch_fs *c)
{ {
/*
* Unconditionally write superblock, to verify it hasn't changed before
* we go rw:
*/
guard(mutex)(&c->sb_lock);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false); SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
return bch2_write_super(c);
} }
void bch2_fs_mark_clean(struct bch_fs *c) void bch2_fs_mark_clean(struct bch_fs *c)
@ -277,7 +269,6 @@ void bch2_fs_mark_clean(struct bch_fs *c)
unsigned u64s; unsigned u64s;
int ret; int ret;
guard(mutex)(&c->sb_lock);
if (BCH_SB_CLEAN(c->disk_sb.sb)) if (BCH_SB_CLEAN(c->disk_sb.sb))
return; return;
@ -321,6 +312,4 @@ void bch2_fs_mark_clean(struct bch_fs *c)
} }
bch2_journal_pos_from_member_info_set(c); bch2_journal_pos_from_member_info_set(c);
bch2_write_super(c);
} }

View File

@ -10,7 +10,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **
extern const struct bch_sb_field_ops bch_sb_field_ops_clean; extern const struct bch_sb_field_ops bch_sb_field_ops_clean;
int bch2_fs_mark_dirty(struct bch_fs *); void bch2_fs_mark_dirty(struct bch_fs *);
void bch2_fs_mark_clean(struct bch_fs *); void bch2_fs_mark_clean(struct bch_fs *);
#endif /* _BCACHEFS_SB_CLEAN_H */ #endif /* _BCACHEFS_SB_CLEAN_H */

View File

@ -110,7 +110,16 @@
BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)\ BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)\
x(btree_node_accounting, \ x(btree_node_accounting, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
BCH_FSCK_ERR_accounting_mismatch) BCH_FSCK_ERR_accounting_mismatch) \
x(rebalance_v2, \
BIT_ULL(BCH_RECOVERY_PASS_check_rebalance_work), \
BCH_FSCK_ERR_accounting_mismatch, \
BCH_FSCK_ERR_extent_io_opts_not_set)
#define UPGRADE_TABLE_INCOMPAT() \
x(rebalance_v2, \
BIT_ULL(BCH_RECOVERY_PASS_check_rebalance_work), \
BCH_FSCK_ERR_extent_io_opts_not_set)
#define DOWNGRADE_TABLE() \ #define DOWNGRADE_TABLE() \
x(bucket_stripe_sectors, \ x(bucket_stripe_sectors, \
@ -175,17 +184,32 @@ struct upgrade_downgrade_entry {
UPGRADE_TABLE() UPGRADE_TABLE()
#undef x #undef x
#define x(ver, passes, ...) static const u16 upgrade_incompat_##ver##_errors[] = { __VA_ARGS__ };
UPGRADE_TABLE_INCOMPAT()
#undef x
static const struct upgrade_downgrade_entry upgrade_table[] = { static const struct upgrade_downgrade_entry upgrade_table[] = {
#define x(ver, passes, ...) { \ #define x(ver, passes, ...) { \
.recovery_passes = passes, \ .recovery_passes = passes, \
.version = bcachefs_metadata_version_##ver,\ .version = bcachefs_metadata_version_##ver, \
.nr_errors = ARRAY_SIZE(upgrade_##ver##_errors), \ .nr_errors = ARRAY_SIZE(upgrade_##ver##_errors), \
.errors = upgrade_##ver##_errors, \ .errors = upgrade_##ver##_errors, \
}, },
UPGRADE_TABLE() UPGRADE_TABLE()
#undef x #undef x
}; };
static const struct upgrade_downgrade_entry upgrade_table_incompat[] = {
#define x(ver, passes, ...) { \
.recovery_passes = passes, \
.version = bcachefs_metadata_version_##ver, \
.nr_errors = ARRAY_SIZE(upgrade_incompat_##ver##_errors), \
.errors = upgrade_incompat_##ver##_errors, \
},
UPGRADE_TABLE_INCOMPAT()
#undef x
};
static int have_stripes(struct bch_fs *c) static int have_stripes(struct bch_fs *c)
{ {
if (IS_ERR_OR_NULL(c->btree_roots_known[BTREE_ID_stripes].b)) if (IS_ERR_OR_NULL(c->btree_roots_known[BTREE_ID_stripes].b))
@ -219,17 +243,17 @@ int bch2_sb_set_upgrade_extra(struct bch_fs *c)
return ret < 0 ? ret : 0; return ret < 0 ? ret : 0;
} }
void bch2_sb_set_upgrade(struct bch_fs *c, static void __bch2_sb_set_upgrade(struct bch_fs *c,
unsigned old_version, unsigned old_version,
unsigned new_version) unsigned new_version,
const struct upgrade_downgrade_entry *table,
size_t nr_entries)
{ {
lockdep_assert_held(&c->sb_lock); lockdep_assert_held(&c->sb_lock);
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
for (const struct upgrade_downgrade_entry *i = upgrade_table; for (const struct upgrade_downgrade_entry *i = table; i < table + nr_entries; i++)
i < upgrade_table + ARRAY_SIZE(upgrade_table);
i++)
if (i->version > old_version && i->version <= new_version) { if (i->version > old_version && i->version <= new_version) {
u64 passes = i->recovery_passes; u64 passes = i->recovery_passes;
@ -245,6 +269,24 @@ void bch2_sb_set_upgrade(struct bch_fs *c,
} }
} }
void bch2_sb_set_upgrade(struct bch_fs *c,
unsigned old_version,
unsigned new_version)
{
return __bch2_sb_set_upgrade(c, old_version, new_version,
upgrade_table,
ARRAY_SIZE(upgrade_table));
}
void bch2_sb_set_upgrade_incompat(struct bch_fs *c,
unsigned old_version,
unsigned new_version)
{
return __bch2_sb_set_upgrade(c, old_version, new_version,
upgrade_table_incompat,
ARRAY_SIZE(upgrade_table_incompat));
}
#define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ }; #define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ };
DOWNGRADE_TABLE() DOWNGRADE_TABLE()
#undef x #undef x

View File

@ -6,6 +6,7 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade;
int bch2_sb_downgrade_update(struct bch_fs *); int bch2_sb_downgrade_update(struct bch_fs *);
void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned); void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned);
void bch2_sb_set_upgrade_incompat(struct bch_fs *, unsigned, unsigned);
int bch2_sb_set_upgrade_extra(struct bch_fs *); int bch2_sb_set_upgrade_extra(struct bch_fs *);
void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned); void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned);

View File

@ -160,6 +160,10 @@ enum bch_fsck_flags {
x(extent_ptrs_redundant_stripe, 139, 0) \ x(extent_ptrs_redundant_stripe, 139, 0) \
x(extent_ptrs_unwritten, 140, 0) \ x(extent_ptrs_unwritten, 140, 0) \
x(extent_ptrs_written_and_unwritten, 141, 0) \ x(extent_ptrs_written_and_unwritten, 141, 0) \
x(extent_ptrs_all_invalid, 338, 0) \
x(extent_rebalance_bad_pending, 332, 0) \
x(extent_rebalance_bad_hipri, 333, 0) \
x(extent_rebalance_bad_replicas, 339, 0) \
x(ptr_to_invalid_device, 142, 0) \ x(ptr_to_invalid_device, 142, 0) \
x(ptr_to_removed_device, 322, FSCK_AUTOFIX) \ x(ptr_to_removed_device, 322, FSCK_AUTOFIX) \
x(ptr_to_duplicate_device, 143, 0) \ x(ptr_to_duplicate_device, 143, 0) \
@ -339,9 +343,15 @@ enum bch_fsck_flags {
x(dirent_cf_name_too_big, 304, 0) \ x(dirent_cf_name_too_big, 304, 0) \
x(dirent_stray_data_after_cf_name, 305, 0) \ x(dirent_stray_data_after_cf_name, 305, 0) \
x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \
x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \
x(validate_error_in_commit, 329, 0) \ x(validate_error_in_commit, 329, 0) \
x(MAX, 330, 0) x(extent_io_opts_not_set, 330, FSCK_AUTOFIX) \
x(extent_io_opts_unneeded, 331, FSCK_AUTOFIX) \
x(rebalance_bp_to_missing_btree_ptr, 310, FSCK_AUTOFIX) \
x(rebalance_bp_to_leaf_node_key, 334, FSCK_AUTOFIX) \
x(btree_ptr_with_no_rebalance_bp, 335, FSCK_AUTOFIX) \
x(btree_ptr_with_bad_rebalance_bp, 336, FSCK_AUTOFIX) \
x(btree_ptr_to_bad_rebalance_bp, 337, FSCK_AUTOFIX) \
x(MAX, 340, 0)
enum bch_sb_error_id { enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n, #define x(t, n, ...) BCH_FSCK_ERR_##t = n,

View File

@ -1024,6 +1024,11 @@ int bch2_write_super(struct bch_fs *c)
closure_init_stack(cl); closure_init_stack(cl);
memset(&sb_written, 0, sizeof(sb_written)); memset(&sb_written, 0, sizeof(sb_written));
if (bch2_sb_has_journal(c->disk_sb.sb))
bch2_fs_mark_dirty(c);
else
bch2_fs_mark_clean(c);
/* /*
* Note: we do writes to RO devices here, and we might want to change * Note: we do writes to RO devices here, and we might want to change
* that in the future. * that in the future.
@ -1280,6 +1285,8 @@ void bch2_sb_upgrade_incompat(struct bch_fs *c)
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), c->sb.version)); max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), c->sb.version));
bch2_sb_set_upgrade_incompat(c, c->sb.version_incompat_allowed, c->sb.version);
bch2_write_super(c); bch2_write_super(c);
} }

View File

@ -115,12 +115,12 @@ static int bch2_write_inode_trans(struct btree_trans *trans,
struct bch_inode_unpacked inode_u; struct bch_inode_unpacked inode_u;
try(bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent)); try(bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent));
struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u); struct bch_extent_rebalance_v2 old_r = bch2_inode_rebalance_opts_get(c, &inode_u);
if (set) if (set)
try(set(trans, inode, &inode_u, p)); try(set(trans, inode, &inode_u, p));
struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u); struct bch_extent_rebalance_v2 new_r = bch2_inode_rebalance_opts_get(c, &inode_u);
*rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r)); *rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r));
if (*rebalance_changed) if (*rebalance_changed)
try(bch2_set_rebalance_needs_scan_trans(trans, try(bch2_set_rebalance_needs_scan_trans(trans,