Update bcachefs sources to fe72e70682 bcachefs: Fix for btree_gc repairing interior btree ptrs

Kent Overstreet 2021-04-15 13:05:38 -04:00
parent 8ba5e814fd
commit ceac31bcb6
31 changed files with 774 additions and 1373 deletions

View File

@@ -1 +1 @@
-8eca47e4d5c4e6817ad4c020be4280bd82104efd
+fe72e70682cd2430a099c08c3135253675030d28

View File

@@ -90,6 +90,7 @@ do {								\
 	__wait_event(wq, condition);					\
 } while (0)
 
+#define wait_event_freezable(wq, condition)	({wait_event(wq, condition); 0; })
 #define wait_event_killable(wq, condition)	({wait_event(wq, condition); 0; })
 #define wait_event_interruptible(wq, condition)	({wait_event(wq, condition); 0; })

View File

@ -353,28 +353,6 @@ DEFINE_EVENT(btree_node, btree_set_root,
/* Garbage collection */ /* Garbage collection */
DEFINE_EVENT(btree_node, btree_gc_coalesce,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
TRACE_EVENT(btree_gc_coalesce_fail,
TP_PROTO(struct bch_fs *c, int reason),
TP_ARGS(c, reason),
TP_STRUCT__entry(
__field(u8, reason )
__array(char, uuid, 16 )
),
TP_fast_assign(
__entry->reason = reason;
memcpy(__entry->uuid, c->disk_sb.sb->user_uuid.b, 16);
),
TP_printk("%pU: %u", __entry->uuid, __entry->reason)
);
DEFINE_EVENT(btree_node, btree_gc_rewrite_node, DEFINE_EVENT(btree_node, btree_gc_rewrite_node,
TP_PROTO(struct bch_fs *c, struct btree *b), TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b) TP_ARGS(c, b)
@ -395,16 +373,6 @@ DEFINE_EVENT(bch_fs, gc_end,
TP_ARGS(c) TP_ARGS(c)
); );
DEFINE_EVENT(bch_fs, gc_coalesce_start,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
);
DEFINE_EVENT(bch_fs, gc_coalesce_end,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
);
DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, DEFINE_EVENT(bch_fs, gc_cannot_inc_gens,
TP_PROTO(struct bch_fs *c), TP_PROTO(struct bch_fs *c),
TP_ARGS(c) TP_ARGS(c)
@@ -412,24 +380,27 @@ DEFINE_EVENT(bch_fs, gc_cannot_inc_gens,
 
 /* Allocator */
 
-TRACE_EVENT(alloc_batch,
-	TP_PROTO(struct bch_dev *ca, size_t free, size_t total),
-	TP_ARGS(ca, free, total),
+TRACE_EVENT(alloc_scan,
+	TP_PROTO(struct bch_dev *ca, u64 found, u64 inc_gen, u64 inc_gen_skipped),
+	TP_ARGS(ca, found, inc_gen, inc_gen_skipped),
 
 	TP_STRUCT__entry(
-		__array(char,		uuid,	16	)
-		__field(size_t,		free		)
-		__field(size_t,		total		)
+		__field(dev_t,		dev			)
+		__field(u64,		found			)
+		__field(u64,		inc_gen			)
+		__field(u64,		inc_gen_skipped		)
 	),
 
 	TP_fast_assign(
-		memcpy(__entry->uuid, ca->uuid.b, 16);
-		__entry->free		= free;
-		__entry->total		= total;
+		__entry->dev			= ca->disk_sb.bdev->bd_dev;
+		__entry->found			= found;
+		__entry->inc_gen		= inc_gen;
+		__entry->inc_gen_skipped	= inc_gen_skipped;
 	),
 
-	TP_printk("%pU free %zu total %zu",
-		  __entry->uuid, __entry->free, __entry->total)
+	TP_printk("%d,%d found %llu inc_gen %llu inc_gen_skipped %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->found, __entry->inc_gen, __entry->inc_gen_skipped)
 );
 
 TRACE_EVENT(invalidate,
@@ -449,13 +420,10 @@ TRACE_EVENT(invalidate,
 	),
 
 	TP_printk("invalidated %u sectors at %d,%d sector=%llu",
-		  __entry->sectors, MAJOR(__entry->dev),
-		  MINOR(__entry->dev), __entry->offset)
-);
-
-DEFINE_EVENT(bch_fs, rescale_prios,
-	TP_PROTO(struct bch_fs *c),
-	TP_ARGS(c)
+		  __entry->sectors,
+		  MAJOR(__entry->dev),
+		  MINOR(__entry->dev),
+		  __entry->offset)
 );
 
 DECLARE_EVENT_CLASS(bucket_alloc,
@@ -463,16 +431,18 @@ DECLARE_EVENT_CLASS(bucket_alloc,
 	TP_ARGS(ca, reserve),
 
 	TP_STRUCT__entry(
-		__array(char,			uuid,	16)
+		__field(dev_t,			dev	)
 		__field(enum alloc_reserve,	reserve	)
 	),
 
 	TP_fast_assign(
-		memcpy(__entry->uuid, ca->uuid.b, 16);
+		__entry->dev		= ca->disk_sb.bdev->bd_dev;
 		__entry->reserve	= reserve;
 	),
 
-	TP_printk("%pU reserve %d", __entry->uuid, __entry->reserve)
+	TP_printk("%d,%d reserve %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->reserve)
 );
 
 DEFINE_EVENT(bucket_alloc, bucket_alloc,
@@ -598,77 +568,93 @@ DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused,
 TRACE_EVENT(trans_restart_would_deadlock,
 	TP_PROTO(unsigned long	trans_ip,
 		 unsigned long	caller_ip,
+		 bool		in_traverse_all,
 		 unsigned	reason,
 		 enum btree_id	have_btree_id,
 		 unsigned	have_iter_type,
+		 struct bpos	*have_pos,
 		 enum btree_id	want_btree_id,
-		 unsigned	want_iter_type),
-	TP_ARGS(trans_ip, caller_ip, reason,
-		have_btree_id, have_iter_type,
-		want_btree_id, want_iter_type),
+		 unsigned	want_iter_type,
+		 struct bpos	*want_pos),
+	TP_ARGS(trans_ip, caller_ip, in_traverse_all, reason,
+		have_btree_id, have_iter_type, have_pos,
+		want_btree_id, want_iter_type, want_pos),
 
 	TP_STRUCT__entry(
 		__field(unsigned long,		trans_ip	)
 		__field(unsigned long,		caller_ip	)
+		__field(u8,			in_traverse_all	)
 		__field(u8,			reason		)
 		__field(u8,			have_btree_id	)
 		__field(u8,			have_iter_type	)
 		__field(u8,			want_btree_id	)
 		__field(u8,			want_iter_type	)
+		__field(u64,			have_pos_inode	)
+		__field(u64,			have_pos_offset	)
+		__field(u32,			have_pos_snapshot)
+		__field(u32,			want_pos_snapshot)
+		__field(u64,			want_pos_inode	)
+		__field(u64,			want_pos_offset	)
 	),
 
 	TP_fast_assign(
 		__entry->trans_ip		= trans_ip;
 		__entry->caller_ip		= caller_ip;
+		__entry->in_traverse_all	= in_traverse_all;
 		__entry->reason			= reason;
 		__entry->have_btree_id		= have_btree_id;
 		__entry->have_iter_type		= have_iter_type;
 		__entry->want_btree_id		= want_btree_id;
 		__entry->want_iter_type		= want_iter_type;
+		__entry->have_pos_inode		= have_pos->inode;
+		__entry->have_pos_offset	= have_pos->offset;
+		__entry->have_pos_snapshot	= have_pos->snapshot;
+		__entry->want_pos_inode		= want_pos->inode;
+		__entry->want_pos_offset	= want_pos->offset;
+		__entry->want_pos_snapshot	= want_pos->snapshot;
 	),
 
-	TP_printk("%ps %pS because %u have %u:%u want %u:%u",
+	TP_printk("%ps %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u",
 		  (void *) __entry->trans_ip,
 		  (void *) __entry->caller_ip,
+		  __entry->in_traverse_all,
 		  __entry->reason,
 		  __entry->have_btree_id,
 		  __entry->have_iter_type,
+		  __entry->have_pos_inode,
+		  __entry->have_pos_offset,
+		  __entry->have_pos_snapshot,
 		  __entry->want_btree_id,
-		  __entry->want_iter_type)
-);
-
-TRACE_EVENT(trans_restart_iters_realloced,
-	TP_PROTO(unsigned long ip, unsigned nr),
-	TP_ARGS(ip, nr),
-
-	TP_STRUCT__entry(
-		__field(unsigned long,		ip	)
-		__field(unsigned,		nr	)
-	),
-
-	TP_fast_assign(
-		__entry->ip	= ip;
-		__entry->nr	= nr;
-	),
-
-	TP_printk("%ps nr %u", (void *) __entry->ip, __entry->nr)
+		  __entry->want_iter_type,
+		  __entry->want_pos_inode,
+		  __entry->want_pos_offset,
+		  __entry->want_pos_snapshot)
 );
 
 TRACE_EVENT(trans_restart_mem_realloced,
-	TP_PROTO(unsigned long ip, unsigned long bytes),
-	TP_ARGS(ip, bytes),
+	TP_PROTO(unsigned long trans_ip, unsigned long caller_ip,
+		 unsigned long bytes),
+	TP_ARGS(trans_ip, caller_ip, bytes),
 
 	TP_STRUCT__entry(
-		__field(unsigned long,		ip	)
-		__field(unsigned long,		bytes	)
+		__field(unsigned long,		trans_ip	)
+		__field(unsigned long,		caller_ip	)
+		__field(unsigned long,		bytes		)
 	),
 
 	TP_fast_assign(
-		__entry->ip	= ip;
-		__entry->bytes	= bytes;
+		__entry->trans_ip	= trans_ip;
+		__entry->caller_ip	= caller_ip;
+		__entry->bytes		= bytes;
 	),
 
-	TP_printk("%ps bytes %lu", (void *) __entry->ip, __entry->bytes)
+	TP_printk("%ps %pS bytes %lu",
+		  (void *) __entry->trans_ip,
+		  (void *) __entry->caller_ip,
+		  __entry->bytes)
 );
 
 DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get,
@ -726,6 +712,11 @@ DEFINE_EVENT(transaction_restart, trans_restart_traverse,
TP_ARGS(ip) TP_ARGS(ip)
); );
DEFINE_EVENT(transaction_restart, trans_traverse_all,
TP_PROTO(unsigned long ip),
TP_ARGS(ip)
);
DECLARE_EVENT_CLASS(node_lock_fail, DECLARE_EVENT_CLASS(node_lock_fail,
TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
TP_ARGS(level, iter_seq, node, node_seq), TP_ARGS(level, iter_seq, node, node_seq),
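
Note: several tracepoints changed in this file (alloc_scan, the bucket_alloc class) stop logging the 16-byte member UUID and instead log the block device's dev_t (ca->disk_sb.bdev->bd_dev), printed as "major,minor" via MAJOR()/MINOR(). As a minimal userspace illustration of that formatting only, not part of the patch (the device number below is made up):

```c
#include <stdio.h>
#include <sys/sysmacros.h>
#include <sys/types.h>

int main(void)
{
	/* Hypothetical device number standing in for what bd_dev holds in-kernel */
	dev_t dev = makedev(8, 16);

	/* Same "%d,%d" layout the new TP_printk() format strings produce */
	printf("%u,%u\n", major(dev), minor(dev));	/* prints "8,16" */
	return 0;
}
```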

View File

@ -25,44 +25,19 @@
#include <linux/sort.h> #include <linux/sort.h>
#include <trace/events/bcachefs.h> #include <trace/events/bcachefs.h>
const char * const bch2_allocator_states[] = {
#define x(n) #n,
ALLOC_THREAD_STATES()
#undef x
NULL
};
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, #define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
BCH_ALLOC_FIELDS_V1() BCH_ALLOC_FIELDS_V1()
#undef x #undef x
}; };
/* Ratelimiting/PD controllers */
static void pd_controllers_update(struct work_struct *work)
{
struct bch_fs *c = container_of(to_delayed_work(work),
struct bch_fs,
pd_controllers_update);
struct bch_dev *ca;
s64 free = 0, fragmented = 0;
unsigned i;
for_each_member_device(ca, c, i) {
struct bch_dev_usage stats = bch2_dev_usage_read(ca);
free += bucket_to_sector(ca,
__dev_buckets_available(ca, stats)) << 9;
/*
* Bytes of internal fragmentation, which can be
* reclaimed by copy GC
*/
fragmented += max_t(s64, 0, (bucket_to_sector(ca,
stats.d[BCH_DATA_user].buckets +
stats.d[BCH_DATA_cached].buckets) -
(stats.d[BCH_DATA_user].sectors +
stats.d[BCH_DATA_cached].sectors)) << 9);
}
bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1);
schedule_delayed_work(&c->pd_controllers_update,
c->pd_controllers_update_seconds * HZ);
}
/* Persistent alloc info: */ /* Persistent alloc info: */
static inline u64 alloc_field_v1_get(const struct bch_alloc *a, static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
@@ -234,7 +209,7 @@ void bch2_alloc_pack(struct bch_fs *c,
 		bch2_alloc_pack_v2(dst, src);
 }
 
-static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
+static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
 {
 	unsigned i, bytes = offsetof(struct bch_alloc, data);
@@ -254,7 +229,7 @@ const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k)
 		return "invalid device";
 
 	/* allow for unknown fields */
-	if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v))
+	if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v))
 		return "incorrect value size";
 
 	return NULL;
@@ -279,9 +254,9 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
 {
 	struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
 
-	pr_buf(out, "gen %u oldest_gen %u data_type %u",
-	       u.gen, u.oldest_gen, u.data_type);
-#define x(_name, ...)	pr_buf(out, #_name " %llu ", (u64) u._name);
+	pr_buf(out, "gen %u oldest_gen %u data_type %s",
+	       u.gen, u.oldest_gen, bch2_data_types[u.data_type]);
+#define x(_name, ...)	pr_buf(out, " " #_name " %llu", (u64) u._name);
 	BCH_ALLOC_FIELDS_V2()
 #undef x
 }
@ -322,7 +297,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_alloc, ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_alloc,
NULL, bch2_alloc_read_fn); NULL, bch2_alloc_read_fn);
up_read(&c->gc_lock); up_read(&c->gc_lock);
if (ret) { if (ret) {
bch_err(c, "error reading alloc info: %i", ret); bch_err(c, "error reading alloc info: %i", ret);
return ret; return ret;
@ -467,52 +441,6 @@ out:
* commands to the newly free buckets, then puts them on the various freelists. * commands to the newly free buckets, then puts them on the various freelists.
*/ */
/**
* wait_buckets_available - wait on reclaimable buckets
*
* If there aren't enough available buckets to fill up free_inc, wait until
* there are.
*/
static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
{
unsigned long gc_count = c->gc_count;
s64 available;
int ret = 0;
ca->allocator_state = ALLOCATOR_BLOCKED;
closure_wake_up(&c->freelist_wait);
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
if (kthread_should_stop()) {
ret = 1;
break;
}
if (gc_count != c->gc_count)
ca->inc_gen_really_needs_gc = 0;
available = dev_buckets_reclaimable(ca);
available -= ca->inc_gen_really_needs_gc;
available = max(available, 0LL);
if (available)
break;
up_read(&c->gc_lock);
schedule();
try_to_freeze();
down_read(&c->gc_lock);
}
__set_current_state(TASK_RUNNING);
ca->allocator_state = ALLOCATOR_RUNNING;
closure_wake_up(&c->freelist_wait);
return ret;
}
static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
struct bucket_mark m) struct bucket_mark m)
{ {
@@ -530,11 +458,8 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
 
 	gc_gen = bucket_gc_gen(bucket(ca, b));
 
-	if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
-		ca->inc_gen_needs_gc++;
-
-	if (gc_gen >= BUCKET_GC_GEN_MAX)
-		ca->inc_gen_really_needs_gc++;
+	ca->inc_gen_needs_gc		+= gc_gen >= BUCKET_GC_GEN_MAX / 2;
+	ca->inc_gen_really_needs_gc	+= gc_gen >= BUCKET_GC_GEN_MAX;
 
 	return gc_gen < BUCKET_GC_GEN_MAX;
 }
@ -611,6 +536,8 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
struct bucket_mark m = READ_ONCE(g->mark); struct bucket_mark m = READ_ONCE(g->mark);
unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
cond_resched();
if (!bch2_can_invalidate_bucket(ca, b, m)) if (!bch2_can_invalidate_bucket(ca, b, m))
continue; continue;
@ -627,8 +554,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
.key = key, .key = key,
}; };
} }
cond_resched();
} }
if (e.nr) if (e.nr)
@ -721,6 +646,7 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
size_t i, nr = 0; size_t i, nr = 0;
ca->inc_gen_needs_gc = 0; ca->inc_gen_needs_gc = 0;
ca->inc_gen_really_needs_gc = 0;
switch (ca->mi.replacement) { switch (ca->mi.replacement) {
case BCH_CACHE_REPLACEMENT_lru: case BCH_CACHE_REPLACEMENT_lru:
@ -742,25 +668,6 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
return nr; return nr;
} }
static inline long next_alloc_bucket(struct bch_dev *ca)
{
struct alloc_heap_entry e, *top = ca->alloc_heap.data;
while (ca->alloc_heap.used) {
if (top->nr) {
size_t b = top->bucket;
top->bucket++;
top->nr--;
return b;
}
heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
}
return -1;
}
/* /*
* returns sequence number of most recent journal entry that updated this * returns sequence number of most recent journal entry that updated this
* bucket: * bucket:
@ -783,17 +690,56 @@ static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m)
} }
} }
static int bch2_invalidate_one_bucket2(struct btree_trans *trans, static int bucket_invalidate_btree(struct btree_trans *trans,
struct bch_dev *ca, struct bch_dev *ca, u64 b)
struct btree_iter *iter,
u64 *journal_seq, unsigned flags)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct bkey_alloc_buf a; struct bkey_alloc_buf *a;
struct bkey_alloc_unpacked u; struct bkey_alloc_unpacked u;
struct bucket *g; struct bucket *g;
struct bucket_mark m; struct bucket_mark m;
bool invalidating_cached_data; struct btree_iter *iter =
bch2_trans_get_iter(trans, BTREE_ID_alloc,
POS(ca->dev_idx, b),
BTREE_ITER_CACHED|
BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_INTENT);
int ret;
a = bch2_trans_kmalloc(trans, sizeof(*a));
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto err;
ret = bch2_btree_iter_traverse(iter);
if (ret)
goto err;
percpu_down_read(&c->mark_lock);
g = bucket(ca, b);
m = READ_ONCE(g->mark);
u = alloc_mem_to_key(iter, g, m);
percpu_up_read(&c->mark_lock);
u.gen++;
u.data_type = 0;
u.dirty_sectors = 0;
u.cached_sectors = 0;
u.read_time = atomic64_read(&c->io_clock[READ].now);
u.write_time = atomic64_read(&c->io_clock[WRITE].now);
bch2_alloc_pack(c, a, u);
bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_BUCKET_INVALIDATE);
err:
bch2_trans_iter_put(trans, iter);
return ret;
}
static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
u64 *journal_seq, unsigned flags)
{
struct bucket *g;
struct bucket_mark m;
size_t b; size_t b;
int ret = 0; int ret = 0;
@ -808,7 +754,7 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
BUG_ON(m.dirty_sectors); BUG_ON(m.dirty_sectors);
bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); bch2_mark_alloc_bucket(c, ca, b, true);
spin_lock(&c->freelist_lock); spin_lock(&c->freelist_lock);
verify_not_on_freelist(c, ca, b); verify_not_on_freelist(c, ca, b);
@ -839,48 +785,12 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
goto out; goto out;
} }
bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); ret = bch2_trans_do(c, NULL, journal_seq,
retry: BTREE_INSERT_NOCHECK_RW|
ret = bch2_btree_iter_traverse(iter); BTREE_INSERT_NOFAIL|
if (ret) BTREE_INSERT_JOURNAL_RESERVED|
return ret; flags,
bucket_invalidate_btree(&trans, ca, b));
percpu_down_read(&c->mark_lock);
g = bucket(ca, iter->pos.offset);
m = READ_ONCE(g->mark);
u = alloc_mem_to_key(iter, g, m);
percpu_up_read(&c->mark_lock);
invalidating_cached_data = u.cached_sectors != 0;
u.gen++;
u.data_type = 0;
u.dirty_sectors = 0;
u.cached_sectors = 0;
u.read_time = atomic64_read(&c->io_clock[READ].now);
u.write_time = atomic64_read(&c->io_clock[WRITE].now);
bch2_alloc_pack(c, &a, u);
bch2_trans_update(trans, iter, &a.k,
BTREE_TRIGGER_BUCKET_INVALIDATE);
/*
* XXX:
* when using deferred btree updates, we have journal reclaim doing
* btree updates and thus requiring the allocator to make forward
* progress, and here the allocator is requiring space in the journal -
* so we need a journal pre-reservation:
*/
ret = bch2_trans_commit(trans, NULL,
invalidating_cached_data ? journal_seq : NULL,
BTREE_INSERT_NOUNLOCK|
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_JOURNAL_RESERVED|
flags);
if (ret == -EINTR)
goto retry;
out: out:
if (!ret) { if (!ret) {
/* remove from alloc_heap: */ /* remove from alloc_heap: */
@ -905,8 +815,7 @@ out:
percpu_down_read(&c->mark_lock); percpu_down_read(&c->mark_lock);
spin_lock(&c->freelist_lock); spin_lock(&c->freelist_lock);
bch2_mark_alloc_bucket(c, ca, b, false, bch2_mark_alloc_bucket(c, ca, b, false);
gc_pos_alloc(c, NULL), 0);
BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
BUG_ON(b != b2); BUG_ON(b != b2);
@ -923,29 +832,23 @@ out:
*/ */
static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
{ {
struct btree_trans trans;
struct btree_iter *iter;
u64 journal_seq = 0; u64 journal_seq = 0;
int ret = 0; int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc,
POS(ca->dev_idx, 0),
BTREE_ITER_CACHED|
BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_INTENT);
/* Only use nowait if we've already invalidated at least one bucket: */ /* Only use nowait if we've already invalidated at least one bucket: */
while (!ret && while (!ret &&
!fifo_full(&ca->free_inc) && !fifo_full(&ca->free_inc) &&
ca->alloc_heap.used) ca->alloc_heap.used) {
ret = bch2_invalidate_one_bucket2(&trans, ca, iter, &journal_seq, ret = bch2_invalidate_one_bucket(c, ca, &journal_seq,
BTREE_INSERT_GC_LOCK_HELD|
(!fifo_empty(&ca->free_inc) (!fifo_empty(&ca->free_inc)
? BTREE_INSERT_NOWAIT : 0)); ? BTREE_INSERT_NOWAIT : 0));
/*
bch2_trans_iter_put(&trans, iter); * We only want to batch up invalidates when they're going to
bch2_trans_exit(&trans); * require flushing the journal:
*/
if (!journal_seq)
break;
}
/* If we used NOWAIT, don't return the error: */ /* If we used NOWAIT, don't return the error: */
if (!fifo_empty(&ca->free_inc)) if (!fifo_empty(&ca->free_inc))
@ -965,83 +868,72 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
return 0; return 0;
} }
static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state)
{
if (ca->allocator_state != new_state) {
ca->allocator_state = new_state;
closure_wake_up(&ca->fs->freelist_wait);
}
}
static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
{ {
unsigned i; unsigned i;
int ret = 0; int ret = 0;
while (1) { spin_lock(&c->freelist_lock);
set_current_state(TASK_INTERRUPTIBLE); for (i = 0; i < RESERVE_NR; i++) {
/*
* Don't strand buckets on the copygc freelist until
* after recovery is finished:
*/
if (i == RESERVE_MOVINGGC &&
!test_bit(BCH_FS_STARTED, &c->flags))
continue;
spin_lock(&c->freelist_lock); if (fifo_push(&ca->free[i], b)) {
for (i = 0; i < RESERVE_NR; i++) { fifo_pop(&ca->free_inc, b);
/*
* Don't strand buckets on the copygc freelist until
* after recovery is finished:
*/
if (!test_bit(BCH_FS_STARTED, &c->flags) &&
i == RESERVE_MOVINGGC)
continue;
if (fifo_push(&ca->free[i], bucket)) {
fifo_pop(&ca->free_inc, bucket);
closure_wake_up(&c->freelist_wait);
ca->allocator_state = ALLOCATOR_RUNNING;
spin_unlock(&c->freelist_lock);
goto out;
}
}
if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) {
ca->allocator_state = ALLOCATOR_BLOCKED_FULL;
closure_wake_up(&c->freelist_wait);
}
spin_unlock(&c->freelist_lock);
if ((current->flags & PF_KTHREAD) &&
kthread_should_stop()) {
ret = 1; ret = 1;
break; break;
} }
schedule();
try_to_freeze();
} }
out: spin_unlock(&c->freelist_lock);
__set_current_state(TASK_RUNNING);
ca->allocator_state = ret
? ALLOCATOR_running
: ALLOCATOR_blocked_full;
closure_wake_up(&c->freelist_wait);
return ret; return ret;
} }
/* static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
* Pulls buckets off free_inc, discards them (if enabled), then adds them to
* freelists, waiting until there's room if necessary:
*/
static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
{ {
while (!fifo_empty(&ca->free_inc)) { if (ca->mi.discard &&
size_t bucket = fifo_peek(&ca->free_inc); blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b),
if (ca->mi.discard && ca->mi.bucket_size, GFP_NOFS, 0);
blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
blkdev_issue_discard(ca->disk_sb.bdev,
bucket_to_sector(ca, bucket),
ca->mi.bucket_size, GFP_NOIO, 0);
if (push_invalidated_bucket(c, ca, bucket))
return 1;
}
return 0;
} }
static inline bool allocator_thread_running(struct bch_dev *ca) static bool allocator_thread_running(struct bch_dev *ca)
{ {
return ca->mi.state == BCH_MEMBER_STATE_rw && unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags); test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
? ALLOCATOR_running
: ALLOCATOR_stopped;
alloc_thread_set_state(ca, state);
return state == ALLOCATOR_running;
}
static int buckets_available(struct bch_dev *ca, unsigned long gc_count)
{
s64 available = dev_buckets_reclaimable(ca) -
(gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0);
bool ret = available > 0;
alloc_thread_set_state(ca, ret
? ALLOCATOR_running
: ALLOCATOR_blocked);
return ret;
} }
/** /**
@ -1056,62 +948,29 @@ static int bch2_allocator_thread(void *arg)
{ {
struct bch_dev *ca = arg; struct bch_dev *ca = arg;
struct bch_fs *c = ca->fs; struct bch_fs *c = ca->fs;
unsigned long gc_count = c->gc_count;
size_t nr; size_t nr;
int ret; int ret;
set_freezable(); set_freezable();
while (1) { while (1) {
if (!allocator_thread_running(ca)) { ret = kthread_wait_freezable(allocator_thread_running(ca));
ca->allocator_state = ALLOCATOR_STOPPED;
if (kthread_wait_freezable(allocator_thread_running(ca)))
break;
}
ca->allocator_state = ALLOCATOR_RUNNING;
cond_resched();
if (kthread_should_stop())
break;
pr_debug("discarding %zu invalidated buckets",
fifo_used(&ca->free_inc));
ret = discard_invalidated_buckets(c, ca);
if (ret) if (ret)
goto stop; goto stop;
down_read(&c->gc_lock); while (!ca->alloc_heap.used) {
ret = bch2_invalidate_buckets(c, ca);
if (ret) {
up_read(&c->gc_lock);
goto stop;
}
if (!fifo_empty(&ca->free_inc)) {
up_read(&c->gc_lock);
continue;
}
pr_debug("free_inc now empty");
while (1) {
cond_resched(); cond_resched();
/*
* Find some buckets that we can invalidate, either
* they're completely unused, or only contain clean data
* that's been written back to the backing device or
* another cache tier
*/
pr_debug("scanning for reclaimable buckets"); ret = kthread_wait_freezable(buckets_available(ca, gc_count));
if (ret)
goto stop;
gc_count = c->gc_count;
nr = find_reclaimable_buckets(c, ca); nr = find_reclaimable_buckets(c, ca);
pr_debug("found %zu buckets", nr); trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
ca->inc_gen_really_needs_gc);
trace_alloc_batch(ca, nr, ca->alloc_heap.size);
if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
ca->inc_gen_really_needs_gc) && ca->inc_gen_really_needs_gc) &&
@ -1119,37 +978,24 @@ static int bch2_allocator_thread(void *arg)
atomic_inc(&c->kick_gc); atomic_inc(&c->kick_gc);
wake_up_process(c->gc_thread); wake_up_process(c->gc_thread);
} }
if (nr)
break;
/*
* If we found any buckets, we have to invalidate them
* before we scan for more - but if we didn't find very
* many we may want to wait on more buckets being
* available so we don't spin:
*/
ret = wait_buckets_available(c, ca);
if (ret) {
up_read(&c->gc_lock);
goto stop;
}
} }
up_read(&c->gc_lock); ret = bch2_invalidate_buckets(c, ca);
if (ret)
goto stop;
pr_debug("%zu buckets to invalidate", nr); while (!fifo_empty(&ca->free_inc)) {
u64 b = fifo_peek(&ca->free_inc);
/* discard_one_bucket(c, ca, b);
* alloc_heap is now full of newly-invalidated buckets: next,
* write out the new bucket gens: ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b));
*/ if (ret)
goto stop;
}
} }
stop: stop:
pr_debug("alloc thread stopping (ret %i)", ret); alloc_thread_set_state(ca, ALLOCATOR_stopped);
ca->allocator_state = ALLOCATOR_STOPPED;
closure_wake_up(&c->freelist_wait);
return 0; return 0;
} }
@ -1158,7 +1004,7 @@ stop:
void bch2_recalc_capacity(struct bch_fs *c) void bch2_recalc_capacity(struct bch_fs *c)
{ {
struct bch_dev *ca; struct bch_dev *ca;
u64 capacity = 0, reserved_sectors = 0, gc_reserve, copygc_threshold = 0; u64 capacity = 0, reserved_sectors = 0, gc_reserve;
unsigned bucket_size_max = 0; unsigned bucket_size_max = 0;
unsigned long ra_pages = 0; unsigned long ra_pages = 0;
unsigned i, j; unsigned i, j;
@ -1201,8 +1047,6 @@ void bch2_recalc_capacity(struct bch_fs *c)
dev_reserve *= ca->mi.bucket_size; dev_reserve *= ca->mi.bucket_size;
copygc_threshold += dev_reserve;
capacity += bucket_to_sector(ca, ca->mi.nbuckets - capacity += bucket_to_sector(ca, ca->mi.nbuckets -
ca->mi.first_bucket); ca->mi.first_bucket);
@ -1220,7 +1064,6 @@ void bch2_recalc_capacity(struct bch_fs *c)
reserved_sectors = min(reserved_sectors, capacity); reserved_sectors = min(reserved_sectors, capacity);
c->copygc_threshold = copygc_threshold;
c->capacity = capacity - reserved_sectors; c->capacity = capacity - reserved_sectors;
c->bucket_size_max = bucket_size_max; c->bucket_size_max = bucket_size_max;
@ -1331,7 +1174,7 @@ void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
{ {
if (ca->alloc_thread) if (ca->alloc_thread)
closure_wait_event(&c->freelist_wait, closure_wait_event(&c->freelist_wait,
ca->allocator_state != ALLOCATOR_RUNNING); ca->allocator_state != ALLOCATOR_running);
} }
/* stop allocator thread: */ /* stop allocator thread: */
@ -1385,7 +1228,4 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
void bch2_fs_allocator_background_init(struct bch_fs *c) void bch2_fs_allocator_background_init(struct bch_fs *c)
{ {
spin_lock_init(&c->freelist_lock); spin_lock_init(&c->freelist_lock);
c->pd_controllers_update_seconds = 5;
INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
} }

View File

@ -6,6 +6,8 @@
#include "alloc_types.h" #include "alloc_types.h"
#include "debug.h" #include "debug.h"
extern const char * const bch2_allocator_states[];
struct bkey_alloc_unpacked { struct bkey_alloc_unpacked {
u64 bucket; u64 bucket;
u8 dev; u8 dev;
@ -98,10 +100,8 @@ static inline void bch2_wake_allocator(struct bch_dev *ca)
rcu_read_lock(); rcu_read_lock();
p = rcu_dereference(ca->alloc_thread); p = rcu_dereference(ca->alloc_thread);
if (p) { if (p)
wake_up_process(p); wake_up_process(p);
ca->allocator_state = ALLOCATOR_RUNNING;
}
rcu_read_unlock(); rcu_read_unlock();
} }

View File

@ -1,57 +1,14 @@
// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
/* /*
* Primary bucket allocation code
*
* Copyright 2012 Google, Inc. * Copyright 2012 Google, Inc.
* *
* Allocation in bcache is done in terms of buckets: * Foreground allocator code: allocate buckets from freelist, and allocate in
* * sector granularity from writepoints.
* Each bucket has associated an 8 bit gen; this gen corresponds to the gen in
* btree pointers - they must match for the pointer to be considered valid.
*
* Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
* bucket simply by incrementing its gen.
*
* The gens (along with the priorities; it's really the gens are important but
* the code is named as if it's the priorities) are written in an arbitrary list
* of buckets on disk, with a pointer to them in the journal header.
*
* When we invalidate a bucket, we have to write its new gen to disk and wait
* for that write to complete before we use it - otherwise after a crash we
* could have pointers that appeared to be good but pointed to data that had
* been overwritten.
*
* Since the gens and priorities are all stored contiguously on disk, we can
* batch this up: We fill up the free_inc list with freshly invalidated buckets,
* call prio_write(), and when prio_write() finishes we pull buckets off the
* free_inc list and optionally discard them.
*
* free_inc isn't the only freelist - if it was, we'd often have to sleep while
* priorities and gens were being written before we could allocate. c->free is a
* smaller freelist, and buckets on that list are always ready to be used.
*
* If we've got discards enabled, that happens when a bucket moves from the
* free_inc list to the free list.
*
* It's important to ensure that gens don't wrap around - with respect to
* either the oldest gen in the btree or the gen on disk. This is quite
* difficult to do in practice, but we explicitly guard against it anyways - if
* a bucket is in danger of wrapping around we simply skip invalidating it that
* time around, and we garbage collect or rewrite the priorities sooner than we
* would have otherwise.
* *
* bch2_bucket_alloc() allocates a single bucket from a specific device. * bch2_bucket_alloc() allocates a single bucket from a specific device.
* *
* bch2_bucket_alloc_set() allocates one or more buckets from different devices * bch2_bucket_alloc_set() allocates one or more buckets from different devices
* in a given filesystem. * in a given filesystem.
*
* invalidate_buckets() drives all the processes described above. It's called
* from bch2_bucket_alloc() and a few other places that need to make sure free
* buckets are ready.
*
* invalidate_buckets_(lru|fifo)() find buckets that are available to be
* invalidated, and then invalidate them and stick them on the free_inc list -
* in either lru or fifo order.
*/ */
#include "bcachefs.h" #include "bcachefs.h"
@ -98,8 +55,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
percpu_down_read(&c->mark_lock); percpu_down_read(&c->mark_lock);
spin_lock(&ob->lock); spin_lock(&ob->lock);
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), false);
false, gc_pos_alloc(c, ob), 0);
ob->valid = false; ob->valid = false;
ob->type = 0; ob->type = 0;
@ -109,7 +65,9 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
spin_lock(&c->freelist_lock); spin_lock(&c->freelist_lock);
ob->freelist = c->open_buckets_freelist; ob->freelist = c->open_buckets_freelist;
c->open_buckets_freelist = ob - c->open_buckets; c->open_buckets_freelist = ob - c->open_buckets;
c->open_buckets_nr_free++; c->open_buckets_nr_free++;
ca->nr_open_buckets--;
spin_unlock(&c->freelist_lock); spin_unlock(&c->freelist_lock);
closure_wake_up(&c->open_buckets_wait); closure_wake_up(&c->open_buckets_wait);
@ -316,6 +274,7 @@ out:
c->blocked_allocate = 0; c->blocked_allocate = 0;
} }
ca->nr_open_buckets++;
spin_unlock(&c->freelist_lock); spin_unlock(&c->freelist_lock);
bch2_wake_allocator(ca); bch2_wake_allocator(ca);
@ -680,11 +639,14 @@ static struct write_point *__writepoint_find(struct hlist_head *head,
{ {
struct write_point *wp; struct write_point *wp;
rcu_read_lock();
hlist_for_each_entry_rcu(wp, head, node) hlist_for_each_entry_rcu(wp, head, node)
if (wp->write_point == write_point) if (wp->write_point == write_point)
return wp; goto out;
wp = NULL;
return NULL; out:
rcu_read_unlock();
return wp;
} }
static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)

View File

@@ -10,6 +10,18 @@
 
 struct ec_bucket_buf;
 
+#define ALLOC_THREAD_STATES()		\
+	x(stopped)			\
+	x(running)			\
+	x(blocked)			\
+	x(blocked_full)
+
+enum allocator_states {
+#define x(n)	ALLOCATOR_##n,
+	ALLOC_THREAD_STATES()
+#undef x
+};
+
 enum alloc_reserve {
 	RESERVE_BTREE_MOVINGGC	= -2,
 	RESERVE_BTREE		= -1,
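
Note: the ALLOC_THREAD_STATES() x-macro above is expanded twice by this change: here to generate enum allocator_states, and in alloc_background.c to generate the bch2_allocator_states[] name table. A minimal standalone sketch of that pattern, using the identifiers from the diff (the main() wrapper and printf are illustrative only):

```c
#include <stdio.h>

#define ALLOC_THREAD_STATES()	\
	x(stopped)		\
	x(running)		\
	x(blocked)		\
	x(blocked_full)

/* First expansion: one enumerator per state, e.g. ALLOCATOR_running */
enum allocator_states {
#define x(n)	ALLOCATOR_##n,
	ALLOC_THREAD_STATES()
#undef x
};

/* Second expansion: parallel array of state names, indexed by the enum */
const char * const bch2_allocator_states[] = {
#define x(n)	#n,
	ALLOC_THREAD_STATES()
#undef x
	NULL
};

int main(void)
{
	printf("%s\n", bch2_allocator_states[ALLOCATOR_blocked]);	/* prints "blocked" */
	return 0;
}
```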

View File

@ -379,7 +379,6 @@ enum gc_phase {
GC_PHASE_BTREE_reflink, GC_PHASE_BTREE_reflink,
GC_PHASE_PENDING_DELETE, GC_PHASE_PENDING_DELETE,
GC_PHASE_ALLOC,
}; };
struct gc_pos { struct gc_pos {
@ -447,6 +446,7 @@ struct bch_dev {
*/ */
alloc_fifo free[RESERVE_NR]; alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc; alloc_fifo free_inc;
unsigned nr_open_buckets;
open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT];
open_bucket_idx_t open_buckets_partial_nr; open_bucket_idx_t open_buckets_partial_nr;
@ -456,16 +456,7 @@ struct bch_dev {
size_t inc_gen_needs_gc; size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc; size_t inc_gen_really_needs_gc;
/* enum allocator_states allocator_state;
* XXX: this should be an enum for allocator state, so as to include
* error state
*/
enum {
ALLOCATOR_STOPPED,
ALLOCATOR_RUNNING,
ALLOCATOR_BLOCKED,
ALLOCATOR_BLOCKED_FULL,
} allocator_state;
alloc_heap alloc_heap; alloc_heap alloc_heap;
@ -664,9 +655,6 @@ struct bch_fs {
struct workqueue_struct *copygc_wq; struct workqueue_struct *copygc_wq;
/* ALLOCATION */ /* ALLOCATION */
struct delayed_work pd_controllers_update;
unsigned pd_controllers_update_seconds;
struct bch_devs_mask rw_devs[BCH_DATA_NR]; struct bch_devs_mask rw_devs[BCH_DATA_NR];
u64 capacity; /* sectors */ u64 capacity; /* sectors */
@ -726,6 +714,9 @@ struct bch_fs {
atomic_t kick_gc; atomic_t kick_gc;
unsigned long gc_count; unsigned long gc_count;
enum btree_id gc_gens_btree;
struct bpos gc_gens_pos;
/* /*
* Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
* has been marked by GC. * has been marked by GC.
@ -772,9 +763,8 @@ struct bch_fs {
/* COPYGC */ /* COPYGC */
struct task_struct *copygc_thread; struct task_struct *copygc_thread;
copygc_heap copygc_heap; copygc_heap copygc_heap;
struct bch_pd_controller copygc_pd;
struct write_point copygc_write_point; struct write_point copygc_write_point;
u64 copygc_threshold; s64 copygc_wait;
/* STRIPES: */ /* STRIPES: */
GENRADIX(struct stripe) stripes[2]; GENRADIX(struct stripe) stripes[2];

View File

@ -98,12 +98,50 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k)
return bch2_bkey_ops[k.k->type].key_invalid(c, k); return bch2_bkey_ops[k.k->type].key_invalid(c, k);
} }
static unsigned bch2_key_types_allowed[] = {
[BKEY_TYPE_extents] =
(1U << KEY_TYPE_discard)|
(1U << KEY_TYPE_error)|
(1U << KEY_TYPE_extent)|
(1U << KEY_TYPE_reservation)|
(1U << KEY_TYPE_reflink_p)|
(1U << KEY_TYPE_inline_data),
[BKEY_TYPE_inodes] =
(1U << KEY_TYPE_inode)|
(1U << KEY_TYPE_inode_generation),
[BKEY_TYPE_dirents] =
(1U << KEY_TYPE_hash_whiteout)|
(1U << KEY_TYPE_dirent),
[BKEY_TYPE_xattrs] =
(1U << KEY_TYPE_hash_whiteout)|
(1U << KEY_TYPE_xattr),
[BKEY_TYPE_alloc] =
(1U << KEY_TYPE_alloc)|
(1U << KEY_TYPE_alloc_v2),
[BKEY_TYPE_quotas] =
(1U << KEY_TYPE_quota),
[BKEY_TYPE_stripes] =
(1U << KEY_TYPE_stripe),
[BKEY_TYPE_reflink] =
(1U << KEY_TYPE_reflink_v)|
(1U << KEY_TYPE_indirect_inline_data),
[BKEY_TYPE_btree] =
(1U << KEY_TYPE_btree_ptr)|
(1U << KEY_TYPE_btree_ptr_v2),
};
const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
enum btree_node_type type) enum btree_node_type type)
{ {
unsigned key_types_allowed = (1U << KEY_TYPE_deleted)|
bch2_key_types_allowed[type] ;
if (k.k->u64s < BKEY_U64s) if (k.k->u64s < BKEY_U64s)
return "u64s too small"; return "u64s too small";
if (!(key_types_allowed & (1U << k.k->type)))
return "invalid key type for this btree";
if (type == BKEY_TYPE_btree && if (type == BKEY_TYPE_btree &&
bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
return "value too big"; return "value too big";

View File

@@ -250,39 +250,54 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
 		bkey_reassemble(new, *k);
 
-		bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
-			struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-			struct bucket *g = PTR_BUCKET(ca, ptr, true);
-
-			(ptr->cached &&
-			 (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) ||
-			(!ptr->cached &&
-			 gen_cmp(ptr->gen, g->mark.gen) < 0);
-		}));
-again:
-		ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
-		bkey_extent_entry_for_each(ptrs, entry) {
-			if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
-				struct stripe *m = genradix_ptr(&c->stripes[true],
-								entry->stripe_ptr.idx);
-				union bch_extent_entry *next_ptr;
-
-				bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
-					if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
-						goto found;
-				next_ptr = NULL;
-found:
-				if (!next_ptr) {
-					bch_err(c, "aieee, found stripe ptr with no data ptr");
-					continue;
-				}
-
-				if (!m || !m->alive ||
-				    !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
-							       &next_ptr->ptr,
-							       m->sectors)) {
-					bch2_bkey_extent_entry_drop(new, entry);
-					goto again;
-				}
-			}
-		}
+		if (level) {
+			/*
+			 * We don't want to drop btree node pointers - if the
+			 * btree node isn't there anymore, the read path will
+			 * sort it out:
+			 */
+			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+			bkey_for_each_ptr(ptrs, ptr) {
+				struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+				struct bucket *g = PTR_BUCKET(ca, ptr, true);
+
+				ptr->gen = g->mark.gen;
+			}
+		} else {
+			bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
+				struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+				struct bucket *g = PTR_BUCKET(ca, ptr, true);
+
+				(ptr->cached &&
+				 (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) ||
+				(!ptr->cached &&
+				 gen_cmp(ptr->gen, g->mark.gen) < 0);
+			}));
+again:
+			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+			bkey_extent_entry_for_each(ptrs, entry) {
+				if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
+					struct stripe *m = genradix_ptr(&c->stripes[true],
+									entry->stripe_ptr.idx);
+					union bch_extent_entry *next_ptr;
+
+					bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
+						if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
+							goto found;
+					next_ptr = NULL;
+found:
+					if (!next_ptr) {
+						bch_err(c, "aieee, found stripe ptr with no data ptr");
+						continue;
+					}
+
+					if (!m || !m->alive ||
+					    !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
+								       &next_ptr->ptr,
+								       m->sectors)) {
+						bch2_bkey_extent_entry_drop(new, entry);
+						goto again;
+					}
+				}
+			}
+		}
@@ -301,10 +316,10 @@ fsck_err:
 
 static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
 			    unsigned level, bool is_root,
-			    struct bkey_s_c k,
+			    struct bkey_s_c *k,
 			    u8 *max_stale, bool initial)
 {
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	struct bkey_ptrs_c ptrs;
 	const struct bch_extent_ptr *ptr;
 	unsigned flags =
 		BTREE_TRIGGER_GC|
@@ -313,28 +328,29 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
 
 	if (initial) {
 		BUG_ON(bch2_journal_seq_verify &&
-		       k.k->version.lo > journal_cur_seq(&c->journal));
+		       k->k->version.lo > journal_cur_seq(&c->journal));
 
-		if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c,
+		if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
 				"key version number higher than recorded: %llu > %llu",
-				k.k->version.lo,
+				k->k->version.lo,
 				atomic64_read(&c->key_version)))
-			atomic64_set(&c->key_version, k.k->version.lo);
+			atomic64_set(&c->key_version, k->k->version.lo);
 
 		if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-		    fsck_err_on(!bch2_bkey_replicas_marked(c, k), c,
+		    fsck_err_on(!bch2_bkey_replicas_marked(c, *k), c,
 				"superblock not marked as containing replicas (type %u)",
-				k.k->type)) {
-			ret = bch2_mark_bkey_replicas(c, k);
+				k->k->type)) {
+			ret = bch2_mark_bkey_replicas(c, *k);
 			if (ret) {
 				bch_err(c, "error marking bkey replicas: %i", ret);
 				goto err;
 			}
 		}
 
-		ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, &k);
+		ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k);
 	}
 
+	ptrs = bch2_bkey_ptrs_c(*k);
 	bkey_for_each_ptr(ptrs, ptr) {
 		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
 		struct bucket *g = PTR_BUCKET(ca, ptr, true);
@@ -345,7 +361,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
 		*max_stale = max(*max_stale, ptr_stale(ca, ptr));
 	}
 
-	bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags);
+	bch2_mark_key(c, *k, 0, k->k->size, NULL, 0, flags);
 fsck_err:
 err:
 	if (ret)
@ -374,7 +390,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
k, max_stale, initial); &k, max_stale, initial);
if (ret) if (ret)
break; break;
@ -396,12 +412,13 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
} }
static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
bool initial) bool initial, bool metadata_only)
{ {
struct btree_trans trans; struct btree_trans trans;
struct btree_iter *iter; struct btree_iter *iter;
struct btree *b; struct btree *b;
unsigned depth = bch2_expensive_debug_checks ? 0 unsigned depth = metadata_only ? 1
: bch2_expensive_debug_checks ? 0
: !btree_node_type_needs_gc(btree_id) ? 1 : !btree_node_type_needs_gc(btree_id) ? 1
: 0; : 0;
u8 max_stale = 0; u8 max_stale = 0;
@ -445,10 +462,12 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
mutex_lock(&c->btree_root_lock); mutex_lock(&c->btree_root_lock);
b = c->btree_roots[btree_id].b; b = c->btree_roots[btree_id].b;
if (!btree_node_fake(b)) if (!btree_node_fake(b)) {
struct bkey_s_c k = bkey_i_to_s_c(&b->key);
ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
bkey_i_to_s_c(&b->key), &k, &max_stale, initial);
&max_stale, initial); }
gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
mutex_unlock(&c->btree_root_lock); mutex_unlock(&c->btree_root_lock);
@ -474,7 +493,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
k, &max_stale, true); &k, &max_stale, true);
if (ret) { if (ret) {
bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret);
break; break;
@ -544,11 +563,13 @@ fsck_err:
} }
static int bch2_gc_btree_init(struct bch_fs *c, static int bch2_gc_btree_init(struct bch_fs *c,
enum btree_id btree_id) enum btree_id btree_id,
bool metadata_only)
{ {
struct btree *b; struct btree *b;
unsigned target_depth = bch2_expensive_debug_checks ? 0 unsigned target_depth = metadata_only ? 1
: !btree_node_type_needs_gc(btree_id) ? 1 : bch2_expensive_debug_checks ? 0
: !btree_node_type_needs_gc(btree_id) ? 1
: 0; : 0;
u8 max_stale = 0; u8 max_stale = 0;
char buf[100]; char buf[100];
@ -575,10 +596,12 @@ static int bch2_gc_btree_init(struct bch_fs *c,
if (b->c.level >= target_depth) if (b->c.level >= target_depth)
ret = bch2_gc_btree_init_recurse(c, b, target_depth); ret = bch2_gc_btree_init_recurse(c, b, target_depth);
if (!ret) if (!ret) {
struct bkey_s_c k = bkey_i_to_s_c(&b->key);
ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
bkey_i_to_s_c(&b->key), &k, &max_stale, true);
&max_stale, true); }
fsck_err: fsck_err:
six_unlock_read(&b->c.lock); six_unlock_read(&b->c.lock);
@ -593,7 +616,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
(int) btree_id_to_gc_phase(r); (int) btree_id_to_gc_phase(r);
} }
static int bch2_gc_btrees(struct bch_fs *c, bool initial) static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
{ {
enum btree_id ids[BTREE_ID_NR]; enum btree_id ids[BTREE_ID_NR];
unsigned i; unsigned i;
@ -605,8 +628,8 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial)
for (i = 0; i < BTREE_ID_NR; i++) { for (i = 0; i < BTREE_ID_NR; i++) {
enum btree_id id = ids[i]; enum btree_id id = ids[i];
int ret = initial int ret = initial
? bch2_gc_btree_init(c, id) ? bch2_gc_btree_init(c, id, metadata_only)
: bch2_gc_btree(c, id, initial); : bch2_gc_btree(c, id, initial, metadata_only);
if (ret) { if (ret) {
bch_err(c, "%s: ret %i", __func__, ret); bch_err(c, "%s: ret %i", __func__, ret);
return ret; return ret;
@ -707,52 +730,6 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
} }
#endif #endif
static void bch2_mark_allocator_buckets(struct bch_fs *c)
{
struct bch_dev *ca;
struct open_bucket *ob;
size_t i, j, iter;
unsigned ci;
percpu_down_read(&c->mark_lock);
spin_lock(&c->freelist_lock);
gc_pos_set(c, gc_pos_alloc(c, NULL));
for_each_member_device(ca, c, ci) {
fifo_for_each_entry(i, &ca->free_inc, iter)
bch2_mark_alloc_bucket(c, ca, i, true,
gc_pos_alloc(c, NULL),
BTREE_TRIGGER_GC);
for (j = 0; j < RESERVE_NR; j++)
fifo_for_each_entry(i, &ca->free[j], iter)
bch2_mark_alloc_bucket(c, ca, i, true,
gc_pos_alloc(c, NULL),
BTREE_TRIGGER_GC);
}
spin_unlock(&c->freelist_lock);
for (ob = c->open_buckets;
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
ob++) {
spin_lock(&ob->lock);
if (ob->valid) {
gc_pos_set(c, gc_pos_alloc(c, ob));
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
gc_pos_alloc(c, ob),
BTREE_TRIGGER_GC);
}
spin_unlock(&ob->lock);
}
percpu_up_read(&c->mark_lock);
}
static void bch2_gc_free(struct bch_fs *c) static void bch2_gc_free(struct bch_fs *c)
{ {
struct bch_dev *ca; struct bch_dev *ca;
@ -775,10 +752,10 @@ static void bch2_gc_free(struct bch_fs *c)
} }
static int bch2_gc_done(struct bch_fs *c, static int bch2_gc_done(struct bch_fs *c,
bool initial) bool initial, bool metadata_only)
{ {
struct bch_dev *ca; struct bch_dev *ca;
bool verify = (!initial || bool verify = !metadata_only && (!initial ||
(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
unsigned i, dev; unsigned i, dev;
int ret = 0; int ret = 0;
@ -805,7 +782,7 @@ static int bch2_gc_done(struct bch_fs *c,
if (dst->b[b].mark._f != src->b[b].mark._f) { \ if (dst->b[b].mark._f != src->b[b].mark._f) { \
if (verify) \ if (verify) \
fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \
": got %u, should be %u", i, b, \ ": got %u, should be %u", dev, b, \
dst->b[b].mark.gen, \ dst->b[b].mark.gen, \
bch2_data_types[dst->b[b].mark.data_type],\ bch2_data_types[dst->b[b].mark.data_type],\
dst->b[b].mark._f, src->b[b].mark._f); \ dst->b[b].mark._f, src->b[b].mark._f); \
@ -813,11 +790,11 @@ static int bch2_gc_done(struct bch_fs *c,
set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
} }
#define copy_dev_field(_f, _msg, ...) \ #define copy_dev_field(_f, _msg, ...) \
copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
#define copy_fs_field(_f, _msg, ...) \ #define copy_fs_field(_f, _msg, ...) \
copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
{ if (!metadata_only) {
struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0); struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0);
struct stripe *dst, *src; struct stripe *dst, *src;
@ -857,7 +834,6 @@ static int bch2_gc_done(struct bch_fs *c,
for (b = 0; b < src->nbuckets; b++) { for (b = 0; b < src->nbuckets; b++) {
copy_bucket_field(gen); copy_bucket_field(gen);
copy_bucket_field(data_type); copy_bucket_field(data_type);
copy_bucket_field(owned_by_allocator);
copy_bucket_field(stripe); copy_bucket_field(stripe);
copy_bucket_field(dirty_sectors); copy_bucket_field(dirty_sectors);
copy_bucket_field(cached_sectors); copy_bucket_field(cached_sectors);
@ -890,20 +866,28 @@ static int bch2_gc_done(struct bch_fs *c,
copy_fs_field(hidden, "hidden"); copy_fs_field(hidden, "hidden");
copy_fs_field(btree, "btree"); copy_fs_field(btree, "btree");
copy_fs_field(data, "data");
copy_fs_field(cached, "cached");
copy_fs_field(reserved, "reserved");
copy_fs_field(nr_inodes,"nr_inodes");
for (i = 0; i < BCH_REPLICAS_MAX; i++) if (!metadata_only) {
copy_fs_field(persistent_reserved[i], copy_fs_field(data, "data");
"persistent_reserved[%i]", i); copy_fs_field(cached, "cached");
copy_fs_field(reserved, "reserved");
copy_fs_field(nr_inodes,"nr_inodes");
for (i = 0; i < BCH_REPLICAS_MAX; i++)
copy_fs_field(persistent_reserved[i],
"persistent_reserved[%i]", i);
}
for (i = 0; i < c->replicas.nr; i++) { for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e = struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i); cpu_replicas_entry(&c->replicas, i);
char buf[80]; char buf[80];
if (metadata_only &&
(e->data_type == BCH_DATA_user ||
e->data_type == BCH_DATA_cached))
continue;
bch2_replicas_entry_to_text(&PBUF(buf), e); bch2_replicas_entry_to_text(&PBUF(buf), e);
copy_fs_field(replicas[i], "%s", buf); copy_fs_field(replicas[i], "%s", buf);
@ -921,7 +905,8 @@ fsck_err:
return ret; return ret;
} }
static int bch2_gc_start(struct bch_fs *c) static int bch2_gc_start(struct bch_fs *c,
bool metadata_only)
{ {
struct bch_dev *ca; struct bch_dev *ca;
unsigned i; unsigned i;
@ -985,6 +970,11 @@ static int bch2_gc_start(struct bch_fs *c)
d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen;
d->gen_valid = s->gen_valid; d->gen_valid = s->gen_valid;
if (metadata_only &&
(s->mark.data_type == BCH_DATA_user ||
s->mark.data_type == BCH_DATA_cached))
d->_mark = s->mark;
} }
}; };
@ -1011,7 +1001,7 @@ static int bch2_gc_start(struct bch_fs *c)
* move around - if references move backwards in the ordering GC * move around - if references move backwards in the ordering GC
* uses, GC could skip past them * uses, GC could skip past them
*/ */
int bch2_gc(struct bch_fs *c, bool initial) int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
{ {
struct bch_dev *ca; struct bch_dev *ca;
u64 start_time = local_clock(); u64 start_time = local_clock();
@ -1027,21 +1017,19 @@ int bch2_gc(struct bch_fs *c, bool initial)
closure_wait_event(&c->btree_interior_update_wait, closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c)); !bch2_btree_interior_updates_nr_pending(c));
again: again:
ret = bch2_gc_start(c); ret = bch2_gc_start(c, metadata_only);
if (ret) if (ret)
goto out; goto out;
bch2_mark_superblocks(c); bch2_mark_superblocks(c);
ret = bch2_gc_btrees(c, initial); ret = bch2_gc_btrees(c, initial, metadata_only);
if (ret) if (ret)
goto out; goto out;
#if 0 #if 0
bch2_mark_pending_btree_node_frees(c); bch2_mark_pending_btree_node_frees(c);
#endif #endif
bch2_mark_allocator_buckets(c);
c->gc_count++; c->gc_count++;
if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
@ -1071,7 +1059,7 @@ out:
bch2_journal_block(&c->journal); bch2_journal_block(&c->journal);
percpu_down_write(&c->mark_lock); percpu_down_write(&c->mark_lock);
ret = bch2_gc_done(c, initial); ret = bch2_gc_done(c, initial, metadata_only);
bch2_journal_unblock(&c->journal); bch2_journal_unblock(&c->journal);
} else { } else {
@ -1142,7 +1130,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
struct btree_iter *iter; struct btree_iter *iter;
struct bkey_s_c k; struct bkey_s_c k;
struct bkey_buf sk; struct bkey_buf sk;
int ret = 0; int ret = 0, commit_err = 0;
bch2_bkey_buf_init(&sk); bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, 0, 0); bch2_trans_init(&trans, c, 0, 0);
@ -1154,18 +1142,20 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
while ((k = bch2_btree_iter_peek(iter)).k && while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k))) { !(ret = bkey_err(k))) {
if (gc_btree_gens_key(c, k)) { c->gc_gens_pos = iter->pos;
if (gc_btree_gens_key(c, k) && !commit_err) {
bch2_bkey_buf_reassemble(&sk, c, k); bch2_bkey_buf_reassemble(&sk, c, k);
bch2_extent_normalize(c, bkey_i_to_s(sk.k)); bch2_extent_normalize(c, bkey_i_to_s(sk.k));
bch2_trans_update(&trans, iter, sk.k, 0); bch2_trans_update(&trans, iter, sk.k, 0);
ret = bch2_trans_commit(&trans, NULL, NULL, commit_err = bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL); BTREE_INSERT_NOWAIT|
if (ret == -EINTR) BTREE_INSERT_NOFAIL);
if (commit_err == -EINTR) {
commit_err = 0;
continue; continue;
if (ret) {
break;
} }
} }
@ -1205,6 +1195,8 @@ int bch2_gc_gens(struct bch_fs *c)
for (i = 0; i < BTREE_ID_NR; i++) for (i = 0; i < BTREE_ID_NR; i++)
if ((1 << i) & BTREE_ID_HAS_PTRS) { if ((1 << i) & BTREE_ID_HAS_PTRS) {
c->gc_gens_btree = i;
c->gc_gens_pos = POS_MIN;
ret = bch2_gc_btree_gens(c, i); ret = bch2_gc_btree_gens(c, i);
if (ret) { if (ret) {
bch_err(c, "error recalculating oldest_gen: %i", ret); bch_err(c, "error recalculating oldest_gen: %i", ret);
@ -1221,352 +1213,15 @@ int bch2_gc_gens(struct bch_fs *c)
up_read(&ca->bucket_lock); up_read(&ca->bucket_lock);
} }
c->gc_gens_btree = 0;
c->gc_gens_pos = POS_MIN;
c->gc_count++; c->gc_count++;
err: err:
up_read(&c->gc_lock); up_read(&c->gc_lock);
return ret; return ret;
} }
/* Btree coalescing */
static void recalc_packed_keys(struct btree *b)
{
struct bset *i = btree_bset_first(b);
struct bkey_packed *k;
memset(&b->nr, 0, sizeof(b->nr));
BUG_ON(b->nsets != 1);
vstruct_for_each(i, k)
btree_keys_account_key_add(&b->nr, 0, k);
}
static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
struct btree *old_nodes[GC_MERGE_NODES])
{
struct btree *parent = btree_node_parent(iter, old_nodes[0]);
unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
unsigned blocks = btree_blocks(c) * 2 / 3;
struct btree *new_nodes[GC_MERGE_NODES];
struct btree_update *as;
struct keylist keylist;
struct bkey_format_state format_state;
struct bkey_format new_format;
memset(new_nodes, 0, sizeof(new_nodes));
bch2_keylist_init(&keylist, NULL);
/* Count keys that are not deleted */
for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
u64s += old_nodes[i]->nr.live_u64s;
nr_old_nodes = nr_new_nodes = i;
/* Check if all keys in @old_nodes could fit in one fewer node */
if (nr_old_nodes <= 1 ||
__vstruct_blocks(struct btree_node, c->block_bits,
DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks)
return;
/* Find a format that all keys in @old_nodes can pack into */
bch2_bkey_format_init(&format_state);
/*
* XXX: this won't correctly take it account the new min/max keys:
*/
for (i = 0; i < nr_old_nodes; i++)
__bch2_btree_calc_format(&format_state, old_nodes[i]);
new_format = bch2_bkey_format_done(&format_state);
/* Check if repacking would make any nodes too big to fit */
for (i = 0; i < nr_old_nodes; i++)
if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) {
trace_btree_gc_coalesce_fail(c,
BTREE_GC_COALESCE_FAIL_FORMAT_FITS);
return;
}
if (bch2_keylist_realloc(&keylist, NULL, 0,
BKEY_BTREE_PTR_U64s_MAX * nr_old_nodes)) {
trace_btree_gc_coalesce_fail(c,
BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
return;
}
as = bch2_btree_update_start(iter, old_nodes[0]->c.level,
btree_update_reserve_required(c, parent) + nr_old_nodes,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE);
if (IS_ERR(as)) {
trace_btree_gc_coalesce_fail(c,
BTREE_GC_COALESCE_FAIL_RESERVE_GET);
bch2_keylist_free(&keylist, NULL);
return;
}
trace_btree_gc_coalesce(c, old_nodes[0]);
for (i = 0; i < nr_old_nodes; i++)
bch2_btree_interior_update_will_free_node(as, old_nodes[i]);
/* Repack everything with @new_format and sort down to one bset */
for (i = 0; i < nr_old_nodes; i++)
new_nodes[i] =
__bch2_btree_node_alloc_replacement(as, old_nodes[i],
new_format);
/*
* Conceptually we concatenate the nodes together and slice them
* up at different boundaries.
*/
for (i = nr_new_nodes - 1; i > 0; --i) {
struct btree *n1 = new_nodes[i];
struct btree *n2 = new_nodes[i - 1];
struct bset *s1 = btree_bset_first(n1);
struct bset *s2 = btree_bset_first(n2);
struct bkey_packed *k, *last = NULL;
/* Calculate how many keys from @n2 we could fit inside @n1 */
u64s = 0;
for (k = s2->start;
k < vstruct_last(s2) &&
vstruct_blocks_plus(n1->data, c->block_bits,
u64s + k->u64s) <= blocks;
k = bkey_next(k)) {
last = k;
u64s += k->u64s;
}
if (u64s == le16_to_cpu(s2->u64s)) {
/* n2 fits entirely in n1 */
n1->key.k.p = n1->data->max_key = n2->data->max_key;
memcpy_u64s(vstruct_last(s1),
s2->start,
le16_to_cpu(s2->u64s));
le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s));
set_btree_bset_end(n1, n1->set);
six_unlock_write(&n2->c.lock);
bch2_btree_node_free_never_inserted(c, n2);
six_unlock_intent(&n2->c.lock);
memmove(new_nodes + i - 1,
new_nodes + i,
sizeof(new_nodes[0]) * (nr_new_nodes - i));
new_nodes[--nr_new_nodes] = NULL;
} else if (u64s) {
/* move part of n2 into n1 */
n1->key.k.p = n1->data->max_key =
bkey_unpack_pos(n1, last);
n2->data->min_key = bpos_successor(n1->data->max_key);
memcpy_u64s(vstruct_last(s1),
s2->start, u64s);
le16_add_cpu(&s1->u64s, u64s);
memmove(s2->start,
vstruct_idx(s2, u64s),
(le16_to_cpu(s2->u64s) - u64s) * sizeof(u64));
s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s);
set_btree_bset_end(n1, n1->set);
set_btree_bset_end(n2, n2->set);
}
}
for (i = 0; i < nr_new_nodes; i++) {
struct btree *n = new_nodes[i];
recalc_packed_keys(n);
btree_node_reset_sib_u64s(n);
bch2_btree_build_aux_trees(n);
bch2_btree_update_add_new_node(as, n);
six_unlock_write(&n->c.lock);
bch2_btree_node_write(c, n, SIX_LOCK_intent);
}
/*
* The keys for the old nodes get deleted. We don't want to insert keys
* that compare equal to the keys for the new nodes we'll also be
* inserting - we can't because keys on a keylist must be strictly
* greater than the previous keys, and we also don't need to since the
* key for the new node will serve the same purpose (overwriting the key
* for the old node).
*/
for (i = 0; i < nr_old_nodes; i++) {
struct bkey_i delete;
unsigned j;
for (j = 0; j < nr_new_nodes; j++)
if (!bpos_cmp(old_nodes[i]->key.k.p,
new_nodes[j]->key.k.p))
goto next;
bkey_init(&delete.k);
delete.k.p = old_nodes[i]->key.k.p;
bch2_keylist_add_in_order(&keylist, &delete);
next:
i = i;
}
/*
* Keys for the new nodes get inserted: bch2_btree_insert_keys() only
* does the lookup once and thus expects the keys to be in sorted order
* so we have to make sure the new keys are correctly ordered with
* respect to the deleted keys added in the previous loop
*/
for (i = 0; i < nr_new_nodes; i++)
bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key);
/* Insert the newly coalesced nodes */
bch2_btree_insert_node(as, parent, iter, &keylist, 0);
BUG_ON(!bch2_keylist_empty(&keylist));
BUG_ON(iter->l[old_nodes[0]->c.level].b != old_nodes[0]);
bch2_btree_iter_node_replace(iter, new_nodes[0]);
for (i = 0; i < nr_new_nodes; i++)
bch2_btree_update_get_open_buckets(as, new_nodes[i]);
/* Free the old nodes and update our sliding window */
for (i = 0; i < nr_old_nodes; i++) {
bch2_btree_node_free_inmem(c, old_nodes[i], iter);
/*
* the index update might have triggered a split, in which case
* the nodes we coalesced - the new nodes we just created -
* might not be sibling nodes anymore - don't add them to the
* sliding window (except the first):
*/
if (!i) {
old_nodes[i] = new_nodes[i];
} else {
old_nodes[i] = NULL;
}
}
for (i = 0; i < nr_new_nodes; i++)
six_unlock_intent(&new_nodes[i]->c.lock);
bch2_btree_update_done(as);
bch2_keylist_free(&keylist, NULL);
}
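The "concatenate the nodes together and slice them up at different boundaries" step above reduces to counting how many keys from the right-hand node still fit into the left-hand node under the block budget. A hedged sketch of that accounting; u64s_to_blocks() is a simplified stand-in for vstruct_blocks_plus() and ignores the btree node header:

static unsigned u64s_to_blocks(unsigned u64s, unsigned block_bits)
{
	unsigned block_u64s = (1U << block_bits) / sizeof(u64);

	return DIV_ROUND_UP(u64s, block_u64s);
}

/*
 * How many of the right-hand node's keys (sizes in u64s) can be appended
 * to a left-hand node already holding @left_u64s, without spilling past
 * @max_blocks?
 */
static unsigned keys_that_fit(const unsigned *key_u64s, unsigned nr_keys,
			      unsigned left_u64s, unsigned block_bits,
			      unsigned max_blocks)
{
	unsigned i, u64s = left_u64s;

	for (i = 0; i < nr_keys; i++) {
		if (u64s_to_blocks(u64s + key_u64s[i], block_bits) > max_blocks)
			break;
		u64s += key_u64s[i];
	}

	return i;
}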
static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
{
struct btree_trans trans;
struct btree_iter *iter;
struct btree *b;
bool kthread = (current->flags & PF_KTHREAD) != 0;
unsigned i;
int ret = 0;
/* Sliding window of adjacent btree nodes */
struct btree *merge[GC_MERGE_NODES];
u32 lock_seq[GC_MERGE_NODES];
bch2_trans_init(&trans, c, 0, 0);
/*
* XXX: We don't have a good way of positively matching on sibling nodes
* that have the same parent - this code works by handling the cases
* where they might not have the same parent, and is thus fragile. Ugh.
*
* Perhaps redo this to use multiple linked iterators?
*/
memset(merge, 0, sizeof(merge));
__for_each_btree_node(&trans, iter, btree_id, POS_MIN,
BTREE_MAX_DEPTH, 0,
BTREE_ITER_PREFETCH, b) {
memmove(merge + 1, merge,
sizeof(merge) - sizeof(merge[0]));
memmove(lock_seq + 1, lock_seq,
sizeof(lock_seq) - sizeof(lock_seq[0]));
merge[0] = b;
for (i = 1; i < GC_MERGE_NODES; i++) {
if (!merge[i] ||
!six_relock_intent(&merge[i]->c.lock, lock_seq[i]))
break;
if (merge[i]->c.level != merge[0]->c.level) {
six_unlock_intent(&merge[i]->c.lock);
break;
}
}
memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0]));
bch2_coalesce_nodes(c, iter, merge);
for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
lock_seq[i] = merge[i]->c.lock.state.seq;
six_unlock_intent(&merge[i]->c.lock);
}
lock_seq[0] = merge[0]->c.lock.state.seq;
if (kthread && kthread_should_stop()) {
ret = -ESHUTDOWN;
break;
}
bch2_trans_cond_resched(&trans);
/*
* If the parent node wasn't relocked, it might have been split
* and the nodes in our sliding window might not have the same
* parent anymore - blow away the sliding window:
*/
if (btree_iter_node(iter, iter->level + 1) &&
!btree_node_intent_locked(iter, iter->level + 1))
memset(merge + 1, 0,
(GC_MERGE_NODES - 1) * sizeof(merge[0]));
}
bch2_trans_iter_put(&trans, iter);
return bch2_trans_exit(&trans) ?: ret;
}
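The merge[]/lock_seq[] handling above is a sliding window over the most recently visited sibling nodes: each new node shifts the window by one slot, and entries that can't be relocked or sit on a different level are dropped. A minimal sketch of the window update, with struct node standing in for struct btree:

#define WINDOW_NR	4	/* plays the role of GC_MERGE_NODES */

struct node;			/* hypothetical stand-in for struct btree */

static void window_push(struct node *window[WINDOW_NR], u32 seq[WINDOW_NR],
			struct node *newest, u32 newest_seq)
{
	/* age existing entries by one slot, newest first: */
	memmove(window + 1, window, (WINDOW_NR - 1) * sizeof(window[0]));
	memmove(seq + 1, seq, (WINDOW_NR - 1) * sizeof(seq[0]));

	window[0] = newest;
	seq[0] = newest_seq;
}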
/**
* bch_coalesce - coalesce adjacent nodes with low occupancy
*/
void bch2_coalesce(struct bch_fs *c)
{
enum btree_id id;
down_read(&c->gc_lock);
trace_gc_coalesce_start(c);
for (id = 0; id < BTREE_ID_NR; id++) {
int ret = c->btree_roots[id].b
? bch2_coalesce_btree(c, id)
: 0;
if (ret) {
if (ret != -ESHUTDOWN)
bch_err(c, "btree coalescing failed: %d", ret);
return;
}
}
trace_gc_coalesce_end(c);
up_read(&c->gc_lock);
}
static int bch2_gc_thread(void *arg) static int bch2_gc_thread(void *arg)
{ {
struct bch_fs *c = arg; struct bch_fs *c = arg;


@ -4,9 +4,7 @@
#include "btree_types.h" #include "btree_types.h"
void bch2_coalesce(struct bch_fs *); int bch2_gc(struct bch_fs *, bool, bool);
int bch2_gc(struct bch_fs *, bool);
int bch2_gc_gens(struct bch_fs *); int bch2_gc_gens(struct bch_fs *);
void bch2_gc_thread_stop(struct bch_fs *); void bch2_gc_thread_stop(struct bch_fs *);
int bch2_gc_thread_start(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *);
@ -92,14 +90,6 @@ static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH); return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH);
} }
static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob)
{
return (struct gc_pos) {
.phase = GC_PHASE_ALLOC,
.pos = POS(ob ? ob - c->open_buckets : 0, 0),
};
}
static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
{ {
unsigned seq; unsigned seq;


@ -1057,14 +1057,17 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
struct btree_read_bio *rb; struct btree_read_bio *rb;
struct bch_dev *ca; struct bch_dev *ca;
struct bio *bio; struct bio *bio;
char buf[200];
int ret; int ret;
btree_pos_to_text(&PBUF(buf), c, b);
trace_btree_read(c, b); trace_btree_read(c, b);
ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
NULL, &pick); NULL, &pick);
if (bch2_fs_fatal_err_on(ret <= 0, c, if (bch2_fs_fatal_err_on(ret <= 0, c,
"btree node read error: no device to read from")) { "btree node read error: no device to read from\n"
" at %s", buf)) {
set_btree_node_read_error(b); set_btree_node_read_error(b);
return; return;
} }
@ -1337,13 +1340,6 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
return ret; return ret;
} }
static void btree_write_submit(struct work_struct *work)
{
struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &wbio->key);
}
void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
{ {
struct btree_write_bio *wbio; struct btree_write_bio *wbio;
@ -1351,6 +1347,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
struct bset *i; struct bset *i;
struct btree_node *bn = NULL; struct btree_node *bn = NULL;
struct btree_node_entry *bne = NULL; struct btree_node_entry *bne = NULL;
struct bkey_buf k;
struct bch_extent_ptr *ptr; struct bch_extent_ptr *ptr;
struct sort_iter sort_iter; struct sort_iter sort_iter;
struct nonce nonce; struct nonce nonce;
@ -1361,6 +1358,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
bool validate_before_checksum = false; bool validate_before_checksum = false;
void *data; void *data;
bch2_bkey_buf_init(&k);
if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
return; return;
@ -1537,7 +1536,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
wbio_init(&wbio->wbio.bio); wbio_init(&wbio->wbio.bio);
wbio->data = data; wbio->data = data;
wbio->bytes = bytes; wbio->bytes = bytes;
wbio->wbio.c = c;
wbio->wbio.used_mempool = used_mempool; wbio->wbio.used_mempool = used_mempool;
wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META; wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META;
wbio->wbio.bio.bi_end_io = btree_node_write_endio; wbio->wbio.bio.bi_end_io = btree_node_write_endio;
@ -1560,9 +1558,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
* just make all btree node writes FUA to keep things sane. * just make all btree node writes FUA to keep things sane.
*/ */
bkey_copy(&wbio->key, &b->key); bch2_bkey_buf_copy(&k, c, &b->key);
bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&wbio->key)), ptr) bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(k.k)), ptr)
ptr->offset += b->written; ptr->offset += b->written;
b->written += sectors_to_write; b->written += sectors_to_write;
@ -1570,8 +1568,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
atomic64_inc(&c->btree_writes_nr); atomic64_inc(&c->btree_writes_nr);
atomic64_add(sectors_to_write, &c->btree_writes_sectors); atomic64_add(sectors_to_write, &c->btree_writes_sectors);
INIT_WORK(&wbio->work, btree_write_submit); /* XXX: submitting IO with btree locks held: */
schedule_work(&wbio->work); bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, k.k);
bch2_bkey_buf_exit(&k, c);
return; return;
err: err:
set_btree_node_noevict(b); set_btree_node_noevict(b);


@ -42,7 +42,6 @@ struct btree_read_bio {
struct btree_write_bio { struct btree_write_bio {
struct work_struct work; struct work_struct work;
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
void *data; void *data;
unsigned bytes; unsigned bytes;
struct bch_write_bio wbio; struct bch_write_bio wbio;


@ -260,13 +260,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
*/ */
if (type == SIX_LOCK_intent && if (type == SIX_LOCK_intent &&
linked->nodes_locked != linked->nodes_intent_locked) { linked->nodes_locked != linked->nodes_intent_locked) {
linked->locks_want = max_t(unsigned, deadlock_iter = linked;
linked->locks_want, reason = 1;
__fls(linked->nodes_locked) + 1);
if (!btree_iter_get_locks(linked, true, false)) {
deadlock_iter = linked;
reason = 1;
}
} }
if (linked->btree_id != iter->btree_id) { if (linked->btree_id != iter->btree_id) {
@ -295,14 +290,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
* we're about to lock, it must have the ancestors locked too: * we're about to lock, it must have the ancestors locked too:
*/ */
if (level > __fls(linked->nodes_locked)) { if (level > __fls(linked->nodes_locked)) {
linked->locks_want = deadlock_iter = linked;
max(level + 1, max_t(unsigned, reason = 5;
linked->locks_want,
iter->locks_want));
if (!btree_iter_get_locks(linked, true, false)) {
deadlock_iter = linked;
reason = 5;
}
} }
/* Must lock btree nodes in key order: */ /* Must lock btree nodes in key order: */
@ -311,27 +300,19 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
btree_iter_type(linked))) <= 0) { btree_iter_type(linked))) <= 0) {
deadlock_iter = linked; deadlock_iter = linked;
reason = 7; reason = 7;
} BUG_ON(trans->in_traverse_all);
/*
* Recheck if this is a node we already have locked - since one
* of the get_locks() calls might've successfully
* upgraded/relocked it:
*/
if (linked->l[level].b == b &&
btree_node_locked_type(linked, level) >= type) {
six_lock_increment(&b->c.lock, type);
return true;
} }
} }
if (unlikely(deadlock_iter)) { if (unlikely(deadlock_iter)) {
trace_trans_restart_would_deadlock(iter->trans->ip, ip, trace_trans_restart_would_deadlock(iter->trans->ip, ip,
reason, trans->in_traverse_all, reason,
deadlock_iter->btree_id, deadlock_iter->btree_id,
btree_iter_type(deadlock_iter), btree_iter_type(deadlock_iter),
&deadlock_iter->real_pos,
iter->btree_id, iter->btree_id,
btree_iter_type(iter)); btree_iter_type(iter),
&pos);
return false; return false;
} }
@ -409,12 +390,27 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
return true; return true;
/* /*
* Ancestor nodes must be locked before child nodes, so set locks_want * XXX: this is ugly - we'd prefer to not be mucking with other
* on iterators that might lock ancestors before us to avoid getting * iterators in the btree_trans here.
* -EINTR later: *
* On failure to upgrade the iterator, setting iter->locks_want and
* calling get_locks() is sufficient to make bch2_btree_iter_traverse()
* get the locks we want on transaction restart.
*
* But if this iterator was a clone, on transaction restart what we did
* to this iterator isn't going to be preserved.
*
* Possibly we could add an iterator field for the parent iterator when
* an iterator is a copy - for now, we'll just upgrade any other
* iterators with the same btree id.
*
* The code below used to be needed to ensure ancestor nodes get locked
* before interior nodes - now that's handled by
* bch2_btree_iter_traverse_all().
*/ */
trans_for_each_iter(iter->trans, linked) trans_for_each_iter(iter->trans, linked)
if (linked != iter && if (linked != iter &&
btree_iter_type(linked) == btree_iter_type(iter) &&
linked->btree_id == iter->btree_id && linked->btree_id == iter->btree_id &&
linked->locks_want < new_locks_want) { linked->locks_want < new_locks_want) {
linked->locks_want = new_locks_want; linked->locks_want = new_locks_want;
@ -1184,7 +1180,8 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct btree_iter *iter; struct btree_iter *iter;
u8 sorted[BTREE_ITER_MAX]; u8 sorted[BTREE_ITER_MAX];
unsigned i, nr_sorted = 0; int i, nr_sorted = 0;
bool relock_fail;
if (trans->in_traverse_all) if (trans->in_traverse_all)
return -EINTR; return -EINTR;
@ -1192,15 +1189,36 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
trans->in_traverse_all = true; trans->in_traverse_all = true;
retry_all: retry_all:
nr_sorted = 0; nr_sorted = 0;
relock_fail = false;
trans_for_each_iter(trans, iter) trans_for_each_iter(trans, iter) {
if (!bch2_btree_iter_relock(iter, true))
relock_fail = true;
sorted[nr_sorted++] = iter->idx; sorted[nr_sorted++] = iter->idx;
}
if (!relock_fail) {
trans->in_traverse_all = false;
return 0;
}
#define btree_iter_cmp_by_idx(_l, _r) \ #define btree_iter_cmp_by_idx(_l, _r) \
btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r]) btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r])
bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
#undef btree_iter_cmp_by_idx #undef btree_iter_cmp_by_idx
for (i = nr_sorted - 2; i >= 0; --i) {
struct btree_iter *iter1 = trans->iters + sorted[i];
struct btree_iter *iter2 = trans->iters + sorted[i + 1];
if (iter1->btree_id == iter2->btree_id &&
iter1->locks_want < iter2->locks_want)
__bch2_btree_iter_upgrade(iter1, iter2->locks_want);
else if (!iter1->locks_want && iter2->locks_want)
__bch2_btree_iter_upgrade(iter1, 1);
}
bch2_trans_unlock(trans); bch2_trans_unlock(trans);
cond_resched(); cond_resched();
@ -1250,6 +1268,8 @@ out:
bch2_btree_cache_cannibalize_unlock(c); bch2_btree_cache_cannibalize_unlock(c);
trans->in_traverse_all = false; trans->in_traverse_all = false;
trace_trans_traverse_all(trans->ip);
return ret; return ret;
} }
@ -2009,10 +2029,14 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
if (iter->btree_id != btree_id) if (iter->btree_id != btree_id)
continue; continue;
if (best && if (best) {
bkey_cmp(bpos_diff(best->real_pos, pos), int cmp = bkey_cmp(bpos_diff(best->real_pos, pos),
bpos_diff(iter->real_pos, pos)) > 0) bpos_diff(iter->real_pos, pos));
continue;
if (cmp < 0 ||
((cmp == 0 && btree_iter_keep(trans, iter))))
continue;
}
best = iter; best = iter;
} }
@ -2040,13 +2064,18 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
iter->snapshot = pos.snapshot; iter->snapshot = pos.snapshot;
locks_want = min(locks_want, BTREE_MAX_DEPTH); /*
* If the iterator has locks_want greater than requested, we explicitly
* do not downgrade it here - on transaction restart because btree node
* split needs to upgrade locks, we might be putting/getting the
* iterator again. Downgrading iterators only happens via an explicit
* bch2_trans_downgrade().
*/
locks_want = min(locks_want, BTREE_MAX_DEPTH);
if (locks_want > iter->locks_want) { if (locks_want > iter->locks_want) {
iter->locks_want = locks_want; iter->locks_want = locks_want;
btree_iter_get_locks(iter, true, false); btree_iter_get_locks(iter, true, false);
} else if (locks_want < iter->locks_want) {
__bch2_btree_iter_downgrade(iter, locks_want);
} }
while (iter->level < depth) { while (iter->level < depth) {
@ -2108,37 +2137,28 @@ struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
return iter; return iter;
} }
static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{ {
if (size > trans->mem_bytes) { size_t new_top = trans->mem_top + size;
void *p;
if (new_top > trans->mem_bytes) {
size_t old_bytes = trans->mem_bytes; size_t old_bytes = trans->mem_bytes;
size_t new_bytes = roundup_pow_of_two(size); size_t new_bytes = roundup_pow_of_two(new_top);
void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
if (!new_mem) if (!new_mem)
return -ENOMEM; return ERR_PTR(-ENOMEM);
trans->mem = new_mem; trans->mem = new_mem;
trans->mem_bytes = new_bytes; trans->mem_bytes = new_bytes;
if (old_bytes) { if (old_bytes) {
trace_trans_restart_mem_realloced(trans->ip, new_bytes); trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes);
return -EINTR; return ERR_PTR(-EINTR);
} }
} }
return 0;
}
void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
void *p;
int ret;
ret = bch2_trans_preload_mem(trans, trans->mem_top + size);
if (ret)
return ERR_PTR(ret);
p = trans->mem + trans->mem_top; p = trans->mem + trans->mem_top;
trans->mem_top += size; trans->mem_top += size;
return p; return p;
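bch2_trans_kmalloc() above now folds the old preload step into the allocation itself: bump mem_top, grow the buffer with krealloc() when the new top crosses mem_bytes, and return -EINTR when the buffer was already in use, because earlier allocations may point into memory krealloc() just moved. A hedged sketch of the same bump-allocator pattern on a standalone structure (struct bump is illustrative, not a bcachefs type):

struct bump {
	void	*mem;
	size_t	top;
	size_t	bytes;
};

static void *bump_alloc(struct bump *b, size_t size)
{
	size_t new_top = b->top + size;
	void *p;

	if (new_top > b->bytes) {
		size_t old_bytes = b->bytes;
		size_t new_bytes = roundup_pow_of_two(new_top);
		void *new_mem = krealloc(b->mem, new_bytes, GFP_NOFS);

		if (!new_mem)
			return ERR_PTR(-ENOMEM);

		b->mem	 = new_mem;
		b->bytes = new_bytes;

		/*
		 * krealloc() may have moved the buffer; pointers handed out
		 * earlier are potentially stale, so ask the caller to restart
		 * instead of handing out one more allocation:
		 */
		if (old_bytes)
			return ERR_PTR(-EINTR);
	}

	p = b->mem + b->top;
	b->top = new_top;
	return p;
}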
@ -2188,7 +2208,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
if (!(flags & TRANS_RESET_NOUNLOCK)) if (!(flags & TRANS_RESET_NOUNLOCK))
bch2_trans_cond_resched(trans); bch2_trans_cond_resched(trans);
if (!(flags & TRANS_RESET_NOTRAVERSE)) if (!(flags & TRANS_RESET_NOTRAVERSE) &&
trans->iters_linked)
bch2_btree_iter_traverse_all(trans); bch2_btree_iter_traverse_all(trans);
} }

View File

@ -187,7 +187,7 @@ static inline int btree_iter_lock_cmp(const struct btree_iter *l,
{ {
return cmp_int(l->btree_id, r->btree_id) ?: return cmp_int(l->btree_id, r->btree_id) ?:
-cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?: -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?:
bkey_cmp(l->pos, r->pos); bkey_cmp(l->real_pos, r->real_pos);
} }
/* /*


@ -222,18 +222,6 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
static inline void btree_insert_entry_checks(struct btree_trans *trans, static inline void btree_insert_entry_checks(struct btree_trans *trans,
struct btree_insert_entry *i) struct btree_insert_entry *i)
{ {
struct bch_fs *c = trans->c;
if (bch2_debug_check_bkeys) {
const char *invalid = bch2_bkey_invalid(c,
bkey_i_to_s_c(i->k), i->bkey_type);
if (invalid) {
char buf[200];
bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
panic("invalid bkey %s on insert: %s\n", buf, invalid);
}
}
BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos)); BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos));
BUG_ON(i->level != i->iter->level); BUG_ON(i->level != i->iter->level);
BUG_ON(i->btree_id != i->iter->btree_id); BUG_ON(i->btree_id != i->iter->btree_id);
@ -319,8 +307,7 @@ btree_key_can_insert_cached(struct btree_trans *trans,
} }
static inline void do_btree_insert_one(struct btree_trans *trans, static inline void do_btree_insert_one(struct btree_trans *trans,
struct btree_iter *iter, struct btree_insert_entry *i)
struct bkey_i *insert)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct journal *j = &c->journal; struct journal *j = &c->journal;
@ -329,20 +316,22 @@ static inline void do_btree_insert_one(struct btree_trans *trans,
EBUG_ON(trans->journal_res.ref != EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
insert->k.needs_whiteout = false; i->k->k.needs_whiteout = false;
did_work = (btree_iter_type(iter) != BTREE_ITER_CACHED) did_work = (btree_iter_type(i->iter) != BTREE_ITER_CACHED)
? btree_insert_key_leaf(trans, iter, insert) ? btree_insert_key_leaf(trans, i->iter, i->k)
: bch2_btree_insert_key_cached(trans, iter, insert); : bch2_btree_insert_key_cached(trans, i->iter, i->k);
if (!did_work) if (!did_work)
return; return;
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
bch2_journal_add_keys(j, &trans->journal_res, bch2_journal_add_keys(j, &trans->journal_res,
iter->btree_id, insert); i->btree_id,
i->level,
i->k);
bch2_journal_set_has_inode(j, &trans->journal_res, bch2_journal_set_has_inode(j, &trans->journal_res,
insert->k.p.inode); i->k->k.p.inode);
if (trans->journal_seq) if (trans->journal_seq)
*trans->journal_seq = trans->journal_res.seq; *trans->journal_seq = trans->journal_res.seq;
@ -480,7 +469,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
bch2_trans_mark_gc(trans); bch2_trans_mark_gc(trans);
trans_for_each_update2(trans, i) trans_for_each_update2(trans, i)
do_btree_insert_one(trans, i->iter, i->k); do_btree_insert_one(trans, i);
err: err:
if (marking) { if (marking) {
percpu_up_read(&c->mark_lock); percpu_up_read(&c->mark_lock);
@ -592,9 +581,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
} }
} }
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) trans_for_each_update2(trans, i) {
trans_for_each_update2(trans, i) const char *invalid = bch2_bkey_invalid(c,
btree_insert_entry_checks(trans, i); bkey_i_to_s_c(i->k), i->bkey_type);
if (invalid) {
char buf[200];
bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
bch_err(c, "invalid bkey %s on insert: %s\n", buf, invalid);
bch2_fatal_error(c);
}
btree_insert_entry_checks(trans, i);
}
bch2_btree_trans_verify_locks(trans); bch2_btree_trans_verify_locks(trans);
trans_for_each_update2(trans, i) trans_for_each_update2(trans, i)
@ -629,25 +627,11 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
static int journal_reclaim_wait_done(struct bch_fs *c) static int journal_reclaim_wait_done(struct bch_fs *c)
{ {
int ret; int ret = bch2_journal_error(&c->journal) ?:
!bch2_btree_key_cache_must_wait(c);
ret = bch2_journal_error(&c->journal);
if (ret)
return ret;
ret = !bch2_btree_key_cache_must_wait(c);
if (ret)
return ret;
journal_reclaim_kick(&c->journal);
if (mutex_trylock(&c->journal.reclaim_lock)) {
ret = bch2_journal_reclaim(&c->journal);
mutex_unlock(&c->journal.reclaim_lock);
}
if (!ret) if (!ret)
ret = !bch2_btree_key_cache_must_wait(c); journal_reclaim_kick(&c->journal);
return ret; return ret;
} }
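journal_reclaim_wait_done() above now collapses to one expression: a journal error (negative) or "key cache no longer must wait" (nonzero) terminates the wait, and while the condition is still false it kicks journal reclaim and keeps waiting; the caller then treats a negative result as fatal. A small illustration of that three-way condition shape, where struct waiter, fatal_error(), ready() and kick() are hypothetical stand-ins:

/* <0: abort the wait with an error, >0: ready, 0: keep waiting. */
static int ready_or_error(struct waiter *w)
{
	int ret = fatal_error(w) ?: ready(w);

	if (!ret)
		kick(w);	/* nudge the background worker, then keep waiting */
	return ret;
}

static int wait_until_ready(struct waiter *w)
{
	int ret;

	wait_event(w->wq, (ret = ready_or_error(w)));
	return ret < 0 ? ret : 0;
}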
@ -735,10 +719,12 @@ int bch2_trans_commit_error(struct btree_trans *trans,
case BTREE_INSERT_NEED_JOURNAL_RECLAIM: case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
bch2_trans_unlock(trans); bch2_trans_unlock(trans);
wait_event(c->journal.reclaim_wait, wait_event_freezable(c->journal.reclaim_wait,
(ret = journal_reclaim_wait_done(c))); (ret = journal_reclaim_wait_done(c)));
if (ret < 0)
return ret;
if (!ret && bch2_trans_relock(trans)) if (bch2_trans_relock(trans))
return 0; return 0;
trace_trans_restart_journal_reclaim(trans->ip); trace_trans_restart_journal_reclaim(trans->ip);
@ -1151,8 +1137,7 @@ int __bch2_btree_insert(struct btree_trans *trans,
iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
BTREE_ITER_INTENT); BTREE_ITER_INTENT);
ret = bch2_btree_iter_traverse(iter) ?: ret = bch2_trans_update(trans, iter, k, 0);
bch2_trans_update(trans, iter, k, 0);
bch2_trans_iter_put(trans, iter); bch2_trans_iter_put(trans, iter);
return ret; return ret;
} }


@ -3,64 +3,6 @@
* Code for manipulating bucket marks for garbage collection. * Code for manipulating bucket marks for garbage collection.
* *
* Copyright 2014 Datera, Inc. * Copyright 2014 Datera, Inc.
*
* Bucket states:
* - free bucket: mark == 0
* The bucket contains no data and will not be read
*
* - allocator bucket: owned_by_allocator == 1
* The bucket is on a free list, or it is an open bucket
*
* - cached bucket: owned_by_allocator == 0 &&
* dirty_sectors == 0 &&
* cached_sectors > 0
* The bucket contains data but may be safely discarded as there are
* enough replicas of the data on other cache devices, or it has been
* written back to the backing device
*
* - dirty bucket: owned_by_allocator == 0 &&
* dirty_sectors > 0
* The bucket contains data that we must not discard (either only copy,
* or one of the 'main copies' for data requiring multiple replicas)
*
* - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
* This is a btree node, journal or gen/prio bucket
*
* Lifecycle:
*
* bucket invalidated => bucket on freelist => open bucket =>
* [dirty bucket =>] cached bucket => bucket invalidated => ...
*
* Note that cache promotion can skip the dirty bucket step, as data
* is copied from a deeper tier to a shallower tier, onto a cached
* bucket.
* Note also that a cached bucket can spontaneously become dirty --
* see below.
*
* Only a traversal of the key space can determine whether a bucket is
* truly dirty or cached.
*
* Transitions:
*
* - free => allocator: bucket was invalidated
* - cached => allocator: bucket was invalidated
*
* - allocator => dirty: open bucket was filled up
* - allocator => cached: open bucket was filled up
* - allocator => metadata: metadata was allocated
*
* - dirty => cached: dirty sectors were copied to a deeper tier
* - dirty => free: dirty sectors were overwritten or moved (copy gc)
* - cached => free: cached sectors were overwritten
*
* - metadata => free: metadata was freed
*
* Oddities:
* - cached => dirty: a device was removed so formerly replicated data
* is no longer sufficiently replicated
* - free => cached: cannot happen
* - free => dirty: cannot happen
* - free => metadata: cannot happen
*/ */
#include "bcachefs.h" #include "bcachefs.h"
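The block comment removed above documented the informal bucket state machine (free, allocator-owned, cached, dirty, metadata) and its transitions. A compact, hedged restatement of the classification it described, over simplified fields; the enum and helper are illustrative, not bcachefs code:

enum bucket_state {
	BUCKET_FREE,		/* mark == 0: no data, will not be read */
	BUCKET_ALLOCATOR,	/* on a freelist, or an open bucket */
	BUCKET_METADATA,	/* btree node, journal or superblock bucket */
	BUCKET_DIRTY,		/* data we must not discard */
	BUCKET_CACHED,		/* data that may safely be discarded */
};

static enum bucket_state classify_bucket(bool owned_by_allocator,
					 bool is_metadata,
					 unsigned dirty_sectors,
					 unsigned cached_sectors)
{
	if (owned_by_allocator)
		return BUCKET_ALLOCATOR;
	if (is_metadata)
		return BUCKET_METADATA;
	if (dirty_sectors)
		return BUCKET_DIRTY;
	if (cached_sectors)
		return BUCKET_CACHED;
	return BUCKET_FREE;
}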
@ -229,7 +171,7 @@ struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
percpu_down_read(&c->mark_lock); percpu_down_read(&c->mark_lock);
ret = kmalloc(sizeof(struct bch_fs_usage_online) + ret = kmalloc(sizeof(struct bch_fs_usage_online) +
sizeof(u64) + c->replicas.nr, GFP_NOFS); sizeof(u64) * c->replicas.nr, GFP_NOFS);
if (unlikely(!ret)) { if (unlikely(!ret)) {
percpu_up_read(&c->mark_lock); percpu_up_read(&c->mark_lock);
return NULL; return NULL;
@ -538,33 +480,17 @@ static inline void update_cached_sectors_list(struct btree_trans *trans,
ret; \ ret; \
}) })
static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator, size_t b, bool owned_by_allocator)
bool gc)
{ {
struct bucket *g = __bucket(ca, b, gc); struct bucket *g = bucket(ca, b);
struct bucket_mark old, new; struct bucket_mark old, new;
old = bucket_cmpxchg(g, new, ({ old = bucket_cmpxchg(g, new, ({
new.owned_by_allocator = owned_by_allocator; new.owned_by_allocator = owned_by_allocator;
})); }));
BUG_ON(!gc && BUG_ON(owned_by_allocator == old.owned_by_allocator);
!owned_by_allocator && !old.owned_by_allocator);
return 0;
}
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
struct gc_pos pos, unsigned flags)
{
preempt_disable();
do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags,
ca, b, owned_by_allocator);
preempt_enable();
} }
static int bch2_mark_alloc(struct bch_fs *c, static int bch2_mark_alloc(struct bch_fs *c,
@ -1890,10 +1816,11 @@ int bch2_trans_mark_update(struct btree_trans *trans,
return 0; return 0;
if (!btree_node_type_is_extents(iter->btree_id)) { if (!btree_node_type_is_extents(iter->btree_id)) {
/* iterators should be uptodate, shouldn't get errors here: */
if (btree_iter_type(iter) != BTREE_ITER_CACHED) { if (btree_iter_type(iter) != BTREE_ITER_CACHED) {
old = bch2_btree_iter_peek_slot(iter); old = bch2_btree_iter_peek_slot(iter);
BUG_ON(bkey_err(old)); ret = bkey_err(old);
if (ret)
return ret;
} else { } else {
struct bkey_cached *ck = (void *) iter->l[0].b; struct bkey_cached *ck = (void *) iter->l[0].b;
@ -2004,22 +1931,6 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
goto out; goto out;
} }
if ((unsigned) (u.dirty_sectors + sectors) > ca->mi.bucket_size) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %llu:%llu gen %u data type %s sector count overflow: %u + %u > %u\n"
"while marking %s",
iter->pos.inode, iter->pos.offset, u.gen,
bch2_data_types[u.data_type ?: type],
u.dirty_sectors, sectors, ca->mi.bucket_size,
bch2_data_types[type]);
ret = -EIO;
goto out;
}
if (u.data_type == type &&
u.dirty_sectors == sectors)
goto out;
u.data_type = type; u.data_type = type;
u.dirty_sectors = sectors; u.dirty_sectors = sectors;
@ -2031,53 +1942,44 @@ out:
} }
int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct disk_reservation *res,
struct bch_dev *ca, size_t b, struct bch_dev *ca, size_t b,
enum bch_data_type type, enum bch_data_type type,
unsigned sectors) unsigned sectors)
{ {
return __bch2_trans_do(trans, res, NULL, 0, return __bch2_trans_do(trans, NULL, NULL, 0,
__bch2_trans_mark_metadata_bucket(trans, ca, b, BCH_DATA_journal, __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
ca->mi.bucket_size));
} }
static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
struct disk_reservation *res,
struct bch_dev *ca, struct bch_dev *ca,
u64 start, u64 end, u64 start, u64 end,
enum bch_data_type type, enum bch_data_type type,
u64 *bucket, unsigned *bucket_sectors) u64 *bucket, unsigned *bucket_sectors)
{ {
int ret;
do { do {
u64 b = sector_to_bucket(ca, start); u64 b = sector_to_bucket(ca, start);
unsigned sectors = unsigned sectors =
min_t(u64, bucket_to_sector(ca, b + 1), end) - start; min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
if (b != *bucket) { if (b != *bucket && *bucket_sectors) {
if (*bucket_sectors) { int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
ret = bch2_trans_mark_metadata_bucket(trans, res, ca, type, *bucket_sectors);
*bucket, type, *bucket_sectors); if (ret)
if (ret) return ret;
return ret;
}
*bucket = b; *bucket_sectors = 0;
*bucket_sectors = 0;
} }
*bucket = b;
*bucket_sectors += sectors; *bucket_sectors += sectors;
start += sectors; start += sectors;
} while (!ret && start < end); } while (start < end);
return 0; return 0;
} }
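bch2_trans_mark_metadata_sectors() above walks a sector range bucket by bucket, accumulating sectors until the bucket number changes and then flushing the previous bucket's total. A hedged sketch of the underlying chunk-per-bucket walk (mark_bucket() is a hypothetical callback; the real code batches consecutive chunks that land in the same bucket instead of flushing each one):

static int walk_bucket_chunks(u64 start, u64 end, u64 bucket_size,
			      int (*mark_bucket)(u64 b, unsigned sectors, void *arg),
			      void *arg)
{
	while (start < end) {
		u64 b = start / bucket_size;		/* sector_to_bucket() */
		u64 bucket_end = (b + 1) * bucket_size;	/* bucket_to_sector(b + 1) */
		unsigned sectors = min(bucket_end, end) - start;
		int ret = mark_bucket(b, sectors, arg);

		if (ret)
			return ret;

		start += sectors;
	}

	return 0;
}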
static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
struct disk_reservation *res, struct bch_dev *ca)
struct bch_dev *ca)
{ {
struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
u64 bucket = 0; u64 bucket = 0;
@ -2088,14 +1990,14 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
u64 offset = le64_to_cpu(layout->sb_offset[i]); u64 offset = le64_to_cpu(layout->sb_offset[i]);
if (offset == BCH_SB_SECTOR) { if (offset == BCH_SB_SECTOR) {
ret = bch2_trans_mark_metadata_sectors(trans, res, ca, ret = bch2_trans_mark_metadata_sectors(trans, ca,
0, BCH_SB_SECTOR, 0, BCH_SB_SECTOR,
BCH_DATA_sb, &bucket, &bucket_sectors); BCH_DATA_sb, &bucket, &bucket_sectors);
if (ret) if (ret)
return ret; return ret;
} }
ret = bch2_trans_mark_metadata_sectors(trans, res, ca, offset, ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
offset + (1 << layout->sb_max_size_bits), offset + (1 << layout->sb_max_size_bits),
BCH_DATA_sb, &bucket, &bucket_sectors); BCH_DATA_sb, &bucket, &bucket_sectors);
if (ret) if (ret)
@ -2103,14 +2005,14 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
} }
if (bucket_sectors) { if (bucket_sectors) {
ret = bch2_trans_mark_metadata_bucket(trans, res, ca, ret = bch2_trans_mark_metadata_bucket(trans, ca,
bucket, BCH_DATA_sb, bucket_sectors); bucket, BCH_DATA_sb, bucket_sectors);
if (ret) if (ret)
return ret; return ret;
} }
for (i = 0; i < ca->journal.nr; i++) { for (i = 0; i < ca->journal.nr; i++) {
ret = bch2_trans_mark_metadata_bucket(trans, res, ca, ret = bch2_trans_mark_metadata_bucket(trans, ca,
ca->journal.buckets[i], ca->journal.buckets[i],
BCH_DATA_journal, ca->mi.bucket_size); BCH_DATA_journal, ca->mi.bucket_size);
if (ret) if (ret)
@ -2120,12 +2022,10 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
return 0; return 0;
} }
int bch2_trans_mark_dev_sb(struct bch_fs *c, int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
struct disk_reservation *res,
struct bch_dev *ca)
{ {
return bch2_trans_do(c, res, NULL, 0, return bch2_trans_do(c, NULL, NULL, 0,
__bch2_trans_mark_dev_sb(&trans, res, ca)); __bch2_trans_mark_dev_sb(&trans, ca));
} }
/* Disk reservations: */ /* Disk reservations: */


@ -191,6 +191,7 @@ static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca,
for (i = 0; i < RESERVE_NR; i++) for (i = 0; i < RESERVE_NR; i++)
available -= fifo_used(&ca->free[i]); available -= fifo_used(&ca->free[i]);
available -= fifo_used(&ca->free_inc); available -= fifo_used(&ca->free_inc);
available -= ca->nr_open_buckets;
spin_unlock(&c->freelist_lock); spin_unlock(&c->freelist_lock);
return max(available, 0LL); return max(available, 0LL);
@ -234,8 +235,7 @@ bch2_fs_usage_read_short(struct bch_fs *);
void bch2_bucket_seq_cleanup(struct bch_fs *); void bch2_bucket_seq_cleanup(struct bch_fs *);
void bch2_fs_usage_initialize(struct bch_fs *); void bch2_fs_usage_initialize(struct bch_fs *);
void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool);
size_t, bool, struct gc_pos, unsigned);
void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
size_t, enum bch_data_type, unsigned, size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned); struct gc_pos, unsigned);
@ -252,11 +252,9 @@ int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter,
struct bkey_i *insert, unsigned); struct bkey_i *insert, unsigned);
void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_mark_metadata_bucket(struct btree_trans *, int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
struct disk_reservation *, struct bch_dev *, size_t, enum bch_data_type, unsigned);
size_t, enum bch_data_type, unsigned); int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
int bch2_trans_mark_dev_sb(struct bch_fs *, struct disk_reservation *,
struct bch_dev *);
/* disk reservations: */ /* disk reservations: */


@ -59,6 +59,11 @@ struct bch_dev_usage {
struct { struct {
u64 buckets; u64 buckets;
u64 sectors; /* _compressed_ sectors: */ u64 sectors; /* _compressed_ sectors: */
/*
* XXX
* Why do we have this? Isn't it just buckets * bucket_size -
* sectors?
*/
u64 fragmented; u64 fragmented;
} d[BCH_DATA_NR]; } d[BCH_DATA_NR];
}; };


@ -2619,54 +2619,21 @@ err:
return ret; return ret;
} }
static long bchfs_fallocate(struct bch_inode_info *inode, int mode, static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
loff_t offset, loff_t len) u64 start_sector, u64 end_sector)
{ {
struct address_space *mapping = inode->v.i_mapping;
struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct btree_trans trans; struct btree_trans trans;
struct btree_iter *iter; struct btree_iter *iter;
struct bpos end_pos; struct bpos end_pos = POS(inode->v.i_ino, end_sector);
loff_t end = offset + len;
loff_t block_start = round_down(offset, block_bytes(c));
loff_t block_end = round_up(end, block_bytes(c));
unsigned sectors;
unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas;
int ret; int ret = 0;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
inode_lock(&inode->v);
inode_dio_wait(&inode->v);
bch2_pagecache_block_get(&inode->ei_pagecache_lock);
if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
ret = inode_newsize_ok(&inode->v, end);
if (ret)
goto err;
}
if (mode & FALLOC_FL_ZERO_RANGE) {
ret = __bch2_truncate_page(inode,
offset >> PAGE_SHIFT,
offset, end);
if (!ret &&
offset >> PAGE_SHIFT != end >> PAGE_SHIFT)
ret = __bch2_truncate_page(inode,
end >> PAGE_SHIFT,
offset, end);
if (unlikely(ret))
goto err;
truncate_pagecache_range(&inode->v, offset, end - 1);
}
iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
POS(inode->v.i_ino, block_start >> 9), POS(inode->v.i_ino, start_sector),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT); BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
end_pos = POS(inode->v.i_ino, block_end >> 9);
while (!ret && bkey_cmp(iter->pos, end_pos) < 0) { while (!ret && bkey_cmp(iter->pos, end_pos) < 0) {
s64 i_sectors_delta = 0; s64 i_sectors_delta = 0;
@ -2674,6 +2641,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
struct quota_res quota_res = { 0 }; struct quota_res quota_res = { 0 };
struct bkey_i_reservation reservation; struct bkey_i_reservation reservation;
struct bkey_s_c k; struct bkey_s_c k;
unsigned sectors;
bch2_trans_begin(&trans); bch2_trans_begin(&trans);
@ -2734,7 +2702,48 @@ bkey_err:
ret = 0; ret = 0;
} }
bch2_trans_iter_put(&trans, iter); bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
return ret;
}
static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
loff_t offset, loff_t len)
{
struct address_space *mapping = inode->v.i_mapping;
struct bch_fs *c = inode->v.i_sb->s_fs_info;
loff_t end = offset + len;
loff_t block_start = round_down(offset, block_bytes(c));
loff_t block_end = round_up(end, block_bytes(c));
int ret;
inode_lock(&inode->v);
inode_dio_wait(&inode->v);
bch2_pagecache_block_get(&inode->ei_pagecache_lock);
if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
ret = inode_newsize_ok(&inode->v, end);
if (ret)
goto err;
}
if (mode & FALLOC_FL_ZERO_RANGE) {
ret = __bch2_truncate_page(inode,
offset >> PAGE_SHIFT,
offset, end);
if (!ret &&
offset >> PAGE_SHIFT != end >> PAGE_SHIFT)
ret = __bch2_truncate_page(inode,
end >> PAGE_SHIFT,
offset, end);
if (unlikely(ret))
goto err;
truncate_pagecache_range(&inode->v, offset, end - 1);
}
ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);
if (ret) if (ret)
goto err; goto err;
@ -2748,28 +2757,13 @@ bkey_err:
if (end >= inode->v.i_size && if (end >= inode->v.i_size &&
(!(mode & FALLOC_FL_KEEP_SIZE) || (!(mode & FALLOC_FL_KEEP_SIZE) ||
(mode & FALLOC_FL_ZERO_RANGE))) { (mode & FALLOC_FL_ZERO_RANGE))) {
struct btree_iter *inode_iter;
struct bch_inode_unpacked inode_u;
do {
bch2_trans_begin(&trans);
inode_iter = bch2_inode_peek(&trans, &inode_u,
inode->v.i_ino, 0);
ret = PTR_ERR_OR_ZERO(inode_iter);
} while (ret == -EINTR);
bch2_trans_iter_put(&trans, inode_iter);
bch2_trans_unlock(&trans);
if (ret)
goto err;
/* /*
* Sync existing appends before extending i_size, * Sync existing appends before extending i_size,
* as in bch2_extend(): * as in bch2_extend():
*/ */
ret = filemap_write_and_wait_range(mapping, ret = filemap_write_and_wait_range(mapping,
inode_u.bi_size, S64_MAX); inode->ei_inode.bi_size, S64_MAX);
if (ret) if (ret)
goto err; goto err;
@ -2783,7 +2777,6 @@ bkey_err:
mutex_unlock(&inode->ei_update_lock); mutex_unlock(&inode->ei_update_lock);
} }
err: err:
bch2_trans_exit(&trans);
bch2_pagecache_block_put(&inode->ei_pagecache_lock); bch2_pagecache_block_put(&inode->ei_pagecache_lock);
inode_unlock(&inode->v); inode_unlock(&inode->v);
return ret; return ret;


@ -81,51 +81,37 @@ static int write_inode(struct btree_trans *trans,
return ret; return ret;
} }
static int __remove_dirent(struct btree_trans *trans, static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
struct bkey_s_c_dirent dirent)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct qstr name; struct btree_iter *iter;
struct bch_inode_unpacked dir_inode; struct bch_inode_unpacked dir_inode;
struct bch_hash_info dir_hash_info; struct bch_hash_info dir_hash_info;
u64 dir_inum = dirent.k->p.inode;
int ret; int ret;
char *buf;
name.len = bch2_dirent_name_bytes(dirent); ret = lookup_inode(trans, pos.inode, &dir_inode, NULL);
buf = bch2_trans_kmalloc(trans, name.len + 1);
if (IS_ERR(buf))
return PTR_ERR(buf);
memcpy(buf, dirent.v->d_name, name.len);
buf[name.len] = '\0';
name.name = buf;
ret = lookup_inode(trans, dir_inum, &dir_inode, NULL);
if (ret && ret != -EINTR)
bch_err(c, "remove_dirent: err %i looking up directory inode", ret);
if (ret) if (ret)
return ret; return ret;
dir_hash_info = bch2_hash_info_init(c, &dir_inode); dir_hash_info = bch2_hash_info_init(c, &dir_inode);
ret = bch2_hash_delete(trans, bch2_dirent_hash_desc, iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
&dir_hash_info, dir_inum, &name);
if (ret && ret != -EINTR)
bch_err(c, "remove_dirent: err %i deleting dirent", ret);
if (ret)
return ret;
return 0; ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
&dir_hash_info, iter);
bch2_trans_iter_put(trans, iter);
return ret;
} }
static int remove_dirent(struct btree_trans *trans, static int remove_dirent(struct btree_trans *trans, struct bpos pos)
struct bkey_s_c_dirent dirent)
{ {
return __bch2_trans_do(trans, NULL, NULL, int ret = __bch2_trans_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL| BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW, BTREE_INSERT_LAZY_RW,
__remove_dirent(trans, dirent)); __remove_dirent(trans, pos));
if (ret)
bch_err(trans->c, "remove_dirent: err %i deleting dirent", ret);
return ret;
} }
static int __reattach_inode(struct btree_trans *trans, static int __reattach_inode(struct btree_trans *trans,
@ -173,13 +159,10 @@ static int reattach_inode(struct btree_trans *trans,
struct bch_inode_unpacked *lostfound, struct bch_inode_unpacked *lostfound,
u64 inum) u64 inum)
{ {
struct bch_fs *c = trans->c; int ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
int ret;
ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
__reattach_inode(trans, lostfound, inum)); __reattach_inode(trans, lostfound, inum));
if (ret) if (ret)
bch_err(c, "error %i reattaching inode %llu", ret, inum); bch_err(trans->c, "error %i reattaching inode %llu", ret, inum);
return ret; return ret;
} }
@ -202,7 +185,7 @@ static int remove_backpointer(struct btree_trans *trans,
goto out; goto out;
} }
ret = remove_dirent(trans, bkey_s_c_to_dirent(k)); ret = remove_dirent(trans, k.k->p);
out: out:
bch2_trans_iter_put(trans, iter); bch2_trans_iter_put(trans, iter);
return ret; return ret;
@ -752,7 +735,7 @@ retry:
"dirent points to missing inode:\n%s", "dirent points to missing inode:\n%s",
(bch2_bkey_val_to_text(&PBUF(buf), c, (bch2_bkey_val_to_text(&PBUF(buf), c,
k), buf))) { k), buf))) {
ret = remove_dirent(&trans, d); ret = remove_dirent(&trans, d.k->p);
if (ret) if (ret)
goto err; goto err;
goto next; goto next;
@ -783,7 +766,7 @@ retry:
backpointer_exists, c, backpointer_exists, c,
"directory %llu with multiple links", "directory %llu with multiple links",
target.bi_inum)) { target.bi_inum)) {
ret = remove_dirent(&trans, d); ret = remove_dirent(&trans, d.k->p);
if (ret) if (ret)
goto err; goto err;
continue; continue;


@ -787,7 +787,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
* We may be called from the device add path, before the new device has * We may be called from the device add path, before the new device has
* actually been added to the running filesystem: * actually been added to the running filesystem:
*/ */
if (c) if (!new_fs)
spin_lock(&c->journal.lock); spin_lock(&c->journal.lock);
memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
@ -795,17 +795,17 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
swap(new_buckets, ja->buckets); swap(new_buckets, ja->buckets);
swap(new_bucket_seq, ja->bucket_seq); swap(new_bucket_seq, ja->bucket_seq);
if (c) if (!new_fs)
spin_unlock(&c->journal.lock); spin_unlock(&c->journal.lock);
while (ja->nr < nr) { while (ja->nr < nr) {
struct open_bucket *ob = NULL; struct open_bucket *ob = NULL;
unsigned pos; unsigned pos;
long bucket; long b;
if (new_fs) { if (new_fs) {
bucket = bch2_bucket_alloc_new_fs(ca); b = bch2_bucket_alloc_new_fs(ca);
if (bucket < 0) { if (b < 0) {
ret = -ENOSPC; ret = -ENOSPC;
goto err; goto err;
} }
@ -819,10 +819,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
goto err; goto err;
} }
bucket = sector_to_bucket(ca, ob->ptr.offset); b = sector_to_bucket(ca, ob->ptr.offset);
}
if (c) {
percpu_down_read(&c->mark_lock); percpu_down_read(&c->mark_lock);
spin_lock(&c->journal.lock); spin_lock(&c->journal.lock);
} }
@ -839,9 +837,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
__array_insert_item(journal_buckets->buckets, ja->nr, pos); __array_insert_item(journal_buckets->buckets, ja->nr, pos);
ja->nr++; ja->nr++;
ja->buckets[pos] = bucket; ja->buckets[pos] = b;
ja->bucket_seq[pos] = 0; ja->bucket_seq[pos] = 0;
journal_buckets->buckets[pos] = cpu_to_le64(bucket); journal_buckets->buckets[pos] = cpu_to_le64(b);
if (pos <= ja->discard_idx) if (pos <= ja->discard_idx)
ja->discard_idx = (ja->discard_idx + 1) % ja->nr; ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
@ -852,28 +850,25 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
if (pos <= ja->cur_idx) if (pos <= ja->cur_idx)
ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
if (!c || new_fs) if (new_fs) {
bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal, bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
ca->mi.bucket_size, ca->mi.bucket_size,
gc_phase(GC_PHASE_SB), gc_phase(GC_PHASE_SB),
0); 0);
} else {
if (c) {
spin_unlock(&c->journal.lock); spin_unlock(&c->journal.lock);
percpu_up_read(&c->mark_lock); percpu_up_read(&c->mark_lock);
}
if (c && !new_fs)
ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
bch2_trans_mark_metadata_bucket(&trans, NULL, ca, bch2_trans_mark_metadata_bucket(&trans, ca,
bucket, BCH_DATA_journal, b, BCH_DATA_journal,
ca->mi.bucket_size)); ca->mi.bucket_size));
if (!new_fs)
bch2_open_bucket_put(c, ob); bch2_open_bucket_put(c, ob);
if (ret) if (ret)
goto err; goto err;
}
} }
err: err:
bch2_sb_resize_journal(&ca->disk_sb, bch2_sb_resize_journal(&ca->disk_sb,


@ -241,10 +241,11 @@ static inline void bch2_journal_add_entry(struct journal *j, struct journal_res
} }
static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
enum btree_id id, const struct bkey_i *k) enum btree_id id, unsigned level,
const struct bkey_i *k)
{ {
bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys, bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys,
id, 0, k, k->k.u64s); id, level, k, k->k.u64s);
} }
static inline bool journal_entry_empty(struct jset *j) static inline bool journal_entry_empty(struct jset *j)


@ -599,7 +599,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_fs *c = container_of(j, struct bch_fs, journal);
bool kthread = (current->flags & PF_KTHREAD) != 0; bool kthread = (current->flags & PF_KTHREAD) != 0;
u64 seq_to_flush; u64 seq_to_flush;
size_t min_nr, nr_flushed; size_t min_nr, min_key_cache, nr_flushed;
unsigned flags; unsigned flags;
int ret = 0; int ret = 0;
@ -649,9 +649,10 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
atomic_long_read(&c->btree_key_cache.nr_dirty), atomic_long_read(&c->btree_key_cache.nr_dirty),
atomic_long_read(&c->btree_key_cache.nr_keys)); atomic_long_read(&c->btree_key_cache.nr_keys));
min_key_cache = min(bch2_nr_btree_keys_need_flush(c), 128UL);
nr_flushed = journal_flush_pins(j, seq_to_flush, nr_flushed = journal_flush_pins(j, seq_to_flush,
min_nr, min_nr, min_key_cache);
min(bch2_nr_btree_keys_need_flush(c), 128UL));
if (direct) if (direct)
j->nr_direct_reclaim += nr_flushed; j->nr_direct_reclaim += nr_flushed;
@ -661,7 +662,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
if (nr_flushed) if (nr_flushed)
wake_up(&j->reclaim_wait); wake_up(&j->reclaim_wait);
} while (min_nr && nr_flushed && !direct); } while ((min_nr || min_key_cache) && !direct);
memalloc_noreclaim_restore(flags); memalloc_noreclaim_restore(flags);


@ -68,7 +68,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
bch2_bkey_buf_init(&_insert); bch2_bkey_buf_init(&_insert);
bch2_bkey_buf_realloc(&_insert, c, U8_MAX); bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
iter = bch2_trans_get_iter(&trans, m->btree_id, iter = bch2_trans_get_iter(&trans, m->btree_id,
bkey_start_pos(&bch2_keylist_front(keys)->k), bkey_start_pos(&bch2_keylist_front(keys)->k),


@ -108,7 +108,7 @@ static bool have_copygc_reserve(struct bch_dev *ca)
spin_lock(&ca->fs->freelist_lock); spin_lock(&ca->fs->freelist_lock);
ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) ||
ca->allocator_state != ALLOCATOR_RUNNING; ca->allocator_state != ALLOCATOR_running;
spin_unlock(&ca->fs->freelist_lock); spin_unlock(&ca->fs->freelist_lock);
return ret; return ret;
@ -222,7 +222,7 @@ static int bch2_copygc(struct bch_fs *c)
ret = bch2_move_data(c, ret = bch2_move_data(c,
0, POS_MIN, 0, POS_MIN,
BTREE_ID_NR, POS_MAX, BTREE_ID_NR, POS_MAX,
&c->copygc_pd.rate, NULL,
writepoint_ptr(&c->copygc_write_point), writepoint_ptr(&c->copygc_write_point),
copygc_pred, NULL, copygc_pred, NULL,
&move_stats); &move_stats);
@ -282,8 +282,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
{ {
struct bch_dev *ca; struct bch_dev *ca;
unsigned dev_idx; unsigned dev_idx;
u64 fragmented_allowed = c->copygc_threshold; u64 fragmented_allowed = 0, fragmented = 0;
u64 fragmented = 0;
for_each_rw_member(ca, c, dev_idx) { for_each_rw_member(ca, c, dev_idx) {
struct bch_dev_usage usage = bch2_dev_usage_read(ca); struct bch_dev_usage usage = bch2_dev_usage_read(ca);
@ -312,11 +311,14 @@ static int bch2_copygc_thread(void *arg)
wait = bch2_copygc_wait_amount(c); wait = bch2_copygc_wait_amount(c);
if (wait > clock->max_slop) { if (wait > clock->max_slop) {
c->copygc_wait = last + wait;
bch2_kthread_io_clock_wait(clock, last + wait, bch2_kthread_io_clock_wait(clock, last + wait,
MAX_SCHEDULE_TIMEOUT); MAX_SCHEDULE_TIMEOUT);
continue; continue;
} }
c->copygc_wait = 0;
if (bch2_copygc(c)) if (bch2_copygc(c))
break; break;
} }
@ -326,9 +328,6 @@ static int bch2_copygc_thread(void *arg)
void bch2_copygc_stop(struct bch_fs *c) void bch2_copygc_stop(struct bch_fs *c)
{ {
c->copygc_pd.rate.rate = UINT_MAX;
bch2_ratelimit_reset(&c->copygc_pd.rate);
if (c->copygc_thread) { if (c->copygc_thread) {
kthread_stop(c->copygc_thread); kthread_stop(c->copygc_thread);
put_task_struct(c->copygc_thread); put_task_struct(c->copygc_thread);
@ -365,6 +364,4 @@ int bch2_copygc_start(struct bch_fs *c)
void bch2_fs_copygc_init(struct bch_fs *c) void bch2_fs_copygc_init(struct bch_fs *c)
{ {
bch2_pd_controller_init(&c->copygc_pd);
c->copygc_pd.d_term = 0;
} }


@@ -1005,13 +1005,6 @@ int bch2_fs_recovery(struct bch_fs *c)
 	}
 
-	if (!c->sb.clean &&
-	    !(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) {
-		bch_info(c, "BCH_FEATURE_atomic_nlink not set and filesystem dirty, fsck required");
-		c->opts.fsck = true;
-		c->opts.fix_errors = FSCK_OPT_YES;
-	}
-
 	if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) {
 		bch_info(c, "alloc_v2 feature bit not set, fsck required");
 		c->opts.fsck = true;
@@ -1145,9 +1138,11 @@ use_clean:
 	    !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) ||
 	    !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) ||
 	    test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
+		bool metadata_only = c->opts.norecovery;
+
 		bch_info(c, "starting mark and sweep");
 		err = "error in mark and sweep";
-		ret = bch2_gc(c, true);
+		ret = bch2_gc(c, true, metadata_only);
 		if (ret)
 			goto err;
 		bch_verbose(c, "mark and sweep done");
@@ -1245,8 +1240,8 @@ use_clean:
 	}
 
 	if (c->opts.fsck &&
-	    !test_bit(BCH_FS_ERROR, &c->flags)) {
-		c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink;
+	    !test_bit(BCH_FS_ERROR, &c->flags) &&
+	    BCH_SB_HAS_ERRORS(c->disk_sb.sb)) {
 		SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
 		write_sb = true;
 	}
@@ -1338,10 +1333,12 @@ int bch2_fs_initialize(struct bch_fs *c)
 	 * Write out the superblock and journal buckets, now that we can do
 	 * btree updates
 	 */
-	err = "error writing alloc info";
-	ret = bch2_alloc_write(c, 0);
-	if (ret)
-		goto err;
+	err = "error marking superblock and journal";
+	for_each_member_device(ca, c, i) {
+		ret = bch2_trans_mark_dev_sb(c, ca);
+		if (ret)
+			goto err;
+	}
 
 	bch2_inode_init(c, &root_inode, 0, 0,
 			S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);

@@ -313,8 +313,8 @@ static int replicas_table_update(struct bch_fs *c,
 out:
 	free_percpu(new_gc);
 	kfree(new_scratch);
-	free_percpu(new_usage[1]);
-	free_percpu(new_usage[0]);
+	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+		free_percpu(new_usage[i]);
 	kfree(new_base);
 	return ret;
 err:

@@ -286,7 +286,6 @@ void bch2_fs_read_only(struct bch_fs *c)
 	percpu_ref_kill(&c->writes);
 
 	cancel_work_sync(&c->ec_stripe_delete_work);
-	cancel_delayed_work(&c->pd_controllers_update);
 
 	/*
 	 * If we're not doing an emergency shutdown, we want to wait on
@@ -371,8 +370,6 @@ static int bch2_fs_read_write_late(struct bch_fs *c)
 		return ret;
 	}
 
-	schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
-
 	schedule_work(&c->ec_stripe_delete_work);
 
 	return 0;
@@ -566,7 +563,6 @@ void __bch2_fs_stop(struct bch_fs *c)
 		cancel_work_sync(&ca->io_error_work);
 
 	cancel_work_sync(&c->btree_write_error_work);
-	cancel_delayed_work_sync(&c->pd_controllers_update);
 	cancel_work_sync(&c->read_only_work);
 
 	for (i = 0; i < c->sb.nr_devices; i++)
@@ -908,9 +904,16 @@ int bch2_fs_start(struct bch_fs *c)
 	/*
 	 * Allocator threads don't start filling copygc reserve until after we
 	 * set BCH_FS_STARTED - wake them now:
+	 *
+	 * XXX ugly hack:
+	 * Need to set ca->allocator_state here instead of relying on the
+	 * allocator threads to do it to avoid racing with the copygc threads
+	 * checking it and thinking they have no alloc reserve:
 	 */
-	for_each_online_member(ca, c, i)
+	for_each_online_member(ca, c, i) {
+		ca->allocator_state = ALLOCATOR_running;
 		bch2_wake_allocator(ca);
+	}
 
 	if (c->opts.read_only || c->opts.nochanges) {
 		bch2_fs_read_only(c);
@@ -1679,7 +1682,7 @@ have_slot:
 	bch2_dev_usage_journal_reserve(c);
 
 	err = "error marking superblock";
-	ret = bch2_trans_mark_dev_sb(c, NULL, ca);
+	ret = bch2_trans_mark_dev_sb(c, ca);
 	if (ret)
 		goto err_late;
@@ -1739,7 +1742,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
 	ca = bch_dev_locked(c, dev_idx);
 
-	if (bch2_trans_mark_dev_sb(c, NULL, ca)) {
+	if (bch2_trans_mark_dev_sb(c, ca)) {
 		err = "bch2_trans_mark_dev_sb() error";
 		goto err;
 	}

@@ -132,10 +132,10 @@ do { \
 } while (0)
 
 write_attribute(trigger_journal_flush);
-write_attribute(trigger_btree_coalesce);
 write_attribute(trigger_gc);
 write_attribute(prune_cache);
 rw_attribute(btree_gc_periodic);
+rw_attribute(gc_gens_pos);
 
 read_attribute(uuid);
 read_attribute(minor);
@@ -190,7 +190,7 @@ rw_attribute(cache_replacement_policy);
 rw_attribute(label);
 
 rw_attribute(copy_gc_enabled);
-sysfs_pd_controller_attribute(copy_gc);
+read_attribute(copy_gc_wait);
 
 rw_attribute(rebalance_enabled);
 sysfs_pd_controller_attribute(rebalance);
@@ -199,8 +199,6 @@ rw_attribute(promote_whole_extents);
 read_attribute(new_stripes);
 
-rw_attribute(pd_controllers_update_seconds);
-
 read_attribute(io_timers_read);
 read_attribute(io_timers_write);
@@ -314,6 +312,13 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
 	return 0;
 }
 
+void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	pr_buf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]);
+	bch2_bpos_to_text(out, c->gc_gens_pos);
+	pr_buf(out, "\n");
+}
+
 SHOW(bch2_fs)
 {
 	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@@ -339,14 +344,18 @@ SHOW(bch2_fs)
 	sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
 
-	sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
+	if (attr == &sysfs_gc_gens_pos) {
+		bch2_gc_gens_pos_to_text(&out, c);
+		return out.pos - buf;
+	}
 
-	sysfs_print(pd_controllers_update_seconds,
-		    c->pd_controllers_update_seconds);
+	sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
 
 	sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled);
 	sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */
-	sysfs_pd_controller_show(copy_gc, &c->copygc_pd);
+	sysfs_hprint(copy_gc_wait,
+		     max(0LL, c->copygc_wait -
+			 atomic64_read(&c->io_clock[WRITE].now)) << 9);
 
 	if (attr == &sysfs_rebalance_work) {
 		bch2_rebalance_work_to_text(&out, c);
@@ -454,10 +463,7 @@ STORE(bch2_fs)
 		return ret;
 	}
 
-	sysfs_strtoul(pd_controllers_update_seconds,
-		      c->pd_controllers_update_seconds);
 	sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
-	sysfs_pd_controller_store(copy_gc, &c->copygc_pd);
 
 	sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);
@@ -471,9 +477,6 @@ STORE(bch2_fs)
 	if (attr == &sysfs_trigger_journal_flush)
 		bch2_journal_meta(&c->journal);
 
-	if (attr == &sysfs_trigger_btree_coalesce)
-		bch2_coalesce(c);
-
 	if (attr == &sysfs_trigger_gc) {
 		/*
 		 * Full gc is currently incompatible with btree key cache:
@@ -570,16 +573,16 @@ struct attribute *bch2_fs_internal_files[] = {
 	&sysfs_extent_migrate_raced,
 
 	&sysfs_trigger_journal_flush,
-	&sysfs_trigger_btree_coalesce,
 	&sysfs_trigger_gc,
+	&sysfs_gc_gens_pos,
 	&sysfs_prune_cache,
 
 	&sysfs_copy_gc_enabled,
+	&sysfs_copy_gc_wait,
 	&sysfs_rebalance_enabled,
 	&sysfs_rebalance_work,
 	sysfs_pd_controller_files(rebalance),
-	sysfs_pd_controller_files(copy_gc),
 
 	&sysfs_new_stripes,
@@ -817,23 +820,28 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
 	       "free[RESERVE_MOVINGGC]\t%zu/%zu\n"
 	       "free[RESERVE_NONE]\t%zu/%zu\n"
 	       "freelist_wait\t\t%s\n"
-	       "open buckets\t\t%u/%u (reserved %u)\n"
+	       "open buckets allocated\t%u\n"
+	       "open buckets this dev\t%u\n"
+	       "open buckets total\t%u\n"
 	       "open_buckets_wait\t%s\n"
 	       "open_buckets_btree\t%u\n"
 	       "open_buckets_user\t%u\n"
-	       "btree reserve cache\t%u\n",
+	       "btree reserve cache\t%u\n"
+	       "thread state:\t\t%s\n",
 	       stats.buckets_ec,
 	       __dev_buckets_available(ca, stats),
 	       fifo_used(&ca->free_inc), ca->free_inc.size,
 	       fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
 	       fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
 	       c->freelist_wait.list.first ? "waiting" : "empty",
-	       c->open_buckets_nr_free, OPEN_BUCKETS_COUNT,
-	       BTREE_NODE_OPEN_BUCKET_RESERVE,
+	       OPEN_BUCKETS_COUNT - c->open_buckets_nr_free,
+	       ca->nr_open_buckets,
+	       OPEN_BUCKETS_COUNT,
 	       c->open_buckets_wait.list.first ? "waiting" : "empty",
 	       nr[BCH_DATA_btree],
 	       nr[BCH_DATA_user],
-	       c->btree_reserve_cache_nr);
+	       c->btree_reserve_cache_nr,
+	       bch2_allocator_states[ca->allocator_state]);
 }
 
 static const char * const bch2_rw[] = {

@@ -497,6 +497,42 @@ static int rand_insert(struct bch_fs *c, u64 nr)
 	return ret;
 }
 
+static int rand_insert_multi(struct bch_fs *c, u64 nr)
+{
+	struct btree_trans trans;
+	struct bkey_i_cookie k[8];
+	int ret = 0;
+	unsigned j;
+	u64 i;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for (i = 0; i < nr; i += ARRAY_SIZE(k)) {
+		for (j = 0; j < ARRAY_SIZE(k); j++) {
+			bkey_cookie_init(&k[j].k_i);
+			k[j].k.p.offset = test_rand();
+			k[j].k.p.snapshot = U32_MAX;
+		}
+
+		ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+			__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i) ?:
+			__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i) ?:
+			__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i) ?:
+			__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i) ?:
+			__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i) ?:
+			__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i) ?:
+			__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i) ?:
+			__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i));
+		if (ret) {
+			bch_err(c, "error in rand_insert_multi: %i", ret);
+			break;
+		}
+	}
+
+	bch2_trans_exit(&trans);
+	return ret;
+}
+
 static int rand_lookup(struct bch_fs *c, u64 nr)
 {
 	struct btree_trans trans;
@@ -765,6 +801,7 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
 	if (!strcmp(testname, #_test)) j.fn = _test
 
 	perf_test(rand_insert);
+	perf_test(rand_insert_multi);
 	perf_test(rand_lookup);
 	perf_test(rand_mixed);
 	perf_test(rand_delete);