Update bcachefs sources to 783085c3cc44 kbuild: Allow gcov to be enabled on the command line

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet 2023-11-20 19:33:52 -05:00
parent e6b578917f
commit 06611a71a3
71 changed files with 1309 additions and 632 deletions

View File

@ -1 +1 @@
938f680845d1be28979e23aee972dba010c464ba
783085c3cc440183ba5e987b1aa7791cc1ca42ba

View File

@ -5,6 +5,7 @@
#include "libbcachefs/bcachefs_ioctl.h"
#include "libbcachefs/btree_cache.h"
#include "libbcachefs/move.h"
#include "cmds.h"
#include "libbcachefs.h"
@ -55,7 +56,7 @@ int cmd_data_rereplicate(int argc, char *argv[])
die("too many arguments");
return bchu_data(bcache_fs_open(fs_path), (struct bch_ioctl_data) {
.op = BCH_DATA_OP_REREPLICATE,
.op = BCH_DATA_OP_rereplicate,
.start_btree = 0,
.start_pos = POS_MIN,
.end_btree = BTREE_ID_NR,
@ -70,7 +71,7 @@ static void data_job_usage(void)
"\n"
"Kick off a data job and report progress\n"
"\n"
"job: one of scrub, rereplicate, migrate, or rewrite_old_nodes\n"
"job: one of scrub, rereplicate, migrate, rewrite_old_nodes, or drop_extra_replicas\n"
"\n"
"Options:\n"
" -b btree btree to operate on\n"
@ -81,14 +82,6 @@ static void data_job_usage(void)
exit(EXIT_SUCCESS);
}
const char * const data_jobs[] = {
"scrub",
"rereplicate",
"migrate",
"rewrite_old_nodes",
NULL
};
int cmd_data_job(int argc, char *argv[])
{
struct bch_ioctl_data op = {
@ -121,10 +114,7 @@ int cmd_data_job(int argc, char *argv[])
if (!job)
die("please specify which type of job");
op.op = read_string_list_or_die(job, data_jobs, "bad job type");
if (op.op == BCH_DATA_OP_SCRUB)
die("scrub not implemented yet");
op.op = read_string_list_or_die(job, bch2_data_ops_strs, "bad job type");
char *fs_path = arg_pop();
if (!fs_path)

View File

@ -332,7 +332,7 @@ int cmd_device_evacuate(int argc, char *argv[])
}
return bchu_data(fs, (struct bch_ioctl_data) {
.op = BCH_DATA_OP_MIGRATE,
.op = BCH_DATA_OP_migrate,
.start_btree = 0,
.start_pos = POS_MIN,
.end_btree = BTREE_ID_NR,

View File

@ -161,6 +161,13 @@ static inline i_type a_type##_read(const a_type##_t *v) \
return __ATOMIC_READ(&v->counter); \
} \
\
static inline i_type a_type##_read_acquire(const a_type##_t *v) \
{ \
i_type ret = __ATOMIC_READ(&v->counter); \
smp_mb__after_atomic(); \
return ret; \
} \
\
static inline void a_type##_set(a_type##_t *v, i_type i) \
{ \
return __ATOMIC_SET(&v->counter, i); \
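
The new a_type##_read_acquire() helper above is a plain counter read followed by a barrier. As a rough stand-alone analogue in C11 atomics (names invented for illustration; this is not the macro's actual expansion):

#include <stdatomic.h>

/* Illustrative only: a relaxed load followed by an acquire fence,
 * approximating the shim's __ATOMIC_READ() + smp_mb__after_atomic()
 * sequence above. */
static inline int example_atomic_read_acquire(atomic_int *v)
{
	int ret = atomic_load_explicit(v, memory_order_relaxed);
	atomic_thread_fence(memory_order_acquire);
	return ret;
}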

View File

@ -104,7 +104,7 @@
struct closure;
struct closure_syncer;
typedef void (closure_fn) (struct closure *);
typedef void (closure_fn) (struct work_struct *);
extern struct dentry *bcache_debug;
struct closure_waitlist {
@ -254,7 +254,7 @@ static inline void closure_queue(struct closure *cl)
INIT_WORK(&cl->work, cl->work.func);
BUG_ON(!queue_work(wq, &cl->work));
} else
cl->fn(cl);
cl->fn(&cl->work);
}
/**
@ -309,6 +309,11 @@ static inline void closure_wake_up(struct closure_waitlist *list)
__closure_wake_up(list);
}
#define CLOSURE_CALLBACK(name) void name(struct work_struct *ws)
#define closure_type(name, type, member) \
struct closure *cl = container_of(ws, struct closure, work); \
type *name = container_of(cl, type, member)
/**
* continue_at - jump to another function with barrier
*
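
The CLOSURE_CALLBACK()/closure_type() macros added above hide the container_of() dance that each converted callback in this commit (btree_node_read_all_replicas_done, btree_update_set_nodes_written, the dio completions) would otherwise open-code now that closure functions take a work_struct. A stand-alone sketch of what a converted callback boils down to, using simplified stand-in types rather than the real ones:

#include <stddef.h>

/* Simplified stand-ins, not the real closure/work_struct definitions. */
struct work_struct { void (*func)(struct work_struct *); };
struct closure     { struct work_struct work; };

#define container_of(ptr, type, member) \
	((type *) ((char *) (ptr) - offsetof(type, member)))

struct my_read { struct closure cl; int ret; };

/* What CLOSURE_CALLBACK(my_read_done) plus
 * closure_type(op, struct my_read, cl) boil down to: */
static void my_read_done(struct work_struct *ws)
{
	struct closure *cl = container_of(ws, struct closure, work);
	struct my_read *op = container_of(cl, struct my_read, cl);

	op->ret = 0;	/* ...completion work on op... */
}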

View File

@ -22,10 +22,18 @@ struct shrinker {
int seeks; /* seeks to recreate an obj */
long batch; /* reclaim batch size, 0 = default */
struct list_head list;
void *private_data;
};
int register_shrinker(struct shrinker *, const char *, ...);
void unregister_shrinker(struct shrinker *);
static inline void shrinker_free(struct shrinker *s)
{
free(s);
}
struct shrinker *shrinker_alloc(unsigned int, const char *, ...);
int shrinker_register(struct shrinker *);
void shrinker_unregister(struct shrinker *);
void run_shrinkers(gfp_t gfp_mask, bool);
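
register_shrinker()/unregister_shrinker() above give way to an allocate-then-register flow, with private_data replacing the container_of() lookups in the callbacks, and shrinker_free() for teardown. A condensed usage sketch of the new declarations (my_cache and its callbacks are hypothetical stand-ins; the btree_cache.c and btree_key_cache.c hunks later in this commit follow the same shape):

struct my_cache {
	struct shrinker	*shrink;
	/* ... cached objects ... */
};

static unsigned long my_cache_count(struct shrinker *s,
				    struct shrink_control *sc)
{
	struct my_cache *mc = s->private_data;	/* was container_of(s, ...) */

	return 0;	/* ... object count for mc ... */
}

static int my_cache_shrinker_init(struct my_cache *mc, const char *name)
{
	struct shrinker *s = shrinker_alloc(0, "%s-my_cache", name);

	if (!s)
		return -ENOMEM;

	s->count_objects = my_cache_count;	/* scan_objects set likewise */
	s->seeks	 = 1;
	s->private_data	 = mc;
	shrinker_register(s);
	mc->shrink = s;
	return 0;
}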

View File

@ -847,6 +847,19 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
return ret;
}
/*
* need to know if we're getting called from the invalidate path or
* not:
*/
if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
old_a->cached_sectors) {
ret = bch2_update_cached_sectors_list(trans, new->k.p.inode,
-((s64) old_a->cached_sectors));
if (ret)
return ret;
}
return 0;
}
@ -1212,7 +1225,7 @@ fsck_err:
return ret;
}
static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans,
static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans,
struct btree_iter *iter)
{
struct bch_fs *c = trans->c;
@ -1271,24 +1284,6 @@ delete:
goto out;
}
static int bch2_check_discard_freespace_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bpos end)
{
if (!btree_id_is_extents(iter->btree_id)) {
return __bch2_check_discard_freespace_key(trans, iter);
} else {
int ret = 0;
while (!bkey_eq(iter->pos, end) &&
!(ret = btree_trans_too_many_iters(trans) ?:
__bch2_check_discard_freespace_key(trans, iter)))
bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
return ret;
}
}
/*
* We've already checked that generation numbers in the bucket_gens btree are
* valid for buckets that exist; this just checks for keys for nonexistent
@ -1445,12 +1440,40 @@ bkey_err:
ret = for_each_btree_key2(trans, iter,
BTREE_ID_need_discard, POS_MIN,
BTREE_ITER_PREFETCH, k,
bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
for_each_btree_key2(trans, iter,
BTREE_ID_freespace, POS_MIN,
BTREE_ITER_PREFETCH, k,
bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
for_each_btree_key_commit(trans, iter,
bch2_check_discard_freespace_key(trans, &iter));
if (ret)
goto err;
bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
BTREE_ITER_PREFETCH);
while (1) {
bch2_trans_begin(trans);
k = bch2_btree_iter_peek(&iter);
if (!k.k)
break;
ret = bkey_err(k) ?:
bch2_check_discard_freespace_key(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
ret = 0;
continue;
}
if (ret) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
bch_err(c, "while checking %s", buf.buf);
printbuf_exit(&buf);
break;
}
bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
}
bch2_trans_iter_exit(trans, &iter);
if (ret)
goto err;
ret = for_each_btree_key_commit(trans, iter,
BTREE_ID_bucket_gens, POS_MIN,
BTREE_ITER_PREFETCH, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
@ -1802,7 +1825,7 @@ static void bch2_do_invalidates_work(struct work_struct *work)
unsigned i;
int ret = 0;
ret = bch2_btree_write_buffer_flush(trans);
ret = bch2_btree_write_buffer_tryflush(trans);
if (ret)
goto err;

View File

@ -1297,6 +1297,30 @@ out:
return wp;
}
static noinline void
deallocate_extra_replicas(struct bch_fs *c,
struct open_buckets *ptrs,
struct open_buckets *ptrs_no_use,
unsigned extra_replicas)
{
struct open_buckets ptrs2 = { 0 };
struct open_bucket *ob;
unsigned i;
open_bucket_for_each(c, ptrs, ob, i) {
unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
if (d && d <= extra_replicas) {
extra_replicas -= d;
ob_push(c, ptrs_no_use, ob);
} else {
ob_push(c, &ptrs2, ob);
}
}
*ptrs = ptrs2;
}
/*
* Get us an open_bucket we can allocate from, return with it locked:
*/
@ -1382,6 +1406,9 @@ alloc_done:
if (ret)
goto err;
if (nr_effective > nr_replicas)
deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
/* Free buckets we didn't use: */
open_bucket_for_each(c, &wp->ptrs, ob, i)
open_bucket_free_unused(c, ob);

View File

@ -406,6 +406,7 @@ BCH_DEBUG_PARAMS_DEBUG()
x(blocked_journal_max_in_flight) \
x(blocked_allocate) \
x(blocked_allocate_open_bucket) \
x(blocked_write_buffer_full) \
x(nocow_lock_contended)
enum bch_time_stats {
@ -640,6 +641,8 @@ struct journal_keys {
size_t gap;
size_t nr;
size_t size;
atomic_t ref;
bool initial_ref_held;
};
struct btree_trans_buf {
@ -664,7 +667,8 @@ struct btree_trans_buf {
x(invalidate) \
x(delete_dead_snapshots) \
x(snapshot_delete_pagecache) \
x(sysfs)
x(sysfs) \
x(btree_write_buffer)
enum bch_write_ref {
#define x(n) BCH_WRITE_REF_##n,
@ -1064,6 +1068,16 @@ static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
#endif
}
static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
{
#ifdef BCH_WRITE_REF_DEBUG
return !test_bit(BCH_FS_GOING_RO, &c->flags) &&
atomic_long_inc_not_zero(&c->writes[ref]);
#else
return percpu_ref_tryget(&c->writes);
#endif
}
static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
{
#ifdef BCH_WRITE_REF_DEBUG

View File

@ -303,6 +303,13 @@ struct bkey_i {
struct bch_val v;
};
#define POS_KEY(_pos) \
((struct bkey) { \
.u64s = BKEY_U64s, \
.format = KEY_FORMAT_CURRENT, \
.p = _pos, \
})
#define KEY(_inode, _offset, _size) \
((struct bkey) { \
.u64s = BKEY_U64s, \
@ -1436,7 +1443,7 @@ struct bch_sb_field_replicas_v0 {
struct bch_replicas_entry_v0 entries[];
} __packed __aligned(8);
struct bch_replicas_entry {
struct bch_replicas_entry_v1 {
__u8 data_type;
__u8 nr_devs;
__u8 nr_required;
@ -1448,7 +1455,7 @@ struct bch_replicas_entry {
struct bch_sb_field_replicas {
struct bch_sb_field field;
struct bch_replicas_entry entries[];
struct bch_replicas_entry_v1 entries[];
} __packed __aligned(8);
/* BCH_SB_FIELD_quota: */
@ -2124,7 +2131,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(clock, 7) \
x(dev_usage, 8) \
x(log, 9) \
x(overwrite, 10)
x(overwrite, 10) \
x(write_buffer_keys, 11)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
@ -2174,7 +2182,7 @@ struct jset_entry_usage {
struct jset_entry_data_usage {
struct jset_entry entry;
__le64 v;
struct bch_replicas_entry r;
struct bch_replicas_entry_v1 r;
} __packed;
struct jset_entry_clock {

View File

@ -173,12 +173,18 @@ struct bch_ioctl_disk_set_state {
__u64 dev;
};
#define BCH_DATA_OPS() \
x(scrub, 0) \
x(rereplicate, 1) \
x(migrate, 2) \
x(rewrite_old_nodes, 3) \
x(drop_extra_replicas, 4)
enum bch_data_ops {
BCH_DATA_OP_SCRUB = 0,
BCH_DATA_OP_REREPLICATE = 1,
BCH_DATA_OP_MIGRATE = 2,
BCH_DATA_OP_REWRITE_OLD_NODES = 3,
BCH_DATA_OP_NR = 4,
#define x(t, n) BCH_DATA_OP_##t = n,
BCH_DATA_OPS()
#undef x
BCH_DATA_OP_NR
};
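
Defining the op list as an x-macro means it is written once and expanded wherever needed; the enum above is one expansion. The bch2_data_ops_strs[] table that cmd_data_job() now feeds to read_string_list_or_die() is presumably generated the same way, roughly like this sketch (not copied from the tree):

const char * const bch2_data_ops_strs[] = {
#define x(t, n)	[n] = #t,
	BCH_DATA_OPS()
#undef x
	NULL
};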
/*
@ -237,7 +243,7 @@ struct bch_ioctl_data_event {
struct bch_replicas_usage {
__u64 sectors;
struct bch_replicas_entry r;
struct bch_replicas_entry_v1 r;
} __packed;
static inline struct bch_replicas_usage *

View File

@ -318,8 +318,7 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct bch_fs *c = container_of(shrink, struct bch_fs,
btree_cache.shrink);
struct bch_fs *c = shrink->private_data;
struct btree_cache *bc = &c->btree_cache;
struct btree *b, *t;
unsigned long nr = sc->nr_to_scan;
@ -420,8 +419,7 @@ out_nounlock:
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
struct bch_fs *c = container_of(shrink, struct bch_fs,
btree_cache.shrink);
struct bch_fs *c = shrink->private_data;
struct btree_cache *bc = &c->btree_cache;
if (bch2_btree_shrinker_disabled)
@ -432,8 +430,7 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
{
struct bch_fs *c = container_of(shrink, struct bch_fs,
btree_cache.shrink);
struct bch_fs *c = shrink->private_data;
char *cbuf;
size_t buflen = seq_buf_get_buf(s, &cbuf);
struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
@ -448,7 +445,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
struct btree *b;
unsigned i, flags;
unregister_shrinker(&bc->shrink);
shrinker_free(bc->shrink);
/* vfree() can allocate memory: */
flags = memalloc_nofs_save();
@ -502,6 +499,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
struct shrinker *shrink;
unsigned i;
int ret = 0;
@ -521,13 +519,16 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
mutex_init(&c->verify_lock);
bc->shrink.count_objects = bch2_btree_cache_count;
bc->shrink.scan_objects = bch2_btree_cache_scan;
bc->shrink.to_text = bch2_btree_cache_shrinker_to_text;
bc->shrink.seeks = 4;
ret = register_shrinker(&bc->shrink, "%s-btree_cache", c->name);
if (ret)
shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
if (!shrink)
goto err;
bc->shrink = shrink;
shrink->count_objects = bch2_btree_cache_count;
shrink->scan_objects = bch2_btree_cache_scan;
shrink->to_text = bch2_btree_cache_shrinker_to_text;
shrink->seeks = 4;
shrink->private_data = c;
shrinker_register(shrink);
return 0;
err:

View File

@ -1287,7 +1287,7 @@ static int bch2_gc_done(struct bch_fs *c,
}
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
if (metadata_only &&

View File

@ -1358,10 +1358,9 @@ static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *
return offset;
}
static void btree_node_read_all_replicas_done(struct closure *cl)
static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
{
struct btree_node_read_all *ra =
container_of(cl, struct btree_node_read_all, cl);
closure_type(ra, struct btree_node_read_all, cl);
struct bch_fs *c = ra->c;
struct btree *b = ra->b;
struct printbuf buf = PRINTBUF;
@ -1567,7 +1566,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
if (sync) {
closure_sync(&ra->cl);
btree_node_read_all_replicas_done(&ra->cl);
btree_node_read_all_replicas_done(&ra->cl.work);
} else {
continue_at(&ra->cl, btree_node_read_all_replicas_done,
c->io_complete_wq);

View File

@ -1854,19 +1854,11 @@ static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
struct btree_iter *iter,
struct bpos end_pos)
{
struct bkey_i *k;
if (bpos_lt(iter->path->pos, iter->journal_pos))
iter->journal_idx = 0;
k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
return bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
iter->path->level,
iter->path->pos,
end_pos,
&iter->journal_idx);
iter->journal_pos = k ? k->k.p : end_pos;
return k;
}
static noinline
@ -2874,7 +2866,8 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
trans->fn_idx = fn_idx;
trans->locking_wait.task = current;
trans->journal_replay_not_finished =
!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
atomic_inc_not_zero(&c->journal_keys.ref);
closure_init_stack(&trans->ref);
s = btree_trans_stats(trans);
@ -2991,6 +2984,9 @@ void bch2_trans_put(struct btree_trans *trans)
kfree(trans->fs_usage_deltas);
}
if (unlikely(trans->journal_replay_not_finished))
bch2_journal_keys_put(c);
if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
mempool_free(trans->mem, &c->btree_trans_mem_pool);
else

View File

@ -445,14 +445,16 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
unsigned flags,
unsigned long ip)
{
memset(iter, 0, sizeof(*iter));
iter->trans = trans;
iter->update_path = NULL;
iter->key_cache_path = NULL;
iter->btree_id = btree_id;
iter->min_depth = 0;
iter->flags = flags;
iter->snapshot = pos.snapshot;
iter->pos = pos;
iter->k.p = pos;
iter->k = POS_KEY(pos);
iter->journal_idx = 0;
#ifdef CONFIG_BCACHEFS_DEBUG
iter->ip_allocated = ip;
#endif

View File

@ -73,6 +73,7 @@ static size_t bch2_journal_key_search(struct journal_keys *keys,
return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
}
/* Returns first non-overwritten key >= search key: */
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
unsigned level, struct bpos pos,
struct bpos end_pos, size_t *idx)
@ -80,16 +81,32 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree
struct journal_keys *keys = &c->journal_keys;
unsigned iters = 0;
struct journal_key *k;
BUG_ON(*idx > keys->nr);
search:
if (!*idx)
*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
while (*idx &&
__journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
--(*idx);
iters++;
if (iters == 10) {
*idx = 0;
goto search;
}
}
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
return NULL;
if (__journal_key_cmp(btree_id, level, pos, k) <= 0 &&
!k->overwritten)
if (k->overwritten) {
(*idx)++;
continue;
}
if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
return k->k;
(*idx)++;
@ -189,7 +206,9 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
/* Since @keys was full, there was no gap: */
memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
kvfree(keys->d);
*keys = new_keys;
keys->d = new_keys.d;
keys->nr = new_keys.nr;
keys->size = new_keys.size;
/* And now the gap is at the end: */
keys->gap = keys->nr;
@ -415,10 +434,16 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
cmp_int(l->journal_offset, r->journal_offset);
}
void bch2_journal_keys_free(struct journal_keys *keys)
void bch2_journal_keys_put(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
struct journal_key *i;
BUG_ON(atomic_read(&keys->ref) <= 0);
if (!atomic_dec_and_test(&keys->ref))
return;
move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
keys->gap = keys->nr;
@ -429,6 +454,8 @@ void bch2_journal_keys_free(struct journal_keys *keys)
kvfree(keys->d);
keys->d = NULL;
keys->nr = keys->gap = keys->size = 0;
bch2_journal_entries_free(c);
}
static void __journal_keys_sort(struct journal_keys *keys)

View File

@ -49,7 +49,15 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
struct bch_fs *,
struct btree *);
void bch2_journal_keys_free(struct journal_keys *);
void bch2_journal_keys_put(struct bch_fs *);
static inline void bch2_journal_keys_put_initial(struct bch_fs *c)
{
if (c->journal_keys.initial_ref_held)
bch2_journal_keys_put(c);
c->journal_keys.initial_ref_held = false;
}
void bch2_journal_entries_free(struct bch_fs *);
int bch2_journal_keys_sort(struct bch_fs *);

View File

@ -646,11 +646,19 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
if (journal_seq && ck->journal.seq != journal_seq)
goto out;
trans->journal_res.seq = ck->journal.seq;
/*
* Since journal reclaim depends on us making progress here, and the
* allocator/copygc depend on journal reclaim making progress, we need
* to be using alloc reserves:
* If we're at the end of the journal, we really want to free up space
* in the journal right away - we don't want to pin that old journal
* sequence number with a new btree node write, we want to re-journal
* the update
*/
if (ck->journal.seq == journal_last_seq(j))
commit_flags |= BCH_WATERMARK_reclaim;
else
commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
ret = bch2_btree_iter_traverse(&b_iter) ?:
bch2_trans_update(trans, &b_iter, ck->k,
BTREE_UPDATE_KEY_CACHE_RECLAIM|
@ -659,9 +667,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
(ck->journal.seq == journal_last_seq(j)
? BCH_WATERMARK_reclaim
: 0)|
commit_flags);
bch2_fs_fatal_err_on(ret &&
@ -830,8 +835,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct bch_fs *c = container_of(shrink, struct bch_fs,
btree_key_cache.shrink);
struct bch_fs *c = shrink->private_data;
struct btree_key_cache *bc = &c->btree_key_cache;
struct bucket_table *tbl;
struct bkey_cached *ck, *t;
@ -932,8 +936,7 @@ out:
static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
struct bch_fs *c = container_of(shrink, struct bch_fs,
btree_key_cache.shrink);
struct bch_fs *c = shrink->private_data;
struct btree_key_cache *bc = &c->btree_key_cache;
long nr = atomic_long_read(&bc->nr_keys) -
atomic_long_read(&bc->nr_dirty);
@ -953,7 +956,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
int cpu;
#endif
unregister_shrinker(&bc->shrink);
shrinker_free(bc->shrink);
mutex_lock(&bc->lock);
@ -1028,8 +1031,8 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
{
struct btree_key_cache *bc =
container_of(shrink, struct btree_key_cache, shrink);
struct bch_fs *c = shrink->private_data;
struct btree_key_cache *bc = &c->btree_key_cache;
char *cbuf;
size_t buflen = seq_buf_get_buf(s, &cbuf);
struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
@ -1041,6 +1044,7 @@ static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shri
int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
struct shrinker *shrink;
#ifdef __KERNEL__
bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
@ -1053,12 +1057,16 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
bc->table_init_done = true;
bc->shrink.seeks = 0;
bc->shrink.count_objects = bch2_btree_key_cache_count;
bc->shrink.scan_objects = bch2_btree_key_cache_scan;
bc->shrink.to_text = bch2_btree_key_cache_shrinker_to_text;
if (register_shrinker(&bc->shrink, "%s-btree_key_cache", c->name))
shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
if (!shrink)
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
bc->shrink = shrink;
shrink->seeks = 0;
shrink->count_objects = bch2_btree_key_cache_count;
shrink->scan_objects = bch2_btree_key_cache_scan;
shrink->to_text = bch2_btree_key_cache_shrinker_to_text;
shrink->private_data = c;
shrinker_register(shrink);
return 0;
}

View File

@ -17,7 +17,7 @@ struct btree_key_cache {
struct list_head freed_nonpcpu;
size_t nr_freed_nonpcpu;
struct shrinker shrink;
struct shrinker *shrink;
unsigned shrink_iter;
struct btree_key_cache_freelist __percpu *pcpu_freed;

View File

@ -660,10 +660,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
i->k->k.needs_whiteout = false;
}
if (trans->nr_wb_updates &&
trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size)
return -BCH_ERR_btree_insert_need_flush_buffer;
/*
* Don't get journal reservation until after we know insert will
* succeed:
@ -698,14 +694,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
return -BCH_ERR_btree_insert_need_mark_replicas;
if (trans->nr_wb_updates) {
EBUG_ON(flags & BCH_TRANS_COMMIT_no_journal_res);
ret = bch2_btree_insert_keys_write_buffer(trans);
if (ret)
goto revert_fs_usage;
}
h = trans->hooks;
while (h) {
ret = h->fn(trans, h);
@ -767,7 +755,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
trans_for_each_wb_update(trans, wb) {
entry = bch2_journal_add_entry(j, &trans->journal_res,
BCH_JSET_ENTRY_btree_keys,
BCH_JSET_ENTRY_write_buffer_keys,
wb->btree, 0,
wb->k.k.u64s);
bkey_copy((struct bkey_i *) entry->start, &wb->k);
@ -951,30 +939,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
ret = bch2_trans_relock(trans);
break;
case -BCH_ERR_btree_insert_need_flush_buffer: {
struct btree_write_buffer *wb = &c->btree_write_buffer;
ret = 0;
if (wb->state.nr > wb->size * 3 / 4) {
bch2_trans_unlock(trans);
mutex_lock(&wb->flush_lock);
if (wb->state.nr > wb->size * 3 / 4) {
bch2_trans_begin(trans);
ret = __bch2_btree_write_buffer_flush(trans,
flags|BCH_TRANS_COMMIT_no_check_rw, true);
if (!ret) {
trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
}
} else {
mutex_unlock(&wb->flush_lock);
ret = bch2_trans_relock(trans);
}
}
break;
}
default:
BUG_ON(ret >= 0);
break;
@ -1073,20 +1037,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
goto out_reset;
}
if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 &&
mutex_trylock(&c->btree_write_buffer.flush_lock)) {
bch2_trans_begin(trans);
bch2_trans_unlock(trans);
ret = __bch2_btree_write_buffer_flush(trans,
flags|BCH_TRANS_COMMIT_no_check_rw, true);
if (!ret) {
trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
}
goto out;
}
EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
trans->journal_u64s = trans->extra_journal_entries.nr;

View File

@ -173,7 +173,7 @@ struct btree_cache {
unsigned not_freed_will_make_reachable;
unsigned not_freed_access_bit;
atomic_t dirty;
struct shrinker shrink;
struct shrinker *shrink;
/*
* If we need to allocate memory for a new btree node and that
@ -297,8 +297,7 @@ struct btree_iter {
struct btree_path *key_cache_path;
enum btree_id btree_id:8;
unsigned min_depth:3;
unsigned advanced:1;
u8 min_depth;
/* btree_iter_copy starts here: */
u16 flags;
@ -315,7 +314,6 @@ struct btree_iter {
/* BTREE_ITER_WITH_JOURNAL: */
size_t journal_idx;
struct bpos journal_pos;
#ifdef TRACK_PATH_ALLOCATED
unsigned long ip_allocated;
#endif

View File

@ -774,9 +774,9 @@ static void btree_interior_update_work(struct work_struct *work)
}
}
static void btree_update_set_nodes_written(struct closure *cl)
static CLOSURE_CALLBACK(btree_update_set_nodes_written)
{
struct btree_update *as = container_of(cl, struct btree_update, cl);
closure_type(as, struct btree_update, cl);
struct bch_fs *c = as->c;
mutex_lock(&c->btree_interior_update_lock);

View File

@ -7,43 +7,132 @@
#include "btree_write_buffer.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include <linux/sort.h>
#include <linux/prefetch.h>
static int bch2_btree_write_buffer_journal_flush(struct journal *,
struct journal_entry_pin *, u64);
static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
{
const struct btree_write_buffered_key *l = _l;
const struct btree_write_buffered_key *r = _r;
static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *);
return cmp_int(l->btree, r->btree) ?:
bpos_cmp(l->k.k.p, r->k.k.p) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->journal_offset, r->journal_offset);
static inline bool __wb_key_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
{
return (cmp_int(l->hi, r->hi) ?:
cmp_int(l->mi, r->mi) ?:
cmp_int(l->lo, r->lo)) >= 0;
}
static int btree_write_buffered_journal_cmp(const void *_l, const void *_r)
static inline bool wb_key_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
{
const struct btree_write_buffered_key *l = _l;
const struct btree_write_buffered_key *r = _r;
#ifdef CONFIG_X86_64
int cmp;
return cmp_int(l->journal_seq, r->journal_seq);
asm(".intel_syntax noprefix;"
"mov rax, [%[l]];"
"sub rax, [%[r]];"
"mov rax, [%[l] + 8];"
"sbb rax, [%[r] + 8];"
"mov rax, [%[l] + 16];"
"sbb rax, [%[r] + 16];"
".att_syntax prefix;"
: "=@ccae" (cmp)
: [l] "r" (l), [r] "r" (r)
: "rax", "cc");
EBUG_ON(cmp != __wb_key_cmp(l, r));
return cmp;
#else
return __wb_key_cmp(l, r);
#endif
}
static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
/* Compare excluding idx, the low 24 bits: */
static inline bool wb_key_eq(const void *_l, const void *_r)
{
const struct wb_key_ref *l = _l;
const struct wb_key_ref *r = _r;
return !((l->hi ^ r->hi)|
(l->mi ^ r->mi)|
((l->lo >> 24) ^ (r->lo >> 24)));
}
static noinline void wb_sort(struct wb_key_ref *base, size_t num)
{
size_t n = num, a = num / 2;
if (!a) /* num < 2 || size == 0 */
return;
for (;;) {
size_t b, c, d;
if (a) /* Building heap: sift down --a */
--a;
else if (--n) /* Sorting: Extract root to --n */
swap(base[0], base[n]);
else /* Sort complete */
break;
/*
* Sift element at "a" down into heap. This is the
* "bottom-up" variant, which significantly reduces
* calls to cmp_func(): we find the sift-down path all
* the way to the leaves (one compare per level), then
* backtrack to find where to insert the target element.
*
* Because elements tend to sift down close to the leaves,
* this uses fewer compares than doing two per level
* on the way down. (A bit more than half as many on
* average, 3/4 worst-case.)
*/
for (b = a; c = 2*b + 1, (d = c + 1) < n;)
b = wb_key_cmp(base + c, base + d) ? c : d;
if (d == n) /* Special case last leaf with no sibling */
b = c;
/* Now backtrack from "b" to the correct location for "a" */
while (b != a && wb_key_cmp(base + a, base + b))
b = (b - 1) / 2;
c = b; /* Where "a" belongs */
while (b != a) { /* Shift it into place */
b = (b - 1) / 2;
swap(base[b], base[c]);
}
}
}
static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
struct btree_iter *iter,
struct btree_write_buffered_key *wb)
{
bch2_btree_node_unlock_write(trans, iter->path, iter->path->l[0].b);
trans->journal_res.seq = wb->journal_seq;
return bch2_trans_update(trans, iter, &wb->k,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_journal_res|
BCH_TRANS_COMMIT_journal_reclaim);
}
static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
struct btree_write_buffered_key *wb,
unsigned commit_flags,
bool *write_locked,
size_t *fast)
bool *write_locked, size_t *fast)
{
struct bch_fs *c = trans->c;
struct btree_path *path;
int ret;
EBUG_ON(!wb->journal_seq);
EBUG_ON(!c->btree_write_buffer.flushing.pin.seq);
EBUG_ON(c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
ret = bch2_btree_iter_traverse(iter);
if (ret)
return ret;
@ -66,46 +155,14 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
*write_locked = true;
}
if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) {
bch2_btree_node_unlock_write(trans, path, path->l[0].b);
if (unlikely(!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s))) {
*write_locked = false;
goto trans_commit;
return wb_flush_one_slowpath(trans, iter, wb);
}
bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
(*fast)++;
return 0;
trans_commit:
trans->journal_res.seq = wb->journal_seq;
return bch2_trans_update(trans, iter, &wb->k,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL,
commit_flags|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_journal_res|
BCH_TRANS_COMMIT_journal_reclaim);
}
static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
{
union btree_write_buffer_state old, new;
u64 v = READ_ONCE(wb->state.v);
do {
old.v = new.v = v;
new.nr = 0;
new.idx++;
} while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1)
cpu_relax();
smp_mb();
return old;
}
/*
@ -137,35 +194,79 @@ btree_write_buffered_insert(struct btree_trans *trans,
return ret;
}
int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags,
bool locked)
static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb)
{
struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer);
struct journal *j = &c->journal;
if (!wb->inc.keys.nr)
return;
bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin,
bch2_btree_write_buffer_journal_flush);
darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr));
darray_resize(&wb->sorted, wb->flushing.keys.size);
if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) {
swap(wb->flushing.keys, wb->inc.keys);
goto out;
}
size_t nr = min(darray_room(wb->flushing.keys),
wb->sorted.size - wb->flushing.keys.nr);
nr = min(nr, wb->inc.keys.nr);
memcpy(&darray_top(wb->flushing.keys),
wb->inc.keys.data,
sizeof(wb->inc.keys.data[0]) * nr);
memmove(wb->inc.keys.data,
wb->inc.keys.data + nr,
sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr));
wb->flushing.keys.nr += nr;
wb->inc.keys.nr -= nr;
out:
if (!wb->inc.keys.nr)
bch2_journal_pin_drop(j, &wb->inc.pin);
else
bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin,
bch2_btree_write_buffer_journal_flush);
if (j->watermark) {
spin_lock(&j->lock);
bch2_journal_set_watermark(j);
spin_unlock(&j->lock);
}
BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
}
static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree_write_buffer *wb = &c->btree_write_buffer;
struct journal_entry_pin pin;
struct btree_write_buffered_key *i, *keys;
struct wb_key_ref *i;
struct btree_iter iter = { NULL };
size_t nr = 0, skipped = 0, fast = 0, slowpath = 0;
size_t skipped = 0, fast = 0, slowpath = 0;
bool write_locked = false;
union btree_write_buffer_state s;
int ret = 0;
memset(&pin, 0, sizeof(pin));
bch2_trans_unlock(trans);
bch2_trans_begin(trans);
if (!locked && !mutex_trylock(&wb->flush_lock))
return 0;
mutex_lock(&wb->inc.lock);
move_keys_from_inc_to_flushing(wb);
mutex_unlock(&wb->inc.lock);
bch2_journal_pin_copy(j, &pin, &wb->journal_pin,
bch2_btree_write_buffer_journal_flush);
bch2_journal_pin_drop(j, &wb->journal_pin);
s = btree_write_buffer_switch(wb);
keys = wb->keys[s.idx];
nr = s.nr;
if (race_fault())
goto slowpath;
for (size_t i = 0; i < wb->flushing.keys.nr; i++) {
wb->sorted.data[i].idx = i;
wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree;
memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos));
}
wb->sorted.nr = wb->flushing.keys.nr;
/*
* We first sort so that we can detect and skip redundant updates, and
@ -181,110 +282,178 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
* If that happens, simply skip the key so we can optimistically insert
* as many keys as possible in the fast path.
*/
sort(keys, nr, sizeof(keys[0]),
btree_write_buffered_key_cmp, NULL);
wb_sort(wb->sorted.data, wb->sorted.nr);
darray_for_each(wb->sorted, i) {
struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
prefetch(&wb->flushing.keys.data[n->idx]);
BUG_ON(!k->journal_seq);
if (i + 1 < &darray_top(wb->sorted) &&
wb_key_eq(i, i + 1)) {
struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
for (i = keys; i < keys + nr; i++) {
if (i + 1 < keys + nr &&
i[0].btree == i[1].btree &&
bpos_eq(i[0].k.k.p, i[1].k.k.p)) {
skipped++;
i->journal_seq = 0;
n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
k->journal_seq = 0;
continue;
}
if (write_locked &&
(iter.path->btree_id != i->btree ||
bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) {
(iter.path->btree_id != k->btree ||
bpos_gt(k->k.k.p, iter.path->l[0].b->key.k.p))) {
bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
write_locked = false;
}
if (!iter.path || iter.path->btree_id != i->btree) {
if (!iter.path || iter.path->btree_id != k->btree) {
bch2_trans_iter_exit(trans, &iter);
bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p,
bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p,
BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
}
bch2_btree_iter_set_pos(&iter, i->k.k.p);
bch2_btree_iter_set_pos(&iter, k->k.k.p);
iter.path->preserve = false;
do {
ret = bch2_btree_write_buffer_flush_one(trans, &iter, i,
commit_flags, &write_locked, &fast);
if (race_fault()) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
break;
}
ret = wb_flush_one(trans, &iter, k, &write_locked, &fast);
if (!write_locked)
bch2_trans_begin(trans);
} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
if (!ret) {
k->journal_seq = 0;
} else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
slowpath++;
continue;
}
if (ret)
ret = 0;
} else
break;
i->journal_seq = 0;
}
if (write_locked)
bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
bch2_trans_iter_exit(trans, &iter);
trace_write_buffer_flush(trans, nr, skipped, fast, wb->size);
if (slowpath)
goto slowpath;
bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
out:
bch2_journal_pin_drop(j, &pin);
mutex_unlock(&wb->flush_lock);
return ret;
slowpath:
trace_write_buffer_flush_slowpath(trans, i - keys, nr);
if (ret)
goto err;
if (slowpath) {
/*
* Now sort the rest by journal seq and bump the journal pin as we go.
* The slowpath zapped the seq of keys that were successfully flushed so
* Flush in the order they were present in the journal, so that
* we can release journal pins:
* The fastpath zapped the seq of keys that were successfully flushed so
* we can skip those here.
*/
sort(keys, nr, sizeof(keys[0]),
btree_write_buffered_journal_cmp,
NULL);
trace_write_buffer_flush_slowpath(trans, slowpath, wb->flushing.keys.nr);
commit_flags &= ~BCH_WATERMARK_MASK;
commit_flags |= BCH_WATERMARK_reclaim;
for (i = keys; i < keys + nr; i++) {
struct btree_write_buffered_key *i;
darray_for_each(wb->flushing.keys, i) {
if (!i->journal_seq)
continue;
bch2_journal_pin_update(j, i->journal_seq, &pin,
bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
bch2_btree_write_buffer_journal_flush);
bch2_trans_begin(trans);
ret = commit_do(trans, NULL, NULL,
commit_flags|
BCH_WATERMARK_reclaim|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_journal_res|
BCH_TRANS_COMMIT_journal_reclaim,
btree_write_buffered_insert(trans, i));
if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
if (ret)
goto err;
}
}
err:
bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
bch2_journal_pin_drop(j, &wb->flushing.pin);
wb->flushing.keys.nr = 0;
return ret;
}
static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq)
{
struct journal *j = &c->journal;
struct journal_buf *buf;
int ret = 0;
mutex_lock(&j->buf_lock);
while ((buf = bch2_next_write_buffer_flush_journal_buf(j, seq)))
if (bch2_journal_keys_to_write_buffer(c, buf)) {
ret = -ENOMEM;
break;
}
mutex_unlock(&j->buf_lock);
goto out;
return ret;
}
int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct btree_write_buffer *wb = &c->btree_write_buffer;
int ret = 0, fetch_from_journal_err;
trace_write_buffer_flush_sync(trans, _RET_IP_);
retry:
bch2_trans_unlock(trans);
mutex_lock(&trans->c->btree_write_buffer.flush_lock);
return __bch2_btree_write_buffer_flush(trans, 0, true);
bch2_journal_block_reservations(&c->journal);
fetch_from_journal_err = fetch_wb_keys_from_journal(c, U64_MAX);
bch2_journal_unblock(&c->journal);
/*
* On memory allocation failure, bch2_btree_write_buffer_flush_locked()
* is not guaranteed to empty wb->inc:
*/
mutex_lock(&wb->flushing.lock);
while (!ret &&
(wb->flushing.keys.nr || wb->inc.keys.nr))
ret = bch2_btree_write_buffer_flush_locked(trans);
mutex_unlock(&wb->flushing.lock);
if (!ret && fetch_from_journal_err)
goto retry;
return ret;
}
int bch2_btree_write_buffer_flush(struct btree_trans *trans)
int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans)
{
return __bch2_btree_write_buffer_flush(trans, 0, false);
struct bch_fs *c = trans->c;
struct btree_write_buffer *wb = &c->btree_write_buffer;
int ret = 0;
if (mutex_trylock(&wb->flushing.lock)) {
ret = bch2_btree_write_buffer_flush_locked(trans);
mutex_unlock(&wb->flushing.lock);
}
return ret;
}
int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer))
return -BCH_ERR_erofs_no_writes;
int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans);
bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
return ret;
}
static int bch2_btree_write_buffer_journal_flush(struct journal *j,
@ -292,84 +461,195 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j,
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct btree_write_buffer *wb = &c->btree_write_buffer;
int ret, fetch_from_journal_err;
mutex_lock(&wb->flush_lock);
do {
fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq);
return bch2_trans_run(c,
__bch2_btree_write_buffer_flush(trans, BCH_TRANS_COMMIT_no_check_rw, true));
mutex_lock(&wb->flushing.lock);
ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans));
mutex_unlock(&wb->flushing.lock);
} while (!ret &&
(fetch_from_journal_err ||
(wb->flushing.pin.seq && wb->flushing.pin.seq <= seq) ||
(wb->inc.pin.seq && wb->inc.pin.seq <= seq)));
return ret;
}
static inline u64 btree_write_buffer_ref(int idx)
static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
{
return ((union btree_write_buffer_state) {
.ref0 = idx == 0,
.ref1 = idx == 1,
}).v;
}
int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work);
struct btree_write_buffer *wb = &c->btree_write_buffer;
struct btree_write_buffered_key *i;
union btree_write_buffer_state old, new;
int ret = 0;
u64 v;
int ret;
trans_for_each_wb_update(trans, i) {
EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
mutex_lock(&wb->flushing.lock);
do {
ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans));
} while (!ret && bch2_btree_write_buffer_should_flush(c));
mutex_unlock(&wb->flushing.lock);
i->journal_seq = trans->journal_res.seq;
i->journal_offset = trans->journal_res.offset;
bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
}
int __bch2_journal_key_to_wb(struct bch_fs *c,
struct journal_keys_to_wb *dst,
enum btree_id btree, struct bkey_i *k)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
int ret;
retry:
ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL);
if (!ret && dst->wb == &wb->flushing)
ret = darray_resize(&wb->sorted, wb->flushing.keys.size);
if (unlikely(ret)) {
if (dst->wb == &c->btree_write_buffer.flushing) {
mutex_unlock(&dst->wb->lock);
dst->wb = &c->btree_write_buffer.inc;
bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin,
bch2_btree_write_buffer_journal_flush);
goto retry;
}
preempt_disable();
v = READ_ONCE(wb->state.v);
do {
old.v = new.v = v;
return ret;
}
new.v += btree_write_buffer_ref(new.idx);
new.nr += trans->nr_wb_updates;
if (new.nr > wb->size) {
ret = -BCH_ERR_btree_insert_need_flush_buffer;
dst->room = darray_room(dst->wb->keys);
if (dst->wb == &wb->flushing)
dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
BUG_ON(!dst->room);
BUG_ON(!dst->seq);
struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
wb_k->journal_seq = dst->seq;
wb_k->btree = btree;
bkey_copy(&wb_k->k, k);
dst->wb->keys.nr++;
dst->room--;
return 0;
}
void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
if (mutex_trylock(&wb->flushing.lock)) {
mutex_lock(&wb->inc.lock);
move_keys_from_inc_to_flushing(wb);
/*
* Attempt to skip wb->inc, and add keys directly to
* wb->flushing, saving us a copy later:
*/
if (!wb->inc.keys.nr) {
dst->wb = &wb->flushing;
} else {
mutex_unlock(&wb->flushing.lock);
dst->wb = &wb->inc;
}
} else {
mutex_lock(&wb->inc.lock);
dst->wb = &wb->inc;
}
dst->room = darray_room(dst->wb->keys);
if (dst->wb == &wb->flushing)
dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
dst->seq = seq;
bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin,
bch2_btree_write_buffer_journal_flush);
}
void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
if (!dst->wb->keys.nr)
bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
if (bch2_btree_write_buffer_should_flush(c) &&
__bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) &&
!queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work))
bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
if (dst->wb == &wb->flushing)
mutex_unlock(&wb->flushing.lock);
mutex_unlock(&wb->inc.lock);
}
static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
{
struct journal_keys_to_wb dst;
struct jset_entry *entry;
struct bkey_i *k;
int ret = 0;
bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
jset_entry_for_each_key(entry, k) {
ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
if (ret)
goto out;
}
} while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
memcpy(wb->keys[new.idx] + old.nr,
trans->wb_updates,
sizeof(trans->wb_updates[0]) * trans->nr_wb_updates);
entry->type = BCH_JSET_ENTRY_btree_keys;
}
bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin,
bch2_btree_write_buffer_journal_flush);
atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter);
buf->need_flush_to_write_buffer = false;
out:
preempt_enable();
bch2_journal_keys_to_write_buffer_end(c, &dst);
return ret;
}
static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size)
{
if (wb->keys.size >= new_size)
return 0;
if (!mutex_trylock(&wb->lock))
return -EINTR;
int ret = darray_resize(&wb->keys, new_size);
mutex_unlock(&wb->lock);
return ret;
}
int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
return wb_keys_resize(&wb->flushing, new_size) ?:
wb_keys_resize(&wb->inc, new_size);
}
void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal));
BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) &&
!bch2_journal_error(&c->journal));
kvfree(wb->keys[1]);
kvfree(wb->keys[0]);
darray_exit(&wb->sorted);
darray_exit(&wb->flushing.keys);
darray_exit(&wb->inc.keys);
}
int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
mutex_init(&wb->flush_lock);
wb->size = c->opts.btree_write_buffer_size;
mutex_init(&wb->inc.lock);
mutex_init(&wb->flushing.lock);
INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work);
wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL);
wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL);
if (!wb->keys[0] || !wb->keys[1])
return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init;
/* Will be resized by journal as needed: */
unsigned initial_size = 1 << 16;
return 0;
return darray_make_room(&wb->inc.keys, initial_size) ?:
darray_make_room(&wb->flushing.keys, initial_size) ?:
darray_make_room(&wb->sorted, initial_size);
}

View File

@ -2,12 +2,59 @@
#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
#define _BCACHEFS_BTREE_WRITE_BUFFER_H
int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool);
#include "bkey.h"
static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4;
}
static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4;
}
struct btree_trans;
int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
int bch2_btree_write_buffer_flush(struct btree_trans *);
int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *);
int bch2_btree_write_buffer_tryflush(struct btree_trans *);
int bch2_btree_insert_keys_write_buffer(struct btree_trans *);
struct journal_keys_to_wb {
struct btree_write_buffer_keys *wb;
size_t room;
u64 seq;
};
int __bch2_journal_key_to_wb(struct bch_fs *,
struct journal_keys_to_wb *,
enum btree_id, struct bkey_i *);
static inline int bch2_journal_key_to_wb(struct bch_fs *c,
struct journal_keys_to_wb *dst,
enum btree_id btree, struct bkey_i *k)
{
EBUG_ON(!dst->seq);
if (unlikely(!dst->room))
return __bch2_journal_key_to_wb(c, dst, btree, k);
struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
wb_k->journal_seq = dst->seq;
wb_k->btree = btree;
bkey_copy(&wb_k->k, k);
dst->wb->keys.nr++;
dst->room--;
return 0;
}
void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64);
void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *);
int bch2_btree_write_buffer_resize(struct bch_fs *, size_t);
void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
int bch2_fs_btree_write_buffer_init(struct bch_fs *);

View File

@ -2,43 +2,56 @@
#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
#include "darray.h"
#include "journal_types.h"
#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4
#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
struct wb_key_ref {
union {
struct {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
unsigned idx:24;
u8 pos[sizeof(struct bpos)];
enum btree_id btree:8;
#else
enum btree_id btree:8;
u8 pos[sizeof(struct bpos)];
unsigned idx:24;
#endif
} __packed;
struct {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
u64 lo;
u64 mi;
u64 hi;
#else
u64 hi;
u64 mi;
u64 lo;
#endif
};
};
};
struct btree_write_buffered_key {
u64 journal_seq;
unsigned journal_offset;
enum btree_id btree;
enum btree_id btree:8;
u64 journal_seq:56;
__BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
};
union btree_write_buffer_state {
struct {
atomic64_t counter;
};
struct {
u64 v;
};
struct {
u64 nr:23;
u64 idx:1;
u64 ref0:20;
u64 ref1:20;
};
struct btree_write_buffer_keys {
DARRAY(struct btree_write_buffered_key) keys;
struct journal_entry_pin pin;
struct mutex lock;
};
struct btree_write_buffer {
struct mutex flush_lock;
struct journal_entry_pin journal_pin;
union btree_write_buffer_state state;
size_t size;
struct btree_write_buffered_key *keys[2];
DARRAY(struct wb_key_ref) sorted;
struct btree_write_buffer_keys inc;
struct btree_write_buffer_keys flushing;
struct work_struct flush_work;
};
#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
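
The inc/flushing split above is the heart of the rewrite: write-buffer keys now travel through the journal (BCH_JSET_ENTRY_write_buffer_keys), get slurped into wb->inc, and the flusher moves inc into flushing (and the sorted index) before draining it, so filling the buffer does not have to wait for a flush in progress. A stand-alone analogue of that two-stage shape, using POSIX threads and plain fixed arrays; the journal pins, sorting, and resizing are deliberately not modelled:

#include <pthread.h>
#include <string.h>

#define CAP 1024

struct stage {
	pthread_mutex_t	lock;		/* assume pthread_mutex_init() done */
	int		keys[CAP];
	size_t		nr;
};

struct two_stage_buf {
	struct stage	inc;		/* producers append here */
	struct stage	flushing;	/* only the flusher touches this */
};

static void buf_add(struct two_stage_buf *b, int key)
{
	pthread_mutex_lock(&b->inc.lock);
	if (b->inc.nr < CAP)
		b->inc.keys[b->inc.nr++] = key;
	pthread_mutex_unlock(&b->inc.lock);
}

static void buf_flush(struct two_stage_buf *b, void (*flush_one)(int))
{
	pthread_mutex_lock(&b->flushing.lock);

	/* move_keys_from_inc_to_flushing() equivalent: */
	pthread_mutex_lock(&b->inc.lock);
	memcpy(b->flushing.keys, b->inc.keys,
	       b->inc.nr * sizeof(b->inc.keys[0]));
	b->flushing.nr = b->inc.nr;
	b->inc.nr = 0;
	pthread_mutex_unlock(&b->inc.lock);

	/* drain flushing while producers keep filling inc: */
	for (size_t i = 0; i < b->flushing.nr; i++)
		flush_one(b->flushing.keys[i]);
	b->flushing.nr = 0;

	pthread_mutex_unlock(&b->flushing.lock);
}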

View File

@ -61,7 +61,7 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
usage->reserved += usage->persistent_reserved[i];
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
@ -214,7 +214,7 @@ void bch2_fs_usage_to_text(struct printbuf *out,
}
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
prt_printf(out, "\t");
@ -345,7 +345,7 @@ static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
static inline int __update_replicas(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
struct bch_replicas_entry *r,
struct bch_replicas_entry_v1 *r,
s64 sectors)
{
int idx = bch2_replicas_entry_idx(c, r);
@ -359,7 +359,7 @@ static inline int __update_replicas(struct bch_fs *c,
}
static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
struct bch_replicas_entry *r, s64 sectors,
struct bch_replicas_entry_v1 *r, s64 sectors,
unsigned journal_seq, bool gc)
{
struct bch_fs_usage *fs_usage;
@ -453,8 +453,8 @@ int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
__replicas_deltas_realloc(trans, more, _gfp));
}
static inline int update_replicas_list(struct btree_trans *trans,
struct bch_replicas_entry *r,
int bch2_update_replicas_list(struct btree_trans *trans,
struct bch_replicas_entry_v1 *r,
s64 sectors)
{
struct replicas_delta_list *d;
@ -481,14 +481,13 @@ static inline int update_replicas_list(struct btree_trans *trans,
return 0;
}
static inline int update_cached_sectors_list(struct btree_trans *trans,
unsigned dev, s64 sectors)
int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors)
{
struct bch_replicas_padded r;
bch2_replicas_entry_cached(&r.e, dev);
return update_replicas_list(trans, &r.e, sectors);
return bch2_update_replicas_list(trans, &r.e, sectors);
}
int bch2_mark_alloc(struct btree_trans *trans,
@ -580,23 +579,6 @@ int bch2_mark_alloc(struct btree_trans *trans,
}
percpu_up_read(&c->mark_lock);
/*
* need to know if we're getting called from the invalidate path or
* not:
*/
if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
old_a->cached_sectors) {
ret = update_cached_sectors(c, new, ca->dev_idx,
-((s64) old_a->cached_sectors),
journal_seq, gc);
if (ret) {
bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
__func__);
return ret;
}
}
if (new_a->data_type == BCH_DATA_free &&
(!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
closure_wake_up(&c->freelist_wait);
@ -1470,7 +1452,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
r.e.data_type = data_type;
ret = update_replicas_list(trans, &r.e, sectors);
ret = bch2_update_replicas_list(trans, &r.e, sectors);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@ -1513,7 +1495,7 @@ static int __trans_mark_extent(struct btree_trans *trans,
if (p.ptr.cached) {
if (!stale) {
ret = update_cached_sectors_list(trans, p.ptr.dev,
ret = bch2_update_cached_sectors_list(trans, p.ptr.dev,
disk_sectors);
if (ret)
return ret;
@ -1532,7 +1514,7 @@ static int __trans_mark_extent(struct btree_trans *trans,
}
if (r.e.nr_devs)
ret = update_replicas_list(trans, &r.e, dirty_sectors);
ret = bch2_update_replicas_list(trans, &r.e, dirty_sectors);
return ret;
}
@ -1669,7 +1651,7 @@ int bch2_trans_mark_stripe(struct btree_trans *trans,
s64 sectors = le16_to_cpu(new_s->sectors);
bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new));
ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
if (ret)
return ret;
}
@ -1678,7 +1660,7 @@ int bch2_trans_mark_stripe(struct btree_trans *trans,
s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
bch2_bkey_to_replicas(&r.e, old);
ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
if (ret)
return ret;
}

View File

@ -315,6 +315,9 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
: c->usage[journal_seq & JOURNAL_BUF_MASK]);
}
int bch2_update_replicas_list(struct btree_trans *,
struct bch_replicas_entry_v1 *, s64);
int bch2_update_cached_sectors_list(struct btree_trans *, unsigned, s64);
int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned);
void bch2_fs_usage_initialize(struct bch_fs *);

View File

@ -444,7 +444,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
dst_end = (void *) arg->replicas + replica_entries_bytes;
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *src_e =
struct bch_replicas_entry_v1 *src_e =
cpu_replicas_entry(&c->replicas, i);
/* check that we have enough space for one replicas entry */

View File

@ -95,7 +95,6 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
unsigned long io_until,
unsigned long cpu_timeout)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct io_clock_wait wait;
wait.io_timer.expire = io_until;
@ -111,7 +110,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
if (kthread && kthread_should_stop())
if (kthread_should_stop())
break;
if (wait.expired)

View File

@ -9,10 +9,12 @@ int __bch2_darray_resize(darray_void *d, size_t element_size, size_t new_size, g
if (new_size > d->size) {
new_size = roundup_pow_of_two(new_size);
void *data = krealloc_array(d->data, new_size, element_size, gfp);
void *data = kvmalloc_array(new_size, element_size, gfp);
if (!data)
return -ENOMEM;
memcpy(data, d->data, d->size * element_size);
kvfree(d->data);
d->data = data;
d->size = new_size;
}

View File

@ -92,7 +92,7 @@ do { \
#define darray_exit(_d) \
do { \
kfree((_d)->data); \
kvfree((_d)->data); \
darray_init(_d); \
} while (0)

View File

@ -1005,7 +1005,7 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
int ret = 0;
ret = bch2_btree_write_buffer_flush(trans);
ret = bch2_btree_write_buffer_flush_sync(trans);
if (ret)
goto err;

View File

@ -5,7 +5,7 @@
#include "bcachefs_format.h"
struct bch_replicas_padded {
struct bch_replicas_entry e;
struct bch_replicas_entry_v1 e;
u8 pad[BCH_BKEY_PTRS_MAX];
};

View File

@ -150,7 +150,6 @@
x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \
x(0, backpointer_to_overwritten_btree_node) \
x(0, lock_fail_root_changed) \
x(0, journal_reclaim_would_deadlock) \

View File

@ -35,9 +35,9 @@ static void bio_check_or_release(struct bio *bio, bool check_dirty)
}
}
static void bch2_dio_read_complete(struct closure *cl)
static CLOSURE_CALLBACK(bch2_dio_read_complete)
{
struct dio_read *dio = container_of(cl, struct dio_read, cl);
closure_type(dio, struct dio_read, cl);
dio->req->ki_complete(dio->req, dio->ret);
bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
@ -325,9 +325,9 @@ static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
return 0;
}
static void bch2_dio_write_flush_done(struct closure *cl)
static CLOSURE_CALLBACK(bch2_dio_write_flush_done)
{
struct dio_write *dio = container_of(cl, struct dio_write, op.cl);
closure_type(dio, struct dio_write, op.cl);
struct bch_fs *c = dio->op.c;
closure_debug_destroy(cl);
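
/* Aside, not part of this commit: a minimal sketch of the callback conversion
 * pattern used throughout this update, with a made-up `my_op` type. Assumes
 * the updated <linux/closure.h> that provides CLOSURE_CALLBACK() and
 * closure_type(); everything else here is illustrative.
 */
struct my_op {
	struct closure	cl;
	int		result;
};

/* Old style: the callback received the closure directly. */
static void my_op_done_old(struct closure *cl)
{
	struct my_op *op = container_of(cl, struct my_op, cl);

	op->result = 0;
}

/* New style: CLOSURE_CALLBACK() declares a work_struct-based prototype, and
 * closure_type() recovers both the closure (as 'cl') and the parent object. */
static CLOSURE_CALLBACK(my_op_done_new)
{
	closure_type(op, struct my_op, cl);

	op->result = 0;
}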

View File

@ -861,7 +861,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
abs(pos_src - pos_dst) < len)
return -EINVAL;
bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
lock_two_nondirectories(&src->v, &dst->v);
bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
inode_dio_wait(&src->v);
inode_dio_wait(&dst->v);
@ -914,7 +915,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
ret = bch2_flush_inode(c, dst);
err:
bch2_quota_reservation_put(c, dst, &quota_res);
bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
unlock_two_nondirectories(&src->v, &dst->v);
return bch2_err_class(ret);
}

View File

@ -453,35 +453,33 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
struct bch_ioctl_subvolume arg)
{
struct filename *name;
struct path path;
struct inode *dir;
struct dentry *victim;
int ret = 0;
if (arg.flags)
return -EINVAL;
name = getname((const char __user *)(unsigned long)arg.dst_ptr);
victim = filename_path_locked(arg.dirfd, name, &path);
putname(name);
if (IS_ERR(victim))
return PTR_ERR(victim);
ret = user_path_at(arg.dirfd,
(const char __user *)(unsigned long)arg.dst_ptr,
LOOKUP_FOLLOW, &path);
if (ret)
return ret;
if (victim->d_sb->s_fs_info != c) {
if (path.dentry->d_sb->s_fs_info != c) {
ret = -EXDEV;
goto err;
}
dir = d_inode(path.dentry);
ret = __bch2_unlink(dir, victim, true);
if (!ret) {
fsnotify_rmdir(dir, victim);
d_delete(victim);
}
inode_unlock(dir);
dir = path.dentry->d_parent->d_inode;
ret = __bch2_unlink(dir, path.dentry, true);
if (ret)
goto err;
fsnotify_rmdir(dir, path.dentry);
d_delete(path.dentry);
err:
dput(victim);
path_put(&path);
return ret;
}

View File

@ -1667,8 +1667,7 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
if (!first)
seq_putc(seq, ':');
first = false;
seq_puts(seq, "/dev/");
seq_puts(seq, ca->name);
seq_puts(seq, ca->disk_sb.sb_name);
}
return 0;
@ -1901,7 +1900,7 @@ got_sb:
sb->s_flags |= SB_POSIXACL;
#endif
sb->s_shrink.seeks = 0;
sb->s_shrink->seeks = 0;
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
ret = PTR_ERR_OR_ZERO(vinode);

View File

@ -77,9 +77,8 @@ static inline int ptrcmp(void *l, void *r)
}
enum bch_inode_lock_op {
INODE_LOCK = (1U << 0),
INODE_PAGECACHE_BLOCK = (1U << 1),
INODE_UPDATE_LOCK = (1U << 2),
INODE_PAGECACHE_BLOCK = (1U << 0),
INODE_UPDATE_LOCK = (1U << 1),
};
#define bch2_lock_inodes(_locks, ...) \
@ -91,8 +90,6 @@ do { \
\
for (i = 1; i < ARRAY_SIZE(a); i++) \
if (a[i] != a[i - 1]) { \
if ((_locks) & INODE_LOCK) \
down_write_nested(&a[i]->v.i_rwsem, i); \
if ((_locks) & INODE_PAGECACHE_BLOCK) \
bch2_pagecache_block_get(a[i]);\
if ((_locks) & INODE_UPDATE_LOCK) \
@ -109,8 +106,6 @@ do { \
\
for (i = 1; i < ARRAY_SIZE(a); i++) \
if (a[i] != a[i - 1]) { \
if ((_locks) & INODE_LOCK) \
up_write(&a[i]->v.i_rwsem); \
if ((_locks) & INODE_PAGECACHE_BLOCK) \
bch2_pagecache_block_put(a[i]);\
if ((_locks) & INODE_UPDATE_LOCK) \

View File

@ -826,6 +826,18 @@ fsck_err:
goto out;
}
static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
{
struct btree_iter iter;
struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
int ret = bkey_err(k);
if (ret)
return ret;
bch2_trans_iter_exit(trans, &iter);
return k.k->type == KEY_TYPE_set;
}
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
@ -890,6 +902,17 @@ static int check_inode(struct btree_trans *trans,
return 0;
}
if (u.bi_flags & BCH_INODE_unlinked &&
c->sb.version >= bcachefs_metadata_version_deleted_inodes) {
ret = check_inode_deleted_list(trans, k.k->p);
if (ret)
return ret;
fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list,
"inode %llu:%u unlinked, but not on deleted list",
u.bi_inum, k.k->p.snapshot);
}
if (u.bi_flags & BCH_INODE_unlinked &&
(!c->sb.clean ||
fsck_err(c, inode_unlinked_but_clean,

View File

@ -1157,10 +1157,6 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
again:
need_another_pass = false;
ret = bch2_btree_write_buffer_flush_sync(trans);
if (ret)
goto err;
/*
* Weird transaction restart handling here because on successful delete,
* bch2_inode_rm_snapshot() will return a nested transaction restart,
@ -1191,8 +1187,12 @@ again:
}
bch2_trans_iter_exit(trans, &iter);
if (!ret && need_another_pass)
if (!ret && need_another_pass) {
ret = bch2_btree_write_buffer_flush_sync(trans);
if (ret)
goto err;
goto again;
}
err:
bch2_trans_put(trans);

View File

@ -580,9 +580,9 @@ static inline void wp_update_state(struct write_point *wp, bool running)
__wp_update_state(wp, state);
}
static void bch2_write_index(struct closure *cl)
static CLOSURE_CALLBACK(bch2_write_index)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
closure_type(op, struct bch_write_op, cl);
struct write_point *wp = op->wp;
struct workqueue_struct *wq = index_update_wq(op);
unsigned long flags;
@ -1208,9 +1208,9 @@ static void __bch2_nocow_write_done(struct bch_write_op *op)
bch2_nocow_write_convert_unwritten(op);
}
static void bch2_nocow_write_done(struct closure *cl)
static CLOSURE_CALLBACK(bch2_nocow_write_done)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
closure_type(op, struct bch_write_op, cl);
__bch2_nocow_write_done(op);
bch2_write_done(cl);
@ -1363,7 +1363,7 @@ err:
op->insert_keys.top = op->insert_keys.keys;
} else if (op->flags & BCH_WRITE_SYNC) {
closure_sync(&op->cl);
bch2_nocow_write_done(&op->cl);
bch2_nocow_write_done(&op->cl.work);
} else {
/*
* XXX
@ -1566,9 +1566,9 @@ err:
* If op->discard is true, instead of inserting the data it invalidates the
* region of the cache represented by op->bio and op->inode.
*/
void bch2_write(struct closure *cl)
CLOSURE_CALLBACK(bch2_write)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
closure_type(op, struct bch_write_op, cl);
struct bio *bio = &op->wbio.bio;
struct bch_fs *c = op->c;
unsigned data_len;

View File

@ -90,8 +90,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
op->devs_need_flush = NULL;
}
void bch2_write(struct closure *);
CLOSURE_CALLBACK(bch2_write);
void bch2_write_point_do_index_updates(struct work_struct *);
static inline struct bch_write_bio *wbio_init(struct bio *bio)

View File

@ -10,6 +10,7 @@
#include "bkey_methods.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "error.h"
#include "journal.h"
@ -147,6 +148,7 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
bch2_journal_reclaim_fast(j);
if (write)
closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
wake_up(&j->wait);
}
/*
@ -184,6 +186,8 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val)
/* Close out old buffer: */
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
trace_journal_entry_close(c, vstruct_bytes(buf->data));
sectors = vstruct_blocks_plus(buf->data, c->block_bits,
buf->u64s_reserved) << c->block_bits;
BUG_ON(sectors > buf->sectors);
@ -328,6 +332,7 @@ static int journal_entry_open(struct journal *j)
buf->must_flush = false;
buf->separate_flush = false;
buf->flush_time = 0;
buf->need_flush_to_write_buffer = true;
memset(buf->data, 0, sizeof(*buf->data));
buf->data->seq = cpu_to_le64(journal_cur_seq(j));
@ -764,6 +769,75 @@ void bch2_journal_block(struct journal *j)
journal_quiesce(j);
}
/*
* XXX: ideally this would not be closing the current journal entry, but
* otherwise we do not have a way to avoid racing with res_get() - j->blocked
* will race.
*/
static bool journal_reservations_stopped(struct journal *j)
{
union journal_res_state s;
journal_entry_close(j);
s.v = atomic64_read_acquire(&j->reservations.counter);
return s.buf0_count == 0 &&
s.buf1_count == 0 &&
s.buf2_count == 0 &&
s.buf3_count == 0;
}
void bch2_journal_block_reservations(struct journal *j)
{
spin_lock(&j->lock);
j->blocked++;
spin_unlock(&j->lock);
wait_event(j->wait, journal_reservations_stopped(j));
}
static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq)
{
spin_lock(&j->lock);
max_seq = min(max_seq, journal_cur_seq(j));
for (u64 seq = journal_last_unwritten_seq(j);
seq <= max_seq;
seq++) {
unsigned idx = seq & JOURNAL_BUF_MASK;
struct journal_buf *buf = j->buf + idx;
union journal_res_state s;
if (!buf->need_flush_to_write_buffer)
continue;
if (seq == journal_cur_seq(j))
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
s.v = atomic64_read_acquire(&j->reservations.counter);
if (journal_state_count(s, idx)) {
spin_unlock(&j->lock);
return ERR_PTR(-EAGAIN);
}
spin_unlock(&j->lock);
return buf;
}
spin_unlock(&j->lock);
return NULL;
}
struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq)
{
struct journal_buf *ret;
wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN));
return ret;
}
/* allocate journal on a device: */
static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
@ -1215,6 +1289,7 @@ int bch2_fs_journal_init(struct journal *j)
static struct lock_class_key res_key;
unsigned i;
mutex_init(&j->buf_lock);
spin_lock_init(&j->lock);
spin_lock_init(&j->err_lock);
init_waitqueue_head(&j->wait);

View File

@ -259,7 +259,7 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u
{
union journal_res_state s;
s.v = atomic64_sub_return(((union journal_res_state) {
s.v = atomic64_sub_return_release(((union journal_res_state) {
.buf0_count = idx == 0,
.buf1_count = idx == 1,
.buf2_count = idx == 2,
@ -427,6 +427,8 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
void bch2_journal_unblock(struct journal *);
void bch2_journal_block(struct journal *);
void bch2_journal_block_reservations(struct journal *);
struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq);
void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_debug_to_text(struct printbuf *, struct journal *);

View File

@ -4,6 +4,7 @@
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
@ -713,6 +714,22 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs
journal_entry_btree_keys_to_text(out, c, entry);
}
static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
enum bkey_invalid_flags flags)
{
return journal_entry_btree_keys_validate(c, jset, entry,
version, big_endian, READ);
}
static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry *entry)
{
journal_entry_btree_keys_to_text(out, c, entry);
}
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int,
@ -1025,10 +1042,9 @@ next_block:
return 0;
}
static void bch2_journal_read_device(struct closure *cl)
static CLOSURE_CALLBACK(bch2_journal_read_device)
{
struct journal_device *ja =
container_of(cl, struct journal_device, read);
closure_type(ja, struct journal_device, read);
struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
struct bch_fs *c = ca->fs;
struct journal_list *jlist =
@ -1494,6 +1510,8 @@ done:
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
/* we aren't holding j->lock: */
unsigned new_size = READ_ONCE(j->buf_size_want);
void *new_buf;
@ -1501,6 +1519,11 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
if (buf->buf_size >= new_size)
return;
size_t btree_write_buffer_size = new_size / 64;
if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
return;
new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
if (!new_buf)
return;
@ -1520,9 +1543,9 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
}
static void journal_write_done(struct closure *cl)
static CLOSURE_CALLBACK(journal_write_done)
{
struct journal *j = container_of(cl, struct journal, io);
closure_type(j, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
@ -1590,6 +1613,7 @@ static void journal_write_done(struct closure *cl)
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
bch2_journal_reclaim_fast(j);
bch2_journal_space_available(j);
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
@ -1641,9 +1665,9 @@ static void journal_write_endio(struct bio *bio)
percpu_ref_put(&ca->io_ref);
}
static void do_journal_write(struct closure *cl)
static CLOSURE_CALLBACK(do_journal_write)
{
struct journal *j = container_of(cl, struct journal, io);
closure_type(j, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
@ -1693,9 +1717,11 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct jset_entry *start, *end, *i, *next, *prev = NULL;
struct jset *jset = w->data;
struct journal_keys_to_wb wb = { NULL };
unsigned sectors, bytes, u64s;
bool validate_before_checksum = false;
unsigned long btree_roots_have = 0;
bool validate_before_checksum = false;
u64 seq = le64_to_cpu(jset->seq);
int ret;
/*
@ -1723,9 +1749,28 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
* to c->btree_roots we have to get any missing btree roots and
* add them to this journal entry:
*/
if (i->type == BCH_JSET_ENTRY_btree_root) {
switch (i->type) {
case BCH_JSET_ENTRY_btree_root:
bch2_journal_entry_to_btree_root(c, i);
__set_bit(i->btree_id, &btree_roots_have);
break;
case BCH_JSET_ENTRY_write_buffer_keys:
EBUG_ON(!w->need_flush_to_write_buffer);
if (!wb.wb)
bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
struct bkey_i *k;
jset_entry_for_each_key(i, k) {
ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
if (ret) {
bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer");
bch2_journal_keys_to_write_buffer_end(c, &wb);
return ret;
}
}
i->type = BCH_JSET_ENTRY_btree_keys;
break;
}
/* Can we merge with previous entry? */
@ -1748,6 +1793,10 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
memmove_u64s_down(prev, i, jset_u64s(u64s));
}
if (wb.wb)
bch2_journal_keys_to_write_buffer_end(c, &wb);
w->need_flush_to_write_buffer = false;
prev = prev ? vstruct_next(prev) : jset->start;
jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
@ -1755,8 +1804,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
bch2_journal_super_entries_add_common(c, &end,
le64_to_cpu(jset->seq));
bch2_journal_super_entries_add_common(c, &end, seq);
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
@ -1779,7 +1827,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
j->last_empty_seq = le64_to_cpu(jset->seq);
j->last_empty_seq = seq;
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
validate_before_checksum = true;
@ -1853,9 +1901,9 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
return 0;
}
void bch2_journal_write(struct closure *cl)
CLOSURE_CALLBACK(bch2_journal_write)
{
struct journal *j = container_of(cl, struct journal, io);
closure_type(j, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
@ -1875,9 +1923,11 @@ void bch2_journal_write(struct closure *cl)
if (ret)
goto err;
mutex_lock(&j->buf_lock);
journal_buf_realloc(j, w);
ret = bch2_journal_write_prep(j, w);
mutex_unlock(&j->buf_lock);
if (ret)
goto err;

View File

@ -60,6 +60,6 @@ void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
void bch2_journal_write(struct closure *);
CLOSURE_CALLBACK(bch2_journal_write);
#endif /* _BCACHEFS_JOURNAL_IO_H */

View File

@ -3,6 +3,7 @@
#include "bcachefs.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "errcode.h"
#include "error.h"
@ -50,20 +51,23 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
return available;
}
static inline void journal_set_watermark(struct journal *j)
void bch2_journal_set_watermark(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bool low_on_space = j->space[journal_space_clean].total * 4 <=
j->space[journal_space_total].total;
bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
unsigned watermark = low_on_space || low_on_pin
bool low_on_wb = bch2_btree_write_buffer_must_wait(c);
unsigned watermark = low_on_space || low_on_pin || low_on_wb
? BCH_WATERMARK_reclaim
: BCH_WATERMARK_stripe;
if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
&j->low_on_space_start, low_on_space) ||
track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
&j->low_on_pin_start, low_on_pin))
&j->low_on_pin_start, low_on_pin) ||
track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full],
&j->write_buffer_full_start, low_on_wb))
trace_and_count(c, journal_full, c);
swap(watermark, j->watermark);
@ -230,7 +234,7 @@ void bch2_journal_space_available(struct journal *j)
else
clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
journal_set_watermark(j);
bch2_journal_set_watermark(j);
out:
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_error = ret;
@ -303,6 +307,7 @@ void bch2_journal_reclaim_fast(struct journal *j)
* all btree nodes got written out
*/
while (!fifo_empty(&j->pin) &&
j->pin.front <= j->seq_ondisk &&
!atomic_read(&fifo_peek_front(&j->pin).count)) {
j->pin.front++;
popped = true;
@ -635,7 +640,6 @@ static u64 journal_seq_to_flush(struct journal *j)
static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bool kthread = (current->flags & PF_KTHREAD) != 0;
u64 seq_to_flush;
size_t min_nr, min_key_cache, nr_flushed;
unsigned flags;
@ -651,7 +655,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
flags = memalloc_noreclaim_save();
do {
if (kthread && kthread_should_stop())
if (kthread_should_stop())
break;
if (bch2_journal_error(j)) {

View File

@ -16,6 +16,7 @@ static inline void journal_reclaim_kick(struct journal *j)
unsigned bch2_journal_dev_buckets_available(struct journal *,
struct journal_device *,
enum journal_space_from);
void bch2_journal_set_watermark(struct journal *);
void bch2_journal_space_available(struct journal *);
static inline bool journal_pin_active(struct journal_entry_pin *pin)

View File

@ -36,6 +36,7 @@ struct journal_buf {
bool noflush; /* write has already been kicked off, and was noflush */
bool must_flush; /* something wants a flush */
bool separate_flush;
bool need_flush_to_write_buffer;
};
/*
@ -181,6 +182,12 @@ struct journal {
*/
darray_u64 early_journal_entries;
/*
* Protects journal_buf->data, when accessing without a journal
* reservation: for synchronization between the btree write buffer code
* and the journal write path:
*/
struct mutex buf_lock;
/*
* Two journal entries -- one is currently open for new entries, the
* other is possibly being written out.
@ -271,6 +278,7 @@ struct journal {
u64 low_on_space_start;
u64 low_on_pin_start;
u64 max_in_flight_start;
u64 write_buffer_full_start;
struct bch2_time_stats *flush_write_time;
struct bch2_time_stats *noflush_write_time;

View File

@ -27,6 +27,13 @@
#include <linux/ioprio.h>
#include <linux/kthread.h>
const char * const bch2_data_ops_strs[] = {
#define x(t, n, ...) [n] = #t,
BCH_DATA_OPS()
#undef x
NULL
};
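
/* Aside, not part of this commit: the table above uses the usual X-macro
 * expansion. A self-contained sketch of how it works is below; the
 * EXAMPLE_DATA_OPS() list and its numbering are illustrative stand-ins,
 * not copied from the real BCH_DATA_OPS() header.
 */
#include <stdio.h>

#define EXAMPLE_DATA_OPS()			\
	x(scrub,		0)		\
	x(rereplicate,		1)		\
	x(migrate,		2)		\
	x(rewrite_old_nodes,	3)		\
	x(drop_extra_replicas,	4)

/* Each x(t, n) expands to [n] = "t", producing a NULL-terminated string
 * table that can be indexed by op number or scanned for a name match. */
static const char * const example_ops_strs[] = {
#define x(t, n, ...) [n] = #t,
	EXAMPLE_DATA_OPS()
#undef x
	NULL
};

int main(void)
{
	for (unsigned i = 0; example_ops_strs[i]; i++)
		printf("%u: %s\n", i, example_ops_strs[i]);
	return 0;
}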
static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
{
if (trace_move_extent_enabled()) {
@ -163,12 +170,17 @@ void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
atomic_read(&ctxt->write_sectors) != sectors_pending);
}
static void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
{
move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
closure_sync(&ctxt->cl);
}
void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
struct bch_fs *c = ctxt->trans->c;
move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
closure_sync(&ctxt->cl);
bch2_moving_ctxt_flush_all(ctxt);
EBUG_ON(atomic_read(&ctxt->write_sectors));
EBUG_ON(atomic_read(&ctxt->write_ios));
@ -216,7 +228,7 @@ void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
trace_move_data(c, stats);
}
void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
{
memset(stats, 0, sizeof(*stats));
stats->data_type = BCH_DATA_user;
@ -484,8 +496,8 @@ int bch2_move_ratelimit(struct moving_context *ctxt)
struct bch_fs *c = ctxt->trans->c;
u64 delay;
if (ctxt->wait_on_copygc && !c->copygc_running) {
bch2_trans_unlock_long(ctxt->trans);
if (ctxt->wait_on_copygc && c->copygc_running) {
bch2_moving_ctxt_flush_all(ctxt);
wait_event_killable(c->copygc_running_wq,
!c->copygc_running ||
kthread_should_stop());
@ -503,7 +515,7 @@ int bch2_move_ratelimit(struct moving_context *ctxt)
set_current_state(TASK_INTERRUPTIBLE);
}
if ((current->flags & PF_KTHREAD) && kthread_should_stop()) {
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
return 1;
}
@ -512,7 +524,7 @@ int bch2_move_ratelimit(struct moving_context *ctxt)
schedule_timeout(delay);
if (unlikely(freezing(current))) {
move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
bch2_moving_ctxt_flush_all(ctxt);
try_to_freeze();
}
} while (delay);
@ -721,11 +733,10 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
fragmentation = a->fragmentation_lru;
ret = bch2_btree_write_buffer_flush(trans);
if (ret) {
ret = bch2_btree_write_buffer_tryflush(trans);
bch_err_msg(c, ret, "flushing btree write buffer");
if (ret)
goto err;
}
while (!(ret = bch2_move_ratelimit(ctxt))) {
bch2_trans_begin(trans);
@ -856,18 +867,17 @@ typedef bool (*move_btree_pred)(struct bch_fs *, void *,
struct data_update_opts *);
static int bch2_move_btree(struct bch_fs *c,
enum btree_id start_btree_id, struct bpos start_pos,
enum btree_id end_btree_id, struct bpos end_pos,
struct bbpos start,
struct bbpos end,
move_btree_pred pred, void *arg,
struct bch_move_stats *stats)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct moving_context ctxt;
struct btree_trans *trans;
struct btree_iter iter;
struct btree *b;
enum btree_id id;
enum btree_id btree;
struct data_update_opts data_opts;
int ret = 0;
@ -878,26 +888,26 @@ static int bch2_move_btree(struct bch_fs *c,
stats->data_type = BCH_DATA_btree;
for (id = start_btree_id;
id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
id++) {
stats->pos = BBPOS(id, POS_MIN);
for (btree = start.btree;
btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
btree ++) {
stats->pos = BBPOS(btree, POS_MIN);
if (!bch2_btree_id_root(c, id)->b)
if (!bch2_btree_id_root(c, btree)->b)
continue;
bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
BTREE_ITER_PREFETCH);
retry:
ret = 0;
while (bch2_trans_begin(trans),
(b = bch2_btree_iter_peek_node(&iter)) &&
!(ret = PTR_ERR_OR_ZERO(b))) {
if (kthread && kthread_should_stop())
if (kthread_should_stop())
break;
if ((cmp_int(id, end_btree_id) ?:
bpos_cmp(b->key.k.p, end_pos)) > 0)
if ((cmp_int(btree, end.btree) ?:
bpos_cmp(b->key.k.p, end.pos)) > 0)
break;
stats->pos = BBPOS(iter.btree_id, iter.pos);
@ -918,7 +928,7 @@ next:
bch2_trans_iter_exit(trans, &iter);
if (kthread && kthread_should_stop())
if (kthread_should_stop())
break;
}
@ -1034,8 +1044,8 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
int ret;
ret = bch2_move_btree(c,
0, POS_MIN,
BTREE_ID_NR, SPOS_MAX,
BBPOS_MIN,
BBPOS_MAX,
rewrite_old_nodes_pred, c, stats);
if (!ret) {
mutex_lock(&c->sb_lock);
@ -1050,71 +1060,101 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
return ret;
}
static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
struct bkey_s_c k,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
unsigned durability = bch2_bkey_durability(c, k);
unsigned replicas = bkey_is_btree_ptr(k.k)
? c->opts.metadata_replicas
: io_opts->data_replicas;
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
unsigned i = 0;
bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
unsigned d = bch2_extent_ptr_durability(c, &p);
if (d && durability - d >= replicas) {
data_opts->kill_ptrs |= BIT(i);
durability -= d;
}
i++;
}
return data_opts->kill_ptrs != 0;
}
static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
struct btree *b,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}
int bch2_data_job(struct bch_fs *c,
struct bch_move_stats *stats,
struct bch_ioctl_data op)
{
struct bbpos start = BBPOS(op.start_btree, op.start_pos);
struct bbpos end = BBPOS(op.end_btree, op.end_pos);
int ret = 0;
if (op.op >= BCH_DATA_OP_NR)
return -EINVAL;
bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
switch (op.op) {
case BCH_DATA_OP_REREPLICATE:
bch2_move_stats_init(stats, "rereplicate");
case BCH_DATA_OP_rereplicate:
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
ret = bch2_move_btree(c,
op.start_btree, op.start_pos,
op.end_btree, op.end_pos,
ret = bch2_move_btree(c, start, end,
rereplicate_btree_pred, c, stats) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
ret = bch2_move_data(c,
(struct bbpos) { op.start_btree, op.start_pos },
(struct bbpos) { op.end_btree, op.end_pos },
ret = bch2_move_data(c, start, end,
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
rereplicate_pred, c) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
bch2_move_stats_exit(stats, c);
break;
case BCH_DATA_OP_MIGRATE:
case BCH_DATA_OP_migrate:
if (op.migrate.dev >= c->sb.nr_devices)
return -EINVAL;
bch2_move_stats_init(stats, "migrate");
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
ret = bch2_move_btree(c,
op.start_btree, op.start_pos,
op.end_btree, op.end_pos,
ret = bch2_move_btree(c, start, end,
migrate_btree_pred, &op, stats) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
ret = bch2_move_data(c,
(struct bbpos) { op.start_btree, op.start_pos },
(struct bbpos) { op.end_btree, op.end_pos },
ret = bch2_move_data(c, start, end,
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
migrate_pred, &op) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
bch2_move_stats_exit(stats, c);
break;
case BCH_DATA_OP_REWRITE_OLD_NODES:
bch2_move_stats_init(stats, "rewrite_old_nodes");
case BCH_DATA_OP_rewrite_old_nodes:
ret = bch2_scan_old_btree_nodes(c, stats);
bch2_move_stats_exit(stats, c);
break;
case BCH_DATA_OP_drop_extra_replicas:
ret = bch2_move_btree(c, start, end,
drop_extra_replicas_btree_pred, c, stats) ?: ret;
ret = bch2_move_data(c, start, end, NULL, stats,
writepoint_hashed((unsigned long) current),
true,
drop_extra_replicas_pred, c) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
break;
default:
ret = -EINVAL;
}
bch2_move_stats_exit(stats, c);
return ret;
}

View File

@ -56,6 +56,8 @@ do { \
typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
struct bch_io_opts *, struct data_update_opts *);
extern const char * const bch2_data_ops_strs[];
void bch2_moving_ctxt_exit(struct moving_context *);
void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
struct bch_ratelimit *, struct bch_move_stats *,
@ -130,7 +132,7 @@ int bch2_data_job(struct bch_fs *,
void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
void bch2_move_stats_init(struct bch_move_stats *, char *);
void bch2_move_stats_init(struct bch_move_stats *, const char *);
void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);

View File

@ -153,8 +153,11 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
move_buckets_wait(ctxt, buckets_in_flight, false);
ret = bch2_btree_write_buffer_flush(trans);
if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
ret = bch2_btree_write_buffer_tryflush(trans);
if (bch2_err_matches(ret, EROFS))
return ret;
if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_tryflush()",
__func__, bch2_err_str(ret)))
return ret;

View File

@ -233,11 +233,6 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, true, \
NULL, "Stash pointer to in memory btree node in btree ptr")\
x(btree_write_buffer_size, u32, \
OPT_FS|OPT_MOUNT, \
OPT_UINT(16, (1U << 20) - 1), \
BCH2_NO_SB_OPT, 1U << 13, \
NULL, "Number of btree write buffer entries") \
x(gc_reserve_percent, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(5, 21), \

View File

@ -159,6 +159,8 @@ static int bch2_journal_replay(struct bch_fs *c)
goto err;
}
BUG_ON(!atomic_read(&keys->ref));
/*
* First, attempt to replay keys in sorted order. This is more
* efficient - better locality of btree access - but some might fail if
@ -218,14 +220,15 @@ static int bch2_journal_replay(struct bch_fs *c)
bch2_trans_put(trans);
trans = NULL;
if (!c->opts.keep_journal)
bch2_journal_keys_put_initial(c);
replay_now_at(j, j->replay_journal_seq_end);
j->replay_journal_seq = 0;
bch2_journal_set_replay_done(j);
bch2_journal_flush_all_pins(j);
ret = bch2_journal_error(j);
if (keys->nr && !ret)
if (keys->nr)
bch2_journal_log_msg(c, "journal replay finished");
err:
if (trans)
@ -935,8 +938,12 @@ use_clean:
bch2_move_stats_init(&stats, "recovery");
bch_info(c, "scanning for old btree nodes");
ret = bch2_fs_read_write(c) ?:
struct printbuf buf = PRINTBUF;
bch2_version_to_text(&buf, c->sb.version_min);
bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf);
printbuf_exit(&buf);
ret = bch2_fs_read_write_early(c) ?:
bch2_scan_old_btree_nodes(c, &stats);
if (ret)
goto err;
@ -953,10 +960,8 @@ out:
bch2_flush_fsck_errs(c);
if (!c->opts.keep_journal &&
test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) {
bch2_journal_keys_free(&c->journal_keys);
bch2_journal_entries_free(c);
}
test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
bch2_journal_keys_put_initial(c);
kfree(clean);
if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) {

View File

@ -11,7 +11,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
/* Replicas tracking - in memory: */
static void verify_replicas_entry(struct bch_replicas_entry *e)
static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
unsigned i;
@ -26,7 +26,7 @@ static void verify_replicas_entry(struct bch_replicas_entry *e)
#endif
}
void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
{
bubble_sort(e->devs, e->nr_devs, u8_cmp);
}
@ -53,7 +53,7 @@ static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
}
void bch2_replicas_entry_to_text(struct printbuf *out,
struct bch_replicas_entry *e)
struct bch_replicas_entry_v1 *e)
{
unsigned i;
@ -71,7 +71,7 @@ void bch2_replicas_entry_to_text(struct printbuf *out,
void bch2_cpu_replicas_to_text(struct printbuf *out,
struct bch_replicas_cpu *r)
{
struct bch_replicas_entry *e;
struct bch_replicas_entry_v1 *e;
bool first = true;
for_each_cpu_replicas_entry(r, e) {
@ -84,7 +84,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out,
}
static void extent_to_replicas(struct bkey_s_c k,
struct bch_replicas_entry *r)
struct bch_replicas_entry_v1 *r)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
@ -104,7 +104,7 @@ static void extent_to_replicas(struct bkey_s_c k,
}
static void stripe_to_replicas(struct bkey_s_c k,
struct bch_replicas_entry *r)
struct bch_replicas_entry_v1 *r)
{
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
const struct bch_extent_ptr *ptr;
@ -117,7 +117,7 @@ static void stripe_to_replicas(struct bkey_s_c k,
r->devs[r->nr_devs++] = ptr->dev;
}
void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
struct bkey_s_c k)
{
e->nr_devs = 0;
@ -142,7 +142,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
bch2_replicas_entry_sort(e);
}
void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
enum bch_data_type data_type,
struct bch_devs_list devs)
{
@ -164,7 +164,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
static struct bch_replicas_cpu
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
struct bch_replicas_entry *new_entry)
struct bch_replicas_entry_v1 *new_entry)
{
unsigned i;
struct bch_replicas_cpu new = {
@ -194,7 +194,7 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old,
}
static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
struct bch_replicas_entry *search)
struct bch_replicas_entry_v1 *search)
{
int idx, entry_size = replicas_entry_bytes(search);
@ -212,7 +212,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
}
int bch2_replicas_entry_idx(struct bch_fs *c,
struct bch_replicas_entry *search)
struct bch_replicas_entry_v1 *search)
{
bch2_replicas_entry_sort(search);
@ -220,13 +220,13 @@ int bch2_replicas_entry_idx(struct bch_fs *c,
}
static bool __replicas_has_entry(struct bch_replicas_cpu *r,
struct bch_replicas_entry *search)
struct bch_replicas_entry_v1 *search)
{
return __replicas_entry_idx(r, search) >= 0;
}
bool bch2_replicas_marked(struct bch_fs *c,
struct bch_replicas_entry *search)
struct bch_replicas_entry_v1 *search)
{
bool marked;
@ -343,7 +343,7 @@ err:
static unsigned reserve_journal_replicas(struct bch_fs *c,
struct bch_replicas_cpu *r)
{
struct bch_replicas_entry *e;
struct bch_replicas_entry_v1 *e;
unsigned journal_res_u64s = 0;
/* nr_inodes: */
@ -368,7 +368,7 @@ static unsigned reserve_journal_replicas(struct bch_fs *c,
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
struct bch_replicas_entry *new_entry)
struct bch_replicas_entry_v1 *new_entry)
{
struct bch_replicas_cpu new_r, new_gc;
int ret = 0;
@ -433,7 +433,7 @@ err:
goto out;
}
int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
return likely(bch2_replicas_marked(c, r))
? 0 : bch2_mark_replicas_slowpath(c, r);
@ -484,7 +484,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
struct bch_replicas_entry *e;
struct bch_replicas_entry_v1 *e;
unsigned i = 0;
lockdep_assert_held(&c->replicas_gc_lock);
@ -559,7 +559,7 @@ retry:
}
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
if (e->data_type == BCH_DATA_journal ||
@ -590,7 +590,7 @@ retry:
}
int bch2_replicas_set_usage(struct bch_fs *c,
struct bch_replicas_entry *r,
struct bch_replicas_entry_v1 *r,
u64 sectors)
{
int ret, idx = bch2_replicas_entry_idx(c, r);
@ -623,7 +623,7 @@ static int
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
struct bch_replicas_cpu *cpu_r)
{
struct bch_replicas_entry *e, *dst;
struct bch_replicas_entry_v1 *e, *dst;
unsigned nr = 0, entry_size = 0, idx = 0;
for_each_replicas_entry(sb_r, e) {
@ -661,7 +661,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
nr++;
}
entry_size += sizeof(struct bch_replicas_entry) -
entry_size += sizeof(struct bch_replicas_entry_v1) -
sizeof(struct bch_replicas_entry_v0);
cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
@ -672,7 +672,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
cpu_r->entry_size = entry_size;
for_each_replicas_entry(sb_r, e) {
struct bch_replicas_entry *dst =
struct bch_replicas_entry_v1 *dst =
cpu_replicas_entry(cpu_r, idx++);
dst->data_type = e->data_type;
@ -716,7 +716,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
{
struct bch_sb_field_replicas_v0 *sb_r;
struct bch_replicas_entry_v0 *dst;
struct bch_replicas_entry *src;
struct bch_replicas_entry_v1 *src;
size_t bytes;
bytes = sizeof(struct bch_sb_field_replicas);
@ -754,7 +754,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
struct bch_replicas_cpu *r)
{
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_entry *dst, *src;
struct bch_replicas_entry_v1 *dst, *src;
bool need_v1 = false;
size_t bytes;
@ -805,7 +805,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
memcmp, NULL);
for (i = 0; i < cpu_r->nr; i++) {
struct bch_replicas_entry *e =
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(cpu_r, i);
if (e->data_type >= BCH_DATA_NR) {
@ -835,7 +835,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
}
if (i + 1 < cpu_r->nr) {
struct bch_replicas_entry *n =
struct bch_replicas_entry_v1 *n =
cpu_replicas_entry(cpu_r, i + 1);
BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
@ -872,7 +872,7 @@ static void bch2_sb_replicas_to_text(struct printbuf *out,
struct bch_sb_field *f)
{
struct bch_sb_field_replicas *r = field_to_type(f, replicas);
struct bch_replicas_entry *e;
struct bch_replicas_entry_v1 *e;
bool first = true;
for_each_replicas_entry(r, e) {
@ -934,7 +934,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
unsigned flags, bool print)
{
struct bch_replicas_entry *e;
struct bch_replicas_entry_v1 *e;
bool ret = true;
percpu_down_read(&c->mark_lock);
@ -994,7 +994,7 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
if (replicas) {
struct bch_replicas_entry *r;
struct bch_replicas_entry_v1 *r;
for_each_replicas_entry(replicas, r)
for (i = 0; i < r->nr_devs; i++)

View File

@ -6,26 +6,26 @@
#include "eytzinger.h"
#include "replicas_types.h"
void bch2_replicas_entry_sort(struct bch_replicas_entry *);
void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
void bch2_replicas_entry_to_text(struct printbuf *,
struct bch_replicas_entry *);
struct bch_replicas_entry_v1 *);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
static inline struct bch_replicas_entry *
static inline struct bch_replicas_entry_v1 *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
return (void *) r->entries + r->entry_size * i;
}
int bch2_replicas_entry_idx(struct bch_fs *,
struct bch_replicas_entry *);
struct bch_replicas_entry_v1 *);
void bch2_devlist_to_replicas(struct bch_replicas_entry *,
void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *,
enum bch_data_type,
struct bch_devs_list);
bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *);
bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *);
int bch2_mark_replicas(struct bch_fs *,
struct bch_replicas_entry *);
struct bch_replicas_entry_v1 *);
static inline struct replicas_delta *
replicas_delta_next(struct replicas_delta *d)
@ -35,9 +35,9 @@ replicas_delta_next(struct replicas_delta *d)
int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c);
static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e,
unsigned dev)
{
e->data_type = BCH_DATA_cached;
@ -57,7 +57,7 @@ int bch2_replicas_gc_start(struct bch_fs *, unsigned);
int bch2_replicas_gc2(struct bch_fs *);
int bch2_replicas_set_usage(struct bch_fs *,
struct bch_replicas_entry *,
struct bch_replicas_entry_v1 *,
u64);
#define for_each_cpu_replicas_entry(_r, _i) \

View File

@ -5,12 +5,12 @@
struct bch_replicas_cpu {
unsigned nr;
unsigned entry_size;
struct bch_replicas_entry *entries;
struct bch_replicas_entry_v1 *entries;
};
struct replicas_delta {
s64 delta;
struct bch_replicas_entry r;
struct bch_replicas_entry_v1 r;
} __packed;
struct replicas_delta_list {

View File

@ -235,7 +235,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
}
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
struct jset_entry_data_usage *u =
container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),

View File

@ -248,7 +248,8 @@
x(root_inode_not_dir, 240) \
x(dir_loop, 241) \
x(hash_table_key_duplicate, 242) \
x(hash_table_key_wrong_offset, 243)
x(hash_table_key_wrong_offset, 243) \
x(unlinked_inode_not_on_deleted_list, 244)
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,

View File

@ -324,7 +324,7 @@ bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
}
EXPORT_SYMBOL_GPL(six_relock_ip);
#ifdef CONFIG_LOCK_SPIN_ON_OWNER
#ifdef CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN
static inline bool six_owner_running(struct six_lock *lock)
{

View File

@ -166,6 +166,7 @@ void bch2_free_super(struct bch_sb_handle *sb)
if (!IS_ERR_OR_NULL(sb->bdev))
blkdev_put(sb->bdev, sb->holder);
kfree(sb->holder);
kfree(sb->sb_name);
kfree(sb->sb);
memset(sb, 0, sizeof(*sb));
@ -657,12 +658,13 @@ reread:
return 0;
}
int bch2_read_super(const char *path, struct bch_opts *opts,
struct bch_sb_handle *sb)
int __bch2_read_super(const char *path, struct bch_opts *opts,
struct bch_sb_handle *sb, bool ignore_notbchfs_msg)
{
u64 offset = opt_get(*opts, sb);
struct bch_sb_layout layout;
struct printbuf err = PRINTBUF;
struct printbuf err2 = PRINTBUF;
__le64 *i;
int ret;
#ifndef __KERNEL__
@ -675,6 +677,10 @@ retry:
if (!sb->holder)
return -ENOMEM;
sb->sb_name = kstrdup(path, GFP_KERNEL);
if (!sb->sb_name)
return -ENOMEM;
#ifndef __KERNEL__
if (opt_get(*opts, direct_io) == false)
sb->mode |= BLK_OPEN_BUFFERED;
@ -721,8 +727,14 @@ retry:
if (opt_defined(*opts, sb))
goto err;
printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n",
prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n",
path, err.buf);
if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg)
printk(KERN_INFO "%s", err2.buf);
else
printk(KERN_ERR "%s", err2.buf);
printbuf_exit(&err2);
printbuf_reset(&err);
/*
@ -798,6 +810,20 @@ err_no_print:
goto out;
}
int bch2_read_super(const char *path, struct bch_opts *opts,
struct bch_sb_handle *sb)
{
return __bch2_read_super(path, opts, sb, false);
}
/* provide a silenced version for mount.bcachefs */
int bch2_read_super_silent(const char *path, struct bch_opts *opts,
struct bch_sb_handle *sb)
{
return __bch2_read_super(path, opts, sb, true);
}
/* write superblock: */
static void write_super_endio(struct bio *bio)

View File

@ -74,6 +74,7 @@ void bch2_free_super(struct bch_sb_handle *);
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_write_super(struct bch_fs *);
void __bch2_check_set_feature(struct bch_fs *, unsigned);

View File

@ -314,7 +314,8 @@ void bch2_fs_read_only(struct bch_fs *c)
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
BUG_ON(atomic_read(&c->btree_cache.dirty));
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
BUG_ON(c->btree_write_buffer.state.nr);
BUG_ON(c->btree_write_buffer.inc.keys.nr);
BUG_ON(c->btree_write_buffer.flushing.keys.nr);
bch_verbose(c, "marking filesystem clean");
bch2_fs_mark_clean(c);
@ -504,8 +505,8 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[WRITE]);
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
bch2_journal_keys_free(&c->journal_keys);
bch2_journal_entries_free(c);
bch2_journal_keys_put_initial(c);
BUG_ON(atomic_read(&c->journal_keys.ref));
bch2_fs_btree_write_buffer_exit(c);
percpu_free_rwsem(&c->mark_lock);
free_percpu(c->online_reserved);
@ -704,6 +705,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
init_rwsem(&c->gc_lock);
mutex_init(&c->gc_gens_lock);
atomic_set(&c->journal_keys.ref, 1);
c->journal_keys.initial_ref_held = true;
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);

View File

@ -5,6 +5,7 @@
struct bch_sb_handle {
struct bch_sb *sb;
struct block_device *bdev;
char *sb_name;
struct bio *bio;
void *holder;
size_t buffer_size;

View File

@ -496,7 +496,7 @@ STORE(bch2_fs)
sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
}
if (attr == &sysfs_btree_wakeup)

View File

@ -188,6 +188,25 @@ DEFINE_EVENT(bch_fs, journal_entry_full,
TP_ARGS(c)
);
TRACE_EVENT(journal_entry_close,
TP_PROTO(struct bch_fs *c, unsigned bytes),
TP_ARGS(c, bytes),
TP_STRUCT__entry(
__field(dev_t, dev )
__field(u32, bytes )
),
TP_fast_assign(
__entry->dev = c->dev;
__entry->bytes = bytes;
),
TP_printk("%d,%d entry bytes %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->bytes)
);
DEFINE_EVENT(bio, journal_write,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
@ -1313,21 +1332,38 @@ TRACE_EVENT(write_buffer_flush,
__entry->nr, __entry->size, __entry->skipped, __entry->fast)
);
TRACE_EVENT(write_buffer_flush_slowpath,
TP_PROTO(struct btree_trans *trans, size_t nr, size_t size),
TP_ARGS(trans, nr, size),
TRACE_EVENT(write_buffer_flush_sync,
TP_PROTO(struct btree_trans *trans, unsigned long caller_ip),
TP_ARGS(trans, caller_ip),
TP_STRUCT__entry(
__field(size_t, nr )
__field(size_t, size )
__array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
),
TP_fast_assign(
__entry->nr = nr;
__entry->size = size;
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
),
TP_printk("%zu/%zu", __entry->nr, __entry->size)
TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
);
TRACE_EVENT(write_buffer_flush_slowpath,
TP_PROTO(struct btree_trans *trans, size_t slowpath, size_t total),
TP_ARGS(trans, slowpath, total),
TP_STRUCT__entry(
__field(size_t, slowpath )
__field(size_t, total )
),
TP_fast_assign(
__entry->slowpath = slowpath;
__entry->total = total;
),
TP_printk("%zu/%zu", __entry->slowpath, __entry->total)
);
#endif /* _TRACE_BCACHEFS_H */

View File

@ -36,7 +36,7 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
closure_debug_destroy(cl);
if (destructor)
destructor(cl);
destructor(&cl->work);
if (parent)
closure_put(parent);
@ -108,8 +108,9 @@ struct closure_syncer {
int done;
};
static void closure_sync_fn(struct closure *cl)
static CLOSURE_CALLBACK(closure_sync_fn)
{
struct closure *cl = container_of(ws, struct closure, work);
struct closure_syncer *s = cl->s;
struct task_struct *p;

View File

@ -12,7 +12,12 @@
static LIST_HEAD(shrinker_list);
static DEFINE_MUTEX(shrinker_lock);
int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
{
return calloc(sizeof(struct shrinker), 1);
}
int shrinker_register(struct shrinker *shrinker)
{
mutex_lock(&shrinker_lock);
list_add_tail(&shrinker->list, &shrinker_list);