Update bcachefs sources to 8d3fc97ca3 bcachefs: Fixes for building in userspace

Kent Overstreet 2022-11-13 20:04:21 -05:00
parent 980f7437e2
commit c1e4d447f6
52 changed files with 907 additions and 755 deletions

View File

@ -1 +1 @@
61ebcb532a1266e5e36f354858b552e2a4fb9925
8d3fc97ca3f24d8f7ab1e9ed04d8ca354c44dd8c

View File

@ -2,7 +2,7 @@ PREFIX?=/usr/local
PKG_CONFIG?=pkg-config
INSTALL=install
CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall -fPIC \
CFLAGS+=-std=gnu11 -O2 -g -MMD -Wall -fPIC \
-Wno-pointer-sign \
-fno-strict-aliasing \
-fno-delete-null-pointer-checks \

View File

@ -54,6 +54,8 @@ typedef struct {
#define __ATOMIC_ADD_RETURN_RELEASE(v, p) \
__atomic_add_fetch(p, v, __ATOMIC_RELEASE)
#define __ATOMIC_SUB_RETURN(v, p) __atomic_sub_fetch(p, v, __ATOMIC_RELAXED)
#define __ATOMIC_SUB_RETURN_RELEASE(v, p) \
__atomic_sub_fetch(p, v, __ATOMIC_RELEASE)
#define xchg(p, v) __atomic_exchange_n(p, v, __ATOMIC_SEQ_CST)
#define xchg_acquire(p, v) __atomic_exchange_n(p, v, __ATOMIC_ACQUIRE)
@ -123,6 +125,11 @@ do { \
({ smp_mb__before_atomic(); __ATOMIC_ADD_RETURN(i, v); })
#endif
#ifndef __ATOMIC_SUB_RETURN_RELEASE
#define __ATOMIC_SUB_RETURN_RELEASE(i, v) \
({ smp_mb__before_atomic(); __ATOMIC_SUB_RETURN(i, v); })
#endif
#ifndef __ATOMIC_SUB
#define __ATOMIC_SUB(i, v) __ATOMIC_SUB_RETURN(i, v)
#endif
@ -164,6 +171,11 @@ static inline i_type a_type##_add_return_release(i_type i, a_type##_t *v)\
return __ATOMIC_ADD_RETURN_RELEASE(i, &v->counter); \
} \
\
static inline i_type a_type##_sub_return_release(i_type i, a_type##_t *v)\
{ \
return __ATOMIC_SUB_RETURN_RELEASE(i, &v->counter); \
} \
\
static inline i_type a_type##_sub_return(i_type i, a_type##_t *v) \
{ \
return __ATOMIC_SUB_RETURN(i, &v->counter); \
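
Note: the hunk above fills in the release-ordered subtract for the userspace atomics shim by mapping it onto the compiler's __atomic builtins. Below is a minimal, self-contained sketch of the same mapping applied to the classic refcount-put pattern; the type and function names (atomic_long_sketch_t, sketch_put) are illustrative, not bcachefs APIs.

#include <stdio.h>

typedef struct { long counter; } atomic_long_sketch_t;

/* Same mapping as __ATOMIC_SUB_RETURN_RELEASE above: subtract, return the
 * new value, with release ordering on the store. */
static long sketch_sub_return_release(long i, atomic_long_sketch_t *v)
{
	return __atomic_sub_fetch(&v->counter, i, __ATOMIC_RELEASE);
}

static void sketch_put(atomic_long_sketch_t *ref)
{
	if (sketch_sub_return_release(1, ref) == 0) {
		/* Pair the release above with an acquire so teardown cannot
		 * be reordered before the final decrement is observed. */
		__atomic_thread_fence(__ATOMIC_ACQUIRE);
		puts("last reference dropped, safe to free");
	}
}

int main(void)
{
	atomic_long_sketch_t ref = { .counter = 2 };

	sketch_put(&ref);
	sketch_put(&ref);	/* second put reaches zero and prints */
	return 0;
}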

View File

@ -229,6 +229,8 @@ static inline int __must_check kstrtos32(const char *s, unsigned int base, s32 *
}
struct printbuf;
extern void prt_u64(struct printbuf *out, u64 num);
extern __printf(2, 0) void prt_vprintf(struct printbuf *out, const char *fmt, va_list args);
extern __printf(2, 3) void prt_printf(struct printbuf *out, const char *fmt, ...);
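
The header now declares both a varargs prt_printf() and a va_list prt_vprintf(), with matching __printf() annotations. The usual reason for that split is so other varargs helpers can forward their arguments. A hedged, self-contained sketch of that forwarding pattern follows, using a fixed-size toy buffer (struct sketchbuf) rather than the real growable printbuf.

#include <stdarg.h>
#include <stdio.h>

/* Toy stand-in for struct printbuf; illustrative only. */
struct sketchbuf {
	char	buf[256];
	size_t	pos;
};

__attribute__((format(printf, 2, 0)))
static void sketch_vprintf(struct sketchbuf *out, const char *fmt, va_list args)
{
	size_t space = sizeof(out->buf) - out->pos;

	out->pos += vsnprintf(out->buf + out->pos, space, fmt, args);
	if (out->pos >= sizeof(out->buf))
		out->pos = sizeof(out->buf) - 1;
}

__attribute__((format(printf, 2, 3)))
static void sketch_printf(struct sketchbuf *out, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	sketch_vprintf(out, fmt, args);	/* varargs front end forwards the va_list */
	va_end(args);
}

int main(void)
{
	struct sketchbuf b = { .pos = 0 };

	sketch_printf(&b, "%s required %zu", "example", (size_t) 7);
	puts(b.buf);
	return 0;
}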

View File

@ -155,7 +155,7 @@ struct mean_and_variance_weighted {
u64 variance;
};
inline s64 fast_divpow2(s64 n, u8 d);
s64 fast_divpow2(s64 n, u8 d);
struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1);
s64 mean_and_variance_get_mean(struct mean_and_variance s);
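
The change above drops a stray inline qualifier from the fast_divpow2() prototype (an inline on a bare declaration with no visible definition does nothing useful here and was removed as part of the userspace build fixes). For context only: a helper of this shape typically does signed division by a power of two, truncating toward zero, via the bias-then-shift trick. The sketch below shows that generic technique under that assumption; it is not claimed to match the bcachefs implementation exactly.

#include <assert.h>
#include <stdint.h>

/* Divide n by 2^d, truncating toward zero, without a real division.
 * Plain n >> d would round toward negative infinity for n < 0, so
 * negative inputs get a bias of (2^d - 1) first. Assumes the usual
 * arithmetic right shift for signed values (true on gcc/clang). */
static int64_t divpow2_trunc(int64_t n, uint8_t d)
{
	return (n + (n < 0 ? (((int64_t) 1 << d) - 1) : 0)) >> d;
}

int main(void)
{
	assert(divpow2_trunc(5, 1) == 2);
	assert(divpow2_trunc(-5, 1) == -2);	/* -5/2 truncates to -2, not -3 */
	assert(divpow2_trunc(-4, 2) == -1);
	assert(divpow2_trunc(0, 8) == 0);
	return 0;
}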

View File

@ -24,6 +24,7 @@ typedef struct {
} wait_queue_head_t;
void wake_up(wait_queue_head_t *);
void wake_up_all(wait_queue_head_t *);
void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state);
void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);

View File

@ -344,25 +344,29 @@ DEFINE_EVENT(btree_node, btree_node_free,
TRACE_EVENT(btree_reserve_get_fail,
TP_PROTO(const char *trans_fn,
unsigned long caller_ip,
size_t required),
TP_ARGS(trans_fn, caller_ip, required),
size_t required,
int ret),
TP_ARGS(trans_fn, caller_ip, required, ret),
TP_STRUCT__entry(
__array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(size_t, required )
__array(char, ret, 32 )
),
TP_fast_assign(
strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
__entry->required = required;
strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
),
TP_printk("%s %pS required %zu",
TP_printk("%s %pS required %zu ret %s",
__entry->trans_fn,
(void *) __entry->caller_ip,
__entry->required)
__entry->required,
__entry->ret)
);
DEFINE_EVENT(btree_node, btree_node_compact,
@ -542,14 +546,11 @@ TRACE_EVENT(bucket_alloc_fail,
u64 avail,
u64 copygc_wait_amount,
s64 copygc_waiting_for,
u64 seen,
u64 open,
u64 need_journal_commit,
u64 nouse,
struct bucket_alloc_state *s,
bool nonblocking,
const char *err),
TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for,
seen, open, need_journal_commit, nouse, nonblocking, err),
s, nonblocking, err),
TP_STRUCT__entry(
__field(dev_t, dev )
@ -573,10 +574,10 @@ TRACE_EVENT(bucket_alloc_fail,
__entry->avail = avail;
__entry->copygc_wait_amount = copygc_wait_amount;
__entry->copygc_waiting_for = copygc_waiting_for;
__entry->seen = seen;
__entry->open = open;
__entry->need_journal_commit = need_journal_commit;
__entry->nouse = nouse;
__entry->seen = s->buckets_seen;
__entry->open = s->skipped_open;
__entry->need_journal_commit = s->skipped_need_journal_commit;
__entry->nouse = s->skipped_nouse;
__entry->nonblocking = nonblocking;
strscpy(__entry->err, err, sizeof(__entry->err));
),

View File

@ -279,6 +279,22 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
return -EINVAL;
}
/*
* XXX this is wrong, we'll be checking updates that happened from
* before BCH_FS_CHECK_BACKPOINTERS_DONE
*/
if (rw == WRITE && test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
unsigned i, bp_len = 0;
for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++)
bp_len += alloc_v4_backpointers_c(a.v)[i].bucket_len;
if (bp_len > a.v->dirty_sectors) {
prt_printf(err, "too many backpointers");
return -EINVAL;
}
}
if (rw == WRITE) {
if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) {
prt_printf(err, "invalid data type (got %u should be %u)",

View File

@ -195,26 +195,24 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
u64 bucket,
enum alloc_reserve reserve,
struct bch_alloc_v4 *a,
u64 *skipped_open,
u64 *skipped_need_journal_commit,
u64 *skipped_nouse,
struct bucket_alloc_state *s,
struct closure *cl)
{
struct open_bucket *ob;
if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
(*skipped_nouse)++;
s->skipped_nouse++;
return NULL;
}
if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
(*skipped_open)++;
s->skipped_open++;
return NULL;
}
if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
(*skipped_need_journal_commit)++;
s->skipped_need_journal_commit++;
return NULL;
}
@ -234,7 +232,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
/* Recheck under lock: */
if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
spin_unlock(&c->freelist_lock);
(*skipped_open)++;
s->skipped_open++;
return NULL;
}
@ -274,9 +272,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
enum alloc_reserve reserve, u64 free_entry,
u64 *skipped_open,
u64 *skipped_need_journal_commit,
u64 *skipped_nouse,
struct bucket_alloc_state *s,
struct bkey_s_c freespace_k,
struct closure *cl)
{
@ -339,7 +335,8 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
u64 bp_offset = 0;
ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
&bp_offset, &bp, 0);
&bp_offset, &bp,
BTREE_ITER_NOPRESERVE);
if (ret) {
ob = ERR_PTR(ret);
goto err;
@ -356,11 +353,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
}
}
ob = __try_alloc_bucket(c, ca, b, reserve, &a,
skipped_open,
skipped_need_journal_commit,
skipped_nouse,
cl);
ob = __try_alloc_bucket(c, ca, b, reserve, &a, s, cl);
if (!ob)
iter.path->preserve = false;
err:
@ -406,11 +399,7 @@ static noinline struct open_bucket *
bch2_bucket_alloc_early(struct btree_trans *trans,
struct bch_dev *ca,
enum alloc_reserve reserve,
u64 *cur_bucket,
u64 *buckets_seen,
u64 *skipped_open,
u64 *skipped_need_journal_commit,
u64 *skipped_nouse,
struct bucket_alloc_state *s,
struct closure *cl)
{
struct btree_iter iter;
@ -418,10 +407,10 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
struct open_bucket *ob = NULL;
int ret;
*cur_bucket = max_t(u64, *cur_bucket, ca->mi.first_bucket);
*cur_bucket = max_t(u64, *cur_bucket, ca->new_fs_bucket_idx);
s->cur_bucket = max_t(u64, s->cur_bucket, ca->mi.first_bucket);
s->cur_bucket = max_t(u64, s->cur_bucket, ca->new_fs_bucket_idx);
for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *cur_bucket),
for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, s->cur_bucket),
BTREE_ITER_SLOTS, k, ret) {
struct bch_alloc_v4 a;
@ -437,19 +426,15 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
if (a.data_type != BCH_DATA_free)
continue;
(*buckets_seen)++;
s->buckets_seen++;
ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a,
skipped_open,
skipped_need_journal_commit,
skipped_nouse,
cl);
ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a, s, cl);
if (ob)
break;
}
bch2_trans_iter_exit(trans, &iter);
*cur_bucket = iter.pos.offset;
s->cur_bucket = iter.pos.offset;
return ob ?: ERR_PTR(ret ?: -BCH_ERR_no_buckets_found);
}
@ -457,11 +442,7 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
struct bch_dev *ca,
enum alloc_reserve reserve,
u64 *cur_bucket,
u64 *buckets_seen,
u64 *skipped_open,
u64 *skipped_need_journal_commit,
u64 *skipped_nouse,
struct bucket_alloc_state *s,
struct closure *cl)
{
struct btree_iter iter;
@ -477,25 +458,21 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
* at previously
*/
for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
POS(ca->dev_idx, *cur_bucket), 0, k, ret) {
POS(ca->dev_idx, s->cur_bucket), 0, k, ret) {
if (k.k->p.inode != ca->dev_idx)
break;
for (*cur_bucket = max(*cur_bucket, bkey_start_offset(k.k));
*cur_bucket < k.k->p.offset;
(*cur_bucket)++) {
for (s->cur_bucket = max(s->cur_bucket, bkey_start_offset(k.k));
s->cur_bucket < k.k->p.offset;
s->cur_bucket++) {
ret = btree_trans_too_many_iters(trans);
if (ret)
break;
(*buckets_seen)++;
s->buckets_seen++;
ob = try_alloc_bucket(trans, ca, reserve,
*cur_bucket,
skipped_open,
skipped_need_journal_commit,
skipped_nouse,
k, cl);
s->cur_bucket, s, k, cl);
if (ob)
break;
}
@ -525,11 +502,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
bool freespace_initialized = READ_ONCE(ca->mi.freespace_initialized);
u64 start = freespace_initialized ? 0 : ca->bucket_alloc_trans_early_cursor;
u64 avail;
u64 cur_bucket = start;
u64 buckets_seen = 0;
u64 skipped_open = 0;
u64 skipped_need_journal_commit = 0;
u64 skipped_nouse = 0;
struct bucket_alloc_state s = { .cur_bucket = start };
bool waiting = false;
again:
bch2_dev_usage_read_fast(ca, usage);
@ -568,31 +541,19 @@ again:
}
ob = likely(ca->mi.freespace_initialized)
? bch2_bucket_alloc_freelist(trans, ca, reserve,
&cur_bucket,
&buckets_seen,
&skipped_open,
&skipped_need_journal_commit,
&skipped_nouse,
cl)
: bch2_bucket_alloc_early(trans, ca, reserve,
&cur_bucket,
&buckets_seen,
&skipped_open,
&skipped_need_journal_commit,
&skipped_nouse,
cl);
? bch2_bucket_alloc_freelist(trans, ca, reserve, &s, cl)
: bch2_bucket_alloc_early(trans, ca, reserve, &s, cl);
if (skipped_need_journal_commit * 2 > avail)
if (s.skipped_need_journal_commit * 2 > avail)
bch2_journal_flush_async(&c->journal, NULL);
if (!ob && !freespace_initialized && start) {
start = cur_bucket = 0;
start = s.cur_bucket = 0;
goto again;
}
if (!freespace_initialized)
ca->bucket_alloc_trans_early_cursor = cur_bucket;
ca->bucket_alloc_trans_early_cursor = s.cur_bucket;
err:
if (!ob)
ob = ERR_PTR(-BCH_ERR_no_buckets_found);
@ -607,10 +568,7 @@ err:
avail,
bch2_copygc_wait_amount(c),
c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
buckets_seen,
skipped_open,
skipped_need_journal_commit,
skipped_nouse,
&s,
cl == NULL,
bch2_err_str(PTR_ERR(ob)));
@ -1152,16 +1110,17 @@ out:
/*
* Get us an open_bucket we can allocate from, return with it locked:
*/
struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *trans,
unsigned target,
unsigned erasure_code,
struct write_point_specifier write_point,
struct bch_devs_list *devs_have,
unsigned nr_replicas,
unsigned nr_replicas_required,
enum alloc_reserve reserve,
unsigned flags,
struct closure *cl)
int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
unsigned target,
unsigned erasure_code,
struct write_point_specifier write_point,
struct bch_devs_list *devs_have,
unsigned nr_replicas,
unsigned nr_replicas_required,
enum alloc_reserve reserve,
unsigned flags,
struct closure *cl,
struct write_point **wp_ret)
{
struct bch_fs *c = trans->c;
struct write_point *wp;
@ -1183,7 +1142,7 @@ retry:
write_points_nr = c->write_points_nr;
have_cache = false;
wp = writepoint_find(trans, write_point.v);
*wp_ret = wp = writepoint_find(trans, write_point.v);
if (wp->data_type == BCH_DATA_user)
ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
@ -1240,7 +1199,7 @@ alloc_done:
BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
return wp;
return 0;
err:
open_bucket_for_each(c, &wp->ptrs, ob, i)
if (ptrs.nr < ARRAY_SIZE(ptrs.v))
@ -1258,39 +1217,13 @@ err:
if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
bch2_err_matches(ret, BCH_ERR_freelist_empty))
return cl
? ERR_PTR(-EAGAIN)
: ERR_PTR(-BCH_ERR_ENOSPC_bucket_alloc);
? -EAGAIN
: -BCH_ERR_ENOSPC_bucket_alloc;
if (bch2_err_matches(ret, BCH_ERR_insufficient_devices))
return ERR_PTR(-EROFS);
return ERR_PTR(ret);
}
struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
unsigned target,
unsigned erasure_code,
struct write_point_specifier write_point,
struct bch_devs_list *devs_have,
unsigned nr_replicas,
unsigned nr_replicas_required,
enum alloc_reserve reserve,
unsigned flags,
struct closure *cl)
{
struct write_point *wp;
bch2_trans_do(c, NULL, NULL, 0,
PTR_ERR_OR_ZERO(wp = bch2_alloc_sectors_start_trans(&trans, target,
erasure_code,
write_point,
devs_have,
nr_replicas,
nr_replicas_required,
reserve,
flags, cl)));
return wp;
return -EROFS;
return ret;
}
struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
@ -1361,6 +1294,10 @@ static inline void writepoint_init(struct write_point *wp,
{
mutex_init(&wp->lock);
wp->data_type = type;
INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
INIT_LIST_HEAD(&wp->writes);
spin_lock_init(&wp->writes_lock);
}
void bch2_fs_allocator_foreground_init(struct bch_fs *c)

View File

@ -136,22 +136,15 @@ int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
unsigned, unsigned *, bool *, enum alloc_reserve,
unsigned, struct closure *);
struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *,
unsigned, unsigned,
struct write_point_specifier,
struct bch_devs_list *,
unsigned, unsigned,
enum alloc_reserve,
unsigned,
struct closure *);
struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
unsigned, unsigned,
struct write_point_specifier,
struct bch_devs_list *,
unsigned, unsigned,
enum alloc_reserve,
unsigned,
struct closure *);
int bch2_alloc_sectors_start_trans(struct btree_trans *,
unsigned, unsigned,
struct write_point_specifier,
struct bch_devs_list *,
unsigned, unsigned,
enum alloc_reserve,
unsigned,
struct closure *,
struct write_point **);
struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *);
void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,

View File

@ -8,6 +8,14 @@
#include "clock_types.h"
#include "fifo.h"
struct bucket_alloc_state {
u64 cur_bucket;
u64 buckets_seen;
u64 skipped_open;
u64 skipped_need_journal_commit;
u64 skipped_nouse;
};
struct ec_bucket_buf;
#define BCH_ALLOC_RESERVES() \
@ -78,6 +86,11 @@ struct write_point {
struct open_buckets ptrs;
struct dev_stripe_state stripe;
struct work_struct index_update_work;
struct list_head writes;
spinlock_t writes_lock;
};
struct write_point_specifier {
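
The new struct bucket_alloc_state above collects the counters (cur_bucket, buckets_seen, skipped_open, ...) that the allocator paths previously threaded through as five separate u64 pointers; the alloc_foreground.c hunks earlier in this commit switch every caller over to it. A minimal before/after sketch of that refactor pattern, with hypothetical names (scan_old, scan_new, struct scan_state):

#include <stdint.h>
#include <stdio.h>

/* Before: every counter is its own out-parameter, and each new statistic
 * forces a signature change all the way down the call chain. */
static int scan_old(uint64_t *buckets_seen, uint64_t *skipped_open)
{
	(*buckets_seen)++;
	(*skipped_open)++;
	return 0;
}

/* After: one state struct travels by pointer; adding a counter only
 * touches the struct and the code that actually uses it. */
struct scan_state {
	uint64_t	cur_bucket;
	uint64_t	buckets_seen;
	uint64_t	skipped_open;
};

static int scan_new(struct scan_state *s)
{
	s->buckets_seen++;
	s->skipped_open++;
	s->cur_bucket++;
	return 0;
}

int main(void)
{
	uint64_t seen = 0, open = 0;
	struct scan_state s = { 0 };

	scan_old(&seen, &open);
	scan_new(&s);
	printf("old: %llu %llu, new: %llu %llu\n",
	       (unsigned long long) seen, (unsigned long long) open,
	       (unsigned long long) s.buckets_seen,
	       (unsigned long long) s.skipped_open);
	return 0;
}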

View File

@ -9,8 +9,6 @@
#include <linux/mm.h>
#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10
/*
* Convert from pos in backpointer btree to pos of corresponding bucket in alloc
* btree:
@ -43,27 +41,6 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
return ret;
}
void bch2_extent_ptr_to_bp(struct bch_fs *c,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, struct extent_ptr_decoded p,
struct bpos *bucket_pos, struct bch_backpointer *bp)
{
enum bch_data_type data_type = level ? BCH_DATA_btree : BCH_DATA_user;
s64 sectors = level ? btree_sectors(c) : k.k->size;
u32 bucket_offset;
*bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset);
*bp = (struct bch_backpointer) {
.btree_id = btree_id,
.level = level,
.data_type = data_type,
.bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) +
p.crc.offset,
.bucket_len = ptr_disk_sectors(sectors, p),
.pos = k.k->p,
};
}
static bool extent_matches_bp(struct bch_fs *c,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k,

View File

@ -2,6 +2,7 @@
#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
#include "buckets.h"
#include "super.h"
int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k,
@ -16,9 +17,28 @@ void bch2_backpointer_swab(struct bkey_s);
.swab = bch2_backpointer_swab, \
})
void bch2_extent_ptr_to_bp(struct bch_fs *, enum btree_id, unsigned,
struct bkey_s_c, struct extent_ptr_decoded,
struct bpos *, struct bch_backpointer *);
#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10
static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, struct extent_ptr_decoded p,
struct bpos *bucket_pos, struct bch_backpointer *bp)
{
enum bch_data_type data_type = level ? BCH_DATA_btree : BCH_DATA_user;
s64 sectors = level ? btree_sectors(c) : k.k->size;
u32 bucket_offset;
*bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset);
*bp = (struct bch_backpointer) {
.btree_id = btree_id,
.level = level,
.data_type = data_type,
.bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) +
p.crc.offset,
.bucket_len = ptr_disk_sectors(sectors, p),
.pos = k.k->p,
};
}
int bch2_bucket_backpointer_del(struct btree_trans *, struct bkey_i_alloc_v4 *,
struct bch_backpointer, struct bkey_s_c);

View File

@ -226,6 +226,10 @@ do { \
dynamic_fault("bcachefs:meta:write:" name)
#ifdef __KERNEL__
#define BCACHEFS_LOG_PREFIX
#endif
#ifdef BCACHEFS_LOG_PREFIX
#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name)
#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
@ -598,6 +602,23 @@ typedef struct {
#define BCACHEFS_ROOT_SUBVOL_INUM \
((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
#define BCH_BTREE_WRITE_TYPES() \
x(initial, 0) \
x(init_next_bset, 1) \
x(cache_reclaim, 2) \
x(journal_reclaim, 3) \
x(interior, 4)
enum btree_write_type {
#define x(t, n) BTREE_WRITE_##t,
BCH_BTREE_WRITE_TYPES()
#undef x
BTREE_WRITE_TYPE_NR,
};
#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1)
#define BTREE_WRITE_TYPE_BITS ilog2(BTREE_WRITE_TYPE_MASK)
struct bch_fs {
struct closure cl;
@ -707,6 +728,13 @@ struct bch_fs {
struct workqueue_struct *btree_interior_update_worker;
struct work_struct btree_interior_update_work;
/* btree_io.c: */
spinlock_t btree_write_error_lock;
struct btree_write_stats {
atomic64_t nr;
atomic64_t bytes;
} btree_write_stats[BTREE_WRITE_TYPE_NR];
/* btree_iter.c: */
struct mutex btree_trans_lock;
struct list_head btree_trans_list;
@ -881,11 +909,6 @@ struct bch_fs {
struct bio_set dio_write_bioset;
struct bio_set dio_read_bioset;
atomic64_t btree_writes_nr;
atomic64_t btree_writes_sectors;
spinlock_t btree_write_error_lock;
/* ERRORS */
struct list_head fsck_errors;
struct mutex fsck_error_lock;
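
BCH_BTREE_WRITE_TYPES() above is an x-macro table: the same list expands into the enum here and, later in this commit, into the bch2_btree_write_types[] name array used by bch2_btree_write_stats_to_text(). A self-contained sketch of the pattern with placeholder names (SKETCH_*):

#include <stdio.h>

/* One table, expanded twice with different definitions of x(). */
#define SKETCH_WRITE_TYPES()		\
	x(initial,		0)	\
	x(init_next_bset,	1)	\
	x(cache_reclaim,	2)	\
	x(journal_reclaim,	3)	\
	x(interior,		4)

enum sketch_write_type {
#define x(t, n)	SKETCH_WRITE_##t,
	SKETCH_WRITE_TYPES()
#undef x
	SKETCH_WRITE_TYPE_NR,
};

static const char * const sketch_write_types[] = {
#define x(t, n)	[n] = #t,
	SKETCH_WRITE_TYPES()
#undef x
};

int main(void)
{
	for (unsigned i = 0; i < SKETCH_WRITE_TYPE_NR; i++)
		printf("%u: %s\n", i, sketch_write_types[i]);
	return 0;
}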

View File

@ -178,7 +178,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst,
continue;
while ((next = sort_iter_peek(iter)) &&
!bch2_bkey_cmp_packed(iter->b, in, next)) {
!bch2_bkey_cmp_packed_inlined(iter->b, in, next)) {
BUG_ON(in->needs_whiteout &&
next->needs_whiteout);
needs_whiteout |= in->needs_whiteout;

View File

@ -280,9 +280,11 @@ wait_on_io:
* the post write cleanup:
*/
if (bch2_verify_btree_ondisk)
bch2_btree_node_write(c, b, SIX_LOCK_intent, 0);
bch2_btree_node_write(c, b, SIX_LOCK_intent,
BTREE_WRITE_cache_reclaim);
else
__bch2_btree_node_write(c, b, 0);
__bch2_btree_node_write(c, b,
BTREE_WRITE_cache_reclaim);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
@ -389,7 +391,7 @@ restart:
six_trylock_read(&b->c.lock)) {
list_move(&bc->live, &b->list);
mutex_unlock(&bc->lock);
__bch2_btree_node_write(c, b, 0);
__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
six_unlock_read(&b->c.lock);
if (touched >= nr)
goto out_nounlock;
@ -675,6 +677,7 @@ out:
b->flags = 0;
b->written = 0;
b->nsets = 0;
b->write_type = 0;
b->sib_u64s[0] = 0;
b->sib_u64s[1] = 0;
b->whiteout_u64s = 0;
@ -1118,7 +1121,7 @@ wait_on_io:
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
if (btree_node_dirty(b)) {
__bch2_btree_node_write(c, b, 0);
__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
goto wait_on_io;

View File

@ -450,6 +450,24 @@ void bch2_btree_build_aux_trees(struct btree *b)
t == bset_tree_last(b));
}
/*
* If we have MAX_BSETS (3) bsets, should we sort them all down to just one?
*
* The first bset is going to be of similar order to the size of the node, the
* last bset is bounded by btree_write_set_buffer(), which is set to keep the
* memmove on insert from being too expensive: the middle bset should, ideally,
* be the geometric mean of the first and the last.
*
* Returns true if the middle bset is greater than that geometric mean:
*/
static inline bool should_compact_all(struct bch_fs *c, struct btree *b)
{
unsigned mid_u64s_bits =
(ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2;
return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits;
}
/*
* @bch_btree_init_next - initialize a new (unwritten) bset that can then be
* inserted into
@ -467,19 +485,14 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
EBUG_ON(!(b->c.lock.state.seq & 1));
BUG_ON(bset_written(b, bset(b, &b->set[1])));
BUG_ON(btree_node_just_written(b));
if (b->nsets == MAX_BSETS &&
!btree_node_write_in_flight(b)) {
unsigned log_u64s[] = {
ilog2(bset_u64s(&b->set[0])),
ilog2(bset_u64s(&b->set[1])),
ilog2(bset_u64s(&b->set[2])),
};
if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) {
bch2_btree_node_write(c, b, SIX_LOCK_write, 0);
reinit_iter = true;
}
!btree_node_write_in_flight(b) &&
should_compact_all(c, b)) {
bch2_btree_node_write(c, b, SIX_LOCK_write,
BTREE_WRITE_init_next_bset);
reinit_iter = true;
}
if (b->nsets == MAX_BSETS &&
@ -1653,7 +1666,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
} while ((v = cmpxchg(&b->flags, old, new)) != old);
if (new & (1U << BTREE_NODE_write_in_flight))
__bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED);
__bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|b->write_type);
else
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}
@ -1802,6 +1815,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
bool used_mempool;
unsigned long old, new;
bool validate_before_checksum = false;
enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
void *data;
int ret;
@ -1848,6 +1862,12 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
if (new & (1U << BTREE_NODE_need_write))
return;
do_write:
if ((flags & BTREE_WRITE_ONLY_IF_NEED))
type = b->write_type;
b->write_type = 0;
BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
atomic_dec(&c->btree_cache.dirty);
BUG_ON(btree_node_fake(b));
@ -2022,8 +2042,8 @@ do_write:
bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
cpu_to_le16(b->written);
atomic64_inc(&c->btree_writes_nr);
atomic64_add(sectors_to_write, &c->btree_writes_sectors);
atomic64_inc(&c->btree_write_stats[type].nr);
atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
INIT_WORK(&wbio->work, btree_write_submit);
queue_work(c->io_complete_wq, &wbio->work);
@ -2151,3 +2171,33 @@ bool bch2_btree_flush_all_writes(struct bch_fs *c)
{
return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
}
const char * const bch2_btree_write_types[] = {
#define x(t, n) [n] = #t,
BCH_BTREE_WRITE_TYPES()
NULL
};
void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
printbuf_tabstop_push(out, 20);
printbuf_tabstop_push(out, 10);
prt_tab(out);
prt_str(out, "nr");
prt_tab(out);
prt_str(out, "size");
prt_newline(out);
for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
u64 nr = atomic64_read(&c->btree_write_stats[i].nr);
u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes);
prt_printf(out, "%s:", bch2_btree_write_types[i]);
prt_tab(out);
prt_u64(out, nr);
prt_tab(out);
prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
prt_newline(out);
}
}
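
On the should_compact_all() heuristic added near the top of this file's diff: the geometric mean of 2^a and 2^b is 2^((a+b)/2), so comparing the middle bset against it reduces to averaging two exponents, which is what the (ilog2(...) + ...)/2 expression does. A standalone sketch of the same comparison; the concrete sizes used in main (a 2^14-u64 node and a 2^8-u64 write-set buffer) are illustrative assumptions only.

#include <stdbool.h>
#include <stdio.h>

/* Integer log2 for the illustration (inputs assumed >= 1). */
static unsigned ilog2_sketch(unsigned long v)
{
	unsigned r = 0;

	while (v >>= 1)
		r++;
	return r;
}

/* Geometric mean of 2^a and 2^b is 2^((a+b)/2), so the check can be done
 * purely on exponents, mirroring should_compact_all() above. */
static bool middle_exceeds_geometric_mean(unsigned long first_u64s,
					  unsigned long middle_u64s,
					  unsigned long last_u64s)
{
	unsigned mid_bits = (ilog2_sketch(first_u64s) + ilog2_sketch(last_u64s)) / 2;

	return middle_u64s > (1UL << mid_bits);
}

int main(void)
{
	/* Threshold here is 2^((14+8)/2) = 2048 u64s for the middle bset. */
	printf("%d\n", middle_exceeds_geometric_mean(1UL << 14, 3000, 1UL << 8)); /* 1 */
	printf("%d\n", middle_exceeds_geometric_mean(1UL << 14, 1500, 1UL << 8)); /* 0 */
	return 0;
}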

View File

@ -139,8 +139,12 @@ void bch2_btree_complete_write(struct bch_fs *, struct btree *,
bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
#define BTREE_WRITE_ONLY_IF_NEED (1U << 0)
#define BTREE_WRITE_ALREADY_STARTED (1U << 1)
enum btree_write_flags {
__BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS,
__BTREE_WRITE_ALREADY_STARTED,
};
#define BTREE_WRITE_ONLY_IF_NEED (1U << __BTREE_WRITE_ONLY_IF_NEED )
#define BTREE_WRITE_ALREADY_STARTED (1U << __BTREE_WRITE_ALREADY_STARTED)
void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
void bch2_btree_node_write(struct bch_fs *, struct btree *,
@ -219,4 +223,6 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
bn->min_key = bpos_nosnap_successor(bn->min_key);
}
void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *);
#endif /* _BCACHEFS_BTREE_IO_H */
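
With this change the boolean write flags are allocated above the bits that carry the btree write type, so a single unsigned flags argument transports both and __bch2_btree_node_write() can recover the type with flags & BTREE_WRITE_TYPE_MASK. A sketch with explicit bit positions follows; the real header derives the width from BTREE_WRITE_TYPE_NR, and the SKETCH_* names are illustrative.

#include <stdio.h>

/* The write type lives in the low bits of the flags word; boolean flags
 * are allocated above it. Bit positions are explicit here for clarity. */
#define SKETCH_TYPE_BITS	3			/* enough for types 0..4 */
#define SKETCH_TYPE_MASK	((1U << SKETCH_TYPE_BITS) - 1)

#define SKETCH_ONLY_IF_NEED	(1U << (SKETCH_TYPE_BITS + 0))
#define SKETCH_ALREADY_STARTED	(1U << (SKETCH_TYPE_BITS + 1))

static void node_write(unsigned flags)
{
	unsigned type = flags & SKETCH_TYPE_MASK;

	printf("type %u, only_if_need %d, already_started %d\n",
	       type,
	       !!(flags & SKETCH_ONLY_IF_NEED),
	       !!(flags & SKETCH_ALREADY_STARTED));
}

int main(void)
{
	/* Type value 2 (cache_reclaim in the table earlier in this commit)
	 * combined with the only-if-need flag. */
	node_write(2 | SKETCH_ONLY_IF_NEED);
	return 0;
}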

View File

@ -646,9 +646,9 @@ static inline void __btree_path_level_init(struct btree_path *path,
bch2_btree_node_iter_peek(&l->iter, l->b);
}
inline void bch2_btree_path_level_init(struct btree_trans *trans,
struct btree_path *path,
struct btree *b)
void bch2_btree_path_level_init(struct btree_trans *trans,
struct btree_path *path,
struct btree *b)
{
BUG_ON(path->cached);
@ -1172,11 +1172,10 @@ int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
btree_path_traverse_one(trans, path, flags, _RET_IP_);
}
static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
struct btree_path *src)
{
unsigned i, offset = offsetof(struct btree_path, pos);
int cmp = btree_path_cmp(dst, src);
memcpy((void *) dst + offset,
(void *) src + offset,
@ -1188,9 +1187,6 @@ static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
if (t != BTREE_NODE_UNLOCKED)
six_lock_increment(&dst->l[i].b->c.lock, t);
}
if (cmp)
bch2_btree_path_check_sort_fast(trans, dst, cmp);
}
static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
@ -1203,21 +1199,18 @@ static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btr
return new;
}
__flatten
struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans,
struct btree_path *path, bool intent,
unsigned long ip)
{
if (path->ref > 1 || path->preserve) {
__btree_path_put(path, intent);
path = btree_path_clone(trans, path, intent);
path->preserve = false;
__btree_path_put(path, intent);
path = btree_path_clone(trans, path, intent);
path->preserve = false;
#ifdef CONFIG_BCACHEFS_DEBUG
path->ip_allocated = ip;
path->ip_allocated = ip;
#endif
btree_trans_verify_sorted(trans);
}
path->should_be_locked = false;
btree_trans_verify_sorted(trans);
return path;
}
@ -1554,7 +1547,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
return path;
}
inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
{
struct btree_path_level *l = path_l(path);
@ -2536,6 +2529,18 @@ static inline void btree_path_swap(struct btree_trans *trans,
btree_path_verify_sorted_ref(trans, r);
}
static inline struct btree_path *sib_btree_path(struct btree_trans *trans,
struct btree_path *path, int sib)
{
unsigned idx = (unsigned) path->sorted_idx + sib;
EBUG_ON(sib != -1 && sib != 1);
return idx < trans->nr_sorted
? trans->paths + trans->sorted[idx]
: NULL;
}
static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *trans,
struct btree_path *path,
int cmp)
@ -2545,9 +2550,7 @@ static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *
EBUG_ON(!cmp);
while ((n = cmp < 0
? prev_btree_path(trans, path)
: next_btree_path(trans, path)) &&
while ((n = sib_btree_path(trans, path, cmp)) &&
(cmp2 = btree_path_cmp(n, path)) &&
cmp2 != cmp)
btree_path_swap(trans, n, path);

View File

@ -165,13 +165,12 @@ int __must_check bch2_btree_path_traverse(struct btree_trans *,
struct btree_path *, unsigned);
struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
unsigned, unsigned, unsigned, unsigned long);
inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
struct btree_iter *, struct bpos);
inline void bch2_btree_path_level_init(struct btree_trans *,
struct btree_path *, struct btree *);
void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *);
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_trans_verify_paths(struct btree_trans *);

View File

@ -173,10 +173,9 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
}
if (unlikely(!best)) {
struct bch_fs *c = g->g->trans->c;
struct printbuf buf = PRINTBUF;
bch_err(c, "cycle of nofail locks");
prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
for (i = g->g; i < g->g + g->nr; i++) {
struct btree_trans *trans = i->trans;

View File

@ -77,6 +77,7 @@ struct btree {
u8 nsets;
u8 nr_key_bits;
u16 version_ondisk;
u8 write_type;
struct bkey_format format;

View File

@ -246,6 +246,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
unsigned nr_reserve;
enum alloc_reserve alloc_reserve;
int ret;
if (flags & BTREE_INSERT_USE_RESERVE) {
nr_reserve = 0;
@ -268,7 +269,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
mutex_unlock(&c->btree_reserve_cache_lock);
retry:
wp = bch2_alloc_sectors_start_trans(trans,
ret = bch2_alloc_sectors_start_trans(trans,
c->opts.metadata_target ?:
c->opts.foreground_target,
0,
@ -276,9 +277,9 @@ retry:
&devs_have,
res->nr_replicas,
c->opts.metadata_replicas_required,
alloc_reserve, 0, cl);
if (IS_ERR(wp))
return ERR_CAST(wp);
alloc_reserve, 0, cl, &wp);
if (unlikely(ret))
return ERR_PTR(ret);
if (wp->sectors_free < btree_sectors(c)) {
struct open_bucket *ob;
@ -1178,7 +1179,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
}
if (ret) {
trace_and_count(c, btree_reserve_get_fail, trans->fn, _RET_IP_, nr_nodes[0] + nr_nodes[1]);
trace_and_count(c, btree_reserve_get_fail, trans->fn,
_RET_IP_, nr_nodes[0] + nr_nodes[1], ret);
goto err;
}
@ -1307,6 +1309,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
set_btree_node_dirty_acct(c, b);
set_btree_node_need_write(b);
b->write_type = BTREE_WRITE_interior;
printbuf_exit(&buf);
}

View File

@ -282,6 +282,7 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b,
struct bkey_packed k;
BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
EBUG_ON(btree_node_just_written(b));
if (!bkey_pack_pos(&k, pos, b)) {
struct bkey *u = (void *) &k;

View File

@ -181,6 +181,8 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
new |= 1 << BTREE_NODE_need_write;
} while ((v = cmpxchg(&b->flags, old, new)) != old);
b->write_type = BTREE_WRITE_journal_reclaim;
btree_node_write_if_need(c, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
@ -289,7 +291,7 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s,
return 0;
}
static inline int bch2_trans_journal_res_get(struct btree_trans *trans,
static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
unsigned flags)
{
struct bch_fs *c = trans->c;
@ -721,33 +723,34 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
return ret;
}
static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
{
while (--i >= trans->updates) {
if (same_leaf_as_prev(trans, i))
continue;
bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
}
trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
}
static inline int trans_lock_write(struct btree_trans *trans)
{
struct btree_insert_entry *i;
int ret;
trans_for_each_update(trans, i) {
if (same_leaf_as_prev(trans, i))
continue;
ret = bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c);
if (ret)
goto fail;
if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
return trans_lock_write_fail(trans, i);
bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
}
return 0;
fail:
while (--i >= trans->updates) {
if (same_leaf_as_prev(trans, i))
continue;
bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b);
}
trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
}
static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
@ -758,6 +761,33 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans
bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
}
static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
struct btree_insert_entry *i,
struct printbuf *err)
{
struct bch_fs *c = trans->c;
int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE;
printbuf_reset(err);
prt_printf(err, "invalid bkey on insert from %s -> %ps",
trans->fn, (void *) i->ip_allocated);
prt_newline(err);
printbuf_indent_add(err, 2);
bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k));
prt_newline(err);
bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
i->bkey_type, rw, err);
bch2_print_string_as_lines(KERN_ERR, err->buf);
bch2_inconsistent_error(c);
bch2_dump_trans_updates(trans);
printbuf_exit(err);
return -EINVAL;
}
/*
* Get journal reservation, take write locks, and attempt to do btree update(s):
*/
@ -772,24 +802,9 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE;
trans_for_each_update(trans, i) {
if (bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
i->bkey_type, rw, &buf)) {
printbuf_reset(&buf);
prt_printf(&buf, "invalid bkey on insert from %s -> %ps",
trans->fn, (void *) i->ip_allocated);
prt_newline(&buf);
printbuf_indent_add(&buf, 2);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
prt_newline(&buf);
bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
i->bkey_type, rw, &buf);
bch2_trans_inconsistent(trans, "%s", buf.buf);
printbuf_exit(&buf);
return -EINVAL;
}
if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
i->bkey_type, rw, &buf)))
return bch2_trans_commit_bkey_invalid(trans, i, &buf);
btree_insert_entry_checks(trans, i);
}

View File

@ -1263,23 +1263,24 @@ void fs_usage_apply_warn(struct btree_trans *trans,
struct btree_insert_entry *i;
struct printbuf buf = PRINTBUF;
bch_err(c, "disk usage increased %lli more than %u sectors reserved",
should_not_have_added, disk_res_sectors);
prt_printf(&buf,
bch2_fmt(c, "disk usage increased %lli more than %u sectors reserved)"),
should_not_have_added, disk_res_sectors);
trans_for_each_update(trans, i) {
struct bkey_s_c old = { &i->old_k, i->old_v };
pr_err("while inserting");
printbuf_reset(&buf);
prt_str(&buf, "new ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
pr_err(" %s", buf.buf);
pr_err("overlapping with");
printbuf_reset(&buf);
prt_newline(&buf);
prt_str(&buf, "old ");
bch2_bkey_val_to_text(&buf, c, old);
pr_err(" %s", buf.buf);
prt_newline(&buf);
}
__WARN();
bch2_print_string_as_lines(KERN_ERR, buf.buf);
printbuf_exit(&buf);
}
@ -1949,7 +1950,7 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
#define SECTORS_CACHE 1024
int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
u64 sectors, int flags)
{
struct bch_fs_pcpu *pcpu;

View File

@ -259,15 +259,39 @@ int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
static inline void bch2_disk_reservation_put(struct bch_fs *c,
struct disk_reservation *res)
{
this_cpu_sub(*c->online_reserved, res->sectors);
res->sectors = 0;
if (res->sectors) {
this_cpu_sub(*c->online_reserved, res->sectors);
res->sectors = 0;
}
}
#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
int bch2_disk_reservation_add(struct bch_fs *,
struct disk_reservation *,
u64, int);
int __bch2_disk_reservation_add(struct bch_fs *,
struct disk_reservation *,
u64, int);
static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
u64 sectors, int flags)
{
#ifdef __KERNEL__
u64 old, new;
do {
old = this_cpu_read(c->pcpu->sectors_available);
if (sectors > old)
return __bch2_disk_reservation_add(c, res, sectors, flags);
new = old - sectors;
} while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old);
this_cpu_add(*c->online_reserved, sectors);
res->sectors += sectors;
return 0;
#else
return __bch2_disk_reservation_add(c, res, sectors, flags);
#endif
}
static inline struct disk_reservation
bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
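
bch2_disk_reservation_add() is now an inline fast path: in the kernel build it tries to satisfy the request from the per-CPU sectors_available cache with a this_cpu cmpxchg loop and only falls back to the out-of-line __bch2_disk_reservation_add() when the cache is short (userspace always takes the slow path). The sketch below is a loose, thread-local analog of that cache-then-fallback structure using C11 atomics; CACHE_REFILL, the pool size, and the function names are invented for illustration.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define CACHE_REFILL	1024

static _Atomic uint64_t pool_available = 1 << 20;	/* shared free space */
static _Thread_local uint64_t cache_available;		/* per-thread cache */

/* Slow path: take from the shared pool (topping up the local cache), with
 * a compare-exchange loop so concurrent callers cannot oversubscribe. */
static int reservation_add_slow(uint64_t sectors)
{
	uint64_t want = sectors + CACHE_REFILL;
	uint64_t old = atomic_load(&pool_available);

	do {
		if (old < sectors)
			return -1;		/* out of space */
		if (old < want)
			want = old;
	} while (!atomic_compare_exchange_weak(&pool_available, &old, old - want));

	cache_available += want - sectors;
	return 0;
}

/* Fast path: no shared-memory traffic at all when the local cache covers
 * the request, mirroring the structure of the new inline helper above. */
static inline int reservation_add(uint64_t sectors)
{
	if (sectors > cache_available)
		return reservation_add_slow(sectors);
	cache_available -= sectors;
	return 0;
}

int main(void)
{
	printf("%d %d\n", reservation_add(16), reservation_add(16));
	printf("cached: %llu\n", (unsigned long long) cache_available);
	return 0;
}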

View File

@ -316,7 +316,7 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
return __bch2_checksum_bio(c, type, nonce, bio, &iter);
}
int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
struct nonce nonce, struct bio *bio)
{
struct bio_vec bv;

View File

@ -61,8 +61,16 @@ int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
struct bch_extent_crc_unpacked *,
unsigned, unsigned, unsigned);
int bch2_encrypt_bio(struct bch_fs *, unsigned,
struct nonce, struct bio *);
int __bch2_encrypt_bio(struct bch_fs *, unsigned,
struct nonce, struct bio *);
static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
struct nonce nonce, struct bio *bio)
{
return bch2_csum_type_is_encryption(type)
? __bch2_encrypt_bio(c, type, nonce, bio)
: 0;
}
int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
struct bch_key *);

View File

@ -97,7 +97,7 @@ static void bch2_bkey_mark_dev_cached(struct bkey_s k, unsigned dev)
ptr->cached = true;
}
static int bch2_data_update_index_update(struct bch_write_op *op)
int bch2_data_update_index_update(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct btree_trans trans;
@ -225,7 +225,7 @@ static int bch2_data_update_index_update(struct bch_write_op *op)
bch2_trans_update(&trans, &iter, insert,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(&trans, &op->res,
op_journal_seq(op),
NULL,
BTREE_INSERT_NOFAIL|
m->data_opts.btree_insert_flags);
if (!ret) {
@ -270,8 +270,7 @@ out:
}
void bch2_data_update_read_done(struct data_update *m,
struct bch_extent_crc_unpacked crc,
struct closure *cl)
struct bch_extent_crc_unpacked crc)
{
/* write bio must own pages: */
BUG_ON(!m->op.wbio.bio.bi_vcnt);
@ -279,7 +278,7 @@ void bch2_data_update_read_done(struct data_update *m,
m->op.crc = crc;
m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9;
closure_call(&m->op.cl, bch2_write, NULL, cl);
closure_call(&m->op.cl, bch2_write, NULL, NULL);
}
void bch2_data_update_exit(struct data_update *update)
@ -317,14 +316,13 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m,
m->op.flags |= BCH_WRITE_PAGES_STABLE|
BCH_WRITE_PAGES_OWNED|
BCH_WRITE_DATA_ENCODED|
BCH_WRITE_FROM_INTERNAL|
BCH_WRITE_MOVE|
m->data_opts.write_flags;
m->op.compression_type =
bch2_compression_opt_to_type[io_opts.background_compression ?:
io_opts.compression];
if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
m->op.alloc_reserve = RESERVE_movinggc;
m->op.index_update_fn = bch2_data_update_index_update;
i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {

View File

@ -26,9 +26,10 @@ struct data_update {
struct bch_write_op op;
};
int bch2_data_update_index_update(struct bch_write_op *);
void bch2_data_update_read_done(struct data_update *,
struct bch_extent_crc_unpacked,
struct closure *);
struct bch_extent_crc_unpacked);
void bch2_data_update_exit(struct data_update *);
int bch2_data_update_init(struct bch_fs *, struct data_update *,

View File

@ -125,8 +125,10 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
s->nr++;
}
#ifdef BCACHEFS_LOG_PREFIX
if (!strncmp(fmt, "bcachefs:", 9))
prt_printf(out, bch2_log_msg(c, ""));
#endif
va_start(args, fmt);
prt_vprintf(out, fmt, args);

View File

@ -65,7 +65,6 @@ struct quota_res {
};
struct bch_writepage_io {
struct closure cl;
struct bch_inode_info *inode;
/* must be last: */
@ -73,11 +72,13 @@ struct bch_writepage_io {
};
struct dio_write {
struct completion done;
struct kiocb *req;
struct address_space *mapping;
struct bch_inode_info *inode;
struct mm_struct *mm;
unsigned loop:1,
sync:1,
flush:1,
free_iov:1;
struct quota_res quota_res;
u64 written;
@ -98,7 +99,7 @@ struct dio_read {
};
/* pagecache_block must be held */
static int write_invalidate_inode_pages_range(struct address_space *mapping,
static noinline int write_invalidate_inode_pages_range(struct address_space *mapping,
loff_t start, loff_t end)
{
int ret;
@ -750,25 +751,25 @@ vm_fault_t bch2_page_fault(struct vm_fault *vmf)
if (fdm > mapping) {
struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock))
if (bch2_pagecache_add_tryget(inode))
goto got_lock;
bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock);
bch2_pagecache_block_put(fdm_host);
bch2_pagecache_add_get(&inode->ei_pagecache_lock);
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
bch2_pagecache_add_get(inode);
bch2_pagecache_add_put(inode);
bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock);
bch2_pagecache_block_get(fdm_host);
/* Signal that lock has been dropped: */
set_fdm_dropped_locks();
return VM_FAULT_SIGBUS;
}
bch2_pagecache_add_get(&inode->ei_pagecache_lock);
bch2_pagecache_add_get(inode);
got_lock:
ret = filemap_fault(vmf);
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
bch2_pagecache_add_put(inode);
return ret;
}
@ -796,7 +797,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
* a write_invalidate_inode_pages_range() that works without dropping
* page lock before invalidating page
*/
bch2_pagecache_add_get(&inode->ei_pagecache_lock);
bch2_pagecache_add_get(inode);
lock_page(page);
isize = i_size_read(&inode->v);
@ -829,7 +830,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
wait_for_stable_page(page);
ret = VM_FAULT_LOCKED;
out:
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
bch2_pagecache_add_put(inode);
sb_end_pagefault(inode->v.i_sb);
return ret;
@ -1097,7 +1098,7 @@ void bch2_readahead(struct readahead_control *ractl)
bch2_trans_init(&trans, c, 0, 0);
bch2_pagecache_add_get(&inode->ei_pagecache_lock);
bch2_pagecache_add_get(inode);
while ((page = readpage_iter_next(&readpages_iter))) {
pgoff_t index = readpages_iter.offset + readpages_iter.idx;
@ -1120,7 +1121,7 @@ void bch2_readahead(struct readahead_control *ractl)
&readpages_iter);
}
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
bch2_pagecache_add_put(inode);
bch2_trans_exit(&trans);
kfree(readpages_iter.pages);
@ -1200,18 +1201,10 @@ static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs
};
}
static void bch2_writepage_io_free(struct closure *cl)
static void bch2_writepage_io_done(struct bch_write_op *op)
{
struct bch_writepage_io *io = container_of(cl,
struct bch_writepage_io, cl);
bio_put(&io->op.wbio.bio);
}
static void bch2_writepage_io_done(struct closure *cl)
{
struct bch_writepage_io *io = container_of(cl,
struct bch_writepage_io, cl);
struct bch_writepage_io *io =
container_of(op, struct bch_writepage_io, op);
struct bch_fs *c = io->op.c;
struct bio *bio = &io->op.wbio.bio;
struct bvec_iter_all iter;
@ -1273,7 +1266,7 @@ static void bch2_writepage_io_done(struct closure *cl)
end_page_writeback(bvec->bv_page);
}
closure_return_with_destructor(&io->cl, bch2_writepage_io_free);
bio_put(&io->op.wbio.bio);
}
static void bch2_writepage_do_io(struct bch_writepage_state *w)
@ -1281,8 +1274,7 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w)
struct bch_writepage_io *io = w->io;
w->io = NULL;
closure_call(&io->op.cl, bch2_write, NULL, &io->cl);
continue_at(&io->cl, bch2_writepage_io_done, NULL);
closure_call(&io->op.cl, bch2_write, NULL, NULL);
}
/*
@ -1304,9 +1296,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
&c->writepage_bioset),
struct bch_writepage_io, op.wbio.bio);
closure_init(&w->io->cl, NULL);
w->io->inode = inode;
op = &w->io->op;
bch2_write_op_init(op, c, w->opts);
op->target = w->opts.foreground_target;
@ -1315,6 +1305,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
op->write_point = writepoint_hashed(inode->ei_last_dirtied);
op->subvol = inode->ei_subvol;
op->pos = POS(inode->v.i_ino, sector);
op->end_io = bch2_writepage_io_done;
op->wbio.bio.bi_iter.bi_sector = sector;
op->wbio.bio.bi_opf = wbc_to_write_flags(wbc);
}
@ -1438,7 +1429,8 @@ do_io:
/* Check for writing past i_size: */
WARN_ON_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
round_up(i_size, block_bytes(c)));
round_up(i_size, block_bytes(c)) &&
!test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
w->io->op.res.sectors += reserved_sectors;
w->io->op.i_sectors_delta -= dirty_sectors;
@ -1490,7 +1482,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping,
bch2_page_reservation_init(c, inode, res);
*fsdata = res;
bch2_pagecache_add_get(&inode->ei_pagecache_lock);
bch2_pagecache_add_get(inode);
page = grab_cache_page_write_begin(mapping, index);
if (!page)
@ -1547,7 +1539,7 @@ err:
put_page(page);
*pagep = NULL;
err_unlock:
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
bch2_pagecache_add_put(inode);
kfree(res);
*fsdata = NULL;
return bch2_err_class(ret);
@ -1591,7 +1583,7 @@ int bch2_write_end(struct file *file, struct address_space *mapping,
unlock_page(page);
put_page(page);
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
bch2_pagecache_add_put(inode);
bch2_page_reservation_put(c, inode, res);
kfree(res);
@ -1760,7 +1752,7 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
ssize_t written = 0;
int ret = 0;
bch2_pagecache_add_get(&inode->ei_pagecache_lock);
bch2_pagecache_add_get(inode);
do {
unsigned offset = pos & (PAGE_SIZE - 1);
@ -1818,7 +1810,7 @@ again:
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(iter));
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
bch2_pagecache_add_put(inode);
return written ? written : ret;
}
@ -1981,11 +1973,13 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
if (iocb->ki_flags & IOCB_DIRECT) {
struct blk_plug plug;
ret = filemap_write_and_wait_range(mapping,
iocb->ki_pos,
iocb->ki_pos + count - 1);
if (ret < 0)
goto out;
if (unlikely(mapping->nrpages)) {
ret = filemap_write_and_wait_range(mapping,
iocb->ki_pos,
iocb->ki_pos + count - 1);
if (ret < 0)
goto out;
}
file_accessed(file);
@ -1996,9 +1990,9 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
if (ret >= 0)
iocb->ki_pos += ret;
} else {
bch2_pagecache_add_get(&inode->ei_pagecache_lock);
bch2_pagecache_add_get(inode);
ret = generic_file_read_iter(iocb, iter);
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
bch2_pagecache_add_put(inode);
}
out:
return bch2_err_class(ret);
@ -2050,31 +2044,154 @@ err:
return err ? false : ret;
}
static void bch2_dio_write_loop_async(struct bch_write_op *);
static long bch2_dio_write_loop(struct dio_write *dio)
static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct bch_fs *c = dio->op.c;
struct bch_inode_info *inode = dio->inode;
struct bio *bio = &dio->op.wbio.bio;
return bch2_check_range_allocated(c, inode_inum(inode),
dio->op.pos.offset, bio_sectors(bio),
dio->op.opts.data_replicas,
dio->op.opts.compression != 0);
}
static void bch2_dio_write_loop_async(struct bch_write_op *);
static __always_inline long bch2_dio_write_done(struct dio_write *dio);
static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
{
struct iovec *iov = dio->inline_vecs;
if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
GFP_KERNEL);
if (unlikely(!iov))
return -ENOMEM;
dio->free_iov = true;
}
memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov));
dio->iter.iov = iov;
return 0;
}
static void bch2_dio_write_flush_done(struct closure *cl)
{
struct dio_write *dio = container_of(cl, struct dio_write, op.cl);
struct bch_fs *c = dio->op.c;
closure_debug_destroy(cl);
dio->op.error = bch2_journal_error(&c->journal);
bch2_dio_write_done(dio);
}
static noinline void bch2_dio_write_flush(struct dio_write *dio)
{
struct bch_fs *c = dio->op.c;
struct bch_inode_unpacked inode;
int ret;
dio->flush = 0;
closure_init(&dio->op.cl, NULL);
if (!dio->op.error) {
ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
if (ret)
dio->op.error = ret;
else
bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl);
}
if (dio->sync) {
closure_sync(&dio->op.cl);
closure_debug_destroy(&dio->op.cl);
} else {
continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
}
}
static __always_inline long bch2_dio_write_done(struct dio_write *dio)
{
struct bch_fs *c = dio->op.c;
struct kiocb *req = dio->req;
struct address_space *mapping = req->ki_filp->f_mapping;
struct bch_inode_info *inode = file_bch_inode(req->ki_filp);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_inode_info *inode = dio->inode;
bool sync = dio->sync;
long ret;
if (unlikely(dio->flush)) {
bch2_dio_write_flush(dio);
if (!sync)
return -EIOCBQUEUED;
}
bch2_pagecache_block_put(inode);
bch2_quota_reservation_put(c, inode, &dio->quota_res);
if (dio->free_iov)
kfree(dio->iter.iov);
ret = dio->op.error ?: ((long) dio->written << 9);
bio_put(&dio->op.wbio.bio);
/* inode->i_dio_count is our ref on inode and thus bch_fs */
inode_dio_end(&inode->v);
if (ret < 0)
ret = bch2_err_class(ret);
if (!sync) {
req->ki_complete(req, ret);
ret = -EIOCBQUEUED;
}
return ret;
}
static __always_inline void bch2_dio_write_end(struct dio_write *dio)
{
struct bch_fs *c = dio->op.c;
struct kiocb *req = dio->req;
struct bch_inode_info *inode = dio->inode;
struct bio *bio = &dio->op.wbio.bio;
struct bvec_iter_all iter;
struct bio_vec *bv;
i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
req->ki_pos += (u64) dio->op.written << 9;
dio->written += dio->op.written;
spin_lock(&inode->v.i_lock);
if (req->ki_pos > inode->v.i_size)
i_size_write(&inode->v, req->ki_pos);
spin_unlock(&inode->v.i_lock);
if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF)))
bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
if (unlikely(dio->op.error))
set_bit(EI_INODE_ERROR, &inode->ei_flags);
}
static long bch2_dio_write_loop(struct dio_write *dio)
{
struct bch_fs *c = dio->op.c;
struct kiocb *req = dio->req;
struct address_space *mapping = dio->mapping;
struct bch_inode_info *inode = dio->inode;
struct bio *bio = &dio->op.wbio.bio;
unsigned unaligned, iter_count;
bool sync = dio->sync, dropped_locks;
long ret;
if (dio->loop)
goto loop;
while (1) {
iter_count = dio->iter.count;
if (kthread && dio->mm)
kthread_use_mm(dio->mm);
BUG_ON(current->faults_disabled_mapping);
EBUG_ON(current->faults_disabled_mapping);
current->faults_disabled_mapping = mapping;
ret = bio_iov_iter_get_pages(bio, &dio->iter);
@ -2082,8 +2199,6 @@ static long bch2_dio_write_loop(struct dio_write *dio)
dropped_locks = fdm_dropped_locks();
current->faults_disabled_mapping = NULL;
if (kthread && dio->mm)
kthread_unuse_mm(dio->mm);
/*
* If the fault handler returned an error but also signalled
@ -2121,116 +2236,80 @@ static long bch2_dio_write_loop(struct dio_write *dio)
}
bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode));
dio->op.end_io = bch2_dio_write_loop_async;
dio->op.end_io = sync
? NULL
: bch2_dio_write_loop_async;
dio->op.target = dio->op.opts.foreground_target;
dio->op.write_point = writepoint_hashed((unsigned long) current);
dio->op.nr_replicas = dio->op.opts.data_replicas;
dio->op.subvol = inode->ei_subvol;
dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
if ((req->ki_flags & IOCB_DSYNC) &&
!c->opts.journal_flush_disabled)
dio->op.flags |= BCH_WRITE_FLUSH;
if (sync)
dio->op.flags |= BCH_WRITE_SYNC;
dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
dio->op.opts.data_replicas, 0);
if (unlikely(ret) &&
!bch2_check_range_allocated(c, inode_inum(inode),
dio->op.pos.offset, bio_sectors(bio),
dio->op.opts.data_replicas,
dio->op.opts.compression != 0))
!bch2_dio_write_check_allocated(dio))
goto err;
task_io_account_write(bio->bi_iter.bi_size);
if (!dio->sync && !dio->loop && dio->iter.count) {
struct iovec *iov = dio->inline_vecs;
if (unlikely(dio->iter.count) &&
!dio->sync &&
!dio->loop &&
bch2_dio_write_copy_iov(dio))
dio->sync = sync = true;
if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
GFP_KERNEL);
if (unlikely(!iov)) {
dio->sync = sync = true;
goto do_io;
}
dio->free_iov = true;
}
memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov));
dio->iter.iov = iov;
}
do_io:
dio->loop = true;
closure_call(&dio->op.cl, bch2_write, NULL, NULL);
if (sync)
wait_for_completion(&dio->done);
else
if (!sync)
return -EIOCBQUEUED;
loop:
i_sectors_acct(c, inode, &dio->quota_res,
dio->op.i_sectors_delta);
req->ki_pos += (u64) dio->op.written << 9;
dio->written += dio->op.written;
spin_lock(&inode->v.i_lock);
if (req->ki_pos > inode->v.i_size)
i_size_write(&inode->v, req->ki_pos);
spin_unlock(&inode->v.i_lock);
bch2_dio_write_end(dio);
if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF)))
bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
bio->bi_vcnt = 0;
if (dio->op.error) {
set_bit(EI_INODE_ERROR, &inode->ei_flags);
break;
}
if (!dio->iter.count)
if (likely(!dio->iter.count) || dio->op.error)
break;
bio_reset(bio, NULL, REQ_OP_WRITE);
reinit_completion(&dio->done);
}
ret = dio->op.error ?: ((long) dio->written << 9);
out:
return bch2_dio_write_done(dio);
err:
bch2_pagecache_block_put(&inode->ei_pagecache_lock);
bch2_quota_reservation_put(c, inode, &dio->quota_res);
dio->op.error = ret;
if (dio->free_iov)
kfree(dio->iter.iov);
if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
struct bvec_iter_all iter;
struct bio_vec *bv;
if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF)))
bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
bio_put(bio);
/* inode->i_dio_count is our ref on inode and thus bch_fs */
inode_dio_end(&inode->v);
if (ret < 0)
ret = bch2_err_class(ret);
if (!sync) {
req->ki_complete(req, ret);
ret = -EIOCBQUEUED;
}
return ret;
goto out;
}
static void bch2_dio_write_loop_async(struct bch_write_op *op)
{
struct dio_write *dio = container_of(op, struct dio_write, op);
struct mm_struct *mm = dio->mm;
if (dio->sync)
complete(&dio->done);
else
bch2_dio_write_loop(dio);
bch2_dio_write_end(dio);
if (likely(!dio->iter.count) || dio->op.error) {
bch2_dio_write_done(dio);
return;
}
bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
if (mm)
kthread_use_mm(mm);
bch2_dio_write_loop(dio);
if (mm)
kthread_unuse_mm(mm);
}
static noinline
@ -2268,7 +2347,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
goto err;
inode_dio_begin(&inode->v);
bch2_pagecache_block_get(&inode->ei_pagecache_lock);
bch2_pagecache_block_get(inode);
extending = req->ki_pos + iter->count > inode->v.i_size;
if (!extending) {
@ -2282,26 +2361,31 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
GFP_KERNEL,
&c->dio_write_bioset);
dio = container_of(bio, struct dio_write, op.wbio.bio);
init_completion(&dio->done);
dio->req = req;
dio->mapping = mapping;
dio->inode = inode;
dio->mm = current->mm;
dio->loop = false;
dio->sync = is_sync_kiocb(req) || extending;
dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
dio->free_iov = false;
dio->quota_res.sectors = 0;
dio->written = 0;
dio->iter = *iter;
dio->op.c = c;
ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
iter->count >> 9, true);
if (unlikely(ret))
goto err_put_bio;
ret = write_invalidate_inode_pages_range(mapping,
req->ki_pos,
req->ki_pos + iter->count - 1);
if (unlikely(ret))
goto err_put_bio;
if (unlikely(mapping->nrpages)) {
ret = write_invalidate_inode_pages_range(mapping,
req->ki_pos,
req->ki_pos + iter->count - 1);
if (unlikely(ret))
goto err_put_bio;
}
ret = bch2_dio_write_loop(dio);
err:
@ -2309,7 +2393,7 @@ err:
inode_unlock(&inode->v);
return ret;
err_put_bio:
bch2_pagecache_block_put(&inode->ei_pagecache_lock);
bch2_pagecache_block_put(inode);
bch2_quota_reservation_put(c, inode, &dio->quota_res);
bio_put(bio);
inode_dio_end(&inode->v);
@ -2613,7 +2697,7 @@ int bch2_truncate(struct user_namespace *mnt_userns,
}
inode_dio_wait(&inode->v);
bch2_pagecache_block_get(&inode->ei_pagecache_lock);
bch2_pagecache_block_get(inode);
ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
if (ret)
@ -2692,7 +2776,7 @@ int bch2_truncate(struct user_namespace *mnt_userns,
ret = bch2_setattr_nonsize(mnt_userns, inode, iattr);
err:
bch2_pagecache_block_put(&inode->ei_pagecache_lock);
bch2_pagecache_block_put(inode);
return bch2_err_class(ret);
}
@ -3005,8 +3089,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
}
ret = bch2_extent_update(&trans, inode_inum(inode), &iter,
&reservation.k_i,
&disk_res, NULL,
&reservation.k_i, &disk_res,
0, &i_sectors_delta, true);
if (ret)
goto bkey_err;
@ -3105,7 +3188,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
inode_lock(&inode->v);
inode_dio_wait(&inode->v);
bch2_pagecache_block_get(&inode->ei_pagecache_lock);
bch2_pagecache_block_get(inode);
ret = file_modified(file);
if (ret)
@ -3122,7 +3205,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
else
ret = -EOPNOTSUPP;
err:
bch2_pagecache_block_put(&inode->ei_pagecache_lock);
bch2_pagecache_block_put(inode);
inode_unlock(&inode->v);
percpu_ref_put(&c->writes);

View File

@ -43,58 +43,6 @@ static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
struct bch_subvolume *);
static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
{
BUG_ON(atomic_long_read(&lock->v) == 0);
if (atomic_long_sub_return_release(i, &lock->v) == 0)
wake_up_all(&lock->wait);
}
static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i)
{
long v = atomic_long_read(&lock->v), old;
do {
old = v;
if (i > 0 ? v < 0 : v > 0)
return false;
} while ((v = atomic_long_cmpxchg_acquire(&lock->v,
old, old + i)) != old);
return true;
}
static void __pagecache_lock_get(struct pagecache_lock *lock, long i)
{
wait_event(lock->wait, __pagecache_lock_tryget(lock, i));
}
void bch2_pagecache_add_put(struct pagecache_lock *lock)
{
__pagecache_lock_put(lock, 1);
}
bool bch2_pagecache_add_tryget(struct pagecache_lock *lock)
{
return __pagecache_lock_tryget(lock, 1);
}
void bch2_pagecache_add_get(struct pagecache_lock *lock)
{
__pagecache_lock_get(lock, 1);
}
void bch2_pagecache_block_put(struct pagecache_lock *lock)
{
__pagecache_lock_put(lock, -1);
}
void bch2_pagecache_block_get(struct pagecache_lock *lock)
{
__pagecache_lock_get(lock, -1);
}
void bch2_inode_update_after_write(struct btree_trans *trans,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
@ -1409,7 +1357,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
inode_init_once(&inode->v);
mutex_init(&inode->ei_update_lock);
pagecache_lock_init(&inode->ei_pagecache_lock);
two_state_lock_init(&inode->ei_pagecache_lock);
mutex_init(&inode->ei_quota_lock);
return &inode->v;

View File

@ -6,31 +6,11 @@
#include "opts.h"
#include "str_hash.h"
#include "quota_types.h"
#include "two_state_shared_lock.h"
#include <linux/seqlock.h>
#include <linux/stat.h>
/*
* Two-state lock - can be taken for add or block - both states are shared,
* like read side of rwsem, but conflict with other state:
*/
struct pagecache_lock {
atomic_long_t v;
wait_queue_head_t wait;
};
static inline void pagecache_lock_init(struct pagecache_lock *lock)
{
atomic_long_set(&lock->v, 0);
init_waitqueue_head(&lock->wait);
}
void bch2_pagecache_add_put(struct pagecache_lock *);
bool bch2_pagecache_add_tryget(struct pagecache_lock *);
void bch2_pagecache_add_get(struct pagecache_lock *);
void bch2_pagecache_block_put(struct pagecache_lock *);
void bch2_pagecache_block_get(struct pagecache_lock *);
struct bch_inode_info {
struct inode v;
unsigned long ei_flags;
@ -39,7 +19,7 @@ struct bch_inode_info {
u64 ei_quota_reserved;
unsigned long ei_last_dirtied;
struct pagecache_lock ei_pagecache_lock;
two_state_lock_t ei_pagecache_lock;
struct mutex ei_quota_lock;
struct bch_qid ei_qid;
@ -50,6 +30,13 @@ struct bch_inode_info {
struct bch_inode_unpacked ei_inode;
};
#define bch2_pagecache_add_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 0)
#define bch2_pagecache_add_tryget(i) bch2_two_state_trylock(&i->ei_pagecache_lock, 0)
#define bch2_pagecache_add_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 0)
#define bch2_pagecache_block_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 1)
#define bch2_pagecache_block_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 1)
static inline subvol_inum inode_inum(struct bch_inode_info *inode)
{
return (subvol_inum) {
@ -96,7 +83,7 @@ do { \
if ((_locks) & INODE_LOCK) \
down_write_nested(&a[i]->v.i_rwsem, i); \
if ((_locks) & INODE_PAGECACHE_BLOCK) \
bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\
bch2_pagecache_block_get(a[i]);\
if ((_locks) & INODE_UPDATE_LOCK) \
mutex_lock_nested(&a[i]->ei_update_lock, i);\
} \
@ -114,7 +101,7 @@ do { \
if ((_locks) & INODE_LOCK) \
up_write(&a[i]->v.i_rwsem); \
if ((_locks) & INODE_PAGECACHE_BLOCK) \
bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\
bch2_pagecache_block_put(a[i]);\
if ((_locks) & INODE_UPDATE_LOCK) \
mutex_unlock(&a[i]->ei_update_lock); \
} \

View File

@ -16,6 +16,7 @@
#include "checksum.h"
#include "compress.h"
#include "clock.h"
#include "data_update.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
@ -237,12 +238,14 @@ int bch2_extent_update(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *k,
struct disk_reservation *disk_res,
u64 *journal_seq,
u64 new_i_size,
s64 *i_sectors_delta_total,
bool check_enospc)
{
struct btree_iter inode_iter = { NULL };
struct bkey_s_c inode_k;
struct bkey_s_c_inode_v3 inode;
struct bkey_i_inode_v3 *new_inode;
struct bpos next_pos;
bool usage_increasing;
s64 i_sectors_delta = 0, disk_sectors_delta = 0;
@ -282,59 +285,51 @@ int bch2_extent_update(struct btree_trans *trans,
return ret;
}
if (new_i_size || i_sectors_delta) {
struct bkey_s_c k;
struct bkey_s_c_inode_v3 inode;
struct bkey_i_inode_v3 *new_inode;
bool i_size_update;
bch2_trans_iter_init(trans, &inode_iter, BTREE_ID_inodes,
SPOS(0, inum.inum, iter->snapshot),
BTREE_ITER_INTENT|BTREE_ITER_CACHED);
inode_k = bch2_btree_iter_peek_slot(&inode_iter);
ret = bkey_err(inode_k);
if (unlikely(ret))
goto err;
bch2_trans_iter_init(trans, &inode_iter, BTREE_ID_inodes,
SPOS(0, inum.inum, iter->snapshot),
BTREE_ITER_INTENT|BTREE_ITER_CACHED);
k = bch2_btree_iter_peek_slot(&inode_iter);
ret = bkey_err(k);
if (unlikely(ret))
goto err;
ret = bkey_is_inode(inode_k.k) ? 0 : -ENOENT;
if (unlikely(ret))
goto err;
ret = bkey_is_inode(k.k) ? 0 : -ENOENT;
if (unlikely(ret))
goto err;
if (unlikely(k.k->type != KEY_TYPE_inode_v3)) {
k = bch2_inode_to_v3(trans, k);
ret = bkey_err(k);
if (unlikely(ret))
goto err;
}
inode = bkey_s_c_to_inode_v3(k);
i_size_update = !(le64_to_cpu(inode.v->bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
new_i_size > le64_to_cpu(inode.v->bi_size);
if (!i_sectors_delta && !i_size_update)
goto no_inode_update;
new_inode = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
ret = PTR_ERR_OR_ZERO(new_inode);
if (unlikely(ret))
goto err;
bkey_reassemble(&new_inode->k_i, k);
if (i_size_update)
new_inode->v.bi_size = cpu_to_le64(new_i_size);
le64_add_cpu(&new_inode->v.bi_sectors, i_sectors_delta);
new_inode->k.p.snapshot = iter->snapshot;
ret = bch2_trans_update(trans, &inode_iter, &new_inode->k_i, 0);
if (unlikely(inode_k.k->type != KEY_TYPE_inode_v3)) {
inode_k = bch2_inode_to_v3(trans, inode_k);
ret = bkey_err(inode_k);
if (unlikely(ret))
goto err;
}
no_inode_update:
ret = bch2_trans_update(trans, iter, k, 0) ?:
bch2_trans_commit(trans, disk_res, journal_seq,
inode = bkey_s_c_to_inode_v3(inode_k);
new_inode = bch2_trans_kmalloc(trans, bkey_bytes(inode_k.k));
ret = PTR_ERR_OR_ZERO(new_inode);
if (unlikely(ret))
goto err;
bkey_reassemble(&new_inode->k_i, inode.s_c);
if (!(le64_to_cpu(inode.v->bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
new_i_size > le64_to_cpu(inode.v->bi_size))
new_inode->v.bi_size = cpu_to_le64(new_i_size);
le64_add_cpu(&new_inode->v.bi_sectors, i_sectors_delta);
new_inode->k.p.snapshot = iter->snapshot;
/*
* Note:
 * We always have to do an inode update - even when i_size/i_sectors
* aren't changing - for fsync to work properly; fsync relies on
* inode->bi_journal_seq which is updated by the trigger code:
*/
ret = bch2_trans_update(trans, &inode_iter, &new_inode->k_i, 0) ?:
bch2_trans_update(trans, iter, k, 0) ?:
bch2_trans_commit(trans, disk_res, NULL,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL);
if (unlikely(ret))
@ -397,8 +392,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
bch2_cut_back(end_pos, &delete);
ret = bch2_extent_update(trans, inum, iter, &delete,
&disk_res, NULL,
0, i_sectors_delta, false);
&disk_res, 0, i_sectors_delta, false);
bch2_disk_reservation_put(c, &disk_res);
}
@ -428,7 +422,7 @@ int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
return ret;
}
int bch2_write_index_default(struct bch_write_op *op)
static int bch2_write_index_default(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct bkey_buf sk;
@ -465,7 +459,7 @@ int bch2_write_index_default(struct bch_write_op *op)
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
ret = bch2_extent_update(&trans, inum, &iter, sk.k,
&op->res, op_journal_seq(op),
&op->res,
op->new_i_size, &op->i_sectors_delta,
op->flags & BCH_WRITE_CHECK_ENOSPC);
bch2_trans_iter_exit(&trans, &iter);
@ -543,29 +537,22 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
}
}
static void __bch2_write(struct closure *);
static void __bch2_write(struct bch_write_op *);
static void bch2_write_done(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
if (!op->error && (op->flags & BCH_WRITE_FLUSH))
op->error = bch2_journal_error(&c->journal);
bch2_disk_reservation_put(c, &op->res);
percpu_ref_put(&c->writes);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
if (op->end_io) {
EBUG_ON(cl->parent);
closure_debug_destroy(cl);
closure_debug_destroy(cl);
if (op->end_io)
op->end_io(op);
} else {
closure_return(cl);
}
}
static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
@ -603,7 +590,7 @@ static void __bch2_write_index(struct bch_write_op *op)
struct keylist *keys = &op->insert_keys;
struct bkey_i *k;
unsigned dev;
int ret;
int ret = 0;
if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
ret = bch2_write_drop_io_error_ptrs(op);
@ -626,7 +613,10 @@ static void __bch2_write_index(struct bch_write_op *op)
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);
int ret = op->index_update_fn(op);
ret = !(op->flags & BCH_WRITE_MOVE)
? bch2_write_index_default(op)
: bch2_data_update_index_update(op);
BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
BUG_ON(keylist_sectors(keys) && !ret);
@ -636,7 +626,7 @@ static void __bch2_write_index(struct bch_write_op *op)
if (ret) {
bch_err_inum_ratelimited(c, op->pos.inode,
"write error while doing btree update: %s", bch2_err_str(ret));
op->error = ret;
goto err;
}
}
out:
@ -649,25 +639,45 @@ out:
err:
keys->top = keys->keys;
op->error = ret;
op->flags |= BCH_WRITE_DONE;
goto out;
}
static void bch2_write_index(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
struct write_point *wp = op->wp;
struct workqueue_struct *wq = index_update_wq(op);
__bch2_write_index(op);
barrier();
op->btree_update_ready = true;
queue_work(wq, &wp->index_update_work);
}
if (!(op->flags & BCH_WRITE_DONE)) {
continue_at(cl, __bch2_write, index_update_wq(op));
} else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
bch2_journal_flush_seq_async(&c->journal,
*op_journal_seq(op),
cl);
continue_at(cl, bch2_write_done, index_update_wq(op));
} else {
continue_at_nobarrier(cl, bch2_write_done, NULL);
void bch2_write_point_do_index_updates(struct work_struct *work)
{
struct write_point *wp =
container_of(work, struct write_point, index_update_work);
struct bch_write_op *op;
while (1) {
spin_lock(&wp->writes_lock);
op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
if (op && !op->btree_update_ready)
op = NULL;
if (op)
list_del(&op->wp_list);
spin_unlock(&wp->writes_lock);
if (!op)
break;
__bch2_write_index(op);
if (!(op->flags & BCH_WRITE_DONE))
__bch2_write(op);
else
bch2_write_done(&op->cl);
}
}
@ -700,12 +710,12 @@ static void bch2_write_endio(struct bio *bio)
if (wbio->put_bio)
bio_put(bio);
if (parent)
if (parent) {
bio_endio(&parent->bio);
else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT))
closure_put(cl);
else
continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op));
return;
}
closure_put(cl);
}
static void init_append_extent(struct bch_write_op *op,
@ -1112,19 +1122,18 @@ err:
return ret;
}
static void __bch2_write(struct closure *cl)
static void __bch2_write(struct bch_write_op *op)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
struct write_point *wp;
struct write_point *wp = NULL;
struct bio *bio = NULL;
bool skip_put = true;
unsigned nofs_flags;
int ret;
nofs_flags = memalloc_nofs_save();
again:
memset(&op->failed, 0, sizeof(op->failed));
op->btree_update_ready = false;
do {
struct bkey_i *key_to_write;
@ -1134,76 +1143,60 @@ again:
/* +1 for possible cache device: */
if (op->open_buckets.nr + op->nr_replicas + 1 >
ARRAY_SIZE(op->open_buckets.v))
goto flush_io;
break;
if (bch2_keylist_realloc(&op->insert_keys,
op->inline_keys,
ARRAY_SIZE(op->inline_keys),
BKEY_EXTENT_U64s_MAX))
goto flush_io;
break;
/*
* The copygc thread is now global, which means it's no longer
* freeing up space on specific disks, which means that
* allocations for specific disks may hang arbitrarily long:
*/
wp = bch2_alloc_sectors_start(c,
op->target,
op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
op->write_point,
&op->devs_have,
op->nr_replicas,
op->nr_replicas_required,
op->alloc_reserve,
op->flags,
(op->flags & (BCH_WRITE_ALLOC_NOWAIT|
BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl);
EBUG_ON(!wp);
if (IS_ERR(wp)) {
if (unlikely(wp != ERR_PTR(-EAGAIN))) {
ret = PTR_ERR(wp);
goto err;
ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_alloc_sectors_start_trans(&trans,
op->target,
op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
op->write_point,
&op->devs_have,
op->nr_replicas,
op->nr_replicas_required,
op->alloc_reserve,
op->flags,
(op->flags & (BCH_WRITE_ALLOC_NOWAIT|
BCH_WRITE_ONLY_SPECIFIED_DEVS))
? NULL : &op->cl, &wp));
if (unlikely(ret)) {
if (unlikely(ret != -EAGAIN)) {
op->error = ret;
op->flags |= BCH_WRITE_DONE;
}
goto flush_io;
break;
}
/*
* It's possible for the allocator to fail, put us on the
* freelist waitlist, and then succeed in one of various retry
* paths: if that happens, we need to disable the skip_put
* optimization because otherwise there won't necessarily be a
* barrier before we free the bch_write_op:
*/
if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
skip_put = false;
bch2_open_bucket_get(c, wp, &op->open_buckets);
ret = bch2_write_extent(op, wp, &bio);
bch2_alloc_sectors_done(c, wp);
if (ret < 0)
goto err;
if (ret) {
skip_put = false;
} else {
/*
* for the skip_put optimization this has to be set
* before we submit the bio:
*/
if (ret < 0) {
op->error = ret;
op->flags |= BCH_WRITE_DONE;
break;
}
if (!ret)
op->flags |= BCH_WRITE_DONE;
bio->bi_end_io = bch2_write_endio;
bio->bi_private = &op->cl;
bio->bi_opf |= REQ_OP_WRITE;
if (!skip_put)
closure_get(bio->bi_private);
else
op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT;
closure_get(bio->bi_private);
key_to_write = (void *) (op->insert_keys.keys_p +
key_to_write_offset);
@ -1212,48 +1205,34 @@ again:
key_to_write);
} while (ret);
if (!skip_put)
continue_at(cl, bch2_write_index, index_update_wq(op));
out:
memalloc_nofs_restore(nofs_flags);
return;
err:
op->error = ret;
op->flags |= BCH_WRITE_DONE;
continue_at(cl, bch2_write_index, index_update_wq(op));
goto out;
flush_io:
/*
* If the write can't all be submitted at once, we generally want to
* block synchronously as that signals backpressure to the caller.
* Sync or no?
*
* However, if we're running out of a workqueue, we can't block here
* because we'll be blocking other work items from completing:
 * If we're running asynchronously, we may still want to block
* synchronously here if we weren't able to submit all of the IO at
* once, as that signals backpressure to the caller.
*/
if (current->flags & PF_WQ_WORKER) {
continue_at(cl, bch2_write_index, index_update_wq(op));
goto out;
}
closure_sync(cl);
if (!bch2_keylist_empty(&op->insert_keys)) {
if ((op->flags & BCH_WRITE_SYNC) || !(op->flags & BCH_WRITE_DONE)) {
closure_sync(&op->cl);
__bch2_write_index(op);
if (op->error) {
op->flags |= BCH_WRITE_DONE;
continue_at_nobarrier(cl, bch2_write_done, NULL);
goto out;
}
if (!(op->flags & BCH_WRITE_DONE))
goto again;
bch2_write_done(&op->cl);
} else {
spin_lock(&wp->writes_lock);
op->wp = wp;
list_add_tail(&op->wp_list, &wp->writes);
spin_unlock(&wp->writes_lock);
continue_at(&op->cl, bch2_write_index, NULL);
}
goto again;
memalloc_nofs_restore(nofs_flags);
}
static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
{
struct closure *cl = &op->cl;
struct bio *bio = &op->wbio.bio;
struct bvec_iter iter;
struct bkey_i_inline_data *id;
@ -1290,8 +1269,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
op->flags |= BCH_WRITE_DONE;
continue_at_nobarrier(cl, bch2_write_index, NULL);
return;
__bch2_write_index(op);
err:
bch2_write_done(&op->cl);
}
@ -1319,6 +1297,7 @@ void bch2_write(struct closure *cl)
struct bch_fs *c = op->c;
unsigned data_len;
EBUG_ON(op->cl.parent);
BUG_ON(!op->nr_replicas);
BUG_ON(!op->write_point.v);
BUG_ON(!bkey_cmp(op->pos, POS_MAX));
@ -1352,24 +1331,19 @@ void bch2_write(struct closure *cl)
return;
}
continue_at_nobarrier(cl, __bch2_write, NULL);
__bch2_write(op);
return;
err:
bch2_disk_reservation_put(c, &op->res);
if (op->end_io) {
EBUG_ON(cl->parent);
closure_debug_destroy(cl);
closure_debug_destroy(&op->cl);
if (op->end_io)
op->end_io(op);
} else {
closure_return(cl);
}
}
/* Cache promotion on read */
struct promote_op {
struct closure cl;
struct rcu_head rcu;
u64 start_time;
@ -1423,10 +1397,10 @@ static void promote_free(struct bch_fs *c, struct promote_op *op)
kfree_rcu(op, rcu);
}
static void promote_done(struct closure *cl)
static void promote_done(struct bch_write_op *wop)
{
struct promote_op *op =
container_of(cl, struct promote_op, cl);
container_of(wop, struct promote_op, write.op);
struct bch_fs *c = op->write.op.c;
bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
@ -1438,7 +1412,6 @@ static void promote_done(struct closure *cl)
static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
{
struct closure *cl = &op->cl;
struct bio *bio = &op->write.op.wbio.bio;
trace_and_count(op->write.op.c, read_promote, &rbio->bio);
@ -1451,9 +1424,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
closure_init(cl, NULL);
bch2_data_update_read_done(&op->write, rbio->pick.crc, cl);
closure_return_with_destructor(cl, promote_done);
bch2_data_update_read_done(&op->write, rbio->pick.crc);
}
static struct promote_op *__promote_alloc(struct bch_fs *c,
@ -1518,6 +1489,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
},
btree_id, k);
BUG_ON(ret);
op->write.op.end_io = promote_done;
return op;
err:

View File

@ -27,28 +27,20 @@ const char *bch2_blk_status_to_str(blk_status_t);
enum bch_write_flags {
BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
BCH_WRITE_CACHED = (1 << 1),
BCH_WRITE_FLUSH = (1 << 2),
BCH_WRITE_DATA_ENCODED = (1 << 3),
BCH_WRITE_PAGES_STABLE = (1 << 4),
BCH_WRITE_PAGES_OWNED = (1 << 5),
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6),
BCH_WRITE_WROTE_DATA_INLINE = (1 << 7),
BCH_WRITE_FROM_INTERNAL = (1 << 8),
BCH_WRITE_CHECK_ENOSPC = (1 << 9),
BCH_WRITE_DATA_ENCODED = (1 << 2),
BCH_WRITE_PAGES_STABLE = (1 << 3),
BCH_WRITE_PAGES_OWNED = (1 << 4),
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 5),
BCH_WRITE_WROTE_DATA_INLINE = (1 << 6),
BCH_WRITE_CHECK_ENOSPC = (1 << 7),
BCH_WRITE_SYNC = (1 << 8),
BCH_WRITE_MOVE = (1 << 9),
/* Internal: */
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10),
BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11),
BCH_WRITE_DONE = (1 << 12),
BCH_WRITE_IO_ERROR = (1 << 13),
BCH_WRITE_DONE = (1 << 10),
BCH_WRITE_IO_ERROR = (1 << 11),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)
{
return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR)
? op->journal_seq_p : &op->journal_seq;
}
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{
return op->alloc_reserve == RESERVE_movinggc
@ -60,14 +52,12 @@ int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
struct bkey_i *, bool *, s64 *, s64 *);
int bch2_extent_update(struct btree_trans *, subvol_inum,
struct btree_iter *, struct bkey_i *,
struct disk_reservation *, u64 *, u64, s64 *, bool);
struct disk_reservation *, u64, s64 *, bool);
int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
subvol_inum, u64, s64 *);
int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
int bch2_write_index_default(struct bch_write_op *);
static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
struct bch_io_opts opts)
{
@ -91,14 +81,14 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
op->version = ZERO_VERSION;
op->write_point = (struct write_point_specifier) { 0 };
op->res = (struct disk_reservation) { 0 };
op->journal_seq = 0;
op->new_i_size = U64_MAX;
op->i_sectors_delta = 0;
op->index_update_fn = bch2_write_index_default;
}
void bch2_write(struct closure *);
void bch2_write_point_do_index_updates(struct work_struct *);
static inline struct bch_write_bio *wbio_init(struct bio *bio)
{
struct bch_write_bio *wbio = to_wbio(bio);

View File

@ -117,6 +117,7 @@ struct bch_write_op {
unsigned nr_replicas_required:4;
unsigned alloc_reserve:3;
unsigned incompressible:1;
unsigned btree_update_ready:1;
struct bch_devs_list devs_have;
u16 target;
@ -132,23 +133,16 @@ struct bch_write_op {
struct write_point_specifier write_point;
struct write_point *wp;
struct list_head wp_list;
struct disk_reservation res;
struct open_buckets open_buckets;
/*
* If caller wants to flush but hasn't passed us a journal_seq ptr, we
* still need to stash the journal_seq somewhere:
*/
union {
u64 *journal_seq_p;
u64 journal_seq;
};
u64 new_i_size;
s64 i_sectors_delta;
int (*index_update_fn)(struct bch_write_op *);
struct bch_devs_mask failed;
struct keylist insert_keys;

View File

@ -17,7 +17,6 @@ static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
{
if (l->keys_p != inline_keys)
kfree(l->keys_p);
bch2_keylist_init(l, inline_keys);
}
static inline void bch2_keylist_push(struct keylist *l)

View File

@ -53,9 +53,8 @@ struct moving_io {
struct bio_vec bi_inline_vecs[0];
};
static void move_free(struct closure *cl)
static void move_free(struct moving_io *io)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->write.ctxt;
struct bch_fs *c = ctxt->c;
@ -65,31 +64,30 @@ static void move_free(struct closure *cl)
kfree(io);
}
static void move_write_done(struct closure *cl)
static void move_write_done(struct bch_write_op *op)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_io *io = container_of(op, struct moving_io, write.op);
struct moving_context *ctxt = io->write.ctxt;
if (io->write.op.error)
ctxt->write_error = true;
atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
closure_return_with_destructor(cl, move_free);
move_free(io);
closure_put(&ctxt->cl);
}
static void move_write(struct closure *cl)
static void move_write(struct moving_io *io)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
closure_return_with_destructor(cl, move_free);
move_free(io);
return;
}
closure_get(&io->write.ctxt->cl);
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
bch2_data_update_read_done(&io->write, io->rbio.pick.crc, cl);
continue_at(cl, move_write_done, NULL);
bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
}
static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
@ -121,7 +119,7 @@ static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *t
while ((io = next_pending_write(ctxt))) {
list_del(&io->list);
closure_call(&io->cl, move_write, NULL, &ctxt->cl);
move_write(io);
}
}
@ -185,7 +183,7 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt,
}
}
void bch_move_stats_init(struct bch_move_stats *stats, char *name)
void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
{
memset(stats, 0, sizeof(*stats));
scnprintf(stats->name, sizeof(stats->name), "%s", name);
@ -302,6 +300,7 @@ static int bch2_move_extent(struct btree_trans *trans,
goto err_free_pages;
io->write.ctxt = ctxt;
io->write.op.end_io = move_write_done;
atomic64_inc(&ctxt->stats->keys_moved);
atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
@ -956,7 +955,7 @@ int bch2_data_job(struct bch_fs *c,
switch (op.op) {
case BCH_DATA_OP_REREPLICATE:
bch_move_stats_init(stats, "rereplicate");
bch2_move_stats_init(stats, "rereplicate");
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
@ -980,7 +979,7 @@ int bch2_data_job(struct bch_fs *c,
if (op.migrate.dev >= c->sb.nr_devices)
return -EINVAL;
bch_move_stats_init(stats, "migrate");
bch2_move_stats_init(stats, "migrate");
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
@ -1001,7 +1000,7 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_replicas_gc2(c) ?: ret;
break;
case BCH_DATA_OP_REWRITE_OLD_NODES:
bch_move_stats_init(stats, "rewrite_old_nodes");
bch2_move_stats_init(stats, "rewrite_old_nodes");
ret = bch2_scan_old_btree_nodes(c, stats);
break;
default:

View File

@ -60,8 +60,7 @@ int bch2_data_job(struct bch_fs *,
struct bch_move_stats *,
struct bch_ioctl_data);
inline void bch_move_stats_init(struct bch_move_stats *stats,
char *name);
void bch2_move_stats_init(struct bch_move_stats *stats, char *name);
#endif /* _BCACHEFS_MOVE_H */

View File

@ -102,7 +102,7 @@ static int bch2_copygc(struct bch_fs *c)
};
int ret = 0;
bch_move_stats_init(&move_stats, "copygc");
bch2_move_stats_init(&move_stats, "copygc");
for_each_rw_member(ca, c, dev_idx)
heap_size += ca->mi.nbuckets >> 7;

View File

@ -189,7 +189,7 @@ static int bch2_rebalance_thread(void *arg)
prev_start = jiffies;
prev_cputime = curr_cputime();
bch_move_stats_init(&move_stats, "rebalance");
bch2_move_stats_init(&move_stats, "rebalance");
while (!kthread_wait_freezable(r->enabled)) {
cond_resched();

View File

@ -1414,7 +1414,7 @@ use_clean:
le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) {
struct bch_move_stats stats;
bch_move_stats_init(&stats, "recovery");
bch2_move_stats_init(&stats, "recovery");
bch_info(c, "scanning for old btree nodes");
ret = bch2_fs_read_write(c);
@ -1486,6 +1486,9 @@ int bch2_fs_initialize(struct bch_fs *c)
mutex_unlock(&c->sb_lock);
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags);
set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags);
set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags);
set_bit(BCH_FS_MAY_GO_RW, &c->flags);
set_bit(BCH_FS_FSCK_DONE, &c->flags);

View File

@ -378,7 +378,7 @@ s64 bch2_remap_range(struct bch_fs *c,
dst_end.offset - dst_iter.pos.offset));
ret = bch2_extent_update(&trans, dst_inum, &dst_iter,
new_dst.k, &disk_res, NULL,
new_dst.k, &disk_res,
new_i_size, i_sectors_delta,
true);
bch2_disk_reservation_put(c, &disk_res);

View File

@ -184,7 +184,7 @@ read_attribute(io_latency_stats_read);
read_attribute(io_latency_stats_write);
read_attribute(congested);
read_attribute(btree_avg_write_size);
read_attribute(btree_write_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
@ -250,14 +250,6 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
return ret;
}
static size_t bch2_btree_avg_write_size(struct bch_fs *c)
{
u64 nr = atomic64_read(&c->btree_writes_nr);
u64 sectors = atomic64_read(&c->btree_writes_sectors);
return nr ? div64_u64(sectors, nr) : 0;
}
static long data_progress_to_text(struct printbuf *out, struct bch_fs *c)
{
long ret = 0;
@ -396,7 +388,9 @@ SHOW(bch2_fs)
sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c));
if (attr == &sysfs_btree_write_stats)
bch2_btree_write_stats_to_text(out, c);
sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
@ -554,7 +548,7 @@ SYSFS_OPS(bch2_fs);
struct attribute *bch2_fs_files[] = {
&sysfs_minor,
&sysfs_btree_cache_size,
&sysfs_btree_avg_write_size,
&sysfs_btree_write_stats,
&sysfs_promote_whole_extents,

View File

@ -0,0 +1,33 @@
// SPDX-License-Identifier: GPL-2.0
#include "two_state_shared_lock.h"
void bch2_two_state_unlock(two_state_lock_t *lock, int s)
{
long i = s ? 1 : -1;
BUG_ON(atomic_long_read(&lock->v) == 0);
if (atomic_long_sub_return_release(i, &lock->v) == 0)
wake_up_all(&lock->wait);
}
bool bch2_two_state_trylock(two_state_lock_t *lock, int s)
{
long i = s ? 1 : -1;
long v = atomic_long_read(&lock->v), old;
do {
old = v;
if (i > 0 ? v < 0 : v > 0)
return false;
} while ((v = atomic_long_cmpxchg_acquire(&lock->v,
old, old + i)) != old);
return true;
}
void bch2_two_state_lock(two_state_lock_t *lock, int s)
{
wait_event(lock->wait, bch2_two_state_trylock(lock, s));
}

View File

@ -0,0 +1,28 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_TWO_STATE_LOCK_H
#define _BCACHEFS_TWO_STATE_LOCK_H
#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/wait.h>
/*
* Two-state lock - can be taken for add or block - both states are shared,
 * like the read side of a rwsem, but conflict with the other state:
*/
typedef struct {
atomic_long_t v;
wait_queue_head_t wait;
} two_state_lock_t;
static inline void two_state_lock_init(two_state_lock_t *lock)
{
atomic_long_set(&lock->v, 0);
init_waitqueue_head(&lock->wait);
}
void bch2_two_state_unlock(two_state_lock_t *, int);
bool bch2_two_state_trylock(two_state_lock_t *, int);
void bch2_two_state_lock(two_state_lock_t *, int);
#endif /* _BCACHEFS_TWO_STATE_LOCK_H */
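
The comment above describes the semantics, so here is a minimal sketch of how the lock behaves (illustrative only, not part of this commit; two_state_example() and example_lock are hypothetical names, and BUG_ON is assumed to come in via the kernel/shim headers): any number of holders can share one state, and the opposite state waits until that count drains back to zero. In fs.h these two states are wrapped by the bch2_pagecache_add_*() (s = 0) and bch2_pagecache_block_*() (s = 1) macros.

	#include "two_state_shared_lock.h"

	static two_state_lock_t example_lock;

	static void two_state_example(void)
	{
		two_state_lock_init(&example_lock);

		/* two holders of the "add" state share the lock: */
		bch2_two_state_lock(&example_lock, 0);
		bch2_two_state_lock(&example_lock, 0);

		/* the "block" state conflicts, so this trylock must fail: */
		BUG_ON(bch2_two_state_trylock(&example_lock, 1));

		bch2_two_state_unlock(&example_lock, 0);
		bch2_two_state_unlock(&example_lock, 0);

		/* with the "add" count back at zero, "block" can be taken: */
		bch2_two_state_lock(&example_lock, 1);
		bch2_two_state_unlock(&example_lock, 1);
	}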

View File

@ -52,7 +52,7 @@
*
* note: this rounds towards 0.
*/
inline s64 fast_divpow2(s64 n, u8 d)
s64 fast_divpow2(s64 n, u8 d)
{
return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
}
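
Since the comment above only states the rounding rule, a quick worked check may help (illustrative only, not part of this commit; fast_divpow2_example() is a hypothetical name): an arithmetic right shift rounds toward negative infinity, while fast_divpow2() matches C integer division and rounds toward zero.

	static void fast_divpow2_example(void)
	{
		BUG_ON(fast_divpow2( 7, 1) !=  3);	/*  7 / 2                */
		BUG_ON(fast_divpow2(-7, 1) != -3);	/* -7 / 2; -7 >> 1 == -4 */
		BUG_ON(fast_divpow2(-9, 2) != -2);	/* -9 / 4; -9 >> 2 == -3 */
	}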

View File

@ -27,3 +27,8 @@ void prt_printf(struct printbuf *out, const char *fmt, ...)
prt_vprintf(out, fmt, args);
va_end(args);
}
void prt_u64(struct printbuf *out, u64 v)
{
prt_printf(out, "%llu", v);
}

View File

@ -342,7 +342,11 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
return true;
}
#ifdef CONFIG_LOCK_SPIN_ON_OWNER
/*
* We don't see stable performance with SIX_LOCK_SPIN_ON_OWNER enabled, so it's
* off for now:
*/
#ifdef SIX_LOCK_SPIN_ON_OWNER
static inline bool six_optimistic_spin(struct six_lock *lock,
struct six_lock_waiter *wait)

View File

@ -66,6 +66,11 @@ void wake_up(wait_queue_head_t *q)
__wake_up(q, TASK_NORMAL, 1, NULL);
}
void wake_up_all(wait_queue_head_t *q)
{
__wake_up(q, TASK_NORMAL, 0, NULL);
}
static void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
{
__wake_up_common(q, mode, nr, 0, NULL);