diff --git a/.bcachefs_revision b/.bcachefs_revision index 3a147c61..1b5f928a 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -61ebcb532a1266e5e36f354858b552e2a4fb9925 +8d3fc97ca3f24d8f7ab1e9ed04d8ca354c44dd8c diff --git a/Makefile b/Makefile index d460a6d3..01aa0b71 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ PREFIX?=/usr/local PKG_CONFIG?=pkg-config INSTALL=install -CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall -fPIC \ +CFLAGS+=-std=gnu11 -O2 -g -MMD -Wall -fPIC \ -Wno-pointer-sign \ -fno-strict-aliasing \ -fno-delete-null-pointer-checks \ diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 38a364c0..a9852fa1 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -54,6 +54,8 @@ typedef struct { #define __ATOMIC_ADD_RETURN_RELEASE(v, p) \ __atomic_add_fetch(p, v, __ATOMIC_RELEASE) #define __ATOMIC_SUB_RETURN(v, p) __atomic_sub_fetch(p, v, __ATOMIC_RELAXED) +#define __ATOMIC_SUB_RETURN_RELEASE(v, p) \ + __atomic_sub_fetch(p, v, __ATOMIC_RELEASE) #define xchg(p, v) __atomic_exchange_n(p, v, __ATOMIC_SEQ_CST) #define xchg_acquire(p, v) __atomic_exchange_n(p, v, __ATOMIC_ACQUIRE) @@ -123,6 +125,11 @@ do { \ ({ smp_mb__before_atomic(); __ATOMIC_ADD_RETURN(i, v); }) #endif +#ifndef __ATOMIC_SUB_RETURN_RELEASE +#define __ATOMIC_SUB_RETURN_RELEASE(i, v) \ + ({ smp_mb__before_atomic(); __ATOMIC_SUB_RETURN(i, v); }) +#endif + #ifndef __ATOMIC_SUB #define __ATOMIC_SUB(i, v) __ATOMIC_SUB_RETURN(i, v) #endif @@ -164,6 +171,11 @@ static inline i_type a_type##_add_return_release(i_type i, a_type##_t *v)\ return __ATOMIC_ADD_RETURN_RELEASE(i, &v->counter); \ } \ \ +static inline i_type a_type##_sub_return_release(i_type i, a_type##_t *v)\ +{ \ + return __ATOMIC_SUB_RETURN_RELEASE(i, &v->counter); \ +} \ + \ static inline i_type a_type##_sub_return(i_type i, a_type##_t *v) \ { \ return __ATOMIC_SUB_RETURN(i, &v->counter); \ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d31b5f56..b2c1751c 100644 --- 
a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -229,6 +229,8 @@ static inline int __must_check kstrtos32(const char *s, unsigned int base, s32 * } struct printbuf; +extern void prt_u64(struct printbuf *out, u64 num); + extern __printf(2, 0) void prt_vprintf(struct printbuf *out, const char *fmt, va_list args); extern __printf(2, 3) void prt_printf(struct printbuf *out, const char *fmt, ...); diff --git a/include/linux/mean_and_variance.h b/include/linux/mean_and_variance.h index 3d62abe7..cbac6ac8 100644 --- a/include/linux/mean_and_variance.h +++ b/include/linux/mean_and_variance.h @@ -155,7 +155,7 @@ struct mean_and_variance_weighted { u64 variance; }; -inline s64 fast_divpow2(s64 n, u8 d); +s64 fast_divpow2(s64 n, u8 d); struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1); s64 mean_and_variance_get_mean(struct mean_and_variance s); diff --git a/include/linux/wait.h b/include/linux/wait.h index d1d33e67..d30fb10d 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -24,6 +24,7 @@ typedef struct { } wait_queue_head_t; void wake_up(wait_queue_head_t *); +void wake_up_all(wait_queue_head_t *); void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state); void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index 444f43f0..47ba750d 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -344,25 +344,29 @@ DEFINE_EVENT(btree_node, btree_node_free, TRACE_EVENT(btree_reserve_get_fail, TP_PROTO(const char *trans_fn, unsigned long caller_ip, - size_t required), - TP_ARGS(trans_fn, caller_ip, required), + size_t required, + int ret), + TP_ARGS(trans_fn, caller_ip, required, ret), TP_STRUCT__entry( __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(size_t, required ) + __array(char, ret, 
32 ) ), TP_fast_assign( strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->required = required; + strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret)); ), - TP_printk("%s %pS required %zu", + TP_printk("%s %pS required %zu ret %s", __entry->trans_fn, (void *) __entry->caller_ip, - __entry->required) + __entry->required, + __entry->ret) ); DEFINE_EVENT(btree_node, btree_node_compact, @@ -542,14 +546,11 @@ TRACE_EVENT(bucket_alloc_fail, u64 avail, u64 copygc_wait_amount, s64 copygc_waiting_for, - u64 seen, - u64 open, - u64 need_journal_commit, - u64 nouse, + struct bucket_alloc_state *s, bool nonblocking, const char *err), TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for, - seen, open, need_journal_commit, nouse, nonblocking, err), + s, nonblocking, err), TP_STRUCT__entry( __field(dev_t, dev ) @@ -573,10 +574,10 @@ TRACE_EVENT(bucket_alloc_fail, __entry->avail = avail; __entry->copygc_wait_amount = copygc_wait_amount; __entry->copygc_waiting_for = copygc_waiting_for; - __entry->seen = seen; - __entry->open = open; - __entry->need_journal_commit = need_journal_commit; - __entry->nouse = nouse; + __entry->seen = s->buckets_seen; + __entry->open = s->skipped_open; + __entry->need_journal_commit = s->skipped_need_journal_commit; + __entry->nouse = s->skipped_nouse; __entry->nonblocking = nonblocking; strscpy(__entry->err, err, sizeof(__entry->err)); ), diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 796b9f5a..742313c2 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -279,6 +279,22 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, return -EINVAL; } + /* + * XXX this is wrong, we'll be checking updates that happened from + * before BCH_FS_CHECK_BACKPOINTERS_DONE + */ + if (rw == WRITE && test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { + unsigned i, bp_len = 0; + + for (i = 0; i < 
BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) + bp_len += alloc_v4_backpointers_c(a.v)[i].bucket_len; + + if (bp_len > a.v->dirty_sectors) { + prt_printf(err, "too many backpointers"); + return -EINVAL; + } + } + if (rw == WRITE) { if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { prt_printf(err, "invalid data type (got %u should be %u)", diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index 186c2ed4..949c068e 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -195,26 +195,24 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * u64 bucket, enum alloc_reserve reserve, struct bch_alloc_v4 *a, - u64 *skipped_open, - u64 *skipped_need_journal_commit, - u64 *skipped_nouse, + struct bucket_alloc_state *s, struct closure *cl) { struct open_bucket *ob; if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { - (*skipped_nouse)++; + s->skipped_nouse++; return NULL; } if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { - (*skipped_open)++; + s->skipped_open++; return NULL; } if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { - (*skipped_need_journal_commit)++; + s->skipped_need_journal_commit++; return NULL; } @@ -234,7 +232,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * /* Recheck under lock: */ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { spin_unlock(&c->freelist_lock); - (*skipped_open)++; + s->skipped_open++; return NULL; } @@ -274,9 +272,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, enum alloc_reserve reserve, u64 free_entry, - u64 *skipped_open, - u64 *skipped_need_journal_commit, - u64 *skipped_nouse, + struct bucket_alloc_state *s, struct bkey_s_c freespace_k, struct closure *cl) { @@ -339,7 +335,8 @@ static 
struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc u64 bp_offset = 0; ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1, - &bp_offset, &bp, 0); + &bp_offset, &bp, + BTREE_ITER_NOPRESERVE); if (ret) { ob = ERR_PTR(ret); goto err; @@ -356,11 +353,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc } } - ob = __try_alloc_bucket(c, ca, b, reserve, &a, - skipped_open, - skipped_need_journal_commit, - skipped_nouse, - cl); + ob = __try_alloc_bucket(c, ca, b, reserve, &a, s, cl); if (!ob) iter.path->preserve = false; err: @@ -406,11 +399,7 @@ static noinline struct open_bucket * bch2_bucket_alloc_early(struct btree_trans *trans, struct bch_dev *ca, enum alloc_reserve reserve, - u64 *cur_bucket, - u64 *buckets_seen, - u64 *skipped_open, - u64 *skipped_need_journal_commit, - u64 *skipped_nouse, + struct bucket_alloc_state *s, struct closure *cl) { struct btree_iter iter; @@ -418,10 +407,10 @@ bch2_bucket_alloc_early(struct btree_trans *trans, struct open_bucket *ob = NULL; int ret; - *cur_bucket = max_t(u64, *cur_bucket, ca->mi.first_bucket); - *cur_bucket = max_t(u64, *cur_bucket, ca->new_fs_bucket_idx); + s->cur_bucket = max_t(u64, s->cur_bucket, ca->mi.first_bucket); + s->cur_bucket = max_t(u64, s->cur_bucket, ca->new_fs_bucket_idx); - for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *cur_bucket), + for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, s->cur_bucket), BTREE_ITER_SLOTS, k, ret) { struct bch_alloc_v4 a; @@ -437,19 +426,15 @@ bch2_bucket_alloc_early(struct btree_trans *trans, if (a.data_type != BCH_DATA_free) continue; - (*buckets_seen)++; + s->buckets_seen++; - ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a, - skipped_open, - skipped_need_journal_commit, - skipped_nouse, - cl); + ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a, s, cl); if (ob) break; } bch2_trans_iter_exit(trans, &iter); - *cur_bucket = 
iter.pos.offset; + s->cur_bucket = iter.pos.offset; return ob ?: ERR_PTR(ret ?: -BCH_ERR_no_buckets_found); } @@ -457,11 +442,7 @@ bch2_bucket_alloc_early(struct btree_trans *trans, static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, struct bch_dev *ca, enum alloc_reserve reserve, - u64 *cur_bucket, - u64 *buckets_seen, - u64 *skipped_open, - u64 *skipped_need_journal_commit, - u64 *skipped_nouse, + struct bucket_alloc_state *s, struct closure *cl) { struct btree_iter iter; @@ -477,25 +458,21 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, * at previously */ for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace, - POS(ca->dev_idx, *cur_bucket), 0, k, ret) { + POS(ca->dev_idx, s->cur_bucket), 0, k, ret) { if (k.k->p.inode != ca->dev_idx) break; - for (*cur_bucket = max(*cur_bucket, bkey_start_offset(k.k)); - *cur_bucket < k.k->p.offset; - (*cur_bucket)++) { + for (s->cur_bucket = max(s->cur_bucket, bkey_start_offset(k.k)); + s->cur_bucket < k.k->p.offset; + s->cur_bucket++) { ret = btree_trans_too_many_iters(trans); if (ret) break; - (*buckets_seen)++; + s->buckets_seen++; ob = try_alloc_bucket(trans, ca, reserve, - *cur_bucket, - skipped_open, - skipped_need_journal_commit, - skipped_nouse, - k, cl); + s->cur_bucket, s, k, cl); if (ob) break; } @@ -525,11 +502,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, bool freespace_initialized = READ_ONCE(ca->mi.freespace_initialized); u64 start = freespace_initialized ? 0 : ca->bucket_alloc_trans_early_cursor; u64 avail; - u64 cur_bucket = start; - u64 buckets_seen = 0; - u64 skipped_open = 0; - u64 skipped_need_journal_commit = 0; - u64 skipped_nouse = 0; + struct bucket_alloc_state s = { .cur_bucket = start }; bool waiting = false; again: bch2_dev_usage_read_fast(ca, usage); @@ -568,31 +541,19 @@ again: } ob = likely(ca->mi.freespace_initialized) - ? 
bch2_bucket_alloc_freelist(trans, ca, reserve, - &cur_bucket, - &buckets_seen, - &skipped_open, - &skipped_need_journal_commit, - &skipped_nouse, - cl) - : bch2_bucket_alloc_early(trans, ca, reserve, - &cur_bucket, - &buckets_seen, - &skipped_open, - &skipped_need_journal_commit, - &skipped_nouse, - cl); + ? bch2_bucket_alloc_freelist(trans, ca, reserve, &s, cl) + : bch2_bucket_alloc_early(trans, ca, reserve, &s, cl); - if (skipped_need_journal_commit * 2 > avail) + if (s.skipped_need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); if (!ob && !freespace_initialized && start) { - start = cur_bucket = 0; + start = s.cur_bucket = 0; goto again; } if (!freespace_initialized) - ca->bucket_alloc_trans_early_cursor = cur_bucket; + ca->bucket_alloc_trans_early_cursor = s.cur_bucket; err: if (!ob) ob = ERR_PTR(-BCH_ERR_no_buckets_found); @@ -607,10 +568,7 @@ err: avail, bch2_copygc_wait_amount(c), c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), - buckets_seen, - skipped_open, - skipped_need_journal_commit, - skipped_nouse, + &s, cl == NULL, bch2_err_str(PTR_ERR(ob))); @@ -1152,16 +1110,17 @@ out: /* * Get us an open_bucket we can allocate from, return with it locked: */ -struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *trans, - unsigned target, - unsigned erasure_code, - struct write_point_specifier write_point, - struct bch_devs_list *devs_have, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - unsigned flags, - struct closure *cl) +int bch2_alloc_sectors_start_trans(struct btree_trans *trans, + unsigned target, + unsigned erasure_code, + struct write_point_specifier write_point, + struct bch_devs_list *devs_have, + unsigned nr_replicas, + unsigned nr_replicas_required, + enum alloc_reserve reserve, + unsigned flags, + struct closure *cl, + struct write_point **wp_ret) { struct bch_fs *c = trans->c; struct write_point *wp; @@ -1183,7 +1142,7 @@ retry: write_points_nr = 
c->write_points_nr; have_cache = false; - wp = writepoint_find(trans, write_point.v); + *wp_ret = wp = writepoint_find(trans, write_point.v); if (wp->data_type == BCH_DATA_user) ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; @@ -1240,7 +1199,7 @@ alloc_done: BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); - return wp; + return 0; err: open_bucket_for_each(c, &wp->ptrs, ob, i) if (ptrs.nr < ARRAY_SIZE(ptrs.v)) @@ -1258,39 +1217,13 @@ err: if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) || bch2_err_matches(ret, BCH_ERR_freelist_empty)) return cl - ? ERR_PTR(-EAGAIN) - : ERR_PTR(-BCH_ERR_ENOSPC_bucket_alloc); + ? -EAGAIN + : -BCH_ERR_ENOSPC_bucket_alloc; if (bch2_err_matches(ret, BCH_ERR_insufficient_devices)) - return ERR_PTR(-EROFS); - - return ERR_PTR(ret); -} - -struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, - unsigned target, - unsigned erasure_code, - struct write_point_specifier write_point, - struct bch_devs_list *devs_have, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - unsigned flags, - struct closure *cl) -{ - struct write_point *wp; - - bch2_trans_do(c, NULL, NULL, 0, - PTR_ERR_OR_ZERO(wp = bch2_alloc_sectors_start_trans(&trans, target, - erasure_code, - write_point, - devs_have, - nr_replicas, - nr_replicas_required, - reserve, - flags, cl))); - return wp; + return -EROFS; + return ret; } struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) @@ -1361,6 +1294,10 @@ static inline void writepoint_init(struct write_point *wp, { mutex_init(&wp->lock); wp->data_type = type; + + INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates); + INIT_LIST_HEAD(&wp->writes); + spin_lock_init(&wp->writes_lock); } void bch2_fs_allocator_foreground_init(struct bch_fs *c) diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h index 6de63a35..16490ffb 100644 --- a/libbcachefs/alloc_foreground.h +++ b/libbcachefs/alloc_foreground.h @@ -136,22 +136,15 @@ int 
bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, unsigned, unsigned *, bool *, enum alloc_reserve, unsigned, struct closure *); -struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *, - unsigned, unsigned, - struct write_point_specifier, - struct bch_devs_list *, - unsigned, unsigned, - enum alloc_reserve, - unsigned, - struct closure *); -struct write_point *bch2_alloc_sectors_start(struct bch_fs *, - unsigned, unsigned, - struct write_point_specifier, - struct bch_devs_list *, - unsigned, unsigned, - enum alloc_reserve, - unsigned, - struct closure *); +int bch2_alloc_sectors_start_trans(struct btree_trans *, + unsigned, unsigned, + struct write_point_specifier, + struct bch_devs_list *, + unsigned, unsigned, + enum alloc_reserve, + unsigned, + struct closure *, + struct write_point **); struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index e078584d..e66a85f7 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -8,6 +8,14 @@ #include "clock_types.h" #include "fifo.h" +struct bucket_alloc_state { + u64 cur_bucket; + u64 buckets_seen; + u64 skipped_open; + u64 skipped_need_journal_commit; + u64 skipped_nouse; +}; + struct ec_bucket_buf; #define BCH_ALLOC_RESERVES() \ @@ -78,6 +86,11 @@ struct write_point { struct open_buckets ptrs; struct dev_stripe_state stripe; + + struct work_struct index_update_work; + + struct list_head writes; + spinlock_t writes_lock; }; struct write_point_specifier { diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index dace68e2..614811ea 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -9,8 +9,6 @@ #include -#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10 - /* * Convert from pos in backpointer btree to pos of corresponding bucket in alloc * btree: @@ -43,27 +41,6 @@ static inline 
struct bpos bucket_pos_to_bp(const struct bch_fs *c, return ret; } -void bch2_extent_ptr_to_bp(struct bch_fs *c, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, struct extent_ptr_decoded p, - struct bpos *bucket_pos, struct bch_backpointer *bp) -{ - enum bch_data_type data_type = level ? BCH_DATA_btree : BCH_DATA_user; - s64 sectors = level ? btree_sectors(c) : k.k->size; - u32 bucket_offset; - - *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset); - *bp = (struct bch_backpointer) { - .btree_id = btree_id, - .level = level, - .data_type = data_type, - .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + - p.crc.offset, - .bucket_len = ptr_disk_sectors(sectors, p), - .pos = k.k->p, - }; -} - static bool extent_matches_bp(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bkey_s_c k, diff --git a/libbcachefs/backpointers.h b/libbcachefs/backpointers.h index 8c58f929..48a48b75 100644 --- a/libbcachefs/backpointers.h +++ b/libbcachefs/backpointers.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H #define _BCACHEFS_BACKPOINTERS_BACKGROUND_H +#include "buckets.h" #include "super.h" int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k, @@ -16,9 +17,28 @@ void bch2_backpointer_swab(struct bkey_s); .swab = bch2_backpointer_swab, \ }) -void bch2_extent_ptr_to_bp(struct bch_fs *, enum btree_id, unsigned, - struct bkey_s_c, struct extent_ptr_decoded, - struct bpos *, struct bch_backpointer *); +#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10 + +static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, struct extent_ptr_decoded p, + struct bpos *bucket_pos, struct bch_backpointer *bp) +{ + enum bch_data_type data_type = level ? BCH_DATA_btree : BCH_DATA_user; + s64 sectors = level ? 
btree_sectors(c) : k.k->size; + u32 bucket_offset; + + *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset); + *bp = (struct bch_backpointer) { + .btree_id = btree_id, + .level = level, + .data_type = data_type, + .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + + p.crc.offset, + .bucket_len = ptr_disk_sectors(sectors, p), + .pos = k.k->p, + }; +} int bch2_bucket_backpointer_del(struct btree_trans *, struct bkey_i_alloc_v4 *, struct bch_backpointer, struct bkey_s_c); diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index fcbe8f8c..8a43fcfa 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -226,6 +226,10 @@ do { \ dynamic_fault("bcachefs:meta:write:" name) #ifdef __KERNEL__ +#define BCACHEFS_LOG_PREFIX +#endif + +#ifdef BCACHEFS_LOG_PREFIX #define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name) #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") #define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) @@ -598,6 +602,23 @@ typedef struct { #define BCACHEFS_ROOT_SUBVOL_INUM \ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) +#define BCH_BTREE_WRITE_TYPES() \ + x(initial, 0) \ + x(init_next_bset, 1) \ + x(cache_reclaim, 2) \ + x(journal_reclaim, 3) \ + x(interior, 4) + +enum btree_write_type { +#define x(t, n) BTREE_WRITE_##t, + BCH_BTREE_WRITE_TYPES() +#undef x + BTREE_WRITE_TYPE_NR, +}; + +#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1) +#define BTREE_WRITE_TYPE_BITS ilog2(BTREE_WRITE_TYPE_MASK) + struct bch_fs { struct closure cl; @@ -707,6 +728,13 @@ struct bch_fs { struct workqueue_struct *btree_interior_update_worker; struct work_struct btree_interior_update_work; + /* btree_io.c: */ + spinlock_t btree_write_error_lock; + struct btree_write_stats { + atomic64_t nr; + atomic64_t bytes; + } btree_write_stats[BTREE_WRITE_TYPE_NR]; + /* btree_iter.c: */ struct mutex btree_trans_lock; struct list_head 
btree_trans_list; @@ -881,11 +909,6 @@ struct bch_fs { struct bio_set dio_write_bioset; struct bio_set dio_read_bioset; - - atomic64_t btree_writes_nr; - atomic64_t btree_writes_sectors; - spinlock_t btree_write_error_lock; - /* ERRORS */ struct list_head fsck_errors; struct mutex fsck_error_lock; diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c index 8518054a..be0d4bc1 100644 --- a/libbcachefs/bkey_sort.c +++ b/libbcachefs/bkey_sort.c @@ -178,7 +178,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, continue; while ((next = sort_iter_peek(iter)) && - !bch2_bkey_cmp_packed(iter->b, in, next)) { + !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) { BUG_ON(in->needs_whiteout && next->needs_whiteout); needs_whiteout |= in->needs_whiteout; diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index d1cbf926..75e74479 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -280,9 +280,11 @@ wait_on_io: * the post write cleanup: */ if (bch2_verify_btree_ondisk) - bch2_btree_node_write(c, b, SIX_LOCK_intent, 0); + bch2_btree_node_write(c, b, SIX_LOCK_intent, + BTREE_WRITE_cache_reclaim); else - __bch2_btree_node_write(c, b, 0); + __bch2_btree_node_write(c, b, + BTREE_WRITE_cache_reclaim); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -389,7 +391,7 @@ restart: six_trylock_read(&b->c.lock)) { list_move(&bc->live, &b->list); mutex_unlock(&bc->lock); - __bch2_btree_node_write(c, b, 0); + __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); six_unlock_read(&b->c.lock); if (touched >= nr) goto out_nounlock; @@ -675,6 +677,7 @@ out: b->flags = 0; b->written = 0; b->nsets = 0; + b->write_type = 0; b->sib_u64s[0] = 0; b->sib_u64s[1] = 0; b->whiteout_u64s = 0; @@ -1118,7 +1121,7 @@ wait_on_io: btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); if (btree_node_dirty(b)) { - __bch2_btree_node_write(c, b, 0); + __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); six_unlock_write(&b->c.lock); 
six_unlock_intent(&b->c.lock); goto wait_on_io; diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 90f67ccd..cee3b500 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -450,6 +450,24 @@ void bch2_btree_build_aux_trees(struct btree *b) t == bset_tree_last(b)); } +/* + * If we have MAX_BSETS (3) bsets, should we sort them all down to just one? + * + * The first bset is going to be of similar order to the size of the node, the + * last bset is bounded by btree_write_set_buffer(), which is set to keep the + * memmove on insert from being too expensive: the middle bset should, ideally, + * be the geometric mean of the first and the last. + * + * Returns true if the middle bset is greater than that geometric mean: + */ +static inline bool should_compact_all(struct bch_fs *c, struct btree *b) +{ + unsigned mid_u64s_bits = + (ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2; + + return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits; +} + /* * @bch_btree_init_next - initialize a new (unwritten) bset that can then be * inserted into @@ -467,19 +485,14 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) EBUG_ON(!(b->c.lock.state.seq & 1)); BUG_ON(bset_written(b, bset(b, &b->set[1]))); + BUG_ON(btree_node_just_written(b)); if (b->nsets == MAX_BSETS && - !btree_node_write_in_flight(b)) { - unsigned log_u64s[] = { - ilog2(bset_u64s(&b->set[0])), - ilog2(bset_u64s(&b->set[1])), - ilog2(bset_u64s(&b->set[2])), - }; - - if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) { - bch2_btree_node_write(c, b, SIX_LOCK_write, 0); - reinit_iter = true; - } + !btree_node_write_in_flight(b) && + should_compact_all(c, b)) { + bch2_btree_node_write(c, b, SIX_LOCK_write, + BTREE_WRITE_init_next_bset); + reinit_iter = true; } if (b->nsets == MAX_BSETS && @@ -1653,7 +1666,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) } while ((v = cmpxchg(&b->flags, old, new)) != old); if (new & (1U << 
BTREE_NODE_write_in_flight)) - __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED); + __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|b->write_type); else wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); } @@ -1802,6 +1815,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) bool used_mempool; unsigned long old, new; bool validate_before_checksum = false; + enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK; void *data; int ret; @@ -1848,6 +1862,12 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) if (new & (1U << BTREE_NODE_need_write)) return; do_write: + if ((flags & BTREE_WRITE_ONLY_IF_NEED)) + type = b->write_type; + b->write_type = 0; + + BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0)); + atomic_dec(&c->btree_cache.dirty); BUG_ON(btree_node_fake(b)); @@ -2022,8 +2042,8 @@ do_write: bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written = cpu_to_le16(b->written); - atomic64_inc(&c->btree_writes_nr); - atomic64_add(sectors_to_write, &c->btree_writes_sectors); + atomic64_inc(&c->btree_write_stats[type].nr); + atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); INIT_WORK(&wbio->work, btree_write_submit); queue_work(c->io_complete_wq, &wbio->work); @@ -2151,3 +2171,33 @@ bool bch2_btree_flush_all_writes(struct bch_fs *c) { return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); } + +const char * const bch2_btree_write_types[] = { +#define x(t, n) [n] = #t, + BCH_BTREE_WRITE_TYPES() + NULL +}; + +void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c) +{ + printbuf_tabstop_push(out, 20); + printbuf_tabstop_push(out, 10); + + prt_tab(out); + prt_str(out, "nr"); + prt_tab(out); + prt_str(out, "size"); + prt_newline(out); + + for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) { + u64 nr = atomic64_read(&c->btree_write_stats[i].nr); + u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes); + + prt_printf(out, "%s:", 
bch2_btree_write_types[i]); + prt_tab(out); + prt_u64(out, nr); + prt_tab(out); + prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0); + prt_newline(out); + } +} diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 8af85364..4b1810ad 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -139,8 +139,12 @@ void bch2_btree_complete_write(struct bch_fs *, struct btree *, bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); -#define BTREE_WRITE_ONLY_IF_NEED (1U << 0) -#define BTREE_WRITE_ALREADY_STARTED (1U << 1) +enum btree_write_flags { + __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS, + __BTREE_WRITE_ALREADY_STARTED, +}; +#define BTREE_WRITE_ONLY_IF_NEED (1U << __BTREE_WRITE_ONLY_IF_NEED ) +#define BTREE_WRITE_ALREADY_STARTED (1U << __BTREE_WRITE_ALREADY_STARTED) void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); void bch2_btree_node_write(struct bch_fs *, struct btree *, @@ -219,4 +223,6 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id, bn->min_key = bpos_nosnap_successor(bn->min_key); } +void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *); + #endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 99a92a89..5080f56b 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -646,9 +646,9 @@ static inline void __btree_path_level_init(struct btree_path *path, bch2_btree_node_iter_peek(&l->iter, l->b); } -inline void bch2_btree_path_level_init(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) +void bch2_btree_path_level_init(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) { BUG_ON(path->cached); @@ -1172,11 +1172,10 @@ int __must_check bch2_btree_path_traverse(struct btree_trans *trans, btree_path_traverse_one(trans, path, flags, _RET_IP_); } -static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, +static inline 
void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, struct btree_path *src) { unsigned i, offset = offsetof(struct btree_path, pos); - int cmp = btree_path_cmp(dst, src); memcpy((void *) dst + offset, (void *) src + offset, @@ -1188,9 +1187,6 @@ static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, if (t != BTREE_NODE_UNLOCKED) six_lock_increment(&dst->l[i].b->c.lock, t); } - - if (cmp) - bch2_btree_path_check_sort_fast(trans, dst, cmp); } static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, @@ -1203,21 +1199,18 @@ static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btr return new; } +__flatten struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans, struct btree_path *path, bool intent, unsigned long ip) { - if (path->ref > 1 || path->preserve) { - __btree_path_put(path, intent); - path = btree_path_clone(trans, path, intent); - path->preserve = false; + __btree_path_put(path, intent); + path = btree_path_clone(trans, path, intent); + path->preserve = false; #ifdef CONFIG_BCACHEFS_DEBUG - path->ip_allocated = ip; + path->ip_allocated = ip; #endif - btree_trans_verify_sorted(trans); - } - - path->should_be_locked = false; + btree_trans_verify_sorted(trans); return path; } @@ -1554,7 +1547,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, return path; } -inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) +struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) { struct btree_path_level *l = path_l(path); @@ -2536,6 +2529,18 @@ static inline void btree_path_swap(struct btree_trans *trans, btree_path_verify_sorted_ref(trans, r); } +static inline struct btree_path *sib_btree_path(struct btree_trans *trans, + struct btree_path *path, int sib) +{ + unsigned idx = (unsigned) path->sorted_idx + sib; + + EBUG_ON(sib != -1 && sib != 1); + + return idx < trans->nr_sorted 
+ ? trans->paths + trans->sorted[idx] + : NULL; +} + static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *trans, struct btree_path *path, int cmp) @@ -2545,9 +2550,7 @@ static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans * EBUG_ON(!cmp); - while ((n = cmp < 0 - ? prev_btree_path(trans, path) - : next_btree_path(trans, path)) && + while ((n = sib_btree_path(trans, path, cmp)) && (cmp2 = btree_path_cmp(n, path)) && cmp2 != cmp) btree_path_swap(trans, n, path); diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 8c35d7d4..bad51ceb 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -165,13 +165,12 @@ int __must_check bch2_btree_path_traverse(struct btree_trans *, struct btree_path *, unsigned); struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, unsigned, unsigned, unsigned, unsigned long); -inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); +struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, struct btree_iter *, struct bpos); -inline void bch2_btree_path_level_init(struct btree_trans *, - struct btree_path *, struct btree *); +void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *); #ifdef CONFIG_BCACHEFS_DEBUG void bch2_trans_verify_paths(struct btree_trans *); diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c index 9d090437..dce2dc0c 100644 --- a/libbcachefs/btree_locking.c +++ b/libbcachefs/btree_locking.c @@ -173,10 +173,9 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) } if (unlikely(!best)) { - struct bch_fs *c = g->g->trans->c; struct printbuf buf = PRINTBUF; - bch_err(c, "cycle of nofail locks"); + prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); for (i = g->g; i < g->g + g->nr; i++) { struct btree_trans 
*trans = i->trans; diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index cab3de0d..d89489e4 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -77,6 +77,7 @@ struct btree { u8 nsets; u8 nr_key_bits; u16 version_ondisk; + u8 write_type; struct bkey_format format; diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 5ce91ae6..dac2fa6b 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -246,6 +246,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; unsigned nr_reserve; enum alloc_reserve alloc_reserve; + int ret; if (flags & BTREE_INSERT_USE_RESERVE) { nr_reserve = 0; @@ -268,7 +269,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, mutex_unlock(&c->btree_reserve_cache_lock); retry: - wp = bch2_alloc_sectors_start_trans(trans, + ret = bch2_alloc_sectors_start_trans(trans, c->opts.metadata_target ?: c->opts.foreground_target, 0, @@ -276,9 +277,9 @@ retry: &devs_have, res->nr_replicas, c->opts.metadata_replicas_required, - alloc_reserve, 0, cl); - if (IS_ERR(wp)) - return ERR_CAST(wp); + alloc_reserve, 0, cl, &wp); + if (unlikely(ret)) + return ERR_PTR(ret); if (wp->sectors_free < btree_sectors(c)) { struct open_bucket *ob; @@ -1178,7 +1179,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, } if (ret) { - trace_and_count(c, btree_reserve_get_fail, trans->fn, _RET_IP_, nr_nodes[0] + nr_nodes[1]); + trace_and_count(c, btree_reserve_get_fail, trans->fn, + _RET_IP_, nr_nodes[0] + nr_nodes[1], ret); goto err; } @@ -1307,6 +1309,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); set_btree_node_dirty_acct(c, b); set_btree_node_need_write(b); + b->write_type = BTREE_WRITE_interior; printbuf_exit(&buf); } diff --git 
a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index dabe8159..2e6d220c 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -282,6 +282,7 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b, struct bkey_packed k; BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); + EBUG_ON(btree_node_just_written(b)); if (!bkey_pack_pos(&k, pos, b)) { struct bkey *u = (void *) &k; diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 3a683820..b930b788 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -181,6 +181,8 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, new |= 1 << BTREE_NODE_need_write; } while ((v = cmpxchg(&b->flags, old, new)) != old); + b->write_type = BTREE_WRITE_journal_reclaim; + btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); @@ -289,7 +291,7 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, return 0; } -static inline int bch2_trans_journal_res_get(struct btree_trans *trans, +static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; @@ -721,33 +723,34 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, return ret; } +static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) +{ + while (--i >= trans->updates) { + if (same_leaf_as_prev(trans, i)) + continue; + + bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); + } + + trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); +} + static inline int trans_lock_write(struct btree_trans *trans) { struct btree_insert_entry *i; - int ret; trans_for_each_update(trans, i) { if (same_leaf_as_prev(trans, i)) continue; - ret = 
bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c); - if (ret) - goto fail; + if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) + return trans_lock_write_fail(trans, i); bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); } return 0; -fail: - while (--i >= trans->updates) { - if (same_leaf_as_prev(trans, i)) - continue; - - bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b); - } - - trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); } static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) @@ -758,6 +761,33 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); } +static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, + struct btree_insert_entry *i, + struct printbuf *err) +{ + struct bch_fs *c = trans->c; + int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; + + printbuf_reset(err); + prt_printf(err, "invalid bkey on insert from %s -> %ps", + trans->fn, (void *) i->ip_allocated); + prt_newline(err); + printbuf_indent_add(err, 2); + + bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); + prt_newline(err); + + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), + i->bkey_type, rw, err); + bch2_print_string_as_lines(KERN_ERR, err->buf); + + bch2_inconsistent_error(c); + bch2_dump_trans_updates(trans); + printbuf_exit(err); + + return -EINVAL; +} + /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ @@ -772,24 +802,9 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? 
READ : WRITE; trans_for_each_update(trans, i) { - if (bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, rw, &buf)) { - printbuf_reset(&buf); - prt_printf(&buf, "invalid bkey on insert from %s -> %ps", - trans->fn, (void *) i->ip_allocated); - prt_newline(&buf); - printbuf_indent_add(&buf, 2); - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); - prt_newline(&buf); - - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, rw, &buf); - - bch2_trans_inconsistent(trans, "%s", buf.buf); - printbuf_exit(&buf); - return -EINVAL; - } + if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), + i->bkey_type, rw, &buf))) + return bch2_trans_commit_bkey_invalid(trans, i, &buf); btree_insert_entry_checks(trans, i); } diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index cd297941..bf01837e 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -1263,23 +1263,24 @@ void fs_usage_apply_warn(struct btree_trans *trans, struct btree_insert_entry *i; struct printbuf buf = PRINTBUF; - bch_err(c, "disk usage increased %lli more than %u sectors reserved", - should_not_have_added, disk_res_sectors); + prt_printf(&buf, + bch2_fmt(c, "disk usage increased %lli more than %u sectors reserved)"), + should_not_have_added, disk_res_sectors); trans_for_each_update(trans, i) { struct bkey_s_c old = { &i->old_k, i->old_v }; - pr_err("while inserting"); - printbuf_reset(&buf); + prt_str(&buf, "new "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); - pr_err(" %s", buf.buf); - pr_err("overlapping with"); - printbuf_reset(&buf); + prt_newline(&buf); + + prt_str(&buf, "old "); bch2_bkey_val_to_text(&buf, c, old); - pr_err(" %s", buf.buf); + prt_newline(&buf); } __WARN(); + bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); } @@ -1949,7 +1950,7 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) #define SECTORS_CACHE 1024 -int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, +int 
__bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, u64 sectors, int flags) { struct bch_fs_pcpu *pcpu; diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 56c06ccd..b6a1db76 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -259,15 +259,39 @@ int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *); static inline void bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) { - this_cpu_sub(*c->online_reserved, res->sectors); - res->sectors = 0; + if (res->sectors) { + this_cpu_sub(*c->online_reserved, res->sectors); + res->sectors = 0; + } } #define BCH_DISK_RESERVATION_NOFAIL (1 << 0) -int bch2_disk_reservation_add(struct bch_fs *, - struct disk_reservation *, - u64, int); +int __bch2_disk_reservation_add(struct bch_fs *, + struct disk_reservation *, + u64, int); + +static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, + u64 sectors, int flags) +{ +#ifdef __KERNEL__ + u64 old, new; + + do { + old = this_cpu_read(c->pcpu->sectors_available); + if (sectors > old) + return __bch2_disk_reservation_add(c, res, sectors, flags); + + new = old - sectors; + } while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old); + + this_cpu_add(*c->online_reserved, sectors); + res->sectors += sectors; + return 0; +#else + return __bch2_disk_reservation_add(c, res, sectors, flags); +#endif +} static inline struct disk_reservation bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c index 3268e8d4..43d22fe8 100644 --- a/libbcachefs/checksum.c +++ b/libbcachefs/checksum.c @@ -316,7 +316,7 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, return __bch2_checksum_bio(c, type, nonce, bio, &iter); } -int bch2_encrypt_bio(struct bch_fs *c, unsigned type, +int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, struct nonce nonce, struct bio *bio) { struct bio_vec bv; diff 
--git a/libbcachefs/checksum.h b/libbcachefs/checksum.h index 3d6d13bc..f7ccef7a 100644 --- a/libbcachefs/checksum.h +++ b/libbcachefs/checksum.h @@ -61,8 +61,16 @@ int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, struct bch_extent_crc_unpacked *, unsigned, unsigned, unsigned); -int bch2_encrypt_bio(struct bch_fs *, unsigned, - struct nonce, struct bio *); +int __bch2_encrypt_bio(struct bch_fs *, unsigned, + struct nonce, struct bio *); + +static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio) +{ + return bch2_csum_type_is_encryption(type) + ? __bch2_encrypt_bio(c, type, nonce, bio) + : 0; +} int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, struct bch_key *); diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index b75ff07e..be45cf54 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -97,7 +97,7 @@ static void bch2_bkey_mark_dev_cached(struct bkey_s k, unsigned dev) ptr->cached = true; } -static int bch2_data_update_index_update(struct bch_write_op *op) +int bch2_data_update_index_update(struct bch_write_op *op) { struct bch_fs *c = op->c; struct btree_trans trans; @@ -225,7 +225,7 @@ static int bch2_data_update_index_update(struct bch_write_op *op) bch2_trans_update(&trans, &iter, insert, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(&trans, &op->res, - op_journal_seq(op), + NULL, BTREE_INSERT_NOFAIL| m->data_opts.btree_insert_flags); if (!ret) { @@ -270,8 +270,7 @@ out: } void bch2_data_update_read_done(struct data_update *m, - struct bch_extent_crc_unpacked crc, - struct closure *cl) + struct bch_extent_crc_unpacked crc) { /* write bio must own pages: */ BUG_ON(!m->op.wbio.bio.bi_vcnt); @@ -279,7 +278,7 @@ void bch2_data_update_read_done(struct data_update *m, m->op.crc = crc; m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; - closure_call(&m->op.cl, bch2_write, NULL, cl); + closure_call(&m->op.cl, 
bch2_write, NULL, NULL); } void bch2_data_update_exit(struct data_update *update) @@ -317,14 +316,13 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m, m->op.flags |= BCH_WRITE_PAGES_STABLE| BCH_WRITE_PAGES_OWNED| BCH_WRITE_DATA_ENCODED| - BCH_WRITE_FROM_INTERNAL| + BCH_WRITE_MOVE| m->data_opts.write_flags; m->op.compression_type = bch2_compression_opt_to_type[io_opts.background_compression ?: io_opts.compression]; if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) m->op.alloc_reserve = RESERVE_movinggc; - m->op.index_update_fn = bch2_data_update_index_update; i = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { diff --git a/libbcachefs/data_update.h b/libbcachefs/data_update.h index 6793aa57..5d869079 100644 --- a/libbcachefs/data_update.h +++ b/libbcachefs/data_update.h @@ -26,9 +26,10 @@ struct data_update { struct bch_write_op op; }; +int bch2_data_update_index_update(struct bch_write_op *); + void bch2_data_update_read_done(struct data_update *, - struct bch_extent_crc_unpacked, - struct closure *); + struct bch_extent_crc_unpacked); void bch2_data_update_exit(struct data_update *); int bch2_data_update_init(struct bch_fs *, struct data_update *, diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 2fb5102e..3e49d72d 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -125,8 +125,10 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) 
s->nr++; } +#ifdef BCACHEFS_LOG_PREFIX if (!strncmp(fmt, "bcachefs:", 9)) prt_printf(out, bch2_log_msg(c, "")); +#endif va_start(args, fmt); prt_vprintf(out, fmt, args); diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 3900995d..6d0a6dec 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -65,7 +65,6 @@ struct quota_res { }; struct bch_writepage_io { - struct closure cl; struct bch_inode_info *inode; /* must be last: */ @@ -73,11 +72,13 @@ struct bch_writepage_io { }; struct dio_write { - struct completion done; struct kiocb *req; + struct address_space *mapping; + struct bch_inode_info *inode; struct mm_struct *mm; unsigned loop:1, sync:1, + flush:1, free_iov:1; struct quota_res quota_res; u64 written; @@ -98,7 +99,7 @@ struct dio_read { }; /* pagecache_block must be held */ -static int write_invalidate_inode_pages_range(struct address_space *mapping, +static noinline int write_invalidate_inode_pages_range(struct address_space *mapping, loff_t start, loff_t end) { int ret; @@ -750,25 +751,25 @@ vm_fault_t bch2_page_fault(struct vm_fault *vmf) if (fdm > mapping) { struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); - if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock)) + if (bch2_pagecache_add_tryget(inode)) goto got_lock; - bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock); + bch2_pagecache_block_put(fdm_host); - bch2_pagecache_add_get(&inode->ei_pagecache_lock); - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); + bch2_pagecache_add_put(inode); - bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock); + bch2_pagecache_block_get(fdm_host); /* Signal that lock has been dropped: */ set_fdm_dropped_locks(); return VM_FAULT_SIGBUS; } - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); got_lock: ret = filemap_fault(vmf); - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); return ret; } @@ -796,7 +797,7 @@ vm_fault_t 
bch2_page_mkwrite(struct vm_fault *vmf) * a write_invalidate_inode_pages_range() that works without dropping * page lock before invalidating page */ - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); lock_page(page); isize = i_size_read(&inode->v); @@ -829,7 +830,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) wait_for_stable_page(page); ret = VM_FAULT_LOCKED; out: - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); sb_end_pagefault(inode->v.i_sb); return ret; @@ -1097,7 +1098,7 @@ void bch2_readahead(struct readahead_control *ractl) bch2_trans_init(&trans, c, 0, 0); - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); while ((page = readpage_iter_next(&readpages_iter))) { pgoff_t index = readpages_iter.offset + readpages_iter.idx; @@ -1120,7 +1121,7 @@ void bch2_readahead(struct readahead_control *ractl) &readpages_iter); } - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); bch2_trans_exit(&trans); kfree(readpages_iter.pages); @@ -1200,18 +1201,10 @@ static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs }; } -static void bch2_writepage_io_free(struct closure *cl) +static void bch2_writepage_io_done(struct bch_write_op *op) { - struct bch_writepage_io *io = container_of(cl, - struct bch_writepage_io, cl); - - bio_put(&io->op.wbio.bio); -} - -static void bch2_writepage_io_done(struct closure *cl) -{ - struct bch_writepage_io *io = container_of(cl, - struct bch_writepage_io, cl); + struct bch_writepage_io *io = + container_of(op, struct bch_writepage_io, op); struct bch_fs *c = io->op.c; struct bio *bio = &io->op.wbio.bio; struct bvec_iter_all iter; @@ -1273,7 +1266,7 @@ static void bch2_writepage_io_done(struct closure *cl) end_page_writeback(bvec->bv_page); } - closure_return_with_destructor(&io->cl, bch2_writepage_io_free); + bio_put(&io->op.wbio.bio); } static void bch2_writepage_do_io(struct 
bch_writepage_state *w) @@ -1281,8 +1274,7 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w) struct bch_writepage_io *io = w->io; w->io = NULL; - closure_call(&io->op.cl, bch2_write, NULL, &io->cl); - continue_at(&io->cl, bch2_writepage_io_done, NULL); + closure_call(&io->op.cl, bch2_write, NULL, NULL); } /* @@ -1304,9 +1296,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, &c->writepage_bioset), struct bch_writepage_io, op.wbio.bio); - closure_init(&w->io->cl, NULL); w->io->inode = inode; - op = &w->io->op; bch2_write_op_init(op, c, w->opts); op->target = w->opts.foreground_target; @@ -1315,6 +1305,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->write_point = writepoint_hashed(inode->ei_last_dirtied); op->subvol = inode->ei_subvol; op->pos = POS(inode->v.i_ino, sector); + op->end_io = bch2_writepage_io_done; op->wbio.bio.bi_iter.bi_sector = sector; op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); } @@ -1438,7 +1429,8 @@ do_io: /* Check for writing past i_size: */ WARN_ON_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > - round_up(i_size, block_bytes(c))); + round_up(i_size, block_bytes(c)) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); w->io->op.res.sectors += reserved_sectors; w->io->op.i_sectors_delta -= dirty_sectors; @@ -1490,7 +1482,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, bch2_page_reservation_init(c, inode, res); *fsdata = res; - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); page = grab_cache_page_write_begin(mapping, index); if (!page) @@ -1547,7 +1539,7 @@ err: put_page(page); *pagep = NULL; err_unlock: - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); kfree(res); *fsdata = NULL; return bch2_err_class(ret); @@ -1591,7 +1583,7 @@ int bch2_write_end(struct file *file, struct address_space *mapping, unlock_page(page); put_page(page); - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + 
bch2_pagecache_add_put(inode); bch2_page_reservation_put(c, inode, res); kfree(res); @@ -1760,7 +1752,7 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) ssize_t written = 0; int ret = 0; - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); do { unsigned offset = pos & (PAGE_SIZE - 1); @@ -1818,7 +1810,7 @@ again: balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(iter)); - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); return written ? written : ret; } @@ -1981,11 +1973,13 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (iocb->ki_flags & IOCB_DIRECT) { struct blk_plug plug; - ret = filemap_write_and_wait_range(mapping, - iocb->ki_pos, - iocb->ki_pos + count - 1); - if (ret < 0) - goto out; + if (unlikely(mapping->nrpages)) { + ret = filemap_write_and_wait_range(mapping, + iocb->ki_pos, + iocb->ki_pos + count - 1); + if (ret < 0) + goto out; + } file_accessed(file); @@ -1996,9 +1990,9 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret >= 0) iocb->ki_pos += ret; } else { - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); ret = generic_file_read_iter(iocb, iter); - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); } out: return bch2_err_class(ret); @@ -2050,31 +2044,154 @@ err: return err ? 
false : ret; } -static void bch2_dio_write_loop_async(struct bch_write_op *); - -static long bch2_dio_write_loop(struct dio_write *dio) +static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) { - bool kthread = (current->flags & PF_KTHREAD) != 0; + struct bch_fs *c = dio->op.c; + struct bch_inode_info *inode = dio->inode; + struct bio *bio = &dio->op.wbio.bio; + + return bch2_check_range_allocated(c, inode_inum(inode), + dio->op.pos.offset, bio_sectors(bio), + dio->op.opts.data_replicas, + dio->op.opts.compression != 0); +} + +static void bch2_dio_write_loop_async(struct bch_write_op *); +static __always_inline long bch2_dio_write_done(struct dio_write *dio); + +static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) +{ + struct iovec *iov = dio->inline_vecs; + + if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { + iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), + GFP_KERNEL); + if (unlikely(!iov)) + return -ENOMEM; + + dio->free_iov = true; + } + + memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); + dio->iter.iov = iov; + return 0; +} + +static void bch2_dio_write_flush_done(struct closure *cl) +{ + struct dio_write *dio = container_of(cl, struct dio_write, op.cl); + struct bch_fs *c = dio->op.c; + + closure_debug_destroy(cl); + + dio->op.error = bch2_journal_error(&c->journal); + + bch2_dio_write_done(dio); +} + +static noinline void bch2_dio_write_flush(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct bch_inode_unpacked inode; + int ret; + + dio->flush = 0; + + closure_init(&dio->op.cl, NULL); + + if (!dio->op.error) { + ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); + if (ret) + dio->op.error = ret; + else + bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl); + } + + if (dio->sync) { + closure_sync(&dio->op.cl); + closure_debug_destroy(&dio->op.cl); + } else { + continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL); + } +} + +static 
__always_inline long bch2_dio_write_done(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; struct kiocb *req = dio->req; - struct address_space *mapping = req->ki_filp->f_mapping; - struct bch_inode_info *inode = file_bch_inode(req->ki_filp); - struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_inode_info *inode = dio->inode; + bool sync = dio->sync; + long ret; + + if (unlikely(dio->flush)) { + bch2_dio_write_flush(dio); + if (!sync) + return -EIOCBQUEUED; + } + + bch2_pagecache_block_put(inode); + bch2_quota_reservation_put(c, inode, &dio->quota_res); + + if (dio->free_iov) + kfree(dio->iter.iov); + + ret = dio->op.error ?: ((long) dio->written << 9); + bio_put(&dio->op.wbio.bio); + + /* inode->i_dio_count is our ref on inode and thus bch_fs */ + inode_dio_end(&inode->v); + + if (ret < 0) + ret = bch2_err_class(ret); + + if (!sync) { + req->ki_complete(req, ret); + ret = -EIOCBQUEUED; + } + return ret; +} + +static __always_inline void bch2_dio_write_end(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct kiocb *req = dio->req; + struct bch_inode_info *inode = dio->inode; struct bio *bio = &dio->op.wbio.bio; struct bvec_iter_all iter; struct bio_vec *bv; + + i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); + req->ki_pos += (u64) dio->op.written << 9; + dio->written += dio->op.written; + + spin_lock(&inode->v.i_lock); + if (req->ki_pos > inode->v.i_size) + i_size_write(&inode->v, req->ki_pos); + spin_unlock(&inode->v.i_lock); + + if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) + bio_for_each_segment_all(bv, bio, iter) + put_page(bv->bv_page); + + if (unlikely(dio->op.error)) + set_bit(EI_INODE_ERROR, &inode->ei_flags); +} + +static long bch2_dio_write_loop(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct kiocb *req = dio->req; + struct address_space *mapping = dio->mapping; + struct bch_inode_info *inode = dio->inode; + struct bio *bio = &dio->op.wbio.bio; unsigned unaligned, iter_count; bool sync 
= dio->sync, dropped_locks; long ret; - if (dio->loop) - goto loop; - while (1) { iter_count = dio->iter.count; - if (kthread && dio->mm) - kthread_use_mm(dio->mm); - BUG_ON(current->faults_disabled_mapping); + EBUG_ON(current->faults_disabled_mapping); current->faults_disabled_mapping = mapping; ret = bio_iov_iter_get_pages(bio, &dio->iter); @@ -2082,8 +2199,6 @@ static long bch2_dio_write_loop(struct dio_write *dio) dropped_locks = fdm_dropped_locks(); current->faults_disabled_mapping = NULL; - if (kthread && dio->mm) - kthread_unuse_mm(dio->mm); /* * If the fault handler returned an error but also signalled @@ -2121,116 +2236,80 @@ static long bch2_dio_write_loop(struct dio_write *dio) } bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); - dio->op.end_io = bch2_dio_write_loop_async; + dio->op.end_io = sync + ? NULL + : bch2_dio_write_loop_async; dio->op.target = dio->op.opts.foreground_target; dio->op.write_point = writepoint_hashed((unsigned long) current); dio->op.nr_replicas = dio->op.opts.data_replicas; dio->op.subvol = inode->ei_subvol; dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); - if ((req->ki_flags & IOCB_DSYNC) && - !c->opts.journal_flush_disabled) - dio->op.flags |= BCH_WRITE_FLUSH; + if (sync) + dio->op.flags |= BCH_WRITE_SYNC; dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), dio->op.opts.data_replicas, 0); if (unlikely(ret) && - !bch2_check_range_allocated(c, inode_inum(inode), - dio->op.pos.offset, bio_sectors(bio), - dio->op.opts.data_replicas, - dio->op.opts.compression != 0)) + !bch2_dio_write_check_allocated(dio)) goto err; task_io_account_write(bio->bi_iter.bi_size); - if (!dio->sync && !dio->loop && dio->iter.count) { - struct iovec *iov = dio->inline_vecs; + if (unlikely(dio->iter.count) && + !dio->sync && + !dio->loop && + bch2_dio_write_copy_iov(dio)) + dio->sync = sync = true; - if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { - iov = 
kmalloc_array(dio->iter.nr_segs, sizeof(*iov), - GFP_KERNEL); - if (unlikely(!iov)) { - dio->sync = sync = true; - goto do_io; - } - - dio->free_iov = true; - } - - memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); - dio->iter.iov = iov; - } -do_io: dio->loop = true; closure_call(&dio->op.cl, bch2_write, NULL, NULL); - if (sync) - wait_for_completion(&dio->done); - else + if (!sync) return -EIOCBQUEUED; -loop: - i_sectors_acct(c, inode, &dio->quota_res, - dio->op.i_sectors_delta); - req->ki_pos += (u64) dio->op.written << 9; - dio->written += dio->op.written; - spin_lock(&inode->v.i_lock); - if (req->ki_pos > inode->v.i_size) - i_size_write(&inode->v, req->ki_pos); - spin_unlock(&inode->v.i_lock); + bch2_dio_write_end(dio); - if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) - bio_for_each_segment_all(bv, bio, iter) - put_page(bv->bv_page); - bio->bi_vcnt = 0; - - if (dio->op.error) { - set_bit(EI_INODE_ERROR, &inode->ei_flags); - break; - } - - if (!dio->iter.count) + if (likely(!dio->iter.count) || dio->op.error) break; bio_reset(bio, NULL, REQ_OP_WRITE); - reinit_completion(&dio->done); } - - ret = dio->op.error ?: ((long) dio->written << 9); +out: + return bch2_dio_write_done(dio); err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); - bch2_quota_reservation_put(c, inode, &dio->quota_res); + dio->op.error = ret; - if (dio->free_iov) - kfree(dio->iter.iov); + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { + struct bvec_iter_all iter; + struct bio_vec *bv; - if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); - bio_put(bio); - - /* inode->i_dio_count is our ref on inode and thus bch_fs */ - inode_dio_end(&inode->v); - - if (ret < 0) - ret = bch2_err_class(ret); - - if (!sync) { - req->ki_complete(req, ret); - ret = -EIOCBQUEUED; } - return ret; + goto out; } static void bch2_dio_write_loop_async(struct bch_write_op *op) { struct dio_write *dio = container_of(op, struct dio_write, op); + struct 
mm_struct *mm = dio->mm; - if (dio->sync) - complete(&dio->done); - else - bch2_dio_write_loop(dio); + bch2_dio_write_end(dio); + + if (likely(!dio->iter.count) || dio->op.error) { + bch2_dio_write_done(dio); + return; + } + + bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); + + if (mm) + kthread_use_mm(mm); + bch2_dio_write_loop(dio); + if (mm) + kthread_unuse_mm(mm); } static noinline @@ -2268,7 +2347,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) goto err; inode_dio_begin(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); + bch2_pagecache_block_get(inode); extending = req->ki_pos + iter->count > inode->v.i_size; if (!extending) { @@ -2282,26 +2361,31 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) GFP_KERNEL, &c->dio_write_bioset); dio = container_of(bio, struct dio_write, op.wbio.bio); - init_completion(&dio->done); dio->req = req; + dio->mapping = mapping; + dio->inode = inode; dio->mm = current->mm; dio->loop = false; dio->sync = is_sync_kiocb(req) || extending; + dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled; dio->free_iov = false; dio->quota_res.sectors = 0; dio->written = 0; dio->iter = *iter; + dio->op.c = c; ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, iter->count >> 9, true); if (unlikely(ret)) goto err_put_bio; - ret = write_invalidate_inode_pages_range(mapping, - req->ki_pos, - req->ki_pos + iter->count - 1); - if (unlikely(ret)) - goto err_put_bio; + if (unlikely(mapping->nrpages)) { + ret = write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter->count - 1); + if (unlikely(ret)) + goto err_put_bio; + } ret = bch2_dio_write_loop(dio); err: @@ -2309,7 +2393,7 @@ err: inode_unlock(&inode->v); return ret; err_put_bio: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); + bch2_pagecache_block_put(inode); bch2_quota_reservation_put(c, inode, &dio->quota_res); bio_put(bio); inode_dio_end(&inode->v); @@ -2613,7 +2697,7 @@ int 
bch2_truncate(struct user_namespace *mnt_userns, } inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); + bch2_pagecache_block_get(inode); ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); if (ret) @@ -2692,7 +2776,7 @@ int bch2_truncate(struct user_namespace *mnt_userns, ret = bch2_setattr_nonsize(mnt_userns, inode, iattr); err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); + bch2_pagecache_block_put(inode); return bch2_err_class(ret); } @@ -3005,8 +3089,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, } ret = bch2_extent_update(&trans, inode_inum(inode), &iter, - &reservation.k_i, - &disk_res, NULL, + &reservation.k_i, &disk_res, 0, &i_sectors_delta, true); if (ret) goto bkey_err; @@ -3105,7 +3188,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, inode_lock(&inode->v); inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); + bch2_pagecache_block_get(inode); ret = file_modified(file); if (ret) @@ -3122,7 +3205,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, else ret = -EOPNOTSUPP; err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); + bch2_pagecache_block_put(inode); inode_unlock(&inode->v); percpu_ref_put(&c->writes); diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 186faa54..4591b75f 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -43,58 +43,6 @@ static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, struct bch_subvolume *); -static void __pagecache_lock_put(struct pagecache_lock *lock, long i) -{ - BUG_ON(atomic_long_read(&lock->v) == 0); - - if (atomic_long_sub_return_release(i, &lock->v) == 0) - wake_up_all(&lock->wait); -} - -static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i) -{ - long v = atomic_long_read(&lock->v), old; - - do { - old = v; - - if (i > 0 ? 
v < 0 : v > 0) - return false; - } while ((v = atomic_long_cmpxchg_acquire(&lock->v, - old, old + i)) != old); - return true; -} - -static void __pagecache_lock_get(struct pagecache_lock *lock, long i) -{ - wait_event(lock->wait, __pagecache_lock_tryget(lock, i)); -} - -void bch2_pagecache_add_put(struct pagecache_lock *lock) -{ - __pagecache_lock_put(lock, 1); -} - -bool bch2_pagecache_add_tryget(struct pagecache_lock *lock) -{ - return __pagecache_lock_tryget(lock, 1); -} - -void bch2_pagecache_add_get(struct pagecache_lock *lock) -{ - __pagecache_lock_get(lock, 1); -} - -void bch2_pagecache_block_put(struct pagecache_lock *lock) -{ - __pagecache_lock_put(lock, -1); -} - -void bch2_pagecache_block_get(struct pagecache_lock *lock) -{ - __pagecache_lock_get(lock, -1); -} - void bch2_inode_update_after_write(struct btree_trans *trans, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, @@ -1409,7 +1357,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) inode_init_once(&inode->v); mutex_init(&inode->ei_update_lock); - pagecache_lock_init(&inode->ei_pagecache_lock); + two_state_lock_init(&inode->ei_pagecache_lock); mutex_init(&inode->ei_quota_lock); return &inode->v; diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index 9f4b57e3..b11a1508 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -6,31 +6,11 @@ #include "opts.h" #include "str_hash.h" #include "quota_types.h" +#include "two_state_shared_lock.h" #include #include -/* - * Two-state lock - can be taken for add or block - both states are shared, - * like read side of rwsem, but conflict with other state: - */ -struct pagecache_lock { - atomic_long_t v; - wait_queue_head_t wait; -}; - -static inline void pagecache_lock_init(struct pagecache_lock *lock) -{ - atomic_long_set(&lock->v, 0); - init_waitqueue_head(&lock->wait); -} - -void bch2_pagecache_add_put(struct pagecache_lock *); -bool bch2_pagecache_add_tryget(struct pagecache_lock *); -void bch2_pagecache_add_get(struct 
pagecache_lock *); -void bch2_pagecache_block_put(struct pagecache_lock *); -void bch2_pagecache_block_get(struct pagecache_lock *); - struct bch_inode_info { struct inode v; unsigned long ei_flags; @@ -39,7 +19,7 @@ struct bch_inode_info { u64 ei_quota_reserved; unsigned long ei_last_dirtied; - struct pagecache_lock ei_pagecache_lock; + two_state_lock_t ei_pagecache_lock; struct mutex ei_quota_lock; struct bch_qid ei_qid; @@ -50,6 +30,13 @@ struct bch_inode_info { struct bch_inode_unpacked ei_inode; }; +#define bch2_pagecache_add_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 0) +#define bch2_pagecache_add_tryget(i) bch2_two_state_trylock(&i->ei_pagecache_lock, 0) +#define bch2_pagecache_add_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 0) + +#define bch2_pagecache_block_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 1) +#define bch2_pagecache_block_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 1) + static inline subvol_inum inode_inum(struct bch_inode_info *inode) { return (subvol_inum) { @@ -96,7 +83,7 @@ do { \ if ((_locks) & INODE_LOCK) \ down_write_nested(&a[i]->v.i_rwsem, i); \ if ((_locks) & INODE_PAGECACHE_BLOCK) \ - bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\ + bch2_pagecache_block_get(a[i]);\ if ((_locks) & INODE_UPDATE_LOCK) \ mutex_lock_nested(&a[i]->ei_update_lock, i);\ } \ @@ -114,7 +101,7 @@ do { \ if ((_locks) & INODE_LOCK) \ up_write(&a[i]->v.i_rwsem); \ if ((_locks) & INODE_PAGECACHE_BLOCK) \ - bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\ + bch2_pagecache_block_put(a[i]);\ if ((_locks) & INODE_UPDATE_LOCK) \ mutex_unlock(&a[i]->ei_update_lock); \ } \ diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 60a14fa1..82caaf51 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -16,6 +16,7 @@ #include "checksum.h" #include "compress.h" #include "clock.h" +#include "data_update.h" #include "debug.h" #include "disk_groups.h" #include "ec.h" @@ -237,12 +238,14 @@ int bch2_extent_update(struct btree_trans *trans, 
struct btree_iter *iter, struct bkey_i *k, struct disk_reservation *disk_res, - u64 *journal_seq, u64 new_i_size, s64 *i_sectors_delta_total, bool check_enospc) { struct btree_iter inode_iter = { NULL }; + struct bkey_s_c inode_k; + struct bkey_s_c_inode_v3 inode; + struct bkey_i_inode_v3 *new_inode; struct bpos next_pos; bool usage_increasing; s64 i_sectors_delta = 0, disk_sectors_delta = 0; @@ -282,59 +285,51 @@ int bch2_extent_update(struct btree_trans *trans, return ret; } - if (new_i_size || i_sectors_delta) { - struct bkey_s_c k; - struct bkey_s_c_inode_v3 inode; - struct bkey_i_inode_v3 *new_inode; - bool i_size_update; + bch2_trans_iter_init(trans, &inode_iter, BTREE_ID_inodes, + SPOS(0, inum.inum, iter->snapshot), + BTREE_ITER_INTENT|BTREE_ITER_CACHED); + inode_k = bch2_btree_iter_peek_slot(&inode_iter); + ret = bkey_err(inode_k); + if (unlikely(ret)) + goto err; - bch2_trans_iter_init(trans, &inode_iter, BTREE_ID_inodes, - SPOS(0, inum.inum, iter->snapshot), - BTREE_ITER_INTENT|BTREE_ITER_CACHED); - k = bch2_btree_iter_peek_slot(&inode_iter); - ret = bkey_err(k); - if (unlikely(ret)) - goto err; + ret = bkey_is_inode(inode_k.k) ? 0 : -ENOENT; + if (unlikely(ret)) + goto err; - ret = bkey_is_inode(k.k) ? 
0 : -ENOENT; - if (unlikely(ret)) - goto err; - - if (unlikely(k.k->type != KEY_TYPE_inode_v3)) { - k = bch2_inode_to_v3(trans, k); - ret = bkey_err(k); - if (unlikely(ret)) - goto err; - } - - inode = bkey_s_c_to_inode_v3(k); - i_size_update = !(le64_to_cpu(inode.v->bi_flags) & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > le64_to_cpu(inode.v->bi_size); - - if (!i_sectors_delta && !i_size_update) - goto no_inode_update; - - new_inode = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - ret = PTR_ERR_OR_ZERO(new_inode); - if (unlikely(ret)) - goto err; - - bkey_reassemble(&new_inode->k_i, k); - - if (i_size_update) - new_inode->v.bi_size = cpu_to_le64(new_i_size); - - le64_add_cpu(&new_inode->v.bi_sectors, i_sectors_delta); - - new_inode->k.p.snapshot = iter->snapshot; - - ret = bch2_trans_update(trans, &inode_iter, &new_inode->k_i, 0); + if (unlikely(inode_k.k->type != KEY_TYPE_inode_v3)) { + inode_k = bch2_inode_to_v3(trans, inode_k); + ret = bkey_err(inode_k); if (unlikely(ret)) goto err; } -no_inode_update: - ret = bch2_trans_update(trans, iter, k, 0) ?: - bch2_trans_commit(trans, disk_res, journal_seq, + + inode = bkey_s_c_to_inode_v3(inode_k); + + new_inode = bch2_trans_kmalloc(trans, bkey_bytes(inode_k.k)); + ret = PTR_ERR_OR_ZERO(new_inode); + if (unlikely(ret)) + goto err; + + bkey_reassemble(&new_inode->k_i, inode.s_c); + + if (!(le64_to_cpu(inode.v->bi_flags) & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > le64_to_cpu(inode.v->bi_size)) + new_inode->v.bi_size = cpu_to_le64(new_i_size); + + le64_add_cpu(&new_inode->v.bi_sectors, i_sectors_delta); + + new_inode->k.p.snapshot = iter->snapshot; + + /* + * Note: + * We always have to do an inode updated - even when i_size/i_sectors + * aren't changing - for fsync to work properly; fsync relies on + * inode->bi_journal_seq which is updated by the trigger code: + */ + ret = bch2_trans_update(trans, &inode_iter, &new_inode->k_i, 0) ?: + bch2_trans_update(trans, iter, k, 0) ?: + bch2_trans_commit(trans, disk_res, NULL, 
BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL); if (unlikely(ret)) @@ -397,8 +392,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_cut_back(end_pos, &delete); ret = bch2_extent_update(trans, inum, iter, &delete, - &disk_res, NULL, - 0, i_sectors_delta, false); + &disk_res, 0, i_sectors_delta, false); bch2_disk_reservation_put(c, &disk_res); } @@ -428,7 +422,7 @@ int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, return ret; } -int bch2_write_index_default(struct bch_write_op *op) +static int bch2_write_index_default(struct bch_write_op *op) { struct bch_fs *c = op->c; struct bkey_buf sk; @@ -465,7 +459,7 @@ int bch2_write_index_default(struct bch_write_op *op) BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ret = bch2_extent_update(&trans, inum, &iter, sk.k, - &op->res, op_journal_seq(op), + &op->res, op->new_i_size, &op->i_sectors_delta, op->flags & BCH_WRITE_CHECK_ENOSPC); bch2_trans_iter_exit(&trans, &iter); @@ -543,29 +537,22 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, } } -static void __bch2_write(struct closure *); +static void __bch2_write(struct bch_write_op *); static void bch2_write_done(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; - if (!op->error && (op->flags & BCH_WRITE_FLUSH)) - op->error = bch2_journal_error(&c->journal); - bch2_disk_reservation_put(c, &op->res); percpu_ref_put(&c->writes); bch2_keylist_free(&op->insert_keys, op->inline_keys); bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); - if (op->end_io) { - EBUG_ON(cl->parent); - closure_debug_destroy(cl); + closure_debug_destroy(cl); + if (op->end_io) op->end_io(op); - } else { - closure_return(cl); - } } static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) @@ -603,7 +590,7 @@ static void __bch2_write_index(struct bch_write_op *op) struct keylist *keys = &op->insert_keys; struct bkey_i *k; unsigned 
dev; - int ret; + int ret = 0; if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { ret = bch2_write_drop_io_error_ptrs(op); @@ -626,7 +613,10 @@ static void __bch2_write_index(struct bch_write_op *op) if (!bch2_keylist_empty(keys)) { u64 sectors_start = keylist_sectors(keys); - int ret = op->index_update_fn(op); + + ret = !(op->flags & BCH_WRITE_MOVE) + ? bch2_write_index_default(op) + : bch2_data_update_index_update(op); BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); BUG_ON(keylist_sectors(keys) && !ret); @@ -636,7 +626,7 @@ static void __bch2_write_index(struct bch_write_op *op) if (ret) { bch_err_inum_ratelimited(c, op->pos.inode, "write error while doing btree update: %s", bch2_err_str(ret)); - op->error = ret; + goto err; } } out: @@ -649,25 +639,45 @@ out: err: keys->top = keys->keys; op->error = ret; + op->flags |= BCH_WRITE_DONE; goto out; } static void bch2_write_index(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bch_fs *c = op->c; + struct write_point *wp = op->wp; + struct workqueue_struct *wq = index_update_wq(op); - __bch2_write_index(op); + barrier(); + op->btree_update_ready = true; + queue_work(wq, &wp->index_update_work); +} - if (!(op->flags & BCH_WRITE_DONE)) { - continue_at(cl, __bch2_write, index_update_wq(op)); - } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { - bch2_journal_flush_seq_async(&c->journal, - *op_journal_seq(op), - cl); - continue_at(cl, bch2_write_done, index_update_wq(op)); - } else { - continue_at_nobarrier(cl, bch2_write_done, NULL); +void bch2_write_point_do_index_updates(struct work_struct *work) +{ + struct write_point *wp = + container_of(work, struct write_point, index_update_work); + struct bch_write_op *op; + + while (1) { + spin_lock(&wp->writes_lock); + op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); + if (op && !op->btree_update_ready) + op = NULL; + if (op) + list_del(&op->wp_list); + spin_unlock(&wp->writes_lock); + 
+ if (!op) + break; + + __bch2_write_index(op); + + if (!(op->flags & BCH_WRITE_DONE)) + __bch2_write(op); + else + bch2_write_done(&op->cl); } } @@ -700,12 +710,12 @@ static void bch2_write_endio(struct bio *bio) if (wbio->put_bio) bio_put(bio); - if (parent) + if (parent) { bio_endio(&parent->bio); - else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT)) - closure_put(cl); - else - continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op)); + return; + } + + closure_put(cl); } static void init_append_extent(struct bch_write_op *op, @@ -1112,19 +1122,18 @@ err: return ret; } -static void __bch2_write(struct closure *cl) +static void __bch2_write(struct bch_write_op *op) { - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; - struct write_point *wp; + struct write_point *wp = NULL; struct bio *bio = NULL; - bool skip_put = true; unsigned nofs_flags; int ret; nofs_flags = memalloc_nofs_save(); again: memset(&op->failed, 0, sizeof(op->failed)); + op->btree_update_ready = false; do { struct bkey_i *key_to_write; @@ -1134,76 +1143,60 @@ again: /* +1 for possible cache device: */ if (op->open_buckets.nr + op->nr_replicas + 1 > ARRAY_SIZE(op->open_buckets.v)) - goto flush_io; + break; if (bch2_keylist_realloc(&op->insert_keys, op->inline_keys, ARRAY_SIZE(op->inline_keys), BKEY_EXTENT_U64s_MAX)) - goto flush_io; + break; /* * The copygc thread is now global, which means it's no longer * freeing up space on specific disks, which means that * allocations for specific disks may hang arbitrarily long: */ - wp = bch2_alloc_sectors_start(c, - op->target, - op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), - op->write_point, - &op->devs_have, - op->nr_replicas, - op->nr_replicas_required, - op->alloc_reserve, - op->flags, - (op->flags & (BCH_WRITE_ALLOC_NOWAIT| - BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? 
NULL : cl); - EBUG_ON(!wp); - - if (IS_ERR(wp)) { - if (unlikely(wp != ERR_PTR(-EAGAIN))) { - ret = PTR_ERR(wp); - goto err; + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_alloc_sectors_start_trans(&trans, + op->target, + op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), + op->write_point, + &op->devs_have, + op->nr_replicas, + op->nr_replicas_required, + op->alloc_reserve, + op->flags, + (op->flags & (BCH_WRITE_ALLOC_NOWAIT| + BCH_WRITE_ONLY_SPECIFIED_DEVS)) + ? NULL : &op->cl, &wp)); + if (unlikely(ret)) { + if (unlikely(ret != -EAGAIN)) { + op->error = ret; + op->flags |= BCH_WRITE_DONE; } - goto flush_io; + break; } - /* - * It's possible for the allocator to fail, put us on the - * freelist waitlist, and then succeed in one of various retry - * paths: if that happens, we need to disable the skip_put - * optimization because otherwise there won't necessarily be a - * barrier before we free the bch_write_op: - */ - if (atomic_read(&cl->remaining) & CLOSURE_WAITING) - skip_put = false; - bch2_open_bucket_get(c, wp, &op->open_buckets); ret = bch2_write_extent(op, wp, &bio); + bch2_alloc_sectors_done(c, wp); - if (ret < 0) - goto err; - - if (ret) { - skip_put = false; - } else { - /* - * for the skip_put optimization this has to be set - * before we submit the bio: - */ + if (ret < 0) { + op->error = ret; op->flags |= BCH_WRITE_DONE; + break; } + if (!ret) + op->flags |= BCH_WRITE_DONE; + bio->bi_end_io = bch2_write_endio; bio->bi_private = &op->cl; bio->bi_opf |= REQ_OP_WRITE; - if (!skip_put) - closure_get(bio->bi_private); - else - op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT; + closure_get(bio->bi_private); key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); @@ -1212,48 +1205,34 @@ again: key_to_write); } while (ret); - if (!skip_put) - continue_at(cl, bch2_write_index, index_update_wq(op)); -out: - memalloc_nofs_restore(nofs_flags); - return; -err: - op->error = ret; - op->flags |= BCH_WRITE_DONE; - - continue_at(cl, bch2_write_index, 
index_update_wq(op)); - goto out; -flush_io: /* - * If the write can't all be submitted at once, we generally want to - * block synchronously as that signals backpressure to the caller. + * Sync or no? * - * However, if we're running out of a workqueue, we can't block here - * because we'll be blocking other work items from completing: + * If we're running asynchronously, wne may still want to block + * synchronously here if we weren't able to submit all of the IO at + * once, as that signals backpressure to the caller. */ - if (current->flags & PF_WQ_WORKER) { - continue_at(cl, bch2_write_index, index_update_wq(op)); - goto out; - } - - closure_sync(cl); - - if (!bch2_keylist_empty(&op->insert_keys)) { + if ((op->flags & BCH_WRITE_SYNC) || !(op->flags & BCH_WRITE_DONE)) { + closure_sync(&op->cl); __bch2_write_index(op); - if (op->error) { - op->flags |= BCH_WRITE_DONE; - continue_at_nobarrier(cl, bch2_write_done, NULL); - goto out; - } + if (!(op->flags & BCH_WRITE_DONE)) + goto again; + bch2_write_done(&op->cl); + } else { + spin_lock(&wp->writes_lock); + op->wp = wp; + list_add_tail(&op->wp_list, &wp->writes); + spin_unlock(&wp->writes_lock); + + continue_at(&op->cl, bch2_write_index, NULL); } - goto again; + memalloc_nofs_restore(nofs_flags); } static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) { - struct closure *cl = &op->cl; struct bio *bio = &op->wbio.bio; struct bvec_iter iter; struct bkey_i_inline_data *id; @@ -1290,8 +1269,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) op->flags |= BCH_WRITE_WROTE_DATA_INLINE; op->flags |= BCH_WRITE_DONE; - continue_at_nobarrier(cl, bch2_write_index, NULL); - return; + __bch2_write_index(op); err: bch2_write_done(&op->cl); } @@ -1319,6 +1297,7 @@ void bch2_write(struct closure *cl) struct bch_fs *c = op->c; unsigned data_len; + EBUG_ON(op->cl.parent); BUG_ON(!op->nr_replicas); BUG_ON(!op->write_point.v); BUG_ON(!bkey_cmp(op->pos, POS_MAX)); @@ -1352,24 
+1331,19 @@ void bch2_write(struct closure *cl) return; } - continue_at_nobarrier(cl, __bch2_write, NULL); + __bch2_write(op); return; err: bch2_disk_reservation_put(c, &op->res); - if (op->end_io) { - EBUG_ON(cl->parent); - closure_debug_destroy(cl); + closure_debug_destroy(&op->cl); + if (op->end_io) op->end_io(op); - } else { - closure_return(cl); - } } /* Cache promotion on read */ struct promote_op { - struct closure cl; struct rcu_head rcu; u64 start_time; @@ -1423,10 +1397,10 @@ static void promote_free(struct bch_fs *c, struct promote_op *op) kfree_rcu(op, rcu); } -static void promote_done(struct closure *cl) +static void promote_done(struct bch_write_op *wop) { struct promote_op *op = - container_of(cl, struct promote_op, cl); + container_of(wop, struct promote_op, write.op); struct bch_fs *c = op->write.op.c; bch2_time_stats_update(&c->times[BCH_TIME_data_promote], @@ -1438,7 +1412,6 @@ static void promote_done(struct closure *cl) static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) { - struct closure *cl = &op->cl; struct bio *bio = &op->write.op.wbio.bio; trace_and_count(op->write.op.c, read_promote, &rbio->bio); @@ -1451,9 +1424,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) sizeof(struct bio_vec) * rbio->bio.bi_vcnt); swap(bio->bi_vcnt, rbio->bio.bi_vcnt); - closure_init(cl, NULL); - bch2_data_update_read_done(&op->write, rbio->pick.crc, cl); - closure_return_with_destructor(cl, promote_done); + bch2_data_update_read_done(&op->write, rbio->pick.crc); } static struct promote_op *__promote_alloc(struct bch_fs *c, @@ -1518,6 +1489,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, }, btree_id, k); BUG_ON(ret); + op->write.op.end_io = promote_done; return op; err: diff --git a/libbcachefs/io.h b/libbcachefs/io.h index 3ae31758..e23ff0ed 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -27,28 +27,20 @@ const char *bch2_blk_status_to_str(blk_status_t); enum bch_write_flags { 
BCH_WRITE_ALLOC_NOWAIT = (1 << 0), BCH_WRITE_CACHED = (1 << 1), - BCH_WRITE_FLUSH = (1 << 2), - BCH_WRITE_DATA_ENCODED = (1 << 3), - BCH_WRITE_PAGES_STABLE = (1 << 4), - BCH_WRITE_PAGES_OWNED = (1 << 5), - BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), - BCH_WRITE_WROTE_DATA_INLINE = (1 << 7), - BCH_WRITE_FROM_INTERNAL = (1 << 8), - BCH_WRITE_CHECK_ENOSPC = (1 << 9), + BCH_WRITE_DATA_ENCODED = (1 << 2), + BCH_WRITE_PAGES_STABLE = (1 << 3), + BCH_WRITE_PAGES_OWNED = (1 << 4), + BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 5), + BCH_WRITE_WROTE_DATA_INLINE = (1 << 6), + BCH_WRITE_CHECK_ENOSPC = (1 << 7), + BCH_WRITE_SYNC = (1 << 8), + BCH_WRITE_MOVE = (1 << 9), /* Internal: */ - BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10), - BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11), - BCH_WRITE_DONE = (1 << 12), - BCH_WRITE_IO_ERROR = (1 << 13), + BCH_WRITE_DONE = (1 << 10), + BCH_WRITE_IO_ERROR = (1 << 11), }; -static inline u64 *op_journal_seq(struct bch_write_op *op) -{ - return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR) - ? 
op->journal_seq_p : &op->journal_seq; -} - static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { return op->alloc_reserve == RESERVE_movinggc @@ -60,14 +52,12 @@ int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, struct bkey_i *, bool *, s64 *, s64 *); int bch2_extent_update(struct btree_trans *, subvol_inum, struct btree_iter *, struct bkey_i *, - struct disk_reservation *, u64 *, u64, s64 *, bool); + struct disk_reservation *, u64, s64 *, bool); int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, subvol_inum, u64, s64 *); int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); -int bch2_write_index_default(struct bch_write_op *); - static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, struct bch_io_opts opts) { @@ -91,14 +81,14 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->version = ZERO_VERSION; op->write_point = (struct write_point_specifier) { 0 }; op->res = (struct disk_reservation) { 0 }; - op->journal_seq = 0; op->new_i_size = U64_MAX; op->i_sectors_delta = 0; - op->index_update_fn = bch2_write_index_default; } void bch2_write(struct closure *); +void bch2_write_point_do_index_updates(struct work_struct *); + static inline struct bch_write_bio *wbio_init(struct bio *bio) { struct bch_write_bio *wbio = to_wbio(bio); diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index 78bff13d..a91635d1 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -117,6 +117,7 @@ struct bch_write_op { unsigned nr_replicas_required:4; unsigned alloc_reserve:3; unsigned incompressible:1; + unsigned btree_update_ready:1; struct bch_devs_list devs_have; u16 target; @@ -132,23 +133,16 @@ struct bch_write_op { struct write_point_specifier write_point; + struct write_point *wp; + struct list_head wp_list; + struct disk_reservation res; struct open_buckets open_buckets; - /* - * If caller wants to flush but hasn't passed us a 
journal_seq ptr, we - * still need to stash the journal_seq somewhere: - */ - union { - u64 *journal_seq_p; - u64 journal_seq; - }; u64 new_i_size; s64 i_sectors_delta; - int (*index_update_fn)(struct bch_write_op *); - struct bch_devs_mask failed; struct keylist insert_keys; diff --git a/libbcachefs/keylist.h b/libbcachefs/keylist.h index 195799bb..635efb7e 100644 --- a/libbcachefs/keylist.h +++ b/libbcachefs/keylist.h @@ -17,7 +17,6 @@ static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) { if (l->keys_p != inline_keys) kfree(l->keys_p); - bch2_keylist_init(l, inline_keys); } static inline void bch2_keylist_push(struct keylist *l) diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 74869204..1d11cf0d 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -53,9 +53,8 @@ struct moving_io { struct bio_vec bi_inline_vecs[0]; }; -static void move_free(struct closure *cl) +static void move_free(struct moving_io *io) { - struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_context *ctxt = io->write.ctxt; struct bch_fs *c = ctxt->c; @@ -65,31 +64,30 @@ static void move_free(struct closure *cl) kfree(io); } -static void move_write_done(struct closure *cl) +static void move_write_done(struct bch_write_op *op) { - struct moving_io *io = container_of(cl, struct moving_io, cl); + struct moving_io *io = container_of(op, struct moving_io, write.op); struct moving_context *ctxt = io->write.ctxt; if (io->write.op.error) ctxt->write_error = true; atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); - closure_return_with_destructor(cl, move_free); + move_free(io); + closure_put(&ctxt->cl); } -static void move_write(struct closure *cl) +static void move_write(struct moving_io *io) { - struct moving_io *io = container_of(cl, struct moving_io, cl); - if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { - closure_return_with_destructor(cl, move_free); + move_free(io); return; } + closure_get(&io->write.ctxt->cl); 
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); - bch2_data_update_read_done(&io->write, io->rbio.pick.crc, cl); - continue_at(cl, move_write_done, NULL); + bch2_data_update_read_done(&io->write, io->rbio.pick.crc); } static inline struct moving_io *next_pending_write(struct moving_context *ctxt) @@ -121,7 +119,7 @@ static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *t while ((io = next_pending_write(ctxt))) { list_del(&io->list); - closure_call(&io->cl, move_write, NULL, &ctxt->cl); + move_write(io); } } @@ -185,7 +183,7 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt, } } -void bch_move_stats_init(struct bch_move_stats *stats, char *name) +void bch2_move_stats_init(struct bch_move_stats *stats, char *name) { memset(stats, 0, sizeof(*stats)); scnprintf(stats->name, sizeof(stats->name), "%s", name); @@ -302,6 +300,7 @@ static int bch2_move_extent(struct btree_trans *trans, goto err_free_pages; io->write.ctxt = ctxt; + io->write.op.end_io = move_write_done; atomic64_inc(&ctxt->stats->keys_moved); atomic64_add(k.k->size, &ctxt->stats->sectors_moved); @@ -956,7 +955,7 @@ int bch2_data_job(struct bch_fs *c, switch (op.op) { case BCH_DATA_OP_REREPLICATE: - bch_move_stats_init(stats, "rereplicate"); + bch2_move_stats_init(stats, "rereplicate"); stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); @@ -980,7 +979,7 @@ int bch2_data_job(struct bch_fs *c, if (op.migrate.dev >= c->sb.nr_devices) return -EINVAL; - bch_move_stats_init(stats, "migrate"); + bch2_move_stats_init(stats, "migrate"); stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); @@ -1001,7 +1000,7 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_REWRITE_OLD_NODES: - bch_move_stats_init(stats, "rewrite_old_nodes"); + bch2_move_stats_init(stats, "rewrite_old_nodes"); ret = bch2_scan_old_btree_nodes(c, stats); break; default: 
diff --git a/libbcachefs/move.h b/libbcachefs/move.h index c0fec69b..b14f679f 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -60,8 +60,7 @@ int bch2_data_job(struct bch_fs *, struct bch_move_stats *, struct bch_ioctl_data); -inline void bch_move_stats_init(struct bch_move_stats *stats, - char *name); +void bch2_move_stats_init(struct bch_move_stats *stats, char *name); #endif /* _BCACHEFS_MOVE_H */ diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 044eca87..63bc692f 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -102,7 +102,7 @@ static int bch2_copygc(struct bch_fs *c) }; int ret = 0; - bch_move_stats_init(&move_stats, "copygc"); + bch2_move_stats_init(&move_stats, "copygc"); for_each_rw_member(ca, c, dev_idx) heap_size += ca->mi.nbuckets >> 7; diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index 17b289b0..4df981bd 100644 --- a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -189,7 +189,7 @@ static int bch2_rebalance_thread(void *arg) prev_start = jiffies; prev_cputime = curr_cputime(); - bch_move_stats_init(&move_stats, "rebalance"); + bch2_move_stats_init(&move_stats, "rebalance"); while (!kthread_wait_freezable(r->enabled)) { cond_resched(); diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 6968f934..fdcd70e8 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -1414,7 +1414,7 @@ use_clean: le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) { struct bch_move_stats stats; - bch_move_stats_init(&stats, "recovery"); + bch2_move_stats_init(&stats, "recovery"); bch_info(c, "scanning for old btree nodes"); ret = bch2_fs_read_write(c); @@ -1486,6 +1486,9 @@ int bch2_fs_initialize(struct bch_fs *c) mutex_unlock(&c->sb_lock); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); + set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); + set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); 
set_bit(BCH_FS_MAY_GO_RW, &c->flags); set_bit(BCH_FS_FSCK_DONE, &c->flags); diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index d5c14bb2..0d4c004d 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -378,7 +378,7 @@ s64 bch2_remap_range(struct bch_fs *c, dst_end.offset - dst_iter.pos.offset)); ret = bch2_extent_update(&trans, dst_inum, &dst_iter, - new_dst.k, &disk_res, NULL, + new_dst.k, &disk_res, new_i_size, i_sectors_delta, true); bch2_disk_reservation_put(c, &disk_res); diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 06b2924c..647d018b 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -184,7 +184,7 @@ read_attribute(io_latency_stats_read); read_attribute(io_latency_stats_write); read_attribute(congested); -read_attribute(btree_avg_write_size); +read_attribute(btree_write_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); @@ -250,14 +250,6 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) return ret; } -static size_t bch2_btree_avg_write_size(struct bch_fs *c) -{ - u64 nr = atomic64_read(&c->btree_writes_nr); - u64 sectors = atomic64_read(&c->btree_writes_sectors); - - return nr ? 
div64_u64(sectors, nr) : 0; -} - static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) { long ret = 0; @@ -396,7 +388,9 @@ SHOW(bch2_fs) sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); - sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c)); + + if (attr == &sysfs_btree_write_stats) + bch2_btree_write_stats_to_text(out, c); sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); @@ -554,7 +548,7 @@ SYSFS_OPS(bch2_fs); struct attribute *bch2_fs_files[] = { &sysfs_minor, &sysfs_btree_cache_size, - &sysfs_btree_avg_write_size, + &sysfs_btree_write_stats, &sysfs_promote_whole_extents, diff --git a/libbcachefs/two_state_shared_lock.c b/libbcachefs/two_state_shared_lock.c new file mode 100644 index 00000000..dc508d54 --- /dev/null +++ b/libbcachefs/two_state_shared_lock.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "two_state_shared_lock.h" + +void bch2_two_state_unlock(two_state_lock_t *lock, int s) +{ + long i = s ? 1 : -1; + + BUG_ON(atomic_long_read(&lock->v) == 0); + + if (atomic_long_sub_return_release(i, &lock->v) == 0) + wake_up_all(&lock->wait); +} + +bool bch2_two_state_trylock(two_state_lock_t *lock, int s) +{ + long i = s ? 1 : -1; + long v = atomic_long_read(&lock->v), old; + + do { + old = v; + + if (i > 0 ? 
v < 0 : v > 0) + return false; + } while ((v = atomic_long_cmpxchg_acquire(&lock->v, + old, old + i)) != old); + return true; +} + +void bch2_two_state_lock(two_state_lock_t *lock, int s) +{ + wait_event(lock->wait, bch2_two_state_trylock(lock, s)); +} diff --git a/libbcachefs/two_state_shared_lock.h b/libbcachefs/two_state_shared_lock.h new file mode 100644 index 00000000..1b4f1089 --- /dev/null +++ b/libbcachefs/two_state_shared_lock.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_TWO_STATE_LOCK_H +#define _BCACHEFS_TWO_STATE_LOCK_H + +#include <linux/atomic.h> +#include <linux/sched.h> +#include <linux/wait.h> + +/* + * Two-state lock - can be taken for add or block - both states are shared, + * like read side of rwsem, but conflict with other state: + */ +typedef struct { + atomic_long_t v; + wait_queue_head_t wait; +} two_state_lock_t; + +static inline void two_state_lock_init(two_state_lock_t *lock) +{ + atomic_long_set(&lock->v, 0); + init_waitqueue_head(&lock->wait); +} + +void bch2_two_state_unlock(two_state_lock_t *, int); +bool bch2_two_state_trylock(two_state_lock_t *, int); +void bch2_two_state_lock(two_state_lock_t *, int); + +#endif /* _BCACHEFS_TWO_STATE_LOCK_H */ diff --git a/linux/mean_and_variance.c b/linux/mean_and_variance.c index 643e3113..aa95db12 100644 --- a/linux/mean_and_variance.c +++ b/linux/mean_and_variance.c @@ -52,7 +52,7 @@ * * note: this rounds towards 0. */ -inline s64 fast_divpow2(s64 n, u8 d) +s64 fast_divpow2(s64 n, u8 d) { return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d; } diff --git a/linux/printbuf_userspace.c b/linux/printbuf_userspace.c index df9567c5..0ae56ee1 100644 --- a/linux/printbuf_userspace.c +++ b/linux/printbuf_userspace.c @@ -27,3 +27,8 @@ void prt_printf(struct printbuf *out, const char *fmt, ...)
prt_vprintf(out, fmt, args); va_end(args); } + +void prt_u64(struct printbuf *out, u64 v) +{ + prt_printf(out, "%llu", v); +} diff --git a/linux/six.c b/linux/six.c index 39f7ea79..39a9bd6e 100644 --- a/linux/six.c +++ b/linux/six.c @@ -342,7 +342,11 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, return true; } -#ifdef CONFIG_LOCK_SPIN_ON_OWNER +/* + * We don't see stable performance with SIX_LOCK_SPIN_ON_OWNER enabled, so it's + * off for now: + */ +#ifdef SIX_LOCK_SPIN_ON_OWNER static inline bool six_optimistic_spin(struct six_lock *lock, struct six_lock_waiter *wait) diff --git a/linux/wait.c b/linux/wait.c index 991875c5..b1f002b9 100644 --- a/linux/wait.c +++ b/linux/wait.c @@ -66,6 +66,11 @@ void wake_up(wait_queue_head_t *q) __wake_up(q, TASK_NORMAL, 1, NULL); } +void wake_up_all(wait_queue_head_t *q) +{ + __wake_up(q, TASK_NORMAL, 0, NULL); +} + static void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) { __wake_up_common(q, mode, nr, 0, NULL);