From cef2f30ae2a25df41704b9b06fc13882d737cc27 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Wed, 15 Aug 2018 19:41:24 -0400
Subject: [PATCH] Update bcachefs sources to 15f6e66e86 bcachefs: pass around bset_tree less
---
 .bcachefs_revision | 2 +-
 cmd_debug.c | 4 +-
 cmd_migrate.c | 4 +-
 include/linux/kernel.h | 4 +
 include/linux/log2.h | 2 +-
 include/linux/sched.h | 1 +
 include/linux/time64.h | 6 +
 include/trace/events/bcachefs.h | 33 +-
 libbcachefs/acl.c | 15 +-
 libbcachefs/alloc.c | 647 +++++++++++----------
 libbcachefs/alloc.h | 4 +-
 libbcachefs/bcachefs.h | 13 +-
 libbcachefs/bcachefs_format.h | 1 +
 libbcachefs/bkey_methods.c | 21 +-
 libbcachefs/bkey_methods.h | 3 +-
 libbcachefs/bset.c | 274 +++++----
 libbcachefs/bset.h | 83 ++-
 libbcachefs/btree_gc.c | 30 +-
 libbcachefs/btree_io.c | 28 +-
 libbcachefs/btree_io.h | 9 -
 libbcachefs/btree_iter.c | 315 ++++++-----
 libbcachefs/btree_iter.h | 17 +-
 libbcachefs/btree_locking.h | 23 +-
 libbcachefs/btree_types.h | 95 ++--
 libbcachefs/btree_update.h | 41 +-
 libbcachefs/btree_update_interior.c | 24 +-
 libbcachefs/btree_update_interior.h | 60 +-
 libbcachefs/btree_update_leaf.c | 167 +++---
 libbcachefs/buckets.c | 261 +++++----
 libbcachefs/buckets.h | 25 +-
 libbcachefs/buckets_types.h | 17 +-
 libbcachefs/chardev.c | 7 +-
 libbcachefs/dirent.c | 24 +-
 libbcachefs/dirent.h | 2 +-
 libbcachefs/extents.c | 833 ++++++++++------------
 libbcachefs/extents.h | 22 +-
 libbcachefs/fifo.h | 12 +-
 libbcachefs/fs-io.c | 752 ++++++++++++-------------
 libbcachefs/fs-ioctl.c | 92 +--
 libbcachefs/fs-ioctl.h | 73 ++-
 libbcachefs/fs.c | 138 +++--
 libbcachefs/fs.h | 12 +-
 libbcachefs/fsck.c | 38 +-
 libbcachefs/inode.c | 50 +-
 libbcachefs/inode.h | 12 +-
 libbcachefs/io.c | 4 +-
 libbcachefs/journal.c | 265 ++++-----
 libbcachefs/journal_io.c | 16 +-
 libbcachefs/journal_reclaim.c | 135 +++--
 libbcachefs/journal_reclaim.h | 7 +-
 libbcachefs/journal_types.h | 9 +-
 libbcachefs/migrate.c | 2 +-
 libbcachefs/move.c | 2 +-
 libbcachefs/movinggc.c | 8 +-
 libbcachefs/opts.h | 5 +-
 libbcachefs/quota.c | 16 +-
 libbcachefs/quota.h | 2 +-
 libbcachefs/recovery.c | 4 +-
 libbcachefs/str_hash.h | 6 +-
 libbcachefs/super.c | 61 +-
 libbcachefs/sysfs.c | 71 +--
 libbcachefs/tests.c | 90 ++-
 libbcachefs/xattr.c | 32 +-
 libbcachefs/xattr.h | 2 +-
 64 files changed, 2400 insertions(+), 2633 deletions(-)

diff --git a/.bcachefs_revision b/.bcachefs_revision
index dddb0443..300e9284 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-eab3b355cf6fcabbf07d7a9032c68e95cab37ad0
+15f6e66e86a97245d967fedcb2f33598c174fd96
diff --git a/cmd_debug.c b/cmd_debug.c
index 11d73b35..51099f1a 100644
--- a/cmd_debug.c
+++ b/cmd_debug.c
@@ -204,9 +204,7 @@ static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id,
 
 		buf[0] = '\t';
 
-		for_each_btree_node_key_unpack(b, k, &node_iter,
-					       btree_node_is_extents(b),
-					       &unpacked) {
+		for_each_btree_node_key_unpack(b, k, &node_iter, &unpacked) {
 			bch2_bkey_val_to_text(c, bkey_type(0, btree_id),
 					      buf + 1, sizeof(buf) - 1, k);
 			puts(buf);
diff --git a/cmd_migrate.c b/cmd_migrate.c
index 44283c3c..177884da 100644
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -121,7 +121,7 @@ static void update_inode(struct bch_fs *c,
 
 	bch2_inode_pack(&packed, inode);
 	ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
-				NULL, NULL, NULL, 0);
+				NULL, NULL, 0);
 	if (ret)
 		die("error creating file: %s", strerror(-ret));
 }
@@ -350,7 +350,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
 				   extent_i_to_s_c(e).s_c);
 
 		ret = bch2_btree_insert(c,
BTREE_ID_EXTENTS, &e->k_i, - &res, NULL, NULL, 0); + &res, NULL, 0); if (ret) die("btree insert error %s", strerror(-ret)); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index a4c8149e..a281edcf 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -101,6 +101,10 @@ #define swap(a, b) \ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) +/* This counts to 12. Any more, it will return 13th argument. */ +#define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n +#define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) + #define _RET_IP_ (unsigned long)__builtin_return_address(0) #define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) diff --git a/include/linux/log2.h b/include/linux/log2.h index 96f62458..2bbe25e4 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -23,7 +23,7 @@ /* * deal with unrepresentable constant logarithms */ -extern __attribute__((const, noreturn)) +extern __attribute__((const)) int ____ilog2_NaN(void); /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 38a5fecb..f9bb6a4d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -146,6 +146,7 @@ static inline struct timespec current_kernel_time(void) return ts; } +#define current_kernel_time64() current_kernel_time() #define CURRENT_TIME (current_kernel_time()) #endif /* __TOOLS_LINUX_SCHED_H */ diff --git a/include/linux/time64.h b/include/linux/time64.h index fd59a9a6..cd6cc1c1 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -3,6 +3,8 @@ #include +#define timespec64 timespec + typedef __s64 time64_t; /* Parameters used to convert the timespec values: */ @@ -42,4 +44,8 @@ static inline struct timespec timespec_trunc(struct timespec t, unsigned gran) return t; } +#define ns_to_timespec64 ns_to_timespec +#define timespec64_to_ns timespec_to_ns +#define timespec64_trunc timespec_trunc + #endif /* _LINUX_TIME64_H */ diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index 13264b82..73be8873 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -7,7 +7,7 @@ #include DECLARE_EVENT_CLASS(bpos, - TP_PROTO(struct bpos p), + TP_PROTO(struct bpos *p), TP_ARGS(p), TP_STRUCT__entry( @@ -16,8 +16,8 @@ DECLARE_EVENT_CLASS(bpos, ), TP_fast_assign( - __entry->inode = p.inode; - __entry->offset = p.offset; + __entry->inode = p->inode; + __entry->offset = p->offset; ), TP_printk("%llu:%llu", __entry->inode, __entry->offset) @@ -43,21 +43,6 @@ DECLARE_EVENT_CLASS(bkey, __entry->offset, __entry->size) ); -DECLARE_EVENT_CLASS(bch_dev, - TP_PROTO(struct bch_dev *ca), - TP_ARGS(ca), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, ca->uuid.b, 16); - ), - - TP_printk("%pU", __entry->uuid) -); - DECLARE_EVENT_CLASS(bch_fs, TP_PROTO(struct bch_fs *c), TP_ARGS(c), @@ -138,7 +123,7 @@ DEFINE_EVENT(bio, journal_write, /* bset.c: */ DEFINE_EVENT(bpos, bkey_pack_pos_fail, - TP_PROTO(struct bpos p), + TP_PROTO(struct bpos *p), TP_ARGS(p) ); @@ -360,16 +345,6 @@ DEFINE_EVENT(bch_fs, gc_coalesce_end, TP_ARGS(c) ); -DEFINE_EVENT(bch_dev, sectors_saturated, - TP_PROTO(struct bch_dev *ca), - TP_ARGS(ca) -); - -DEFINE_EVENT(bch_fs, gc_sectors_saturated, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, TP_PROTO(struct bch_fs *c), TP_ARGS(c) diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c index 534ea94e..5dd666ec 100644 --- 
a/libbcachefs/acl.c +++ b/libbcachefs/acl.c @@ -284,10 +284,9 @@ static int inode_update_for_set_acl_fn(struct bch_inode_info *inode, void *p) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct timespec now = current_time(&inode->v); umode_t mode = (unsigned long) p; - bi->bi_ctime = timespec_to_bch2_time(c, now); + bi->bi_ctime = bch2_current_time(c); bi->bi_mode = mode; return 0; } @@ -301,13 +300,14 @@ int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type) umode_t mode = inode->v.i_mode; int ret; + mutex_lock(&inode->ei_update_lock); + bch2_trans_init(&trans, c); + if (type == ACL_TYPE_ACCESS && acl) { ret = posix_acl_update_mode(&inode->v, &mode, &acl); if (ret) - return ret; + goto err; } - - bch2_trans_init(&trans, c); retry: bch2_trans_begin(&trans); @@ -318,7 +318,7 @@ retry: bch2_write_inode_trans(&trans, inode, &inode_u, inode_update_for_set_acl_fn, (void *)(unsigned long) mode) ?: - bch2_trans_commit(&trans, NULL, NULL, + bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); @@ -333,6 +333,7 @@ retry: set_cached_acl(&inode->v, type, acl); err: bch2_trans_exit(&trans); + mutex_unlock(&inode->ei_update_lock); return ret; } @@ -372,7 +373,7 @@ int bch2_acl_chmod(struct btree_trans *trans, goto err; } - bch2_trans_update(trans, iter, &new->k_i, 0); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new->k_i)); *new_acl = acl; acl = NULL; err: diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c index ac2c7d1f..3f43a1be 100644 --- a/libbcachefs/alloc.c +++ b/libbcachefs/alloc.c @@ -154,8 +154,8 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } -void bch2_alloc_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +int bch2_alloc_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { buf[0] = '\0'; @@ -163,6 +163,8 @@ void bch2_alloc_to_text(struct bch_fs *c, char *buf, case BCH_ALLOC: break; } + + return 0; } static inline unsigned get_alloc_field(const u8 **p, unsigned bytes) @@ -288,53 +290,41 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, size_t b, struct btree_iter *iter, - u64 *journal_seq, bool nowait) + u64 *journal_seq, unsigned flags) { struct bucket_mark m; __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key; struct bucket *g; struct bkey_i_alloc *a; u8 *d; - int ret; - unsigned flags = BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE; - if (nowait) - flags |= BTREE_INSERT_NOWAIT; + percpu_down_read_preempt_disable(&c->usage_lock); + g = bucket(ca, b); - bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); + m = READ_ONCE(g->mark); + a = bkey_alloc_init(&alloc_key.k); + a->k.p = POS(ca->dev_idx, b); + a->v.fields = 0; + a->v.gen = m.gen; + set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v)); - do { - ret = btree_iter_err(bch2_btree_iter_peek_slot(iter)); - if (ret) - break; + d = a->v.data; + if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) + put_alloc_field(&d, 2, g->io_time[READ]); + if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) + put_alloc_field(&d, 2, g->io_time[WRITE]); + percpu_up_read_preempt_enable(&c->usage_lock); - percpu_down_read_preempt_disable(&c->usage_lock); - g = bucket(ca, b); + bch2_btree_iter_cond_resched(iter); - /* read mark under btree node lock: */ - m = READ_ONCE(g->mark); - a = bkey_alloc_init(&alloc_key.k); - a->k.p 
= iter->pos; - a->v.fields = 0; - a->v.gen = m.gen; - set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v)); + bch2_btree_iter_set_pos(iter, a->k.p); - d = a->v.data; - if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) - put_alloc_field(&d, 2, g->io_time[READ]); - if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) - put_alloc_field(&d, 2, g->io_time[WRITE]); - percpu_up_read_preempt_enable(&c->usage_lock); - - ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, flags, - BTREE_INSERT_ENTRY(iter, &a->k_i)); - bch2_btree_iter_cond_resched(iter); - } while (ret == -EINTR); - - return ret; + return bch2_btree_insert_at(c, NULL, journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE| + flags, + BTREE_INSERT_ENTRY(iter, &a->k_i)); } int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos) @@ -354,8 +344,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos) bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, - NULL, false); + ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL, 0); bch2_btree_iter_unlock(&iter); return ret; } @@ -375,8 +364,8 @@ int bch2_alloc_write(struct bch_fs *c) down_read(&ca->bucket_lock); for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) { - ret = __bch2_alloc_write_key(c, ca, bucket, &iter, - NULL, false); + ret = __bch2_alloc_write_key(c, ca, bucket, + &iter, NULL, 0); if (ret) break; @@ -582,47 +571,6 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, return gc_gen < BUCKET_GC_GEN_MAX; } -static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t bucket) -{ - struct bucket_mark m; - - percpu_down_read_preempt_disable(&c->usage_lock); - spin_lock(&c->freelist_lock); - - if (!bch2_invalidate_bucket(c, ca, bucket, &m)) { - spin_unlock(&c->freelist_lock); - percpu_up_read_preempt_enable(&c->usage_lock); - return; - } - - verify_not_on_freelist(c, ca, bucket); - BUG_ON(!fifo_push(&ca->free_inc, bucket)); - - spin_unlock(&c->freelist_lock); - percpu_up_read_preempt_enable(&c->usage_lock); - - /* gc lock held: */ - bucket_io_clock_reset(c, ca, bucket, READ); - bucket_io_clock_reset(c, ca, bucket, WRITE); - - if (m.cached_sectors) { - ca->allocator_invalidating_data = true; - } else if (m.journal_seq_valid) { - u64 journal_seq = atomic64_read(&c->journal.seq); - u64 bucket_seq = journal_seq; - - bucket_seq &= ~((u64) U16_MAX); - bucket_seq |= m.journal_seq; - - if (bucket_seq > journal_seq) - bucket_seq -= 1 << 16; - - ca->allocator_journal_seq_flush = - max(ca->allocator_journal_seq_flush, bucket_seq); - } -} - /* * Determines what order we're going to reuse buckets, smallest bucket_key() * first. 
@@ -674,11 +622,18 @@ static inline int bucket_alloc_cmp(alloc_heap *h, (l.bucket > r.bucket) - (l.bucket < r.bucket); } +static inline int bucket_idx_cmp(const void *_l, const void *_r) +{ + const struct alloc_heap_entry *l = _l, *r = _r; + + return (l->bucket > r->bucket) - (l->bucket < r->bucket); +} + static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets; struct alloc_heap_entry e = { 0 }; - size_t b; + size_t b, i, nr = 0; ca->alloc_heap.used = 0; @@ -720,55 +675,58 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) if (e.nr) heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); + for (i = 0; i < ca->alloc_heap.used; i++) + nr += ca->alloc_heap.data[i].nr; + + while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { + nr -= ca->alloc_heap.data[0].nr; + heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp); + } + up_read(&ca->bucket_lock); mutex_unlock(&c->bucket_clock[READ].lock); - - heap_resort(&ca->alloc_heap, bucket_alloc_cmp); - - while (heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp)) { - for (b = e.bucket; - b < e.bucket + e.nr; - b++) { - if (fifo_full(&ca->free_inc)) - return; - - bch2_invalidate_one_bucket(c, ca, b); - } - } } static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets = bucket_array(ca); struct bucket_mark m; - size_t b, checked; + size_t b, start; - for (checked = 0; - checked < ca->mi.nbuckets && !fifo_full(&ca->free_inc); - checked++) { - if (ca->fifo_last_bucket < ca->mi.first_bucket || - ca->fifo_last_bucket >= ca->mi.nbuckets) + if (ca->fifo_last_bucket < ca->mi.first_bucket || + ca->fifo_last_bucket >= ca->mi.nbuckets) + ca->fifo_last_bucket = ca->mi.first_bucket; + + start = ca->fifo_last_bucket; + + do { + ca->fifo_last_bucket++; + if (ca->fifo_last_bucket == ca->mi.nbuckets) ca->fifo_last_bucket = ca->mi.first_bucket; - b = ca->fifo_last_bucket++; - + b = ca->fifo_last_bucket; m = READ_ONCE(buckets->b[b].mark); - if (bch2_can_invalidate_bucket(ca, b, m)) - bch2_invalidate_one_bucket(c, ca, b); + if (bch2_can_invalidate_bucket(ca, b, m)) { + struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; + + heap_add(&ca->alloc_heap, e, bucket_alloc_cmp); + if (heap_full(&ca->alloc_heap)) + break; + } cond_resched(); - } + } while (ca->fifo_last_bucket != start); } static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets = bucket_array(ca); struct bucket_mark m; - size_t checked; + size_t checked, i; for (checked = 0; - checked < ca->mi.nbuckets / 2 && !fifo_full(&ca->free_inc); + checked < ca->mi.nbuckets / 2; checked++) { size_t b = bch2_rand_range(ca->mi.nbuckets - ca->mi.first_bucket) + @@ -776,17 +734,34 @@ static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca m = READ_ONCE(buckets->b[b].mark); - if (bch2_can_invalidate_bucket(ca, b, m)) - bch2_invalidate_one_bucket(c, ca, b); + if (bch2_can_invalidate_bucket(ca, b, m)) { + struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; + + heap_add(&ca->alloc_heap, e, bucket_alloc_cmp); + if (heap_full(&ca->alloc_heap)) + break; + } cond_resched(); } + + sort(ca->alloc_heap.data, + ca->alloc_heap.used, + sizeof(ca->alloc_heap.data[0]), + bucket_idx_cmp, NULL); + + /* remove duplicates: */ + for (i = 0; i + 1 < ca->alloc_heap.used; i++) + if (ca->alloc_heap.data[i].bucket == + ca->alloc_heap.data[i + 1].bucket) + ca->alloc_heap.data[i].nr = 0; } -static void 
find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) +static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) { + size_t i, nr = 0; + ca->inc_gen_needs_gc = 0; - ca->inc_gen_really_needs_gc = 0; switch (ca->mi.replacement) { case CACHE_REPLACEMENT_LRU: @@ -799,86 +774,132 @@ static void find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) find_reclaimable_buckets_random(c, ca); break; } + + heap_resort(&ca->alloc_heap, bucket_alloc_cmp); + + for (i = 0; i < ca->alloc_heap.used; i++) + nr += ca->alloc_heap.data[i].nr; + + return nr; } -static int size_t_cmp(const void *_l, const void *_r) +static inline long next_alloc_bucket(struct bch_dev *ca) { - const size_t *l = _l, *r = _r; + struct alloc_heap_entry e, *top = ca->alloc_heap.data; - return (*l > *r) - (*l < *r); + while (ca->alloc_heap.used) { + if (top->nr) { + size_t b = top->bucket; + + top->bucket++; + top->nr--; + return b; + } + + heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp); + } + + return -1; } -static void sort_free_inc(struct bch_fs *c, struct bch_dev *ca) +static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t bucket, u64 *flush_seq) { - BUG_ON(ca->free_inc.front); + struct bucket_mark m; + percpu_down_read_preempt_disable(&c->usage_lock); spin_lock(&c->freelist_lock); - sort(ca->free_inc.data, - ca->free_inc.back, - sizeof(ca->free_inc.data[0]), - size_t_cmp, NULL); + + bch2_invalidate_bucket(c, ca, bucket, &m); + + verify_not_on_freelist(c, ca, bucket); + BUG_ON(!fifo_push(&ca->free_inc, bucket)); + spin_unlock(&c->freelist_lock); + + bucket_io_clock_reset(c, ca, bucket, READ); + bucket_io_clock_reset(c, ca, bucket, WRITE); + + percpu_up_read_preempt_enable(&c->usage_lock); + + if (m.journal_seq_valid) { + u64 journal_seq = atomic64_read(&c->journal.seq); + u64 bucket_seq = journal_seq; + + bucket_seq &= ~((u64) U16_MAX); + bucket_seq |= m.journal_seq; + + if (bucket_seq > journal_seq) + bucket_seq -= 1 << 16; + + *flush_seq = max(*flush_seq, bucket_seq); + } + + return m.cached_sectors != 0; } -static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca, - u64 *journal_seq, size_t nr, - bool nowait) +/* + * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: + */ +static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) { struct btree_iter iter; + u64 journal_seq = 0; int ret = 0; + long b; bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); /* Only use nowait if we've already invalidated at least one bucket: */ - while (ca->nr_invalidated < min(nr, fifo_used(&ca->free_inc))) { - size_t b = fifo_idx_entry(&ca->free_inc, ca->nr_invalidated); + while (!ret && + !fifo_full(&ca->free_inc) && + (b = next_alloc_bucket(ca)) >= 0) { + bool must_flush = + bch2_invalidate_one_bucket(c, ca, b, &journal_seq); - ret = __bch2_alloc_write_key(c, ca, b, &iter, journal_seq, - nowait && ca->nr_invalidated); - if (ret) - break; - - ca->nr_invalidated++; + ret = __bch2_alloc_write_key(c, ca, b, &iter, + must_flush ? &journal_seq : NULL, + !fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0); } bch2_btree_iter_unlock(&iter); /* If we used NOWAIT, don't return the error: */ - return ca->nr_invalidated ? 
0 : ret; -} + if (!fifo_empty(&ca->free_inc)) + ret = 0; + if (ret) { + bch_err(ca, "error invalidating buckets: %i", ret); + return ret; + } -static bool __push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) -{ - unsigned i; + if (journal_seq) + ret = bch2_journal_flush_seq(&c->journal, journal_seq); + if (ret) { + bch_err(ca, "journal error: %i", ret); + return ret; + } - /* - * Don't remove from free_inc until after it's added to - * freelist, so gc can find it: - */ - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) - if (fifo_push(&ca->free[i], bucket)) { - fifo_pop(&ca->free_inc, bucket); - --ca->nr_invalidated; - closure_wake_up(&c->freelist_wait); - spin_unlock(&c->freelist_lock); - return true; - } - spin_unlock(&c->freelist_lock); - - return false; + return 0; } static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) { + unsigned i; int ret = 0; while (1) { set_current_state(TASK_INTERRUPTIBLE); - if (__push_invalidated_bucket(c, ca, bucket)) - break; + spin_lock(&c->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) + if (fifo_push(&ca->free[i], bucket)) { + fifo_pop(&ca->free_inc, bucket); + closure_wake_up(&c->freelist_wait); + spin_unlock(&c->freelist_lock); + goto out; + } + spin_unlock(&c->freelist_lock); if ((current->flags & PF_KTHREAD) && kthread_should_stop()) { @@ -889,22 +910,20 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t schedule(); try_to_freeze(); } - +out: __set_current_state(TASK_RUNNING); return ret; } /* - * Given an invalidated, ready to use bucket: issue a discard to it if enabled, - * then add it to the freelist, waiting until there's room if necessary: + * Pulls buckets off free_inc, discards them (if enabled), then adds them to + * freelists, waiting until there's room if necessary: */ static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) { - while (ca->nr_invalidated) { + while (!fifo_empty(&ca->free_inc)) { size_t bucket = fifo_peek(&ca->free_inc); - BUG_ON(fifo_empty(&ca->free_inc) || !ca->nr_invalidated); - if (ca->mi.discard && blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) blkdev_issue_discard(ca->disk_sb.bdev, @@ -930,68 +949,37 @@ static int bch2_allocator_thread(void *arg) { struct bch_dev *ca = arg; struct bch_fs *c = ca->fs; - u64 journal_seq; + size_t nr; int ret; set_freezable(); while (1) { - while (1) { - cond_resched(); + cond_resched(); - pr_debug("discarding %zu invalidated buckets", - ca->nr_invalidated); + pr_debug("discarding %zu invalidated buckets", + fifo_used(&ca->free_inc)); - ret = discard_invalidated_buckets(c, ca); - if (ret) - goto stop; + ret = discard_invalidated_buckets(c, ca); + if (ret) + goto stop; - if (fifo_empty(&ca->free_inc)) - break; + down_read(&c->gc_lock); - pr_debug("invalidating %zu buckets", - fifo_used(&ca->free_inc)); + ret = bch2_invalidate_buckets(c, ca); + if (ret) { + up_read(&c->gc_lock); + goto stop; + } - journal_seq = 0; - ret = bch2_invalidate_free_inc(c, ca, &journal_seq, - SIZE_MAX, true); - if (ret) { - bch_err(ca, "error invalidating buckets: %i", ret); - goto stop; - } - - if (!ca->nr_invalidated) { - bch_err(ca, "allocator thread unable to make forward progress!"); - goto stop; - } - - if (ca->allocator_invalidating_data) - ret = bch2_journal_flush_seq(&c->journal, journal_seq); - else if (ca->allocator_journal_seq_flush) - ret = bch2_journal_flush_seq(&c->journal, - ca->allocator_journal_seq_flush); - - /* - * journal error - buckets haven't actually 
been - * invalidated, can't discard them: - */ - if (ret) { - bch_err(ca, "journal error: %i", ret); - goto stop; - } + if (!fifo_empty(&ca->free_inc)) { + up_read(&c->gc_lock); + continue; } pr_debug("free_inc now empty"); - /* Reset front/back so we can easily sort fifo entries later: */ - ca->free_inc.front = ca->free_inc.back = 0; - ca->allocator_journal_seq_flush = 0; - ca->allocator_invalidating_data = false; - - down_read(&c->gc_lock); - while (1) { - size_t prev = fifo_used(&ca->free_inc); - + do { if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) { up_read(&c->gc_lock); bch_err(ca, "gc failure"); @@ -1007,56 +995,46 @@ static int bch2_allocator_thread(void *arg) pr_debug("scanning for reclaimable buckets"); - find_reclaimable_buckets(c, ca); + nr = find_reclaimable_buckets(c, ca); - pr_debug("found %zu buckets (free_inc %zu/%zu)", - fifo_used(&ca->free_inc) - prev, - fifo_used(&ca->free_inc), ca->free_inc.size); + pr_debug("found %zu buckets", nr); - trace_alloc_batch(ca, fifo_used(&ca->free_inc), - ca->free_inc.size); + trace_alloc_batch(ca, nr, ca->alloc_heap.size); - if ((ca->inc_gen_needs_gc >= ca->free_inc.size || - (!fifo_full(&ca->free_inc) && - ca->inc_gen_really_needs_gc >= - fifo_free(&ca->free_inc))) && + if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || + ca->inc_gen_really_needs_gc) && c->gc_thread) { atomic_inc(&c->kick_gc); wake_up_process(c->gc_thread); } - if (fifo_full(&ca->free_inc)) - break; - - if (!fifo_empty(&ca->free_inc) && - !fifo_full(&ca->free[RESERVE_MOVINGGC])) - break; - /* - * copygc may be waiting until either its reserve fills - * up, or we can't make forward progress: + * If we found any buckets, we have to invalidate them + * before we scan for more - but if we didn't find very + * many we may want to wait on more buckets being + * available so we don't spin: */ - ca->allocator_blocked = true; - closure_wake_up(&c->freelist_wait); + if (!nr || + (nr < ALLOC_SCAN_BATCH(ca) && + !fifo_full(&ca->free[RESERVE_MOVINGGC]))) { + ca->allocator_blocked = true; + closure_wake_up(&c->freelist_wait); - ret = wait_buckets_available(c, ca); - if (ret) { - up_read(&c->gc_lock); - goto stop; + ret = wait_buckets_available(c, ca); + if (ret) { + up_read(&c->gc_lock); + goto stop; + } } - } + } while (!nr); ca->allocator_blocked = false; up_read(&c->gc_lock); - pr_debug("free_inc now %zu/%zu", - fifo_used(&ca->free_inc), - ca->free_inc.size); - - sort_free_inc(c, ca); + pr_debug("%zu buckets to invalidate", nr); /* - * free_inc is now full of newly-invalidated buckets: next, + * alloc_heap is now full of newly-invalidated buckets: next, * write out the new bucket gens: */ } @@ -1733,7 +1711,7 @@ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) void bch2_recalc_capacity(struct bch_fs *c) { struct bch_dev *ca; - u64 total_capacity, capacity = 0, reserved_sectors = 0; + u64 capacity = 0, reserved_sectors = 0, gc_reserve; unsigned long ra_pages = 0; unsigned i, j; @@ -1748,7 +1726,7 @@ void bch2_recalc_capacity(struct bch_fs *c) bch2_set_ra_pages(c, ra_pages); for_each_rw_member(ca, c, i) { - size_t reserve = 0; + u64 dev_reserve = 0; /* * We need to reserve buckets (from the number @@ -1767,30 +1745,36 @@ void bch2_recalc_capacity(struct bch_fs *c) * not -ENOSPC calculations. 
*/ for (j = 0; j < RESERVE_NONE; j++) - reserve += ca->free[j].size; + dev_reserve += ca->free[j].size; - reserve += ca->free_inc.size; + dev_reserve += ca->free_inc.size; - reserve += ARRAY_SIZE(c->write_points); + dev_reserve += ARRAY_SIZE(c->write_points); - reserve += 1; /* btree write point */ + dev_reserve += 1; /* btree write point */ + dev_reserve += 1; /* copygc write point */ + dev_reserve += 1; /* rebalance write point */ + dev_reserve += WRITE_POINT_COUNT; - reserved_sectors += bucket_to_sector(ca, reserve); + dev_reserve *= ca->mi.bucket_size; + + ca->copygc_threshold = dev_reserve; capacity += bucket_to_sector(ca, ca->mi.nbuckets - ca->mi.first_bucket); + + reserved_sectors += dev_reserve * 2; } - total_capacity = capacity; + gc_reserve = c->opts.gc_reserve_bytes + ? c->opts.gc_reserve_bytes >> 9 + : div64_u64(capacity * c->opts.gc_reserve_percent, 100); - capacity *= (100 - c->opts.gc_reserve_percent); - capacity = div64_u64(capacity, 100); + reserved_sectors = max(gc_reserve, reserved_sectors); - BUG_ON(reserved_sectors > total_capacity); + reserved_sectors = min(reserved_sectors, capacity); - capacity = min(capacity, total_capacity - reserved_sectors); - - c->capacity = capacity; + c->capacity = capacity - reserved_sectors; if (c->capacity) { bch2_io_timer_add(&c->io_clock[READ], @@ -1946,39 +1930,83 @@ int bch2_dev_allocator_start(struct bch_dev *ca) return 0; } +static void flush_held_btree_writes(struct bch_fs *c) +{ + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + bool flush_updates; + size_t i, nr_pending_updates; + + clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); +again: + pr_debug("flushing dirty btree nodes"); + cond_resched(); + + flush_updates = false; + nr_pending_updates = bch2_btree_interior_updates_nr_pending(c); + + rcu_read_lock(); + for_each_cached_btree(b, c, tbl, i, pos) + if (btree_node_dirty(b) && (!b->written || b->level)) { + if (btree_node_may_write(b)) { + rcu_read_unlock(); + btree_node_lock_type(c, b, SIX_LOCK_read); + bch2_btree_node_write(c, b, SIX_LOCK_read); + six_unlock_read(&b->lock); + goto again; + } else { + flush_updates = true; + } + } + rcu_read_unlock(); + + if (c->btree_roots_dirty) + bch2_journal_meta(&c->journal); + + /* + * This is ugly, but it's needed to flush btree node writes + * without spinning... 
+ */ + if (flush_updates) { + closure_wait_event(&c->btree_interior_update_wait, + bch2_btree_interior_updates_nr_pending(c) < + nr_pending_updates); + goto again; + } + +} + static void allocator_start_issue_discards(struct bch_fs *c) { struct bch_dev *ca; unsigned dev_iter; - size_t i, bu; - - for_each_rw_member(ca, c, dev_iter) { - unsigned done = 0; - - fifo_for_each_entry(bu, &ca->free_inc, i) { - if (done == ca->nr_invalidated) - break; + size_t bu; + for_each_rw_member(ca, c, dev_iter) + while (fifo_pop(&ca->free_inc, bu)) blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, bu), ca->mi.bucket_size, GFP_NOIO, 0); - done++; - } - } } static int __bch2_fs_allocator_start(struct bch_fs *c) { struct bch_dev *ca; - size_t bu, i; unsigned dev_iter; u64 journal_seq = 0; + long bu; bool invalidating_data = false; int ret = 0; if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) return -1; + if (test_alloc_startup(c)) { + invalidating_data = true; + goto not_enough; + } + /* Scan for buckets that are already invalidated: */ for_each_rw_member(ca, c, dev_iter) { struct btree_iter iter; @@ -2003,7 +2031,6 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) percpu_up_read_preempt_enable(&c->usage_lock); fifo_push(&ca->free_inc, bu); - ca->nr_invalidated++; if (fifo_full(&ca->free_inc)) break; @@ -2022,24 +2049,23 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) not_enough: pr_debug("did not find enough empty buckets; issuing discards"); - /* clear out free_inc - find_reclaimable_buckets() assumes it's empty */ + /* clear out free_inc, we'll be using it again below: */ for_each_rw_member(ca, c, dev_iter) discard_invalidated_buckets(c, ca); pr_debug("scanning for reclaimable buckets"); for_each_rw_member(ca, c, dev_iter) { - BUG_ON(!fifo_empty(&ca->free_inc)); - ca->free_inc.front = ca->free_inc.back = 0; - find_reclaimable_buckets(c, ca); - sort_free_inc(c, ca); - invalidating_data |= ca->allocator_invalidating_data; + while (!fifo_full(&ca->free[RESERVE_BTREE]) && + (bu = next_alloc_bucket(ca)) >= 0) { + invalidating_data |= + bch2_invalidate_one_bucket(c, ca, bu, &journal_seq); - fifo_for_each_entry(bu, &ca->free_inc, i) - if (!fifo_push(&ca->free[RESERVE_BTREE], bu)) - break; + fifo_push(&ca->free[RESERVE_BTREE], bu); + set_bit(bu, ca->buckets_dirty); + } } pr_debug("done scanning for reclaimable buckets"); @@ -2054,6 +2080,8 @@ not_enough: * invalidated on disk: */ if (invalidating_data) { + BUG(); + pr_info("holding writes"); pr_debug("invalidating existing data"); set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); } else { @@ -2065,16 +2093,9 @@ not_enough: * XXX: it's possible for this to deadlock waiting on journal reclaim, * since we're holding btree writes. What then? 
*/ - - for_each_rw_member(ca, c, dev_iter) { - ret = bch2_invalidate_free_inc(c, ca, &journal_seq, - ca->free[RESERVE_BTREE].size, - false); - if (ret) { - percpu_ref_put(&ca->io_ref); - return ret; - } - } + ret = bch2_alloc_write(c); + if (ret) + return ret; if (invalidating_data) { pr_debug("flushing journal"); @@ -2087,57 +2108,11 @@ not_enough: allocator_start_issue_discards(c); } - for_each_rw_member(ca, c, dev_iter) - while (ca->nr_invalidated) { - BUG_ON(!fifo_pop(&ca->free_inc, bu)); - ca->nr_invalidated--; - } - set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags); /* now flush dirty btree nodes: */ - if (invalidating_data) { - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - bool flush_updates; - size_t nr_pending_updates; - - clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); -again: - pr_debug("flushing dirty btree nodes"); - cond_resched(); - - flush_updates = false; - nr_pending_updates = bch2_btree_interior_updates_nr_pending(c); - - - rcu_read_lock(); - for_each_cached_btree(b, c, tbl, i, pos) - if (btree_node_dirty(b) && (!b->written || b->level)) { - if (btree_node_may_write(b)) { - rcu_read_unlock(); - btree_node_lock_type(c, b, SIX_LOCK_read); - bch2_btree_node_write(c, b, SIX_LOCK_read); - six_unlock_read(&b->lock); - goto again; - } else { - flush_updates = true; - } - } - rcu_read_unlock(); - - /* - * This is ugly, but it's needed to flush btree node writes - * without spinning... - */ - if (flush_updates) { - closure_wait_event(&c->btree_interior_update_wait, - bch2_btree_interior_updates_nr_pending(c) < - nr_pending_updates); - goto again; - } - } + if (invalidating_data) + flush_held_btree_writes(c); return 0; } diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h index 00d01f46..739df233 100644 --- a/libbcachefs/alloc.h +++ b/libbcachefs/alloc.h @@ -9,8 +9,10 @@ struct bch_dev; struct bch_fs; struct bch_devs_List; +#define ALLOC_SCAN_BATCH(ca) ((ca)->mi.nbuckets >> 9) + const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +int bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); #define bch2_bkey_alloc_ops (struct bkey_ops) { \ .key_invalid = bch2_alloc_invalid, \ diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index bd5ea6fc..92727cca 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -267,6 +267,10 @@ do { \ "Store the journal sequence number in the version " \ "number of every btree key, and verify that btree " \ "update ordering is preserved during recovery") \ + BCH_DEBUG_PARAM(test_alloc_startup, \ + "Force allocator startup to use the slowpath where it" \ + "can't find enough free buckets without invalidating" \ + "cached data") #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() @@ -400,7 +404,6 @@ struct bch_dev { alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; spinlock_t freelist_lock; - size_t nr_invalidated; u8 open_buckets_partial[OPEN_BUCKETS_COUNT]; unsigned open_buckets_partial_nr; @@ -410,11 +413,8 @@ struct bch_dev { /* last calculated minimum prio */ u16 max_last_bucket_io[2]; - atomic_long_t saturated_count; size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; - u64 allocator_journal_seq_flush; - bool allocator_invalidating_data; bool allocator_blocked; alloc_heap alloc_heap; @@ -424,6 +424,7 @@ struct bch_dev { copygc_heap copygc_heap; struct bch_pd_controller copygc_pd; struct write_point copygc_write_point; + u64 copygc_threshold; atomic64_t 
rebalance_work; @@ -576,6 +577,8 @@ struct bch_fs { struct mutex btree_interior_update_lock; struct closure_waitlist btree_interior_update_wait; + mempool_t btree_iters_pool; + struct workqueue_struct *wq; /* copygc needs its own workqueue for index updates.. */ struct workqueue_struct *copygc_wq; @@ -716,7 +719,7 @@ struct bch_fs { struct journal journal; - unsigned bucket_journal_seq; + u64 last_bucket_seq_cleanup; /* The rest of this all shows up in sysfs */ atomic_long_t read_realloc_races; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index e300738d..f1814f4c 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1214,6 +1214,7 @@ LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, struct bch_sb, flags[2], 0, 4); +LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); /* Features: */ enum bch_sb_features { diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index e4f62f90..bbe9af67 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -122,16 +122,27 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) +int bch2_bpos_to_text(char *buf, size_t size, struct bpos pos) +{ + char *out = buf, *end = buf + size; + + if (!bkey_cmp(pos, POS_MIN)) + p("POS_MIN"); + else if (!bkey_cmp(pos, POS_MAX)) + p("POS_MAX"); + else + p("%llu:%llu", pos.inode, pos.offset); + + return out - buf; +} + int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k) { char *out = buf, *end = buf + size; p("u64s %u type %u ", k->u64s, k->type); - if (bkey_cmp(k->p, POS_MAX)) - p("%llu:%llu", k->p.inode, k->p.offset); - else - p("POS_MAX"); + out += bch2_bpos_to_text(out, end - out, k->p); p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo); @@ -159,7 +170,7 @@ int bch2_val_to_text(struct bch_fs *c, enum bkey_type type, break; default: if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text) - ops->val_to_text(c, buf, size, k); + out += ops->val_to_text(c, out, end - out, k); break; } diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h index 9e2c90d5..c708f8c0 100644 --- a/libbcachefs/bkey_methods.h +++ b/libbcachefs/bkey_methods.h @@ -56,7 +56,7 @@ struct bkey_ops { struct bkey_s_c); void (*key_debugcheck)(struct bch_fs *, struct btree *, struct bkey_s_c); - void (*val_to_text)(struct bch_fs *, char *, + int (*val_to_text)(struct bch_fs *, char *, size_t, struct bkey_s_c); void (*swab)(const struct bkey_format *, struct bkey_packed *); key_filter_fn key_normalize; @@ -72,6 +72,7 @@ const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); +int bch2_bpos_to_text(char *, size_t, struct bpos); int bch2_bkey_to_text(char *, size_t, const struct bkey *); int bch2_val_to_text(struct bch_fs *, enum bkey_type, char *, size_t, struct bkey_s_c); diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index 8c77fc50..fdd624a1 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -21,14 +21,19 @@ #include "alloc_types.h" #include +static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, + struct btree *); + struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) { + unsigned offset = __btree_node_key_to_offset(b, k); struct bset_tree *t; for_each_bset(b, t) - if (k >= 
btree_bkey_first(b, t) && - k < btree_bkey_last(b, t)) + if (offset <= t->end_offset) { + EBUG_ON(offset < btree_bkey_first_offset(t)); return t; + } BUG(); } @@ -64,8 +69,8 @@ void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set) _n = bkey_next(_k); bch2_bkey_to_text(buf, sizeof(buf), &k); - printk(KERN_ERR "block %u key %zi/%u: %s\n", set, - _k->_data - i->_data, i->u64s, buf); + printk(KERN_ERR "block %u key %5u: %s\n", set, + __btree_node_key_to_offset(b, _k), buf); if (_n == vstruct_last(i)) continue; @@ -121,20 +126,6 @@ void bch2_dump_btree_node_iter(struct btree *b, #ifdef CONFIG_BCACHEFS_DEBUG -static bool keys_out_of_order(struct btree *b, - const struct bkey_packed *prev, - const struct bkey_packed *next, - bool is_extents) -{ - struct bkey nextu = bkey_unpack_key(b, next); - - return bkey_cmp_left_packed_byval(b, prev, bkey_start_pos(&nextu)) > 0 || - ((is_extents - ? !bkey_deleted(next) - : !bkey_deleted(prev)) && - !bkey_cmp_packed(b, prev, next)); -} - void __bch2_verify_btree_nr_keys(struct btree *b) { struct bset_tree *t; @@ -151,16 +142,21 @@ void __bch2_verify_btree_nr_keys(struct btree *b) BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); } -static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, - struct btree *b, - struct bkey_packed *k) +static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, + struct btree *b) { - const struct bkey_packed *n = bch2_btree_node_iter_peek_all(iter, b); + struct btree_node_iter iter = *_iter; + const struct bkey_packed *k, *n; + + k = bch2_btree_node_iter_peek_all(&iter, b); + __bch2_btree_node_iter_advance(&iter, b); + n = bch2_btree_node_iter_peek_all(&iter, b); bkey_unpack_key(b, k); if (n && - keys_out_of_order(b, k, n, iter->is_extents)) { + __btree_node_iter_cmp(b, k, n) > 0) { + struct btree_node_iter_set *set; struct bkey ku = bkey_unpack_key(b, k); struct bkey nu = bkey_unpack_key(b, n); char buf1[80], buf2[80]; @@ -168,106 +164,104 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, bch2_dump_btree_node(b); bch2_bkey_to_text(buf1, sizeof(buf1), &ku); bch2_bkey_to_text(buf2, sizeof(buf2), &nu); - panic("out of order/overlapping:\n%s\n%s\n", buf1, buf2); + printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", + buf1, buf2); + printk(KERN_ERR "iter was:"); + + btree_node_iter_for_each(_iter, set) { + struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); + struct bset_tree *t = bch2_bkey_to_bset(b, k); + printk(" [%zi %zi]", t - b->set, + k->_data - bset(b, t)->_data); + } + panic("\n"); } } void bch2_btree_node_iter_verify(struct btree_node_iter *iter, - struct btree *b) + struct btree *b) { - struct btree_node_iter_set *set, *prev = NULL; + struct btree_node_iter_set *set, *s2; struct bset_tree *t; - struct bkey_packed *k, *first; - if (bch2_btree_node_iter_end(iter)) - return; + /* Verify no duplicates: */ + btree_node_iter_for_each(iter, set) + btree_node_iter_for_each(iter, s2) + BUG_ON(set != s2 && set->end == s2->end); + /* Verify that set->end is correct: */ btree_node_iter_for_each(iter, set) { - k = __btree_node_offset_to_key(b, set->k); - t = bch2_bkey_to_bset(b, k); - - BUG_ON(__btree_node_offset_to_key(b, set->end) != - btree_bkey_last(b, t)); - - BUG_ON(prev && - btree_node_iter_cmp(iter, b, *prev, *set) > 0); - - prev = set; + for_each_bset(b, t) + if (set->end == t->end_offset) + goto found; + BUG(); +found: + BUG_ON(set->k < btree_bkey_first_offset(t) || + set->k >= t->end_offset); } - first = __btree_node_offset_to_key(b, 
iter->data[0].k); - - for_each_bset(b, t) - if (bch2_btree_node_iter_bset_pos(iter, b, t) == - btree_bkey_last(b, t) && - (k = bch2_bkey_prev_all(b, t, btree_bkey_last(b, t)))) - BUG_ON(__btree_node_iter_cmp(iter->is_extents, b, - k, first) > 0); + /* Verify iterator is sorted: */ + btree_node_iter_for_each(iter, set) + BUG_ON(set != iter->data && + btree_node_iter_cmp(b, set[-1], set[0]) > 0); } -void bch2_verify_key_order(struct btree *b, - struct btree_node_iter *iter, - struct bkey_packed *where) +void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, + struct bkey_packed *insert, unsigned clobber_u64s) { struct bset_tree *t = bch2_bkey_to_bset(b, where); - struct bkey_packed *k, *prev; - struct bkey uk, uw = bkey_unpack_key(b, where); - - k = bch2_bkey_prev_all(b, t, where); - if (k && - keys_out_of_order(b, k, where, iter->is_extents)) { - char buf1[100], buf2[100]; + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); + struct bkey_packed *next = (void *) (where->_data + clobber_u64s); +#if 0 + BUG_ON(prev && + __btree_node_iter_cmp(b, prev, insert) > 0); +#else + if (prev && + __btree_node_iter_cmp(b, prev, insert) > 0) { + struct bkey k1 = bkey_unpack_key(b, prev); + struct bkey k2 = bkey_unpack_key(b, insert); + char buf1[100]; + char buf2[100]; bch2_dump_btree_node(b); - uk = bkey_unpack_key(b, k); - bch2_bkey_to_text(buf1, sizeof(buf1), &uk); - bch2_bkey_to_text(buf2, sizeof(buf2), &uw); - panic("out of order with prev:\n%s\n%s\n", - buf1, buf2); + bch2_bkey_to_text(buf1, sizeof(buf1), &k1); + bch2_bkey_to_text(buf2, sizeof(buf2), &k2); + + panic("prev > insert:\n" + "prev key %5u %s\n" + "insert key %5u %s\n", + __btree_node_key_to_offset(b, prev), buf1, + __btree_node_key_to_offset(b, insert), buf2); } +#endif +#if 0 + BUG_ON(next != btree_bkey_last(b, t) && + __btree_node_iter_cmp(b, insert, next) > 0); +#else + if (next != btree_bkey_last(b, t) && + __btree_node_iter_cmp(b, insert, next) > 0) { + struct bkey k1 = bkey_unpack_key(b, insert); + struct bkey k2 = bkey_unpack_key(b, next); + char buf1[100]; + char buf2[100]; - k = bkey_next(where); - BUG_ON(k != btree_bkey_last(b, t) && - keys_out_of_order(b, where, k, iter->is_extents)); + bch2_dump_btree_node(b); + bch2_bkey_to_text(buf1, sizeof(buf1), &k1); + bch2_bkey_to_text(buf2, sizeof(buf2), &k2); - for_each_bset(b, t) { - if (where >= btree_bkey_first(b, t) || - where < btree_bkey_last(b, t)) - continue; - - k = bch2_btree_node_iter_bset_pos(iter, b, t); - - if (k == btree_bkey_last(b, t)) - k = bch2_bkey_prev_all(b, t, k); - - while (bkey_cmp_left_packed_byval(b, k, bkey_start_pos(&uw)) > 0 && - (prev = bch2_bkey_prev_all(b, t, k))) - k = prev; - - for (; - k != btree_bkey_last(b, t); - k = bkey_next(k)) { - uk = bkey_unpack_key(b, k); - - if (iter->is_extents) { - BUG_ON(!(bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0 || - bkey_cmp(uk.p, bkey_start_pos(&uw)) <= 0)); - } else { - BUG_ON(!bkey_cmp(uw.p, uk.p) && - !bkey_deleted(&uk)); - } - - if (bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0) - break; - } + panic("insert > next:\n" + "insert key %5u %s\n" + "next key %5u %s\n", + __btree_node_key_to_offset(b, insert), buf1, + __btree_node_key_to_offset(b, next), buf2); } +#endif } #else static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, - struct btree *b, - struct bkey_packed *k) {} + struct btree *b) {} #endif @@ -622,28 +616,30 @@ static unsigned rw_aux_tree_bsearch(struct btree *b, struct bset_tree *t, unsigned offset) { - unsigned l = 0, r = t->size; + unsigned bset_offs 
= offset - btree_bkey_first_offset(t); + unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t); + unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0; EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); + EBUG_ON(!t->size); + EBUG_ON(idx > t->size); - while (l < r) { - unsigned m = (l + r) >> 1; + while (idx < t->size && + rw_aux_tree(b, t)[idx].offset < offset) + idx++; - if (rw_aux_tree(b, t)[m].offset < offset) - l = m + 1; - else - r = m; - } + while (idx && + rw_aux_tree(b, t)[idx - 1].offset >= offset) + idx--; - EBUG_ON(l < t->size && - rw_aux_tree(b, t)[l].offset < offset); - EBUG_ON(l && - rw_aux_tree(b, t)[l - 1].offset >= offset); + EBUG_ON(idx < t->size && + rw_aux_tree(b, t)[idx].offset < offset); + EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset); + EBUG_ON(idx + 1 < t->size && + rw_aux_tree(b, t)[idx].offset == + rw_aux_tree(b, t)[idx + 1].offset); - EBUG_ON(l > r); - EBUG_ON(l > t->size); - - return l; + return idx; } static inline unsigned bfloat_mantissa(const struct bkey_float *f, @@ -1129,9 +1125,10 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b, * modified, fix any auxiliary search tree by remaking all the nodes in the * auxiliary search tree that @k corresponds to */ -void bch2_bset_fix_invalidated_key(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) +void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k) { + struct bset_tree *t = bch2_bkey_to_bset(b, k); + switch (bset_aux_tree_type(t)) { case BSET_NO_AUX_TREE: break; @@ -1158,13 +1155,9 @@ static void bch2_bset_fix_lookup_table(struct btree *b, if (!bset_has_rw_aux_tree(t)) return; + /* returns first entry >= where */ l = rw_aux_tree_bsearch(b, t, where); - /* l is first >= than @where */ - - EBUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where); - EBUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where); - if (!l) /* never delete first entry */ l++; else if (l < t->size && @@ -1242,6 +1235,7 @@ void bch2_bset_insert(struct btree *b, struct bkey_packed packed, *src = bkey_to_packed(insert); bch2_bset_verify_rw_aux_tree(b, t); + bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s); if (bch2_bkey_pack_key(&packed, &insert->k, f)) src = &packed; @@ -1268,7 +1262,6 @@ void bch2_bset_insert(struct btree *b, bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); - bch2_verify_key_order(b, iter, where); bch2_verify_btree_nr_keys(b); } @@ -1474,11 +1467,11 @@ void bch2_btree_node_iter_push(struct btree_node_iter *iter, noinline __flatten __attribute__((cold)) static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, struct btree *b, struct bpos search, - bool strictly_greater, bool is_extents) + bool strictly_greater) { struct bset_tree *t; - trace_bkey_pack_pos_fail(search); + trace_bkey_pack_pos_fail(&search); for_each_bset(b, t) __bch2_btree_node_iter_push(iter, b, @@ -1531,7 +1524,7 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, */ void bch2_btree_node_iter_init(struct btree_node_iter *iter, struct btree *b, struct bpos search, - bool strictly_greater, bool is_extents) + bool strictly_greater) { struct bset_tree *t; struct bkey_packed p, *packed_search = NULL; @@ -1539,7 +1532,7 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, EBUG_ON(bkey_cmp(search, b->data->min_key) < 0); bset_aux_tree_verify(b); - __bch2_btree_node_iter_init(iter, is_extents); + memset(iter, 0, sizeof(*iter)); switch (bch2_bkey_pack_pos_lossy(&p, search, b)) { case 
BKEY_PACK_POS_EXACT: @@ -1550,7 +1543,7 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, break; case BKEY_PACK_POS_FAIL: btree_node_iter_init_pack_failed(iter, b, search, - strictly_greater, is_extents); + strictly_greater); return; } @@ -1565,12 +1558,11 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, } void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, - struct btree *b, - bool is_extents) + struct btree *b) { struct bset_tree *t; - __bch2_btree_node_iter_init(iter, is_extents); + memset(iter, 0, sizeof(*iter)); for_each_bset(b, t) __bch2_btree_node_iter_push(iter, b, @@ -1598,7 +1590,7 @@ static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, { bool ret; - if ((ret = (btree_node_iter_cmp(iter, b, + if ((ret = (btree_node_iter_cmp(b, iter->data[first], iter->data[first + 1]) > 0))) swap(iter->data[first], iter->data[first + 1]); @@ -1653,23 +1645,14 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, btree_node_iter_sort_two(iter, b, 1); } -/** - * bch_btree_node_iter_advance - advance @iter by one key - * - * Doesn't do debugchecks - for cases where (insert_fixup_extent()) a bset might - * momentarily have out of order extents. - */ void bch2_btree_node_iter_advance(struct btree_node_iter *iter, struct btree *b) { #ifdef CONFIG_BCACHEFS_DEBUG - struct bkey_packed *k = bch2_btree_node_iter_peek_all(iter, b); - - __bch2_btree_node_iter_advance(iter, b); - bch2_btree_node_iter_next_check(iter, b, k); -#else - __bch2_btree_node_iter_advance(iter, b); + bch2_btree_node_iter_verify(iter, b); + bch2_btree_node_iter_next_check(iter, b); #endif + __bch2_btree_node_iter_advance(iter, b); } static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) @@ -1702,8 +1685,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *ite bch2_btree_node_iter_bset_pos(iter, b, t), min_key_type); if (k && - (!prev || __btree_node_iter_cmp(iter->is_extents, b, - k, prev) > 0)) { + (!prev || __btree_node_iter_cmp(b, k, prev) > 0)) { prev = k; end = t->end_offset; } @@ -1736,11 +1718,11 @@ out: struct btree_node_iter iter2 = *iter; if (prev) - bch2_btree_node_iter_advance(&iter2, b); + __bch2_btree_node_iter_advance(&iter2, b); while ((k = bch2_btree_node_iter_peek_all(&iter2, b)) != orig_pos) { BUG_ON(k->type >= min_key_type); - bch2_btree_node_iter_advance(&iter2, b); + __bch2_btree_node_iter_advance(&iter2, b); } } diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h index 296c05b4..3a0ee491 100644 --- a/libbcachefs/bset.h +++ b/libbcachefs/bset.h @@ -342,8 +342,7 @@ void bch2_bset_init_first(struct btree *, struct bset *); void bch2_bset_init_next(struct bch_fs *, struct btree *, struct btree_node_entry *); void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); -void bch2_bset_fix_invalidated_key(struct btree *, struct bset_tree *, - struct bkey_packed *); +void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *); void bch2_bset_insert(struct btree *, struct btree_node_iter *, struct bkey_packed *, struct bkey_i *, unsigned); @@ -368,6 +367,17 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b, return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); } +/* Returns true if @k is after iterator position @pos */ +static inline bool btree_iter_pos_cmp(struct btree_iter *iter, + const struct bkey *k) +{ + int cmp = bkey_cmp(k->p, iter->pos); + + return cmp > 0 || + (cmp == 0 && + !(iter->flags & BTREE_ITER_IS_EXTENTS) && 
!bkey_deleted(k)); +} + /* Returns true if @k is after iterator position @pos */ static inline bool btree_iter_pos_cmp_packed(const struct btree *b, struct bpos *pos, @@ -418,7 +428,7 @@ enum bch_extent_overlap { /* Returns how k overlaps with m */ static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, - const struct bkey *m) + const struct bkey *m) { int cmp1 = bkey_cmp(k->p, m->p) < 0; int cmp2 = bkey_cmp(bkey_start_pos(k), @@ -429,20 +439,13 @@ static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, /* Btree key iteration */ -static inline void __bch2_btree_node_iter_init(struct btree_node_iter *iter, - bool is_extents) -{ - iter->is_extents = is_extents; - memset(iter->data, 0, sizeof(iter->data)); -} - void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, const struct bkey_packed *, const struct bkey_packed *); void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, - struct bpos, bool, bool); + struct bpos, bool); void bch2_btree_node_iter_init_from_start(struct btree_node_iter *, - struct btree *, bool); + struct btree *); struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, struct btree *, struct bset_tree *); @@ -469,32 +472,21 @@ static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) return __btree_node_iter_set_end(iter, 0); } -static inline int __btree_node_iter_cmp(bool is_extents, - struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) +static inline int __btree_node_iter_cmp(struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) { - /* - * For non extents, when keys compare equal the deleted keys have to - * come first - so that bch2_btree_node_iter_next_check() can detect - * duplicate nondeleted keys (and possibly other reasons?) - * - * For extents, bkey_deleted() is used as a proxy for k->size == 0, so - * deleted keys have to sort last. - */ + /* When keys compare equal deleted keys come first */ return bkey_cmp_packed(b, l, r) - ?: (is_extents - ? 
(int) bkey_deleted(l) - (int) bkey_deleted(r) - : (int) bkey_deleted(r) - (int) bkey_deleted(l)) + ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: (l > r) - (l < r); } -static inline int btree_node_iter_cmp(struct btree_node_iter *iter, - struct btree *b, +static inline int btree_node_iter_cmp(struct btree *b, struct btree_node_iter_set l, struct btree_node_iter_set r) { - return __btree_node_iter_cmp(iter->is_extents, b, + return __btree_node_iter_cmp(b, __btree_node_offset_to_key(b, l.k), __btree_node_offset_to_key(b, r.k)); } @@ -581,21 +573,12 @@ bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b) return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_DISCARD + 1); } -/* - * Iterates over all _live_ keys - skipping deleted (and potentially - * overlapping) keys - */ -#define for_each_btree_node_key(b, k, iter, _is_extents) \ - for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\ - ((k) = bch2_btree_node_iter_peek(iter, b)); \ - bch2_btree_node_iter_advance(iter, b)) - struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, struct btree *, struct bkey *); -#define for_each_btree_node_key_unpack(b, k, iter, _is_extents, unpacked)\ - for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\ +#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \ + for (bch2_btree_node_iter_init_from_start((iter), (b)); \ (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ bch2_btree_node_iter_advance(iter, b)) @@ -620,6 +603,13 @@ static inline void btree_keys_account_key(struct btree_nr_keys *n, #define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ btree_keys_account_key(_nr, _bset_idx, _k, -1) +#define btree_account_key_add(_b, _k) \ + btree_keys_account_key(&(_b)->nr, \ + bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1) +#define btree_account_key_drop(_b, _k) \ + btree_keys_account_key(&(_b)->nr, \ + bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1) + struct bset_stats { struct { size_t nr, bytes; @@ -645,17 +635,18 @@ void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); void __bch2_verify_btree_nr_keys(struct btree *); void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); -void bch2_verify_key_order(struct btree *, struct btree_node_iter *, - struct bkey_packed *); +void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, + struct bkey_packed *, unsigned); #else static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, struct btree *b) {} -static inline void bch2_verify_key_order(struct btree *b, - struct btree_node_iter *iter, - struct bkey_packed *where) {} +static inline void bch2_verify_insert_pos(struct btree *b, + struct bkey_packed *where, + struct bkey_packed *insert, + unsigned clobber_u64s) {} #endif static inline void bch2_verify_btree_nr_keys(struct btree *b) diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 969c1f19..7c18d830 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -122,13 +122,14 @@ static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type, switch (type) { case BKEY_TYPE_BTREE: - bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, NULL, + bch2_mark_key(c, k, c->opts.btree_node_size, + BCH_DATA_BTREE, pos, NULL, 0, flags| BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| BCH_BUCKET_MARK_GC_LOCK_HELD); break; case BKEY_TYPE_EXTENTS: - bch2_mark_key(c, k, k.k->size, false, pos, NULL, + bch2_mark_key(c, k, 
k.k->size, BCH_DATA_USER, pos, NULL, 0, flags| BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| BCH_BUCKET_MARK_GC_LOCK_HELD); @@ -215,7 +216,6 @@ static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b) if (btree_node_has_ptrs(b)) for_each_btree_node_key_unpack(b, k, &iter, - btree_node_is_extents(b), &unpacked) { bch2_bkey_debugcheck(c, b, k); stale = max(stale, bch2_gc_mark_key(c, type, k, 0)); @@ -324,9 +324,16 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, unsigned i; u64 b; + /* + * This conditional is kind of gross, but we may be called from the + * device add path, before the new device has actually been added to the + * running filesystem: + */ if (c) { lockdep_assert_held(&c->sb_lock); percpu_down_read_preempt_disable(&c->usage_lock); + } else { + preempt_disable(); } for (i = 0; i < layout->nr_superblocks; i++) { @@ -354,6 +361,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, if (c) { percpu_up_read_preempt_enable(&c->usage_lock); spin_unlock(&c->journal.lock); + } else { + preempt_enable(); } } @@ -386,7 +395,8 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) for_each_pending_btree_node_free(c, as, d) if (d->index_update_done) bch2_mark_key(c, bkey_i_to_s_c(&d->key), - c->opts.btree_node_size, true, pos, + c->opts.btree_node_size, + BCH_DATA_BTREE, pos, &stats, 0, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| BCH_BUCKET_MARK_GC_LOCK_HELD); @@ -479,7 +489,8 @@ static void bch2_gc_start(struct bch_fs *c) struct bch_fs_usage *p = per_cpu_ptr(c->usage_percpu, cpu); - memset(p->s, 0, sizeof(p->s)); + memset(p->replicas, 0, sizeof(p->replicas)); + memset(p->buckets, 0, sizeof(p->buckets)); } percpu_up_write(&c->usage_lock); @@ -558,9 +569,6 @@ void bch2_gc(struct bch_fs *c) bch2_mark_pending_btree_node_frees(c); bch2_mark_allocator_buckets(c); - for_each_member_device(ca, c, i) - atomic_long_set(&ca->saturated_count, 0); - /* Indicates that gc is no longer in progress: */ gc_pos_set(c, gc_phase(GC_PHASE_DONE)); c->gc_count++; @@ -587,15 +595,14 @@ out: static void recalc_packed_keys(struct btree *b) { + struct bset *i = btree_bset_first(b); struct bkey_packed *k; memset(&b->nr, 0, sizeof(b->nr)); BUG_ON(b->nsets != 1); - for (k = btree_bkey_first(b, b->set); - k != btree_bkey_last(b, b->set); - k = bkey_next(k)) + vstruct_for_each(i, k) btree_keys_account_key_add(&b->nr, 0, k); } @@ -1032,7 +1039,6 @@ static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id) struct bkey_s_c k; for_each_btree_node_key_unpack(b, k, &node_iter, - btree_node_is_extents(b), &unpacked) { ret = bch2_btree_mark_key_initial(c, btree_node_type(b), k); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 94f56dbb..d83144b7 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -22,7 +22,7 @@ /* btree_node_iter_large: */ #define btree_node_iter_cmp_heap(h, _l, _r) \ - __btree_node_iter_cmp((iter)->is_extents, b, \ + __btree_node_iter_cmp(b, \ __btree_node_offset_to_key(b, (_l).k), \ __btree_node_offset_to_key(b, (_r).k)) @@ -248,6 +248,9 @@ static unsigned sort_extent_whiteouts(struct bkey_packed *dst, sort_iter_sort(iter, sort_extent_whiteouts_cmp); while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) { + if (bkey_deleted(in)) + continue; + EBUG_ON(bkeyp_val_u64s(f, in)); EBUG_ON(in->type != KEY_TYPE_DISCARD); @@ -309,7 +312,7 @@ static unsigned should_compact_bset(struct btree *b, struct bset_tree *t, if (mode == COMPACT_LAZY) { if (should_compact_bset_lazy(b, t) || - (compacting && bset_unwritten(b, 
bset(b, t)))) + (compacting && !bset_written(b, bset(b, t)))) return dead_u64s; } else { if (bset_written(b, bset(b, t))) @@ -356,7 +359,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, struct bkey_packed *k, *n, *out, *start, *end; struct btree_node_entry *src = NULL, *dst = NULL; - if (t != b->set && bset_unwritten(b, i)) { + if (t != b->set && !bset_written(b, i)) { src = container_of(i, struct btree_node_entry, keys); dst = max(write_block(b), (void *) btree_bkey_last(b, t -1)); @@ -396,7 +399,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, continue; if (bkey_whiteout(k)) { - unreserve_whiteout(b, t, k); + unreserve_whiteout(b, k); memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); set_bkeyp_val_u64s(f, u_pos, 0); u_pos = bkey_next(u_pos); @@ -467,7 +470,7 @@ static bool bch2_drop_whiteouts(struct btree *b) start = btree_bkey_first(b, t); end = btree_bkey_last(b, t); - if (bset_unwritten(b, i) && + if (!bset_written(b, i) && t != b->set) { struct bset *dst = max_t(struct bset *, write_block(b), @@ -785,8 +788,7 @@ void bch2_btree_sort_into(struct bch_fs *c, bch2_bset_set_no_aux_tree(dst, dst->set); - bch2_btree_node_iter_init_from_start(&src_iter, src, - btree_node_is_extents(src)); + bch2_btree_node_iter_init_from_start(&src_iter, src); if (btree_node_ops(src)->key_normalize || btree_node_ops(src)->key_merge) @@ -829,7 +831,7 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b, for (unwritten_idx = 0; unwritten_idx < b->nsets; unwritten_idx++) - if (bset_unwritten(b, bset(b, &b->set[unwritten_idx]))) + if (!bset_written(b, bset(b, &b->set[unwritten_idx]))) break; if (b->nsets - unwritten_idx > 1) { @@ -852,7 +854,7 @@ void bch2_btree_build_aux_trees(struct btree *b) for_each_bset(b, t) bch2_bset_build_aux_tree(b, t, - bset_unwritten(b, bset(b, t)) && + !bset_written(b, bset(b, t)) && t == bset_tree_last(b)); } @@ -1171,7 +1173,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry int ret, retry_read = 0, write = READ; iter = mempool_alloc(&c->fill_iter, GFP_NOIO); - __bch2_btree_node_iter_large_init(iter, btree_node_is_extents(b)); + iter->used = 0; if (bch2_meta_read_fault("btree")) btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL, @@ -1945,9 +1947,9 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) clear_btree_node_just_written(b); /* - * Note: immediately after write, bset_unwritten()/bset_written() don't - * work - the amount of data we had to write after compaction might have - * been smaller than the offset of the last bset. + * Note: immediately after write, bset_written() doesn't work - the + * amount of data we had to write after compaction might have been + * smaller than the offset of the last bset. 
* * However, we know that all bsets have been written here, as long as * we're still holding the write lock: diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index fa154642..ccd47326 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -145,20 +145,11 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *); /* Sorting */ struct btree_node_iter_large { - u8 is_extents; u16 used; struct btree_node_iter_set data[MAX_BSETS]; }; -static inline void -__bch2_btree_node_iter_large_init(struct btree_node_iter_large *iter, - bool is_extents) -{ - iter->used = 0; - iter->is_extents = is_extents; -} - void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *, struct btree *); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index a52ec12e..c37d82ae 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -34,10 +34,10 @@ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) struct btree_iter *linked; EBUG_ON(iter->l[b->level].b != b); - EBUG_ON(iter->lock_seq[b->level] + 1 != b->lock.state.seq); + EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq); for_each_btree_iter_with_node(iter, b, linked) - linked->lock_seq[b->level] += 2; + linked->l[b->level].lock_seq += 2; six_unlock_write(&b->lock); } @@ -68,26 +68,6 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) &b->lock.state.counter); } -/* - * Lock a btree node if we already have it locked on one of our linked - * iterators: - */ -static inline bool btree_node_lock_increment(struct btree_iter *iter, - struct btree *b, unsigned level, - enum btree_node_locked_type want) -{ - struct btree_iter *linked; - - for_each_linked_btree_iter(iter, linked) - if (linked->l[level].b == b && - btree_node_locked_type(linked, level) >= want) { - six_lock_increment(&b->lock, want); - return true; - } - - return false; -} - bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) { struct btree *b = btree_iter_node(iter, level); @@ -99,8 +79,8 @@ bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) if (race_fault()) return false; - if (!six_relock_type(&b->lock, want, iter->lock_seq[level]) && - !(iter->lock_seq[level] >> 1 == b->lock.state.seq >> 1 && + if (!six_relock_type(&b->lock, want, iter->l[level].lock_seq) && + !(iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1 && btree_node_lock_increment(iter, b, level, want))) return false; @@ -125,10 +105,10 @@ static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) if (btree_node_locked(iter, level) ? 
six_lock_tryupgrade(&b->lock) - : six_relock_type(&b->lock, SIX_LOCK_intent, iter->lock_seq[level])) + : six_relock_type(&b->lock, SIX_LOCK_intent, iter->l[level].lock_seq)) goto success; - if (iter->lock_seq[level] >> 1 == b->lock.state.seq >> 1 && + if (iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1 && btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) { btree_node_unlock(iter, level); goto success; @@ -189,34 +169,12 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, struct btree_iter *linked; bool ret = true; - /* Can't have children locked before ancestors: */ - EBUG_ON(iter->nodes_locked && level > __ffs(iter->nodes_locked)); - - /* - * Can't hold any read locks while we block taking an intent lock - see - * below for reasoning, and we should have already dropped any read - * locks in the current iterator - */ - EBUG_ON(type == SIX_LOCK_intent && - iter->nodes_locked != iter->nodes_intent_locked); - - if (btree_node_lock_increment(iter, b, level, type)) - return true; - - /* - * Must lock btree nodes in key order - this case happens when locking - * the prev sibling in btree node merging: - */ - if (iter->nodes_locked && - __ffs(iter->nodes_locked) <= level && - __btree_iter_cmp(iter->btree_id, pos, iter)) - return false; - - for_each_linked_btree_iter(iter, linked) { + /* Check if it's safe to block: */ + for_each_btree_iter(iter, linked) { if (!linked->nodes_locked) continue; - /* We have to lock btree nodes in key order: */ + /* * Must lock btree nodes in key order: */ if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0) ret = false; @@ -251,9 +209,10 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, if (linked->btree_id == iter->btree_id && level > __fls(linked->nodes_locked)) { if (may_drop_locks) { - linked->locks_want = max_t(unsigned, - linked->locks_want, - iter->locks_want); + linked->locks_want = + max(level + 1, max_t(unsigned, + linked->locks_want, + iter->locks_want)); btree_iter_get_locks(linked, true); } ret = false; @@ -415,14 +374,20 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, struct btree_node_iter tmp = l->iter; struct bkey_packed *k; + if (iter->uptodate > BTREE_ITER_NEED_PEEK) + return; + bch2_btree_node_iter_verify(&l->iter, b); /* * For interior nodes, the iterator will have skipped past * deleted keys: + * + * For extents, the iterator may have skipped past deleted keys (but not + * whiteouts) */ - k = b->level - ? bch2_btree_node_iter_prev(&tmp, b) + k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS + ? 
bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_DISCARD) : bch2_btree_node_iter_prev_all(&tmp, b); if (k && btree_iter_pos_cmp_packed(b, &iter->pos, k, iter->flags & BTREE_ITER_IS_EXTENTS)) { @@ -430,7 +395,7 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, struct bkey uk = bkey_unpack_key(b, k); bch2_bkey_to_text(buf, sizeof(buf), &uk); - panic("prev key should be before after pos:\n%s\n%llu:%llu\n", + panic("prev key should be before iter pos:\n%s\n%llu:%llu\n", buf, iter->pos.inode, iter->pos.offset); } @@ -441,15 +406,16 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, struct bkey uk = bkey_unpack_key(b, k); bch2_bkey_to_text(buf, sizeof(buf), &uk); - panic("next key should be before iter pos:\n%llu:%llu\n%s\n", + panic("iter should be after current key:\n" + "iter pos %llu:%llu\n" + "cur key %s\n", iter->pos.inode, iter->pos.offset, buf); } - if (iter->uptodate == BTREE_ITER_UPTODATE && - (iter->flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES) { - BUG_ON(!bkey_whiteout(&iter->k) && - bch2_btree_node_iter_end(&l->iter)); - } + BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE && + (iter->flags & BTREE_ITER_TYPE) == BTREE_ITER_KEYS && + !bkey_whiteout(&iter->k) && + bch2_btree_node_iter_end(&l->iter)); } void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) @@ -460,6 +426,11 @@ void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) __bch2_btree_iter_verify(linked, b); } +#else + +static inline void __bch2_btree_iter_verify(struct btree_iter *iter, + struct btree *b) {} + #endif static void __bch2_btree_node_iter_fix(struct btree_iter *iter, @@ -474,7 +445,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, struct btree_node_iter_set *set; unsigned offset = __btree_node_key_to_offset(b, where); int shift = new_u64s - clobber_u64s; - unsigned old_end = (int) __btree_node_key_to_offset(b, end) - shift; + unsigned old_end = t->end_offset - shift; btree_node_iter_for_each(node_iter, set) if (set->end == old_end) @@ -496,7 +467,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, } return; found: - set->end = (int) set->end + shift; + set->end = t->end_offset; /* Iterator hasn't gotten to the key that changed yet: */ if (set->k < offset) @@ -557,8 +528,7 @@ iter_current_key_not_modified: k = bch2_bkey_prev_all(b, t, bch2_btree_node_iter_bset_pos(node_iter, b, t)); if (k && - __btree_node_iter_cmp(node_iter, b, - k, where) > 0) { + __btree_node_iter_cmp(b, k, where) > 0) { struct btree_node_iter_set *set; unsigned offset = __btree_node_key_to_offset(b, bkey_next(k)); @@ -580,13 +550,13 @@ next_bset: } void bch2_btree_node_iter_fix(struct btree_iter *iter, - struct btree *b, - struct btree_node_iter *node_iter, - struct bset_tree *t, - struct bkey_packed *where, - unsigned clobber_u64s, - unsigned new_u64s) + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_packed *where, + unsigned clobber_u64s, + unsigned new_u64s) { + struct bset_tree *t = bch2_bkey_to_bset(b, where); struct btree_iter *linked; if (node_iter != &iter->l[b->level].iter) @@ -597,10 +567,6 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, __bch2_btree_node_iter_fix(linked, b, &linked->l[b->level].iter, t, where, clobber_u64s, new_u64s); - - /* interior node iterators are... special... 
*/ - if (!b->level) - bch2_btree_iter_verify(iter, b); } static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, @@ -687,17 +653,6 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) btree_node_unlock(iter, b->level + 1); } -/* Returns true if @k is after iterator position @pos */ -static inline bool btree_iter_pos_cmp(struct btree_iter *iter, - const struct bkey *k) -{ - int cmp = bkey_cmp(k->p, iter->pos); - - return cmp > 0 || - (cmp == 0 && - !(iter->flags & BTREE_ITER_IS_EXTENTS) && !bkey_deleted(k)); -} - static inline bool btree_iter_pos_after_node(struct btree_iter *iter, struct btree *b) { @@ -719,8 +674,7 @@ static inline void __btree_iter_init(struct btree_iter *iter, struct btree_iter_level *l = &iter->l[b->level]; bch2_btree_node_iter_init(&l->iter, b, iter->pos, - iter->flags & BTREE_ITER_IS_EXTENTS, - btree_node_is_extents(b)); + iter->flags & BTREE_ITER_IS_EXTENTS); /* Skip to first non whiteout: */ if (b->level) @@ -737,7 +691,7 @@ static inline void btree_iter_node_set(struct btree_iter *iter, EBUG_ON(!btree_iter_pos_in_node(iter, b)); EBUG_ON(b->lock.state.seq & 1); - iter->lock_seq[b->level] = b->lock.state.seq; + iter->l[b->level].lock_seq = b->lock.state.seq; iter->l[b->level].b = b; __btree_iter_init(iter, b); } @@ -1020,8 +974,6 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) if (__bch2_btree_iter_relock(iter)) return 0; - iter->flags &= ~BTREE_ITER_AT_END_OF_LEAF; - /* * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos * here unnecessary @@ -1062,7 +1014,9 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) } iter->uptodate = BTREE_ITER_NEED_PEEK; + bch2_btree_iter_verify_locks(iter); + __bch2_btree_iter_verify(iter, iter->l[iter->level].b); return 0; } @@ -1083,7 +1037,6 @@ static inline void bch2_btree_iter_checks(struct btree_iter *iter, enum btree_iter_type type) { EBUG_ON(iter->btree_id >= BTREE_ID_NR); - EBUG_ON((iter->flags & BTREE_ITER_TYPE) != type); EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != (iter->btree_id == BTREE_ID_EXTENTS && type != BTREE_ITER_NODES)); @@ -1199,10 +1152,8 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_ iter->flags & BTREE_ITER_IS_EXTENTS)) __btree_iter_advance(l); - if (!k && btree_iter_pos_after_node(iter, l->b)) { + if (!k && btree_iter_pos_after_node(iter, l->b)) btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - iter->flags |= BTREE_ITER_AT_END_OF_LEAF; - } } void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) @@ -1403,9 +1354,10 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) } static inline struct bkey_s_c -__bch2_btree_iter_peek_slot(struct btree_iter *iter) +__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; + struct btree_node_iter node_iter; struct bkey_s_c k; struct bkey n; int ret; @@ -1416,6 +1368,17 @@ recheck: bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0) __btree_iter_advance(l); + /* + * iterator is now at the correct position for inserting at iter->pos, + * but we need to keep iterating until we find the first non whiteout so + * we know how big a hole we have, if any: + */ + + node_iter = l->iter; + if (k.k && bkey_whiteout(k.k)) + k = __btree_iter_unpack(iter, l, &iter->k, + bch2_btree_node_iter_peek(&node_iter, l->b)); + /* * If we got to the end of the node, check if we need to traverse to the * next node: @@ -1432,6 +1395,13 @@ recheck: if (k.k && 
!bkey_whiteout(k.k) && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { + /* + * if we skipped forward to find the first non whiteout and + * there _wasn't_ actually a hole, we want the iterator to be + * pointed at the key we found: + */ + l->iter = node_iter; + EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0); EBUG_ON(bkey_deleted(k.k)); iter->uptodate = BTREE_ITER_UPTODATE; @@ -1439,36 +1409,39 @@ recheck: } /* hole */ + + /* holes can't span inode numbers: */ + if (iter->pos.offset == KEY_OFFSET_MAX) { + if (iter->pos.inode == KEY_INODE_MAX) + return bkey_s_c_null; + + iter->pos = bkey_successor(iter->pos); + goto recheck; + } + + if (!k.k) + k.k = &l->b->key.k; + bkey_init(&n); n.p = iter->pos; + bch2_key_resize(&n, + min_t(u64, KEY_SIZE_MAX, + (k.k->p.inode == n.p.inode + ? bkey_start_offset(k.k) + : KEY_OFFSET_MAX) - + n.p.offset)); - if (iter->flags & BTREE_ITER_IS_EXTENTS) { - if (n.p.offset == KEY_OFFSET_MAX) { - if (n.p.inode == KEY_INODE_MAX) - return bkey_s_c_null; + //EBUG_ON(!n.size); + if (!n.size) { + char buf[100]; + bch2_dump_btree_node(iter->l[0].b); - iter->pos = bkey_successor(iter->pos); - goto recheck; - } - - if (k.k && bkey_whiteout(k.k)) { - struct btree_node_iter node_iter = l->iter; - - k = __btree_iter_unpack(iter, l, &iter->k, - bch2_btree_node_iter_peek(&node_iter, l->b)); - } - - if (!k.k) - k.k = &l->b->key.k; - - bch2_key_resize(&n, - min_t(u64, KEY_SIZE_MAX, - (k.k->p.inode == n.p.inode - ? bkey_start_offset(k.k) - : KEY_OFFSET_MAX) - - n.p.offset)); - - EBUG_ON(!n.size); + bch2_bkey_to_text(buf, sizeof(buf), k.k); + panic("iter at %llu:%llu\n" + "next key %s\n", + iter->pos.inode, + iter->pos.offset, + buf); } iter->k = n; @@ -1476,6 +1449,50 @@ recheck: return (struct bkey_s_c) { &iter->k, NULL }; } +static inline struct bkey_s_c +__bch2_btree_iter_peek_slot(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_s_c k; + int ret; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return __bch2_btree_iter_peek_slot_extents(iter); + +recheck: + while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && + bkey_deleted(k.k) && + bkey_cmp(k.k->p, iter->pos) == 0) + __btree_iter_advance(l); + + /* + * If we got to the end of the node, check if we need to traverse to the + * next node: + */ + if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) { + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + goto recheck; + } + + if (k.k && + !bkey_deleted(k.k) && + !bkey_cmp(iter->pos, k.k->p)) { + iter->uptodate = BTREE_ITER_UPTODATE; + return k; + } else { + /* hole */ + bkey_init(&iter->k); + iter->k.p = iter->pos; + + iter->uptodate = BTREE_ITER_UPTODATE; + return (struct bkey_s_c) { &iter->k, NULL }; + } +} + struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) { int ret; @@ -1611,17 +1628,29 @@ static void btree_trans_verify(struct btree_trans *trans) } } +static inline unsigned btree_trans_iter_idx(struct btree_trans *trans, + struct btree_iter *iter) +{ + ssize_t idx = iter - trans->iters; + + BUG_ON(idx < 0 || idx >= trans->nr_iters); + BUG_ON(!(trans->iters_live & (1U << idx))); + + return idx; +} + +void bch2_trans_iter_put(struct btree_trans *trans, + struct btree_iter *iter) +{ + ssize_t idx = btree_trans_iter_idx(trans, iter); + + trans->iters_live &= ~(1U << idx); +} + void bch2_trans_iter_free(struct btree_trans *trans, struct btree_iter *iter) { - unsigned idx; - - for (idx = 0; idx < trans->nr_iters; idx++) - if 
(&trans->iters[idx] == iter) - goto found; - BUG(); -found: - BUG_ON(!(trans->iters_linked & (1U << idx))); + ssize_t idx = btree_trans_iter_idx(trans, iter); trans->iters_live &= ~(1U << idx); trans->iters_linked &= ~(1U << idx); @@ -1635,10 +1664,7 @@ static int btree_trans_realloc_iters(struct btree_trans *trans) bch2_trans_unlock(trans); - new_iters = kmalloc(sizeof(struct btree_iter) * BTREE_ITER_MAX, - GFP_NOFS); - if (!new_iters) - return -ENOMEM; + new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); memcpy(new_iters, trans->iters, sizeof(struct btree_iter) * trans->nr_iters); @@ -1666,12 +1692,10 @@ static int btree_trans_realloc_iters(struct btree_trans *trans) return 0; } -int bch2_trans_preload_iters(struct btree_trans *trans) +void bch2_trans_preload_iters(struct btree_trans *trans) { - if (trans->iters != trans->iters_onstack) - return 0; - - return btree_trans_realloc_iters(trans); + if (trans->iters == trans->iters_onstack) + btree_trans_realloc_iters(trans); } static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, @@ -1711,10 +1735,6 @@ got_slot: } else { iter = &trans->iters[idx]; - BUG_ON(iter->btree_id != btree_id); - BUG_ON((iter->flags ^ flags) & - (BTREE_ITER_SLOTS|BTREE_ITER_IS_EXTENTS)); - iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); } @@ -1731,6 +1751,9 @@ got_slot: btree_trans_verify(trans); + BUG_ON(iter->btree_id != btree_id); + BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); + return iter; } @@ -1855,7 +1878,7 @@ int bch2_trans_exit(struct btree_trans *trans) kfree(trans->mem); if (trans->iters != trans->iters_onstack) - kfree(trans->iters); + mempool_free(trans->iters, &trans->c->btree_iters_pool); trans->mem = (void *) 0x1; trans->iters = (void *) 0x1; return ret; diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index d046ad71..1a1ca952 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -40,7 +40,7 @@ static inline bool __iter_has_node(const struct btree_iter *iter, */ return iter->l[b->level].b == b && - iter->lock_seq[b->level] >> 1 == b->lock.state.seq >> 1; + iter->l[b->level].lock_seq >> 1 == b->lock.state.seq >> 1; } static inline struct btree_iter * @@ -100,8 +100,8 @@ static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} #endif void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, - struct btree_node_iter *, struct bset_tree *, - struct bkey_packed *, unsigned, unsigned); + struct btree_node_iter *, struct bkey_packed *, + unsigned, unsigned); int bch2_btree_iter_unlock(struct btree_iter *); @@ -271,9 +271,9 @@ static inline int btree_iter_err(struct bkey_s_c k) /* new multiple iterator interface: */ -int bch2_trans_preload_iters(struct btree_trans *); -void bch2_trans_iter_free(struct btree_trans *, - struct btree_iter *); +void bch2_trans_preload_iters(struct btree_trans *); +void bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); +void bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, struct bpos, unsigned, u64); @@ -308,6 +308,11 @@ bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) void __bch2_trans_begin(struct btree_trans *); +static inline void bch2_trans_begin_updates(struct btree_trans *trans) +{ + trans->nr_updates = 0; +} + void *bch2_trans_kmalloc(struct btree_trans *, size_t); int bch2_trans_unlock(struct btree_trans *); void 
bch2_trans_init(struct btree_trans *, struct bch_fs *); diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 419d0e81..9bbed99e 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -146,6 +146,26 @@ static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, __btree_node_lock_type(c, b, type); } +/* + * Lock a btree node if we already have it locked on one of our linked + * iterators: + */ +static inline bool btree_node_lock_increment(struct btree_iter *iter, + struct btree *b, unsigned level, + enum btree_node_locked_type want) +{ + struct btree_iter *linked; + + for_each_linked_btree_iter(iter, linked) + if (linked->l[level].b == b && + btree_node_locked_type(linked, level) >= want) { + six_lock_increment(&b->lock, want); + return true; + } + + return false; +} + bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, struct btree_iter *, enum six_lock_type, bool); @@ -158,6 +178,7 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos, EBUG_ON(level >= BTREE_MAX_DEPTH); return likely(six_trylock_type(&b->lock, type)) || + btree_node_lock_increment(iter, b, level, type) || __bch2_btree_node_lock(b, pos, level, iter, type, may_drop_locks); } @@ -184,7 +205,7 @@ void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) { EBUG_ON(iter->l[b->level].b != b); - EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq); + EBUG_ON(iter->l[b->level].lock_seq != b->lock.state.seq); if (!six_trylock_write(&b->lock)) __bch2_btree_node_lock_write(b, iter); diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 39e2db75..5f137af4 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -175,8 +175,6 @@ struct btree_cache { }; struct btree_node_iter { - u8 is_extents; - struct btree_node_iter_set { u16 k, end; } data[MAX_BSETS]; @@ -197,11 +195,7 @@ enum btree_iter_type { * @pos or the first key strictly greater than @pos */ #define BTREE_ITER_IS_EXTENTS (1 << 4) -/* - * indicates we need to call bch2_btree_iter_traverse() to revalidate iterator: - */ -#define BTREE_ITER_AT_END_OF_LEAF (1 << 5) -#define BTREE_ITER_ERROR (1 << 6) +#define BTREE_ITER_ERROR (1 << 5) enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, @@ -232,10 +226,9 @@ struct btree_iter { struct btree_iter_level { struct btree *b; struct btree_node_iter iter; + u32 lock_seq; } l[BTREE_MAX_DEPTH]; - u32 lock_seq[BTREE_MAX_DEPTH]; - /* * Current unpacked key - so that bch2_btree_iter_next()/ * bch2_btree_iter_next_slot() can correctly advance pos. 
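
Aside, not part of the patch: a minimal standalone sketch of the idea behind caching a lock sequence number per iterator level (the l[level].lock_seq field introduced above). All names here (toy_node, toy_level, toy_relock) are invented for illustration; this is a toy model of the sequence check, not bcachefs code or the six-lock API.

/*
 * Toy model: each iterator level remembers the lock sequence it saw when it
 * last held the node's lock. If the node's sequence is unchanged, no writer
 * has touched the node since, so the lock can be re-taken cheaply.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_node {
	unsigned seq;		/* bumped whenever a writer unlocks */
};

struct toy_level {
	struct toy_node *b;
	unsigned lock_seq;	/* sequence observed when the lock was last held */
};

static void toy_write_unlock(struct toy_node *b)
{
	b->seq += 2;		/* keep parity, seqlock style */
}

static bool toy_relock(struct toy_level *l)
{
	/* cheap revalidation: nothing changed since we dropped the lock */
	return l->b && l->lock_seq == l->b->seq;
}

int main(void)
{
	struct toy_node n = { .seq = 4 };
	struct toy_level l = { .b = &n, .lock_seq = n.seq };

	printf("relock before write: %d\n", toy_relock(&l));	/* 1 */
	toy_write_unlock(&n);
	printf("relock after write:  %d\n", toy_relock(&l));	/* 0 */
	return 0;
}

A matching sequence number proves no writer intervened, so the iterator can revalidate its position without re-traversing from the root; a mismatch forces the slow path.
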
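In the same spirit, a hedged sketch of the simplified tie-break rule in the node iterator comparator ("when keys compare equal deleted keys come first"), using toy types rather than real packed bkeys; toy_key and toy_iter_cmp are illustrative names only.

#include <stdio.h>

struct toy_key {
	unsigned long long pos;
	int deleted;			/* stand-in for bkey_deleted() */
};

static int toy_iter_cmp(const struct toy_key *l, const struct toy_key *r)
{
	if (l->pos != r->pos)
		return l->pos < r->pos ? -1 : 1;
	/* at equal position, deleted keys sort first: r deleted minus l deleted */
	if (r->deleted != l->deleted)
		return r->deleted - l->deleted;
	/* final tie-break on address, mirroring the patch's pointer order */
	return (l > r) - (l < r);
}

int main(void)
{
	struct toy_key live = { .pos = 10, .deleted = 0 };
	struct toy_key dead = { .pos = 10, .deleted = 1 };

	/* negative: the deleted key orders before the live one at the same pos */
	printf("%d\n", toy_iter_cmp(&dead, &live));
	return 0;
}
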
@@ -258,12 +251,6 @@ struct btree_iter { struct btree_insert_entry { struct btree_iter *iter; struct bkey_i *k; - unsigned extra_res; - /* - * true if entire key was inserted - can only be false for - * extents - */ - bool done; }; struct btree_trans { @@ -339,10 +326,38 @@ static inline struct bset_tree *bset_tree_last(struct btree *b) return b->set + b->nsets - 1; } +static inline void * +__btree_node_offset_to_ptr(const struct btree *b, u16 offset) +{ + return (void *) ((u64 *) b->data + 1 + offset); +} + +static inline u16 +__btree_node_ptr_to_offset(const struct btree *b, const void *p) +{ + u16 ret = (u64 *) p - 1 - (u64 *) b->data; + + EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); + return ret; +} + static inline struct bset *bset(const struct btree *b, const struct bset_tree *t) { - return (void *) b->data + t->data_offset * sizeof(u64); + return __btree_node_offset_to_ptr(b, t->data_offset); +} + +static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) +{ + t->end_offset = + __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t))); +} + +static inline void set_btree_bset(struct btree *b, struct bset_tree *t, + const struct bset *i) +{ + t->data_offset = __btree_node_ptr_to_offset(b, i); + set_btree_bset_end(b, t); } static inline struct bset *btree_bset_first(struct btree *b) @@ -358,19 +373,27 @@ static inline struct bset *btree_bset_last(struct btree *b) static inline u16 __btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) { - size_t ret = (u64 *) k - (u64 *) b->data - 1; - - EBUG_ON(ret > U16_MAX); - return ret; + return __btree_node_ptr_to_offset(b, k); } static inline struct bkey_packed * __btree_node_offset_to_key(const struct btree *b, u16 k) { - return (void *) ((u64 *) b->data + k + 1); + return __btree_node_offset_to_ptr(b, k); } -#define btree_bkey_first(_b, _t) (bset(_b, _t)->start) +static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) +{ + return t->data_offset + offsetof(struct bset, _data) / sizeof(u64); +} + +#define btree_bkey_first(_b, _t) \ +({ \ + EBUG_ON(bset(_b, _t)->start != \ + __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\ + \ + bset(_b, _t)->start; \ +}) #define btree_bkey_last(_b, _t) \ ({ \ @@ -380,23 +403,6 @@ __btree_node_offset_to_key(const struct btree *b, u16 k) __btree_node_offset_to_key(_b, (_t)->end_offset); \ }) -static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) -{ - t->end_offset = - __btree_node_key_to_offset(b, vstruct_last(bset(b, t))); - btree_bkey_last(b, t); -} - -static inline void set_btree_bset(struct btree *b, struct bset_tree *t, - const struct bset *i) -{ - t->data_offset = (u64 *) i - (u64 *) b->data; - - EBUG_ON(bset(b, t) != i); - - set_btree_bset_end(b, t); -} - static inline unsigned bset_byte_offset(struct btree *b, void *i) { return i - (void *) b->data; @@ -439,28 +445,17 @@ struct btree_root { * we're holding the write lock and we know what key is about to be overwritten: */ -struct btree_iter; -struct btree_node_iter; - enum btree_insert_ret { BTREE_INSERT_OK, /* extent spanned multiple leaf nodes: have to traverse to next node: */ BTREE_INSERT_NEED_TRAVERSE, /* write lock held for too long */ - BTREE_INSERT_NEED_RESCHED, /* leaf node needs to be split */ BTREE_INSERT_BTREE_NODE_FULL, - BTREE_INSERT_JOURNAL_RES_FULL, BTREE_INSERT_ENOSPC, BTREE_INSERT_NEED_GC_LOCK, }; -struct extent_insert_hook { - enum btree_insert_ret - (*fn)(struct extent_insert_hook *, struct bpos, struct bpos, - struct bkey_s_c, const 
struct bkey_i *); -}; - enum btree_gc_coalesce_fail_reason { BTREE_GC_COALESCE_FAIL_RESERVE_GET, BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 5e47d4cd..882e1c27 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -22,7 +22,6 @@ struct btree_insert { struct disk_reservation *disk_res; struct journal_res journal_res; u64 *journal_seq; - struct extent_insert_hook *hook; unsigned flags; bool did_work; @@ -32,22 +31,10 @@ struct btree_insert { int __bch2_btree_insert_at(struct btree_insert *); -#define _TENTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, N, ...) N -#define COUNT_ARGS(...) _TENTH_ARG(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1) - #define BTREE_INSERT_ENTRY(_iter, _k) \ ((struct btree_insert_entry) { \ .iter = (_iter), \ .k = (_k), \ - .done = false, \ - }) - -#define BTREE_INSERT_ENTRY_EXTRA_RES(_iter, _k, _extra) \ - ((struct btree_insert_entry) { \ - .iter = (_iter), \ - .k = (_k), \ - .extra_res = (_extra), \ - .done = false, \ }) /** @@ -63,13 +50,11 @@ int __bch2_btree_insert_at(struct btree_insert *); * -EROFS: filesystem read only * -EIO: journal or btree node IO error */ -#define bch2_btree_insert_at(_c, _disk_res, _hook, \ - _journal_seq, _flags, ...) \ +#define bch2_btree_insert_at(_c, _disk_res, _journal_seq, _flags, ...) \ __bch2_btree_insert_at(&(struct btree_insert) { \ .c = (_c), \ .disk_res = (_disk_res), \ .journal_seq = (_journal_seq), \ - .hook = (_hook), \ .flags = (_flags), \ .nr = COUNT_ARGS(__VA_ARGS__), \ .entries = (struct btree_insert_entry[]) { \ @@ -123,17 +108,13 @@ enum { int bch2_btree_delete_at(struct btree_iter *, unsigned); int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *, - struct disk_reservation *, - struct extent_insert_hook *, u64 *, unsigned); + struct disk_reservation *, u64 *, unsigned); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, - struct disk_reservation *, - struct extent_insert_hook *, u64 *, int flags); + struct disk_reservation *, u64 *, int flags); int bch2_btree_delete_range(struct bch_fs *, enum btree_id, - struct bpos, struct bpos, struct bversion, - struct disk_reservation *, - struct extent_insert_hook *, u64 *); + struct bpos, struct bpos, u64 *); int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, __le64, unsigned); @@ -142,11 +123,17 @@ int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, /* new transactional interface: */ -void bch2_trans_update(struct btree_trans *, struct btree_iter *, - struct bkey_i *, unsigned); +static inline void +bch2_trans_update(struct btree_trans *trans, + struct btree_insert_entry entry) +{ + BUG_ON(trans->nr_updates >= ARRAY_SIZE(trans->updates)); + + trans->updates[trans->nr_updates++] = entry; +} + int bch2_trans_commit(struct btree_trans *, struct disk_reservation *, - struct extent_insert_hook *, u64 *, unsigned); #define bch2_trans_do(_c, _journal_seq, _flags, _do) \ @@ -159,7 +146,7 @@ int bch2_trans_commit(struct btree_trans *, do { \ bch2_trans_begin(&trans); \ \ - _ret = (_do) ?: bch2_trans_commit(&trans, NULL, NULL, \ + _ret = (_do) ?: bch2_trans_commit(&trans, NULL, \ (_journal_seq), (_flags)); \ } while (_ret == -EINTR); \ \ diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 392ee0a0..a6832ef7 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -34,7 +34,7 @@ static void btree_node_interior_verify(struct btree *b) BUG_ON(!b->level); - 
bch2_btree_node_iter_init(&iter, b, b->key.k.p, false, false); + bch2_btree_node_iter_init(&iter, b, b->key.k.p, false); #if 1 BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) || bkey_cmp_left_packed(b, k, &b->key.k.p)); @@ -183,7 +183,8 @@ found: */ replicas = bch2_extent_nr_dirty_ptrs(k); if (replicas) - stats->s[replicas - 1].data[S_META] -= c->opts.btree_node_size; + stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -= + c->opts.btree_node_size * replicas; /* * We're dropping @k from the btree, but it's still live until the @@ -210,7 +211,7 @@ found: struct bch_fs_usage tmp = { 0 }; bch2_mark_key(c, bkey_i_to_s_c(&d->key), - -c->opts.btree_node_size, true, b + -c->opts.btree_node_size, BCH_DATA_BTREE, b ? gc_pos_btree_node(b) : gc_pos_btree_root(as->btree_id), &tmp, 0, 0); @@ -289,7 +290,7 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, BUG_ON(!pending->index_update_done); bch2_mark_key(c, bkey_i_to_s_c(&pending->key), - -c->opts.btree_node_size, true, + -c->opts.btree_node_size, BCH_DATA_BTREE, gc_phase(GC_PHASE_PENDING_DELETE), &stats, 0, 0); /* @@ -578,6 +579,8 @@ static void bch2_btree_update_free(struct btree_update *as) { struct bch_fs *c = as->c; + bch2_journal_pin_flush(&c->journal, &as->journal); + BUG_ON(as->nr_new_nodes); BUG_ON(as->nr_pending); @@ -1095,7 +1098,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) __bch2_btree_set_root_inmem(c, b); bch2_mark_key(c, bkey_i_to_s_c(&b->key), - c->opts.btree_node_size, true, + c->opts.btree_node_size, BCH_DATA_BTREE, gc_pos_btree_root(b->btree_id), &stats, 0, 0); @@ -1142,7 +1145,8 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b, struct btree *old; trace_btree_set_root(c, b); - BUG_ON(!b->written); + BUG_ON(!b->written && + !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)); old = btree_node_root(c, b); @@ -1182,7 +1186,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b if (bkey_extent_is_data(&insert->k)) bch2_mark_key(c, bkey_i_to_s_c(insert), - c->opts.btree_node_size, true, + c->opts.btree_node_size, BCH_DATA_BTREE, gc_pos_btree_node(b), &stats, 0, 0); while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && @@ -1317,7 +1321,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); - bch2_btree_node_iter_init(&node_iter, b, k->k.p, false, false); + bch2_btree_node_iter_init(&node_iter, b, k->k.p, false); while (!bch2_keylist_empty(keys)) { k = bch2_keylist_front(keys); @@ -1963,7 +1967,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bch2_btree_node_lock_write(b, iter); bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i), - c->opts.btree_node_size, true, + c->opts.btree_node_size, BCH_DATA_BTREE, gc_pos_btree_root(b->btree_id), &stats, 0, 0); bch2_btree_node_free_index(as, NULL, @@ -2150,7 +2154,7 @@ ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf) as->mode, as->nodes_written, atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, - bch2_journal_pin_seq(&c->journal, &as->journal)); + as->journal.seq); mutex_unlock(&c->btree_interior_update_lock); return out - buf; diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index e6f05071..fa30809d 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -160,15 +160,6 @@ static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c, { struct btree *b; - /* - * iterators are inconsistent when they hit 
end of leaf, until - * traversed again - * - * XXX inconsistent how? - */ - if (iter->flags & BTREE_ITER_AT_END_OF_LEAF) - return; - if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) return; @@ -240,14 +231,19 @@ static inline void *write_block(struct btree *b) return (void *) b->data + (b->written << 9); } -static inline bool bset_written(struct btree *b, struct bset *i) +static inline bool __btree_addr_written(struct btree *b, void *p) { - return (void *) i < write_block(b); + return p < write_block(b); } -static inline bool bset_unwritten(struct btree *b, struct bset *i) +static inline bool bset_written(struct btree *b, struct bset *i) { - return (void *) i > write_block(b); + return __btree_addr_written(b, i); +} + +static inline bool bkey_written(struct btree *b, struct bkey_packed *k) +{ + return __btree_addr_written(b, k); } static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, @@ -306,10 +302,9 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, return NULL; } -static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) +static inline void unreserve_whiteout(struct btree *b, struct bkey_packed *k) { - if (bset_written(b, bset(b, t))) { + if (bkey_written(b, k)) { EBUG_ON(b->uncompacted_whiteout_u64s < bkeyp_key_u64s(&b->format, k)); b->uncompacted_whiteout_u64s -= @@ -317,10 +312,9 @@ static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t, } } -static inline void reserve_whiteout(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) +static inline void reserve_whiteout(struct btree *b, struct bkey_packed *k) { - if (bset_written(b, bset(b, t))) { + if (bkey_written(b, k)) { BUG_ON(!k->needs_whiteout); b->uncompacted_whiteout_u64s += bkeyp_key_u64s(&b->format, k); @@ -332,40 +326,14 @@ static inline void reserve_whiteout(struct btree *b, struct bset_tree *t, * insert into could be written out from under us) */ static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, - struct btree *b, unsigned u64s) + struct btree *b, unsigned u64s) { if (unlikely(btree_node_fake(b))) return false; - if (btree_node_is_extents(b)) { - /* The insert key might split an existing key - * (bch2_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case: - */ - u64s += BKEY_EXTENT_U64s_MAX; - } - return u64s <= bch_btree_keys_u64s_remaining(c, b); } -static inline bool journal_res_insert_fits(struct btree_insert *trans, - struct btree_insert_entry *insert) -{ - unsigned u64s = 0; - struct btree_insert_entry *i; - - /* - * If we didn't get a journal reservation, we're in journal replay and - * we're not journalling updates: - */ - if (!trans->journal_res.ref) - return true; - - for (i = insert; i < trans->entries + trans->nr; i++) - u64s += jset_u64s(i->k->k.u64s + i->extra_res); - - return u64s <= trans->journal_res.u64s; -} - ssize_t bch2_btree_updates_print(struct bch_fs *, char *); size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index a481b0d6..33c913f7 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -24,7 +24,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, { const struct bkey_format *f = &b->format; struct bkey_packed *k; - struct bset_tree *t; unsigned clobber_u64s; EBUG_ON(btree_node_just_written(b)); @@ -37,9 +36,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, if (k && !bkey_cmp_packed(b, k, &insert->k)) { BUG_ON(bkey_whiteout(k)); - t 
= bch2_bkey_to_bset(b, k); - - if (bset_unwritten(b, bset(b, t)) && + if (!bkey_written(b, k) && bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) && !bkey_whiteout(&insert->k)) { k->type = insert->k.type; @@ -50,9 +47,9 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, insert->k.needs_whiteout = k->needs_whiteout; - btree_keys_account_key_drop(&b->nr, t - b->set, k); + btree_account_key_drop(b, k); - if (t == bset_tree_last(b)) { + if (k >= btree_bset_last(b)->start) { clobber_u64s = k->u64s; /* @@ -62,8 +59,9 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, */ if (bkey_whiteout(&insert->k) && !k->needs_whiteout) { bch2_bset_delete(b, k, clobber_u64s); - bch2_btree_node_iter_fix(iter, b, node_iter, t, - k, clobber_u64s, 0); + bch2_btree_node_iter_fix(iter, b, node_iter, + k, clobber_u64s, 0); + bch2_btree_iter_verify(iter, b); return true; } @@ -71,11 +69,12 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, } k->type = KEY_TYPE_DELETED; - bch2_btree_node_iter_fix(iter, b, node_iter, t, k, - k->u64s, k->u64s); + bch2_btree_node_iter_fix(iter, b, node_iter, k, + k->u64s, k->u64s); + bch2_btree_iter_verify(iter, b); if (bkey_whiteout(&insert->k)) { - reserve_whiteout(b, t, k); + reserve_whiteout(b, k); return true; } else { k->needs_whiteout = false; @@ -90,14 +89,14 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, insert->k.needs_whiteout = false; } - t = bset_tree_last(b); - k = bch2_btree_node_iter_bset_pos(node_iter, b, t); + k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); clobber_u64s = 0; overwrite: bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k)) - bch2_btree_node_iter_fix(iter, b, node_iter, t, k, - clobber_u64s, k->u64s); + bch2_btree_node_iter_fix(iter, b, node_iter, k, + clobber_u64s, k->u64s); + bch2_btree_iter_verify(iter, b); return true; } @@ -110,8 +109,7 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, btree_node_lock_type(c, b, SIX_LOCK_read); bch2_btree_node_write_cond(c, b, - (btree_current_write(b) == w && - w->journal.pin_list == journal_seq_pin(j, seq))); + (btree_current_write(b) == w && w->journal.seq == seq)); six_unlock_read(&b->lock); } @@ -297,6 +295,30 @@ static inline int btree_trans_cmp(struct btree_insert_entry l, /* Normal update interface: */ +static enum btree_insert_ret +btree_key_can_insert(struct btree_insert *trans, + struct btree_insert_entry *insert, + unsigned *u64s) +{ + struct bch_fs *c = trans->c; + struct btree *b = insert->iter->l[0].b; + static enum btree_insert_ret ret; + + if (unlikely(btree_node_fake(b))) + return BTREE_INSERT_BTREE_NODE_FULL; + + ret = !btree_node_is_extents(b) + ? 
BTREE_INSERT_OK + : bch2_extent_can_insert(trans, insert, u64s); + if (ret) + return ret; + + if (*u64s > bch_btree_keys_u64s_remaining(c, b)) + return BTREE_INSERT_BTREE_NODE_FULL; + + return BTREE_INSERT_OK; +} + /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ @@ -309,14 +331,12 @@ static inline int do_btree_insert_at(struct btree_insert *trans, unsigned u64s; int ret; - trans_for_each_entry(trans, i) { - BUG_ON(i->done); + trans_for_each_entry(trans, i) BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); - } u64s = 0; trans_for_each_entry(trans, i) - u64s += jset_u64s(i->k->k.u64s + i->extra_res); + u64s += jset_u64s(i->k->k.u64s); memset(&trans->journal_res, 0, sizeof(trans->journal_res)); @@ -336,24 +356,34 @@ static inline int do_btree_insert_at(struct btree_insert *trans, goto out; } + /* + * Check if the insert will fit in the leaf node with the write lock + * held, otherwise another thread could write the node changing the + * amount of space available: + */ u64s = 0; trans_for_each_entry(trans, i) { /* Multiple inserts might go to same leaf: */ if (!same_leaf_as_prev(trans, i)) u64s = 0; - /* - * bch2_btree_node_insert_fits() must be called under write lock: - * with only an intent lock, another thread can still call - * bch2_btree_node_write(), converting an unwritten bset to a - * written one - */ - u64s += i->k->k.u64s + i->extra_res; - if (!bch2_btree_node_insert_fits(c, - i->iter->l[0].b, u64s)) { + u64s += i->k->k.u64s; + switch (btree_key_can_insert(trans, i, &u64s)) { + case BTREE_INSERT_OK: + break; + case BTREE_INSERT_BTREE_NODE_FULL: ret = -EINTR; *split = i->iter; goto out; + case BTREE_INSERT_ENOSPC: + ret = -ENOSPC; + goto out; + case BTREE_INSERT_NEED_GC_LOCK: + ret = -EINTR; + *cycle_gc_lock = true; + goto out; + default: + BUG(); } } @@ -369,34 +399,14 @@ static inline int do_btree_insert_at(struct btree_insert *trans, trans_for_each_entry(trans, i) { switch (btree_insert_key_leaf(trans, i)) { case BTREE_INSERT_OK: - i->done = true; break; - case BTREE_INSERT_JOURNAL_RES_FULL: case BTREE_INSERT_NEED_TRAVERSE: - case BTREE_INSERT_NEED_RESCHED: + BUG_ON((trans->flags & BTREE_INSERT_ATOMIC)); ret = -EINTR; - break; - case BTREE_INSERT_BTREE_NODE_FULL: - ret = -EINTR; - *split = i->iter; - break; - case BTREE_INSERT_ENOSPC: - ret = -ENOSPC; - break; - case BTREE_INSERT_NEED_GC_LOCK: - ret = -EINTR; - *cycle_gc_lock = true; - break; + goto out; default: BUG(); } - - /* - * If we did some work (i.e. 
inserted part of an extent), - * we have to do all the other updates as well: - */ - if (!trans->did_work && (ret || *split)) - break; } out: multi_unlock_write(trans); @@ -490,13 +500,8 @@ out: bch2_btree_iter_verify_locks(linked); BUG_ON((trans->flags & BTREE_INSERT_NOUNLOCK) && trans->did_work && - linked->uptodate >= BTREE_ITER_NEED_RELOCK); + !btree_node_locked(linked, 0)); } - - /* make sure we didn't lose an error: */ - if (!ret) - trans_for_each_entry(trans, i) - BUG_ON(!i->done); } BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); @@ -581,29 +586,8 @@ err: goto out; } -void bch2_trans_update(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *k, - unsigned extra_journal_res) -{ - struct btree_insert_entry *i; - - BUG_ON(trans->nr_updates >= ARRAY_SIZE(trans->updates)); - - i = &trans->updates[trans->nr_updates++]; - - *i = (struct btree_insert_entry) { - .iter = iter, - .k = k, - .extra_res = extra_journal_res, - }; - - btree_insert_entry_checks(trans->c, i); -} - int bch2_trans_commit(struct btree_trans *trans, struct disk_reservation *disk_res, - struct extent_insert_hook *hook, u64 *journal_seq, unsigned flags) { @@ -631,7 +615,7 @@ int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags) bkey_init(&k.k); k.k.p = iter->pos; - return bch2_btree_insert_at(iter->c, NULL, NULL, NULL, + return bch2_btree_insert_at(iter->c, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE|flags, BTREE_INSERT_ENTRY(iter, &k)); @@ -640,7 +624,6 @@ int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags) int bch2_btree_insert_list_at(struct btree_iter *iter, struct keylist *keys, struct disk_reservation *disk_res, - struct extent_insert_hook *hook, u64 *journal_seq, unsigned flags) { BUG_ON(flags & BTREE_INSERT_ATOMIC); @@ -648,7 +631,7 @@ int bch2_btree_insert_list_at(struct btree_iter *iter, bch2_verify_keylist_sorted(keys); while (!bch2_keylist_empty(keys)) { - int ret = bch2_btree_insert_at(iter->c, disk_res, hook, + int ret = bch2_btree_insert_at(iter->c, disk_res, journal_seq, flags, BTREE_INSERT_ENTRY(iter, bch2_keylist_front(keys))); if (ret) @@ -670,7 +653,6 @@ int bch2_btree_insert_list_at(struct btree_iter *iter, int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, struct disk_reservation *disk_res, - struct extent_insert_hook *hook, u64 *journal_seq, int flags) { struct btree_iter iter; @@ -678,7 +660,7 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); - ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags, + ret = bch2_btree_insert_at(c, disk_res, journal_seq, flags, BTREE_INSERT_ENTRY(&iter, k)); bch2_btree_iter_unlock(&iter); @@ -691,12 +673,8 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, * Range is a half open interval - [start, end) */ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, - struct bpos start, - struct bpos end, - struct bversion version, - struct disk_reservation *disk_res, - struct extent_insert_hook *hook, - u64 *journal_seq) + struct bpos start, struct bpos end, + u64 *journal_seq) { struct btree_iter iter; struct bkey_s_c k; @@ -706,14 +684,12 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, BTREE_ITER_INTENT); while ((k = bch2_btree_iter_peek(&iter)).k && - !(ret = btree_iter_err(k))) { + !(ret = btree_iter_err(k)) && + bkey_cmp(iter.pos, end) < 0) { unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); /* really shouldn't be using a 
bare, unpadded bkey_i */ struct bkey_i delete; - if (bkey_cmp(iter.pos, end) >= 0) - break; - bkey_init(&delete.k); /* @@ -727,7 +703,6 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, * bkey_start_pos(k.k)). */ delete.k.p = iter.pos; - delete.k.version = version; if (iter.flags & BTREE_ITER_IS_EXTENTS) { /* create the biggest key we can */ @@ -735,7 +710,7 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, bch2_cut_back(end, &delete.k); } - ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, + ret = bch2_btree_insert_at(c, NULL, journal_seq, BTREE_INSERT_NOFAIL, BTREE_INSERT_ENTRY(&iter, &delete)); if (ret) diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 43112445..801f6c37 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -72,6 +72,8 @@ #include #include +static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); + #ifdef DEBUG_BUCKETS #define lg_local_lock lg_global_lock @@ -81,22 +83,26 @@ static void bch2_fs_stats_verify(struct bch_fs *c) { struct bch_fs_usage stats = __bch2_fs_usage_read(c); - unsigned i; + unsigned i, j; - for (i = 0; i < ARRAY_SIZE(stats.s); i++) { - if ((s64) stats.s[i].data[S_META] < 0) - panic("replicas %u meta underflow: %lli\n", - i + 1, stats.s[i].data[S_META]); + for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) { + for (j = 0; j < ARRAY_SIZE(stats.replicas[i].data); j++) + if ((s64) stats.replicas[i].data[j] < 0) + panic("replicas %u %s sectors underflow: %lli\n", + i + 1, bch_data_types[j], + stats.replicas[i].data[j]); - if ((s64) stats.s[i].data[S_DIRTY] < 0) - panic("replicas %u dirty underflow: %lli\n", - i + 1, stats.s[i].data[S_DIRTY]); - - if ((s64) stats.s[i].persistent_reserved < 0) + if ((s64) stats.replicas[i].persistent_reserved < 0) panic("replicas %u reserved underflow: %lli\n", - i + 1, stats.s[i].persistent_reserved); + i + 1, stats.replicas[i].persistent_reserved); } + for (j = 0; j < ARRAY_SIZE(stats.buckets); j++) + if ((s64) stats.replicas[i].data_buckets[j] < 0) + panic("%s buckets underflow: %lli\n", + bch_data_types[j], + stats.buckets[j]); + if ((s64) stats.online_reserved < 0) panic("sectors_online_reserved underflow: %lli\n", stats.online_reserved); @@ -146,6 +152,7 @@ static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {} */ void bch2_bucket_seq_cleanup(struct bch_fs *c) { + u64 journal_seq = atomic64_read(&c->journal.seq); u16 last_seq_ondisk = c->journal.last_seq_ondisk; struct bch_dev *ca; struct bucket_array *buckets; @@ -153,6 +160,12 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c) struct bucket_mark m; unsigned i; + if (journal_seq - c->last_bucket_seq_cleanup < + (1U << (BUCKET_JOURNAL_SEQ_BITS - 2))) + return; + + c->last_bucket_seq_cleanup = journal_seq; + for_each_member_device(ca, c, i) { down_read(&ca->bucket_lock); buckets = bucket_array(ca); @@ -232,7 +245,9 @@ bch2_fs_usage_read(struct bch_fs *c) } struct fs_usage_sum { + u64 hidden; u64 data; + u64 cached; u64 reserved; }; @@ -241,10 +256,19 @@ static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats) struct fs_usage_sum sum = { 0 }; unsigned i; - for (i = 0; i < ARRAY_SIZE(stats.s); i++) { - sum.data += (stats.s[i].data[S_META] + - stats.s[i].data[S_DIRTY]) * (i + 1); - sum.reserved += stats.s[i].persistent_reserved * (i + 1); + /* + * For superblock and journal we count bucket usage, not sector usage, + * because any internal fragmentation should _not_ be counted as + * free space: + */ + sum.hidden += stats.buckets[BCH_DATA_SB]; + 
sum.hidden += stats.buckets[BCH_DATA_JOURNAL]; + + for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) { + sum.data += stats.replicas[i].data[BCH_DATA_BTREE]; + sum.data += stats.replicas[i].data[BCH_DATA_USER]; + sum.cached += stats.replicas[i].data[BCH_DATA_CACHED]; + sum.reserved += stats.replicas[i].persistent_reserved; } sum.reserved += stats.online_reserved; @@ -260,14 +284,14 @@ static u64 reserve_factor(u64 r) static u64 avail_factor(u64 r) { - return (r << RESERVE_FACTOR) / (1 << RESERVE_FACTOR) + 1; + return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); } -u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) +static inline u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) { struct fs_usage_sum sum = __fs_usage_sum(stats); - return sum.data + reserve_factor(sum.reserved); + return sum.hidden + sum.data + reserve_factor(sum.reserved); } u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) @@ -275,9 +299,9 @@ u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) return min(c->capacity, __bch2_fs_sectors_used(c, stats)); } -u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats) +static u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats) { - return avail_factor(c->capacity - bch2_fs_sectors_used(c, stats)); + return c->capacity - bch2_fs_sectors_used(c, stats); } static inline int is_unavailable_bucket(struct bucket_mark m) @@ -313,9 +337,9 @@ static bool bucket_became_unavailable(struct bch_fs *c, } void bch2_fs_usage_apply(struct bch_fs *c, - struct bch_fs_usage *stats, - struct disk_reservation *disk_res, - struct gc_pos gc_pos) + struct bch_fs_usage *stats, + struct disk_reservation *disk_res, + struct gc_pos gc_pos) { struct fs_usage_sum sum = __fs_usage_sum(*stats); s64 added = sum.data + sum.reserved; @@ -347,21 +371,21 @@ void bch2_fs_usage_apply(struct bch_fs *c, } static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + struct bch_fs_usage *stats, struct bucket_mark old, struct bucket_mark new) { struct bch_dev_usage *dev_usage; - if (c) - percpu_rwsem_assert_held(&c->usage_lock); + percpu_rwsem_assert_held(&c->usage_lock); - if (old.data_type && new.data_type && - old.data_type != new.data_type) { - BUG_ON(!c); - bch2_fs_inconsistent(c, - "different types of data in same bucket: %s, %s", - bch2_data_types[old.data_type], - bch2_data_types[new.data_type]); - } + bch2_fs_inconsistent_on(old.data_type && new.data_type && + old.data_type != new.data_type, c, + "different types of data in same bucket: %s, %s", + bch2_data_types[old.data_type], + bch2_data_types[new.data_type]); + + stats->buckets[bucket_type(old)] -= ca->mi.bucket_size; + stats->buckets[bucket_type(new)] += ca->mi.bucket_size; dev_usage = this_cpu_ptr(ca->usage_percpu); @@ -386,17 +410,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, bch2_dev_stats_verify(ca); } -#define bucket_data_cmpxchg(c, ca, g, new, expr) \ +#define bucket_data_cmpxchg(c, ca, stats, g, new, expr) \ ({ \ struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \ \ - bch2_dev_usage_update(c, ca, _old, new); \ + bch2_dev_usage_update(c, ca, stats, _old, new); \ _old; \ }) -bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, +void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, struct bucket_mark *old) { + struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu); struct bucket *g; struct bucket_mark new; @@ -404,11 +429,8 @@ bool 
bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, g = bucket(ca, b); - *old = bucket_data_cmpxchg(c, ca, g, new, ({ - if (!is_available_bucket(new)) { - percpu_up_read_preempt_enable(&c->usage_lock); - return false; - } + *old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ + BUG_ON(!is_available_bucket(new)); new.owned_by_allocator = 1; new.data_type = 0; @@ -417,16 +439,22 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, new.gen++; })); + /* + * This isn't actually correct yet, since fs usage is still + * uncompressed sectors: + */ + stats->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors; + if (!old->owned_by_allocator && old->cached_sectors) trace_invalidate(ca, bucket_to_sector(ca, b), old->cached_sectors); - return true; } void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator, struct gc_pos pos, unsigned flags) { + struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu); struct bucket *g; struct bucket_mark old, new; @@ -437,7 +465,7 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, gc_will_visit(c, pos)) return; - old = bucket_data_cmpxchg(c, ca, g, new, ({ + old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ new.owned_by_allocator = owned_by_allocator; })); @@ -445,17 +473,11 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, c->gc_pos.phase == GC_PHASE_DONE); } -#define saturated_add(ca, dst, src, max) \ +#define checked_add(a, b) \ do { \ - BUG_ON((int) (dst) + (src) < 0); \ - if ((dst) == (max)) \ - ; \ - else if ((dst) + (src) <= (max)) \ - dst += (src); \ - else { \ - dst = (max); \ - trace_sectors_saturated(ca); \ - } \ + unsigned _res = (unsigned) (a) + (b); \ + (a) = _res; \ + BUG_ON((a) != _res); \ } while (0) void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, @@ -463,10 +485,12 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, unsigned sectors, struct gc_pos pos, unsigned flags) { + struct bch_fs_usage *stats; struct bucket *g; struct bucket_mark old, new; - BUG_ON(!type); + BUG_ON(type != BCH_DATA_SB && + type != BCH_DATA_JOURNAL); if (likely(c)) { percpu_rwsem_assert_held(&c->usage_lock); @@ -474,25 +498,32 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && gc_will_visit(c, pos)) return; + + stats = this_cpu_ptr(c->usage_percpu); + + g = bucket(ca, b); + old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ + new.data_type = type; + checked_add(new.dirty_sectors, sectors); + })); + + stats->replicas[0].data[type] += sectors; + } else { + rcu_read_lock(); + + g = bucket(ca, b); + old = bucket_cmpxchg(g, new, ({ + new.data_type = type; + checked_add(new.dirty_sectors, sectors); + })); + + rcu_read_unlock(); } - rcu_read_lock(); - - g = bucket(ca, b); - old = bucket_data_cmpxchg(c, ca, g, new, ({ - saturated_add(ca, new.dirty_sectors, sectors, - GC_MAX_SECTORS_USED); - new.data_type = type; - })); - - rcu_read_unlock(); - BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && bucket_became_unavailable(c, old, new)); } -/* Reverting this until the copygc + compression issue is fixed: */ - static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors) { if (!sectors) @@ -511,16 +542,15 @@ static void bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c_extent e, const struct bch_extent_ptr *ptr, struct bch_extent_crc_unpacked crc, - s64 sectors, enum s_alloc type, - struct bch_fs_usage *stats, + s64 sectors, enum bch_data_type 
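/*
 * Minimal sketch of the checked_add() semantics introduced above, which
 * replaces the old saturating saturated_add(): on overflow the narrow
 * destination truncates, the re-read value no longer matches the wide sum,
 * and the BUG_ON fires.  (u16 is used purely for illustration; the real
 * destinations are bitfields in struct bucket_mark.)
 */
static void checked_add_sketch(void)
{
	u16 dirty_sectors = 65000;
	unsigned sum = (unsigned) dirty_sectors + 1000;	/* 66000 */

	dirty_sectors = sum;			/* truncates to 66000 - 65536 = 464 */
	BUG_ON(dirty_sectors != sum);		/* overflow is now a bug, not saturation */
}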
data_type, + unsigned replicas, + struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { struct bucket_mark old, new; - unsigned saturated; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g = PTR_BUCKET(ca, ptr); - enum bch_data_type data_type = type == S_META - ? BCH_DATA_BTREE : BCH_DATA_USER; + s64 uncompressed_sectors = sectors; u64 v; if (crc.compression_type) { @@ -538,6 +568,20 @@ static void bch2_mark_pointer(struct bch_fs *c, +__disk_sectors(crc, new_sectors); } + /* + * fs level usage (which determines free space) is in uncompressed + * sectors, until copygc + compression is sorted out: + * + * note also that we always update @fs_usage, even when we otherwise + * wouldn't do anything because gc is running - this is because the + * caller still needs to account w.r.t. its disk reservation. It is + * caller's responsibility to not apply @fs_usage if gc is in progress. + */ + fs_usage->replicas + [!ptr->cached && replicas ? replicas - 1 : 0].data + [!ptr->cached ? data_type : BCH_DATA_CACHED] += + uncompressed_sectors; + if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) { if (journal_seq) bucket_cmpxchg(g, new, ({ @@ -551,7 +595,6 @@ static void bch2_mark_pointer(struct bch_fs *c, v = atomic64_read(&g->_mark.v); do { new.v.counter = old.v.counter = v; - saturated = 0; /* * Check this after reading bucket mark to guard against @@ -565,17 +608,10 @@ static void bch2_mark_pointer(struct bch_fs *c, return; } - if (!ptr->cached && - new.dirty_sectors == GC_MAX_SECTORS_USED && - sectors < 0) - saturated = -sectors; - - if (ptr->cached) - saturated_add(ca, new.cached_sectors, sectors, - GC_MAX_SECTORS_USED); + if (!ptr->cached) + checked_add(new.dirty_sectors, sectors); else - saturated_add(ca, new.dirty_sectors, sectors, - GC_MAX_SECTORS_USED); + checked_add(new.cached_sectors, sectors); if (!new.dirty_sectors && !new.cached_sectors) { @@ -597,28 +633,22 @@ static void bch2_mark_pointer(struct bch_fs *c, old.v.counter, new.v.counter)) != old.v.counter); - bch2_dev_usage_update(c, ca, old, new); + bch2_dev_usage_update(c, ca, fs_usage, old, new); BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && bucket_became_unavailable(c, old, new)); - - if (saturated && - atomic_long_add_return(saturated, - &ca->saturated_count) >= - bucket_to_sector(ca, ca->free_inc.size)) { - if (c->gc_thread) { - trace_gc_sectors_saturated(c); - wake_up_process(c->gc_thread); - } - } } void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, bool metadata, + s64 sectors, enum bch_data_type data_type, struct gc_pos pos, struct bch_fs_usage *stats, u64 journal_seq, unsigned flags) { + unsigned replicas = bch2_extent_nr_dirty_ptrs(k); + + BUG_ON(replicas && replicas - 1 > ARRAY_SIZE(stats->replicas)); + /* * synchronization w.r.t. GC: * @@ -661,34 +691,20 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const struct bch_extent_ptr *ptr; struct bch_extent_crc_unpacked crc; - enum s_alloc type = metadata ? 
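/*
 * A hedged illustration (helper name hypothetical) of where the per-pointer
 * accounting above lands in struct bch_fs_usage: a 2x-replicated user extent
 * of 128 uncompressed sectors, plus one cached copy.
 */
static void mark_pointer_accounting_sketch(struct bch_fs_usage *u)
{
	/* each of the two dirty pointers is counted in the 2-replica slot: */
	u->replicas[1].data[BCH_DATA_USER] += 128;
	u->replicas[1].data[BCH_DATA_USER] += 128;

	/* a cached pointer is always accounted as single-replica cached data: */
	u->replicas[0].data[BCH_DATA_CACHED] += 128;
}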
S_META : S_DIRTY; - unsigned replicas = 0; - BUG_ON(metadata && bkey_extent_is_cached(e.k)); BUG_ON(!sectors); - extent_for_each_ptr_crc(e, ptr, crc) { - bch2_mark_pointer(c, e, ptr, crc, sectors, type, - stats, journal_seq, flags); - replicas += !ptr->cached; - } - - if (replicas) { - BUG_ON(replicas - 1 > ARRAY_SIZE(stats->s)); - stats->s[replicas - 1].data[type] += sectors; - } + extent_for_each_ptr_crc(e, ptr, crc) + bch2_mark_pointer(c, e, ptr, crc, sectors, data_type, + replicas, stats, journal_seq, flags); break; } - case BCH_RESERVATION: { - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - - if (r.v->nr_replicas) { - BUG_ON(r.v->nr_replicas - 1 > ARRAY_SIZE(stats->s)); - stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors; - } + case BCH_RESERVATION: + if (replicas) + stats->replicas[replicas - 1].persistent_reserved += + sectors * replicas; break; } - } percpu_up_read_preempt_enable(&c->usage_lock); } @@ -701,7 +717,7 @@ static u64 __recalc_sectors_available(struct bch_fs *c) for_each_possible_cpu(cpu) per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0; - return bch2_fs_sectors_free(c, bch2_fs_usage_read(c)); + return avail_factor(bch2_fs_sectors_free(c, bch2_fs_usage_read(c))); } /* Used by gc when it's starting: */ @@ -833,9 +849,10 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ca->mi.bucket_size / c->opts.btree_node_size); /* XXX: these should be tunable */ - size_t reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9); - size_t copygc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7); - size_t free_inc_reserve = copygc_reserve / 2; + size_t reserve_none = max_t(size_t, 4, nbuckets >> 9); + size_t copygc_reserve = max_t(size_t, 16, nbuckets >> 7); + size_t free_inc_nr = max(max_t(size_t, 16, nbuckets >> 12), + btree_reserve); bool resize = ca->buckets != NULL, start_copygc = ca->copygc_thread != NULL; int ret = -ENOMEM; @@ -858,8 +875,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) !init_fifo(&free[RESERVE_MOVINGGC], copygc_reserve, GFP_KERNEL) || !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || - !init_fifo(&free_inc, free_inc_reserve, GFP_KERNEL) || - !init_heap(&alloc_heap, free_inc_reserve, GFP_KERNEL) || + !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || + !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) || !init_heap(©gc_heap, copygc_reserve, GFP_KERNEL)) goto err; diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 4deb6c37..ff86d23e 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -114,11 +114,6 @@ static inline u8 ptr_stale(struct bch_dev *ca, /* bucket gc marks */ -/* The dirty and cached sector counts saturate. If this occurs, - * reference counting alone will not free the bucket, and a btree - * GC must be performed. 
*/ -#define GC_MAX_SECTORS_USED ((1U << 15) - 1) - static inline unsigned bucket_sectors_used(struct bucket_mark mark) { return mark.dirty_sectors + mark.cached_sectors; @@ -172,26 +167,12 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) /* Filesystem usage: */ -static inline enum bch_data_type s_alloc_to_data_type(enum s_alloc s) -{ - switch (s) { - case S_META: - return BCH_DATA_BTREE; - case S_DIRTY: - return BCH_DATA_USER; - default: - BUG(); - } -} - struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *); struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *); void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, struct disk_reservation *, struct gc_pos); -u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); -u64 bch2_fs_sectors_free(struct bch_fs *, struct bch_fs_usage); static inline bool is_available_bucket(struct bucket_mark mark) { @@ -209,7 +190,7 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m, void bch2_bucket_seq_cleanup(struct bch_fs *); -bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, +void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, size_t, struct bucket_mark *); void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool, struct gc_pos, unsigned); @@ -222,8 +203,8 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, #define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2) #define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3) -void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, struct gc_pos, - struct bch_fs_usage *, u64, unsigned); +void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, enum bch_data_type, + struct gc_pos, struct bch_fs_usage *, u64, unsigned); void bch2_recalc_sectors_available(struct bch_fs *); diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 10f00861..6f7d3a23 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -1,8 +1,11 @@ #ifndef _BUCKETS_TYPES_H #define _BUCKETS_TYPES_H +#include "bcachefs_format.h" #include "util.h" +#define BUCKET_JOURNAL_SEQ_BITS 16 + struct bucket_mark { union { struct { @@ -56,23 +59,17 @@ struct bch_dev_usage { u64 sectors_fragmented; }; -/* kill, switch to bch_data_type? 
*/ -enum s_alloc { - S_META, - S_DIRTY, - S_ALLOC_NR, -}; - struct bch_fs_usage { /* all fields are in units of 512 byte sectors: */ - /* _uncompressed_ sectors: */ u64 online_reserved; u64 available_cache; struct { - u64 data[S_ALLOC_NR]; + u64 data[BCH_DATA_NR]; u64 persistent_reserved; - } s[BCH_REPLICAS_MAX]; + } replicas[BCH_REPLICAS_MAX]; + + u64 buckets[BCH_DATA_NR]; }; /* diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 5593b9a1..c18079f9 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -403,11 +403,10 @@ static long bch2_ioctl_usage(struct bch_fs *c, for (i = 0; i < BCH_REPLICAS_MAX; i++) { dst.persistent_reserved[i] = - src.s[i].persistent_reserved; + src.replicas[i].persistent_reserved; - for (j = 0; j < S_ALLOC_NR; j++) - dst.sectors[s_alloc_to_data_type(j)][i] = - src.s[i].data[j]; + for (j = 0; j < BCH_DATA_NR; j++) + dst.sectors[j][i] = src.replicas[i].data[j]; } ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst)); diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index d979ae0e..5f3e16b1 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -121,24 +121,26 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) } } -void bch2_dirent_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +int bch2_dirent_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { + char *out = buf, *end = buf + size; struct bkey_s_c_dirent d; - size_t n = 0; switch (k.k->type) { case BCH_DIRENT: d = bkey_s_c_to_dirent(k); - n += bch_scnmemcpy(buf + n, size - n, d.v->d_name, - bch2_dirent_name_bytes(d)); - n += scnprintf(buf + n, size - n, " -> %llu", d.v->d_inum); + out += bch_scnmemcpy(out, end - out, d.v->d_name, + bch2_dirent_name_bytes(d)); + out += scnprintf(out, end - out, " -> %llu", d.v->d_inum); break; case BCH_DIRENT_WHITEOUT: - scnprintf(buf, size, "whiteout"); + out += scnprintf(out, end - out, "whiteout"); break; } + + return out - buf; } static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, @@ -289,7 +291,9 @@ int bch2_dirent_rename(struct btree_trans *trans, * new_dst at the src position: */ new_dst->k.p = src_iter->pos; - bch2_trans_update(trans, src_iter, &new_dst->k_i, 0); + bch2_trans_update(trans, + BTREE_INSERT_ENTRY(src_iter, + &new_dst->k_i)); return 0; } else { /* If we're overwriting, we can't insert new_dst @@ -312,8 +316,8 @@ int bch2_dirent_rename(struct btree_trans *trans, } } - bch2_trans_update(trans, src_iter, &new_src->k_i, 0); - bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(src_iter, &new_src->k_i)); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(dst_iter, &new_dst->k_i)); return 0; } diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h index 4d92ffba..9fe32b9b 100644 --- a/libbcachefs/dirent.h +++ b/libbcachefs/dirent.h @@ -6,7 +6,7 @@ extern const struct bch_hash_desc bch2_dirent_hash_desc; const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_dirent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +int bch2_dirent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); #define bch2_bkey_dirent_ops (struct bkey_ops) { \ .key_invalid = bch2_dirent_invalid, \ diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index fe4bb527..a4d7e52b 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -733,8 +733,8 @@ err: mark.gen, (unsigned) mark.v.counter); } -void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, - size_t size, 
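/*
 * The *_to_text() helpers in this patch now return the number of bytes
 * written, all following the same clamped-append pattern.  A minimal sketch
 * of that convention (function name hypothetical):
 */
static int example_to_text(char *buf, size_t size, u64 inum, u64 offset)
{
	char *out = buf, *end = buf + size;

	out += scnprintf(out, end - out, "%llu:%llu", inum, offset);
	return out - buf;	/* bytes written, not counting the terminating NUL */
}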
struct bkey_s_c k) +int bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { char *out = buf, *end = buf + size; const char *invalid; @@ -748,6 +748,7 @@ void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, if (invalid) p(" invalid: %s", invalid); #undef p + return out - buf; } int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b, @@ -857,30 +858,34 @@ void bch2_key_resize(struct bkey *k, * that we have to unpack the key, modify the unpacked key - then this * copies/repacks the unpacked to the original as necessary. */ -static bool __extent_save(struct btree *b, struct btree_node_iter *iter, - struct bkey_packed *dst, struct bkey *src) +static void extent_save(struct btree *b, struct bkey_packed *dst, + struct bkey *src) { struct bkey_format *f = &b->format; struct bkey_i *dst_unpacked; - bool ret; - if ((dst_unpacked = packed_to_bkey(dst))) { + if ((dst_unpacked = packed_to_bkey(dst))) dst_unpacked->k = *src; - ret = true; - } else { - ret = bch2_bkey_pack_key(dst, src, f); - } - - if (ret && iter) - bch2_verify_key_order(b, iter, dst); - - return ret; + else + BUG_ON(!bch2_bkey_pack_key(dst, src, f)); } -static void extent_save(struct btree *b, struct btree_node_iter *iter, - struct bkey_packed *dst, struct bkey *src) +static bool extent_i_save(struct btree *b, struct bkey_packed *dst, + struct bkey_i *src) { - BUG_ON(!__extent_save(b, iter, dst, src)); + struct bkey_format *f = &b->format; + struct bkey_i *dst_unpacked; + struct bkey_packed tmp; + + if ((dst_unpacked = packed_to_bkey(dst))) + dst_unpacked->k = src->k; + else if (bch2_bkey_pack_key(&tmp, &src->k, f)) + memcpy_u64s(dst, &tmp, f->key_u64s); + else + return false; + + memcpy_u64s(bkeyp_val(f, dst), &src->v, bkey_val_u64s(&src->k)); + return true; } /* @@ -1009,7 +1014,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, sort_key_next(iter, b, _r); } else { __bch2_cut_front(l.k->p, r); - extent_save(b, NULL, rk, r.k); + extent_save(b, rk, r.k); } extent_sort_sift(iter, b, _r - iter->data); @@ -1023,7 +1028,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k); __bch2_cut_front(r.k->p, l); - extent_save(b, NULL, lk, l.k); + extent_save(b, lk, l.k); extent_sort_sift(iter, b, 0); @@ -1031,7 +1036,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, bkey_to_packed(&tmp.k)); } else { bch2_cut_back(bkey_start_pos(r.k), l.k); - extent_save(b, NULL, lk, l.k); + extent_save(b, lk, l.k); } } @@ -1055,7 +1060,8 @@ struct extent_insert_state { /* for deleting: */ struct bkey_i whiteout; - bool do_journal; + bool update_journal; + bool update_btree; bool deleting; }; @@ -1070,7 +1076,7 @@ static void bch2_add_sectors(struct extent_insert_state *s, if (!sectors) return; - bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b), + bch2_mark_key(c, k, sectors, BCH_DATA_USER, gc_pos_btree_node(b), &s->stats, s->trans->journal_res.seq, 0); } @@ -1112,197 +1118,197 @@ static bool bch2_extent_merge_inline(struct bch_fs *, struct bkey_packed *, bool); -#define MAX_LOCK_HOLD_TIME (5 * NSEC_PER_MSEC) - -static enum btree_insert_ret -extent_insert_should_stop(struct extent_insert_state *s) +static void verify_extent_nonoverlapping(struct btree *b, + struct btree_node_iter *_iter, + struct bkey_i *insert) { - struct btree *b = s->insert->iter->l[0].b; +#ifdef CONFIG_BCACHEFS_DEBUG + struct btree_node_iter iter; + struct bkey_packed *k; + struct bkey uk; - /* - * Check if we have 
sufficient space in both the btree node and the - * journal reservation: - * - * Each insert checks for room in the journal entry, but we check for - * room in the btree node up-front. In the worst case, bkey_cmpxchg() - * will insert two keys, and one iteration of this room will insert one - * key, so we need room for three keys. - */ - if (!bch2_btree_node_insert_fits(s->trans->c, b, s->insert->k->k.u64s)) - return BTREE_INSERT_BTREE_NODE_FULL; - else if (!journal_res_insert_fits(s->trans, s->insert)) - return BTREE_INSERT_JOURNAL_RES_FULL; /* XXX worth tracing */ - else - return BTREE_INSERT_OK; + iter = *_iter; + k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_DISCARD); + BUG_ON(k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); + + iter = *_iter; + k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_DISCARD); +#if 0 + BUG_ON(k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); +#else + if (k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { + char buf1[100]; + char buf2[100]; + + bch2_bkey_to_text(buf1, sizeof(buf1), &insert->k); + bch2_bkey_to_text(buf2, sizeof(buf2), &uk); + + bch2_dump_btree_node(b); + panic("insert > next :\n" + "insert %s\n" + "next %s\n", + buf1, buf2); + } +#endif + +#endif +} + +static void verify_modified_extent(struct btree_iter *iter, + struct bkey_packed *k) +{ + bch2_btree_iter_verify(iter, iter->l[0].b); + bch2_verify_insert_pos(iter->l[0].b, k, k, k->u64s); } static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, struct bkey_i *insert) { struct btree_iter_level *l = &iter->l[0]; - struct bset_tree *t = bset_tree_last(l->b); - struct bkey_packed *where = - bch2_btree_node_iter_bset_pos(&l->iter, l->b, t); - struct bkey_packed *prev = bch2_bkey_prev_filter(l->b, t, where, - KEY_TYPE_DISCARD); - struct bkey_packed *next_live_key = where; - unsigned clobber_u64s; + struct btree_node_iter node_iter; + struct bkey_packed *k; + + BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); + verify_extent_nonoverlapping(l->b, &l->iter, insert); - if (prev) - where = bkey_next(prev); + node_iter = l->iter; + k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_DISCARD); + if (k && !bkey_written(l->b, k) && + bch2_extent_merge_inline(c, iter, k, bkey_to_packed(insert), true)) + return; - while (next_live_key != btree_bkey_last(l->b, t) && - bkey_deleted(next_live_key)) - next_live_key = bkey_next(next_live_key); + node_iter = l->iter; + k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, KEY_TYPE_DISCARD); + if (k && !bkey_written(l->b, k) && + bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false)) + return; - /* - * Everything between where and next_live_key is now deleted keys, and - * is overwritten: - */ - clobber_u64s = (u64 *) next_live_key - (u64 *) where; + k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); - if (prev && - bch2_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true)) - goto drop_deleted_keys; - - if (next_live_key != btree_bkey_last(l->b, t) && - bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), - next_live_key, false)) - goto drop_deleted_keys; - - bch2_bset_insert(l->b, &l->iter, where, insert, clobber_u64s); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, where, - clobber_u64s, where->u64s); - return; -drop_deleted_keys: - bch2_bset_delete(l->b, where, clobber_u64s); - 
bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, - where, clobber_u64s, 0); + bch2_bset_insert(l->b, &l->iter, k, insert, 0); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); + bch2_btree_iter_verify(iter, l->b); } static void extent_insert_committed(struct extent_insert_state *s) { struct bch_fs *c = s->trans->c; struct btree_iter *iter = s->insert->iter; - struct bkey_i *insert = !s->deleting - ? s->insert->k - : &s->whiteout; + struct bkey_i *insert = s->insert->k; BKEY_PADDED(k) split; - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0); EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0); - if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k))) + bkey_copy(&split.k, insert); + if (s->deleting) + split.k.k.type = KEY_TYPE_DISCARD; + + if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) + bch2_cut_subtract_back(s, s->committed, + bkey_i_to_s(&split.k)); + else + bch2_cut_back(s->committed, &split.k.k); + + if (!bkey_cmp(s->committed, iter->pos)) return; - if (s->deleting && !s->do_journal) { - bch2_cut_front(s->committed, insert); - goto done; - } - - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - - bkey_copy(&split.k, insert); - - if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) && - bkey_cmp(s->committed, insert->k.p) && - bch2_extent_is_compressed(bkey_i_to_s_c(insert))) { - /* XXX: possibly need to increase our reservation? */ - bch2_cut_subtract_back(s, s->committed, - bkey_i_to_s(&split.k)); - bch2_cut_front(s->committed, insert); - bch2_add_sectors(s, bkey_i_to_s_c(insert), - bkey_start_offset(&insert->k), - insert->k.size); - } else { - bch2_cut_back(s->committed, &split.k.k); - bch2_cut_front(s->committed, insert); - } - - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, iter->l[0].b, bkey_i_to_s_c(&split.k)); - - bch2_btree_journal_key(s->trans, iter, &split.k); - - if (!s->deleting) - extent_bset_insert(c, iter, &split.k); -done: bch2_btree_iter_set_pos_same_leaf(iter, s->committed); + if (s->update_btree) { + if (debug_check_bkeys(c)) + bch2_bkey_debugcheck(c, iter->l[0].b, + bkey_i_to_s_c(&split.k)); + + EBUG_ON(bkey_deleted(&split.k.k) || !split.k.k.size); + + extent_bset_insert(c, iter, &split.k); + } + + if (s->update_journal) { + bkey_copy(&split.k, !s->deleting ? 
insert : &s->whiteout); + if (s->deleting) + split.k.k.type = KEY_TYPE_DISCARD; + + bch2_cut_back(s->committed, &split.k.k); + + EBUG_ON(bkey_deleted(&split.k.k) || !split.k.k.size); + + bch2_btree_journal_key(s->trans, iter, &split.k); + } + + bch2_cut_front(s->committed, insert); + insert->k.needs_whiteout = false; - s->do_journal = false; s->trans->did_work = true; } -static enum btree_insert_ret -__extent_insert_advance_pos(struct extent_insert_state *s, - struct bpos next_pos, - struct bkey_s_c k) +void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) { - struct extent_insert_hook *hook = s->trans->hook; - enum btree_insert_ret ret; + struct btree *b = iter->l[0].b; - if (hook) - ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k); - else - ret = BTREE_INSERT_OK; + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); - if (ret == BTREE_INSERT_OK) - s->committed = next_pos; + bch2_cut_back(b->key.k.p, &k->k); - return ret; + BUG_ON(bkey_cmp(bkey_start_pos(&k->k), b->data->min_key) < 0); } -/* - * Update iter->pos, marking how much of @insert we've processed, and call hook - * fn: - */ -static enum btree_insert_ret -extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k) +enum btree_insert_ret +bch2_extent_can_insert(struct btree_insert *trans, + struct btree_insert_entry *insert, + unsigned *u64s) { - struct btree *b = s->insert->iter->l[0].b; - struct bpos next_pos = bpos_min(s->insert->k->k.p, - k.k ? k.k->p : b->key.k.p); - enum btree_insert_ret ret; + struct btree_iter_level *l = &insert->iter->l[0]; + struct btree_node_iter node_iter = l->iter; + enum bch_extent_overlap overlap; + struct bkey_packed *_k; + struct bkey unpacked; + struct bkey_s_c k; + int sectors; - if (race_fault()) - return BTREE_INSERT_NEED_TRAVERSE; + BUG_ON(trans->flags & BTREE_INSERT_ATOMIC && + !bch2_extent_is_atomic(&insert->k->k, insert->iter)); - /* hole? 
*/ - if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) { - ret = __extent_insert_advance_pos(s, bkey_start_pos(k.k), - bkey_s_c_null); - if (ret != BTREE_INSERT_OK) - return ret; - } + /* + * We avoid creating whiteouts whenever possible when deleting, but + * those optimizations mean we may potentially insert two whiteouts + * instead of one (when we overlap with the front of one extent and the + * back of another): + */ + if (bkey_whiteout(&insert->k->k)) + *u64s += BKEY_U64s; - /* avoid redundant calls to hook fn: */ - if (!bkey_cmp(s->committed, next_pos)) + _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, + KEY_TYPE_DISCARD); + if (!_k) return BTREE_INSERT_OK; - return __extent_insert_advance_pos(s, next_pos, k); -} + k = bkey_disassemble(l->b, _k, &unpacked); -static enum btree_insert_ret -extent_insert_check_split_compressed(struct extent_insert_state *s, - struct bkey_s_c k, - enum bch_extent_overlap overlap) -{ - struct bch_fs *c = s->trans->c; - unsigned sectors; + overlap = bch2_extent_overlap(&insert->k->k, k.k); + + /* account for having to split existing extent: */ + if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) + *u64s += _k->u64s; if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && (sectors = bch2_extent_is_compressed(k))) { int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD; - if (s->trans->flags & BTREE_INSERT_NOFAIL) + if (trans->flags & BTREE_INSERT_NOFAIL) flags |= BCH_DISK_RESERVATION_NOFAIL; - switch (bch2_disk_reservation_add(c, - s->trans->disk_res, + switch (bch2_disk_reservation_add(trans->c, + trans->disk_res, sectors * bch2_extent_nr_dirty_ptrs(k), flags)) { case 0: @@ -1319,78 +1325,60 @@ extent_insert_check_split_compressed(struct extent_insert_state *s, return BTREE_INSERT_OK; } -static enum btree_insert_ret +static void extent_squash(struct extent_insert_state *s, struct bkey_i *insert, - struct bset_tree *t, struct bkey_packed *_k, struct bkey_s k, + struct bkey_packed *_k, struct bkey_s k, enum bch_extent_overlap overlap) { struct bch_fs *c = s->trans->c; struct btree_iter *iter = s->insert->iter; struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; - enum btree_insert_ret ret; switch (overlap) { case BCH_EXTENT_OVERLAP_FRONT: /* insert overlaps with start of k: */ bch2_cut_subtract_front(s, insert->k.p, k); BUG_ON(bkey_deleted(k.k)); - extent_save(b, node_iter, _k, k.k); + extent_save(l->b, _k, k.k); + verify_modified_extent(iter, _k); break; case BCH_EXTENT_OVERLAP_BACK: /* insert overlaps with end of k: */ bch2_cut_subtract_back(s, bkey_start_pos(&insert->k), k); BUG_ON(bkey_deleted(k.k)); - extent_save(b, node_iter, _k, k.k); + extent_save(l->b, _k, k.k); /* * As the auxiliary tree is indexed by the end of the * key and we've just changed the end, update the * auxiliary tree. 
*/ - bch2_bset_fix_invalidated_key(b, t, _k); - bch2_btree_node_iter_fix(iter, b, node_iter, t, - _k, _k->u64s, _k->u64s); + bch2_bset_fix_invalidated_key(l->b, _k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, _k->u64s, _k->u64s); + verify_modified_extent(iter, _k); break; case BCH_EXTENT_OVERLAP_ALL: { - struct bpos orig_pos = k.k->p; - /* The insert key completely covers k, invalidate k */ if (!bkey_whiteout(k.k)) - btree_keys_account_key_drop(&b->nr, - t - b->set, _k); + btree_account_key_drop(l->b, _k); bch2_drop_subtract(s, k); - k.k->p = bkey_start_pos(&insert->k); - if (!__extent_save(b, node_iter, _k, k.k)) { - /* - * Couldn't repack: we aren't necessarily able - * to repack if the new key is outside the range - * of the old extent, so we have to split - * @insert: - */ - k.k->p = orig_pos; - extent_save(b, node_iter, _k, k.k); - ret = extent_insert_advance_pos(s, k.s_c); - if (ret != BTREE_INSERT_OK) - return ret; + if (_k >= btree_bset_last(l->b)->start) { + unsigned u64s = _k->u64s; - extent_insert_committed(s); - /* - * We split and inserted upto at k.k->p - that - * has to coincide with iter->pos, so that we - * don't have anything more we have to insert - * until we recheck our journal reservation: - */ - EBUG_ON(bkey_cmp(s->committed, k.k->p)); + bch2_bset_delete(l->b, _k, _k->u64s); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, u64s, 0); + bch2_btree_iter_verify(iter, l->b); } else { - bch2_bset_fix_invalidated_key(b, t, _k); - bch2_btree_node_iter_fix(iter, b, node_iter, t, - _k, _k->u64s, _k->u64s); + extent_save(l->b, _k, k.k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, _k->u64s, _k->u64s); + verify_modified_extent(iter, _k); } break; @@ -1412,14 +1400,15 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, * what k points to) */ bkey_reassemble(&split.k, k.s_c); - split.k.k.needs_whiteout |= bset_written(b, bset(b, t)); + split.k.k.needs_whiteout |= bkey_written(l->b, _k); bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k); BUG_ON(bkey_deleted(&split.k.k)); bch2_cut_subtract_front(s, insert->k.p, k); BUG_ON(bkey_deleted(k.k)); - extent_save(b, node_iter, _k, k.k); + extent_save(l->b, _k, k.k); + verify_modified_extent(iter, _k); bch2_add_sectors(s, bkey_i_to_s_c(&split.k), bkey_start_offset(&split.k.k), @@ -1428,158 +1417,96 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, break; } } - - return BTREE_INSERT_OK; } -static enum btree_insert_ret -__bch2_delete_fixup_extent(struct extent_insert_state *s) +static void __bch2_insert_fixup_extent(struct extent_insert_state *s) { - struct bch_fs *c = s->trans->c; struct btree_iter *iter = s->insert->iter; struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; struct bkey_packed *_k; struct bkey unpacked; struct bkey_i *insert = s->insert->k; - enum btree_insert_ret ret = BTREE_INSERT_OK; - - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); - - s->whiteout = *insert; - s->whiteout.k.type = KEY_TYPE_DISCARD; while (bkey_cmp(s->committed, insert->k.p) < 0 && - (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK && - (_k = bch2_btree_node_iter_peek_all(node_iter, b))) { - struct bset_tree *t = bch2_bkey_to_bset(b, _k); - struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); - enum bch_extent_overlap overlap; + (_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, + KEY_TYPE_DISCARD))) { + struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); + enum bch_extent_overlap overlap = 
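/*
 * For reference, the four bch2_extent_overlap() cases handled here, with the
 * key being inserted shown against an existing extent k (rough sketch):
 *
 *	FRONT:	insert |------|            k's front is cut off
 *		k          |---------|
 *
 *	BACK:	insert        |------|     k's back is cut off
 *		k      |---------|
 *
 *	ALL:	insert |-------------|     k is dropped entirely
 *		k          |----|
 *
 *	MIDDLE:	insert     |--|            k is split in two
 *		k      |---------|
 */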
bch2_extent_overlap(&insert->k, k.k); - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) break; - if (bkey_whiteout(k.k)) { - s->committed = bpos_min(insert->k.p, k.k->p); + s->committed = bpos_min(s->insert->k->k.p, k.k->p); + + if (!bkey_whiteout(k.k)) + s->update_journal = true; + + if (!s->update_journal) { + bch2_cut_front(s->committed, insert); + bch2_cut_front(s->committed, &s->whiteout); + bch2_btree_iter_set_pos_same_leaf(iter, s->committed); goto next; } - overlap = bch2_extent_overlap(&insert->k, k.k); - - ret = extent_insert_check_split_compressed(s, k.s_c, overlap); - if (ret) - break; - - ret = extent_insert_advance_pos(s, k.s_c); - if (ret) - break; - - s->do_journal = true; - - if (overlap == BCH_EXTENT_OVERLAP_ALL) { - btree_keys_account_key_drop(&b->nr, - t - b->set, _k); - bch2_subtract_sectors(s, k.s_c, - bkey_start_offset(k.k), k.k->size); - _k->type = KEY_TYPE_DISCARD; - reserve_whiteout(b, t, _k); - } else if (k.k->needs_whiteout || - bset_written(b, bset(b, t))) { - struct bkey_i discard = *insert; - - discard.k.type = KEY_TYPE_DISCARD; - - switch (overlap) { - case BCH_EXTENT_OVERLAP_FRONT: - bch2_cut_front(bkey_start_pos(k.k), &discard); - break; - case BCH_EXTENT_OVERLAP_BACK: - bch2_cut_back(k.k->p, &discard.k); - break; - default: - break; - } - - discard.k.needs_whiteout = true; - - ret = extent_squash(s, insert, t, _k, k, overlap); - BUG_ON(ret != BTREE_INSERT_OK); - - extent_bset_insert(c, iter, &discard); - } else { - ret = extent_squash(s, insert, t, _k, k, overlap); - BUG_ON(ret != BTREE_INSERT_OK); - } -next: - bch2_cut_front(s->committed, insert); - bch2_btree_iter_set_pos_same_leaf(iter, s->committed); - } - - return ret; -} - -static enum btree_insert_ret -__bch2_insert_fixup_extent(struct extent_insert_state *s) -{ - struct btree_iter *iter = s->insert->iter; - struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; - struct bkey_packed *_k; - struct bkey unpacked; - struct bkey_i *insert = s->insert->k; - enum btree_insert_ret ret = BTREE_INSERT_OK; - - while (bkey_cmp(s->committed, insert->k.p) < 0 && - (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK && - (_k = bch2_btree_node_iter_peek_all(node_iter, b))) { - struct bset_tree *t = bch2_bkey_to_bset(b, _k); - struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); - enum bch_extent_overlap overlap; - - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); - EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); - - if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) - break; - - overlap = bch2_extent_overlap(&insert->k, k.k); - - ret = extent_insert_check_split_compressed(s, k.s_c, overlap); - if (ret) - break; - - if (!k.k->size) - goto squash; - /* - * Only call advance pos & call hook for nonzero size extents: + * When deleting, if possible just do it by switching the type + * of the key we're deleting, instead of creating and inserting + * a new whiteout: */ - ret = extent_insert_advance_pos(s, k.s_c); - if (ret) + if (s->deleting && + !s->update_btree && + !bkey_cmp(insert->k.p, k.k->p) && + !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { + if (!bkey_whiteout(k.k)) { + btree_account_key_drop(l->b, _k); + bch2_subtract_sectors(s, k.s_c, + bkey_start_offset(k.k), k.k->size); + _k->type = KEY_TYPE_DISCARD; + reserve_whiteout(l->b, _k); + } break; + } - if (k.k->size && - (k.k->needs_whiteout || bset_written(b, bset(b, 
t)))) + if (k.k->needs_whiteout || bkey_written(l->b, _k)) { insert->k.needs_whiteout = true; + s->update_btree = true; + } - if (overlap == BCH_EXTENT_OVERLAP_ALL && + if (s->update_btree && + overlap == BCH_EXTENT_OVERLAP_ALL && bkey_whiteout(k.k) && k.k->needs_whiteout) { - unreserve_whiteout(b, t, _k); + unreserve_whiteout(l->b, _k); _k->needs_whiteout = false; } -squash: - ret = extent_squash(s, insert, t, _k, k, overlap); - if (ret != BTREE_INSERT_OK) + + extent_squash(s, insert, _k, k, overlap); + + if (!s->update_btree) + bch2_cut_front(s->committed, insert); +next: + if (overlap == BCH_EXTENT_OVERLAP_FRONT || + overlap == BCH_EXTENT_OVERLAP_MIDDLE) break; } - return ret; + if (bkey_cmp(s->committed, insert->k.p) < 0) + s->committed = bpos_min(s->insert->k->k.p, l->b->key.k.p); + + /* + * may have skipped past some deleted extents greater than the insert + * key, before we got to a non deleted extent and knew we could bail out + * rewind the iterator a bit if necessary: + */ + { + struct btree_node_iter node_iter = l->iter; + + while ((_k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) && + bkey_cmp_left_packed(l->b, _k, &s->committed) > 0) + l->iter = node_iter; + } } /** @@ -1625,16 +1552,17 @@ enum btree_insert_ret bch2_insert_fixup_extent(struct btree_insert *trans, struct btree_insert_entry *insert) { - struct bch_fs *c = trans->c; - struct btree_iter *iter = insert->iter; - struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - enum btree_insert_ret ret = BTREE_INSERT_OK; - + struct bch_fs *c = trans->c; + struct btree_iter *iter = insert->iter; + struct btree *b = iter->l[0].b; struct extent_insert_state s = { .trans = trans, .insert = insert, - .committed = insert->iter->pos, + .committed = iter->pos, + + .whiteout = *insert->k, + .update_journal = !bkey_whiteout(&insert->k->k), + .update_btree = !bkey_whiteout(&insert->k->k), .deleting = bkey_whiteout(&insert->k->k), }; @@ -1655,45 +1583,23 @@ bch2_insert_fixup_extent(struct btree_insert *trans, bkey_start_offset(&insert->k->k), insert->k->k.size); - ret = !s.deleting - ? 
__bch2_insert_fixup_extent(&s) - : __bch2_delete_fixup_extent(&s); - - if (ret == BTREE_INSERT_OK && - bkey_cmp(s.committed, insert->k->k.p) < 0) - ret = extent_insert_advance_pos(&s, bkey_s_c_null); + __bch2_insert_fixup_extent(&s); extent_insert_committed(&s); - if (s.deleting) - bch2_cut_front(iter->pos, insert->k); - - /* - * Subtract any remaining sectors from @insert, if we bailed out early - * and didn't fully insert @insert: - */ - if (!s.deleting && - !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) && - insert->k->k.size) - bch2_subtract_sectors(&s, bkey_i_to_s_c(insert->k), - bkey_start_offset(&insert->k->k), - insert->k->k.size); - bch2_fs_usage_apply(c, &s.stats, trans->disk_res, gc_pos_btree_node(b)); EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); EBUG_ON(bkey_cmp(iter->pos, s.committed)); - EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != - !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF)); - if (insert->k->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF)) - ret = BTREE_INSERT_NEED_TRAVERSE; + if (insert->k->k.size) { + /* got to the end of this leaf node */ + BUG_ON(bkey_cmp(iter->pos, b->key.k.p)); + return BTREE_INSERT_NEED_TRAVERSE; + } - WARN_ONCE((ret == BTREE_INSERT_OK) != (insert->k->k.size == 0), - "ret %u insert->k.size %u", ret, insert->k->k.size); - - return ret; + return BTREE_INSERT_OK; } const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) @@ -1877,8 +1783,8 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k } } -void bch2_extent_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +int bch2_extent_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { char *out = buf, *end = buf + size; const char *invalid; @@ -1892,6 +1798,7 @@ void bch2_extent_to_text(struct bch_fs *c, char *buf, if (invalid) p(" invalid: %s", invalid); #undef p + return out - buf; } static void bch2_extent_crc_init(union bch_extent_crc *crc, @@ -2162,130 +2069,6 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b, return BCH_MERGE_MERGE; } -static void extent_i_save(struct btree *b, struct bkey_packed *dst, - struct bkey_i *src) -{ - struct bkey_format *f = &b->format; - struct bkey_i *dst_unpacked; - - BUG_ON(bkeyp_val_u64s(f, dst) != bkey_val_u64s(&src->k)); - - /* - * We don't want the bch2_verify_key_order() call in extent_save(), - * because we may be out of order with deleted keys that are about to be - * removed by extent_bset_insert() - */ - - if ((dst_unpacked = packed_to_bkey(dst))) - bkey_copy(dst_unpacked, src); - else - BUG_ON(!bch2_bkey_pack(dst, src, f)); -} - -static bool extent_merge_one_overlapping(struct btree_iter *iter, - struct bpos new_pos, - struct bset_tree *t, - struct bkey_packed *k, struct bkey uk, - bool check, bool could_pack) -{ - struct btree_iter_level *l = &iter->l[0]; - - BUG_ON(!bkey_deleted(k)); - - if (check) { - return !bkey_packed(k) || could_pack; - } else { - uk.p = new_pos; - extent_save(l->b, &l->iter, k, &uk); - bch2_bset_fix_invalidated_key(l->b, t, k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, - k, k->u64s, k->u64s); - return true; - } -} - -static bool extent_merge_do_overlapping(struct btree_iter *iter, - struct bkey *m, bool back_merge) -{ - struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; - struct bset_tree *t; - struct bkey_packed *k; - struct bkey uk; - struct bpos new_pos = back_merge ? 
m->p : bkey_start_pos(m); - bool could_pack = bkey_pack_pos((void *) &uk, new_pos, b); - bool check = true; - - /* - * @m is the new merged extent: - * - * The merge took place in the last bset; we know there can't be any 0 - * size extents overlapping with m there because if so they would have - * been between the two extents we merged. - * - * But in the other bsets, we have to check for and fix such extents: - */ -do_fixup: - for_each_bset(b, t) { - if (t == bset_tree_last(b)) - break; - - /* - * if we don't find this bset in the iterator we already got to - * the end of that bset, so start searching from the end. - */ - k = bch2_btree_node_iter_bset_pos(node_iter, b, t); - - if (k == btree_bkey_last(b, t)) - k = bch2_bkey_prev_all(b, t, k); - if (!k) - continue; - - if (back_merge) { - /* - * Back merge: 0 size extents will be before the key - * that was just inserted (and thus the iterator - * position) - walk backwards to find them - */ - for (; - k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, bkey_start_pos(m)) > 0); - k = bch2_bkey_prev_all(b, t, k)) { - if (bkey_cmp(uk.p, m->p) >= 0) - continue; - - if (!extent_merge_one_overlapping(iter, new_pos, - t, k, uk, check, could_pack)) - return false; - } - } else { - /* Front merge - walk forwards */ - for (; - k != btree_bkey_last(b, t) && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, m->p) < 0); - k = bkey_next(k)) { - if (bkey_cmp(uk.p, - bkey_start_pos(m)) <= 0) - continue; - - if (!extent_merge_one_overlapping(iter, new_pos, - t, k, uk, check, could_pack)) - return false; - } - } - } - - if (check) { - check = false; - goto do_fixup; - } - - return true; -} - /* * When merging an extent that we're inserting into a btree node, the new merged * extent could overlap with an existing 0 size extent - if we don't fix that, @@ -2302,13 +2085,13 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, { struct btree *b = iter->l[0].b; struct btree_node_iter *node_iter = &iter->l[0].iter; - const struct bkey_format *f = &b->format; - struct bset_tree *t = bset_tree_last(b); - struct bkey_packed *m; - BKEY_PADDED(k) li; - BKEY_PADDED(k) ri; - struct bkey_i *mi; - struct bkey tmp; + BKEY_PADDED(k) li, ri; + struct bkey_packed *m = back_merge ? l : r; + struct bkey_i *mi = back_merge ? &li.k : &ri.k; + struct bset_tree *t = bch2_bkey_to_bset(b, m); + enum merge_result ret; + + EBUG_ON(bkey_written(b, m)); /* * We need to save copies of both l and r, because we might get a @@ -2317,57 +2100,49 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, bch2_bkey_unpack(b, &li.k, l); bch2_bkey_unpack(b, &ri.k, r); - m = back_merge ? l : r; - mi = back_merge ? 
&li.k : &ri.k; - - /* l & r should be in last bset: */ - EBUG_ON(bch2_bkey_to_bset(b, m) != t); - - switch (bch2_extent_merge(c, b, &li.k, &ri.k)) { - case BCH_MERGE_NOMERGE: + ret = bch2_extent_merge(c, b, &li.k, &ri.k); + if (ret == BCH_MERGE_NOMERGE) return false; - case BCH_MERGE_PARTIAL: - if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &mi->k, f)) + + /* + * check if we overlap with deleted extents - would break the sort + * order: + */ + if (back_merge) { + struct bkey_packed *n = bkey_next(m); + + if (n != btree_bkey_last(b, t) && + bkey_cmp_left_packed(b, n, &li.k.k.p) <= 0 && + bkey_deleted(n)) return false; + } else if (ret == BCH_MERGE_MERGE) { + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); - if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge)) + if (prev && + bkey_cmp_left_packed_byval(b, prev, + bkey_start_pos(&li.k.k)) > 0) return false; + } - extent_i_save(b, m, mi); - bch2_bset_fix_invalidated_key(b, t, m); - - /* - * Update iterator to reflect what we just inserted - otherwise, - * the iter_fix() call is going to put us _before_ the key we - * just partially merged with: - */ - if (back_merge) - bch2_btree_iter_set_pos_same_leaf(iter, li.k.k.p); - - bch2_btree_node_iter_fix(iter, b, node_iter, - t, m, m->u64s, m->u64s); + if (ret == BCH_MERGE_PARTIAL) { + if (!extent_i_save(b, m, mi)) + return false; if (!back_merge) bkey_copy(packed_to_bkey(l), &li.k); else bkey_copy(packed_to_bkey(r), &ri.k); - return false; - case BCH_MERGE_MERGE: - if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &li.k.k, f)) + } else { + if (!extent_i_save(b, m, &li.k)) return false; - - if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge)) - return false; - - extent_i_save(b, m, &li.k); - bch2_bset_fix_invalidated_key(b, t, m); - - bch2_btree_node_iter_fix(iter, b, node_iter, - t, m, m->u64s, m->u64s); - return true; - default: - BUG(); } + + bch2_bset_fix_invalidated_key(b, m); + bch2_btree_node_iter_fix(iter, b, node_iter, + m, m->u64s, m->u64s); + verify_modified_extent(iter, m); + + return ret == BCH_MERGE_MERGE; } int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size) diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 08ad9647..66a02f1c 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -11,14 +11,13 @@ struct btree_node_iter; struct btree_node_iter_large; struct btree_insert; struct btree_insert_entry; -struct extent_insert_hook; struct bch_devs_mask; union bch_extent_crc; const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); -void bch2_btree_ptr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +int bch2_btree_ptr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); #define bch2_bkey_btree_ops (struct bkey_ops) { \ @@ -30,7 +29,7 @@ void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); -void bch2_extent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +int bch2_extent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); bool bch2_ptr_normalize(struct bch_fs *, struct btree *, struct bkey_s); enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *, struct bkey_i *, struct bkey_i *); @@ -61,9 +60,22 @@ int 
bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c, struct bch_devs_mask *, struct extent_pick_ptr *); +void bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); + +static inline bool bch2_extent_is_atomic(struct bkey *k, + struct btree_iter *iter) +{ + struct btree *b = iter->l[0].b; + + return bkey_cmp(k->p, b->key.k.p) <= 0 && + bkey_cmp(bkey_start_pos(k), b->data->min_key) >= 0; +} + enum btree_insert_ret -bch2_insert_fixup_extent(struct btree_insert *, - struct btree_insert_entry *); +bch2_extent_can_insert(struct btree_insert *, struct btree_insert_entry *, + unsigned *); +enum btree_insert_ret +bch2_insert_fixup_extent(struct btree_insert *, struct btree_insert_entry *); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent, diff --git a/libbcachefs/fifo.h b/libbcachefs/fifo.h index 789ae663..085d828e 100644 --- a/libbcachefs/fifo.h +++ b/libbcachefs/fifo.h @@ -108,17 +108,17 @@ do { \ #define fifo_peek(fifo) fifo_peek_front(fifo) #define fifo_for_each_entry(_entry, _fifo, _iter) \ - for (((void) (&(_iter) == &(_fifo)->front)), \ - _iter = (_fifo)->front; \ + for (typecheck(typeof((_fifo)->front), _iter), \ + (_iter) = (_fifo)->front; \ ((_iter != (_fifo)->back) && \ (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ - _iter++) + (_iter)++) #define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ - for (((void) (&(_iter) == &(_fifo)->front)), \ - _iter = (_fifo)->front; \ + for (typecheck(typeof((_fifo)->front), _iter), \ + (_iter) = (_fifo)->front; \ ((_iter != (_fifo)->back) && \ (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ - _iter++) + (_iter)++) #endif /* _BCACHEFS_FIFO_H */ diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index e4d2b39e..d4384303 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -5,6 +5,7 @@ #include "buckets.h" #include "clock.h" #include "error.h" +#include "extents.h" #include "fs.h" #include "fs-io.h" #include "fsck.h" @@ -32,16 +33,6 @@ struct quota_res { u64 sectors; }; -struct i_sectors_hook { - struct extent_insert_hook hook; - struct bch_inode_info *inode; - struct quota_res quota_res; - s64 sectors; - u64 new_i_size; - unsigned flags; - unsigned appending:1; -}; - struct bchfs_write_op { struct bch_inode_info *inode; s64 sectors_added; @@ -177,28 +168,48 @@ static int bch2_quota_reservation_add(struct bch_fs *c, /* i_size updates: */ +struct inode_new_size { + loff_t new_size; + u64 now; + unsigned fields; +}; + static int inode_set_size(struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { - loff_t *new_i_size = p; + struct inode_new_size *s = p; - lockdep_assert_held(&inode->ei_update_lock); + bi->bi_size = s->new_size; + if (s->fields & ATTR_ATIME) + bi->bi_atime = s->now; + if (s->fields & ATTR_MTIME) + bi->bi_mtime = s->now; + if (s->fields & ATTR_CTIME) + bi->bi_ctime = s->now; - bi->bi_size = *new_i_size; return 0; } static int __must_check bch2_write_inode_size(struct bch_fs *c, struct bch_inode_info *inode, - loff_t new_size) + loff_t new_size, unsigned fields) { - return __bch2_write_inode(c, inode, inode_set_size, &new_size, 0); + struct inode_new_size s = { + .new_size = new_size, + .now = bch2_current_time(c), + .fields = fields, + }; + + return bch2_write_inode(c, inode, inode_set_size, &s, fields); } static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *quota_res, int sectors) + struct quota_res *quota_res, s64 sectors) { + if (!sectors) + return; 
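	/*
	 * the quota accounting below is done under ei_quota_lock;
	 * zero-sector updates bailed out above without taking it
	 */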
+ mutex_lock(&inode->ei_quota_lock); #ifdef CONFIG_BCACHEFS_QUOTA if (quota_res && sectors > 0) { @@ -215,297 +226,191 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, mutex_unlock(&inode->ei_quota_lock); } -/* i_sectors accounting: */ - -static enum btree_insert_ret -i_sectors_hook_fn(struct extent_insert_hook *hook, - struct bpos committed_pos, - struct bpos next_pos, - struct bkey_s_c k, - const struct bkey_i *insert) -{ - struct i_sectors_hook *h = container_of(hook, - struct i_sectors_hook, hook); - s64 sectors = next_pos.offset - committed_pos.offset; - int sign = bkey_extent_is_allocation(&insert->k) - - (k.k && bkey_extent_is_allocation(k.k)); - - EBUG_ON(!(h->inode->ei_inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY)); - - h->sectors += sectors * sign; - - return BTREE_INSERT_OK; -} - -static int i_sectors_dirty_finish_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct i_sectors_hook *h = p; - - if (h->new_i_size != U64_MAX && - (!h->appending || - h->new_i_size > bi->bi_size)) - bi->bi_size = h->new_i_size; - bi->bi_sectors += h->sectors; - bi->bi_flags &= ~h->flags; - return 0; -} - -static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h) -{ - int ret; - - mutex_lock(&h->inode->ei_update_lock); - i_sectors_acct(c, h->inode, &h->quota_res, h->sectors); - - ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h, 0); - - if (!ret && h->new_i_size != U64_MAX) - i_size_write(&h->inode->v, h->new_i_size); - mutex_unlock(&h->inode->ei_update_lock); - - bch2_quota_reservation_put(c, h->inode, &h->quota_res); - - h->sectors = 0; - - return ret; -} - -static int i_sectors_dirty_start_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, void *p) -{ - struct i_sectors_hook *h = p; - - if (h->flags & BCH_INODE_I_SIZE_DIRTY) - bi->bi_size = h->new_i_size; - - bi->bi_flags |= h->flags; - return 0; -} - -static int i_sectors_dirty_start(struct bch_fs *c, struct i_sectors_hook *h) -{ - int ret; - - mutex_lock(&h->inode->ei_update_lock); - ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h, 0); - mutex_unlock(&h->inode->ei_update_lock); - - return ret; -} - -static inline struct i_sectors_hook -i_sectors_hook_init(struct bch_inode_info *inode, unsigned flags) -{ - return (struct i_sectors_hook) { - .hook.fn = i_sectors_hook_fn, - .inode = inode, - .sectors = 0, - .new_i_size = U64_MAX, - .flags = flags|BCH_INODE_I_SECTORS_DIRTY, - }; -} - /* normal i_size/i_sectors update machinery: */ -struct bchfs_extent_trans_hook { - struct bchfs_write_op *op; - struct extent_insert_hook hook; - - struct bch_inode_unpacked inode_u; - struct bkey_inode_buf inode_p; - - bool need_inode_update; -}; - -static enum btree_insert_ret -bchfs_extent_update_hook(struct extent_insert_hook *hook, - struct bpos committed_pos, - struct bpos next_pos, - struct bkey_s_c k, - const struct bkey_i *insert) +static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter, + bool *allocating) { - struct bchfs_extent_trans_hook *h = container_of(hook, - struct bchfs_extent_trans_hook, hook); - struct bch_inode_info *inode = h->op->inode; - int sign = bkey_extent_is_allocation(&insert->k) - - (k.k && bkey_extent_is_allocation(k.k)); - s64 sectors = (s64) (next_pos.offset - committed_pos.offset) * sign; - u64 offset = min(next_pos.offset << 9, h->op->new_i_size); - bool do_pack = false; + struct btree_iter iter; + struct bkey_s_c old; + s64 delta = 0; - if (h->op->unalloc && - 
!bch2_extent_is_fully_allocated(k)) - return BTREE_INSERT_ENOSPC; + bch2_btree_iter_init(&iter, _iter->c, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_SLOTS); - BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE)); + bch2_btree_iter_link(_iter, &iter); + bch2_btree_iter_copy(&iter, _iter); + + for_each_btree_key_continue(&iter, BTREE_ITER_SLOTS, old) { + if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) + break; + + if (allocating && + !bch2_extent_is_fully_allocated(old)) + *allocating = true; + + delta += (min(new->k.p.offset, + old.k->p.offset) - + max(bkey_start_offset(&new->k), + bkey_start_offset(old.k))) * + (bkey_extent_is_allocation(&new->k) - + bkey_extent_is_allocation(old.k)); + } + + bch2_btree_iter_unlink(&iter); + + return delta; +} + +static int bch2_extent_update(struct btree_trans *trans, + struct bch_inode_info *inode, + struct disk_reservation *disk_res, + struct quota_res *quota_res, + struct btree_iter *extent_iter, + struct bkey_i *k, + u64 new_i_size, + bool may_allocate, + bool direct, + s64 *total_delta) +{ + struct btree_iter *inode_iter = NULL; + struct bch_inode_unpacked inode_u; + struct bkey_inode_buf inode_p; + bool allocating = false; + bool extended = false; + s64 i_sectors_delta; + int ret; + + bch2_trans_begin_updates(trans); + + ret = bch2_btree_iter_traverse(extent_iter); + if (ret) + return ret; + + bch2_extent_trim_atomic(k, extent_iter); + + i_sectors_delta = sum_sector_overwrites(k, extent_iter, &allocating); + if (!may_allocate && allocating) + return -ENOSPC; + + bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, k)); + + new_i_size = min(k->k.p.offset << 9, new_i_size); /* XXX: inode->i_size locking */ - if (offset > inode->ei_inode.bi_size) { - if (!h->need_inode_update) { - h->need_inode_update = true; - return BTREE_INSERT_NEED_TRAVERSE; + if (i_sectors_delta || + new_i_size > inode->ei_inode.bi_size) { + inode_iter = bch2_trans_get_iter(trans, + BTREE_ID_INODES, + POS(k->k.p.inode, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if (IS_ERR(inode_iter)) + return PTR_ERR(inode_iter); + + ret = bch2_btree_iter_traverse(inode_iter); + if (ret) + goto err; + + inode_u = inode->ei_inode; + inode_u.bi_sectors += i_sectors_delta; + + /* XXX: this is slightly suspect */ + if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > inode_u.bi_size) { + inode_u.bi_size = new_i_size; + extended = true; } - /* truncate in progress? 
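/*
 * A worked example of the sum_sector_overwrites() delta computed above
 * (numbers illustrative): writing an allocated extent over sectors [100, 164)
 * on top of an existing allocated extent [96, 128) followed by a hole (which
 * the BTREE_ITER_SLOTS iteration returns as an unallocated slot key):
 *
 *	old [96, 128):   overlap 28 sectors, sign (1 - 1) = 0  ->  +0
 *	hole [128, ..):  overlap 36 sectors, sign (1 - 0) = 1  ->  +36
 *
 * so i_sectors grows by 36 -- only the newly allocated portion.
 */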
*/ - if (h->inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) - goto no_i_size_update; - - h->inode_u.bi_size = offset; - do_pack = true; - - inode->ei_inode.bi_size = offset; - - spin_lock(&inode->v.i_lock); - if (offset > inode->v.i_size) { - if (h->op->is_dio) - i_size_write(&inode->v, offset); - else - BUG(); - } - spin_unlock(&inode->v.i_lock); - } -no_i_size_update: - if (sectors) { - if (!h->need_inode_update) { - h->need_inode_update = true; - return BTREE_INSERT_NEED_TRAVERSE; - } - - h->inode_u.bi_sectors += sectors; - do_pack = true; - - h->op->sectors_added += sectors; + bch2_inode_pack(&inode_p, &inode_u); + bch2_trans_update(trans, + BTREE_INSERT_ENTRY(inode_iter, &inode_p.inode.k_i)); } - if (do_pack) - bch2_inode_pack(&h->inode_p, &h->inode_u); + ret = bch2_trans_commit(trans, disk_res, + &inode->ei_journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_USE_RESERVE); + if (ret) + goto err; - return BTREE_INSERT_OK; + inode->ei_inode.bi_sectors += i_sectors_delta; + + EBUG_ON(i_sectors_delta && + inode->ei_inode.bi_sectors != inode_u.bi_sectors); + + if (extended) { + inode->ei_inode.bi_size = new_i_size; + + if (direct) { + spin_lock(&inode->v.i_lock); + if (new_i_size > inode->v.i_size) + i_size_write(&inode->v, new_i_size); + spin_unlock(&inode->v.i_lock); + } + } + + if (direct) + i_sectors_acct(trans->c, inode, quota_res, i_sectors_delta); + + if (total_delta) + *total_delta += i_sectors_delta; +err: + if (!IS_ERR_OR_NULL(inode_iter)) + bch2_trans_iter_put(trans, inode_iter); + return ret; } static int bchfs_write_index_update(struct bch_write_op *wop) { struct bchfs_write_op *op = container_of(wop, struct bchfs_write_op, op); + struct quota_res *quota_res = op->is_dio + ? &container_of(op, struct dio_write, iop)->quota_res + : NULL; + struct bch_inode_info *inode = op->inode; struct keylist *keys = &op->op.insert_keys; - struct btree_trans trans; - struct btree_iter *extent_iter, *inode_iter = NULL; - struct bchfs_extent_trans_hook hook; struct bkey_i *k = bch2_keylist_front(keys); - s64 orig_sectors_added = op->sectors_added; + struct btree_trans trans; + struct btree_iter *iter; int ret; - BUG_ON(k->k.p.inode != op->inode->v.i_ino); + BUG_ON(k->k.p.inode != inode->v.i_ino); bch2_trans_init(&trans, wop->c); + bch2_trans_preload_iters(&trans); - extent_iter = bch2_trans_get_iter(&trans, + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - bkey_start_pos(&bch2_keylist_front(keys)->k), + bkey_start_pos(&k->k), BTREE_ITER_INTENT); - BUG_ON(IS_ERR(extent_iter)); - - hook.op = op; - hook.hook.fn = bchfs_extent_update_hook; - hook.need_inode_update = false; do { - /* XXX: inode->i_size locking */ - k = bch2_keylist_front(keys); - if (min(k->k.p.offset << 9, op->new_i_size) > - op->inode->ei_inode.bi_size) - hook.need_inode_update = true; + BKEY_PADDED(k) tmp; - /* optimization for fewer transaction restarts: */ - ret = bch2_btree_iter_traverse(extent_iter); - if (ret) - goto err; + bkey_copy(&tmp.k, bch2_keylist_front(keys)); - if (hook.need_inode_update) { - struct bkey_s_c inode; - - if (!inode_iter) { - inode_iter = bch2_trans_get_iter(&trans, - BTREE_ID_INODES, - POS(extent_iter->pos.inode, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - BUG_ON(IS_ERR(inode_iter)); - } - - inode = bch2_btree_iter_peek_slot(inode_iter); - if ((ret = btree_iter_err(inode))) - goto err; - - if (WARN_ONCE(inode.k->type != BCH_INODE_FS, - "inode %llu not found when updating", - extent_iter->pos.inode)) { - ret = -ENOENT; - break; - } - - if 
(WARN_ONCE(bkey_bytes(inode.k) > - sizeof(hook.inode_p), - "inode %llu too big (%zu bytes, buf %zu)", - extent_iter->pos.inode, - bkey_bytes(inode.k), - sizeof(hook.inode_p))) { - ret = -ENOENT; - break; - } - - bkey_reassemble(&hook.inode_p.inode.k_i, inode); - ret = bch2_inode_unpack(bkey_s_c_to_inode(inode), - &hook.inode_u); - if (WARN_ONCE(ret, - "error %i unpacking inode %llu", - ret, extent_iter->pos.inode)) { - ret = -ENOENT; - break; - } - - ret = bch2_btree_insert_at(wop->c, &wop->res, - &hook.hook, op_journal_seq(wop), - BTREE_INSERT_NOFAIL| - BTREE_INSERT_ATOMIC| - BTREE_INSERT_USE_RESERVE, - BTREE_INSERT_ENTRY(extent_iter, k), - BTREE_INSERT_ENTRY_EXTRA_RES(inode_iter, - &hook.inode_p.inode.k_i, 2)); - } else { - ret = bch2_btree_insert_at(wop->c, &wop->res, - &hook.hook, op_journal_seq(wop), - BTREE_INSERT_NOFAIL| - BTREE_INSERT_ATOMIC| - BTREE_INSERT_USE_RESERVE, - BTREE_INSERT_ENTRY(extent_iter, k)); - } - - BUG_ON(bkey_cmp(extent_iter->pos, bkey_start_pos(&k->k))); - - if (WARN_ONCE(!ret != !k->k.size, - "ret %i k->size %u", ret, k->k.size)) - ret = k->k.size ? -EINTR : 0; -err: + ret = bch2_extent_update(&trans, inode, + &wop->res, quota_res, + iter, &tmp.k, + op->new_i_size, + !op->unalloc, + op->is_dio, + &op->sectors_added); if (ret == -EINTR) continue; if (ret) break; - BUG_ON(bkey_cmp(extent_iter->pos, k->k.p) < 0); - bch2_keylist_pop_front(keys); + if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0) + bch2_cut_front(iter->pos, bch2_keylist_front(keys)); + else + bch2_keylist_pop_front(keys); } while (!bch2_keylist_empty(keys)); bch2_trans_exit(&trans); - if (op->is_dio) { - struct dio_write *dio = container_of(op, struct dio_write, iop); - - i_sectors_acct(wop->c, op->inode, &dio->quota_res, - op->sectors_added - orig_sectors_added); - } - return ret; } @@ -828,17 +733,6 @@ static bool bio_can_add_page_contig(struct bio *bio, struct page *page) bio_end_sector(bio) == offset; } -static void __bio_add_page(struct bio *bio, struct page *page) -{ - bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) { - .bv_page = page, - .bv_len = PAGE_SIZE, - .bv_offset = 0, - }; - - bio->bi_iter.bi_size += PAGE_SIZE; -} - static int bio_add_page_contig(struct bio *bio, struct page *page) { sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT; @@ -850,7 +744,7 @@ static int bio_add_page_contig(struct bio *bio, struct page *page) else if (!bio_can_add_page_contig(bio, page)) return -1; - __bio_add_page(bio, page); + __bio_add_page(bio, page, PAGE_SIZE, 0); return 0; } @@ -974,7 +868,7 @@ static void readpage_bio_extend(struct readpages_iter *iter, iter->nr_pages--; } else if (get_more) { rcu_read_lock(); - page = radix_tree_lookup(&iter->mapping->page_tree, page_offset); + page = radix_tree_lookup(&iter->mapping->i_pages, page_offset); rcu_read_unlock(); if (page && !radix_tree_exceptional_entry(page)) @@ -994,7 +888,7 @@ static void readpage_bio_extend(struct readpages_iter *iter, if (ret) break; - __bio_add_page(bio, page); + __bio_add_page(bio, page, PAGE_SIZE, 0); } if (!iter->nr_pages) @@ -2068,7 +1962,7 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret; + int ret, ret2; ret = file_write_and_wait_range(file, start, end); if (ret) @@ -2084,11 +1978,63 @@ out: if (c->opts.journal_flush_disabled) return 0; - return bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq); + ret = bch2_journal_flush_seq(&c->journal, 
inode->ei_journal_seq); + ret2 = file_check_and_advance_wb_err(file); + + return ret ?: ret2; } /* truncate: */ +static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, + u64 start_offset, u64 end_offset, u64 *journal_seq) +{ + struct bpos start = POS(inode->v.i_ino, start_offset); + struct bpos end = POS(inode->v.i_ino, end_offset); + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start, + BTREE_ITER_INTENT); + + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = btree_iter_err(k)) && + bkey_cmp(iter->pos, end) < 0) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; + + bkey_init(&delete.k); + delete.k.p = iter->pos; + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end, &delete.k); + + ret = bch2_extent_update(&trans, inode, + &disk_res, NULL, iter, &delete, + 0, true, true, NULL); + bch2_disk_reservation_put(c, &disk_res); + + if (ret == -EINTR) + ret = 0; + if (ret) + break; + + bch2_btree_iter_cond_resched(iter); + } + + bch2_trans_exit(&trans); + + return ret; +} + static inline int range_has_data(struct bch_fs *c, struct bpos start, struct bpos end) @@ -2203,19 +2149,39 @@ static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr) setattr_copy(&inode->v, iattr); mutex_lock(&inode->ei_update_lock); - inode->v.i_mtime = inode->v.i_ctime = current_time(&inode->v); - ret = bch2_write_inode_size(c, inode, inode->v.i_size); + ret = bch2_write_inode_size(c, inode, inode->v.i_size, + ATTR_MTIME|ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); return ret; } +static int bch2_truncate_finish_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; + bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); + return 0; +} + +static int bch2_truncate_start_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, void *p) +{ + u64 *new_i_size = p; + + bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY; + bi->bi_size = *new_i_size; + return 0; +} + int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; - struct i_sectors_hook i_sectors_hook = - i_sectors_hook_init(inode, BCH_INODE_I_SIZE_DIRTY); + u64 new_i_size = iattr->ia_size; bool shrink; int ret = 0; @@ -2228,12 +2194,12 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) if (!shrink) { ret = bch2_extend(inode, iattr); - goto err_put_pagecache; + goto err; } ret = bch2_truncate_page(inode, iattr->ia_size); if (unlikely(ret)) - goto err_put_pagecache; + goto err; if (iattr->ia_size > inode->ei_inode.bi_size) ret = filemap_write_and_wait_range(mapping, @@ -2244,37 +2210,37 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) round_down(iattr->ia_size, PAGE_SIZE), iattr->ia_size - 1); if (ret) - goto err_put_pagecache; + goto err; - i_sectors_hook.new_i_size = iattr->ia_size; + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode(c, inode, bch2_truncate_start_fn, + &new_i_size, 0); + mutex_unlock(&inode->ei_update_lock); - ret = i_sectors_dirty_start(c, &i_sectors_hook); if (unlikely(ret)) - goto err_put_pagecache; + goto err; 
truncate_setsize(&inode->v, iattr->ia_size); - ret = bch2_inode_truncate(c, inode->v.i_ino, - round_up(iattr->ia_size, PAGE_SIZE) >> 9, - &i_sectors_hook.hook, - &inode->ei_journal_seq); + /* + * XXX: need a comment explaining why PAGE_SIZE and not block_bytes() + * here: + */ + ret = __bch2_fpunch(c, inode, + round_up(iattr->ia_size, PAGE_SIZE) >> 9, + U64_MAX, &inode->ei_journal_seq); if (unlikely(ret)) - goto err_put_sectors_dirty; + goto err; setattr_copy(&inode->v, iattr); - inode->v.i_mtime = inode->v.i_ctime = current_time(&inode->v); -out: - ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; -err_put_pagecache: + + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, + ATTR_MTIME|ATTR_CTIME); + mutex_unlock(&inode->ei_update_lock); +err: pagecache_block_put(&mapping->add_lock); return ret; -err_put_sectors_dirty: - /* - * On error - in particular, bch2_truncate_page() error - don't clear - * I_SIZE_DIRTY, as we've left data above i_size!: - */ - i_sectors_hook.flags &= ~BCH_INODE_I_SIZE_DIRTY; - goto out; } /* fallocate: */ @@ -2283,7 +2249,6 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; - u64 ino = inode->v.i_ino; u64 discard_start = round_up(offset, PAGE_SIZE) >> 9; u64 discard_end = round_down(offset + len, PAGE_SIZE) >> 9; int ret = 0; @@ -2309,34 +2274,9 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) truncate_pagecache_range(&inode->v, offset, offset + len - 1); - if (discard_start < discard_end) { - /* - * We need to pass in a disk reservation here because we might - * be splitting a compressed extent into two. This isn't a - * problem with truncate because truncate will never split an - * extent, only truncate it... 
- */ - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct i_sectors_hook i_sectors_hook = - i_sectors_hook_init(inode, 0); - int ret; - - ret = i_sectors_dirty_start(c, &i_sectors_hook); - if (unlikely(ret)) - goto err; - - ret = bch2_btree_delete_range(c, - BTREE_ID_EXTENTS, - POS(ino, discard_start), - POS(ino, discard_end), - ZERO_VERSION, - &disk_res, - &i_sectors_hook.hook, - &inode->ei_journal_seq); - - ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; - } + if (discard_start < discard_end) + ret = __bch2_fpunch(c, inode, discard_start, discard_end, + &inode->ei_journal_seq); err: pagecache_block_put(&mapping->add_lock); inode_unlock(&inode->v); @@ -2353,7 +2293,6 @@ static long bch2_fcollapse(struct bch_inode_info *inode, struct btree_iter *src, *dst; BKEY_PADDED(k) copy; struct bkey_s_c k; - struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0); loff_t new_size; int ret; @@ -2361,16 +2300,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode, return -EINVAL; bch2_trans_init(&trans, c); - - dst = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS(inode->v.i_ino, offset >> 9), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - BUG_ON(IS_ERR(dst)); - - /* position will be set from dst iter's position: */ - src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_SLOTS); - BUG_ON(IS_ERR(src)); + bch2_trans_preload_iters(&trans); /* * We need i_mutex to keep the page cache consistent with the extents @@ -2395,15 +2325,24 @@ static long bch2_fcollapse(struct bch_inode_info *inode, if (ret) goto err; - ret = i_sectors_dirty_start(c, &i_sectors_hook); - if (ret) - goto err; + dst = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(inode->v.i_ino, offset >> 9), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BUG_ON(IS_ERR_OR_NULL(dst)); + + src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS_MIN, BTREE_ITER_SLOTS); + BUG_ON(IS_ERR_OR_NULL(src)); while (bkey_cmp(dst->pos, POS(inode->v.i_ino, round_up(new_size, PAGE_SIZE) >> 9)) < 0) { struct disk_reservation disk_res; + ret = bch2_btree_iter_traverse(dst); + if (ret) + goto btree_iter_err; + bch2_btree_iter_set_pos(src, POS(dst->pos.inode, dst->pos.offset + (len >> 9))); @@ -2416,6 +2355,8 @@ static long bch2_fcollapse(struct bch_inode_info *inode, bch2_cut_front(src->pos, ©.k); copy.k.k.p.offset -= len >> 9; + bch2_extent_trim_atomic(©.k, dst); + BUG_ON(bkey_cmp(dst->pos, bkey_start_pos(©.k.k))); ret = bch2_disk_reservation_get(c, &disk_res, copy.k.k.size, @@ -2423,19 +2364,16 @@ static long bch2_fcollapse(struct bch_inode_info *inode, BCH_DISK_RESERVATION_NOFAIL); BUG_ON(ret); - ret = bch2_btree_insert_at(c, &disk_res, &i_sectors_hook.hook, - &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(dst, ©.k)); + ret = bch2_extent_update(&trans, inode, + &disk_res, NULL, + dst, ©.k, + 0, true, true, NULL); bch2_disk_reservation_put(c, &disk_res); btree_iter_err: if (ret == -EINTR) ret = 0; - if (ret) { - bch2_trans_exit(&trans); - goto err_put_sectors_dirty; - } + if (ret) + goto err; /* * XXX: if we error here we've left data with multiple * pointers... which isn't a _super_ serious problem... 
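A minimal standalone sketch of the delta arithmetic used by sum_sector_overwrites() earlier in this file, assuming plain sector offsets rather than bkeys (overlap_delta() is an illustrative name only, not a helper introduced by this patch): for each existing key the new extent overwrites, the contribution is the overlap length times (new is an allocation) - (old was an allocation), which is what lets bch2_extent_update() fold the i_sectors change into the same transaction.

#include <stdio.h>

/*
 * Signed i_sectors delta contributed by one (new, old) extent pair, mirroring
 * the per-key term in sum_sector_overwrites(): overlap length multiplied by
 * (new is an allocation) - (old was an allocation).  The early return is only
 * for standalone safety; the loop in the patch visits overlapping keys only.
 */
static long long overlap_delta(unsigned long long new_start,
			       unsigned long long new_end,
			       int new_alloc,
			       unsigned long long old_start,
			       unsigned long long old_end,
			       int old_alloc)
{
	unsigned long long start = old_start > new_start ? old_start : new_start;
	unsigned long long end   = old_end   < new_end   ? old_end   : new_end;

	if (end <= start)
		return 0;

	return (long long) (end - start) * (new_alloc - old_alloc);
}

int main(void)
{
	/* writing an allocated extent [0, 8) over a hole [0, 16): +8 sectors */
	printf("%lld\n", overlap_delta(0, 8, 1, 0, 16, 0));

	/* punching a hole (deleted, unallocated key) [4, 12) over an
	 * allocated extent [0, 8): -4 sectors */
	printf("%lld\n", overlap_delta(4, 12, 0, 0, 8, 1));

	return 0;
}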
@@ -2443,20 +2381,21 @@ btree_iter_err: bch2_btree_iter_cond_resched(src); } + bch2_trans_unlock(&trans); - bch2_trans_exit(&trans); - - ret = bch2_inode_truncate(c, inode->v.i_ino, - round_up(new_size, block_bytes(c)) >> 9, - &i_sectors_hook.hook, - &inode->ei_journal_seq); + ret = __bch2_fpunch(c, inode, + round_up(new_size, block_bytes(c)) >> 9, + U64_MAX, &inode->ei_journal_seq); if (ret) - goto err_put_sectors_dirty; + goto err; - i_sectors_hook.new_i_size = new_size; -err_put_sectors_dirty: - ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; + i_size_write(&inode->v, new_size); + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode_size(c, inode, new_size, + ATTR_MTIME|ATTR_CTIME); + mutex_unlock(&inode->ei_update_lock); err: + bch2_trans_exit(&trans); pagecache_block_put(&mapping->add_lock); inode_unlock(&inode->v); return ret; @@ -2467,8 +2406,8 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, { struct address_space *mapping = inode->v.i_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0); - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bpos end_pos; loff_t block_start, block_end; loff_t end = offset + len; @@ -2476,8 +2415,8 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, unsigned replicas = io_opts(c, inode).data_replicas; int ret; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); inode_lock(&inode->v); inode_dio_wait(&inode->v); @@ -2512,34 +2451,32 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, block_end = round_up(end, PAGE_SIZE); } - bch2_btree_iter_set_pos(&iter, POS(inode->v.i_ino, block_start >> 9)); + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(inode->v.i_ino, block_start >> 9), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); end_pos = POS(inode->v.i_ino, block_end >> 9); - ret = i_sectors_dirty_start(c, &i_sectors_hook); - if (unlikely(ret)) - goto err; - - while (bkey_cmp(iter.pos, end_pos) < 0) { + while (bkey_cmp(iter->pos, end_pos) < 0) { struct disk_reservation disk_res = { 0 }; + struct quota_res quota_res = { 0 }; struct bkey_i_reservation reservation; struct bkey_s_c k; - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(iter); if ((ret = btree_iter_err(k))) goto btree_iter_err; /* already reserved */ if (k.k->type == BCH_RESERVATION && bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { - bch2_btree_iter_next_slot(&iter); + bch2_btree_iter_next_slot(iter); continue; } - if (bkey_extent_is_data(k.k)) { - if (!(mode & FALLOC_FL_ZERO_RANGE)) { - bch2_btree_iter_next_slot(&iter); - continue; - } + if (bkey_extent_is_data(k.k) && + !(mode & FALLOC_FL_ZERO_RANGE)) { + bch2_btree_iter_next_slot(iter); + continue; } bkey_reservation_init(&reservation.k_i); @@ -2547,7 +2484,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, reservation.k.p = k.k->p; reservation.k.size = k.k->size; - bch2_cut_front(iter.pos, &reservation.k_i); + bch2_cut_front(iter->pos, &reservation.k_i); bch2_cut_back(end_pos, &reservation.k); sectors = reservation.k.size; @@ -2555,7 +2492,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, if (!bkey_extent_is_allocation(k.k)) { ret = bch2_quota_reservation_add(c, inode, - &i_sectors_hook.quota_res, + "a_res, sectors, true); if (unlikely(ret)) goto btree_iter_err; @@ -2571,31 
+2508,27 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, reservation.v.nr_replicas = disk_res.nr_replicas; } - ret = bch2_btree_insert_at(c, &disk_res, &i_sectors_hook.hook, - &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &reservation.k_i)); + ret = bch2_extent_update(&trans, inode, + &disk_res, "a_res, + iter, &reservation.k_i, + 0, true, true, NULL); + + bch2_quota_reservation_put(c, inode, "a_res); bch2_disk_reservation_put(c, &disk_res); btree_iter_err: if (ret == -EINTR) ret = 0; - if (ret) { - bch2_btree_iter_unlock(&iter); - goto err_put_sectors_dirty; - } - + if (ret) + goto err; } - bch2_btree_iter_unlock(&iter); - - ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; + bch2_trans_unlock(&trans); if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { i_size_write(&inode->v, end); mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, inode->v.i_size); + ret = bch2_write_inode_size(c, inode, inode->v.i_size, 0); mutex_unlock(&inode->ei_update_lock); } @@ -2611,18 +2544,13 @@ btree_iter_err: if (inode->ei_inode.bi_size != inode->v.i_size) { mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, inode->v.i_size); + ret = bch2_write_inode_size(c, inode, + inode->v.i_size, 0); mutex_unlock(&inode->ei_update_lock); } } - - pagecache_block_put(&mapping->add_lock); - inode_unlock(&inode->v); - - return 0; -err_put_sectors_dirty: - ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; err: + bch2_trans_exit(&trans); pagecache_block_put(&mapping->add_lock); inode_unlock(&inode->v); return ret; diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 336dbd4b..0eb0a011 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -11,79 +11,6 @@ #define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) -/* Inode flags: */ - -/* bcachefs inode flags -> vfs inode flags: */ -static const unsigned bch_flags_to_vfs[] = { - [__BCH_INODE_SYNC] = S_SYNC, - [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, - [__BCH_INODE_APPEND] = S_APPEND, - [__BCH_INODE_NOATIME] = S_NOATIME, -}; - -/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ -static const unsigned bch_flags_to_uflags[] = { - [__BCH_INODE_SYNC] = FS_SYNC_FL, - [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, - [__BCH_INODE_APPEND] = FS_APPEND_FL, - [__BCH_INODE_NODUMP] = FS_NODUMP_FL, - [__BCH_INODE_NOATIME] = FS_NOATIME_FL, -}; - -/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ -static const unsigned bch_flags_to_xflags[] = { - [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, - [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, - [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, - [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, - [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, - //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; -}; - -#define set_flags(_map, _in, _out) \ -do { \ - unsigned _i; \ - \ - for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ - if ((_in) & (1 << _i)) \ - (_out) |= _map[_i]; \ - else \ - (_out) &= ~_map[_i]; \ -} while (0) - -#define map_flags(_map, _in) \ -({ \ - unsigned _out = 0; \ - \ - set_flags(_map, _in, _out); \ - _out; \ -}) - -#define map_flags_rev(_map, _in) \ -({ \ - unsigned _i, _out = 0; \ - \ - for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ - if ((_in) & _map[_i]) { \ - (_out) |= 1 << _i; \ - (_in) &= ~_map[_i]; \ - } \ - (_out); \ -}) - -#define map_defined(_map) \ -({ \ - unsigned _in = ~0; \ - \ - map_flags_rev(_map, _in); \ -}) - -/* Set VFS inode flags from bcachefs inode: */ -void bch2_inode_flags_to_vfs(struct bch_inode_info 
*inode) -{ - set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); -} - struct flags_set { unsigned mask; unsigned flags; @@ -95,6 +22,7 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { + struct bch_fs *c = inode->v.i_sb->s_fs_info; /* * We're relying on btree locking here for exclusion with other ioctl * calls - use the flags in the btree (@bi), not inode->i_flags: @@ -107,14 +35,15 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode, !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; - if (!S_ISREG(inode->v.i_mode) && - !S_ISDIR(inode->v.i_mode) && + if (!S_ISREG(bi->bi_mode) && + !S_ISDIR(bi->bi_mode) && (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) return -EINVAL; bi->bi_flags &= ~s->mask; bi->bi_flags |= newflags; - inode->v.i_ctime = current_time(&inode->v); + + bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); return 0; } @@ -152,10 +81,8 @@ static int bch2_ioc_setflags(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s, 0); - - if (!ret) - bch2_inode_flags_to_vfs(inode); + ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s, + ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); setflags_out: @@ -241,9 +168,8 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, if (ret) goto err_unlock; - ret = __bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, 0); - if (!ret) - bch2_inode_flags_to_vfs(inode); + ret = bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, + ATTR_CTIME); err_unlock: mutex_unlock(&inode->ei_update_lock); err: diff --git a/libbcachefs/fs-ioctl.h b/libbcachefs/fs-ioctl.h index c14e583d..c7124ed3 100644 --- a/libbcachefs/fs-ioctl.h +++ b/libbcachefs/fs-ioctl.h @@ -1,7 +1,78 @@ #ifndef _BCACHEFS_FS_IOCTL_H #define _BCACHEFS_FS_IOCTL_H -void bch2_inode_flags_to_vfs(struct bch_inode_info *); +/* Inode flags: */ + +/* bcachefs inode flags -> vfs inode flags: */ +static const unsigned bch_flags_to_vfs[] = { + [__BCH_INODE_SYNC] = S_SYNC, + [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, + [__BCH_INODE_APPEND] = S_APPEND, + [__BCH_INODE_NOATIME] = S_NOATIME, +}; + +/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ +static const unsigned bch_flags_to_uflags[] = { + [__BCH_INODE_SYNC] = FS_SYNC_FL, + [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, + [__BCH_INODE_APPEND] = FS_APPEND_FL, + [__BCH_INODE_NODUMP] = FS_NODUMP_FL, + [__BCH_INODE_NOATIME] = FS_NOATIME_FL, +}; + +/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ +static const unsigned bch_flags_to_xflags[] = { + [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, + [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, + [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, + [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, + [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, + //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; +}; + +#define set_flags(_map, _in, _out) \ +do { \ + unsigned _i; \ + \ + for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ + if ((_in) & (1 << _i)) \ + (_out) |= _map[_i]; \ + else \ + (_out) &= ~_map[_i]; \ +} while (0) + +#define map_flags(_map, _in) \ +({ \ + unsigned _out = 0; \ + \ + set_flags(_map, _in, _out); \ + _out; \ +}) + +#define map_flags_rev(_map, _in) \ +({ \ + unsigned _i, _out = 0; \ + \ + for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ + if ((_in) & _map[_i]) { \ + (_out) |= 1 << _i; \ + (_in) &= ~_map[_i]; \ + } \ + (_out); \ +}) + +#define map_defined(_map) \ +({ \ + unsigned _in = ~0; \ + \ + map_flags_rev(_map, _in); \ +}) + +/* Set VFS inode flags from 
bcachefs inode: */ +static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) +{ + set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); +} long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index c51a65da..ae875870 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -47,6 +47,30 @@ static void journal_seq_copy(struct bch_inode_info *dst, } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); } +static inline int ptrcmp(void *l, void *r) +{ + return (l > r) - (l < r); +} + +#define __bch2_lock_inodes(_lock, ...) \ +do { \ + struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ + unsigned i; \ + \ + bubble_sort(&a[1], ARRAY_SIZE(a) - 1 , ptrcmp); \ + \ + for (i = ARRAY_SIZE(a) - 1; a[i]; --i) \ + if (a[i] != a[i - 1]) { \ + if (_lock) \ + mutex_lock_nested(&a[i]->ei_update_lock, i);\ + else \ + mutex_unlock(&a[i]->ei_update_lock); \ + } \ +} while (0) + +#define bch2_lock_inodes(...) __bch2_lock_inodes(true, __VA_ARGS__) +#define bch2_unlock_inodes(...) __bch2_lock_inodes(false, __VA_ARGS__) + /* * I_SIZE_DIRTY requires special handling: * @@ -96,6 +120,8 @@ void bch2_inode_update_after_write(struct bch_fs *c, inode->ei_inode = *bi; inode->ei_qid = bch_qid(bi); + + bch2_inode_flags_to_vfs(inode); } int __must_check bch2_write_inode_trans(struct btree_trans *trans, @@ -106,35 +132,22 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, { struct btree_iter *iter; struct bkey_inode_buf *inode_p; - struct bkey_s_c k; - u64 inum = inode->v.i_ino; int ret; lockdep_assert_held(&inode->ei_update_lock); - iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inum, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, + POS(inode->v.i_ino, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); if (IS_ERR(iter)) return PTR_ERR(iter); - k = bch2_btree_iter_peek_slot(iter); - if ((ret = btree_iter_err(k))) + /* The btree node lock is our lock on the inode: */ + ret = bch2_btree_iter_traverse(iter); + if (ret) return ret; - if (WARN_ONCE(k.k->type != BCH_INODE_FS, - "inode %llu not found when updating", inum)) - return -ENOENT; - - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode_u); - if (WARN_ONCE(ret, - "error %i unpacking inode %llu", ret, inum)) - return -ENOENT; - - BUG_ON(inode_u->bi_size != inode->ei_inode.bi_size); - - BUG_ON(inode_u->bi_size != inode->ei_inode.bi_size && - !(inode_u->bi_flags & BCH_INODE_I_SIZE_DIRTY) && - inode_u->bi_size > i_size_read(&inode->v)); + *inode_u = inode->ei_inode; if (set) { ret = set(inode, inode_u, p); @@ -147,14 +160,14 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, return PTR_ERR(inode_p); bch2_inode_pack(inode_p, inode_u); - bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i)); return 0; } -int __must_check __bch2_write_inode(struct bch_fs *c, - struct bch_inode_info *inode, - inode_set_fn set, - void *p, unsigned fields) +int __must_check bch2_write_inode(struct bch_fs *c, + struct bch_inode_info *inode, + inode_set_fn set, + void *p, unsigned fields) { struct btree_trans trans; struct bch_inode_unpacked inode_u; @@ -165,7 +178,7 @@ retry: bch2_trans_begin(&trans); ret = bch2_write_inode_trans(&trans, inode, &inode_u, set, p) ?: - bch2_trans_commit(&trans, NULL, NULL, + bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, 
BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK| @@ -235,9 +248,8 @@ static int inode_update_for_create_fn(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_inode_unpacked *new_inode = p; - struct timespec now = current_time(&inode->v); - bi->bi_mtime = bi->bi_ctime = timespec_to_bch2_time(c, now); + bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); if (S_ISDIR(new_inode->bi_mode)) bi->bi_nlink++; @@ -256,6 +268,7 @@ __bch2_create(struct bch_inode_info *dir, struct dentry *dentry, struct bch_inode_unpacked inode_u; struct bch_hash_info hash_info; struct posix_acl *default_acl = NULL, *acl = NULL; + u64 journal_seq = 0; int ret; bch2_inode_init(c, &inode_u, 0, 0, 0, rdev, &dir->ei_inode); @@ -288,6 +301,9 @@ __bch2_create(struct bch_inode_info *dir, struct dentry *dentry, goto err; } + if (!tmpfile) + mutex_lock(&dir->ei_update_lock); + bch2_trans_init(&trans, c); retry: bch2_trans_begin(&trans); @@ -316,8 +332,8 @@ retry: inode_update_for_create_fn, &inode_u) : 0) ?: - bch2_trans_commit(&trans, NULL, NULL, - &inode->ei_journal_seq, + bch2_trans_commit(&trans, NULL, + &journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); if (ret == -EINTR) @@ -331,9 +347,11 @@ retry: bch2_inode_update_after_write(c, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); journal_seq_copy(dir, inode->ei_journal_seq); + mutex_unlock(&dir->ei_update_lock); } bch2_vfs_inode_init(c, inode, &inode_u); + journal_seq_copy(inode, journal_seq); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); @@ -369,6 +387,9 @@ out: posix_acl_release(acl); return inode; err_trans: + if (!tmpfile) + mutex_unlock(&dir->ei_update_lock); + bch2_trans_exit(&trans); make_bad_inode(&inode->v); iput(&inode->v); @@ -416,9 +437,8 @@ static int inode_update_for_link_fn(struct bch_inode_info *inode, void *p) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct timespec now = current_time(&inode->v); - bi->bi_ctime = timespec_to_bch2_time(c, now); + bi->bi_ctime = bch2_current_time(c); if (bi->bi_flags & BCH_INODE_UNLINKED) bi->bi_flags &= ~BCH_INODE_UNLINKED; @@ -437,8 +457,7 @@ static int __bch2_link(struct bch_fs *c, struct bch_inode_unpacked inode_u; int ret; - lockdep_assert_held(&inode->v.i_rwsem); - + mutex_lock(&inode->ei_update_lock); bch2_trans_init(&trans, c); retry: bch2_trans_begin(&trans); @@ -452,7 +471,7 @@ retry: bch2_write_inode_trans(&trans, inode, &inode_u, inode_update_for_link_fn, NULL) ?: - bch2_trans_commit(&trans, NULL, NULL, + bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); @@ -464,6 +483,7 @@ retry: bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); bch2_trans_exit(&trans); + mutex_unlock(&inode->ei_update_lock); return ret; } @@ -475,6 +495,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); int ret; + lockdep_assert_held(&inode->v.i_rwsem); + ret = __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) return ret; @@ -490,9 +512,8 @@ static int inode_update_dir_for_unlink_fn(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_inode_info *unlink_inode = p; - struct timespec now = current_time(&inode->v); - bi->bi_mtime = bi->bi_ctime = timespec_to_bch2_time(c, now); + bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); bi->bi_nlink -= S_ISDIR(unlink_inode->v.i_mode); @@ -504,9 +525,8 @@ static int inode_update_for_unlink_fn(struct bch_inode_info *inode, 
void *p) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct timespec now = current_time(&inode->v); - bi->bi_ctime = timespec_to_bch2_time(c, now); + bi->bi_ctime = bch2_current_time(c); if (bi->bi_nlink) bi->bi_nlink--; else @@ -524,6 +544,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) struct btree_trans trans; int ret; + bch2_lock_inodes(dir, inode); bch2_trans_init(&trans, c); retry: bch2_trans_begin(&trans); @@ -537,7 +558,7 @@ retry: bch2_write_inode_trans(&trans, inode, &inode_u, inode_update_for_unlink_fn, NULL) ?: - bch2_trans_commit(&trans, NULL, NULL, + bch2_trans_commit(&trans, NULL, &dir->ei_journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK| @@ -556,6 +577,7 @@ retry: ATTR_MTIME); err: bch2_trans_exit(&trans); + bch2_unlock_inodes(dir, inode); return ret; } @@ -683,8 +705,6 @@ static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, { struct bch_fs *c = src_vdir->i_sb->s_fs_info; struct rename_info i = { - .now = timespec_to_bch2_time(c, - current_time(src_vdir)), .src_dir = to_bch_ei(src_vdir), .dst_dir = to_bch_ei(dst_vdir), .src_inode = to_bch_ei(src_dentry->d_inode), @@ -718,10 +738,15 @@ static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, return ret; } + bch2_lock_inodes(i.src_dir, + i.dst_dir, + i.src_inode, + i.dst_inode); + bch2_trans_init(&trans, c); retry: bch2_trans_begin(&trans); - i.now = timespec_to_bch2_time(c, current_time(src_vdir)), + i.now = bch2_current_time(c); ret = bch2_dirent_rename(&trans, i.src_dir, &src_dentry->d_name, @@ -739,7 +764,7 @@ retry: ? bch2_write_inode_trans(&trans, i.dst_inode, &dst_inode_u, inode_update_for_rename_fn, &i) : 0 ) ?: - bch2_trans_commit(&trans, NULL, NULL, + bch2_trans_commit(&trans, NULL, &journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); @@ -758,6 +783,10 @@ retry: journal_seq_copy(i.dst_dir, journal_seq); } + journal_seq_copy(i.src_inode, journal_seq); + if (i.dst_inode) + journal_seq_copy(i.dst_inode, journal_seq); + bch2_inode_update_after_write(c, i.src_inode, &src_inode_u, ATTR_CTIME); if (i.dst_inode) @@ -765,6 +794,10 @@ retry: ATTR_CTIME); err: bch2_trans_exit(&trans); + bch2_unlock_inodes(i.src_dir, + i.dst_dir, + i.src_inode, + i.dst_inode); return ret; } @@ -849,7 +882,7 @@ retry: (iattr->ia_valid & ATTR_MODE ? 
bch2_acl_chmod(&trans, inode, iattr->ia_mode, &acl) : 0) ?: - bch2_trans_commit(&trans, NULL, NULL, + bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK| @@ -1198,8 +1231,6 @@ static void bch2_vfs_inode_init(struct bch_fs *c, inode->ei_quota_reserved = 0; inode->ei_str_hash = bch2_hash_info_init(c, bi); - bch2_inode_flags_to_vfs(inode); - inode->v.i_mapping->a_ops = &bch_address_space_operations; switch (inode->v.i_mode & S_IFMT) { @@ -1272,8 +1303,8 @@ static int bch2_vfs_write_inode(struct inode *vinode, int ret; mutex_lock(&inode->ei_update_lock); - ret = __bch2_write_inode(c, inode, inode_update_times_fn, NULL, - ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); + ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); if (c->opts.journal_flush_disabled) @@ -1312,13 +1343,16 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct bch_fs *c = sb->s_fs_info; + struct bch_fs_usage usage = bch2_fs_usage_read(c); + u64 hidden_metadata = usage.buckets[BCH_DATA_SB] + + usage.buckets[BCH_DATA_JOURNAL]; + unsigned shift = sb->s_blocksize_bits - 9; u64 fsid; buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = c->capacity >> PAGE_SECTOR_SHIFT; - buf->f_bfree = bch2_fs_sectors_free(c, bch2_fs_usage_read(c)) >> - PAGE_SECTOR_SHIFT; + buf->f_blocks = (c->capacity - hidden_metadata) >> shift; + buf->f_bfree = (c->capacity - bch2_fs_sectors_used(c, usage)) >> shift; buf->f_bavail = buf->f_bfree; buf->f_files = atomic_long_read(&c->nr_inodes); buf->f_ffree = U64_MAX; diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index e2fc2706..a434c757 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -1,6 +1,7 @@ #ifndef _BCACHEFS_FS_H #define _BCACHEFS_FS_H +#include "inode.h" #include "opts.h" #include "str_hash.h" #include "quota_types.h" @@ -43,6 +44,11 @@ static inline unsigned nlink_bias(umode_t mode) return S_ISDIR(mode) ? 
2 : 1; } +static inline u64 bch2_current_time(struct bch_fs *c) +{ + return timespec_to_bch2_time(c, current_kernel_time64()); +} + struct bch_inode_unpacked; #ifndef NO_BCACHEFS_FS @@ -59,10 +65,8 @@ int __must_check bch2_write_inode_trans(struct btree_trans *, struct bch_inode_info *, struct bch_inode_unpacked *, inode_set_fn, void *); -int __must_check __bch2_write_inode(struct bch_fs *, struct bch_inode_info *, - inode_set_fn, void *, unsigned); -int __must_check bch2_write_inode(struct bch_fs *, - struct bch_inode_info *); +int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, + inode_set_fn, void *, unsigned); void bch2_vfs_exit(void); int bch2_vfs_init(void); diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index f6035cc7..b3e247af 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -72,8 +72,7 @@ static int reattach_inode(struct bch_fs *c, bch2_inode_pack(&packed, lostfound_inode); ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, - NULL, NULL, NULL, - BTREE_INSERT_NOFAIL); + NULL, NULL, BTREE_INSERT_NOFAIL); if (ret) { bch_err(c, "error %i reattaching inode %llu while updating lost+found", ret, inum); @@ -201,7 +200,7 @@ retry: } ret = bch2_hash_delete_at(&trans, desc, info, iter) ?: - bch2_trans_commit(&trans, NULL, NULL, NULL, + bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL); err: @@ -289,6 +288,13 @@ fsck_err: return ret; } +static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size) +{ + return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, + POS(inode_nr, round_up(new_size, block_bytes(c)) >> 9), + POS(inode_nr + 1, 0), NULL); +} + /* * Walk extents: verify that extents have a corresponding S_ISREG inode, and * that i_size an i_sectors are consistent @@ -319,7 +325,7 @@ static int check_extents(struct bch_fs *c) k.k->type, k.k->p.inode, w.inode.bi_mode)) { bch2_btree_iter_unlock(&iter); - ret = bch2_inode_truncate(c, k.k->p.inode, 0, NULL, NULL); + ret = bch2_inode_truncate(c, k.k->p.inode, 0); if (ret) goto err; continue; @@ -341,10 +347,7 @@ static int check_extents(struct bch_fs *c) bch2_inode_pack(&p, &w.inode); ret = bch2_btree_insert(c, BTREE_ID_INODES, - &p.inode.k_i, - NULL, - NULL, - NULL, + &p.inode.k_i, NULL, NULL, BTREE_INSERT_NOFAIL); if (ret) { bch_err(c, "error in fs gc: error %i " @@ -365,8 +368,7 @@ static int check_extents(struct bch_fs *c) bch2_btree_iter_unlock(&iter); ret = bch2_inode_truncate(c, k.k->p.inode, - round_up(w.inode.bi_size, PAGE_SIZE) >> 9, - NULL, NULL); + w.inode.bi_size); if (ret) goto err; continue; @@ -397,7 +399,7 @@ static int check_dirents(struct bch_fs *c) bch2_trans_init(&trans, c); - BUG_ON(bch2_trans_preload_iters(&trans)); + bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS(BCACHEFS_ROOT_INO, 0), 0); @@ -507,7 +509,7 @@ static int check_dirents(struct bch_fs *c) bkey_reassemble(&n->k_i, d.s_c); n->v.d_type = mode_to_type(target.bi_mode); - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + ret = bch2_btree_insert_at(c, NULL, NULL, BTREE_INSERT_NOFAIL, BTREE_INSERT_ENTRY(iter, &n->k_i)); kfree(n); @@ -538,7 +540,7 @@ static int check_xattrs(struct bch_fs *c) bch2_trans_init(&trans, c); - BUG_ON(bch2_trans_preload_iters(&trans)); + bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS(BCACHEFS_ROOT_INO, 0), 0); @@ -601,7 +603,7 @@ create_root: bch2_inode_pack(&packed, root_inode); return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, - NULL, NULL, NULL, 
BTREE_INSERT_NOFAIL); + NULL, NULL, BTREE_INSERT_NOFAIL); } /* Get lost+found, create if it doesn't exist: */ @@ -645,7 +647,7 @@ create_lostfound: bch2_inode_pack(&packed, root_inode); ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, - NULL, NULL, NULL, BTREE_INSERT_NOFAIL); + NULL, NULL, BTREE_INSERT_NOFAIL); if (ret) return ret; @@ -1093,9 +1095,7 @@ static int check_inode(struct bch_fs *c, * just switch units to bytes and that issue goes away */ - ret = bch2_inode_truncate(c, u.bi_inum, - round_up(u.bi_size, PAGE_SIZE) >> 9, - NULL, NULL); + ret = bch2_inode_truncate(c, u.bi_inum, u.bi_size); if (ret) { bch_err(c, "error in fs gc: error %i " "truncating inode", ret); @@ -1141,7 +1141,7 @@ static int check_inode(struct bch_fs *c, bch2_inode_pack(&p, &u); - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + ret = bch2_btree_insert_at(c, NULL, NULL, BTREE_INSERT_NOFAIL, BTREE_INSERT_ENTRY(iter, &p.inode.k_i)); if (ret && ret != -EINTR) diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index d4139faa..4841715c 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -227,8 +227,8 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) } } -void bch2_inode_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +int bch2_inode_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { char *out = buf, *end = out + size; struct bkey_s_c_inode inode; @@ -248,6 +248,8 @@ void bch2_inode_to_text(struct bch_fs *c, char *buf, #undef BCH_INODE_FIELD break; } + + return out - buf; } void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, @@ -255,8 +257,8 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, struct bch_inode_unpacked *parent) { s64 now = timespec_to_bch2_time(c, - timespec_trunc(current_kernel_time(), - c->sb.time_precision)); + timespec64_trunc(current_kernel_time64(), + c->sb.time_precision)); memset(inode_u, 0, sizeof(*inode_u)); @@ -347,7 +349,8 @@ again: inode_u->bi_generation = bkey_generation(k); bch2_inode_pack(inode_p, inode_u); - bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); + bch2_trans_update(trans, + BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i)); return 0; } } @@ -369,33 +372,14 @@ int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u, __bch2_inode_create(&trans, inode_u, min, max, hint)); } -int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size, - struct extent_insert_hook *hook, u64 *journal_seq) -{ - return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, - POS(inode_nr, new_size), - POS(inode_nr + 1, 0), - ZERO_VERSION, NULL, hook, - journal_seq); -} - int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) { struct btree_iter iter; struct bkey_i_inode_generation delete; + struct bpos start = POS(inode_nr, 0); + struct bpos end = POS(inode_nr + 1, 0); int ret; - ret = bch2_inode_truncate(c, inode_nr, 0, NULL, NULL); - if (ret < 0) - return ret; - - ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, - POS(inode_nr, 0), - POS(inode_nr + 1, 0), - ZERO_VERSION, NULL, NULL, NULL); - if (ret < 0) - return ret; - /* * If this was a directory, there shouldn't be any real dirents left - * but there could be whiteouts (from hash collisions) that we should @@ -404,11 +388,13 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) * XXX: the dirent could ideally would delete whiteouts when they're no * longer needed */ - ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS, - POS(inode_nr, 0), - POS(inode_nr + 1, 0), - ZERO_VERSION, 
NULL, NULL, NULL); - if (ret < 0) + ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, + start, end, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_XATTRS, + start, end, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_DIRENTS, + start, end, NULL); + if (ret) return ret; bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inode_nr, 0), @@ -452,7 +438,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) delete.v.bi_generation = cpu_to_le32(bi_generation); } - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + ret = bch2_btree_insert_at(c, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL, BTREE_INSERT_ENTRY(&iter, &delete.k_i)); diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index a47194ab..93dbdaeb 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -6,7 +6,7 @@ #include const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_inode_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +int bch2_inode_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); #define bch2_bkey_inode_ops (struct bkey_ops) { \ .key_invalid = bch2_inode_invalid, \ @@ -45,21 +45,19 @@ int __bch2_inode_create(struct btree_trans *, int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *, u64, u64, u64 *); -int bch2_inode_truncate(struct bch_fs *, u64, u64, - struct extent_insert_hook *, u64 *); int bch2_inode_rm(struct bch_fs *, u64); int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); -static inline struct timespec bch2_time_to_timespec(struct bch_fs *c, u64 time) +static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time) { - return ns_to_timespec(time * c->sb.time_precision + c->sb.time_base_lo); + return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo); } -static inline u64 timespec_to_bch2_time(struct bch_fs *c, struct timespec ts) +static inline u64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) { - s64 ns = timespec_to_ns(&ts) - c->sb.time_base_lo; + s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo; if (c->sb.time_precision == 1) return ns; diff --git a/libbcachefs/io.c b/libbcachefs/io.c index f26d4041..5ca2a2dd 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -285,7 +285,7 @@ int bch2_write_index_default(struct bch_write_op *op) BTREE_ITER_INTENT); ret = bch2_btree_insert_list_at(&iter, keys, &op->res, - NULL, op_journal_seq(op), + op_journal_seq(op), BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE); bch2_btree_iter_unlock(&iter); @@ -1388,7 +1388,7 @@ retry: if (!bch2_extent_narrow_crcs(e, new_crc)) goto out; - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + ret = bch2_btree_insert_at(c, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_NOWAIT, diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index b4fe27f8..634123eb 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -32,14 +32,8 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set) test_bit(JOURNAL_NEED_WRITE, &j->flags)) bch2_time_stats_update(j->delay_time, j->need_write_time); -#if 0 - closure_call(&j->io, bch2_journal_write, NULL, NULL); -#else - /* Shut sparse up: */ - closure_init(&j->io, NULL); - set_closure_fn(&j->io, bch2_journal_write, NULL); - bch2_journal_write(&j->io); -#endif + + closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); } static void journal_pin_new_entry(struct journal *j, int count) @@ -96,7 +90,7 @@ static enum { } journal_buf_switch(struct journal *j, bool need_write_just_set) { struct 
bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf; + struct journal_buf *buf = journal_cur_buf(j); union journal_res_state old, new; u64 v = atomic64_read(&j->reservations.counter); @@ -107,8 +101,11 @@ static enum { if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) return JOURNAL_ENTRY_CLOSED; - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) { + /* this entry will never be written: */ + closure_wake_up(&buf->wait); return JOURNAL_ENTRY_ERROR; + } if (new.prev_buf_unwritten) return JOURNAL_ENTRY_INUSE; @@ -129,7 +126,6 @@ static enum { clear_bit(JOURNAL_NEED_WRITE, &j->flags); - buf = &j->buf[old.idx]; buf->data->u64s = cpu_to_le32(old.cur_entry_offset); j->prev_buf_sectors = @@ -138,8 +134,26 @@ static enum { c->opts.block_size; BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors); + /* + * We have to set last_seq here, _before_ opening a new journal entry: + * + * A threads may replace an old pin with a new pin on their current + * journal reservation - the expectation being that the journal will + * contain either what the old pin protected or what the new pin + * protects. + * + * After the old pin is dropped journal_last_seq() won't include the old + * pin, so we can only write the updated last_seq on the entry that + * contains whatever the new pin protects. + * + * Restated, we can _not_ update last_seq for a given entry if there + * could be a newer entry open with reservations/pins that have been + * taken against it. + * + * Hence, we want update/set last_seq on the current journal entry right + * before we open a new one: + */ bch2_journal_reclaim_fast(j); - /* XXX: why set this here, and not in bch2_journal_write()? */ buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); if (journal_entry_empty(buf->data)) @@ -154,13 +168,6 @@ static enum { cancel_delayed_work(&j->write_work); spin_unlock(&j->lock); - if (c->bucket_journal_seq > 1 << 14) { - c->bucket_journal_seq = 0; - bch2_bucket_seq_cleanup(c); - } - - c->bucket_journal_seq++; - /* ugh - might be called from __journal_res_get() under wait_event() */ __set_current_state(TASK_RUNNING); bch2_journal_buf_put(j, old.idx, need_write_just_set); @@ -265,34 +272,41 @@ static int journal_entry_open(struct journal *j) return 1; } -/* - * returns true if there's nothing to flush and no journal write still in flight - */ -static bool journal_flush_write(struct journal *j) +static bool __journal_entry_close(struct journal *j) { - bool ret; - - spin_lock(&j->lock); - ret = !j->reservations.prev_buf_unwritten; + bool set_need_write; if (!journal_entry_is_open(j)) { spin_unlock(&j->lock); - return ret; + return true; } - set_bit(JOURNAL_NEED_WRITE, &j->flags); - if (journal_buf_switch(j, false) == JOURNAL_UNLOCKED) - ret = false; - else + set_need_write = !test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags); + if (set_need_write) + j->need_write_time = local_clock(); + + switch (journal_buf_switch(j, set_need_write)) { + case JOURNAL_ENTRY_INUSE: spin_unlock(&j->lock); - return ret; + return false; + default: + spin_unlock(&j->lock); + case JOURNAL_UNLOCKED: + return true; + } +} + +static bool journal_entry_close(struct journal *j) +{ + spin_lock(&j->lock); + return __journal_entry_close(j); } static void journal_write_work(struct work_struct *work) { struct journal *j = container_of(work, struct journal, write_work.work); - journal_flush_write(j); + journal_entry_close(j); } /* @@ -462,6 +476,37 @@ int bch2_journal_open_seq_async(struct journal *j, u64 
seq, struct closure *pare return ret; } +static int journal_seq_error(struct journal *j, u64 seq) +{ + union journal_res_state state = READ_ONCE(j->reservations); + + if (seq == journal_cur_seq(j)) + return bch2_journal_error(j); + + if (seq + 1 == journal_cur_seq(j) && + !state.prev_buf_unwritten && + seq > j->seq_ondisk) + return -EIO; + + return 0; +} + +static inline struct journal_buf * +journal_seq_to_buf(struct journal *j, u64 seq) +{ + /* seq should be for a journal entry that has been opened: */ + BUG_ON(seq > journal_cur_seq(j)); + BUG_ON(seq == journal_cur_seq(j) && + j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); + + if (seq == journal_cur_seq(j)) + return journal_cur_buf(j); + if (seq + 1 == journal_cur_seq(j) && + j->reservations.prev_buf_unwritten) + return journal_prev_buf(j); + return NULL; +} + /** * bch2_journal_wait_on_seq - wait for a journal entry to be written * @@ -470,31 +515,22 @@ int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *pare * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is * configurable). */ -void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent) +void bch2_journal_wait_on_seq(struct journal *j, u64 seq, + struct closure *parent) { + struct journal_buf *buf; + spin_lock(&j->lock); - BUG_ON(seq > journal_cur_seq(j)); - - if (bch2_journal_error(j)) { - spin_unlock(&j->lock); - return; - } - - if (seq == journal_cur_seq(j)) { - if (!closure_wait(&journal_cur_buf(j)->wait, parent)) - BUG(); - } else if (seq + 1 == journal_cur_seq(j) && - j->reservations.prev_buf_unwritten) { - if (!closure_wait(&journal_prev_buf(j)->wait, parent)) + if ((buf = journal_seq_to_buf(j, seq))) { + if (!closure_wait(&buf->wait, parent)) BUG(); - smp_mb(); - - /* check if raced with write completion (or failure) */ - if (!j->reservations.prev_buf_unwritten || - bch2_journal_error(j)) - closure_wake_up(&journal_prev_buf(j)->wait); + if (seq == journal_cur_seq(j)) { + smp_mb(); + if (bch2_journal_error(j)) + closure_wake_up(&buf->wait); + } } spin_unlock(&j->lock); @@ -506,108 +542,35 @@ void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent * like bch2_journal_wait_on_seq, except that it triggers a write immediately if * necessary */ -void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent) +void bch2_journal_flush_seq_async(struct journal *j, u64 seq, + struct closure *parent) { struct journal_buf *buf; spin_lock(&j->lock); - BUG_ON(seq > journal_cur_seq(j)); - - if (bch2_journal_error(j)) { - spin_unlock(&j->lock); - return; - } - - if (seq == journal_cur_seq(j)) { - bool set_need_write = false; - - buf = journal_cur_buf(j); - - if (parent && !closure_wait(&buf->wait, parent)) - BUG(); - - if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) { - j->need_write_time = local_clock(); - set_need_write = true; - } - - switch (journal_buf_switch(j, set_need_write)) { - case JOURNAL_ENTRY_ERROR: - if (parent) - closure_wake_up(&buf->wait); - break; - case JOURNAL_ENTRY_CLOSED: - /* - * Journal entry hasn't been opened yet, but caller - * claims it has something - */ - BUG(); - case JOURNAL_ENTRY_INUSE: - break; - case JOURNAL_UNLOCKED: - return; - } - } else if (parent && - seq + 1 == journal_cur_seq(j) && - j->reservations.prev_buf_unwritten) { - buf = journal_prev_buf(j); - + if (parent && + (buf = journal_seq_to_buf(j, seq))) if (!closure_wait(&buf->wait, parent)) BUG(); - smp_mb(); - - /* check if raced with write completion (or 
failure) */ - if (!j->reservations.prev_buf_unwritten || - bch2_journal_error(j)) - closure_wake_up(&buf->wait); - } - - spin_unlock(&j->lock); + if (seq == journal_cur_seq(j)) + __journal_entry_close(j); + else + spin_unlock(&j->lock); } static int journal_seq_flushed(struct journal *j, u64 seq) { - struct journal_buf *buf; - int ret = 1; + int ret; spin_lock(&j->lock); - BUG_ON(seq > journal_cur_seq(j)); + ret = seq <= j->seq_ondisk ? 1 : journal_seq_error(j, seq); - if (seq == journal_cur_seq(j)) { - bool set_need_write = false; - - ret = 0; - - buf = journal_cur_buf(j); - - if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) { - j->need_write_time = local_clock(); - set_need_write = true; - } - - switch (journal_buf_switch(j, set_need_write)) { - case JOURNAL_ENTRY_ERROR: - ret = -EIO; - break; - case JOURNAL_ENTRY_CLOSED: - /* - * Journal entry hasn't been opened yet, but caller - * claims it has something - */ - BUG(); - case JOURNAL_ENTRY_INUSE: - break; - case JOURNAL_UNLOCKED: - return 0; - } - } else if (seq + 1 == journal_cur_seq(j) && - j->reservations.prev_buf_unwritten) { - ret = bch2_journal_error(j); - } - - spin_unlock(&j->lock); + if (seq == journal_cur_seq(j)) + __journal_entry_close(j); + else + spin_unlock(&j->lock); return ret; } @@ -727,6 +690,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (!journal_buckets) goto err; + /* + * We may be called from the device add path, before the new device has + * actually been added to the running filesystem: + */ if (c) spin_lock(&c->journal.lock); @@ -743,10 +710,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, long bucket; if (new_fs) { - percpu_down_read_preempt_disable(&c->usage_lock); bucket = bch2_bucket_alloc_new_fs(ca); - percpu_up_read_preempt_enable(&c->usage_lock); - if (bucket < 0) { ret = -ENOSPC; goto err; @@ -765,6 +729,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (c) { percpu_down_read_preempt_disable(&c->usage_lock); spin_lock(&c->journal.lock); + } else { + preempt_disable(); } __array_insert_item(ja->buckets, ja->nr, ja->last_idx); @@ -792,6 +758,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (c) { spin_unlock(&c->journal.lock); percpu_up_read_preempt_enable(&c->usage_lock); + } else { + preempt_enable(); } if (!new_fs) @@ -904,13 +872,16 @@ void bch2_fs_journal_stop(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - wait_event(j->wait, journal_flush_write(j)); + wait_event(j->wait, journal_entry_close(j)); /* do we need to write another journal entry? 
*/ if (test_bit(JOURNAL_NOT_EMPTY, &j->flags) || c->btree_roots_dirty) bch2_journal_meta(j); + BUG_ON(journal_entry_is_open(j) || + j->reservations.prev_buf_unwritten); + BUG_ON(!bch2_journal_error(j) && test_bit(JOURNAL_NOT_EMPTY, &j->flags)); @@ -920,6 +891,7 @@ void bch2_fs_journal_stop(struct journal *j) void bch2_fs_journal_start(struct journal *j) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_seq_blacklist *bl; u64 blacklist = 0; @@ -941,6 +913,8 @@ void bch2_fs_journal_start(struct journal *j) journal_pin_new_entry(j, 1); bch2_journal_buf_init(j); + c->last_bucket_seq_cleanup = journal_cur_seq(j); + spin_unlock(&j->lock); /* @@ -1014,6 +988,7 @@ int bch2_fs_journal_init(struct journal *j) init_waitqueue_head(&j->wait); INIT_DELAYED_WORK(&j->write_work, journal_write_work); INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); + init_waitqueue_head(&j->pin_flush_wait); mutex_init(&j->blacklist_lock); INIT_LIST_HEAD(&j->seq_blacklist); mutex_init(&j->reclaim_lock); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 8a4e7b2a..2a70edc2 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -901,7 +901,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) bch2_disk_reservation_init(c, 0); ret = bch2_btree_insert(c, entry->btree_id, k, - &disk_res, NULL, NULL, + &disk_res, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_JOURNAL_REPLAY); } @@ -1204,6 +1204,9 @@ static void journal_write_done(struct closure *cl) struct bch_devs_list devs = bch2_extent_devs(bkey_i_to_s_c_extent(&w->key)); u64 seq = le64_to_cpu(w->data->seq); + u64 last_seq = le64_to_cpu(w->data->last_seq); + + bch2_time_stats_update(j->write_time, j->write_start_time); if (!devs.nr) { bch_err(c, "unable to write journal to sufficient devices"); @@ -1212,11 +1215,11 @@ static void journal_write_done(struct closure *cl) if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs)) goto err; -out: - bch2_time_stats_update(j->write_time, j->write_start_time); spin_lock(&j->lock); - j->last_seq_ondisk = seq; + j->seq_ondisk = seq; + j->last_seq_ondisk = last_seq; + if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = devs; @@ -1228,7 +1231,7 @@ out: * bch2_fs_journal_stop(): */ mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0); - +out: /* also must come before signalling write completion: */ closure_debug_destroy(cl); @@ -1246,6 +1249,7 @@ out: err: bch2_fatal_error(c); bch2_journal_halt(j); + spin_lock(&j->lock); goto out; } @@ -1385,6 +1389,8 @@ no_io: extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) ptr->offset += sectors; + bch2_bucket_seq_cleanup(c); + continue_at(cl, journal_write_done, system_highpri_wq); return; err: diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 394b72bb..978aba72 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -10,34 +10,18 @@ * entry, holding it open to ensure it gets replayed during recovery: */ -static inline u64 journal_pin_seq(struct journal *j, - struct journal_entry_pin_list *pin_list) -{ - return fifo_entry_idx_abs(&j->pin, pin_list); -} - -u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin) -{ - u64 ret = 0; - - spin_lock(&j->lock); - if (journal_pin_active(pin)) - ret = journal_pin_seq(j, pin->pin_list); - spin_unlock(&j->lock); - - return ret; -} - static inline void __journal_pin_add(struct journal *j, - struct journal_entry_pin_list *pin_list, + u64 seq, struct journal_entry_pin *pin, journal_pin_flush_fn 
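As I read the journal_write_done() change, two watermarks are now recorded under j->lock on every completed write: seq_ondisk, the newest entry known to be on disk, and last_seq_ondisk, taken from that entry's last_seq field, i.e. the oldest entry the on-disk journal still requires. A tiny model with my own struct names, just to make the two roles explicit:

#include <stdint.h>
#include <stdio.h>

struct ondisk_marks {
        uint64_t seq_ondisk;            /* newest journal entry written */
        uint64_t last_seq_ondisk;       /* oldest entry still referenced on disk */
};

/* Called once per completed journal write, with that entry's seq/last_seq: */
static void write_done(struct ondisk_marks *m, uint64_t seq, uint64_t last_seq)
{
        m->seq_ondisk      = seq;
        m->last_seq_ondisk = last_seq;
}

int main(void)
{
        struct ondisk_marks m = { 0, 0 };

        write_done(&m, 10, 4);
        printf("flush waiters up to seq %llu are done; "
               "entries before seq %llu are no longer needed on disk\n",
               (unsigned long long) m.seq_ondisk,
               (unsigned long long) m.last_seq_ondisk);
        return 0;
}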
flush_fn) { + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + BUG_ON(journal_pin_active(pin)); BUG_ON(!atomic_read(&pin_list->count)); atomic_inc(&pin_list->count); - pin->pin_list = pin_list; + pin->seq = seq; pin->flush = flush_fn; if (flush_fn) @@ -57,19 +41,20 @@ void bch2_journal_pin_add(struct journal *j, u64 seq, journal_pin_flush_fn flush_fn) { spin_lock(&j->lock); - __journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn); + __journal_pin_add(j, seq, pin, flush_fn); spin_unlock(&j->lock); } static inline void __journal_pin_drop(struct journal *j, struct journal_entry_pin *pin) { - struct journal_entry_pin_list *pin_list = pin->pin_list; + struct journal_entry_pin_list *pin_list; if (!journal_pin_active(pin)) return; - pin->pin_list = NULL; + pin_list = journal_seq_pin(j, pin->seq); + pin->seq = 0; list_del_init(&pin->list); /* @@ -82,7 +67,7 @@ static inline void __journal_pin_drop(struct journal *j, } void bch2_journal_pin_drop(struct journal *j, - struct journal_entry_pin *pin) + struct journal_entry_pin *pin) { spin_lock(&j->lock); __journal_pin_drop(j, pin); @@ -98,15 +83,21 @@ void bch2_journal_pin_add_if_older(struct journal *j, if (journal_pin_active(src_pin) && (!journal_pin_active(pin) || - journal_pin_seq(j, src_pin->pin_list) < - journal_pin_seq(j, pin->pin_list))) { + src_pin->seq < pin->seq)) { __journal_pin_drop(j, pin); - __journal_pin_add(j, src_pin->pin_list, pin, flush_fn); + __journal_pin_add(j, src_pin->seq, pin, flush_fn); } spin_unlock(&j->lock); } +void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) +{ + BUG_ON(journal_pin_active(pin)); + + wait_event(j->pin_flush_wait, j->flush_in_progress != pin); +} + /* * Journal reclaim: flush references to open journal entries to reclaim space in * the journal @@ -144,41 +135,42 @@ void bch2_journal_reclaim_fast(struct journal *j) journal_wake(j); } -static struct journal_entry_pin * -__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) +static void journal_pin_mark_flushing(struct journal *j, + struct journal_entry_pin *pin, + u64 seq) { - struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *ret; - u64 iter; + lockdep_assert_held(&j->reclaim_lock); - /* no need to iterate over empty fifo entries: */ - bch2_journal_reclaim_fast(j); + list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); + BUG_ON(j->flush_in_progress); + j->flush_in_progress = pin; +} - fifo_for_each_entry_ptr(pin_list, &j->pin, iter) { - if (iter > seq_to_flush) - break; +static void journal_pin_flush(struct journal *j, + struct journal_entry_pin *pin, + u64 seq) +{ + pin->flush(j, pin, seq); - ret = list_first_entry_or_null(&pin_list->list, - struct journal_entry_pin, list); - if (ret) { - /* must be list_del_init(), see bch2_journal_pin_drop() */ - list_move(&ret->list, &pin_list->flushed); - *seq = iter; - return ret; - } - } - - return NULL; + BUG_ON(j->flush_in_progress != pin); + j->flush_in_progress = NULL; + wake_up(&j->pin_flush_wait); } static struct journal_entry_pin * journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) { - struct journal_entry_pin *ret; + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *ret = NULL; - spin_lock(&j->lock); - ret = __journal_get_next_pin(j, seq_to_flush, seq); - spin_unlock(&j->lock); + /* no need to iterate over empty fifo entries: */ + bch2_journal_reclaim_fast(j); + + fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) + if (*seq > seq_to_flush || + (ret = 
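The pin changes add a flush_in_progress marker plus a pin_flush_wait waitqueue, so bch2_journal_pin_flush() can wait until a flush of that specific pin has finished. The standalone pthread program below is only a model of that handshake, with a condition variable standing in for the kernel waitqueue; it is not bcachefs code.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct pin { int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  pin_flush_wait = PTHREAD_COND_INITIALIZER;
static struct pin *flush_in_progress;

static void flush_pin(struct pin *pin)
{
        pthread_mutex_lock(&lock);
        flush_in_progress = pin;                /* journal_pin_mark_flushing() */
        pthread_mutex_unlock(&lock);

        usleep(100 * 1000);                     /* stands in for pin->flush(j, pin, seq) */

        pthread_mutex_lock(&lock);
        flush_in_progress = NULL;               /* flush done */
        pthread_cond_broadcast(&pin_flush_wait);/* wake_up(&j->pin_flush_wait) */
        pthread_mutex_unlock(&lock);
}

static void wait_for_pin_flush(struct pin *pin)
{
        pthread_mutex_lock(&lock);
        while (flush_in_progress == pin)        /* wait_event(..., flush_in_progress != pin) */
                pthread_cond_wait(&pin_flush_wait, &lock);
        pthread_mutex_unlock(&lock);
}

static void *flusher(void *arg)
{
        flush_pin(arg);
        return NULL;
}

int main(void)
{
        struct pin p = { .id = 1 };
        pthread_t t;

        pthread_create(&t, NULL, flusher, &p);
        usleep(10 * 1000);
        wait_for_pin_flush(&p);                 /* returns once the flush finished */
        printf("pin %d no longer being flushed\n", p.id);
        pthread_join(&t, NULL);
        return 0;
}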
list_first_entry_or_null(&pin_list->list, + struct journal_entry_pin, list))) + break; return ret; } @@ -278,15 +270,11 @@ void bch2_journal_reclaim_work(struct work_struct *work) spin_unlock(&j->lock); } - if (reclaim_lock_held) - mutex_unlock(&j->reclaim_lock); - /* Also flush if the pin fifo is more than half full */ spin_lock(&j->lock); seq_to_flush = max_t(s64, seq_to_flush, (s64) journal_cur_seq(j) - (j->pin.size >> 1)); - spin_unlock(&j->lock); /* * If it's been longer than j->reclaim_delay_ms since we last flushed, @@ -298,13 +286,31 @@ void bch2_journal_reclaim_work(struct work_struct *work) while ((pin = journal_get_next_pin(j, need_flush ? U64_MAX : seq_to_flush, &seq))) { - __set_current_state(TASK_RUNNING); - pin->flush(j, pin, seq); - need_flush = false; + if (!reclaim_lock_held) { + spin_unlock(&j->lock); + __set_current_state(TASK_RUNNING); + mutex_lock(&j->reclaim_lock); + reclaim_lock_held = true; + spin_lock(&j->lock); + continue; + } + journal_pin_mark_flushing(j, pin, seq); + spin_unlock(&j->lock); + + journal_pin_flush(j, pin, seq); + + need_flush = false; j->last_flushed = jiffies; + + spin_lock(&j->lock); } + spin_unlock(&j->lock); + + if (reclaim_lock_held) + mutex_unlock(&j->reclaim_lock); + if (!test_bit(BCH_FS_RO, &c->flags)) queue_delayed_work(system_freezable_wq, &j->reclaim_work, msecs_to_jiffies(j->reclaim_delay_ms)); @@ -327,11 +333,14 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, * If journal replay hasn't completed, the unreplayed journal entries * hold refs on their corresponding sequence numbers */ - ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL || + ret = (*pin = journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL || !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || journal_last_seq(j) > seq_to_flush || (fifo_used(&j->pin) == 1 && atomic_read(&fifo_peek_front(&j->pin).count) == 1); + if (*pin) + journal_pin_mark_flushing(j, *pin, *pin_seq); + spin_unlock(&j->lock); return ret; @@ -345,14 +354,18 @@ void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) if (!test_bit(JOURNAL_STARTED, &j->flags)) return; + mutex_lock(&j->reclaim_lock); + while (1) { wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq)); if (!pin) break; - pin->flush(j, pin, pin_seq); + journal_pin_flush(j, pin, pin_seq); } + + mutex_unlock(&j->reclaim_lock); } int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h index eb227902..f7dcbfd3 100644 --- a/libbcachefs/journal_reclaim.h +++ b/libbcachefs/journal_reclaim.h @@ -5,19 +5,17 @@ static inline bool journal_pin_active(struct journal_entry_pin *pin) { - return pin->pin_list != NULL; + return pin->seq != 0; } static inline struct journal_entry_pin_list * journal_seq_pin(struct journal *j, u64 seq) { - BUG_ON(seq < j->pin.front || seq >= j->pin.back); + EBUG_ON(seq < j->pin.front || seq >= j->pin.back); return &j->pin.data[seq & j->pin.mask]; } -u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *); - void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, journal_pin_flush_fn); void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); @@ -25,6 +23,7 @@ void bch2_journal_pin_add_if_older(struct journal *, struct journal_entry_pin *, struct journal_entry_pin *, journal_pin_flush_fn); +void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); void bch2_journal_reclaim_fast(struct journal *); void 
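The reworked reclaim loop juggles two locks: pins are found under the j->lock spinlock, but flushing them requires the j->reclaim_lock mutex, which cannot be taken while spinning. The sketch below (simplified state, pthread locks standing in for the kernel ones) shows the drop-spinlock / take-mutex / retake-and-recheck dance this hunk implements; the `continue` matters because the pin situation may have changed while the spinlock was dropped.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_spinlock_t j_lock;
static pthread_mutex_t reclaim_lock = PTHREAD_MUTEX_INITIALIZER;

static int pins_to_flush = 3;

static int next_pin(void)       /* caller holds j_lock */
{
        return pins_to_flush > 0 ? pins_to_flush : 0;
}

static void reclaim(void)
{
        bool reclaim_lock_held = false;
        int pin;

        pthread_spin_lock(&j_lock);

        while ((pin = next_pin())) {
                if (!reclaim_lock_held) {
                        /* can't sleep on a mutex while holding a spinlock: */
                        pthread_spin_unlock(&j_lock);
                        pthread_mutex_lock(&reclaim_lock);
                        reclaim_lock_held = true;
                        pthread_spin_lock(&j_lock);
                        continue;       /* state may have changed, re-check */
                }

                /* journal_pin_mark_flushing() equivalent, still under j_lock: */
                pins_to_flush--;
                pthread_spin_unlock(&j_lock);

                printf("flushing pin %d\n", pin);       /* pin->flush() runs unlocked */

                pthread_spin_lock(&j_lock);
        }

        pthread_spin_unlock(&j_lock);

        if (reclaim_lock_held)
                pthread_mutex_unlock(&reclaim_lock);
}

int main(void)
{
        pthread_spin_init(&j_lock, PTHREAD_PROCESS_PRIVATE);
        reclaim();
        return 0;
}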
bch2_journal_reclaim_work(struct work_struct *); diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index effbeece..26702482 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -47,7 +47,7 @@ typedef void (*journal_pin_flush_fn)(struct journal *j, struct journal_entry_pin { struct list_head list; journal_pin_flush_fn flush; - struct journal_entry_pin_list *pin_list; + u64 seq; }; /* corresponds to a btree node with a blacklisted bset: */ @@ -150,7 +150,8 @@ struct journal { /* Sequence number of most recent journal entry (last entry in @pin) */ atomic64_t seq; - /* last_seq from the most recent journal entry written */ + /* seq, last_seq from the most recent journal entry successfully written */ + u64 seq_ondisk; u64 last_seq_ondisk; /* @@ -173,6 +174,10 @@ struct journal { u64 front, back, size, mask; struct journal_entry_pin_list *data; } pin; + + struct journal_entry_pin *flush_in_progress; + wait_queue_head_t pin_flush_wait; + u64 replay_journal_seq; struct mutex blacklist_lock; diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index 215c5aa5..f5cbf44d 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -78,7 +78,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) iter.pos = bkey_start_pos(&tmp.key.k); - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + ret = bch2_btree_insert_at(c, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL, BTREE_INSERT_ENTRY(&iter, &tmp.key)); diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 3e52b7a2..4a5e435b 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -158,7 +158,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) break; ret = bch2_btree_insert_at(c, &op->res, - NULL, op_journal_seq(op), + op_journal_seq(op), BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 7bef4561..d414ee94 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -227,16 +227,10 @@ static int bch2_copygc_thread(void *arg) last = atomic_long_read(&clock->now); - reserve = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) * - ca->mi.bucket_size * - c->opts.gc_reserve_percent, 200); + reserve = ca->copygc_threshold; usage = bch2_dev_usage_read(c, ca); - /* - * don't start copygc until less than half the gc reserve is - * available: - */ available = __dev_buckets_available(ca, usage) * ca->mi.bucket_size; if (available > reserve) { diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index f476033e..79b16fe7 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -113,9 +113,12 @@ enum opt_type { BCH_OPT(inodes_32bit, u8, OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_INODE_32BIT, false) \ - BCH_OPT(gc_reserve_percent, u8, OPT_MOUNT, \ + BCH_OPT(gc_reserve_percent, u8, OPT_RUNTIME, \ OPT_UINT(5, 21), \ BCH_SB_GC_RESERVE, 8) \ + BCH_OPT(gc_reserve_bytes, u64, OPT_RUNTIME, \ + OPT_UINT(0, U64_MAX), \ + BCH_SB_GC_RESERVE_BYTES, 0) \ BCH_OPT(root_reserve_percent, u8, OPT_MOUNT, \ OPT_UINT(0, 100), \ BCH_SB_ROOT_RESERVE, 0) \ diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c index bb03d83a..e1604581 100644 --- a/libbcachefs/quota.c +++ b/libbcachefs/quota.c @@ -45,10 +45,10 @@ static const char * const bch2_quota_counters[] = { "inodes", }; -void bch2_quota_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +int bch2_quota_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { - char *out = buf, *end= buf + size; + char 
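The copygc reserve is no longer computed inline; the removed lines show the old formula, half of gc_reserve_percent of the device's usable capacity in sectors, which now arrives precomputed as ca->copygc_threshold. A userspace arithmetic sketch of that removed computation (copygc_reserve_sectors() is my own name):

#include <stdint.h>
#include <stdio.h>

static uint64_t copygc_reserve_sectors(uint64_t nbuckets, uint64_t first_bucket,
                                       uint64_t bucket_size,
                                       unsigned gc_reserve_percent)
{
        /* half the gc reserve: copygc kicks in before the reserve is gone */
        return (nbuckets - first_bucket) * bucket_size * gc_reserve_percent / 200;
}

int main(void)
{
        /* e.g. ~1M buckets of 1024 sectors with an 8% reserve -> 4% of capacity */
        printf("%llu sectors\n",
               (unsigned long long) copygc_reserve_sectors(1 << 20, 1, 1024, 8));
        return 0;
}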
*out = buf, *end = buf + size; struct bkey_s_c_quota dq; unsigned i; @@ -63,6 +63,8 @@ void bch2_quota_to_text(struct bch_fs *c, char *buf, le64_to_cpu(dq.v->c[i].softlimit)); break; } + + return out - buf; } #ifdef CONFIG_BCACHEFS_QUOTA @@ -538,7 +540,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, POS(QTYP_USR, 0), POS(QTYP_USR + 1, 0), - ZERO_VERSION, NULL, NULL, NULL); + NULL); if (ret) return ret; } @@ -550,7 +552,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, POS(QTYP_GRP, 0), POS(QTYP_GRP + 1, 0), - ZERO_VERSION, NULL, NULL, NULL); + NULL); if (ret) return ret; } @@ -562,7 +564,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, POS(QTYP_PRJ, 0), POS(QTYP_PRJ + 1, 0), - ZERO_VERSION, NULL, NULL, NULL); + NULL); if (ret) return ret; } @@ -761,7 +763,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, if (qdq->d_fieldmask & QC_INO_HARD) new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + ret = bch2_btree_insert_at(c, NULL, NULL, 0, BTREE_INSERT_ENTRY(&iter, &new_quota.k_i)); bch2_btree_iter_unlock(&iter); diff --git a/libbcachefs/quota.h b/libbcachefs/quota.h index 0b24f22c..14570c8b 100644 --- a/libbcachefs/quota.h +++ b/libbcachefs/quota.h @@ -7,7 +7,7 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_quota; const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_quota_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +int bch2_quota_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); #define bch2_bkey_quota_ops (struct bkey_ops) { \ .key_invalid = bch2_quota_invalid, \ diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 0af136d6..3a20a774 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -330,7 +330,7 @@ int bch2_fs_initialize(struct bch_fs *c) err = "error creating root directory"; ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed_inode.inode.k_i, - NULL, NULL, NULL, 0); + NULL, NULL, 0); if (ret) goto err; @@ -343,7 +343,7 @@ int bch2_fs_initialize(struct bch_fs *c) err = "error creating lost+found"; ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed_inode.inode.k_i, - NULL, NULL, NULL, 0); + NULL, NULL, 0); if (ret) goto err; diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index 99f1fe87..7eff5a42 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -254,14 +254,14 @@ not_found: return -ENOENT; insert->k.p = slot->pos; - bch2_trans_update(trans, slot, insert, 0); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(slot, insert)); return 0; found: if (flags & BCH_HASH_SET_MUST_CREATE) return -EEXIST; insert->k.p = iter->pos; - bch2_trans_update(trans, iter, insert, 0); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, insert)); return 0; } @@ -296,7 +296,7 @@ static inline int bch2_hash_delete_at(struct btree_trans *trans, delete->k.p = iter->pos; delete->k.type = ret ? 
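bch2_quota_to_text() (and the other to_text hooks further down) switch to the out/end pattern and return the number of bytes written. Here is a self-contained userspace sketch of that pattern; snprintf_clamped() is my stand-in for the kernel's scnprintf(), which already returns the clamped length.

#include <stdarg.h>
#include <stdio.h>

static int snprintf_clamped(char *buf, size_t size, const char *fmt, ...)
{
        va_list args;
        int ret;

        if (!size)
                return 0;

        va_start(args, fmt);
        ret = vsnprintf(buf, size, fmt, args);
        va_end(args);

        /* like scnprintf(): never report more than was actually written */
        return ret >= (int) size ? (int) size - 1 : ret;
}

static int demo_to_text(char *buf, size_t size)
{
        char *out = buf, *end = buf + size;

        out += snprintf_clamped(out, end - out, "inodes:\t%u\n", 42);
        out += snprintf_clamped(out, end - out, "space:\t%u\n", 7);

        return out - buf;       /* length written, matching the new int return type */
}

int main(void)
{
        char buf[64];
        int n = demo_to_text(buf, sizeof(buf));

        printf("%.*s(%d bytes)\n", n, buf, n);
        return 0;
}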
desc.whiteout_type : KEY_TYPE_DELETED; - bch2_trans_update(trans, iter, delete, 0); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, delete)); return 0; } diff --git a/libbcachefs/super.c b/libbcachefs/super.c index a2a32b92..f4cf44a0 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -403,6 +403,7 @@ static void bch2_fs_free(struct bch_fs *c) bch2_fs_compress_exit(c); percpu_free_rwsem(&c->usage_lock); free_percpu(c->usage_percpu); + mempool_exit(&c->btree_iters_pool); mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); mempool_exit(&c->btree_interior_update_pool); @@ -435,6 +436,8 @@ void bch2_fs_stop(struct bch_fs *c) struct bch_dev *ca; unsigned i; + bch_verbose(c, "shutting down"); + for_each_member_device(ca, c, i) if (ca->kobj.state_in_sysfs && ca->disk_sb.bdev) @@ -476,6 +479,8 @@ void bch2_fs_stop(struct bch_fs *c) if (c->devs[i]) bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); + bch_verbose(c, "shutdown complete"); + kobject_put(&c->kobj); } @@ -628,6 +633,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) percpu_init_rwsem(&c->usage_lock) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || + mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, + sizeof(struct btree_iter) * BTREE_ITER_MAX) || bch2_io_clock_init(&c->io_clock[READ]) || bch2_io_clock_init(&c->io_clock[WRITE]) || bch2_fs_journal_init(&c->journal) || @@ -1019,14 +1026,6 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ca->disk_sb.bdev->bd_holder = ca; memset(sb, 0, sizeof(*sb)); - if (ca->fs) - mutex_lock(&ca->fs->sb_lock); - - bch2_mark_dev_superblock(ca->fs, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); - - if (ca->fs) - mutex_unlock(&ca->fs->sb_lock); - percpu_ref_reinit(&ca->io_ref); return 0; @@ -1052,6 +1051,11 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) if (ret) return ret; + mutex_lock(&c->sb_lock); + bch2_mark_dev_superblock(ca->fs, ca, + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + mutex_unlock(&c->sb_lock); + bch2_dev_sysfs_online(c, ca); if (c->sb.nr_devices == 1) @@ -1280,8 +1284,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), POS(ca->dev_idx + 1, 0), - ZERO_VERSION, - NULL, NULL, NULL); + NULL); if (ret) { bch_err(ca, "Remove failed, error deleting alloc info"); goto err; @@ -1329,6 +1332,24 @@ err: return ret; } +static void dev_usage_clear(struct bch_dev *ca) +{ + struct bucket_array *buckets; + int cpu; + + for_each_possible_cpu(cpu) { + struct bch_dev_usage *p = + per_cpu_ptr(ca->usage_percpu, cpu); + memset(p, 0, sizeof(*p)); + } + + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + + memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets); + up_read(&ca->bucket_lock); +} + /* Add new device to running filesystem: */ int bch2_dev_add(struct bch_fs *c, const char *path) { @@ -1367,11 +1388,28 @@ int bch2_dev_add(struct bch_fs *c, const char *path) return ret; } + /* + * We want to allocate journal on the new device before adding the new + * device to the filesystem because allocating after we attach requires + * spinning up the allocator thread, and the allocator thread requires + * doing btree writes, which if the existing devices are RO isn't going + * to work + * + * So we have to mark where the superblocks are, but marking allocated + * data normally updates the filesystem usage too, so we have to mark, + * allocate the journal, reset 
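The new dev_usage_clear() wipes the provisional accounting a not-yet-attached device has accumulated: every CPU's copy of the usage counters, then the bucket array itself under the bucket lock. A userspace sketch with plain arrays standing in for the per-CPU data (all names here are mine):

#include <stdio.h>
#include <string.h>

#define NR_CPUS 4

struct dev_usage { unsigned long buckets_used, sectors_used; };
struct bucket { unsigned char mark; };

static struct dev_usage usage_percpu[NR_CPUS];  /* stands in for ca->usage_percpu */
static struct bucket buckets[128];              /* stands in for bucket_array(ca)->b */

static void dev_usage_clear_model(void)
{
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                memset(&usage_percpu[cpu], 0, sizeof(usage_percpu[cpu]));

        /* the real code holds ca->bucket_lock for read around this: */
        memset(buckets, 0, sizeof(buckets));
}

int main(void)
{
        usage_percpu[1].buckets_used = 7;
        buckets[3].mark = 1;

        dev_usage_clear_model();

        printf("cleared: %lu %u\n",
               usage_percpu[1].buckets_used, (unsigned) buckets[3].mark);
        return 0;
}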
all the marks, then remark after we + * attach... + */ + bch2_mark_dev_superblock(ca->fs, ca, + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + err = "journal alloc failed"; ret = bch2_dev_journal_alloc(ca); if (ret) goto err; + dev_usage_clear(ca); + mutex_lock(&c->state_lock); mutex_lock(&c->sb_lock); @@ -1422,6 +1460,9 @@ have_slot: ca->disk_sb.sb->dev_idx = dev_idx; bch2_dev_attach(c, ca, dev_idx); + bch2_mark_dev_superblock(c, ca, + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 4987ee76..b353d7cd 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -229,41 +229,42 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) { + char *out = buf, *end = buf + PAGE_SIZE; struct bch_fs_usage stats = bch2_fs_usage_read(c); + unsigned replicas, type; - return scnprintf(buf, PAGE_SIZE, - "capacity:\t\t%llu\n" - "1 replicas:\n" - "\tmeta:\t\t%llu\n" - "\tdirty:\t\t%llu\n" - "\treserved:\t%llu\n" - "2 replicas:\n" - "\tmeta:\t\t%llu\n" - "\tdirty:\t\t%llu\n" - "\treserved:\t%llu\n" - "3 replicas:\n" - "\tmeta:\t\t%llu\n" - "\tdirty:\t\t%llu\n" - "\treserved:\t%llu\n" - "4 replicas:\n" - "\tmeta:\t\t%llu\n" - "\tdirty:\t\t%llu\n" - "\treserved:\t%llu\n" + out += scnprintf(out, end - out, + "capacity:\t\t%llu\n", + c->capacity); + + for (replicas = 0; replicas < ARRAY_SIZE(stats.replicas); replicas++) { + out += scnprintf(out, end - out, + "%u replicas:\n", + replicas + 1); + + for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++) + out += scnprintf(out, end - out, + "\t%s:\t\t%llu\n", + bch2_data_types[type], + stats.replicas[replicas].data[type]); + out += scnprintf(out, end - out, + "\treserved:\t%llu\n", + stats.replicas[replicas].persistent_reserved); + } + + out += scnprintf(out, end - out, "bucket usage\n"); + + for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++) + out += scnprintf(out, end - out, + "\t%s:\t\t%llu\n", + bch2_data_types[type], + stats.buckets[type]); + + out += scnprintf(out, end - out, "online reserved:\t%llu\n", - c->capacity, - stats.s[0].data[S_META], - stats.s[0].data[S_DIRTY], - stats.s[0].persistent_reserved, - stats.s[1].data[S_META], - stats.s[1].data[S_DIRTY], - stats.s[1].persistent_reserved, - stats.s[2].data[S_META], - stats.s[2].data[S_DIRTY], - stats.s[2].persistent_reserved, - stats.s[3].data[S_META], - stats.s[3].data[S_DIRTY], - stats.s[3].persistent_reserved, stats.online_reserved); + + return out - buf; } static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) @@ -779,13 +780,15 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) " meta: %llu\n" " user: %llu\n" " cached: %llu\n" - " available: %llu\n" + " available: %lli\n" "sectors:\n" " sb: %llu\n" " journal: %llu\n" " meta: %llu\n" " user: %llu\n" " cached: %llu\n" + " fragmented: %llu\n" + " copygc threshold: %llu\n" "freelist_wait: %s\n" "open buckets: %u/%u (reserved %u)\n" "open_buckets_wait: %s\n", @@ -800,12 +803,14 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) stats.buckets[BCH_DATA_BTREE], stats.buckets[BCH_DATA_USER], stats.buckets[BCH_DATA_CACHED], - __dev_buckets_available(ca, stats), + ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable, stats.sectors[BCH_DATA_SB], stats.sectors[BCH_DATA_JOURNAL], stats.sectors[BCH_DATA_BTREE], stats.sectors[BCH_DATA_USER], stats.sectors[BCH_DATA_CACHED], + stats.sectors_fragmented, + ca->copygc_threshold, 
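The comment above spells out the ordering constraint in bch2_dev_add(). Here is the same sequence as a standalone outline with stub functions, names mine rather than the real calls, purely to make the order of operations easy to scan:

#include <stdio.h>

static void mark_dev_superblock(void)   { puts("1. mark superblock buckets"); }
static void dev_journal_alloc(void)     { puts("2. allocate journal on the new device"); }
static void dev_usage_clear_step(void)  { puts("3. reset provisional usage counters"); }
static void dev_attach(void)            { puts("4. attach device to the filesystem"); }
static void mark_superblock_again(void) { puts("5. re-mark superblock for fs accounting"); }

int main(void)
{
        mark_dev_superblock();          /* so journal alloc sees those buckets as used */
        dev_journal_alloc();            /* done before attach: no allocator thread needed */
        dev_usage_clear_step();         /* throw away the provisional marks */
        dev_attach();
        mark_superblock_again();        /* now the running fs accounting is correct */
        return 0;
}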
c->freelist_wait.list.first ? "waiting" : "empty", c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE, c->open_buckets_wait.list.first ? "waiting" : "empty"); diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c index 31847a94..f06eb2d8 100644 --- a/libbcachefs/tests.c +++ b/libbcachefs/tests.c @@ -14,12 +14,12 @@ static void delete_test_keys(struct bch_fs *c) ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, POS(0, 0), POS(0, U64_MAX), - ZERO_VERSION, NULL, NULL, NULL); + NULL); BUG_ON(ret); ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS, POS(0, 0), POS(0, U64_MAX), - ZERO_VERSION, NULL, NULL, NULL); + NULL); BUG_ON(ret); } @@ -39,7 +39,7 @@ static void test_delete(struct bch_fs *c, u64 nr) ret = bch2_btree_iter_traverse(&iter); BUG_ON(ret); - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + ret = bch2_btree_insert_at(c, NULL, NULL, 0, BTREE_INSERT_ENTRY(&iter, &k.k_i)); BUG_ON(ret); @@ -68,7 +68,7 @@ static void test_delete_written(struct bch_fs *c, u64 nr) ret = bch2_btree_iter_traverse(&iter); BUG_ON(ret); - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + ret = bch2_btree_insert_at(c, NULL, NULL, 0, BTREE_INSERT_ENTRY(&iter, &k.k_i)); BUG_ON(ret); @@ -98,7 +98,7 @@ static void test_iterate(struct bch_fs *c, u64 nr) k.k.p.offset = i; ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, - NULL, NULL, NULL, 0); + NULL, NULL, 0); BUG_ON(ret); } @@ -140,7 +140,7 @@ static void test_iterate_extents(struct bch_fs *c, u64 nr) k.k.size = 8; ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, - NULL, NULL, NULL, 0); + NULL, NULL, 0); BUG_ON(ret); } @@ -185,7 +185,7 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) k.k.p.offset = i * 2; ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, - NULL, NULL, NULL, 0); + NULL, NULL, 0); BUG_ON(ret); } @@ -235,7 +235,7 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) k.k.size = 8; ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, - NULL, NULL, NULL, 0); + NULL, NULL, 0); BUG_ON(ret); } @@ -270,6 +270,63 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) bch2_btree_iter_unlock(&iter); } +/* extent unit tests */ + +u64 test_version; + +static void insert_test_extent(struct bch_fs *c, + u64 start, u64 end) +{ + struct bkey_i_cookie k; + int ret; + + //pr_info("inserting %llu-%llu v %llu", start, end, test_version); + + bkey_cookie_init(&k.k_i); + k.k_i.k.p.offset = end; + k.k_i.k.size = end - start; + k.k_i.k.version.lo = test_version++; + + ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, + NULL, NULL, 0); + BUG_ON(ret); +} + +static void __test_extent_overwrite(struct bch_fs *c, + u64 e1_start, u64 e1_end, + u64 e2_start, u64 e2_end) +{ + insert_test_extent(c, e1_start, e1_end); + insert_test_extent(c, e2_start, e2_end); + + delete_test_keys(c); +} + +static void test_extent_overwrite_front(struct bch_fs *c, u64 nr) +{ + __test_extent_overwrite(c, 0, 64, 0, 32); + __test_extent_overwrite(c, 8, 64, 0, 32); +} + +static void test_extent_overwrite_back(struct bch_fs *c, u64 nr) +{ + __test_extent_overwrite(c, 0, 64, 32, 64); + __test_extent_overwrite(c, 0, 64, 32, 72); +} + +static void test_extent_overwrite_middle(struct bch_fs *c, u64 nr) +{ + __test_extent_overwrite(c, 0, 64, 32, 40); +} + +static void test_extent_overwrite_all(struct bch_fs *c, u64 nr) +{ + __test_extent_overwrite(c, 32, 64, 0, 64); + __test_extent_overwrite(c, 32, 64, 0, 128); + __test_extent_overwrite(c, 32, 64, 32, 64); + __test_extent_overwrite(c, 32, 64, 32, 128); +} + /* perf tests */ static u64 
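The new extent overwrite tests insert an extent and then a second, newer one on top of it. Assuming the usual newest-write-wins extent semantics these tests appear to exercise, the surviving pieces of the older extent for each named case work out as in this small userspace model (surviving_pieces() is my own helper, not part of the tests):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static void surviving_pieces(uint64_t old_start, uint64_t old_end,
                             uint64_t new_start, uint64_t new_end)
{
        printf("old [%" PRIu64 ",%" PRIu64 ") overwritten by [%" PRIu64 ",%" PRIu64 "):",
               old_start, old_end, new_start, new_end);

        if (old_start < new_start)      /* front of the old extent survives */
                printf(" [%" PRIu64 ",%" PRIu64 ")", old_start,
                       new_start < old_end ? new_start : old_end);
        if (new_end < old_end)          /* back of the old extent survives */
                printf(" [%" PRIu64 ",%" PRIu64 ")",
                       new_end > old_start ? new_end : old_start, old_end);
        printf("\n");
}

int main(void)
{
        surviving_pieces(0, 64,  0,  32);       /* _front:  old keeps [32,64) */
        surviving_pieces(0, 64, 32,  64);       /* _back:   old keeps [0,32) */
        surviving_pieces(0, 64, 32,  40);       /* _middle: old is split in two */
        surviving_pieces(32, 64, 0, 128);       /* _all:    nothing of old survives */
        return 0;
}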
test_rand(void) @@ -294,7 +351,7 @@ static void rand_insert(struct bch_fs *c, u64 nr) k.k.p.offset = test_rand(); ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, - NULL, NULL, NULL, 0); + NULL, NULL, 0); BUG_ON(ret); } } @@ -335,7 +392,7 @@ static void rand_mixed(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p = iter.pos; - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + ret = bch2_btree_insert_at(c, NULL, NULL, 0, BTREE_INSERT_ENTRY(&iter, &k.k_i)); BUG_ON(ret); } @@ -356,7 +413,7 @@ static void rand_delete(struct bch_fs *c, u64 nr) k.k.p.offset = test_rand(); ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k, - NULL, NULL, NULL, 0); + NULL, NULL, 0); BUG_ON(ret); } } @@ -375,7 +432,7 @@ static void seq_insert(struct bch_fs *c, u64 nr) BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) { insert.k.p = iter.pos; - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + ret = bch2_btree_insert_at(c, NULL, NULL, 0, BTREE_INSERT_ENTRY(&iter, &insert.k_i)); BUG_ON(ret); @@ -407,7 +464,7 @@ static void seq_overwrite(struct bch_fs *c, u64 nr) bkey_reassemble(&u.k_i, k); - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, + ret = bch2_btree_insert_at(c, NULL, NULL, 0, BTREE_INSERT_ENTRY(&iter, &u.k_i)); BUG_ON(ret); } @@ -420,7 +477,7 @@ static void seq_delete(struct bch_fs *c, u64 nr) ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS, POS(0, 0), POS(0, U64_MAX), - ZERO_VERSION, NULL, NULL, NULL); + NULL); BUG_ON(ret); } @@ -498,6 +555,11 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname, perf_test(test_iterate_slots); perf_test(test_iterate_slots_extents); + perf_test(test_extent_overwrite_front); + perf_test(test_extent_overwrite_back); + perf_test(test_extent_overwrite_middle); + perf_test(test_extent_overwrite_all); + if (!j.fn) { pr_err("unknown test %s", testname); return; diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index 7d0fee3a..398bc534 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -110,12 +110,12 @@ const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k) } } -void bch2_xattr_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +int bch2_xattr_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { + char *out = buf, *end = buf + size; const struct xattr_handler *handler; struct bkey_s_c_xattr xattr; - size_t n = 0; switch (k.k->type) { case BCH_XATTR: @@ -123,24 +123,26 @@ void bch2_xattr_to_text(struct bch_fs *c, char *buf, handler = bch2_xattr_type_to_handler(xattr.v->x_type); if (handler && handler->prefix) - n += scnprintf(buf + n, size - n, "%s", handler->prefix); + out += scnprintf(out, end - out, "%s", handler->prefix); else if (handler) - n += scnprintf(buf + n, size - n, "(type %u)", - xattr.v->x_type); + out += scnprintf(out, end - out, "(type %u)", + xattr.v->x_type); else - n += scnprintf(buf + n, size - n, "(unknown type %u)", - xattr.v->x_type); + out += scnprintf(out, end - out, "(unknown type %u)", + xattr.v->x_type); - n += bch_scnmemcpy(buf + n, size - n, xattr.v->x_name, - xattr.v->x_name_len); - n += scnprintf(buf + n, size - n, ":"); - n += bch_scnmemcpy(buf + n, size - n, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); + out += bch_scnmemcpy(out, end - out, xattr.v->x_name, + xattr.v->x_name_len); + out += scnprintf(out, end - out, ":"); + out += bch_scnmemcpy(out, end - out, xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); break; case BCH_XATTR_WHITEOUT: - scnprintf(buf, size, "whiteout"); + out += scnprintf(out, end - out, "whiteout"); break; } + + 
return out - buf; } int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, @@ -433,7 +435,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, } mutex_lock(&inode->ei_update_lock); - ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); + ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); mutex_unlock(&inode->ei_update_lock); if (value && diff --git a/libbcachefs/xattr.h b/libbcachefs/xattr.h index 0689d327..cd1e7ad3 100644 --- a/libbcachefs/xattr.h +++ b/libbcachefs/xattr.h @@ -6,7 +6,7 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_xattr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +int bch2_xattr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); #define bch2_bkey_xattr_ops (struct bkey_ops) { \ .key_invalid = bch2_xattr_invalid, \