diff --git a/.bcachefs_revision b/.bcachefs_revision index b34e94b8..666368e8 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -d83b992f653d9f742f3f8567dbcfd1f4f72e858f +6f603b8d79efa7d9ac04ea0c38ef1bbaa10fd678 diff --git a/cmd_debug.c b/cmd_debug.c index 5da97daa..637da1c5 100644 --- a/cmd_debug.c +++ b/cmd_debug.c @@ -91,7 +91,7 @@ int cmd_dump(int argc, char *argv[]) int fd, opt; opt_set(opts, nochanges, true); - opt_set(opts, noreplay, true); + opt_set(opts, norecovery, true); opt_set(opts, degraded, true); opt_set(opts, errors, BCH_ON_ERROR_CONTINUE); @@ -158,11 +158,12 @@ static void list_keys(struct bch_fs *c, enum btree_id btree_id, struct btree_iter *iter; struct bkey_s_c k; char buf[512]; + int ret; bch2_trans_init(&trans, c); for_each_btree_key(&trans, iter, btree_id, start, - BTREE_ITER_PREFETCH, k) { + BTREE_ITER_PREFETCH, k, ret) { if (bkey_cmp(k.k->p, end) > 0) break; diff --git a/cmd_fsck.c b/cmd_fsck.c index 824c4a1c..ebcf70bd 100644 --- a/cmd_fsck.c +++ b/cmd_fsck.c @@ -27,6 +27,7 @@ int cmd_fsck(int argc, char *argv[]) int opt, ret = 0; opt_set(opts, degraded, true); + opt_set(opts, fsck, true); opt_set(opts, fix_errors, FSCK_OPT_ASK); while ((opt = getopt(argc, argv, "apynfvh")) != -1) diff --git a/cmd_migrate.c b/cmd_migrate.c index 4b6ceaa7..f630c142 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -718,9 +718,9 @@ static int migrate_fs(const char *fs_path, mark_unreserved_space(c, extents); - const char *err = bch2_fs_start(c); - if (err) - die("Error starting new filesystem: %s", err); + int ret = bch2_fs_start(c); + if (ret) + die("Error starting new filesystem: %s", strerror(-ret)); copy_fs(c, fs_fd, fs_path, bcachefs_inum, &extents); diff --git a/include/linux/sched.h b/include/linux/sched.h index 5011ae7d..4a7f8a00 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -147,7 +147,7 @@ static inline u64 ktime_get_real_seconds(void) return ts.tv_sec; } -static inline void ktime_get_real_ts64(struct timespec64 *ts) +static inline void ktime_get_coarse_real_ts64(struct timespec64 *ts) { clock_gettime(CLOCK_MONOTONIC, ts); } diff --git a/include/linux/slab.h b/include/linux/slab.h index d8832c6c..5a9e7afd 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -59,9 +59,11 @@ static inline void *krealloc(void *old, size_t size, gfp_t flags) #define kcalloc(n, size, flags) kmalloc_array(n, size, flags|__GFP_ZERO) #define kfree(p) free(p) -#define kvfree(p) free(p) #define kzfree(p) free(p) +#define kvmalloc(size, flags) kmalloc(size, flags) +#define kvfree(p) kfree(p) + static inline struct page *alloc_pages(gfp_t flags, unsigned int order) { size_t size = PAGE_SIZE << order; diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 5a306568..a61b25cc 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -11,7 +11,7 @@ #include "debug.h" #include "ec.h" #include "error.h" -#include "journal_io.h" +#include "recovery.h" #include #include @@ -128,20 +128,26 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, *p += bytes; } -struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *a) +struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) { - struct bkey_alloc_unpacked ret = { .gen = a->gen }; - const void *d = a->data; - unsigned idx = 0; + struct bkey_alloc_unpacked ret = { .gen = 0 }; + + if (k.k->type == KEY_TYPE_alloc) { + const struct bch_alloc *a = bkey_s_c_to_alloc(k).v; + const void *d = a->data; + unsigned idx = 0; + + ret.gen 
= a->gen; #define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++); - BCH_ALLOC_FIELDS() + BCH_ALLOC_FIELDS() #undef x + } return ret; } -static void bch2_alloc_pack(struct bkey_i_alloc *dst, - const struct bkey_alloc_unpacked src) +void bch2_alloc_pack(struct bkey_i_alloc *dst, + const struct bkey_alloc_unpacked src) { unsigned idx = 0; void *d = dst->v.data; @@ -198,98 +204,46 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, get_alloc_field(a.v, &d, i)); } -static void __alloc_read_key(struct bucket *g, const struct bch_alloc *a) +static inline struct bkey_alloc_unpacked +alloc_mem_to_key(struct bucket *g, struct bucket_mark m) { - const void *d = a->data; - unsigned idx = 0, data_type, dirty_sectors, cached_sectors; - struct bucket_mark m; - - g->io_time[READ] = get_alloc_field(a, &d, idx++); - g->io_time[WRITE] = get_alloc_field(a, &d, idx++); - data_type = get_alloc_field(a, &d, idx++); - dirty_sectors = get_alloc_field(a, &d, idx++); - cached_sectors = get_alloc_field(a, &d, idx++); - g->oldest_gen = get_alloc_field(a, &d, idx++); - - bucket_cmpxchg(g, m, ({ - m.gen = a->gen; - m.data_type = data_type; - m.dirty_sectors = dirty_sectors; - m.cached_sectors = cached_sectors; - })); - - g->gen_valid = 1; + return (struct bkey_alloc_unpacked) { + .gen = m.gen, + .oldest_gen = g->oldest_gen, + .data_type = m.data_type, + .dirty_sectors = m.dirty_sectors, + .cached_sectors = m.cached_sectors, + .read_time = g->io_time[READ], + .write_time = g->io_time[WRITE], + }; } -static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g, - struct bucket_mark m) +int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) { - unsigned idx = 0; - void *d = a->v.data; - - a->v.fields = 0; - a->v.gen = m.gen; - - d = a->v.data; - put_alloc_field(a, &d, idx++, g->io_time[READ]); - put_alloc_field(a, &d, idx++, g->io_time[WRITE]); - put_alloc_field(a, &d, idx++, m.data_type); - put_alloc_field(a, &d, idx++, m.dirty_sectors); - put_alloc_field(a, &d, idx++, m.cached_sectors); - put_alloc_field(a, &d, idx++, g->oldest_gen); - - set_bkey_val_bytes(&a->k, (void *) d - (void *) &a->v); -} - -static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k) -{ - struct bch_dev *ca; - struct bkey_s_c_alloc a; - - if (k.k->type != KEY_TYPE_alloc) - return; - - a = bkey_s_c_to_alloc(k); - ca = bch_dev_bkey_exists(c, a.k->p.inode); - - if (a.k->p.offset >= ca->mi.nbuckets) - return; - - percpu_down_read_preempt_disable(&c->mark_lock); - __alloc_read_key(bucket(ca, a.k->p.offset), a.v); - percpu_up_read_preempt_enable(&c->mark_lock); -} - -int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) -{ - struct journal_replay *r; struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; struct bch_dev *ca; + struct journal_key *j; unsigned i; int ret; bch2_trans_init(&trans, c); - for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k) { - bch2_alloc_read_key(c, k); - bch2_trans_cond_resched(&trans); - } + for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret) + bch2_mark_key(c, k, true, 0, NULL, 0, 0); - ret = bch2_trans_exit(&trans); - if (ret) + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) { + bch_err(c, "error reading alloc info: %i", ret); return ret; - - list_for_each_entry(r, journal_replay_list, list) { - struct bkey_i *k, *n; - struct jset_entry *entry; - - for_each_jset_key(k, n, entry, &r->j) - if (entry->btree_id == BTREE_ID_ALLOC) - bch2_alloc_read_key(c, bkey_i_to_s_c(k)); } + 
for_each_journal_key(*journal_keys, j) + if (j->btree_id == BTREE_ID_ALLOC) + bch2_mark_key(c, bkey_i_to_s_c(j->k), + true, 0, NULL, 0, 0); + percpu_down_write(&c->mark_lock); bch2_dev_usage_from_buckets(c); percpu_up_write(&c->mark_lock); @@ -356,86 +310,32 @@ err: return ret; } -static int __bch2_alloc_write_key(struct btree_trans *trans, struct bch_dev *ca, - size_t b, struct btree_iter *iter, - u64 *journal_seq, unsigned flags) +int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) { - struct bch_fs *c = trans->c; -#if 0 - __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; -#else - /* hack: */ - __BKEY_PADDED(k, 8) alloc_key; -#endif - struct bkey_i_alloc *a = bkey_alloc_init(&alloc_key.k); + struct btree_trans trans; + struct btree_iter *iter; + struct bucket_array *buckets; + struct bch_dev *ca; struct bucket *g; struct bucket_mark m, new; - int ret; + struct bkey_alloc_unpacked old_u, new_u; + __BKEY_PADDED(k, 8) alloc_key; /* hack: */ + struct bkey_i_alloc *a; + struct bkey_s_c k; + unsigned i; + size_t b; + int ret = 0; BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); - a->k.p = POS(ca->dev_idx, b); + bch2_trans_init(&trans, c); - bch2_btree_iter_set_pos(iter, a->k.p); - - ret = bch2_btree_iter_traverse(iter); - if (ret) - return ret; - - percpu_down_read_preempt_disable(&c->mark_lock); - g = bucket(ca, b); - m = READ_ONCE(g->mark); - - if (!m.dirty) { - percpu_up_read_preempt_enable(&c->mark_lock); - return 0; - } - - __alloc_write_key(a, g, m); - percpu_up_read_preempt_enable(&c->mark_lock); - - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i)); - - ret = bch2_trans_commit(trans, NULL, journal_seq, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE| - BTREE_INSERT_NOMARK| - flags); - if (ret) - return ret; - - new = m; - new.dirty = false; - atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter); - - if (ca->buckets_written) - set_bit(b, ca->buckets_written); - - return 0; -} - -int bch2_alloc_write(struct bch_fs *c, bool nowait, bool *wrote) -{ - struct bch_dev *ca; - unsigned i; - int ret = 0; - - *wrote = false; + iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); for_each_rw_member(ca, c, i) { - struct btree_trans trans; - struct btree_iter *iter; - struct bucket_array *buckets; - size_t b; - - bch2_trans_init(&trans, c); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - down_read(&ca->bucket_lock); +restart: buckets = bucket_array(ca); for (b = buckets->first_bucket; @@ -444,26 +344,78 @@ int bch2_alloc_write(struct bch_fs *c, bool nowait, bool *wrote) if (!buckets->b[b].mark.dirty) continue; - ret = __bch2_alloc_write_key(&trans, ca, b, iter, NULL, - nowait - ? 
BTREE_INSERT_NOWAIT - : 0); + bch2_btree_iter_set_pos(iter, POS(i, b)); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + old_u = bch2_alloc_unpack(k); + + percpu_down_read_preempt_disable(&c->mark_lock); + g = bucket(ca, b); + m = READ_ONCE(g->mark); + new_u = alloc_mem_to_key(g, m); + percpu_up_read_preempt_enable(&c->mark_lock); + + if (!m.dirty) + continue; + + if ((flags & BTREE_INSERT_LAZY_RW) && + percpu_ref_is_zero(&c->writes)) { + up_read(&ca->bucket_lock); + bch2_trans_unlock(&trans); + + ret = bch2_fs_read_write_early(c); + down_read(&ca->bucket_lock); + + if (ret) + goto err; + goto restart; + } + + a = bkey_alloc_init(&alloc_key.k); + a->k.p = iter->pos; + bch2_alloc_pack(a, new_u); + + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &a->k_i)); + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOMARK| + flags); +err: + if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) { + bch_err(c, "error %i writing alloc info", ret); + printk(KERN_CONT "dev %llu bucket %llu\n", + iter->pos.inode, iter->pos.offset); + printk(KERN_CONT "gen %u -> %u\n", old_u.gen, new_u.gen); +#define x(_name, _bits) printk(KERN_CONT #_name " %u -> %u\n", old_u._name, new_u._name); + BCH_ALLOC_FIELDS() +#undef x + } if (ret) break; + new = m; + new.dirty = false; + atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter); + + if (ca->buckets_written) + set_bit(b, ca->buckets_written); + bch2_trans_cond_resched(&trans); *wrote = true; } up_read(&ca->bucket_lock); - bch2_trans_exit(&trans); - if (ret) { percpu_ref_put(&ca->io_ref); break; } } + bch2_trans_exit(&trans); + return ret; } @@ -598,6 +550,9 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) unsigned long gc_count = c->gc_count; int ret = 0; + ca->allocator_state = ALLOCATOR_BLOCKED; + closure_wake_up(&c->freelist_wait); + while (1) { set_current_state(TASK_INTERRUPTIBLE); if (kthread_should_stop()) { @@ -620,6 +575,9 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) } __set_current_state(TASK_RUNNING); + ca->allocator_state = ALLOCATOR_RUNNING; + closure_wake_up(&c->freelist_wait); + return ret; } @@ -693,16 +651,16 @@ static inline int bucket_alloc_cmp(alloc_heap *h, struct alloc_heap_entry l, struct alloc_heap_entry r) { - return (l.key > r.key) - (l.key < r.key) ?: - (l.nr < r.nr) - (l.nr > r.nr) ?: - (l.bucket > r.bucket) - (l.bucket < r.bucket); + return cmp_int(l.key, r.key) ?: + cmp_int(r.nr, l.nr) ?: + cmp_int(l.bucket, r.bucket); } static inline int bucket_idx_cmp(const void *_l, const void *_r) { const struct alloc_heap_entry *l = _l, *r = _r; - return (l->bucket > r->bucket) - (l->bucket < r->bucket); + return cmp_int(l->bucket, r->bucket); } static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) @@ -916,6 +874,7 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bkey_i_alloc *a; struct bkey_alloc_unpacked u; + struct bucket *g; struct bucket_mark m; struct bkey_s_c k; bool invalidating_cached_data; @@ -935,7 +894,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, BUG_ON(!fifo_push(&ca->free_inc, b)); bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); - m = bucket(ca, b)->mark; spin_unlock(&c->freelist_lock); percpu_up_read_preempt_enable(&c->mark_lock); @@ -949,26 +907,25 @@ retry: if (ret) return ret; - if (k.k && k.k->type == KEY_TYPE_alloc) - u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v); - else - 
memset(&u, 0, sizeof(u)); - - invalidating_cached_data = m.cached_sectors != 0; - - //BUG_ON(u.dirty_sectors); - u.data_type = 0; - u.dirty_sectors = 0; - u.cached_sectors = 0; - u.read_time = c->bucket_clock[READ].hand; - u.write_time = c->bucket_clock[WRITE].hand; - /* * The allocator has to start before journal replay is finished - thus, * we have to trust the in memory bucket @m, not the version in the * btree: */ - u.gen = m.gen + 1; + percpu_down_read_preempt_disable(&c->mark_lock); + g = bucket(ca, b); + m = READ_ONCE(g->mark); + u = alloc_mem_to_key(g, m); + percpu_up_read_preempt_enable(&c->mark_lock); + + invalidating_cached_data = m.cached_sectors != 0; + + u.gen++; + u.data_type = 0; + u.dirty_sectors = 0; + u.cached_sectors = 0; + u.read_time = c->bucket_clock[READ].hand; + u.write_time = c->bucket_clock[WRITE].hand; a = bkey_alloc_init(&alloc_key.k); a->k.p = iter->pos; @@ -1119,14 +1076,14 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t fifo_pop(&ca->free_inc, bucket); closure_wake_up(&c->freelist_wait); - ca->allocator_blocked_full = false; + ca->allocator_state = ALLOCATOR_RUNNING; spin_unlock(&c->freelist_lock); goto out; } - if (!ca->allocator_blocked_full) { - ca->allocator_blocked_full = true; + if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) { + ca->allocator_state = ALLOCATOR_BLOCKED_FULL; closure_wake_up(&c->freelist_wait); } @@ -1184,6 +1141,7 @@ static int bch2_allocator_thread(void *arg) int ret; set_freezable(); + ca->allocator_state = ALLOCATOR_RUNNING; while (1) { cond_resched(); @@ -1242,9 +1200,6 @@ static int bch2_allocator_thread(void *arg) if (!nr || (nr < ALLOC_SCAN_BATCH(ca) && !fifo_full(&ca->free[RESERVE_MOVINGGC]))) { - ca->allocator_blocked = true; - closure_wake_up(&c->freelist_wait); - ret = wait_buckets_available(c, ca); if (ret) { up_read(&c->gc_lock); @@ -1253,7 +1208,6 @@ static int bch2_allocator_thread(void *arg) } } while (!nr); - ca->allocator_blocked = false; up_read(&c->gc_lock); pr_debug("%zu buckets to invalidate", nr); @@ -1266,6 +1220,8 @@ static int bch2_allocator_thread(void *arg) stop: pr_debug("alloc thread stopping (ret %i)", ret); + ca->allocator_state = ALLOCATOR_STOPPED; + closure_wake_up(&c->freelist_wait); return 0; } @@ -1457,7 +1413,8 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) { if (ca->alloc_thread) - closure_wait_event(&c->freelist_wait, ca->allocator_blocked_full); + closure_wait_event(&c->freelist_wait, + ca->allocator_state != ALLOCATOR_RUNNING); } /* stop allocator thread: */ @@ -1682,7 +1639,10 @@ int bch2_fs_allocator_start(struct bch_fs *c) * XXX: it's possible for this to deadlock waiting on journal reclaim, * since we're holding btree writes. What then? 
*/ - ret = bch2_alloc_write(c, true, &wrote); + ret = bch2_alloc_write(c, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_NOWAIT, &wrote); /* * If bch2_alloc_write() did anything, it may have used some diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index 65e9b373..b5462646 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -12,7 +12,9 @@ struct bkey_alloc_unpacked { #undef x }; -struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *); +struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); +void bch2_alloc_pack(struct bkey_i_alloc *, + const struct bkey_alloc_unpacked); #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) @@ -24,7 +26,8 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_alloc_to_text, \ } -int bch2_alloc_read(struct bch_fs *, struct list_head *); +struct journal_keys; +int bch2_alloc_read(struct bch_fs *, struct journal_keys *); int bch2_alloc_replay_key(struct bch_fs *, struct bkey_i *); static inline void bch2_wake_allocator(struct bch_dev *ca) @@ -64,7 +67,7 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_stop(struct bch_dev *); int bch2_dev_allocator_start(struct bch_dev *); -int bch2_alloc_write(struct bch_fs *, bool, bool *); +int bch2_alloc_write(struct bch_fs *, unsigned, bool *); int bch2_fs_allocator_start(struct bch_fs *); void bch2_fs_allocator_background_init(struct bch_fs *); diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 6a68376d..d6dc3bd4 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -444,8 +444,12 @@ struct bch_dev { * XXX: this should be an enum for allocator state, so as to include * error state */ - bool allocator_blocked; - bool allocator_blocked_full; + enum { + ALLOCATOR_STOPPED, + ALLOCATOR_RUNNING, + ALLOCATOR_BLOCKED, + ALLOCATOR_BLOCKED_FULL, + } allocator_state; alloc_heap alloc_heap; @@ -638,7 +642,10 @@ struct bch_fs { struct percpu_rw_semaphore mark_lock; + seqcount_t usage_lock; + struct bch_fs_usage *usage_base; struct bch_fs_usage __percpu *usage[2]; + struct bch_fs_usage __percpu *usage_gc; /* single element mempool: */ struct mutex usage_scratch_lock; @@ -831,7 +838,7 @@ static inline s64 bch2_current_time(struct bch_fs *c) { struct timespec64 now; - ktime_get_real_ts64(&now); + ktime_get_coarse_real_ts64(&now); return timespec_to_bch2_time(c, now); } diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index d390ac86..be6acec1 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1295,6 +1295,7 @@ enum bch_sb_features { enum bch_sb_compat { BCH_COMPAT_FEAT_ALLOC_INFO = 0, + BCH_COMPAT_FEAT_ALLOC_METADATA = 1, }; /* options: */ diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c index 40ce33a4..f1ddd189 100644 --- a/libbcachefs/bkey.c +++ b/libbcachefs/bkey.c @@ -1020,7 +1020,7 @@ static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, r_v = *r; } - return (l_v > r_v) - (l_v < r_v); + return cmp_int(l_v, r_v); } #endif diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 15397099..b52628be 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -208,8 +208,8 @@ void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); static __always_inline int bversion_cmp(struct bversion l, struct bversion r) { - return (l.hi > r.hi) - (l.hi < r.hi) ?: - (l.lo > r.lo) - (l.lo < r.lo); + return 
cmp_int(l.hi, r.hi) ?: + cmp_int(l.lo, r.lo); } #define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h index 8bc2fdfd..74b962a0 100644 --- a/libbcachefs/bset.h +++ b/libbcachefs/bset.h @@ -449,7 +449,7 @@ static inline int bkey_iter_cmp(struct btree *b, { return bkey_cmp_packed(b, l, r) ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) - ?: (l > r) - (l < r); + ?: cmp_int(l, r); } static inline int btree_node_iter_cmp(struct btree *b, diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 3feea91e..9f0de5cd 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -18,9 +18,9 @@ #include "error.h" #include "extents.h" #include "journal.h" -#include "journal_io.h" #include "keylist.h" #include "move.h" +#include "recovery.h" #include "replicas.h" #include "super-io.h" @@ -207,7 +207,10 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, struct btree_iter *iter; struct btree *b; struct range_checks r; - unsigned depth = btree_node_type_needs_gc(btree_id) ? 0 : 1; + unsigned depth = metadata_only ? 1 + : expensive_debug_checks(c) ? 0 + : !btree_node_type_needs_gc(btree_id) ? 1 + : 0; u8 max_stale; int ret = 0; @@ -215,17 +218,6 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); - /* - * if expensive_debug_checks is on, run range_checks on all leaf nodes: - * - * and on startup, we have to read every btree node (XXX: only if it was - * an unclean shutdown) - */ - if (metadata_only) - depth = 1; - else if (initial || expensive_debug_checks(c)) - depth = 0; - btree_node_range_checks_init(&r, depth); __for_each_btree_node(&trans, iter, btree_id, POS_MIN, @@ -278,11 +270,40 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) (int) btree_id_to_gc_phase(r); } -static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal, +static int mark_journal_key(struct bch_fs *c, enum btree_id id, + struct bkey_i *insert) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + u8 max_stale; + int ret = 0; + + ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true); + if (ret) + return ret; + + bch2_trans_init(&trans, c); + + for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k), + BTREE_ITER_SLOTS, k, ret) { + percpu_down_read_preempt_disable(&c->mark_lock); + ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL, + BCH_BUCKET_MARK_GC| + BCH_BUCKET_MARK_NOATOMIC); + percpu_up_read_preempt_enable(&c->mark_lock); + + if (!ret) + break; + } + + return bch2_trans_exit(&trans) ?: ret; +} + +static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, bool initial, bool metadata_only) { enum btree_id ids[BTREE_ID_NR]; - u8 max_stale; unsigned i; for (i = 0; i < BTREE_ID_NR; i++) @@ -297,22 +318,16 @@ static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal, if (ret) return ret; - if (journal && !metadata_only && + if (journal_keys && !metadata_only && btree_node_type_needs_gc(type)) { - struct bkey_i *k, *n; - struct jset_entry *j; - struct journal_replay *r; + struct journal_key *j; int ret; - list_for_each_entry(r, journal, list) - for_each_jset_key(k, n, j, &r->j) { - if (type == __btree_node_type(j->level, j->btree_id)) { - ret = bch2_gc_mark_key(c, - bkey_i_to_s_c(k), - &max_stale, initial); - if (ret) - return ret; - } + for_each_journal_key(*journal_keys, j) + if (j->btree_id == id) { + ret = mark_journal_key(c, id, j->k); + if (ret) 
+ return ret; } } } @@ -477,8 +492,8 @@ static void bch2_gc_free(struct bch_fs *c) ca->usage[1] = NULL; } - free_percpu(c->usage[1]); - c->usage[1] = NULL; + free_percpu(c->usage_gc); + c->usage_gc = NULL; } static int bch2_gc_done(struct bch_fs *c, @@ -574,14 +589,16 @@ static int bch2_gc_done(struct bch_fs *c, } }; + bch2_fs_usage_acc_to_base(c, 0); + bch2_fs_usage_acc_to_base(c, 1); + bch2_dev_usage_from_buckets(c); { unsigned nr = fs_usage_u64s(c); - struct bch_fs_usage *dst = (void *) - bch2_acc_percpu_u64s((void *) c->usage[0], nr); + struct bch_fs_usage *dst = c->usage_base; struct bch_fs_usage *src = (void *) - bch2_acc_percpu_u64s((void *) c->usage[1], nr); + bch2_acc_percpu_u64s((void *) c->usage_gc, nr); copy_fs_field(hidden, "hidden"); copy_fs_field(btree, "btree"); @@ -634,11 +651,11 @@ static int bch2_gc_start(struct bch_fs *c, */ gc_pos_set(c, gc_phase(GC_PHASE_START)); - BUG_ON(c->usage[1]); + BUG_ON(c->usage_gc); - c->usage[1] = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), + c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), sizeof(u64), GFP_KERNEL); - if (!c->usage[1]) + if (!c->usage_gc) return -ENOMEM; for_each_member_device(ca, c, i) { @@ -705,7 +722,7 @@ static int bch2_gc_start(struct bch_fs *c, * move around - if references move backwards in the ordering GC * uses, GC could skip past them */ -int bch2_gc(struct bch_fs *c, struct list_head *journal, +int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, bool initial, bool metadata_only) { struct bch_dev *ca; @@ -726,7 +743,7 @@ again: bch2_mark_superblocks(c); - ret = bch2_gc_btrees(c, journal, initial, metadata_only); + ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only); if (ret) goto out; @@ -757,11 +774,17 @@ out: ret = -EINVAL; } - percpu_down_write(&c->mark_lock); + if (!ret) { + bch2_journal_block(&c->journal); - if (!ret) + percpu_down_write(&c->mark_lock); ret = bch2_gc_done(c, initial, metadata_only); + bch2_journal_unblock(&c->journal); + } else { + percpu_down_write(&c->mark_lock); + } + /* Indicates that gc is no longer in progress: */ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index 9e067deb..6522ebaf 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -4,7 +4,9 @@ #include "btree_types.h" void bch2_coalesce(struct bch_fs *); -int bch2_gc(struct bch_fs *, struct list_head *, bool, bool); + +struct journal_keys; +int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 33cbc2ff..49ddf05c 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -1002,7 +1002,7 @@ retry_all: goto retry_all; } - ret = btree_trans_has_multiple_iters(trans) ? -EINTR : 0; + ret = hweight64(trans->iters_live) > 1 ? 
-EINTR : 0; out: bch2_btree_cache_cannibalize_unlock(c); return ret; @@ -1100,8 +1100,6 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) if (unlikely(ret)) ret = __btree_iter_traverse_all(iter->trans, iter, ret); - BUG_ON(ret == -EINTR && !btree_trans_has_multiple_iters(iter->trans)); - return ret; } diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index c05b2dac..a46a6a4e 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -239,12 +239,16 @@ static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, : bch2_btree_iter_next(iter); } -#define for_each_btree_key(_trans, _iter, _btree_id, _start, _flags, _k)\ - for (iter = bch2_trans_get_iter((_trans), (_btree_id), \ - (_start), (_flags)), \ - (_k) = __bch2_btree_iter_peek(_iter, _flags); \ - !IS_ERR_OR_NULL((_k).k); \ - (_k) = __bch2_btree_iter_next(_iter, _flags)) +#define for_each_btree_key(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for ((_ret) = PTR_ERR_OR_ZERO((_iter) = \ + bch2_trans_get_iter((_trans), (_btree_id), \ + (_start), (_flags))) ?: \ + PTR_ERR_OR_ZERO(((_k) = \ + __bch2_btree_iter_peek(_iter, _flags)).k); \ + !ret && (_k).k; \ + (_ret) = PTR_ERR_OR_ZERO(((_k) = \ + __bch2_btree_iter_next(_iter, _flags)).k)) #define for_each_btree_key_continue(_iter, _flags, _k) \ for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index a995efc7..ae273ab7 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -6,6 +6,7 @@ #include #include "bkey_methods.h" +#include "buckets_types.h" #include "journal_types.h" struct open_bucket; @@ -260,6 +261,7 @@ struct btree_insert_entry { }; bool deferred; + bool triggered; }; #define BTREE_ITER_MAX 64 @@ -297,6 +299,8 @@ struct btree_trans { struct btree_iter iters_onstack[2]; struct btree_insert_entry updates_onstack[6]; + + struct replicas_delta_list fs_usage_deltas; }; #define BTREE_FLAG(flag) \ diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 944b6c24..be11efdc 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -42,7 +42,11 @@ enum { __BTREE_INSERT_USE_ALLOC_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, __BTREE_INSERT_JOURNAL_RESERVED, + __BTREE_INSERT_NOMARK_INSERT, + __BTREE_INSERT_NOMARK_OVERWRITES, __BTREE_INSERT_NOMARK, + __BTREE_INSERT_MARK_INMEM, + __BTREE_INSERT_NO_CLEAR_REPLICAS, __BTREE_INSERT_NOWAIT, __BTREE_INSERT_GC_LOCK_HELD, __BCH_HASH_SET_MUST_CREATE, @@ -75,9 +79,20 @@ enum { #define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) -/* Don't call bch2_mark_key: */ +/* Don't mark new key, just overwrites: */ +#define BTREE_INSERT_NOMARK_INSERT (1 << __BTREE_INSERT_NOMARK_INSERT) + +/* Don't mark overwrites, just new key: */ +#define BTREE_INSERT_NOMARK_OVERWRITES (1 << __BTREE_INSERT_NOMARK_OVERWRITES) + +/* Don't call mark new key at all: */ #define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK) +/* Don't mark transactionally: */ +#define BTREE_INSERT_MARK_INMEM (1 << __BTREE_INSERT_MARK_INMEM) + +#define BTREE_INSERT_NO_CLEAR_REPLICAS (1 << __BTREE_INSERT_NO_CLEAR_REPLICAS) + /* Don't block on allocation failure (for new btree nodes: */ #define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) #define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 19ba667b..fb6bf79a 100644 --- a/libbcachefs/btree_update_interior.c +++ 
b/libbcachefs/btree_update_interior.c @@ -1084,7 +1084,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) bch2_btree_node_free_index(as, NULL, bkey_i_to_s_c(&old->key), fs_usage); - bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res); + bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0); bch2_fs_usage_scratch_put(c, fs_usage); percpu_up_read(&c->mark_lock); @@ -1189,7 +1189,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b bkey_disassemble(b, k, &tmp), fs_usage); - bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res); + bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0); bch2_fs_usage_scratch_put(c, fs_usage); percpu_up_read(&c->mark_lock); @@ -2003,7 +2003,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bch2_btree_node_free_index(as, NULL, bkey_i_to_s_c(&b->key), fs_usage); - bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res); + bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0); bch2_fs_usage_scratch_put(c, fs_usage); percpu_up_read(&c->mark_lock); diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index ce1fc29d..d052ca54 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -54,7 +54,7 @@ static void btree_trans_unlock_write(struct btree_trans *trans) static inline int btree_trans_cmp(struct btree_insert_entry l, struct btree_insert_entry r) { - return (l.deferred > r.deferred) - (l.deferred < r.deferred) ?: + return cmp_int(l.deferred, r.deferred) ?: btree_iter_cmp(l.iter, r.iter); } @@ -524,6 +524,22 @@ static inline void do_btree_insert_one(struct btree_trans *trans, btree_insert_key_deferred(trans, insert); } +static inline bool update_triggers_transactional(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + return likely(!(trans->flags & BTREE_INSERT_MARK_INMEM)) && + (i->iter->btree_id == BTREE_ID_EXTENTS || + i->iter->btree_id == BTREE_ID_INODES); +} + +static inline bool update_has_triggers(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + return likely(!(trans->flags & BTREE_INSERT_NOMARK)) && + !i->deferred && + btree_node_type_needs_gc(i->iter->btree_id); +} + /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ @@ -536,28 +552,26 @@ static inline int do_btree_insert_at(struct btree_trans *trans, struct btree_iter *linked; int ret; + if (likely(!(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS))) { + memset(&trans->fs_usage_deltas.fs_usage, 0, + sizeof(trans->fs_usage_deltas.fs_usage)); + trans->fs_usage_deltas.top = trans->fs_usage_deltas.d; + } + trans_for_each_update_iter(trans, i) BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); + trans_for_each_update_iter(trans, i) + if (update_has_triggers(trans, i) && + update_triggers_transactional(trans, i)) { + ret = bch2_trans_mark_update(trans, i, + &trans->fs_usage_deltas); + if (ret) + return ret; + } + btree_trans_lock_write(c, trans); - trans_for_each_update_iter(trans, i) { - if (i->deferred || - !btree_node_type_needs_gc(i->iter->btree_id)) - continue; - - if (!fs_usage) { - percpu_down_read(&c->mark_lock); - fs_usage = bch2_fs_usage_scratch_get(c); - } - - if (!bch2_bkey_replicas_marked_locked(c, - bkey_i_to_s_c(i->k), true)) { - ret = BTREE_INSERT_NEED_MARK_REPLICAS; - goto out; - } - } - if (race_fault()) { ret = -EINTR; trans_restart(" (race)"); @@ -573,6 +587,23 @@ static inline int do_btree_insert_at(struct btree_trans *trans, if (ret) goto out; + 
trans_for_each_update_iter(trans, i) { + if (i->deferred || + !btree_node_type_needs_gc(i->iter->btree_id)) + continue; + + if (!fs_usage) { + percpu_down_read(&c->mark_lock); + fs_usage = bch2_fs_usage_scratch_get(c); + } + + if (!bch2_bkey_replicas_marked_locked(c, + bkey_i_to_s_c(i->k), true)) { + ret = BTREE_INSERT_NEED_MARK_REPLICAS; + goto out; + } + } + /* * Don't get journal reservation until after we know insert will * succeed: @@ -602,16 +633,22 @@ static inline int do_btree_insert_at(struct btree_trans *trans, } trans_for_each_update_iter(trans, i) - bch2_mark_update(trans, i, fs_usage, 0); - if (fs_usage) - bch2_trans_fs_usage_apply(trans, fs_usage); + if (update_has_triggers(trans, i) && + !update_triggers_transactional(trans, i)) + bch2_mark_update(trans, i, fs_usage, 0); - if (unlikely(c->gc_pos.phase)) { + if (fs_usage) { + bch2_replicas_delta_list_apply(c, fs_usage, + &trans->fs_usage_deltas); + bch2_trans_fs_usage_apply(trans, fs_usage); + } + + if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && + unlikely(c->gc_pos.phase)) trans_for_each_update_iter(trans, i) if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) bch2_mark_update(trans, i, NULL, BCH_BUCKET_MARK_GC); - } trans_for_each_update(trans, i) do_btree_insert_one(trans, i); @@ -639,6 +676,19 @@ int bch2_trans_commit_error(struct btree_trans *trans, { struct bch_fs *c = trans->c; unsigned flags = trans->flags; + struct btree_insert_entry *src, *dst; + + src = dst = trans->updates; + + while (src < trans->updates + trans->nr_updates) { + if (!src->triggered) { + *dst = *src; + dst++; + } + src++; + } + + trans->nr_updates = dst - trans->updates; /* * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree @@ -796,6 +846,7 @@ int bch2_trans_commit(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; + unsigned orig_mem_top = trans->mem_top; int ret = 0; if (!trans->nr_updates) @@ -873,8 +924,16 @@ out_noupdates: return ret; err: ret = bch2_trans_commit_error(trans, i, ret); - if (!ret) + + /* can't loop if it was passed in and we changed it: */ + if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret) + ret = -EINTR; + + if (!ret) { + /* free memory used by triggers, they'll be reexecuted: */ + trans->mem_top = orig_mem_top; goto retry; + } goto out; } @@ -957,6 +1016,7 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, int ret = 0; bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); @@ -1002,5 +1062,6 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, } bch2_trans_exit(&trans); + BUG_ON(ret == -EINTR); return ret; } diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 4fa131a1..58f25894 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -119,8 +119,10 @@ void bch2_fs_usage_initialize(struct bch_fs *c) unsigned i; percpu_down_write(&c->mark_lock); - usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0], - fs_usage_u64s(c)); + usage = c->usage_base; + + bch2_fs_usage_acc_to_base(c, 0); + bch2_fs_usage_acc_to_base(c, 1); for (i = 0; i < BCH_REPLICAS_MAX; i++) usage->reserved += usage->persistent_reserved[i]; @@ -188,12 +190,40 @@ struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) return ret; } +static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, + unsigned journal_seq, + bool gc) +{ + return this_cpu_ptr(gc + ? 
c->usage_gc + : c->usage[journal_seq & 1]); +} + +u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) +{ + ssize_t offset = v - (u64 *) c->usage_base; + unsigned seq; + u64 ret; + + BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); + percpu_rwsem_assert_held(&c->mark_lock); + + do { + seq = read_seqcount_begin(&c->usage_lock); + ret = *v + + percpu_u64_get((u64 __percpu *) c->usage[0] + offset) + + percpu_u64_get((u64 __percpu *) c->usage[1] + offset); + } while (read_seqcount_retry(&c->usage_lock, seq)); + + return ret; +} + struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c) { struct bch_fs_usage *ret; - unsigned v, u64s = fs_usage_u64s(c); + unsigned seq, v, u64s = fs_usage_u64s(c); retry: - ret = kzalloc(u64s * sizeof(u64), GFP_NOFS); + ret = kmalloc(u64s * sizeof(u64), GFP_NOFS); if (unlikely(!ret)) return NULL; @@ -207,11 +237,70 @@ retry: goto retry; } - acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s); + do { + seq = read_seqcount_begin(&c->usage_lock); + memcpy(ret, c->usage_base, u64s * sizeof(u64)); + acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s); + acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[1], u64s); + } while (read_seqcount_retry(&c->usage_lock, seq)); return ret; } +void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) +{ + unsigned u64s = fs_usage_u64s(c); + + BUG_ON(idx >= 2); + + write_seqcount_begin(&c->usage_lock); + + acc_u64s_percpu((u64 *) c->usage_base, + (u64 __percpu *) c->usage[idx], u64s); + percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); + + write_seqcount_end(&c->usage_lock); +} + +void bch2_fs_usage_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_fs_usage *fs_usage) +{ + unsigned i; + + pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity); + + pr_buf(out, "hidden:\t\t\t\t%llu\n", + fs_usage->hidden); + pr_buf(out, "data:\t\t\t\t%llu\n", + fs_usage->data); + pr_buf(out, "cached:\t\t\t\t%llu\n", + fs_usage->cached); + pr_buf(out, "reserved:\t\t\t%llu\n", + fs_usage->reserved); + pr_buf(out, "nr_inodes:\t\t\t%llu\n", + fs_usage->nr_inodes); + pr_buf(out, "online reserved:\t\t%llu\n", + fs_usage->online_reserved); + + for (i = 0; + i < ARRAY_SIZE(fs_usage->persistent_reserved); + i++) { + pr_buf(out, "%u replicas:\n", i + 1); + pr_buf(out, "\treserved:\t\t%llu\n", + fs_usage->persistent_reserved[i]); + } + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + + pr_buf(out, "\t"); + bch2_replicas_entry_to_text(out, e); + pr_buf(out, ":\t%llu\n", fs_usage->replicas[i]); + } +} + #define RESERVE_FACTOR 6 static u64 reserve_factor(u64 r) @@ -241,17 +330,17 @@ __bch2_fs_usage_read_short(struct bch_fs *c) u64 data, reserved; ret.capacity = c->capacity - - percpu_u64_get(&c->usage[0]->hidden); + bch2_fs_usage_read_one(c, &c->usage_base->hidden); - data = percpu_u64_get(&c->usage[0]->data) + - percpu_u64_get(&c->usage[0]->btree); - reserved = percpu_u64_get(&c->usage[0]->reserved) + - percpu_u64_get(&c->usage[0]->online_reserved); + data = bch2_fs_usage_read_one(c, &c->usage_base->data) + + bch2_fs_usage_read_one(c, &c->usage_base->btree); + reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + + bch2_fs_usage_read_one(c, &c->usage_base->online_reserved); ret.used = min(ret.capacity, data + reserve_factor(reserved)); ret.free = ret.capacity - ret.used; - ret.nr_inodes = percpu_u64_get(&c->usage[0]->nr_inodes); + ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); return ret; } @@ -300,7 +389,8 @@ static 
bool bucket_became_unavailable(struct bucket_mark old, int bch2_fs_usage_apply(struct bch_fs *c, struct bch_fs_usage *fs_usage, - struct disk_reservation *disk_res) + struct disk_reservation *disk_res, + unsigned journal_seq) { s64 added = fs_usage->data + fs_usage->reserved; s64 should_not_have_added; @@ -326,7 +416,7 @@ int bch2_fs_usage_apply(struct bch_fs *c, } preempt_disable(); - acc_u64s((u64 *) this_cpu_ptr(c->usage[0]), + acc_u64s((u64 *) fs_usage_ptr(c, journal_seq, false), (u64 *) fs_usage, fs_usage_u64s(c)); preempt_enable(); @@ -391,27 +481,23 @@ void bch2_dev_usage_from_buckets(struct bch_fs *c) { struct bch_dev *ca; struct bucket_mark old = { .v.counter = 0 }; - struct bch_fs_usage *fs_usage; struct bucket_array *buckets; struct bucket *g; unsigned i; int cpu; - percpu_u64_set(&c->usage[0]->hidden, 0); + c->usage_base->hidden = 0; for_each_member_device(ca, c, i) { for_each_possible_cpu(cpu) memset(per_cpu_ptr(ca->usage[0], cpu), 0, sizeof(*ca->usage[0])); - preempt_disable(); - fs_usage = this_cpu_ptr(c->usage[0]); buckets = bucket_array(ca); for_each_bucket(g, buckets) - bch2_dev_usage_update(c, ca, fs_usage, + bch2_dev_usage_update(c, ca, c->usage_base, old, g->mark, false); - preempt_enable(); } } @@ -475,7 +561,7 @@ static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, struct bucket_mark *ret, bool gc) { - struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); + struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); struct bucket *g = __bucket(ca, b, gc); struct bucket_mark old, new; @@ -514,7 +600,7 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator, bool gc) { - struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); + struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); struct bucket *g = __bucket(ca, b, gc); struct bucket_mark old, new; @@ -556,23 +642,24 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, if (flags & BCH_BUCKET_MARK_GC) return 0; - u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v); ca = bch_dev_bkey_exists(c, k.k->p.inode); - g = __bucket(ca, k.k->p.offset, gc); - /* - * this should currently only be getting called from the bucket - * invalidate path: - */ - BUG_ON(u.dirty_sectors); - BUG_ON(u.cached_sectors); - BUG_ON(!g->mark.owned_by_allocator); + if (k.k->p.offset >= ca->mi.nbuckets) + return 0; + + g = __bucket(ca, k.k->p.offset, gc); + u = bch2_alloc_unpack(k); old = bucket_data_cmpxchg(c, ca, fs_usage, g, m, ({ m.gen = u.gen; m.data_type = u.data_type; m.dirty_sectors = u.dirty_sectors; m.cached_sectors = u.cached_sectors; + + if (!(flags & BCH_BUCKET_MARK_GC)) { + m.journal_seq_valid = 1; + m.journal_seq = journal_seq; + } })); g->io_time[READ] = u.read_time; @@ -580,6 +667,11 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, g->oldest_gen = u.oldest_gen; g->gen_valid = 1; + /* + * need to know if we're getting called from the invalidate path or + * not: + */ + if (old.cached_sectors) { update_cached_sectors(c, fs_usage, ca->dev_idx, -old.cached_sectors); @@ -622,7 +714,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, old.dirty_sectors, sectors); if (c) - bch2_dev_usage_update(c, ca, this_cpu_ptr(c->usage[gc]), + bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc), old, new, gc); return 0; @@ -665,11 +757,34 @@ static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, } } -/* - * Checking against gc's position has to be done here, inside the cmpxchg() - * loop, to avoid 
racing with the start of gc clearing all the marks - GC does - * that with the gc pos seqlock held. - */ +static void bucket_set_stripe(struct bch_fs *c, + const struct bch_stripe *v, + bool enabled, + struct bch_fs_usage *fs_usage, + u64 journal_seq, + bool gc) +{ + unsigned i; + + for (i = 0; i < v->nr_blocks; i++) { + const struct bch_extent_ptr *ptr = v->ptrs + i; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, gc); + struct bucket_mark new, old; + + BUG_ON(ptr_stale(ca, ptr)); + + old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ + new.dirty = true; + new.stripe = enabled; + if (journal_seq) { + new.journal_seq_valid = 1; + new.journal_seq = journal_seq; + } + })); + } +} + static bool bch2_mark_pointer(struct bch_fs *c, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type, @@ -679,8 +794,7 @@ static bool bch2_mark_pointer(struct bch_fs *c, { struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - size_t b = PTR_BUCKET_NR(ca, &p.ptr); - struct bucket *g = __bucket(ca, b, gc); + struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); bool overflow; u64 v; @@ -849,35 +963,6 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, return 0; } -static void bucket_set_stripe(struct bch_fs *c, - const struct bch_stripe *v, - bool enabled, - struct bch_fs_usage *fs_usage, - u64 journal_seq, - bool gc) -{ - unsigned i; - - for (i = 0; i < v->nr_blocks; i++) { - const struct bch_extent_ptr *ptr = v->ptrs + i; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - size_t b = PTR_BUCKET_NR(ca, ptr); - struct bucket *g = __bucket(ca, b, gc); - struct bucket_mark new, old; - - BUG_ON(ptr_stale(ca, ptr)); - - old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ - new.dirty = true; - new.stripe = enabled; - if (journal_seq) { - new.journal_seq_valid = 1; - new.journal_seq = journal_seq; - } - })); - } -} - static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, bool inserting, struct bch_fs_usage *fs_usage, @@ -909,14 +994,7 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, m->nr_blocks = s.v->nr_blocks; m->nr_redundant = s.v->nr_redundant; - memset(&m->r, 0, sizeof(m->r)); - - m->r.e.data_type = BCH_DATA_USER; - m->r.e.nr_devs = s.v->nr_blocks; - m->r.e.nr_required = s.v->nr_blocks - s.v->nr_redundant; - - for (i = 0; i < s.v->nr_blocks; i++) - m->r.e.devs[i] = s.v->ptrs[i].dev; + bch2_bkey_to_replicas(&m->r.e, k); /* * XXX: account for stripes somehow here @@ -958,7 +1036,7 @@ int bch2_mark_key_locked(struct bch_fs *c, preempt_disable(); if (!fs_usage || gc) - fs_usage = this_cpu_ptr(c->usage[gc]); + fs_usage = fs_usage_ptr(c, journal_seq, gc); switch (k.k->type) { case KEY_TYPE_alloc: @@ -1019,73 +1097,102 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, return ret; } -void bch2_mark_update(struct btree_trans *trans, - struct btree_insert_entry *insert, - struct bch_fs_usage *fs_usage, - unsigned flags) +inline int bch2_mark_overwrite(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c old, + struct bkey_i *new, + struct bch_fs_usage *fs_usage, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree *b = iter->l[0].b; + s64 sectors = 0; + + if (btree_node_is_extents(b) + ? 
bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 + : bkey_cmp(new->k.p, old.k->p)) + return 0; + + if (btree_node_is_extents(b)) { + switch (bch2_extent_overlap(&new->k, old.k)) { + case BCH_EXTENT_OVERLAP_ALL: + sectors = -((s64) old.k->size); + break; + case BCH_EXTENT_OVERLAP_BACK: + sectors = bkey_start_offset(&new->k) - + old.k->p.offset; + break; + case BCH_EXTENT_OVERLAP_FRONT: + sectors = bkey_start_offset(old.k) - + new->k.p.offset; + break; + case BCH_EXTENT_OVERLAP_MIDDLE: + sectors = old.k->p.offset - new->k.p.offset; + BUG_ON(sectors <= 0); + + bch2_mark_key_locked(c, old, true, sectors, + fs_usage, trans->journal_res.seq, + flags); + + sectors = bkey_start_offset(&new->k) - + old.k->p.offset; + break; + } + + BUG_ON(sectors >= 0); + } + + return bch2_mark_key_locked(c, old, false, sectors, fs_usage, + trans->journal_res.seq, flags) ?: 1; +} + +int bch2_mark_update(struct btree_trans *trans, + struct btree_insert_entry *insert, + struct bch_fs_usage *fs_usage, + unsigned flags) { struct bch_fs *c = trans->c; struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; struct bkey_packed *_k; + int ret = 0; if (!btree_node_type_needs_gc(iter->btree_id)) - return; + return 0; - if (!(trans->flags & BTREE_INSERT_NOMARK)) + if (!(trans->flags & BTREE_INSERT_NOMARK_INSERT)) bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true, bpos_min(insert->k->k.p, b->key.k.p).offset - bkey_start_offset(&insert->k->k), fs_usage, trans->journal_res.seq, flags); + if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) + return 0; + + /* + * For non extents, we only mark the new key, not the key being + * overwritten - unless we're actually deleting: + */ + if ((iter->btree_id == BTREE_ID_ALLOC || + iter->btree_id == BTREE_ID_EC) && + !bkey_deleted(&insert->k->k)) + return 0; + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, KEY_TYPE_discard))) { struct bkey unpacked; - struct bkey_s_c k; - s64 sectors = 0; + struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); - k = bkey_disassemble(b, _k, &unpacked); - - if (btree_node_is_extents(b) - ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0 - : bkey_cmp(insert->k->k.p, k.k->p)) + ret = bch2_mark_overwrite(trans, iter, k, insert->k, + fs_usage, flags); + if (ret <= 0) break; - if (btree_node_is_extents(b)) { - switch (bch2_extent_overlap(&insert->k->k, k.k)) { - case BCH_EXTENT_OVERLAP_ALL: - sectors = -((s64) k.k->size); - break; - case BCH_EXTENT_OVERLAP_BACK: - sectors = bkey_start_offset(&insert->k->k) - - k.k->p.offset; - break; - case BCH_EXTENT_OVERLAP_FRONT: - sectors = bkey_start_offset(k.k) - - insert->k->k.p.offset; - break; - case BCH_EXTENT_OVERLAP_MIDDLE: - sectors = k.k->p.offset - insert->k->k.p.offset; - BUG_ON(sectors <= 0); - - bch2_mark_key_locked(c, k, true, sectors, - fs_usage, trans->journal_res.seq, - flags); - - sectors = bkey_start_offset(&insert->k->k) - - k.k->p.offset; - break; - } - - BUG_ON(sectors >= 0); - } - - bch2_mark_key_locked(c, k, false, sectors, - fs_usage, trans->journal_res.seq, flags); - bch2_btree_node_iter_advance(&node_iter, b); } + + return ret; } void bch2_trans_fs_usage_apply(struct btree_trans *trans, @@ -1097,7 +1204,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, u64 disk_res_sectors = trans->disk_res ? 
trans->disk_res->sectors : 0; char buf[200]; - if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res) || + if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res, + trans->journal_res.seq) || warned_disk_usage || xchg(&warned_disk_usage, 1)) return; @@ -1136,6 +1244,391 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, } } +/* trans_mark: */ + +static inline void update_replicas_list(struct replicas_delta_list *d, + struct bch_replicas_entry *r, + s64 sectors) +{ + d->top->delta = sectors; + memcpy(&d->top->r, r, replicas_entry_bytes(r)); + + d->top = (void *) d->top + replicas_entry_bytes(r) + 8; + + BUG_ON((void *) d->top > (void *) d->d + sizeof(d->pad)); +} + +static inline void update_cached_sectors_list(struct replicas_delta_list *d, + unsigned dev, s64 sectors) +{ + struct bch_replicas_padded r; + + bch2_replicas_entry_cached(&r.e, dev); + + update_replicas_list(d, &r.e, sectors); +} + +void bch2_replicas_delta_list_apply(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct replicas_delta_list *r) +{ + struct replicas_delta *d = r->d; + + acc_u64s((u64 *) fs_usage, + (u64 *) &r->fs_usage, sizeof(*fs_usage) / sizeof(u64)); + + while (d != r->top) { + BUG_ON((void *) d > (void *) r->top); + + update_replicas(c, fs_usage, &d->r, d->delta); + + d = (void *) d + replicas_entry_bytes(&d->r) + 8; + } +} + +static int trans_get_key(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos, + struct btree_insert_entry **insert, + struct btree_iter **iter, + struct bkey_s_c *k) +{ + unsigned i; + int ret; + + *insert = NULL; + + for (i = 0; i < trans->nr_updates; i++) + if (!trans->updates[i].deferred && + trans->updates[i].iter->btree_id == btree_id && + !bkey_cmp(pos, trans->updates[i].iter->pos)) { + *insert = &trans->updates[i]; + *iter = (*insert)->iter; + *k = bkey_i_to_s_c((*insert)->k); + return 0; + } + + *iter = __bch2_trans_get_iter(trans, btree_id, pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, 0); + if (IS_ERR(*iter)) + return PTR_ERR(*iter); + + *k = bch2_btree_iter_peek_slot(*iter); + ret = bkey_err(*k); + if (ret) + bch2_trans_iter_put(trans, *iter); + return ret; +} + +static int trans_update_key(struct btree_trans *trans, + struct btree_insert_entry **insert, + struct btree_iter *iter, + struct bkey_s_c k, + unsigned extra_u64s) +{ + struct bkey_i *new_k; + + if (*insert) + return 0; + + new_k = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + + extra_u64s * sizeof(u64)); + if (IS_ERR(new_k)) + return PTR_ERR(new_k); + + *insert = bch2_trans_update(trans, ((struct btree_insert_entry) { + .iter = iter, + .k = new_k, + .triggered = true, + })); + + bkey_reassemble((*insert)->k, k); + return 0; +} + +static int bch2_trans_mark_pointer(struct btree_trans *trans, + struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type, + struct replicas_delta_list *d) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct btree_insert_entry *insert; + struct btree_iter *iter; + struct bkey_s_c k; + struct bkey_alloc_unpacked u; + struct bkey_i_alloc *a; + bool overflow; + int ret; + + ret = trans_get_key(trans, BTREE_ID_ALLOC, + POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)), + &insert, &iter, &k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_alloc) { + bch_err_ratelimited(c, "pointer to nonexistent bucket %u:%zu", + p.ptr.dev, + PTR_BUCKET_NR(ca, &p.ptr)); + ret = -1; + goto out; + } + + u = bch2_alloc_unpack(k); + + if (gen_after(u.gen, p.ptr.gen)) { + ret = 1; + goto out; + } + + if (!p.ptr.cached) + 
overflow = checked_add(u.dirty_sectors, sectors); + else + overflow = checked_add(u.cached_sectors, sectors); + + u.data_type = u.dirty_sectors || u.cached_sectors + ? data_type : 0; + + bch2_fs_inconsistent_on(overflow, c, + "bucket sector count overflow: %u + %lli > U16_MAX", + !p.ptr.cached + ? u.dirty_sectors + : u.cached_sectors, sectors); + + ret = trans_update_key(trans, &insert, iter, k, 1); + if (ret) + goto out; + + a = bkey_alloc_init(insert->k); + a->k.p = iter->pos; + bch2_alloc_pack(a, u); +out: + bch2_trans_iter_put(trans, iter); + return ret; +} + +static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + struct bch_extent_stripe_ptr p, + s64 sectors, enum bch_data_type data_type, + struct replicas_delta_list *d) +{ + struct bch_replicas_padded r; + struct btree_insert_entry *insert; + struct btree_iter *iter; + struct bkey_s_c k; + struct bkey_s_stripe s; + unsigned nr_data; + s64 parity_sectors; + int ret = 0; + + BUG_ON(!sectors); + + ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), + &insert, &iter, &k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_stripe) { + bch_err_ratelimited(trans->c, + "pointer to nonexistent stripe %llu", + (u64) p.idx); + ret = -1; + goto out; + } + + ret = trans_update_key(trans, &insert, iter, k, 1); + if (ret) + goto out; + + s = bkey_i_to_s_stripe(insert->k); + + nr_data = s.v->nr_blocks - s.v->nr_redundant; + + parity_sectors = DIV_ROUND_UP(abs(sectors) * s.v->nr_redundant, nr_data); + + if (sectors < 0) + parity_sectors = -parity_sectors; + + stripe_blockcount_set(s.v, p.block, + stripe_blockcount_get(s.v, p.block) + + sectors + parity_sectors); + + bch2_bkey_to_replicas(&r.e, s.s_c); + + update_replicas_list(d, &r.e, sectors); +out: + bch2_trans_iter_put(trans, iter); + return ret; +} + +static int bch2_trans_mark_extent(struct btree_trans *trans, + struct bkey_s_c k, + s64 sectors, enum bch_data_type data_type, + struct replicas_delta_list *d) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bch_replicas_padded r; + s64 dirty_sectors = 0; + bool stale; + unsigned i; + int ret; + + r.e.data_type = data_type; + r.e.nr_devs = 0; + r.e.nr_required = 1; + + BUG_ON(!sectors); + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + s64 disk_sectors = data_type == BCH_DATA_BTREE + ? sectors + : ptr_disk_sectors_delta(p, sectors); + + ret = bch2_trans_mark_pointer(trans, p, disk_sectors, + data_type, d); + if (ret < 0) + return ret; + + stale = ret > 0; + + if (p.ptr.cached) { + if (disk_sectors && !stale) + update_cached_sectors_list(d, p.ptr.dev, + disk_sectors); + } else if (!p.ec_nr) { + dirty_sectors += disk_sectors; + r.e.devs[r.e.nr_devs++] = p.ptr.dev; + } else { + for (i = 0; i < p.ec_nr; i++) { + ret = bch2_trans_mark_stripe_ptr(trans, p.ec[i], + disk_sectors, data_type, d); + if (ret) + return ret; + } + + r.e.nr_required = 0; + } + } + + if (dirty_sectors) + update_replicas_list(d, &r.e, dirty_sectors); + + return 0; +} + +int bch2_trans_mark_key(struct btree_trans *trans, + struct bkey_s_c k, + bool inserting, s64 sectors, + struct replicas_delta_list *d) +{ + struct bch_fs *c = trans->c; + + switch (k.k->type) { + case KEY_TYPE_btree_ptr: + return bch2_trans_mark_extent(trans, k, inserting + ? 
c->opts.btree_node_size + : -c->opts.btree_node_size, + BCH_DATA_BTREE, d); + case KEY_TYPE_extent: + return bch2_trans_mark_extent(trans, k, + sectors, BCH_DATA_USER, d); + case KEY_TYPE_inode: + if (inserting) + d->fs_usage.nr_inodes++; + else + d->fs_usage.nr_inodes--; + return 0; + case KEY_TYPE_reservation: { + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + + sectors *= replicas; + replicas = clamp_t(unsigned, replicas, 1, + ARRAY_SIZE(d->fs_usage.persistent_reserved)); + + d->fs_usage.reserved += sectors; + d->fs_usage.persistent_reserved[replicas - 1] += sectors; + return 0; + } + default: + return 0; + } +} + +int bch2_trans_mark_update(struct btree_trans *trans, + struct btree_insert_entry *insert, + struct replicas_delta_list *d) +{ + struct btree_iter *iter = insert->iter; + struct btree *b = iter->l[0].b; + struct btree_node_iter node_iter = iter->l[0].iter; + struct bkey_packed *_k; + int ret; + + if (!btree_node_type_needs_gc(iter->btree_id)) + return 0; + + ret = bch2_trans_mark_key(trans, + bkey_i_to_s_c(insert->k), true, + bpos_min(insert->k->k.p, b->key.k.p).offset - + bkey_start_offset(&insert->k->k), d); + if (ret) + return ret; + + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, + KEY_TYPE_discard))) { + struct bkey unpacked; + struct bkey_s_c k; + s64 sectors = 0; + + k = bkey_disassemble(b, _k, &unpacked); + + if (btree_node_is_extents(b) + ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0 + : bkey_cmp(insert->k->k.p, k.k->p)) + break; + + if (btree_node_is_extents(b)) { + switch (bch2_extent_overlap(&insert->k->k, k.k)) { + case BCH_EXTENT_OVERLAP_ALL: + sectors = -((s64) k.k->size); + break; + case BCH_EXTENT_OVERLAP_BACK: + sectors = bkey_start_offset(&insert->k->k) - + k.k->p.offset; + break; + case BCH_EXTENT_OVERLAP_FRONT: + sectors = bkey_start_offset(k.k) - + insert->k->k.p.offset; + break; + case BCH_EXTENT_OVERLAP_MIDDLE: + sectors = k.k->p.offset - insert->k->k.p.offset; + BUG_ON(sectors <= 0); + + ret = bch2_trans_mark_key(trans, k, true, + sectors, d); + if (ret) + return ret; + + sectors = bkey_start_offset(&insert->k->k) - + k.k->p.offset; + break; + } + + BUG_ON(sectors >= 0); + } + + ret = bch2_trans_mark_key(trans, k, false, sectors, d); + if (ret) + return ret; + + bch2_btree_node_iter_advance(&node_iter, b); + } + + return 0; +} + /* Disk reservations: */ static u64 bch2_recalc_sectors_available(struct bch_fs *c) diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 1033398e..a32c25d8 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -99,7 +99,7 @@ static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, struct bucket_mark m; rcu_read_lock(); - m = READ_ONCE(bucket(ca, PTR_BUCKET_NR(ca, ptr))->mark); + m = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark); rcu_read_unlock(); return m; @@ -221,8 +221,15 @@ static inline unsigned fs_usage_u64s(struct bch_fs *c) void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *); struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *); +u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); + struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *); +void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); + +void bch2_fs_usage_to_text(struct printbuf *, + struct bch_fs *, struct bch_fs_usage *); + u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *); struct bch_fs_usage_short @@ -251,10 +258,22 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c, bool, s64, struct bch_fs_usage *, u64, unsigned); int 
bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, - struct disk_reservation *); + struct disk_reservation *, unsigned); -void bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, - struct bch_fs_usage *, unsigned); +int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, + struct bkey_s_c, struct bkey_i *, + struct bch_fs_usage *, unsigned); +int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, + struct bch_fs_usage *, unsigned); + +void bch2_replicas_delta_list_apply(struct bch_fs *, + struct bch_fs_usage *, + struct replicas_delta_list *); +int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, + bool, s64, struct replicas_delta_list *); +int bch2_trans_mark_update(struct btree_trans *, + struct btree_insert_entry *, + struct replicas_delta_list *); void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); /* disk reservations: */ diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 2a1fd7a7..974daa7e 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -94,6 +94,19 @@ struct bch_fs_usage_short { u64 nr_inodes; }; +struct replicas_delta { + s64 delta; + struct bch_replicas_entry r; +}; + +struct replicas_delta_list { + struct bch_fs_usage fs_usage; + + struct replicas_delta *top; + struct replicas_delta d[0]; + u8 pad[256]; +}; + /* * A reservation for space on disk: */ diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 82d90cde..595d4797 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -157,7 +157,7 @@ static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) if (arg.flags || arg.pad) return -EINVAL; - return bch2_fs_start(c) ? -EIO : 0; + return bch2_fs_start(c); } static long bch2_ioctl_stop(struct bch_fs *c) diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index 14a9a2c0..b379780e 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -332,14 +332,10 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) { struct btree_iter *iter; struct bkey_s_c k; - int ret = 0; + int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_DIRENTS, - POS(dir_inum, 0), 0); - if (IS_ERR(iter)) - return PTR_ERR(iter); - - for_each_btree_key_continue(iter, 0, k) { + for_each_btree_key(trans, iter, BTREE_ID_DIRENTS, + POS(dir_inum, 0), 0, k, ret) { if (k.k->p.inode > dir_inum) break; @@ -368,6 +364,7 @@ int bch2_readdir(struct bch_fs *c, struct file *file, struct bkey_s_c k; struct bkey_s_c_dirent dirent; unsigned len; + int ret; if (!dir_emit_dots(file, ctx)) return 0; @@ -375,7 +372,7 @@ int bch2_readdir(struct bch_fs *c, struct file *file, bch2_trans_init(&trans, c); for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, - POS(inode->v.i_ino, ctx->pos), 0, k) { + POS(inode->v.i_ino, ctx->pos), 0, k, ret) { if (k.k->type != KEY_TYPE_dirent) continue; @@ -400,7 +397,7 @@ int bch2_readdir(struct bch_fs *c, struct file *file, ctx->pos = k.k->p.offset + 1; } - bch2_trans_exit(&trans); + ret = bch2_trans_exit(&trans) ?: ret; - return 0; + return ret; } diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 91b86d9d..a31d6cb2 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -11,8 +11,8 @@ #include "ec.h" #include "error.h" #include "io.h" -#include "journal_io.h" #include "keylist.h" +#include "recovery.h" #include "super-io.h" #include "util.h" @@ -536,14 +536,17 @@ static int ec_stripe_mem_alloc(struct bch_fs *c, struct btree_iter *iter) { size_t idx = iter->pos.offset; + int ret = 0; if 
(!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT)) - return 0; + return ret; bch2_btree_trans_unlock(iter->trans); + ret = -EINTR; if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) - return -EINTR; + return ret; + return -ENOMEM; } @@ -678,10 +681,8 @@ retry: bch2_trans_begin(&trans); /* XXX: start pos hint */ - iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) { + for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) break; @@ -689,24 +690,24 @@ retry: goto found_slot; } - ret = -ENOSPC; - goto out; + if (!ret) + ret = -ENOSPC; + goto err; found_slot: ret = ec_stripe_mem_alloc(c, iter); - - if (ret == -EINTR) - goto retry; if (ret) - return ret; + goto err; stripe->k.p = iter->pos; bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &stripe->k_i)); ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE); -out: + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL); +err: + if (ret == -EINTR) + goto retry; bch2_trans_exit(&trans); return ret; @@ -743,6 +744,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, int ret = 0, dev, idx; bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, bkey_start_pos(pos), @@ -950,7 +952,7 @@ static int unsigned_cmp(const void *_l, const void *_r) unsigned l = *((const unsigned *) _l); unsigned r = *((const unsigned *) _r); - return (l > r) - (l < r); + return cmp_int(l, r); } /* pick most common bucket size: */ @@ -1193,7 +1195,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, BTREE_INSERT_NOFAIL|flags); } -int bch2_stripes_write(struct bch_fs *c, bool *wrote) +int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) { struct btree_trans trans; struct btree_iter *iter; @@ -1215,7 +1217,7 @@ int bch2_stripes_write(struct bch_fs *c, bool *wrote) continue; ret = __bch2_stripe_write_key(&trans, iter, m, giter.pos, - new_key, BTREE_INSERT_NOCHECK_RW); + new_key, flags); if (ret) break; @@ -1229,14 +1231,9 @@ int bch2_stripes_write(struct bch_fs *c, bool *wrote) return ret; } -static void bch2_stripe_read_key(struct bch_fs *c, struct bkey_s_c k) +int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) { - bch2_mark_key(c, k, true, 0, NULL, 0, 0); -} - -int bch2_stripes_read(struct bch_fs *c, struct list_head *journal_replay_list) -{ - struct journal_replay *r; + struct journal_key *i; struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; @@ -1248,24 +1245,20 @@ int bch2_stripes_read(struct bch_fs *c, struct list_head *journal_replay_list) bch2_trans_init(&trans, c); - for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, 0, k) { - bch2_stripe_read_key(c, k); - bch2_trans_cond_resched(&trans); - } + for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, 0, k, ret) + bch2_mark_key(c, k, true, 0, NULL, 0, 0); - ret = bch2_trans_exit(&trans); - if (ret) + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) { + bch_err(c, "error reading stripes: %i", ret); return ret; - - list_for_each_entry(r, journal_replay_list, list) { - struct bkey_i *k, *n; - struct jset_entry *entry; - - for_each_jset_key(k, n, entry, &r->j) - if (entry->btree_id == BTREE_ID_EC) - bch2_stripe_read_key(c, bkey_i_to_s_c(k)); } + for_each_journal_key(*journal_keys, i) + if (i->btree_id == BTREE_ID_EC) + bch2_mark_key(c, 
bkey_i_to_s_c(i->k), + true, 0, NULL, 0, 0); + return 0; } diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h index 28178330..6c00ec5c 100644 --- a/libbcachefs/ec.h +++ b/libbcachefs/ec.h @@ -149,8 +149,9 @@ void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_ec_flush_new_stripes(struct bch_fs *); -int bch2_stripes_read(struct bch_fs *, struct list_head *); -int bch2_stripes_write(struct bch_fs *, bool *); +struct journal_keys; +int bch2_stripes_read(struct bch_fs *, struct journal_keys *); +int bch2_stripes_write(struct bch_fs *, unsigned, bool *); int bch2_ec_mem_alloc(struct bch_fs *, bool); diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index aa2fc779..f8f29251 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -872,15 +872,54 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, bch2_btree_iter_verify(iter, l->b); } +static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + unsigned ret = 0; + + bkey_extent_entry_for_each(ptrs, entry) { + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + case BCH_EXTENT_ENTRY_stripe_ptr: + ret++; + } + } + + return ret; +} + static inline struct bpos -bch2_extent_atomic_end(struct bkey_i *k, struct btree_iter *iter) +bch2_extent_atomic_end(struct bkey_i *insert, struct btree_iter *iter) { struct btree *b = iter->l[0].b; + struct btree_node_iter node_iter = iter->l[0].iter; + struct bkey_packed *_k; + unsigned nr_alloc_ptrs = + bch2_bkey_nr_alloc_ptrs(bkey_i_to_s_c(insert)); BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); - BUG_ON(bkey_cmp(bkey_start_pos(&k->k), b->data->min_key) < 0); + BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); - return bpos_min(k->k.p, b->key.k.p); + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, + KEY_TYPE_discard))) { + struct bkey unpacked; + struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); + + if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0) + break; + + nr_alloc_ptrs += bch2_bkey_nr_alloc_ptrs(k); + + if (nr_alloc_ptrs > 20) { + BUG_ON(bkey_cmp(k.k->p, bkey_start_pos(&insert->k)) <= 0); + return bpos_min(insert->k.p, k.k->p); + } + + bch2_btree_node_iter_advance(&node_iter, b); + } + + return bpos_min(insert->k.p, b->key.k.p); } void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) @@ -1627,13 +1666,14 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, struct bpos end = pos; struct bkey_s_c k; bool ret = true; + int err; end.offset += size; bch2_trans_init(&trans, c); for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, - BTREE_ITER_SLOTS, k) { + BTREE_ITER_SLOTS, k, err) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 721215ee..b2ea783f 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -2150,7 +2150,7 @@ static inline int range_has_data(struct bch_fs *c, bch2_trans_init(&trans, c); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k, ret) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; @@ -2735,7 +2735,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) bch2_trans_init(&trans, c); for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, - POS(inode->v.i_ino, offset >> 9), 0, k) { + POS(inode->v.i_ino, offset >> 9), 0, k, ret) { if (k.k->p.inode != inode->v.i_ino) { break; } else if 
(bkey_extent_is_data(k.k)) { @@ -2745,7 +2745,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) break; } - ret = bch2_trans_exit(&trans); + ret = bch2_trans_exit(&trans) ?: ret; if (ret) return ret; @@ -2809,7 +2809,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode->v.i_ino, offset >> 9), - BTREE_ITER_SLOTS, k) { + BTREE_ITER_SLOTS, k, ret) { if (k.k->p.inode != inode->v.i_ino) { next_hole = bch2_next_pagecache_hole(&inode->v, offset, MAX_LFS_FILESIZE); @@ -2826,7 +2826,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) } } - ret = bch2_trans_exit(&trans); + ret = bch2_trans_exit(&trans) ?: ret; if (ret) return ret; diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 1dc9b06d..c707c46b 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -266,7 +266,8 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) down_write(&sb->s_umount); sb->s_flags |= SB_RDONLY; - bch2_fs_emergency_read_only(c); + if (bch2_fs_emergency_read_only(c)) + bch_err(c, "emergency read only due to ioctl"); up_write(&sb->s_umount); return 0; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 135f6e41..af58d00f 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -1126,7 +1126,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_trans_init(&trans, c); for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, - POS(ei->v.i_ino, start >> 9), 0, k) + POS(ei->v.i_ino, start >> 9), 0, k, ret) if (bkey_extent_is_data(k.k) || k.k->type == KEY_TYPE_reservation) { if (bkey_cmp(bkey_start_pos(k.k), @@ -1136,17 +1136,17 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (have_extent) { ret = bch2_fill_extent(info, &tmp.k, 0); if (ret) - goto out; + break; } bkey_reassemble(&tmp.k, k); have_extent = true; } - if (have_extent) + if (!ret && have_extent) ret = bch2_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST); -out: - bch2_trans_exit(&trans); + + ret = bch2_trans_exit(&trans) ?: ret; return ret < 0 ? ret : 0; } @@ -1750,12 +1750,15 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO); if (IS_ERR(vinode)) { + bch_err(c, "error mounting: error getting root inode %i", + (int) PTR_ERR(vinode)); ret = PTR_ERR(vinode); goto err_put_super; } sb->s_root = d_make_root(vinode); if (!sb->s_root) { + bch_err(c, "error mounting: error allocating root dentry"); ret = -ENOMEM; goto err_put_super; } diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index 729c0317..374f7fd1 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -32,7 +32,7 @@ struct bch_inode_info { static inline int ptrcmp(void *l, void *r) { - return (l > r) - (l < r); + return cmp_int(l, r); } #define __bch2_lock_inodes(_lock, ...) 
\ diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index b83f94c6..998c10ab 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -20,8 +20,10 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) struct btree_iter *iter; struct bkey_s_c k; u64 sectors = 0; + int ret; - for_each_btree_key(trans, iter, BTREE_ID_EXTENTS, POS(inum, 0), 0, k) { + for_each_btree_key(trans, iter, BTREE_ID_EXTENTS, + POS(inum, 0), 0, k, ret) { if (k.k->p.inode != inum) break; @@ -29,7 +31,9 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) sectors += k.k->size; } - return bch2_trans_iter_free(trans, iter) ?: sectors; + bch2_trans_iter_free(trans, iter); + + return ret ?: sectors; } static int remove_dirent(struct btree_trans *trans, @@ -494,8 +498,7 @@ retry: BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW); if (ret) { - bch_err(c, "error in fs gc: error %i " - "updating inode", ret); + bch_err(c, "error in fsck: error %i updating inode", ret); goto err; } @@ -941,7 +944,7 @@ next: goto up; for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, - POS(e->inum, e->offset + 1), 0, k) { + POS(e->inum, e->offset + 1), 0, k, ret) { if (k.k->p.inode != e->inum) break; @@ -984,7 +987,7 @@ next: } goto next; } - ret = bch2_trans_iter_free(&trans, iter); + ret = bch2_trans_iter_free(&trans, iter) ?: ret; if (ret) { bch_err(c, "btree error %i in fsck", ret); goto err; @@ -1059,7 +1062,7 @@ static void inc_link(struct bch_fs *c, nlink_table *links, link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL); if (!link) { - bch_verbose(c, "allocation failed during fs gc - will need another pass"); + bch_verbose(c, "allocation failed during fsck - will need another pass"); *range_end = inum; return; } @@ -1086,7 +1089,7 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) { switch (k.k->type) { case KEY_TYPE_dirent: d = bkey_s_c_to_dirent(k); @@ -1104,9 +1107,9 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, bch2_trans_cond_resched(&trans); } - ret = bch2_trans_exit(&trans); + ret = bch2_trans_exit(&trans) ?: ret; if (ret) - bch_err(c, "error in fs gc: btree error %i while walking dirents", ret); + bch_err(c, "error in fsck: btree error %i while walking dirents", ret); return ret; } @@ -1247,8 +1250,7 @@ static int check_inode(struct btree_trans *trans, ret = bch2_inode_rm(c, u.bi_inum); if (ret) - bch_err(c, "error in fs gc: error %i " - "while deleting inode", ret); + bch_err(c, "error in fsck: error %i while deleting inode", ret); return ret; } @@ -1265,8 +1267,7 @@ static int check_inode(struct btree_trans *trans, ret = bch2_inode_truncate(c, u.bi_inum, u.bi_size); if (ret) { - bch_err(c, "error in fs gc: error %i " - "truncating inode", ret); + bch_err(c, "error in fsck: error %i truncating inode", ret); return ret; } @@ -1291,8 +1292,7 @@ static int check_inode(struct btree_trans *trans, sectors = bch2_count_inode_sectors(trans, u.bi_inum); if (sectors < 0) { - bch_err(c, "error in fs gc: error %i " - "recounting inode sectors", + bch_err(c, "error in fsck: error %i recounting inode sectors", (int) sectors); return sectors; } @@ -1312,7 +1312,7 @@ static int check_inode(struct btree_trans *trans, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW); if (ret && ret != -EINTR) - bch_err(c, "error in fs gc: error %i " + 
bch_err(c, "error in fsck: error %i " "updating inode", ret); } fsck_err: @@ -1383,7 +1383,7 @@ fsck_err: bch2_trans_exit(&trans); if (ret2) - bch_err(c, "error in fs gc: btree error %i while walking inodes", ret2); + bch_err(c, "error in fsck: btree error %i while walking inodes", ret2); return ret ?: ret2; } @@ -1424,22 +1424,44 @@ static int check_inode_nlinks(struct bch_fs *c, return ret; } -noinline_for_stack -static int check_inodes_fast(struct bch_fs *c) +/* + * Checks for inconsistencies that shouldn't happen, unless we have a bug. + * Doesn't fix them yet, mainly because they haven't yet been observed: + */ +int bch2_fsck_full(struct bch_fs *c) +{ + struct bch_inode_unpacked root_inode, lostfound_inode; + + return check_extents(c) ?: + check_dirents(c) ?: + check_xattrs(c) ?: + check_root(c, &root_inode) ?: + check_lostfound(c, &root_inode, &lostfound_inode) ?: + check_directory_structure(c, &lostfound_inode) ?: + check_inode_nlinks(c, &lostfound_inode); +} + +int bch2_fsck_inode_nlink(struct bch_fs *c) +{ + struct bch_inode_unpacked root_inode, lostfound_inode; + + return check_root(c, &root_inode) ?: + check_lostfound(c, &root_inode, &lostfound_inode) ?: + check_inode_nlinks(c, &lostfound_inode); +} + +int bch2_fsck_walk_inodes_only(struct bch_fs *c) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_c_inode inode; - int ret = 0, ret2; + int ret; bch2_trans_init(&trans, c); bch2_trans_preload_iters(&trans); - iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, - POS_MIN, 0); - - for_each_btree_key_continue(iter, 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) { if (k.k->type != KEY_TYPE_inode) continue; @@ -1455,74 +1477,7 @@ static int check_inodes_fast(struct bch_fs *c) break; } } + BUG_ON(ret == -EINTR); - ret2 = bch2_trans_exit(&trans); - - return ret ?: ret2; -} - -/* - * Checks for inconsistencies that shouldn't happen, unless we have a bug. - * Doesn't fix them yet, mainly because they haven't yet been observed: - */ -static int bch2_fsck_full(struct bch_fs *c) -{ - struct bch_inode_unpacked root_inode, lostfound_inode; - int ret; - - bch_verbose(c, "starting fsck:"); - ret = check_extents(c) ?: - check_dirents(c) ?: - check_xattrs(c) ?: - check_root(c, &root_inode) ?: - check_lostfound(c, &root_inode, &lostfound_inode) ?: - check_directory_structure(c, &lostfound_inode) ?: - check_inode_nlinks(c, &lostfound_inode); - - bch2_flush_fsck_errs(c); - bch_verbose(c, "fsck done"); - - return ret; -} - -static int bch2_fsck_inode_nlink(struct bch_fs *c) -{ - struct bch_inode_unpacked root_inode, lostfound_inode; - int ret; - - bch_verbose(c, "checking inode link counts:"); - ret = check_root(c, &root_inode) ?: - check_lostfound(c, &root_inode, &lostfound_inode) ?: - check_inode_nlinks(c, &lostfound_inode); - - bch2_flush_fsck_errs(c); - bch_verbose(c, "done"); - - return ret; -} - -static int bch2_fsck_walk_inodes_only(struct bch_fs *c) -{ - int ret; - - bch_verbose(c, "walking inodes:"); - ret = check_inodes_fast(c); - - bch2_flush_fsck_errs(c); - bch_verbose(c, "done"); - - return ret; -} - -int bch2_fsck(struct bch_fs *c) -{ - if (c->opts.fsck) - return bch2_fsck_full(c); - - if (c->sb.clean) - return 0; - - return c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK) - ? 
bch2_fsck_walk_inodes_only(c) - : bch2_fsck_inode_nlink(c); + return bch2_trans_exit(&trans) ?: ret; } diff --git a/libbcachefs/fsck.h b/libbcachefs/fsck.h index dc7ce687..1f03079c 100644 --- a/libbcachefs/fsck.h +++ b/libbcachefs/fsck.h @@ -1,6 +1,8 @@ #ifndef _BCACHEFS_FSCK_H #define _BCACHEFS_FSCK_H -int bch2_fsck(struct bch_fs *); +int bch2_fsck_full(struct bch_fs *); +int bch2_fsck_inode_nlink(struct bch_fs *); +int bch2_fsck_walk_inodes_only(struct bch_fs *); #endif /* _BCACHEFS_FSCK_H */ diff --git a/libbcachefs/io.c b/libbcachefs/io.c index cc8a3c51..5df690f9 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -1308,7 +1308,7 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, retry: for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode, bvec_iter.bi_sector), - BTREE_ITER_SLOTS, k) { + BTREE_ITER_SLOTS, k, ret) { BKEY_PADDED(k) tmp; unsigned bytes; @@ -1339,8 +1339,8 @@ retry: * If we get here, it better have been because there was an error * reading a btree node */ - BUG_ON(!btree_iter_err(iter)); - __bcache_io_error(c, "btree IO error"); + BUG_ON(!ret); + __bcache_io_error(c, "btree IO error: %i", ret); err: rbio->bio.bi_status = BLK_STS_IOERR; out: @@ -1846,6 +1846,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) unsigned flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE| BCH_READ_USER_MAPPED; + int ret; bch2_trans_init(&trans, c); @@ -1858,7 +1859,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode, rbio->bio.bi_iter.bi_sector), - BTREE_ITER_SLOTS, k) { + BTREE_ITER_SLOTS, k, ret) { BKEY_PADDED(k) tmp; unsigned bytes; @@ -1890,8 +1891,8 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) * If we get here, it better have been because there was an error * reading a btree node */ - BUG_ON(!btree_iter_err(iter)); - bcache_io_error(c, &rbio->bio, "btree IO error"); + BUG_ON(!ret); + bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); bch2_trans_exit(&trans); bch2_rbio_done(rbio); diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index d092dc0b..3ec80437 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -55,19 +55,6 @@ static void bch2_journal_buf_init(struct journal *j) buf->data->u64s = 0; } -static inline bool journal_entry_empty(struct jset *j) -{ - struct jset_entry *i; - - if (j->seq != j->last_seq) - return false; - - vstruct_for_each(j, i) - if (i->type || i->u64s) - return false; - return true; -} - void bch2_journal_halt(struct journal *j) { union journal_res_state old, new; @@ -1001,9 +988,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, u64 last_seq = cur_seq, nr, seq; if (!list_empty(journal_entries)) - last_seq = le64_to_cpu(list_last_entry(journal_entries, - struct journal_replay, - list)->j.last_seq); + last_seq = le64_to_cpu(list_first_entry(journal_entries, + struct journal_replay, + list)->j.seq); nr = cur_seq - last_seq; @@ -1016,6 +1003,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, } } + j->replay_journal_seq = last_seq; + j->replay_journal_seq_end = cur_seq; j->last_seq_ondisk = last_seq; j->pin.front = last_seq; j->pin.back = cur_seq; @@ -1024,7 +1013,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, fifo_for_each_entry_ptr(p, &j->pin, seq) { INIT_LIST_HEAD(&p->list); INIT_LIST_HEAD(&p->flushed); - atomic_set(&p->count, 0); + atomic_set(&p->count, 1); p->devs.nr = 0; } @@ -1033,10 +1022,7 @@ int 
bch2_fs_journal_start(struct journal *j, u64 cur_seq, BUG_ON(seq < last_seq || seq >= cur_seq); - p = journal_seq_pin(j, seq); - - atomic_set(&p->count, 1); - p->devs = i->devs; + journal_seq_pin(j, seq)->devs = i->devs; } spin_lock(&j->lock); diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 7b1523ef..2e7bc8e4 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -228,6 +228,19 @@ static inline void bch2_journal_add_keys(struct journal *j, struct journal_res * id, 0, k, k->k.u64s); } +static inline bool journal_entry_empty(struct jset *j) +{ + struct jset_entry *i; + + if (j->seq != j->last_seq) + return false; + + vstruct_for_each(j, i) + if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s) + return false; + return true; +} + void __bch2_journal_buf_put(struct journal *, bool); static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index af0701de..56950049 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1,8 +1,5 @@ #include "bcachefs.h" -#include "alloc_background.h" #include "alloc_foreground.h" -#include "btree_gc.h" -#include "btree_update.h" #include "buckets.h" #include "checksum.h" #include "error.h" @@ -642,18 +639,6 @@ err: goto out; } -void bch2_journal_entries_free(struct list_head *list) -{ - - while (!list_empty(list)) { - struct journal_replay *i = - list_first_entry(list, struct journal_replay, list); - list_del(&i->list); - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); - } -} - int bch2_journal_read(struct bch_fs *c, struct list_head *list) { struct journal_list jlist; @@ -733,121 +718,6 @@ fsck_err: return ret; } -/* journal replay: */ - -static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) -{ - struct btree_trans trans; - struct btree_iter *iter; - /* - * We might cause compressed extents to be - * split, so we need to pass in a - * disk_reservation: - */ - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - BKEY_PADDED(k) split; - int ret; - - bch2_trans_init(&trans, c); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - bkey_start_pos(&k->k), - BTREE_ITER_INTENT); - do { - ret = bch2_btree_iter_traverse(iter); - if (ret) - break; - - bkey_copy(&split.k, k); - bch2_cut_front(iter->pos, &split.k); - bch2_extent_trim_atomic(&split.k, iter); - - ret = bch2_disk_reservation_add(c, &disk_res, - split.k.k.size * - bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&split.k)), - BCH_DISK_RESERVATION_NOFAIL); - BUG_ON(ret); - - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &split.k)); - ret = bch2_trans_commit(&trans, &disk_res, NULL, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY); - } while ((!ret || ret == -EINTR) && - bkey_cmp(k->k.p, iter->pos)); - - bch2_disk_reservation_put(c, &disk_res); - - /* - * This isn't strictly correct - we should only be relying on the btree - * node lock for synchronization with gc when we've got a write lock - * held. 
- * - * but - there are other correctness issues if btree gc were to run - * before journal replay finishes - */ - BUG_ON(c->gc_pos.phase); - - bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size), - NULL, 0, 0); - bch2_trans_exit(&trans); - - return ret; -} - -int bch2_journal_replay(struct bch_fs *c, struct list_head *list) -{ - struct journal *j = &c->journal; - struct bkey_i *k, *_n; - struct jset_entry *entry; - struct journal_replay *i, *n; - int ret = 0; - - list_for_each_entry_safe(i, n, list, list) { - j->replay_journal_seq = le64_to_cpu(i->j.seq); - - for_each_jset_key(k, _n, entry, &i->j) { - switch (entry->btree_id) { - case BTREE_ID_ALLOC: - ret = bch2_alloc_replay_key(c, k); - break; - case BTREE_ID_EXTENTS: - ret = bch2_extent_replay_key(c, k); - break; - default: - ret = bch2_btree_insert(c, entry->btree_id, k, - NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY| - BTREE_INSERT_NOMARK); - break; - } - - if (ret) { - bch_err(c, "journal replay: error %d while replaying key", - ret); - goto err; - } - - cond_resched(); - } - - bch2_journal_pin_put(j, j->replay_journal_seq); - } - - j->replay_journal_seq = 0; - - bch2_journal_set_replay_done(j); - bch2_journal_flush_all_pins(j); - ret = bch2_journal_error(j); -err: - bch2_journal_entries_free(list); - return ret; -} - /* journal write: */ static void __journal_write_alloc(struct journal *j, @@ -1077,7 +947,6 @@ out: return; err: bch2_fatal_error(c); - bch2_journal_halt(j); spin_lock(&j->lock); goto out; } @@ -1123,7 +992,8 @@ void bch2_journal_write(struct closure *cl) j->write_start_time = local_clock(); start = vstruct_last(jset); - end = bch2_journal_super_entries_add_common(c, start); + end = bch2_journal_super_entries_add_common(c, start, + le64_to_cpu(jset->seq)); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); @@ -1188,7 +1058,6 @@ void bch2_journal_write(struct closure *cl) spin_unlock(&j->lock); if (ret) { - bch2_journal_halt(j); bch_err(c, "Unable to allocate journal write"); bch2_fatal_error(c); continue_at(cl, journal_write_done, system_highpri_wq); diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h index 74dd57af..1dc193c3 100644 --- a/libbcachefs/journal_io.h +++ b/libbcachefs/journal_io.h @@ -35,8 +35,6 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, vstruct_for_each_safe(entry, k, _n) int bch2_journal_read(struct bch_fs *, struct list_head *); -void bch2_journal_entries_free(struct list_head *); -int bch2_journal_replay(struct bch_fs *, struct list_head *); void bch2_journal_write(struct closure *); diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c index 2b71d066..93ee5e88 100644 --- a/libbcachefs/journal_seq_blacklist.c +++ b/libbcachefs/journal_seq_blacklist.c @@ -135,7 +135,7 @@ static int journal_seq_blacklist_table_cmp(const void *_l, const struct journal_seq_blacklist_table_entry *l = _l; const struct journal_seq_blacklist_table_entry *r = _r; - return (l->start > r->start) - (l->start < r->start); + return cmp_int(l->start, r->start); } bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 922fb5ca..dec9dd2a 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -202,6 +202,7 @@ struct journal { } pin; u64 replay_journal_seq; + u64 replay_journal_seq_end; struct write_point wp; spinlock_t err_lock; diff --git a/libbcachefs/migrate.c 
b/libbcachefs/migrate.c index 88761d34..822b3fce 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -42,6 +42,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) int ret = 0; bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH); @@ -95,6 +96,8 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) break; } + BUG_ON(ret == -EINTR); + bch2_trans_exit(&trans); bch2_replicas_gc_end(c, ret); diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 1e7448ba..946e6162 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -62,6 +62,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) int ret = 0; bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, bkey_start_pos(&bch2_keylist_front(keys)->k), @@ -184,6 +185,7 @@ nomatch: } out: bch2_trans_exit(&trans); + BUG_ON(ret == -EINTR); return ret; } @@ -631,7 +633,7 @@ static int bch2_gc_data_replicas(struct bch_fs *c) bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED)); for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_PREFETCH, k) { + BTREE_ITER_PREFETCH, k, ret) { ret = bch2_mark_bkey_replicas(c, k); if (ret) break; diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 12d33119..c6a4f5b9 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -53,7 +53,7 @@ static inline int sectors_used_cmp(copygc_heap *heap, struct copygc_heap_entry l, struct copygc_heap_entry r) { - return (l.sectors > r.sectors) - (l.sectors < r.sectors); + return cmp_int(l.sectors, r.sectors); } static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) @@ -61,7 +61,7 @@ static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) const struct copygc_heap_entry *l = _l; const struct copygc_heap_entry *r = _r; - return (l->offset > r->offset) - (l->offset < r->offset); + return cmp_int(l->offset, r->offset); } static bool __copygc_pred(struct bch_dev *ca, @@ -115,7 +115,7 @@ static bool have_copygc_reserve(struct bch_dev *ca) spin_lock(&ca->freelist_lock); ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || - ca->allocator_blocked; + ca->allocator_state != ALLOCATOR_RUNNING; spin_unlock(&ca->freelist_lock); return ret; diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index a20a09ee..a95e1447 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -232,16 +232,11 @@ enum opt_type { NO_SB_OPT, false, \ NULL, "Super read only mode - no writes at all will be issued,\n"\ "even if we have to replay the journal") \ - x(noreplay, u8, \ - OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false, \ - NULL, "Don't replay the journal (only for internal tools)")\ x(norecovery, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ - NULL, NULL) \ + NULL, "Don't replay the journal") \ x(noexcl, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c index 5e7df0bb..2b0edb68 100644 --- a/libbcachefs/quota.c +++ b/libbcachefs/quota.c @@ -363,7 +363,7 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) bch2_trans_init(&trans, c); for_each_btree_key(&trans, iter, BTREE_ID_QUOTAS, POS(type, 0), - BTREE_ITER_PREFETCH, k) { + BTREE_ITER_PREFETCH, k, ret) { if (k.k->p.inode != type) break; @@ -435,7 +435,7 @@ int bch2_fs_quota_read(struct bch_fs *c) bch2_trans_init(&trans, c); for_each_btree_key(&trans, iter, 
BTREE_ID_INODES, POS_MIN, - BTREE_ITER_PREFETCH, k) { + BTREE_ITER_PREFETCH, k, ret) { switch (k.k->type) { case KEY_TYPE_inode: ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index a5651a9c..70fd9a27 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -11,92 +11,337 @@ #include "error.h" #include "fsck.h" #include "journal_io.h" +#include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "quota.h" #include "recovery.h" #include "replicas.h" #include "super-io.h" +#include #include #define QSTR(n) { { { .len = strlen(n) } }, .name = n } -static struct bkey_i *btree_root_find(struct bch_fs *c, - struct bch_sb_field_clean *clean, - struct jset *j, - enum btree_id id, unsigned *level) +/* sort and dedup all keys in the journal: */ + +static void journal_entries_free(struct list_head *list) { - struct bkey_i *k; - struct jset_entry *entry, *start, *end; - if (clean) { - start = clean->start; - end = vstruct_end(&clean->field); - } else { - start = j->start; - end = vstruct_last(j); + while (!list_empty(list)) { + struct journal_replay *i = + list_first_entry(list, struct journal_replay, list); + list_del(&i->list); + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); } - - for (entry = start; entry < end; entry = vstruct_next(entry)) - if (entry->type == BCH_JSET_ENTRY_btree_root && - entry->btree_id == id) - goto found; - - return NULL; -found: - if (!entry->u64s) - return ERR_PTR(-EINVAL); - - k = entry->start; - *level = entry->level; - return k; } -static int verify_superblock_clean(struct bch_fs *c, - struct bch_sb_field_clean **cleanp, - struct jset *j) +static int journal_sort_key_cmp(const void *_l, const void *_r) { - unsigned i; - struct bch_sb_field_clean *clean = *cleanp; - int ret = 0; + const struct journal_key *l = _l; + const struct journal_key *r = _r; - if (!clean || !j) - return 0; + return cmp_int(l->btree_id, r->btree_id) ?: + bkey_cmp(l->pos, r->pos) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset); +} - if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, - "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", - le64_to_cpu(clean->journal_seq), - le64_to_cpu(j->seq))) { - kfree(clean); - *cleanp = NULL; - return 0; +static int journal_sort_seq_cmp(const void *_l, const void *_r) +{ + const struct journal_key *l = _l; + const struct journal_key *r = _r; + + return cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->btree_id, r->btree_id) ?: + bkey_cmp(l->pos, r->pos); +} + +static void journal_keys_sift(struct journal_keys *keys, struct journal_key *i) +{ + while (i + 1 < keys->d + keys->nr && + journal_sort_key_cmp(i, i + 1) > 0) { + swap(i[0], i[1]); + i++; } +} - mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, - "superblock read clock doesn't match journal after clean shutdown"); - mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, - "superblock read clock doesn't match journal after clean shutdown"); +static void journal_keys_free(struct journal_keys *keys) +{ + struct journal_key *i; - for (i = 0; i < BTREE_ID_NR; i++) { - struct bkey_i *k1, *k2; - unsigned l1 = 0, l2 = 0; + for_each_journal_key(*keys, i) + if (i->allocated) + kfree(i->k); + kvfree(keys->d); + keys->d = NULL; + keys->nr = 0; +} - k1 = btree_root_find(c, clean, NULL, i, &l1); - k2 = btree_root_find(c, NULL, j, i, &l2); +static struct journal_keys journal_keys_sort(struct list_head 
*journal_entries) +{ + struct journal_replay *p; + struct jset_entry *entry; + struct bkey_i *k, *_n; + struct journal_keys keys = { NULL }, keys_deduped = { NULL }; + struct journal_key *i; + size_t nr_keys = 0; - if (!k1 && !k2) + list_for_each_entry(p, journal_entries, list) + for_each_jset_key(k, _n, entry, &p->j) + nr_keys++; + + keys.journal_seq_base = keys_deduped.journal_seq_base = + le64_to_cpu(list_first_entry(journal_entries, + struct journal_replay, + list)->j.seq); + + keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); + if (!keys.d) + goto err; + + keys_deduped.d = kvmalloc(sizeof(keys.d[0]) * nr_keys * 2, GFP_KERNEL); + if (!keys_deduped.d) + goto err; + + list_for_each_entry(p, journal_entries, list) + for_each_jset_key(k, _n, entry, &p->j) + keys.d[keys.nr++] = (struct journal_key) { + .btree_id = entry->btree_id, + .pos = bkey_start_pos(&k->k), + .k = k, + .journal_seq = le64_to_cpu(p->j.seq) - + keys.journal_seq_base, + .journal_offset = k->_data - p->j._data, + }; + + sort(keys.d, nr_keys, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); + + i = keys.d; + while (i < keys.d + keys.nr) { + if (i + 1 < keys.d + keys.nr && + i[0].btree_id == i[1].btree_id && + !bkey_cmp(i[0].pos, i[1].pos)) { + if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) { + i++; + } else { + bch2_cut_front(i[1].k->k.p, i[0].k); + i[0].pos = i[1].k->k.p; + journal_keys_sift(&keys, i); + } continue; + } - mustfix_fsck_err_on(!k1 || !k2 || - IS_ERR(k1) || - IS_ERR(k2) || - k1->k.u64s != k2->k.u64s || - memcmp(k1, k2, bkey_bytes(k1)) || - l1 != l2, c, - "superblock btree root doesn't match journal after clean shutdown"); + if (i + 1 < keys.d + keys.nr && + i[0].btree_id == i[1].btree_id && + bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)) > 0) { + if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?: + cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) { + if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) { + bch2_cut_back(bkey_start_pos(&i[1].k->k), &i[0].k->k); + } else { + struct bkey_i *split = + kmalloc(bkey_bytes(i[0].k), GFP_KERNEL); + + if (!split) + goto err; + + bkey_copy(split, i[0].k); + bch2_cut_back(bkey_start_pos(&i[1].k->k), &split->k); + keys_deduped.d[keys_deduped.nr++] = (struct journal_key) { + .btree_id = i[0].btree_id, + .allocated = true, + .pos = bkey_start_pos(&split->k), + .k = split, + .journal_seq = i[0].journal_seq, + .journal_offset = i[0].journal_offset, + }; + + bch2_cut_front(i[1].k->k.p, i[0].k); + i[0].pos = i[1].k->k.p; + journal_keys_sift(&keys, i); + continue; + } + } else { + if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) >= 0) { + i[1] = i[0]; + i++; + continue; + } else { + bch2_cut_front(i[0].k->k.p, i[1].k); + i[1].pos = i[0].k->k.p; + journal_keys_sift(&keys, i + 1); + continue; + } + } + } + + keys_deduped.d[keys_deduped.nr++] = *i++; } -fsck_err: - return ret; + + kvfree(keys.d); + return keys_deduped; +err: + journal_keys_free(&keys_deduped); + kvfree(keys.d); + return (struct journal_keys) { NULL }; +} + +/* journal replay: */ + +static void replay_now_at(struct journal *j, u64 seq) +{ + BUG_ON(seq < j->replay_journal_seq); + BUG_ON(seq > j->replay_journal_seq_end); + + while (j->replay_journal_seq < seq) + bch2_journal_pin_put(j, j->replay_journal_seq++); +} + +static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) +{ + struct btree_trans trans; + struct btree_iter *iter, *split_iter; + /* + * We might cause compressed extents to be split, so we need to pass in + * a disk_reservation: + */ + struct disk_reservation disk_res = + 
bch2_disk_reservation_init(c, 0); + struct bkey_i *split; + bool split_compressed = false; + int ret; + + bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); +retry: + bch2_trans_begin(&trans); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + + do { + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto err; + + split_iter = bch2_trans_copy_iter(&trans, iter); + ret = PTR_ERR_OR_ZERO(split_iter); + if (ret) + goto err; + + split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k)); + ret = PTR_ERR_OR_ZERO(split); + if (ret) + goto err; + + if (!split_compressed && + bch2_extent_is_compressed(bkey_i_to_s_c(k)) && + !bch2_extent_is_atomic(k, split_iter)) { + ret = bch2_disk_reservation_add(c, &disk_res, + k->k.size * + bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)), + BCH_DISK_RESERVATION_NOFAIL); + BUG_ON(ret); + + split_compressed = true; + } + + bkey_copy(split, k); + bch2_cut_front(split_iter->pos, split); + bch2_extent_trim_atomic(split, split_iter); + + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(split_iter, split)); + bch2_btree_iter_set_pos(iter, split->k.p); + } while (bkey_cmp(iter->pos, k->k.p) < 0); + + if (split_compressed) { + memset(&trans.fs_usage_deltas.fs_usage, 0, + sizeof(trans.fs_usage_deltas.fs_usage)); + trans.fs_usage_deltas.top = trans.fs_usage_deltas.d; + + ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), false, + -((s64) k->k.size), + &trans.fs_usage_deltas) ?: + bch2_trans_commit(&trans, &disk_res, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOMARK_OVERWRITES| + BTREE_INSERT_NO_CLEAR_REPLICAS); + } else { + ret = bch2_trans_commit(&trans, &disk_res, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY| + BTREE_INSERT_NOMARK); + } + + if (ret) + goto err; +err: + if (ret == -EINTR) + goto retry; + + bch2_disk_reservation_put(c, &disk_res); + + return bch2_trans_exit(&trans) ?: ret; +} + +static int bch2_journal_replay(struct bch_fs *c, + struct journal_keys keys) +{ + struct journal *j = &c->journal; + struct journal_key *i; + int ret; + + sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); + + for_each_journal_key(keys, i) { + replay_now_at(j, keys.journal_seq_base + i->journal_seq); + + switch (i->btree_id) { + case BTREE_ID_ALLOC: + ret = bch2_alloc_replay_key(c, i->k); + break; + case BTREE_ID_EXTENTS: + ret = bch2_extent_replay_key(c, i->k); + break; + default: + ret = bch2_btree_insert(c, i->btree_id, i->k, + NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY| + BTREE_INSERT_NOMARK); + break; + } + + if (ret) { + bch_err(c, "journal replay: error %d while replaying key", + ret); + return ret; + } + + cond_resched(); + } + + replay_now_at(j, j->replay_journal_seq_end); + j->replay_journal_seq = 0; + + bch2_journal_set_replay_done(j); + bch2_journal_flush_all_pins(j); + return bch2_journal_error(j); +} + +static bool journal_empty(struct list_head *journal) +{ + return list_empty(journal) || + journal_entry_empty(&list_last_entry(journal, + struct journal_replay, list)->j); } static int @@ -129,40 +374,7 @@ fsck_err: return ret; } -static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) -{ - struct bch_sb_field_clean *clean, *sb_clean; - int ret; - - mutex_lock(&c->sb_lock); - sb_clean = bch2_sb_get_clean(c->disk_sb.sb); - - if (fsck_err_on(!sb_clean, c, - "superblock marked clean but clean section not present")) { - 
SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->sb.clean = false; - mutex_unlock(&c->sb_lock); - return NULL; - } - - clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), - GFP_KERNEL); - if (!clean) { - mutex_unlock(&c->sb_lock); - return ERR_PTR(-ENOMEM); - } - - if (le16_to_cpu(c->disk_sb.sb->version) < - bcachefs_metadata_version_bkey_renumber) - bch2_sb_clean_renumber(clean, READ); - - mutex_unlock(&c->sb_lock); - - return clean; -fsck_err: - mutex_unlock(&c->sb_lock); - return ERR_PTR(ret); -} +/* journal replay early: */ static int journal_replay_entry_early(struct bch_fs *c, struct jset_entry *entry) @@ -190,13 +402,11 @@ static int journal_replay_entry_early(struct bch_fs *c, switch (entry->btree_id) { case FS_USAGE_RESERVED: if (entry->level < BCH_REPLICAS_MAX) - percpu_u64_set(&c->usage[0]-> - persistent_reserved[entry->level], - le64_to_cpu(u->v)); + c->usage_base->persistent_reserved[entry->level] = + le64_to_cpu(u->v); break; case FS_USAGE_INODES: - percpu_u64_set(&c->usage[0]->nr_inodes, - le64_to_cpu(u->v)); + c->usage_base->nr_inodes = le64_to_cpu(u->v); break; case FS_USAGE_KEY_VERSION: atomic64_set(&c->key_version, @@ -274,6 +484,121 @@ static int journal_replay_early(struct bch_fs *c, return 0; } +/* sb clean section: */ + +static struct bkey_i *btree_root_find(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct jset *j, + enum btree_id id, unsigned *level) +{ + struct bkey_i *k; + struct jset_entry *entry, *start, *end; + + if (clean) { + start = clean->start; + end = vstruct_end(&clean->field); + } else { + start = j->start; + end = vstruct_last(j); + } + + for (entry = start; entry < end; entry = vstruct_next(entry)) + if (entry->type == BCH_JSET_ENTRY_btree_root && + entry->btree_id == id) + goto found; + + return NULL; +found: + if (!entry->u64s) + return ERR_PTR(-EINVAL); + + k = entry->start; + *level = entry->level; + return k; +} + +static int verify_superblock_clean(struct bch_fs *c, + struct bch_sb_field_clean **cleanp, + struct jset *j) +{ + unsigned i; + struct bch_sb_field_clean *clean = *cleanp; + int ret = 0; + + if (!c->sb.clean || !j) + return 0; + + if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, + "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", + le64_to_cpu(clean->journal_seq), + le64_to_cpu(j->seq))) { + kfree(clean); + *cleanp = NULL; + return 0; + } + + mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, + "superblock read clock doesn't match journal after clean shutdown"); + mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, + "superblock read clock doesn't match journal after clean shutdown"); + + for (i = 0; i < BTREE_ID_NR; i++) { + struct bkey_i *k1, *k2; + unsigned l1 = 0, l2 = 0; + + k1 = btree_root_find(c, clean, NULL, i, &l1); + k2 = btree_root_find(c, NULL, j, i, &l2); + + if (!k1 && !k2) + continue; + + mustfix_fsck_err_on(!k1 || !k2 || + IS_ERR(k1) || + IS_ERR(k2) || + k1->k.u64s != k2->k.u64s || + memcmp(k1, k2, bkey_bytes(k1)) || + l1 != l2, c, + "superblock btree root doesn't match journal after clean shutdown"); + } +fsck_err: + return ret; +} + +static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) +{ + struct bch_sb_field_clean *clean, *sb_clean; + int ret; + + mutex_lock(&c->sb_lock); + sb_clean = bch2_sb_get_clean(c->disk_sb.sb); + + if (fsck_err_on(!sb_clean, c, + "superblock marked clean but clean section not present")) { + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; + mutex_unlock(&c->sb_lock); + return NULL; + 
} + + clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), + GFP_KERNEL); + if (!clean) { + mutex_unlock(&c->sb_lock); + return ERR_PTR(-ENOMEM); + } + + if (le16_to_cpu(c->disk_sb.sb->version) < + bcachefs_metadata_version_bkey_renumber) + bch2_sb_clean_renumber(clean, READ); + + mutex_unlock(&c->sb_lock); + + return clean; +fsck_err: + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); +} + static int read_btree_roots(struct bch_fs *c) { unsigned i; @@ -319,42 +644,14 @@ fsck_err: return ret; } -static bool journal_empty(struct list_head *journal) -{ - struct journal_replay *i; - struct jset_entry *entry; - - if (list_empty(journal)) - return true; - - i = list_last_entry(journal, struct journal_replay, list); - - if (i->j.last_seq != i->j.seq) - return false; - - list_for_each_entry(i, journal, list) { - vstruct_for_each(&i->j, entry) { - if (entry->type == BCH_JSET_ENTRY_btree_root || - entry->type == BCH_JSET_ENTRY_usage || - entry->type == BCH_JSET_ENTRY_data_usage) - continue; - - if (entry->type == BCH_JSET_ENTRY_btree_keys && - !entry->u64s) - continue; - return false; - } - } - - return true; -} - int bch2_fs_recovery(struct bch_fs *c) { const char *err = "cannot allocate memory"; struct bch_sb_field_clean *clean = NULL; u64 journal_seq; - LIST_HEAD(journal); + LIST_HEAD(journal_entries); + struct journal_keys journal_keys = { NULL }; + bool wrote = false, write_sb = false; int ret; if (c->sb.clean) @@ -375,20 +672,31 @@ int bch2_fs_recovery(struct bch_fs *c) if (!c->sb.clean || c->opts.fsck) { struct jset *j; - ret = bch2_journal_read(c, &journal); + ret = bch2_journal_read(c, &journal_entries); if (ret) goto err; - fsck_err_on(c->sb.clean && !journal_empty(&journal), c, - "filesystem marked clean but journal not empty"); + if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&journal_entries), c, + "filesystem marked clean but journal not empty")) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; + } - if (!c->sb.clean && list_empty(&journal)){ + if (!c->sb.clean && list_empty(&journal_entries)) { bch_err(c, "no journal entries found"); ret = BCH_FSCK_REPAIR_IMPOSSIBLE; goto err; } - j = &list_last_entry(&journal, struct journal_replay, list)->j; + journal_keys = journal_keys_sort(&journal_entries); + if (!journal_keys.d) { + ret = -ENOMEM; + goto err; + } + + j = &list_last_entry(&journal_entries, + struct journal_replay, list)->j; ret = verify_superblock_clean(c, &clean, j); if (ret) @@ -399,14 +707,14 @@ int bch2_fs_recovery(struct bch_fs *c) journal_seq = le64_to_cpu(clean->journal_seq) + 1; } - ret = journal_replay_early(c, clean, &journal); + ret = journal_replay_early(c, clean, &journal_entries); if (ret) goto err; if (!c->sb.clean) { ret = bch2_journal_seq_blacklist_add(c, - journal_seq, - journal_seq + 4); + journal_seq, + journal_seq + 4); if (ret) { bch_err(c, "error creating new journal seq blacklist entry"); goto err; @@ -417,11 +725,13 @@ int bch2_fs_recovery(struct bch_fs *c) ret = bch2_blacklist_table_initialize(c); - ret = verify_journal_entries_not_blacklisted_or_missing(c, &journal); + ret = verify_journal_entries_not_blacklisted_or_missing(c, + &journal_entries); if (ret) goto err; - ret = bch2_fs_journal_start(&c->journal, journal_seq, &journal); + ret = bch2_fs_journal_start(&c->journal, journal_seq, + &journal_entries); if (ret) goto err; @@ -429,25 +739,43 @@ int bch2_fs_recovery(struct bch_fs *c) if (ret) goto err; + bch_verbose(c, "starting alloc read"); err = "error 
reading allocation information"; - ret = bch2_alloc_read(c, &journal); + ret = bch2_alloc_read(c, &journal_keys); if (ret) goto err; + bch_verbose(c, "alloc read done"); bch_verbose(c, "starting stripes_read"); - ret = bch2_stripes_read(c, &journal); + err = "error reading stripes"; + ret = bch2_stripes_read(c, &journal_keys); if (ret) goto err; bch_verbose(c, "stripes_read done"); set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) && + !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) { + /* + * interior btree node updates aren't consistent with the + * journal; after an unclean shutdown we have to walk all + * pointers to metadata: + */ + bch_info(c, "starting metadata mark and sweep"); + err = "error in mark and sweep"; + ret = bch2_gc(c, NULL, true, true); + if (ret) + goto err; + bch_verbose(c, "mark and sweep done"); + } + if (c->opts.fsck || !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { - bch_verbose(c, "starting mark and sweep:"); - err = "error in recovery"; - ret = bch2_gc(c, &journal, true, false); + bch_info(c, "starting mark and sweep"); + err = "error in mark and sweep"; + ret = bch2_gc(c, &journal_keys, true, false); if (ret) goto err; bch_verbose(c, "mark and sweep done"); @@ -463,26 +791,63 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->sb.encryption_type && !c->sb.clean) atomic64_add(1 << 16, &c->key_version); - if (c->opts.noreplay) + if (c->opts.norecovery) goto out; - bch_verbose(c, "starting journal replay:"); + bch_verbose(c, "starting journal replay"); err = "journal replay failed"; - ret = bch2_journal_replay(c, &journal); + ret = bch2_journal_replay(c, journal_keys); if (ret) goto err; bch_verbose(c, "journal replay done"); - if (c->opts.norecovery) - goto out; + if (!c->opts.nochanges) { + /* + * note that even when filesystem was clean there might be work + * to do here, if we ran gc (because of fsck) which recalculated + * oldest_gen: + */ + bch_verbose(c, "writing allocation info"); + err = "error writing out alloc info"; + ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW, &wrote) ?: + bch2_alloc_write(c, BTREE_INSERT_LAZY_RW, &wrote); + if (ret) { + bch_err(c, "error writing alloc info"); + goto err; + } + bch_verbose(c, "alloc write done"); + } - err = "error in fsck"; - ret = bch2_fsck(c); - if (ret) - goto err; + if (!c->sb.clean) { + if (!(c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) { + bch_info(c, "checking inode link counts"); + err = "error in recovery"; + ret = bch2_fsck_inode_nlink(c); + if (ret) + goto err; + bch_verbose(c, "check inodes done"); + + } else { + bch_verbose(c, "checking for deleted inodes"); + err = "error in recovery"; + ret = bch2_fsck_walk_inodes_only(c); + if (ret) + goto err; + bch_verbose(c, "check inodes done"); + } + } + + if (c->opts.fsck) { + bch_info(c, "starting fsck"); + err = "error in fsck"; + ret = bch2_fsck_full(c); + if (ret) + goto err; + bch_verbose(c, "fsck done"); + } if (enabled_qtypes(c)) { - bch_verbose(c, "reading quotas:"); + bch_verbose(c, "reading quotas"); ret = bch2_fs_quota_read(c); if (ret) goto err; @@ -495,26 +860,41 @@ int bch2_fs_recovery(struct bch_fs *c) c->disk_sb.sb->version_min = le16_to_cpu(bcachefs_metadata_version_min); c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); + write_sb = true; + } + + if (!test_bit(BCH_FS_ERROR, &c->flags)) { + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; + write_sb = true; } if (c->opts.fsck 
&& !test_bit(BCH_FS_ERROR, &c->flags)) {
 		c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;
 		SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
+		write_sb = true;
 	}
+
+	if (write_sb)
+		bch2_write_super(c);
 	mutex_unlock(&c->sb_lock);
 
 	if (c->journal_seq_blacklist_table &&
 	    c->journal_seq_blacklist_table->nr > 128)
 		queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
 
 out:
-	bch2_journal_entries_free(&journal);
-	kfree(clean);
-	return ret;
+	ret = 0;
 err:
 fsck_err:
-	pr_err("Error in recovery: %s (%i)", err, ret);
-	goto out;
+	bch2_flush_fsck_errs(c);
+	journal_keys_free(&journal_keys);
+	journal_entries_free(&journal_entries);
+	kfree(clean);
+	if (ret)
+		bch_err(c, "Error in recovery: %s (%i)", err, ret);
+	else
+		bch_verbose(c, "ret %i", ret);
+	return ret;
 }
 
 int bch2_fs_initialize(struct bch_fs *c)
diff --git a/libbcachefs/recovery.h b/libbcachefs/recovery.h
index 685507e8..c61b55f5 100644
--- a/libbcachefs/recovery.h
+++ b/libbcachefs/recovery.h
@@ -1,6 +1,22 @@
 #ifndef _BCACHEFS_RECOVERY_H
 #define _BCACHEFS_RECOVERY_H
 
+struct journal_keys {
+	struct journal_key {
+		enum btree_id	btree_id:8;
+		unsigned	allocated:1;
+		struct bpos	pos;
+		struct bkey_i	*k;
+		u32		journal_seq;
+		u32		journal_offset;
+	}			*d;
+	size_t			nr;
+	u64			journal_seq_base;
+};
+
+#define for_each_journal_key(keys, i)				\
+	for (i = (keys).d; i < (keys).d + (keys).nr; (i)++)
+
 int bch2_fs_recovery(struct bch_fs *);
 int bch2_fs_initialize(struct bch_fs *);
 
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index d0076bd4..eb441194 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -1,5 +1,6 @@
 #include "bcachefs.h"
+#include "buckets.h"
 #include "journal.h"
 #include "replicas.h"
 #include "super-io.h"
@@ -11,7 +12,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
 
 static inline int u8_cmp(u8 l, u8 r)
 {
-	return (l > r) - (l < r);
+	return cmp_int(l, r);
 }
 
 static void verify_replicas_entry_sorted(struct bch_replicas_entry *e)
@@ -100,8 +101,8 @@ static void stripe_to_replicas(struct bkey_s_c k,
 			r->devs[r->nr_devs++] = ptr->dev;
 }
 
-static void bkey_to_replicas(struct bch_replicas_entry *e,
-			     struct bkey_s_c k)
+void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
+			   struct bkey_s_c k)
 {
 	e->nr_devs = 0;
 
@@ -234,20 +235,13 @@ bool bch2_replicas_marked(struct bch_fs *c,
 	return marked;
 }
 
-static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p,
+static void __replicas_table_update(struct bch_fs_usage *dst,
 				    struct bch_replicas_cpu *dst_r,
-				    struct bch_fs_usage __percpu *src_p,
+				    struct bch_fs_usage *src,
 				    struct bch_replicas_cpu *src_r)
 {
-	unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
-	struct bch_fs_usage *dst, *src = (void *)
-		bch2_acc_percpu_u64s((void *) src_p, src_nr);
 	int src_idx, dst_idx;
 
-	preempt_disable();
-	dst = this_cpu_ptr(dst_p);
-	preempt_enable();
-
 	*dst = *src;
 
 	for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
@@ -262,6 +256,22 @@ static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p,
 	}
 }
 
+static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
+					 struct bch_replicas_cpu *dst_r,
+					 struct bch_fs_usage __percpu *src_p,
+					 struct bch_replicas_cpu *src_r)
+{
+	unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
+	struct bch_fs_usage *dst, *src = (void *)
+		bch2_acc_percpu_u64s((void *) src_p, src_nr);
+
+	preempt_disable();
+	dst = this_cpu_ptr(dst_p);
+	preempt_enable();
+
+	__replicas_table_update(dst, dst_r, src, src_r);
+}
+
 /*
  * Resize filesystem
accounting: */ @@ -270,34 +280,48 @@ static int replicas_table_update(struct bch_fs *c, { struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL }; struct bch_fs_usage *new_scratch = NULL; + struct bch_fs_usage __percpu *new_gc = NULL; + struct bch_fs_usage *new_base = NULL; unsigned bytes = sizeof(struct bch_fs_usage) + sizeof(u64) * new_r->nr; int ret = -ENOMEM; - if (!(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64), + if (!(new_base = kzalloc(bytes, GFP_NOIO)) || + !(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)) || - (c->usage[1] && - !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64), - GFP_NOIO))) || - !(new_scratch = kmalloc(bytes, GFP_NOIO))) + !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64), + GFP_NOIO)) || + !(new_scratch = kmalloc(bytes, GFP_NOIO)) || + (c->usage_gc && + !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) goto err; + if (c->usage_base) + __replicas_table_update(new_base, new_r, + c->usage_base, &c->replicas); if (c->usage[0]) - __replicas_table_update(new_usage[0], new_r, - c->usage[0], &c->replicas); + __replicas_table_update_pcpu(new_usage[0], new_r, + c->usage[0], &c->replicas); if (c->usage[1]) - __replicas_table_update(new_usage[1], new_r, - c->usage[1], &c->replicas); + __replicas_table_update_pcpu(new_usage[1], new_r, + c->usage[1], &c->replicas); + if (c->usage_gc) + __replicas_table_update_pcpu(new_gc, new_r, + c->usage_gc, &c->replicas); + swap(c->usage_base, new_base); swap(c->usage[0], new_usage[0]); swap(c->usage[1], new_usage[1]); swap(c->usage_scratch, new_scratch); + swap(c->usage_gc, new_gc); swap(c->replicas, *new_r); ret = 0; err: + free_percpu(new_gc); kfree(new_scratch); free_percpu(new_usage[1]); free_percpu(new_usage[0]); + kfree(new_base); return ret; } @@ -411,7 +435,7 @@ bool bch2_bkey_replicas_marked_locked(struct bch_fs *c, return false; } - bkey_to_replicas(&search.e, k); + bch2_bkey_to_replicas(&search.e, k); return bch2_replicas_marked_locked(c, &search.e, check_gc_replicas); } @@ -444,7 +468,7 @@ int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) return ret; } - bkey_to_replicas(&search.e, k); + bch2_bkey_to_replicas(&search.e, k); return bch2_mark_replicas(c, &search.e); } @@ -456,9 +480,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) lockdep_assert_held(&c->replicas_gc_lock); mutex_lock(&c->sb_lock); - - if (ret) - goto err; + percpu_down_write(&c->mark_lock); /* * this is kind of crappy; the replicas gc mechanism needs to be ripped @@ -469,26 +491,20 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); struct bch_replicas_cpu n; - u64 v; - if (__replicas_has_entry(&c->replicas_gc, e)) - continue; + if (!__replicas_has_entry(&c->replicas_gc, e) && + (c->usage_base->replicas[i] || + percpu_u64_get(&c->usage[0]->replicas[i]) || + percpu_u64_get(&c->usage[1]->replicas[i]))) { + n = cpu_replicas_add_entry(&c->replicas_gc, e); + if (!n.entries) { + ret = -ENOSPC; + goto err; + } - v = percpu_u64_get(&c->usage[0]->replicas[i]); - if (!v) - continue; - - n = cpu_replicas_add_entry(&c->replicas_gc, e); - if (!n.entries) { - ret = -ENOSPC; - goto err; + swap(n, c->replicas_gc); + kfree(n.entries); } - - percpu_down_write(&c->mark_lock); - swap(n, c->replicas_gc); - percpu_up_write(&c->mark_lock); - - kfree(n.entries); } if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) { @@ -496,19 +512,18 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) goto err; } - bch2_write_super(c); - - /* don't 
update in memory replicas until changes are persistent */ + ret = replicas_table_update(c, &c->replicas_gc); err: - percpu_down_write(&c->mark_lock); - if (!ret) - ret = replicas_table_update(c, &c->replicas_gc); - kfree(c->replicas_gc.entries); c->replicas_gc.entries = NULL; + percpu_up_write(&c->mark_lock); + if (!ret) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + return ret; } @@ -575,7 +590,7 @@ int bch2_replicas_set_usage(struct bch_fs *c, BUG_ON(ret < 0); } - percpu_u64_set(&c->usage[0]->replicas[idx], sectors); + c->usage_base->replicas[idx] = sectors; return 0; } diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h index ad97e3bc..2ffafad7 100644 --- a/libbcachefs/replicas.h +++ b/libbcachefs/replicas.h @@ -27,6 +27,7 @@ int bch2_mark_replicas(struct bch_fs *, bool bch2_bkey_replicas_marked_locked(struct bch_fs *, struct bkey_s_c, bool); +void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c, bool); int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index f928ca99..fcbe42bc 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -134,14 +134,11 @@ bch2_hash_lookup(struct btree_trans *trans, { struct btree_iter *iter; struct bkey_s_c k; + int ret; - iter = bch2_trans_get_iter(trans, desc.btree_id, - POS(inode, desc.hash_key(info, key)), - BTREE_ITER_SLOTS|flags); - if (IS_ERR(iter)) - return iter; - - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + for_each_btree_key(trans, iter, desc.btree_id, + POS(inode, desc.hash_key(info, key)), + BTREE_ITER_SLOTS|flags, k, ret) { if (iter->pos.inode != inode) break; @@ -156,7 +153,7 @@ bch2_hash_lookup(struct btree_trans *trans, } } - return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOENT); + return ERR_PTR(ret ?: -ENOENT); } static __always_inline struct btree_iter * @@ -167,14 +164,11 @@ bch2_hash_hole(struct btree_trans *trans, { struct btree_iter *iter; struct bkey_s_c k; + int ret; - iter = bch2_trans_get_iter(trans, desc.btree_id, - POS(inode, desc.hash_key(info, key)), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return iter; - - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + for_each_btree_key(trans, iter, desc.btree_id, + POS(inode, desc.hash_key(info, key)), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (iter->pos.inode != inode) break; @@ -182,7 +176,7 @@ bch2_hash_hole(struct btree_trans *trans, return iter; } - return IS_ERR(k.k) ? 
ERR_CAST(k.k) : ERR_PTR(-ENOSPC); + return ERR_PTR(ret ?: -ENOSPC); } static __always_inline @@ -224,15 +218,11 @@ int bch2_hash_set(struct btree_trans *trans, struct btree_iter *iter, *slot = NULL; struct bkey_s_c k; bool found = false; - int ret = 0; + int ret; - iter = bch2_trans_get_iter(trans, desc.btree_id, - POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); - - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + for_each_btree_key(trans, iter, desc.btree_id, + POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (iter->pos.inode != inode) break; @@ -256,9 +246,10 @@ int bch2_hash_set(struct btree_trans *trans, } if (slot) - bch2_trans_iter_free(trans, iter); + bch2_trans_iter_free(trans, slot); + bch2_trans_iter_free(trans, iter); - return bch2_trans_iter_free(trans, iter) ?: -ENOSPC; + return ret ?: -ENOSPC; found: found = true; not_found: diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 83c74af4..61eefd2d 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -1,5 +1,6 @@ #include "bcachefs.h" +#include "buckets.h" #include "checksum.h" #include "disk_groups.h" #include "ec.h" @@ -648,7 +649,7 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca) bio_reset(bio); bio_set_dev(bio, ca->disk_sb.bdev); bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); - bio->bi_iter.bi_size = 4096; + bio->bi_iter.bi_size = PAGE_SIZE; bio->bi_end_io = write_super_endio; bio->bi_private = ca; bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META); @@ -944,7 +945,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c) mutex_lock(&c->sb_lock); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA); ret = bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -953,7 +954,8 @@ int bch2_fs_mark_dirty(struct bch_fs *c) struct jset_entry * bch2_journal_super_entries_add_common(struct bch_fs *c, - struct jset_entry *entry) + struct jset_entry *entry, + u64 journal_seq) { struct btree_root *r; unsigned i; @@ -976,10 +978,16 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, mutex_unlock(&c->btree_root_lock); - percpu_down_read_preempt_disable(&c->mark_lock); + percpu_down_write(&c->mark_lock); + + if (!journal_seq) { + bch2_fs_usage_acc_to_base(c, 0); + bch2_fs_usage_acc_to_base(c, 1); + } else { + bch2_fs_usage_acc_to_base(c, journal_seq & 1); + } { - u64 nr_inodes = percpu_u64_get(&c->usage[0]->nr_inodes); struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); @@ -987,7 +995,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1; u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_INODES; - u->v = cpu_to_le64(nr_inodes); + u->v = cpu_to_le64(c->usage_base->nr_inodes); entry = vstruct_next(entry); } @@ -1008,17 +1016,13 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, for (i = 0; i < BCH_REPLICAS_MAX; i++) { struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); - u64 sectors = percpu_u64_get(&c->usage[0]->persistent_reserved[i]); - - if (!sectors) - continue; memset(u, 0, sizeof(*u)); u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1; u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_RESERVED; u->entry.level = i; - u->v = 
sectors; + u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); entry = vstruct_next(entry); } @@ -1026,7 +1030,6 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); - u64 sectors = percpu_u64_get(&c->usage[0]->replicas[i]); struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); @@ -1034,13 +1037,13 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, u->entry.u64s = DIV_ROUND_UP(sizeof(*u) + e->nr_devs, sizeof(u64)) - 1; u->entry.type = BCH_JSET_ENTRY_data_usage; - u->v = cpu_to_le64(sectors); + u->v = cpu_to_le64(c->usage_base->replicas[i]); memcpy(&u->r, e, replicas_entry_bytes(e)); entry = vstruct_next(entry); } - percpu_up_read_preempt_enable(&c->mark_lock); + percpu_up_write(&c->mark_lock); return entry; } @@ -1058,6 +1061,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) SET_BCH_SB_CLEAN(c->disk_sb.sb, true); c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA; u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; @@ -1076,7 +1080,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); entry = sb_clean->start; - entry = bch2_journal_super_entries_add_common(c, entry); + entry = bch2_journal_super_entries_add_common(c, entry, 0); BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); memset(entry, 0, diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index aa91b821..cf25b44a 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -136,7 +136,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) struct jset_entry * bch2_journal_super_entries_add_common(struct bch_fs *, - struct jset_entry *); + struct jset_entry *, u64); void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 9dc201ab..0cbc7eed 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -227,17 +227,16 @@ static void __bch2_fs_read_only(struct bch_fs *c) goto allocator_not_running; do { - ret = bch2_stripes_write(c, &wrote); - if (ret) { - bch2_fs_inconsistent(c, "error writing out stripes"); - break; - } + wrote = false; - ret = bch2_alloc_write(c, false, &wrote); - if (ret) { + ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?: + bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote); + + if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); + + if (ret) break; - } for_each_member_device(ca, c, i) bch2_dev_allocator_quiesce(c, ca); @@ -336,7 +335,8 @@ void bch2_fs_read_only(struct bch_fs *c) if (!bch2_journal_error(&c->journal) && !test_bit(BCH_FS_ERROR, &c->flags) && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && - test_bit(BCH_FS_STARTED, &c->flags)) + test_bit(BCH_FS_STARTED, &c->flags) && + !c->opts.norecovery) bch2_fs_mark_clean(c); clear_bit(BCH_FS_RW, &c->flags); @@ -409,6 +409,15 @@ int __bch2_fs_read_write(struct bch_fs *c, bool early) if (test_bit(BCH_FS_RW, &c->flags)) return 0; + /* + * nochanges is used for fsck -n mode - we have to allow going rw + * during recovery for that to work: + */ + if (c->opts.norecovery || + (c->opts.nochanges && + (!early || c->opts.read_only))) + return -EROFS; + ret = bch2_fs_mark_dirty(c); if (ret) goto err; @@ -488,7 +497,9 @@ static void bch2_fs_free(struct bch_fs *c) 
bch2_fs_compress_exit(c); percpu_free_rwsem(&c->mark_lock); kfree(c->usage_scratch); + free_percpu(c->usage[1]); free_percpu(c->usage[0]); + kfree(c->usage_base); free_percpu(c->pcpu); mempool_exit(&c->btree_iters_pool); mempool_exit(&c->btree_bounce_pool); @@ -682,6 +693,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) seqcount_init(&c->gc_pos_lock); + seqcount_init(&c->usage_lock); + c->copy_gc_enabled = 1; c->rebalance.enabled = 1; c->promote_whole_extents = true; @@ -714,9 +727,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->block_bits = ilog2(c->opts.block_size); c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); - c->opts.nochanges |= c->opts.noreplay; - c->opts.read_only |= c->opts.nochanges; - if (bch2_fs_init_fault("fs_alloc")) goto err; @@ -794,7 +804,41 @@ err: goto out; } -const char *bch2_fs_start(struct bch_fs *c) +noinline_for_stack +static void print_mount_opts(struct bch_fs *c) +{ + enum bch_opt_id i; + char buf[512]; + struct printbuf p = PBUF(buf); + bool first = true; + + strcpy(buf, "(null)"); + + if (c->opts.read_only) { + pr_buf(&p, "ro"); + first = false; + } + + for (i = 0; i < bch2_opts_nr; i++) { + const struct bch_option *opt = &bch2_opt_table[i]; + u64 v = bch2_opt_get_by_id(&c->opts, i); + + if (!(opt->mode & OPT_MOUNT)) + continue; + + if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) + continue; + + if (!first) + pr_buf(&p, ","); + first = false; + bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE); + } + + bch_info(c, "mounted with opts: %s", buf); +} + +int bch2_fs_start(struct bch_fs *c) { const char *err = "cannot allocate memory"; struct bch_sb_field_members *mi; @@ -833,26 +877,27 @@ const char *bch2_fs_start(struct bch_fs *c) goto err; err = "dynamic fault"; + ret = -EINVAL; if (bch2_fs_init_fault("fs_start")) goto err; - if (c->opts.read_only) { + if (c->opts.read_only || c->opts.nochanges) { bch2_fs_read_only(c); } else { - if (!test_bit(BCH_FS_RW, &c->flags) - ? bch2_fs_read_write(c) - : bch2_fs_read_write_late(c)) { - err = "error going read write"; + err = "error going read write"; + ret = !test_bit(BCH_FS_RW, &c->flags) + ? 
bch2_fs_read_write(c) + : bch2_fs_read_write_late(c); + if (ret) goto err; - } } set_bit(BCH_FS_STARTED, &c->flags); - - err = NULL; + print_mount_opts(c); + ret = 0; out: mutex_unlock(&c->state_lock); - return err; + return ret; err: switch (ret) { case BCH_FSCK_ERRORS_NOT_FIXED: @@ -880,7 +925,7 @@ err: break; } - BUG_ON(!err); + BUG_ON(!ret); goto out; } @@ -947,7 +992,7 @@ static void bch2_dev_free(struct bch_dev *ca) free_percpu(ca->io_done); bioset_exit(&ca->replica_set); bch2_dev_buckets_free(ca); - kfree(ca->sb_read_scratch); + free_page((unsigned long) ca->sb_read_scratch); bch2_time_stats_exit(&ca->io_latency[WRITE]); bch2_time_stats_exit(&ca->io_latency[READ]); @@ -1061,7 +1106,7 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, 0, GFP_KERNEL) || percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || - !(ca->sb_read_scratch = kmalloc(4096, GFP_KERNEL)) || + !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || bch2_dev_buckets_alloc(c, ca) || bioset_init(&ca->replica_set, 4, offsetof(struct bch_write_bio, bio), 0) || @@ -1460,13 +1505,8 @@ err: static void dev_usage_clear(struct bch_dev *ca) { struct bucket_array *buckets; - int cpu; - for_each_possible_cpu(cpu) { - struct bch_dev_usage *p = - per_cpu_ptr(ca->usage[0], cpu); - memset(p, 0, sizeof(*p)); - } + percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0])); down_read(&ca->bucket_lock); buckets = bucket_array(ca); @@ -1817,9 +1857,9 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, goto err_print; if (!c->opts.nostart) { - err = bch2_fs_start(c); - if (err) - goto err_print; + ret = bch2_fs_start(c); + if (ret) + goto err; } out: kfree(sb); @@ -1846,6 +1886,7 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, const char *err; struct bch_fs *c; bool allocated_fs = false; + int ret; err = bch2_sb_validate(sb); if (err) @@ -1878,8 +1919,9 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, mutex_unlock(&c->sb_lock); if (!c->opts.nostart && bch2_fs_may_start(c)) { - err = bch2_fs_start(c); - if (err) + err = "error starting filesystem"; + ret = bch2_fs_start(c); + if (ret) goto err; } diff --git a/libbcachefs/super.h b/libbcachefs/super.h index 9bb672c4..4598de9b 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -224,7 +224,7 @@ int bch2_fs_read_write_early(struct bch_fs *); void bch2_fs_stop(struct bch_fs *); -const char *bch2_fs_start(struct bch_fs *); +int bch2_fs_start(struct bch_fs *); struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); const char *bch2_fs_open_incremental(const char *path); diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 7069bea5..c2744c7d 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -235,42 +235,11 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) { struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c); - unsigned i; if (!fs_usage) return -ENOMEM; - pr_buf(&out, "capacity:\t\t\t%llu\n", c->capacity); - - pr_buf(&out, "hidden:\t\t\t\t%llu\n", - fs_usage->hidden); - pr_buf(&out, "data:\t\t\t\t%llu\n", - fs_usage->data); - pr_buf(&out, "cached:\t\t\t\t%llu\n", - fs_usage->cached); - pr_buf(&out, "reserved:\t\t\t%llu\n", - fs_usage->reserved); - pr_buf(&out, "nr_inodes:\t\t\t%llu\n", - fs_usage->nr_inodes); - pr_buf(&out, "online reserved:\t\t%llu\n", - fs_usage->online_reserved); - - for (i = 0; - i < ARRAY_SIZE(fs_usage->persistent_reserved); - i++) { - 
pr_buf(&out, "%u replicas:\n", i + 1); - pr_buf(&out, "\treserved:\t\t%llu\n", - fs_usage->persistent_reserved[i]); - } - - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = - cpu_replicas_entry(&c->replicas, i); - - pr_buf(&out, "\t"); - bch2_replicas_entry_to_text(&out, e); - pr_buf(&out, ":\t%llu\n", fs_usage->replicas[i]); - } + bch2_fs_usage_to_text(&out, c, fs_usage); percpu_up_read_preempt_enable(&c->mark_lock); @@ -288,13 +257,14 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) nr_compressed_extents = 0, compressed_sectors_compressed = 0, compressed_sectors_uncompressed = 0; + int ret; if (!test_bit(BCH_FS_STARTED, &c->flags)) return -EPERM; bch2_trans_init(&trans, c); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k) + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k, ret) if (k.k->type == KEY_TYPE_extent) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const union bch_extent_entry *entry; @@ -316,7 +286,10 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) break; } } - bch2_trans_exit(&trans); + + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) + return ret; return scnprintf(buf, PAGE_SIZE, "uncompressed data:\n" @@ -501,7 +474,7 @@ STORE(__bch2_fs) if (attr == &sysfs_trigger_alloc_write) { bool wrote; - bch2_alloc_write(c, false, &wrote); + bch2_alloc_write(c, 0, &wrote); } if (attr == &sysfs_prune_cache) { @@ -750,10 +723,10 @@ static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, static int unsigned_cmp(const void *_l, const void *_r) { - unsigned l = *((unsigned *) _l); - unsigned r = *((unsigned *) _r); + const unsigned *l = _l; + const unsigned *r = _r; - return (l > r) - (l < r); + return cmp_int(*l, *r); } static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca, diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c index a7b6fef2..265db89a 100644 --- a/libbcachefs/tests.c +++ b/libbcachefs/tests.c @@ -115,7 +115,8 @@ static void test_iterate(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS(0, 0), 0, k) + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, + POS_MIN, 0, k, ret) BUG_ON(k.k->p.offset != i++); BUG_ON(i != nr); @@ -160,7 +161,8 @@ static void test_iterate_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(0, 0), 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, + POS_MIN, 0, k, ret) { BUG_ON(bkey_start_offset(k.k) != i); i = k.k->p.offset; } @@ -208,7 +210,8 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS(0, 0), 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, + 0, k, ret) { BUG_ON(k.k->p.offset != i); i += 2; } @@ -220,8 +223,8 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS(0, 0), - BTREE_ITER_SLOTS, k) { + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, + BTREE_ITER_SLOTS, k, ret) { BUG_ON(bkey_deleted(k.k) != (i & 1)); BUG_ON(k.k->p.offset != i++); @@ -262,7 +265,8 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(0, 0), 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, + 0, k, ret) { BUG_ON(bkey_start_offset(k.k) != i + 8); BUG_ON(k.k->size != 8); i += 16; @@ -275,8 +279,8 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; 
- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(0, 0), - BTREE_ITER_SLOTS, k) { + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_SLOTS, k, ret) { BUG_ON(bkey_deleted(k.k) != !(i % 16)); BUG_ON(bkey_start_offset(k.k) != i); @@ -500,10 +504,8 @@ static void seq_insert(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c); - iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) { insert.k.p = iter->pos; bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &insert.k_i)); @@ -538,10 +540,8 @@ static void seq_overwrite(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c); - iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS_MIN, - BTREE_ITER_INTENT); - - for_each_btree_key_continue(iter, 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, + BTREE_ITER_INTENT, k) { struct bkey_i_cookie u; bkey_reassemble(&u.k_i, k); diff --git a/libbcachefs/util.h b/libbcachefs/util.h index 7e1729a4..f7a35880 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -738,6 +738,16 @@ static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, acc_u64s(acc, per_cpu_ptr(src, cpu), nr); } +static inline void percpu_memset(void __percpu *p, int c, size_t bytes) +{ + int cpu; + + for_each_possible_cpu(cpu) + memset(per_cpu_ptr(p, cpu), c, bytes); +} + u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); +#define cmp_int(l, r) ((l > r) - (l < r)) + #endif /* _BCACHEFS_UTIL_H */ diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index 5ba52a3f..fd58829a 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -197,55 +197,54 @@ int bch2_xattr_set(struct btree_trans *trans, u64 inum, return ret; } -static void __bch2_xattr_emit(const char *prefix, - const char *name, size_t name_len, - char **buffer, size_t *buffer_size, - ssize_t *ret) +struct xattr_buf { + char *buf; + size_t len; + size_t used; +}; + +static int __bch2_xattr_emit(const char *prefix, + const char *name, size_t name_len, + struct xattr_buf *buf) { const size_t prefix_len = strlen(prefix); const size_t total_len = prefix_len + name_len + 1; - if (*buffer) { - if (total_len > *buffer_size) { - *ret = -ERANGE; - return; - } + if (buf->buf) { + if (buf->used + total_len > buf->len) + return -ERANGE; - memcpy(*buffer, prefix, prefix_len); - memcpy(*buffer + prefix_len, + memcpy(buf->buf + buf->used, prefix, prefix_len); + memcpy(buf->buf + buf->used + prefix_len, name, name_len); - (*buffer)[prefix_len + name_len] = '\0'; - - *buffer += total_len; - *buffer_size -= total_len; + buf->buf[buf->used + prefix_len + name_len] = '\0'; } - *ret += total_len; + buf->used += total_len; + return 0; } -static void bch2_xattr_emit(struct dentry *dentry, +static int bch2_xattr_emit(struct dentry *dentry, const struct bch_xattr *xattr, - char **buffer, size_t *buffer_size, - ssize_t *ret) + struct xattr_buf *buf) { const struct xattr_handler *handler = bch2_xattr_type_to_handler(xattr->x_type); - if (handler && (!handler->list || handler->list(dentry))) - __bch2_xattr_emit(handler->prefix ?: handler->name, - xattr->x_name, xattr->x_name_len, - buffer, buffer_size, ret); + return handler && (!handler->list || handler->list(dentry)) + ? 
__bch2_xattr_emit(handler->prefix ?: handler->name, + xattr->x_name, xattr->x_name_len, buf) + : 0; } -static void bch2_xattr_list_bcachefs(struct bch_fs *c, - struct bch_inode_info *inode, - char **buffer, - size_t *buffer_size, - ssize_t *ret, - bool all) +static int bch2_xattr_list_bcachefs(struct bch_fs *c, + struct bch_inode_info *inode, + struct xattr_buf *buf, + bool all) { const char *prefix = all ? "bcachefs_effective." : "bcachefs."; unsigned id; + int ret = 0; u64 v; for (id = 0; id < Inode_opt_nr; id++) { @@ -257,13 +256,13 @@ static void bch2_xattr_list_bcachefs(struct bch_fs *c, !(inode->ei_inode.bi_fields_set & (1 << id))) continue; - __bch2_xattr_emit(prefix, - bch2_inode_opts[id], - strlen(bch2_inode_opts[id]), - buffer, buffer_size, ret); - if (*ret < 0) + ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], + strlen(bch2_inode_opts[id]), buf); + if (ret) break; } + + return ret; } ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) @@ -273,13 +272,14 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; + struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; u64 inum = dentry->d_inode->i_ino; - ssize_t ret = 0; + int ret; bch2_trans_init(&trans, c); for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, - POS(inum, 0), 0, k) { + POS(inum, 0), 0, k, ret) { BUG_ON(k.k->p.inode < inum); if (k.k->p.inode > inum) @@ -288,27 +288,24 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (k.k->type != KEY_TYPE_xattr) continue; - bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, - &buffer, &buffer_size, &ret); - if (ret < 0) + ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); + if (ret) break; } - bch2_trans_exit(&trans); + ret = bch2_trans_exit(&trans) ?: ret; - if (ret < 0) + if (ret) return ret; - bch2_xattr_list_bcachefs(c, inode, &buffer, - &buffer_size, &ret, false); - if (ret < 0) + ret = bch2_xattr_list_bcachefs(c, inode, &buf, false); + if (ret) return ret; - bch2_xattr_list_bcachefs(c, inode, &buffer, - &buffer_size, &ret, true); - if (ret < 0) + ret = bch2_xattr_list_bcachefs(c, inode, &buf, true); + if (ret) return ret; - return ret; + return buf.used; } static int bch2_xattr_get_handler(const struct xattr_handler *handler,