diff --git a/.bcachefs_revision b/.bcachefs_revision index e025c5ab..b34e94b8 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -1712318522fdaa533f8622f4c7da05e44a4828b0 +d83b992f653d9f742f3f8567dbcfd1f4f72e858f diff --git a/include/crypto/chacha20.h b/include/crypto/chacha.h similarity index 65% rename from include/crypto/chacha20.h rename to include/crypto/chacha.h index 1cdc77ba..f004cfb5 100644 --- a/include/crypto/chacha20.h +++ b/include/crypto/chacha.h @@ -8,8 +8,8 @@ #include #include -#define CHACHA20_IV_SIZE 16 -#define CHACHA20_KEY_SIZE 32 -#define CHACHA20_BLOCK_SIZE 64 +#define CHACHA_IV_SIZE 16 +#define CHACHA_KEY_SIZE 32 +#define CHACHA_BLOCK_SIZE 64 #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index cebaaf8b..5011ae7d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -147,12 +147,9 @@ static inline u64 ktime_get_real_seconds(void) return ts.tv_sec; } -static inline struct timespec current_kernel_time(void) +static inline void ktime_get_real_ts64(struct timespec64 *ts) { - struct timespec ts; - - clock_gettime(CLOCK_MONOTONIC, &ts); - return ts; + clock_gettime(CLOCK_REALTIME, ts); } #define current_kernel_time64() current_kernel_time() diff --git a/libbcachefs.c b/libbcachefs.c index f84a34ca..d56b987c 100644 --- a/libbcachefs.c +++ b/libbcachefs.c @@ -619,6 +619,11 @@ static void bch2_sb_print_clean(struct bch_sb *sb, struct bch_sb_field *f, { } +static void bch2_sb_print_journal_seq_blacklist(struct bch_sb *sb, struct bch_sb_field *f, + enum units units) +{ +} + typedef void (*sb_field_print_fn)(struct bch_sb *, struct bch_sb_field *, enum units); struct bch_sb_field_toolops { diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 18afef2e..5a306568 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -290,8 +290,9 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) bch2_alloc_read_key(c, bkey_i_to_s_c(k)); } - for_each_member_device(ca, c, i) - bch2_dev_usage_from_buckets(c, ca); + percpu_down_write(&c->mark_lock); + bch2_dev_usage_from_buckets(c); + percpu_up_write(&c->mark_lock); mutex_lock(&c->bucket_clock[READ].lock); for_each_member_device(ca, c, i) { diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 9a90a68c..6a68376d 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -183,6 +183,7 @@ #include #include #include +#include <linux/math64.h> #include #include #include @@ -220,6 +221,8 @@ printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_warn(c, fmt, ...) \ printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_warn_ratelimited(c, fmt, ...) \ + printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err(c, fmt, ...) \ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err_ratelimited(c, fmt, ...) 
\ @@ -481,6 +484,7 @@ enum { BCH_FS_RW, /* shutdown: */ + BCH_FS_STOPPING, BCH_FS_EMERGENCY_RO, BCH_FS_WRITE_DISABLE_COMPLETE, @@ -506,6 +510,15 @@ struct bch_fs_pcpu { u64 sectors_available; }; +struct journal_seq_blacklist_table { + size_t nr; + struct journal_seq_blacklist_table_entry { + u64 start; + u64 end; + bool dirty; + } entries[0]; +}; + struct bch_fs { struct closure cl; @@ -641,6 +654,11 @@ struct bch_fs { struct io_clock io_clock[2]; + /* JOURNAL SEQ BLACKLIST */ + struct journal_seq_blacklist_table * + journal_seq_blacklist_table; + struct work_struct journal_seq_blacklist_gc_work; + /* ALLOCATOR */ spinlock_t freelist_lock; struct closure_waitlist freelist_wait; @@ -794,4 +812,27 @@ static inline unsigned block_bytes(const struct bch_fs *c) return c->opts.block_size << 9; } +static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time) +{ + return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo); +} + +static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) +{ + s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo; + + if (c->sb.time_precision == 1) + return ns; + + return div_s64(ns, c->sb.time_precision); +} + +static inline s64 bch2_current_time(struct bch_fs *c) +{ + struct timespec64 now; + + ktime_get_real_ts64(&now); + return timespec_to_bch2_time(c, now); +} + #endif /* _BCACHEFS_H */ diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index e899d03f..d390ac86 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -904,7 +904,8 @@ struct bch_sb_field { x(quota, 4) \ x(disk_groups, 5) \ x(clean, 6) \ - x(replicas, 7) + x(replicas, 7) \ + x(journal_seq_blacklist, 8) enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -1119,6 +1120,20 @@ struct bch_sb_field_clean { }; }; +struct journal_seq_blacklist_entry { + __le64 start; + __le64 end; +}; + +struct bch_sb_field_journal_seq_blacklist { + struct bch_sb_field field; + + union { + struct journal_seq_blacklist_entry start[0]; + __u64 _data[0]; + }; +}; + /* Superblock: */ /* @@ -1274,6 +1289,7 @@ enum bch_sb_features { BCH_FEATURE_ZSTD = 2, BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */ BCH_FEATURE_EC = 4, + BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5, BCH_FEATURE_NR, }; diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index cb0e2449..3feea91e 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -114,7 +114,6 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; - struct gc_pos pos = { 0 }; unsigned flags = BCH_BUCKET_MARK_GC| (initial ? 
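/*
 * On the time conversion helpers consolidated into bcachefs.h above:
 * on-disk times are stored as multiples of c->sb.time_precision
 * nanoseconds, offset by c->sb.time_base_lo. A worked example of
 * timespec_to_bch2_time(), assuming time_precision = 1000 (microsecond
 * resolution) and time_base_lo = 0:
 *
 *	ts = { .tv_sec = 1, .tv_nsec = 500 }  ->  1000000500 ns
 *	div_s64(1000000500, 1000)             ->  1000000 stored units
 *
 * bch2_time_to_timespec() multiplies back, so the 500 ns remainder is
 * deliberately lost; the time_precision == 1 case skips the division
 * entirely. bch2_current_time() is just ktime_get_real_ts64() fed
 * through this conversion.
 */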
BCH_BUCKET_MARK_NOATOMIC : 0); @@ -171,7 +170,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, *max_stale = max(*max_stale, ptr_stale(ca, ptr)); } - bch2_mark_key(c, k, true, k.k->size, pos, NULL, 0, flags); + bch2_mark_key(c, k, true, k.k->size, NULL, 0, flags); fsck_err: return ret; } @@ -202,7 +201,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, } static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, - bool initial) + bool initial, bool metadata_only) { struct btree_trans trans; struct btree_iter *iter; @@ -222,7 +221,9 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, * and on startup, we have to read every btree node (XXX: only if it was * an unclean shutdown) */ - if (initial || expensive_debug_checks(c)) + if (metadata_only) + depth = 1; + else if (initial || expensive_debug_checks(c)) depth = 0; btree_node_range_checks_init(&r, depth); @@ -278,7 +279,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) } static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal, - bool initial) + bool initial, bool metadata_only) { enum btree_id ids[BTREE_ID_NR]; u8 max_stale; @@ -292,11 +293,12 @@ static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal, enum btree_id id = ids[i]; enum btree_node_type type = __btree_node_type(0, id); - int ret = bch2_gc_btree(c, id, initial); + int ret = bch2_gc_btree(c, id, initial, metadata_only); if (ret) return ret; - if (journal && btree_node_type_needs_gc(type)) { + if (journal && !metadata_only && + btree_node_type_needs_gc(type)) { struct bkey_i *k, *n; struct jset_entry *j; struct journal_replay *r; @@ -397,7 +399,6 @@ static void bch2_mark_superblocks(struct bch_fs *c) /* Also see bch2_pending_btree_node_free_insert_done() */ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) { - struct gc_pos pos = { 0 }; struct btree_update *as; struct pending_btree_node_free *d; @@ -407,8 +408,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) for_each_pending_btree_node_free(c, as, d) if (d->index_update_done) bch2_mark_key(c, bkey_i_to_s_c(&d->key), - true, 0, - pos, NULL, 0, + true, 0, NULL, 0, BCH_BUCKET_MARK_GC); mutex_unlock(&c->btree_interior_update_lock); @@ -481,25 +481,28 @@ static void bch2_gc_free(struct bch_fs *c) c->usage[1] = NULL; } -static void bch2_gc_done(struct bch_fs *c, bool initial) +static int bch2_gc_done(struct bch_fs *c, + bool initial, bool metadata_only) { struct bch_dev *ca; - bool verify = !initial || - (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)); + bool verify = !metadata_only && + (!initial || + (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); unsigned i; + int ret = 0; #define copy_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ if (verify) \ - bch_err(c, _msg ": got %llu, should be %llu, fixing"\ + fsck_err(c, _msg ": got %llu, should be %llu" \ , ##__VA_ARGS__, dst->_f, src->_f); \ dst->_f = src->_f; \ } #define copy_stripe_field(_f, _msg, ...) 
\ if (dst->_f != src->_f) { \ if (verify) \ - bch_err_ratelimited(c, "stripe %zu has wrong "_msg\ - ": got %u, should be %u, fixing", \ + fsck_err(c, "stripe %zu has wrong "_msg \ + ": got %u, should be %u", \ dst_iter.pos, ##__VA_ARGS__, \ dst->_f, src->_f); \ dst->_f = src->_f; \ @@ -508,8 +511,8 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) #define copy_bucket_field(_f) \ if (dst->b[b].mark._f != src->b[b].mark._f) { \ if (verify) \ - bch_err_ratelimited(c, "dev %u bucket %zu has wrong " #_f\ - ": got %u, should be %u, fixing", i, b, \ + fsck_err(c, "dev %u bucket %zu has wrong " #_f \ + ": got %u, should be %u", i, b, \ dst->b[b].mark._f, src->b[b].mark._f); \ dst->b[b]._mark._f = src->b[b].mark._f; \ dst->b[b]._mark.dirty = true; \ @@ -519,7 +522,7 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) #define copy_fs_field(_f, _msg, ...) \ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) - { + if (!metadata_only) { struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); struct stripe *dst, *src; @@ -571,26 +574,7 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) } }; - for_each_member_device(ca, c, i) { - unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64); - struct bch_dev_usage *dst = (void *) - bch2_acc_percpu_u64s((void *) ca->usage[0], nr); - struct bch_dev_usage *src = (void *) - bch2_acc_percpu_u64s((void *) ca->usage[1], nr); - unsigned b; - - for (b = 0; b < BCH_DATA_NR; b++) - copy_dev_field(buckets[b], "buckets[%s]", - bch2_data_types[b]); - copy_dev_field(buckets_alloc, "buckets_alloc"); - copy_dev_field(buckets_ec, "buckets_ec"); - copy_dev_field(buckets_unavailable, "buckets_unavailable"); - - for (b = 0; b < BCH_DATA_NR; b++) - copy_dev_field(sectors[b], "sectors[%s]", - bch2_data_types[b]); - copy_dev_field(sectors_fragmented, "sectors_fragmented"); - } + bch2_dev_usage_from_buckets(c); { unsigned nr = fs_usage_u64s(c); @@ -600,20 +584,29 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) bch2_acc_percpu_u64s((void *) c->usage[1], nr); copy_fs_field(hidden, "hidden"); - copy_fs_field(data, "data"); - copy_fs_field(cached, "cached"); - copy_fs_field(reserved, "reserved"); - copy_fs_field(nr_inodes, "nr_inodes"); + copy_fs_field(btree, "btree"); - for (i = 0; i < BCH_REPLICAS_MAX; i++) - copy_fs_field(persistent_reserved[i], - "persistent_reserved[%i]", i); + if (!metadata_only) { + copy_fs_field(data, "data"); + copy_fs_field(cached, "cached"); + copy_fs_field(reserved, "reserved"); + copy_fs_field(nr_inodes,"nr_inodes"); + + for (i = 0; i < BCH_REPLICAS_MAX; i++) + copy_fs_field(persistent_reserved[i], + "persistent_reserved[%i]", i); + } for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); char buf[80]; + if (metadata_only && + (e->data_type == BCH_DATA_USER || + e->data_type == BCH_DATA_CACHED)) + continue; + bch2_replicas_entry_to_text(&PBUF(buf), e); copy_fs_field(replicas[i], "%s", buf); @@ -625,9 +618,12 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) #undef copy_bucket_field #undef copy_stripe_field #undef copy_field +fsck_err: + return ret; } -static int bch2_gc_start(struct bch_fs *c) +static int bch2_gc_start(struct bch_fs *c, + bool metadata_only) { struct bch_dev *ca; unsigned i; @@ -673,10 +669,18 @@ static int bch2_gc_start(struct bch_fs *c) dst->nbuckets = src->nbuckets; for (b = 0; b < src->nbuckets; b++) { - dst->b[b]._mark.gen = - dst->b[b].oldest_gen = - 
src->b[b].mark.gen; - dst->b[b].gen_valid = src->b[b].gen_valid; + struct bucket *d = &dst->b[b]; + struct bucket *s = &src->b[b]; + + d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; + d->gen_valid = s->gen_valid; + + if (metadata_only && + (s->mark.data_type == BCH_DATA_USER || + s->mark.data_type == BCH_DATA_CACHED)) { + d->_mark = s->mark; + d->_mark.owned_by_allocator = 0; + } } }; @@ -701,7 +705,8 @@ static int bch2_gc_start(struct bch_fs *c) * move around - if references move backwards in the ordering GC * uses, GC could skip past them */ -int bch2_gc(struct bch_fs *c, struct list_head *journal, bool initial) +int bch2_gc(struct bch_fs *c, struct list_head *journal, + bool initial, bool metadata_only) { struct bch_dev *ca; u64 start_time = local_clock(); @@ -713,7 +718,7 @@ int bch2_gc(struct bch_fs *c, struct list_head *journal, bool initial) down_write(&c->gc_lock); again: percpu_down_write(&c->mark_lock); - ret = bch2_gc_start(c); + ret = bch2_gc_start(c, metadata_only); percpu_up_write(&c->mark_lock); if (ret) @@ -721,7 +726,7 @@ again: bch2_mark_superblocks(c); - ret = bch2_gc_btrees(c, journal, initial); + ret = bch2_gc_btrees(c, journal, initial, metadata_only); if (ret) goto out; @@ -755,7 +760,7 @@ out: percpu_down_write(&c->mark_lock); if (!ret) - bch2_gc_done(c, initial); + ret = bch2_gc_done(c, initial, metadata_only); /* Indicates that gc is no longer in progress: */ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); @@ -1157,7 +1162,7 @@ static int bch2_gc_thread(void *arg) last = atomic_long_read(&clock->now); last_kick = atomic_read(&c->kick_gc); - ret = bch2_gc(c, NULL, false); + ret = bch2_gc(c, NULL, false, false); if (ret) bch_err(c, "btree gc failed: %i", ret); diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index df51eb83..9e067deb 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -4,7 +4,7 @@ #include "btree_types.h" void bch2_coalesce(struct bch_fs *); -int bch2_gc(struct bch_fs *, struct list_head *, bool); +int bch2_gc(struct bch_fs *, struct list_head *, bool, bool); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 0b99e7d2..fe888c57 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -509,7 +509,7 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, bytes); - nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE)); + nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); } bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, @@ -770,7 +770,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry struct btree_node *sorted; struct bkey_packed *k; struct bset *i; - bool used_mempool; + bool used_mempool, blacklisted; unsigned u64s; int ret, retry_read = 0, write = READ; @@ -844,20 +844,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry b->written += sectors; - ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b); - if (ret < 0) { - btree_err(BTREE_ERR_FATAL, c, b, i, - "insufficient memory"); - goto err; - } + blacklisted = bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(i->journal_seq), + true); - if (ret) { - btree_err_on(first, - BTREE_ERR_FIXABLE, c, b, i, - "first btree node bset has blacklisted journal seq"); - if (!first) - continue; - } + 
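/*
 * Note on the blacklist check just below: passing dirty = true marks the
 * in-memory table entry as still referenced by a btree node, which is
 * what lets the gc work in journal_seq_blacklist.c decide when a
 * superblock entry can finally be dropped. Only the first bset of a node
 * with a blacklisted seq is a (fixable) fsck error; later bsets are the
 * expected leftovers of an unclean shutdown and are skipped:
 *
 *	blacklisted = bch2_journal_seq_is_blacklisted(c,
 *				le64_to_cpu(i->journal_seq), true);
 *	...
 *	if (blacklisted && !first)
 *		continue;
 */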
btree_err_on(blacklisted && first, + BTREE_ERR_FIXABLE, c, b, i, + "first btree node bset has blacklisted journal seq"); + if (blacklisted && !first) + continue; bch2_btree_node_iter_large_push(iter, b, i->start, @@ -930,7 +925,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry out: mempool_free(iter, &c->fill_iter); return retry_read; -err: fsck_err: if (ret == BTREE_RETRY_READ) { retry_read = 1; diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 49ad6df8..33cbc2ff 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -818,14 +818,6 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) struct btree_iter *linked; unsigned level = b->level; - /* caller now responsible for unlocking @b */ - - BUG_ON(iter->l[level].b != b); - BUG_ON(!btree_node_intent_locked(iter, level)); - - iter->l[level].b = BTREE_ITER_NOT_END; - mark_btree_node_unlocked(iter, level); - trans_for_each_iter(iter->trans, linked) if (linked->l[level].b == b) { __btree_node_unlock(linked, level); @@ -990,6 +982,7 @@ retry_all: } if (unlikely(ret == -EIO)) { + trans->error = true; iter->flags |= BTREE_ITER_ERROR; iter->l[iter->level].b = BTREE_ITER_NOT_END; goto out; @@ -1162,6 +1155,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) if (!btree_iter_node(iter, iter->level)) return NULL; + bch2_trans_cond_resched(iter->trans); + btree_iter_up(iter); if (!bch2_btree_node_relock(iter, iter->level)) @@ -1712,7 +1707,7 @@ void bch2_trans_preload_iters(struct btree_trans *trans) static int btree_trans_iter_alloc(struct btree_trans *trans) { - unsigned idx = ffz(trans->iters_linked); + unsigned idx = __ffs64(~trans->iters_linked); if (idx < trans->nr_iters) goto got_slot; @@ -1877,17 +1872,17 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, int bch2_trans_unlock(struct btree_trans *trans) { - unsigned iters = trans->iters_linked; + u64 iters = trans->iters_linked; int ret = 0; while (iters) { - unsigned idx = __ffs(iters); + unsigned idx = __ffs64(iters); struct btree_iter *iter = &trans->iters[idx]; ret = ret ?: btree_iter_err(iter); __bch2_btree_iter_unlock(iter); - iters ^= 1 << idx; + iters ^= 1ULL << idx; } return ret; @@ -1949,7 +1944,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c) int bch2_trans_exit(struct btree_trans *trans) { - int ret = bch2_trans_unlock(trans); + bch2_trans_unlock(trans); kfree(trans->mem); if (trans->used_mempool) @@ -1958,5 +1953,6 @@ int bch2_trans_exit(struct btree_trans *trans) kfree(trans->iters); trans->mem = (void *) 0x1; trans->iters = (void *) 0x1; - return ret; + + return trans->error ? 
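/*
 * Two related fixes in the btree_iter.c hunks above: bch2_trans_exit()
 * now reports I/O errors via the new trans->error flag (set in the
 * traverse-all path on -EIO) rather than the return value of
 * bch2_trans_unlock(), and iters_linked is consistently treated as a
 * 64-bit bitmap. The standard walk over set bits, for reference:
 *
 *	u64 mask = trans->iters_linked;
 *	while (mask) {
 *		unsigned idx = __ffs64(mask);	// lowest set bit
 *		// ... use trans->iters[idx] ...
 *		mask ^= 1ULL << idx;		// 1 << idx would truncate
 *	}					//   past bit 31
 *
 * and __ffs64(~mask) replaces ffz() for finding the first free slot in
 * btree_trans_iter_alloc().
 */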
-EIO : 0; } diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 3d869dd8..a995efc7 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -279,6 +279,7 @@ struct btree_trans { u8 nr_updates; u8 size; unsigned used_mempool:1; + unsigned error:1; unsigned mem_top; unsigned mem_bytes; diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 62021727..19ba667b 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -161,7 +161,6 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b, { struct bch_fs *c = as->c; struct pending_btree_node_free *d; - struct gc_pos pos = { 0 }; for (d = as->pending; d < as->pending + as->nr_pending; d++) if (!bkey_cmp(k.k->p, d->key.k.p) && @@ -189,18 +188,12 @@ found: * to cancel out one of mark and sweep's markings if necessary: */ - /* - * bch2_mark_key() compares the current gc pos to the pos we're - * moving this reference from, hence one comparison here: - */ if (gc_pos_cmp(c->gc_pos, b ? gc_pos_btree_node(b) : gc_pos_btree_root(as->btree_id)) >= 0 && gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) - bch2_mark_key_locked(c, - bkey_i_to_s_c(&d->key), - false, 0, pos, - NULL, 0, BCH_BUCKET_MARK_GC); + bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key), + false, 0, NULL, 0, BCH_BUCKET_MARK_GC); } static void __btree_node_free(struct bch_fs *c, struct btree *b) @@ -272,8 +265,11 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, bch2_mark_key(c, bkey_i_to_s_c(&pending->key), false, 0, - gc_phase(GC_PHASE_PENDING_DELETE), NULL, 0, 0); + + if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE))) + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), + false, 0, NULL, 0, BCH_BUCKET_MARK_GC); } static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, @@ -1078,9 +1074,11 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), - true, 0, - gc_pos_btree_root(b->btree_id), - fs_usage, 0, 0); + true, 0, fs_usage, 0, 0); + if (gc_visited(c, gc_pos_btree_root(b->btree_id))) + bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), + true, 0, NULL, 0, + BCH_BUCKET_MARK_GC); if (old && !btree_node_fake(old)) bch2_btree_node_free_index(as, NULL, @@ -1172,8 +1170,11 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(insert), - true, 0, - gc_pos_btree_node(b), fs_usage, 0, 0); + true, 0, fs_usage, 0, 0); + + if (gc_visited(c, gc_pos_btree_node(b))) + bch2_mark_key_locked(c, bkey_i_to_s_c(insert), + true, 0, NULL, 0, BCH_BUCKET_MARK_GC); while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && bkey_iter_pos_cmp(b, &insert->k.p, k) > 0) @@ -1428,6 +1429,7 @@ static void btree_split(struct btree_update *as, struct btree *b, /* Successful split, update the iterator to point to the new nodes: */ + six_lock_increment(&b->lock, SIX_LOCK_intent); bch2_btree_iter_node_drop(iter, b); if (n3) bch2_btree_iter_node_replace(iter, n3); @@ -1739,7 +1741,10 @@ retry: bch2_open_buckets_put(c, &n->ob); + six_lock_increment(&b->lock, SIX_LOCK_intent); bch2_btree_iter_node_drop(iter, b); + bch2_btree_iter_node_drop(iter, m); + bch2_btree_iter_node_replace(iter, n); bch2_btree_iter_verify(iter, n); @@ -1837,6 +1842,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, bch2_open_buckets_put(c, 
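/*
 * bch2_btree_iter_node_drop() (btree_iter.c hunk above) now really
 * six-unlocks @b on every iterator pointing at it, the caller included;
 * the old version only marked the caller's iterator unlocked and left it
 * "responsible for unlocking @b". Callers that still need @b
 * intent-locked in order to free it therefore take an extra lock
 * reference first; the recurring pattern at the split/merge/rewrite
 * sites in this file is:
 *
 *	six_lock_increment(&b->lock, SIX_LOCK_intent);
 *	bch2_btree_iter_node_drop(iter, b);
 *	bch2_btree_iter_node_replace(iter, n);
 *	bch2_btree_node_free_inmem(c, b, iter);
 */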
&n->ob); + six_lock_increment(&b->lock, SIX_LOCK_intent); bch2_btree_iter_node_drop(iter, b); bch2_btree_iter_node_replace(iter, n); bch2_btree_node_free_inmem(c, b, iter); @@ -1988,9 +1994,12 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, fs_usage = bch2_fs_usage_scratch_get(c); bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), - true, 0, - gc_pos_btree_root(b->btree_id), - fs_usage, 0, 0); + true, 0, fs_usage, 0, 0); + if (gc_visited(c, gc_pos_btree_root(b->btree_id))) + bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), + true, 0, NULL, 0, + BCH_BUCKET_MARK_GC); + bch2_btree_node_free_index(as, NULL, bkey_i_to_s_c(&b->key), fs_usage); diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 142230cf..ce1fc29d 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "btree_update.h" #include "btree_update_interior.h" +#include "btree_gc.h" #include "btree_io.h" #include "btree_iter.h" #include "btree_locking.h" @@ -601,10 +602,17 @@ static inline int do_btree_insert_at(struct btree_trans *trans, } trans_for_each_update_iter(trans, i) - bch2_mark_update(trans, i, fs_usage); + bch2_mark_update(trans, i, fs_usage, 0); if (fs_usage) bch2_trans_fs_usage_apply(trans, fs_usage); + if (unlikely(c->gc_pos.phase)) { + trans_for_each_update_iter(trans, i) + if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) + bch2_mark_update(trans, i, NULL, + BCH_BUCKET_MARK_GC); + } + trans_for_each_update(trans, i) do_btree_insert_one(trans, i); out: @@ -852,12 +860,15 @@ out_noupdates: BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); - bch2_trans_unlink_iters(trans, trans->iters_unlink_on_commit); if (!ret) { - bch2_trans_unlink_iters(trans, ~trans->iters_touched); + bch2_trans_unlink_iters(trans, ~trans->iters_touched| + trans->iters_unlink_on_commit); trans->iters_touched = 0; + } else { + bch2_trans_unlink_iters(trans, trans->iters_unlink_on_commit); } - trans->nr_updates = 0; + trans->nr_updates = 0; + trans->mem_top = 0; return ret; err: diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index dae718dc..4fa131a1 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -131,6 +131,8 @@ void bch2_fs_usage_initialize(struct bch_fs *c) switch (e->data_type) { case BCH_DATA_BTREE: + usage->btree += usage->replicas[i]; + break; case BCH_DATA_USER: usage->data += usage->replicas[i]; break; @@ -225,6 +227,7 @@ static u64 avail_factor(u64 r) u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage) { return min(fs_usage->hidden + + fs_usage->btree + fs_usage->data + reserve_factor(fs_usage->reserved + fs_usage->online_reserved), @@ -240,7 +243,8 @@ __bch2_fs_usage_read_short(struct bch_fs *c) ret.capacity = c->capacity - percpu_u64_get(&c->usage[0]->hidden); - data = percpu_u64_get(&c->usage[0]->data); + data = percpu_u64_get(&c->usage[0]->data) + + percpu_u64_get(&c->usage[0]->btree); reserved = percpu_u64_get(&c->usage[0]->reserved) + percpu_u64_get(&c->usage[0]->online_reserved); @@ -383,21 +387,32 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, bch2_wake_allocator(ca); } -void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca) +void bch2_dev_usage_from_buckets(struct bch_fs *c) { + struct bch_dev *ca; struct bucket_mark old = { .v.counter = 0 }; struct bch_fs_usage *fs_usage; struct bucket_array *buckets; struct bucket *g; + unsigned i; + int cpu; - percpu_down_read_preempt_disable(&c->mark_lock); 
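/*
 * New convention for keeping gc's shadow copy (c->usage[1]) in sync:
 * rather than passing a struct gc_pos into bch2_mark_key() and comparing
 * positions inside, callers mark the live copy unconditionally, then
 * re-mark into the gc copy only if mark-and-sweep has already walked
 * past this position:
 *
 *	bch2_mark_key_locked(c, k, true, sectors, fs_usage, seq, 0);
 *	if (gc_visited(c, gc_pos_btree_node(b)))
 *		bch2_mark_key_locked(c, k, true, sectors, NULL, 0,
 *				     BCH_BUCKET_MARK_GC);
 *
 * The same shape appears in the interior-update hunks above and in
 * do_btree_insert_at(), where the cheap c->gc_pos.phase test skips the
 * gc pass entirely while gc isn't running.
 */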
- fs_usage = this_cpu_ptr(c->usage[0]); - buckets = bucket_array(ca); + percpu_u64_set(&c->usage[0]->hidden, 0); - for_each_bucket(g, buckets) - if (g->mark.data_type) - bch2_dev_usage_update(c, ca, fs_usage, old, g->mark, false); - percpu_up_read_preempt_enable(&c->mark_lock); + for_each_member_device(ca, c, i) { + for_each_possible_cpu(cpu) + memset(per_cpu_ptr(ca->usage[0], cpu), 0, + sizeof(*ca->usage[0])); + + preempt_disable(); + fs_usage = this_cpu_ptr(c->usage[0]); + buckets = bucket_array(ca); + + for_each_bucket(g, buckets) + bch2_dev_usage_update(c, ca, fs_usage, + old, g->mark, false); + preempt_enable(); + } } #define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr) \ @@ -418,10 +433,17 @@ static inline void update_replicas(struct bch_fs *c, BUG_ON(idx < 0); BUG_ON(!sectors); - if (r->data_type == BCH_DATA_CACHED) - fs_usage->cached += sectors; - else + switch (r->data_type) { + case BCH_DATA_BTREE: + fs_usage->btree += sectors; + break; + case BCH_DATA_USER: fs_usage->data += sectors; + break; + case BCH_DATA_CACHED: + fs_usage->cached += sectors; + break; + } fs_usage->replicas[idx] += sectors; } @@ -924,12 +946,13 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, return 0; } -static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, - bool inserting, s64 sectors, - struct bch_fs_usage *fs_usage, - unsigned journal_seq, unsigned flags, - bool gc) +int bch2_mark_key_locked(struct bch_fs *c, + struct bkey_s_c k, + bool inserting, s64 sectors, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) { + bool gc = flags & BCH_BUCKET_MARK_GC; int ret = 0; preempt_disable(); @@ -981,21 +1004,8 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, return ret; } -int bch2_mark_key_locked(struct bch_fs *c, - struct bkey_s_c k, - bool inserting, s64 sectors, - struct gc_pos pos, - struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags) -{ - return do_mark_fn(__bch2_mark_key, c, pos, flags, - k, inserting, sectors, fs_usage, - journal_seq, flags); -} - int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, bool inserting, s64 sectors, - struct gc_pos pos, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { @@ -1003,7 +1013,7 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, percpu_down_read_preempt_disable(&c->mark_lock); ret = bch2_mark_key_locked(c, k, inserting, sectors, - pos, fs_usage, journal_seq, flags); + fs_usage, journal_seq, flags); percpu_up_read_preempt_enable(&c->mark_lock); return ret; @@ -1011,13 +1021,13 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, void bch2_mark_update(struct btree_trans *trans, struct btree_insert_entry *insert, - struct bch_fs_usage *fs_usage) + struct bch_fs_usage *fs_usage, + unsigned flags) { struct bch_fs *c = trans->c; struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; - struct gc_pos pos = gc_pos_btree_node(b); struct bkey_packed *_k; if (!btree_node_type_needs_gc(iter->btree_id)) @@ -1027,7 +1037,7 @@ void bch2_mark_update(struct btree_trans *trans, bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true, bpos_min(insert->k->k.p, b->key.k.p).offset - bkey_start_offset(&insert->k->k), - pos, fs_usage, trans->journal_res.seq, 0); + fs_usage, trans->journal_res.seq, flags); while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, KEY_TYPE_discard))) { @@ -1060,7 +1070,8 @@ void bch2_mark_update(struct btree_trans *trans, BUG_ON(sectors <= 0); bch2_mark_key_locked(c, k, true, 
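/*
 * The accounting side of this patch: btree sectors move out of
 * fs_usage->data into the new fs_usage->btree counter, and
 * update_replicas() below dispatches on the replicas entry type:
 *
 *	switch (r->data_type) {
 *	case BCH_DATA_BTREE:	fs_usage->btree  += sectors; break;
 *	case BCH_DATA_USER:	fs_usage->data   += sectors; break;
 *	case BCH_DATA_CACHED:	fs_usage->cached += sectors; break;
 *	}
 *
 * bch2_fs_sectors_used() and __bch2_fs_usage_read_short() sum
 * data + btree, so overall totals are unchanged; the split is what lets
 * the metadata-only gc mode in btree_gc.c verify btree usage alone.
 */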
sectors, - pos, fs_usage, trans->journal_res.seq, 0); + fs_usage, trans->journal_res.seq, + flags); sectors = bkey_start_offset(&insert->k->k) - k.k->p.offset; @@ -1071,7 +1082,7 @@ void bch2_mark_update(struct btree_trans *trans, } bch2_mark_key_locked(c, k, false, sectors, - pos, fs_usage, trans->journal_res.seq, 0); + fs_usage, trans->journal_res.seq, flags); bch2_btree_node_iter_advance(&node_iter, b); } diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index c9706fa0..1033398e 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -173,7 +173,7 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m, struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *); -void bch2_dev_usage_from_buckets(struct bch_fs *, struct bch_dev *); +void bch2_dev_usage_from_buckets(struct bch_fs *); static inline u64 __dev_buckets_available(struct bch_dev *ca, struct bch_dev_usage stats) @@ -245,16 +245,16 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, #define BCH_BUCKET_MARK_NOATOMIC (1 << 1) int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, - bool, s64, struct gc_pos, - struct bch_fs_usage *, u64, unsigned); + bool, s64, struct bch_fs_usage *, + u64, unsigned); int bch2_mark_key(struct bch_fs *, struct bkey_s_c, - bool, s64, struct gc_pos, - struct bch_fs_usage *, u64, unsigned); + bool, s64, struct bch_fs_usage *, + u64, unsigned); int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, struct disk_reservation *); void bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, - struct bch_fs_usage *); + struct bch_fs_usage *, unsigned); void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); /* disk reservations: */ diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 869a1314..2a1fd7a7 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -69,6 +69,7 @@ struct bch_fs_usage { u64 gc_start[0]; u64 hidden; + u64 btree; u64 data; u64 cached; u64 reserved; diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c index dfa2de90..b6cf6801 100644 --- a/libbcachefs/checksum.c +++ b/libbcachefs/checksum.c @@ -9,7 +9,7 @@ #include #include #include -#include <crypto/chacha20.h> +#include <crypto/chacha.h> #include #include #include diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h index fb72c6a4..580eff66 100644 --- a/libbcachefs/checksum.h +++ b/libbcachefs/checksum.h @@ -6,7 +6,7 @@ #include "super-io.h" #include -#include <crypto/chacha20.h> +#include <crypto/chacha.h> static inline u64 bch2_crc64_update(u64 crc, const void *p, size_t len) { @@ -126,9 +126,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) /* for skipping ahead and encrypting/decrypting at an offset: */ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) { - EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1)); + EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); - le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE); + le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); return nonce; } diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index 58289fcc..14a9a2c0 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -328,17 +328,18 @@ out: return inum; } -int bch2_empty_dir(struct bch_fs *c, u64 dir_inum) +int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) { - struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; int ret = 0; - bch2_trans_init(&trans, c); + iter = bch2_trans_get_iter(trans, BTREE_ID_DIRENTS, + POS(dir_inum, 0), 0); + if (IS_ERR(iter)) + 
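/*
 * On nonce_add() in the checksum.h hunk above: the code treats
 * nonce.d[0] as the ChaCha block-counter word, so skipping `offset`
 * bytes of keystream advances it by offset / CHACHA_BLOCK_SIZE, and
 * offset must be a multiple of the 64-byte block. Hence bset_encrypt()
 * rounds up: after encrypting, say, a 100-byte header it advances the
 * nonce by round_up(100, CHACHA_BLOCK_SIZE) / 64 = 2 blocks, so the
 * bset payload is encrypted at the matching keystream position.
 */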
return PTR_ERR(iter); - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, - POS(dir_inum, 0), 0, k) { + for_each_btree_key_continue(iter, 0, k) { if (k.k->p.inode > dir_inum) break; @@ -347,11 +348,17 @@ int bch2_empty_dir(struct bch_fs *c, u64 dir_inum) break; } } - bch2_trans_exit(&trans); + bch2_trans_iter_put(trans, iter); return ret; } +int bch2_empty_dir(struct bch_fs *c, u64 dir_inum) +{ + return bch2_trans_do(c, NULL, 0, + bch2_empty_dir_trans(&trans, dir_inum)); +} + int bch2_readdir(struct bch_fs *c, struct file *file, struct dir_context *ctx) { diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h index ed09d306..a35d3aad 100644 --- a/libbcachefs/dirent.h +++ b/libbcachefs/dirent.h @@ -54,6 +54,7 @@ int bch2_dirent_rename(struct btree_trans *, u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, const struct qstr *); +int bch2_empty_dir_trans(struct btree_trans *, u64); int bch2_empty_dir(struct bch_fs *, u64); int bch2_readdir(struct bch_fs *, struct file *, struct dir_context *); diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index e5df9149..91b86d9d 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -1231,10 +1231,7 @@ int bch2_stripes_write(struct bch_fs *c, bool *wrote) static void bch2_stripe_read_key(struct bch_fs *c, struct bkey_s_c k) { - - struct gc_pos pos = { 0 }; - - bch2_mark_key(c, k, true, 0, pos, NULL, 0, 0); + bch2_mark_key(c, k, true, 0, NULL, 0, 0); } int bch2_stripes_read(struct bch_fs *c, struct list_head *journal_replay_list) diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index ef658ad0..721215ee 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -757,7 +757,7 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, EBUG_ON(!PageLocked(page)); EBUG_ON(!PageLocked(newpage)); - ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + ret = migrate_page_move_mapping(mapping, newpage, page, mode, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 26d5f348..1dc9b06d 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -265,7 +265,7 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) return -EPERM; down_write(&sb->s_umount); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; bch2_fs_emergency_read_only(c); up_write(&sb->s_umount); return 0; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index cc91af0a..135f6e41 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -1582,7 +1582,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) struct bch_opts opts = bch2_opts_empty(); int ret; - opt_set(opts, read_only, (*flags & MS_RDONLY) != 0); + opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); ret = bch2_parse_mount_opts(&opts, data); if (ret) @@ -1594,7 +1594,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) if (opts.read_only) { bch2_fs_read_only(c); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } else { ret = bch2_fs_read_write(c); if (ret) { @@ -1603,7 +1603,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) return -EINVAL; } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; } c->opts.read_only = opts.read_only; @@ -1681,7 +1681,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, unsigned i; int ret; - opt_set(opts, read_only, (flags & MS_RDONLY) != 0); + opt_set(opts, read_only, (flags & SB_RDONLY) != 0); ret = bch2_parse_mount_opts(&opts, data); if 
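/*
 * Pattern for the helper conversions in dirent.c and inode.c in this
 * patch: the btree-walking body moves into a *_trans() variant that runs
 * inside a caller-supplied btree_trans (so fsck can call it from its own
 * transaction and inherit its restarts), and the old entry point becomes
 * a thin wrapper, e.g.:
 *
 *	int bch2_empty_dir(struct bch_fs *c, u64 dir_inum)
 *	{
 *		return bch2_trans_do(c, NULL, 0,
 *			bch2_empty_dir_trans(&trans, dir_inum));
 *	}
 *
 * where bch2_trans_do() is assumed here to declare `trans`, run the
 * expression, and handle init/exit (and -EINTR restart) itself.
 */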
(ret) @@ -1691,7 +1691,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, if (IS_ERR(c)) return ERR_CAST(c); - sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|MS_NOSEC, c); + sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|SB_NOSEC, c); if (IS_ERR(sb)) { closure_put(&c->cl); return ERR_CAST(sb); @@ -1702,7 +1702,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, if (sb->s_root) { closure_put(&c->cl); - if ((flags ^ sb->s_flags) & MS_RDONLY) { + if ((flags ^ sb->s_flags) & SB_RDONLY) { ret = -EBUSY; goto err_put_super; } @@ -1745,7 +1745,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, #ifdef CONFIG_BCACHEFS_POSIX_ACL if (c->opts.acl) - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO); @@ -1760,7 +1760,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, goto err_put_super; } - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; out: return dget(sb->s_root); diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index 7080de1e..729c0317 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -69,11 +69,6 @@ static inline unsigned nlink_bias(umode_t mode) return S_ISDIR(mode) ? 2 : 1; } -static inline u64 bch2_current_time(struct bch_fs *c) -{ - return timespec_to_bch2_time(c, current_kernel_time64()); -} - static inline bool inode_attr_changing(struct bch_inode_info *dir, struct bch_inode_info *inode, enum inode_opt_id id) diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index bc501d40..b83f94c6 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -127,18 +127,21 @@ static struct inode_walker inode_walker_init(void) }; } -static int walk_inode(struct bch_fs *c, struct inode_walker *w, u64 inum) +static int walk_inode(struct btree_trans *trans, + struct inode_walker *w, u64 inum) { - w->first_this_inode = inum != w->cur_inum; - w->cur_inum = inum; - - if (w->first_this_inode) { - int ret = bch2_inode_find_by_inum(c, inum, &w->inode); + if (inum != w->cur_inum) { + int ret = bch2_inode_find_by_inum_trans(trans, inum, + &w->inode); if (ret && ret != -ENOENT) return ret; - w->have_inode = !ret; + w->have_inode = !ret; + w->cur_inum = inum; + w->first_this_inode = true; + } else { + w->first_this_inode = false; } return 0; @@ -444,12 +447,15 @@ static int check_extents(struct bch_fs *c) int ret = 0; bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); bch_verbose(c, "checking extents"); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, - POS(BCACHEFS_ROOT_INO, 0), 0, k) { - ret = walk_inode(c, &w, k.k->p.inode); + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(BCACHEFS_ROOT_INO, 0), 0); +retry: + for_each_btree_key_continue(iter, 0, k) { + ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; @@ -514,6 +520,8 @@ static int check_extents(struct bch_fs *c) } err: fsck_err: + if (ret == -EINTR) + goto retry; return bch2_trans_exit(&trans) ?: ret; } @@ -536,21 +544,20 @@ static int check_dirents(struct bch_fs *c) bch_verbose(c, "checking dirents"); bch2_trans_init(&trans, c); - bch2_trans_preload_iters(&trans); - iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, - POS(BCACHEFS_ROOT_INO, 0), 0); - hash_check_init(&h); + iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, + POS(BCACHEFS_ROOT_INO, 0), 0); +retry: for_each_btree_key_continue(iter, 0, k) { struct bkey_s_c_dirent d; struct bch_inode_unpacked target; bool have_target; u64 d_inum; - ret = walk_inode(c, &w, k.k->p.inode); + ret = 
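/*
 * The fsck passes in this file all convert to the same restart idiom:
 * allocate one iterator up front, then on transaction restart resume
 * from it rather than rebuilding the transaction. In outline:
 *
 *	iter = bch2_trans_get_iter(&trans, id, pos, 0);
 *  retry:
 *	for_each_btree_key_continue(iter, 0, k) {
 *		...
 *	}
 *  fsck_err:
 *	if (ret == -EINTR)
 *		goto retry;
 *
 * -EINTR here is bcachefs's "transaction restart: locks were dropped,
 * re-walk from the iterator's current position", not an interrupted
 * system call.
 */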
walk_inode(&trans, &w, k.k->p.inode); if (ret) break; @@ -619,7 +626,7 @@ static int check_dirents(struct bch_fs *c) continue; } - ret = bch2_inode_find_by_inum(c, d_inum, &target); + ret = bch2_inode_find_by_inum_trans(&trans, d_inum, &target); if (ret && ret != -ENOENT) break; @@ -670,6 +677,9 @@ static int check_dirents(struct bch_fs *c) hash_stop_chain(&trans, &h); err: fsck_err: + if (ret == -EINTR) + goto retry; + return bch2_trans_exit(&trans) ?: ret; } @@ -688,17 +698,16 @@ static int check_xattrs(struct bch_fs *c) bch_verbose(c, "checking xattrs"); - bch2_trans_init(&trans, c); + hash_check_init(&h); + bch2_trans_init(&trans, c); bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS(BCACHEFS_ROOT_INO, 0), 0); - - hash_check_init(&h); - +retry: for_each_btree_key_continue(iter, 0, k) { - ret = walk_inode(c, &w, k.k->p.inode); + ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; @@ -721,6 +730,8 @@ static int check_xattrs(struct bch_fs *c) } err: fsck_err: + if (ret == -EINTR) + goto retry; return bch2_trans_exit(&trans) ?: ret; } @@ -904,6 +915,7 @@ static int check_directory_structure(struct bch_fs *c, int ret = 0; bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); bch_verbose(c, "checking directory structure"); @@ -918,9 +930,8 @@ restart_dfs: } ret = path_down(&path, BCACHEFS_ROOT_INO); - if (ret) { - return ret; - } + if (ret) + goto err; while (path.nr) { next: @@ -982,14 +993,19 @@ up: path.nr--; } - for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k) { + iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0); +retry: + for_each_btree_key_continue(iter, 0, k) { if (k.k->type != KEY_TYPE_inode) continue; if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode))) continue; - if (!bch2_empty_dir(c, k.k->p.inode)) + ret = bch2_empty_dir_trans(&trans, k.k->p.inode); + if (ret == -EINTR) + goto retry; + if (!ret) continue; if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c, @@ -1017,15 +1033,12 @@ up: memset(&path, 0, sizeof(path)); goto restart_dfs; } - -out: - kfree(dirs_done.bits); - kfree(path.entries); - return ret; err: fsck_err: ret = bch2_trans_exit(&trans) ?: ret; - goto out; + kfree(dirs_done.bits); + kfree(path.entries); + return ret; } struct nlink { @@ -1069,6 +1082,7 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, int ret; bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); @@ -1225,12 +1239,10 @@ static int check_inode(struct btree_trans *trans, return ret; } - if (u.bi_flags & BCH_INODE_UNLINKED) { - fsck_err_on(c->sb.clean, c, - "filesystem marked clean, " - "but inode %llu unlinked", - u.bi_inum); - + if (u.bi_flags & BCH_INODE_UNLINKED && + (!c->sb.clean || + fsck_err(c, "filesystem marked clean, but inode %llu unlinked", + u.bi_inum))) { bch_verbose(c, "deleting inode %llu", u.bi_inum); ret = bch2_inode_rm(c, u.bi_inum); @@ -1240,12 +1252,10 @@ static int check_inode(struct btree_trans *trans, return ret; } - if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY) { - fsck_err_on(c->sb.clean, c, - "filesystem marked clean, " - "but inode %llu has i_size dirty", - u.bi_inum); - + if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && + (!c->sb.clean || + fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty", + u.bi_inum))) { bch_verbose(c, "truncating inode %llu", u.bi_inum); /* @@ -1270,14 +1280,12 @@ static int check_inode(struct btree_trans *trans, do_update = true; } - if 
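/*
 * The rewritten inode checks fold the "filesystem marked clean"
 * complaint into the repair condition itself:
 *
 *	if ((u.bi_flags & BCH_INODE_UNLINKED) &&
 *	    (!c->sb.clean ||
 *	     fsck_err(c, "filesystem marked clean, but ...")))
 *		... repair ...
 *
 * On an unclean filesystem the repair always runs; on a nominally clean
 * one it runs only after fsck_err() has logged the inconsistency and,
 * as used here, evaluated true because the error is being fixed. The
 * old code warned but then repaired unconditionally either way.
 */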
(u.bi_flags & BCH_INODE_I_SECTORS_DIRTY) { + if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && + (!c->sb.clean || + fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", + u.bi_inum))) { s64 sectors; - fsck_err_on(c->sb.clean, c, - "filesystem marked clean, " - "but inode %llu has i_sectors dirty", - u.bi_inum); - bch_verbose(c, "recounting sectors for inode %llu", u.bi_inum); @@ -1326,6 +1334,7 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, u64 nlinks_pos; bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(range_start, 0), 0); @@ -1425,6 +1434,7 @@ static int check_inodes_fast(struct bch_fs *c) int ret = 0, ret2; bch2_trans_init(&trans, c); + bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0); diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 58d58cc4..d2748e70 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -251,9 +251,7 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, uid_t uid, gid_t gid, umode_t mode, dev_t rdev, struct bch_inode_unpacked *parent) { - s64 now = timespec_to_bch2_time(c, - timespec64_trunc(current_kernel_time64(), - c->sb.time_precision)); + s64 now = bch2_current_time(c); memset(inode_u, 0, sizeof(*inode_u)); @@ -445,31 +443,32 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) return ret; } -int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, - struct bch_inode_unpacked *inode) +int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode) { - struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; int ret = -ENOENT; - bch2_trans_init(&trans, c); + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, + POS(inode_nr, 0), BTREE_ITER_SLOTS); + if (IS_ERR(iter)) + return PTR_ERR(iter); - for_each_btree_key(&trans, iter, BTREE_ID_INODES, - POS(inode_nr, 0), BTREE_ITER_SLOTS, k) { - switch (k.k->type) { - case KEY_TYPE_inode: - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); - break; - default: - /* hole, not found */ - break; - } + k = bch2_btree_iter_peek_slot(iter); + if (k.k->type == KEY_TYPE_inode) + ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); - break; - } + bch2_trans_iter_put(trans, iter); - return bch2_trans_exit(&trans) ?: ret; + return ret; +} + +int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, + struct bch_inode_unpacked *inode) +{ + return bch2_trans_do(c, NULL, 0, + bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); } #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index 0d609985..e7e8507d 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -3,8 +3,6 @@ #include "opts.h" -#include <linux/math64.h> - extern const char * const bch2_inode_opts[]; const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); @@ -59,23 +57,9 @@ int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *, int bch2_inode_rm(struct bch_fs *, u64); -int bch2_inode_find_by_inum(struct bch_fs *, u64, - struct bch_inode_unpacked *); - -static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time) -{ - return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo); -} - -static inline u64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) -{ - s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo; - - if (c->sb.time_precision == 1) - return ns; - - return div_s64(ns, c->sb.time_precision); -} +int 
bch2_inode_find_by_inum_trans(struct btree_trans *, u64, + struct bch_inode_unpacked *); +int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) { diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index aabb68d2..d092dc0b 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -992,27 +992,57 @@ void bch2_fs_journal_stop(struct journal *j) cancel_delayed_work_sync(&j->reclaim_work); } -void bch2_fs_journal_start(struct journal *j) +int bch2_fs_journal_start(struct journal *j, u64 cur_seq, + struct list_head *journal_entries) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_seq_blacklist *bl; - u64 blacklist = 0; + struct journal_entry_pin_list *p; + struct journal_replay *i; + u64 last_seq = cur_seq, nr, seq; - list_for_each_entry(bl, &j->seq_blacklist, list) - blacklist = max(blacklist, bl->end); + if (!list_empty(journal_entries)) + last_seq = le64_to_cpu(list_last_entry(journal_entries, + struct journal_replay, + list)->j.last_seq); + + nr = cur_seq - last_seq; + + if (nr + 1 > j->pin.size) { + free_fifo(&j->pin); + init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); + if (!j->pin.data) { + bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); + return -ENOMEM; + } + } + + j->last_seq_ondisk = last_seq; + j->pin.front = last_seq; + j->pin.back = cur_seq; + atomic64_set(&j->seq, cur_seq - 1); + + fifo_for_each_entry_ptr(p, &j->pin, seq) { + INIT_LIST_HEAD(&p->list); + INIT_LIST_HEAD(&p->flushed); + atomic_set(&p->count, 0); + p->devs.nr = 0; + } + + list_for_each_entry(i, journal_entries, list) { + seq = le64_to_cpu(i->j.seq); + + BUG_ON(seq < last_seq || seq >= cur_seq); + + p = journal_seq_pin(j, seq); + + atomic_set(&p->count, 1); + p->devs = i->devs; + } spin_lock(&j->lock); set_bit(JOURNAL_STARTED, &j->flags); - while (journal_cur_seq(j) < blacklist) - journal_pin_new_entry(j, 0); - - /* - * __journal_entry_close() only inits the next journal entry when it - * closes an open journal entry - the very first journal entry gets - * initialized here: - */ journal_pin_new_entry(j, 1); bch2_journal_buf_init(j); @@ -1021,12 +1051,7 @@ void bch2_fs_journal_start(struct journal *j) bch2_journal_space_available(j); spin_unlock(&j->lock); - /* - * Adding entries to the next journal entry before allocating space on - * disk for the next journal entry - this is ok, because these entries - * only have to go down with the next journal entry we write: - */ - bch2_journal_seq_blacklist_write(j); + return 0; } /* init/exit: */ @@ -1091,8 +1116,6 @@ int bch2_fs_journal_init(struct journal *j) INIT_DELAYED_WORK(&j->write_work, journal_write_work); INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); init_waitqueue_head(&j->pin_flush_wait); - mutex_init(&j->blacklist_lock); - INIT_LIST_HEAD(&j->seq_blacklist); mutex_init(&j->reclaim_lock); mutex_init(&j->discard_lock); diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 83b70d51..7b1523ef 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -469,8 +469,10 @@ int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, int bch2_dev_journal_alloc(struct bch_dev *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); + void bch2_fs_journal_stop(struct journal *); -void bch2_fs_journal_start(struct journal *); +int bch2_fs_journal_start(struct journal *, u64, struct list_head *); + void bch2_dev_journal_exit(struct 
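/*
 * bch2_fs_journal_start() above now rebuilds the journal pin FIFO
 * straight from the replay list. The FIFO is indexed by sequence number;
 * with, say, last_seq = 96 and cur_seq = 100:
 *
 *	j->pin.front = 96, j->pin.back = 100	// seqs 96..99 tracked
 *	fifo size >= roundup_pow_of_two(100 - 96 + 1) = 8
 *
 * and every entry found on disk gets its pin count set to 1 so replay
 * can release it when done. This replaces bch2_journal_set_seq() and
 * the blacklist-driven seq skipping removed from journal_io.c below.
 */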
bch_dev *); int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); void bch2_fs_journal_exit(struct journal *); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 27404311..af0701de 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -9,7 +9,6 @@ #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" -#include "journal_seq_blacklist.h" #include "replicas.h" #include @@ -655,45 +654,11 @@ void bch2_journal_entries_free(struct list_head *list) } } -int bch2_journal_set_seq(struct bch_fs *c, u64 last_seq, u64 end_seq) -{ - struct journal *j = &c->journal; - struct journal_entry_pin_list *p; - u64 seq, nr = end_seq - last_seq + 1; - - if (nr > j->pin.size) { - free_fifo(&j->pin); - init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); - if (!j->pin.data) { - bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); - return -ENOMEM; - } - } - - atomic64_set(&j->seq, end_seq); - j->last_seq_ondisk = last_seq; - - j->pin.front = last_seq; - j->pin.back = end_seq + 1; - - fifo_for_each_entry_ptr(p, &j->pin, seq) { - INIT_LIST_HEAD(&p->list); - INIT_LIST_HEAD(&p->flushed); - atomic_set(&p->count, 0); - p->devs.nr = 0; - } - - return 0; -} - int bch2_journal_read(struct bch_fs *c, struct list_head *list) { - struct journal *j = &c->journal; struct journal_list jlist; struct journal_replay *i; - struct journal_entry_pin_list *p; struct bch_dev *ca; - u64 cur_seq, end_seq; unsigned iter; size_t keys = 0, entries = 0; bool degraded = false; @@ -725,17 +690,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) if (jlist.ret) return jlist.ret; - if (list_empty(list)){ - bch_err(c, "no journal entries found"); - return BCH_FSCK_REPAIR_IMPOSSIBLE; - } - list_for_each_entry(i, list, list) { + struct jset_entry *entry; + struct bkey_i *k, *_n; struct bch_replicas_padded replicas; char buf[80]; - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs); - ret = jset_validate_entries(c, &i->j, READ); if (ret) goto fsck_err; @@ -745,6 +705,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) * the devices - this is wrong: */ + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs); + if (!degraded && (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c, @@ -755,68 +717,18 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) if (ret) return ret; } - } - - i = list_last_entry(list, struct journal_replay, list); - - ret = bch2_journal_set_seq(c, - le64_to_cpu(i->j.last_seq), - le64_to_cpu(i->j.seq)); - if (ret) - return ret; - - mutex_lock(&j->blacklist_lock); - - list_for_each_entry(i, list, list) { - p = journal_seq_pin(j, le64_to_cpu(i->j.seq)); - - atomic_set(&p->count, 1); - p->devs = i->devs; - - if (bch2_journal_seq_blacklist_read(j, i)) { - mutex_unlock(&j->blacklist_lock); - return -ENOMEM; - } - } - - mutex_unlock(&j->blacklist_lock); - - cur_seq = journal_last_seq(j); - end_seq = le64_to_cpu(list_last_entry(list, - struct journal_replay, list)->j.seq); - - list_for_each_entry(i, list, list) { - struct jset_entry *entry; - struct bkey_i *k, *_n; - bool blacklisted; - - mutex_lock(&j->blacklist_lock); - while (cur_seq < le64_to_cpu(i->j.seq) && - bch2_journal_seq_blacklist_find(j, cur_seq)) - cur_seq++; - - blacklisted = bch2_journal_seq_blacklist_find(j, - le64_to_cpu(i->j.seq)); - mutex_unlock(&j->blacklist_lock); - - fsck_err_on(blacklisted, c, - "found blacklisted journal entry %llu", - 
le64_to_cpu(i->j.seq)); - - fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c, - "journal entries %llu-%llu missing! (replaying %llu-%llu)", - cur_seq, le64_to_cpu(i->j.seq) - 1, - journal_last_seq(j), end_seq); - - cur_seq = le64_to_cpu(i->j.seq) + 1; for_each_jset_key(k, _n, entry, &i->j) keys++; entries++; } - bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", - keys, entries, journal_cur_seq(j)); + if (!list_empty(list)) { + i = list_last_entry(list, struct journal_replay, list); + + bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", + keys, entries, le64_to_cpu(i->j.seq)); + } fsck_err: return ret; } @@ -876,8 +788,9 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) * but - there are other correctness issues if btree gc were to run * before journal replay finishes */ + BUG_ON(c->gc_pos.phase); + bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size), - gc_pos_btree_node(iter->l[0].b), NULL, 0, 0); bch2_trans_exit(&trans); diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h index ec7b49b8..74dd57af 100644 --- a/libbcachefs/journal_io.h +++ b/libbcachefs/journal_io.h @@ -34,7 +34,6 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ vstruct_for_each_safe(entry, k, _n) -int bch2_journal_set_seq(struct bch_fs *c, u64, u64); int bch2_journal_read(struct bch_fs *, struct list_head *); void bch2_journal_entries_free(struct list_head *); int bch2_journal_replay(struct bch_fs *, struct list_head *); diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c index 5bac41cf..2b71d066 100644 --- a/libbcachefs/journal_seq_blacklist.c +++ b/libbcachefs/journal_seq_blacklist.c @@ -1,12 +1,9 @@ #include "bcachefs.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "error.h" -#include "journal.h" -#include "journal_io.h" -#include "journal_reclaim.h" +#include "btree_iter.h" +#include "eytzinger.h" #include "journal_seq_blacklist.h" +#include "super-io.h" /* * journal_seq_blacklist machinery: @@ -36,327 +33,285 @@ * record that it was blacklisted so that a) on recovery we don't think we have * missing journal entries and b) so that the btree code continues to ignore * that bset, until that btree node is rewritten. - * - * Blacklisted journal sequence numbers are themselves recorded as entries in - * the journal. 
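 *
 * With this patch the records move out of the journal itself:
 * blacklisted ranges are stored in a superblock field
 * (bch_sb_field_journal_seq_blacklist), loaded at mount into an
 * in-memory lookup table (c->journal_seq_blacklist_table), and dropped
 * again by a gc work item once no btree node still carries a
 * blacklisted seq. The one consumer-facing query is
 *
 *	bch2_journal_seq_is_blacklisted(c, seq, true)
 *
 * which answers "was this seq ever committed?" and, with dirty = true,
 * records that something still references it.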
*/ -/* - * Called when journal needs to evict a blacklist entry to reclaim space: find - * any btree nodes that refer to the blacklist journal sequence numbers, and - * rewrite them: - */ -static void journal_seq_blacklist_flush(struct journal *j, - struct journal_entry_pin *pin, u64 seq) +static unsigned +blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) { - struct bch_fs *c = - container_of(j, struct bch_fs, journal); - struct journal_seq_blacklist *bl = - container_of(pin, struct journal_seq_blacklist, pin); - struct blacklisted_node n; - struct closure cl; - unsigned i; - int ret; - - closure_init_stack(&cl); - - for (i = 0;; i++) { - struct btree_trans trans; - struct btree_iter *iter; - struct btree *b; - - bch2_trans_init(&trans, c); - - mutex_lock(&j->blacklist_lock); - if (i >= bl->nr_entries) { - mutex_unlock(&j->blacklist_lock); - break; - } - n = bl->entries[i]; - mutex_unlock(&j->blacklist_lock); - - iter = bch2_trans_get_node_iter(&trans, n.btree_id, n.pos, - 0, 0, 0); - - b = bch2_btree_iter_peek_node(iter); - - /* The node might have already been rewritten: */ - - if (b->data->keys.seq == n.seq) { - ret = bch2_btree_node_rewrite(c, iter, n.seq, 0); - if (ret) { - bch2_trans_exit(&trans); - bch2_fs_fatal_error(c, - "error %i rewriting btree node with blacklisted journal seq", - ret); - bch2_journal_halt(j); - return; - } - } - - bch2_trans_exit(&trans); - } - - for (i = 0;; i++) { - struct btree_update *as; - struct pending_btree_node_free *d; - - mutex_lock(&j->blacklist_lock); - if (i >= bl->nr_entries) { - mutex_unlock(&j->blacklist_lock); - break; - } - n = bl->entries[i]; - mutex_unlock(&j->blacklist_lock); -redo_wait: - mutex_lock(&c->btree_interior_update_lock); - - /* - * Is the node on the list of pending interior node updates - - * being freed? If so, wait for that to finish: - */ - for_each_pending_btree_node_free(c, as, d) - if (n.seq == d->seq && - n.btree_id == d->btree_id && - !d->level && - !bkey_cmp(n.pos, d->key.k.p)) { - closure_wait(&as->wait, &cl); - mutex_unlock(&c->btree_interior_update_lock); - closure_sync(&cl); - goto redo_wait; - } - - mutex_unlock(&c->btree_interior_update_lock); - } - - mutex_lock(&j->blacklist_lock); - - bch2_journal_pin_drop(j, &bl->pin); - list_del(&bl->list); - kfree(bl->entries); - kfree(bl); - - mutex_unlock(&j->blacklist_lock); + return bl + ? 
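/*
 * Size bookkeeping for the superblock field, for reference: sb field
 * sizes are counted in u64s, and (assuming the usual 8-byte struct
 * bch_sb_field header and 16-byte entries)
 *
 *	sb_blacklist_u64s(nr) = (8 + nr * 16) / 8 = 1 + 2 * nr
 *
 * so e.g. three entries occupy 7 u64s. blacklist_nr_entries() here is
 * the inverse: it takes the byte span from &bl->start[0] to
 * vstruct_end() and divides by sizeof(struct journal_seq_blacklist_entry).
 */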
((vstruct_end(&bl->field) - (void *) &bl->start[0]) / + sizeof(struct journal_seq_blacklist_entry)) + : 0; } -/* - * Determine if a particular sequence number is blacklisted - if so, return - * blacklist entry: - */ -struct journal_seq_blacklist * -bch2_journal_seq_blacklist_find(struct journal *j, u64 seq) +static unsigned sb_blacklist_u64s(unsigned nr) { - struct journal_seq_blacklist *bl; + struct bch_sb_field_journal_seq_blacklist *bl; - lockdep_assert_held(&j->blacklist_lock); + return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); +} - list_for_each_entry(bl, &j->seq_blacklist, list) - if (seq >= bl->start && seq <= bl->end) - return bl; +static struct bch_sb_field_journal_seq_blacklist * +blacklist_entry_try_merge(struct bch_fs *c, + struct bch_sb_field_journal_seq_blacklist *bl, + unsigned i) +{ + unsigned nr = blacklist_nr_entries(bl); + + if (le64_to_cpu(bl->start[i].end) >= + le64_to_cpu(bl->start[i + 1].start)) { + bl->start[i].end = bl->start[i + 1].end; + --nr; + memmove(&bl->start[i], + &bl->start[i + 1], + sizeof(bl->start[0]) * (nr - i)); + + bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + sb_blacklist_u64s(nr)); + BUG_ON(!bl); + } + + return bl; +} + +int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) +{ + struct bch_sb_field_journal_seq_blacklist *bl; + unsigned i, nr; + int ret = 0; + + mutex_lock(&c->sb_lock); + bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + nr = blacklist_nr_entries(bl); + + if (bl) { + for (i = 0; i < nr; i++) { + struct journal_seq_blacklist_entry *e = + bl->start + i; + + if (start == le64_to_cpu(e->start) && + end == le64_to_cpu(e->end)) + goto out; + + if (start <= le64_to_cpu(e->start) && + end >= le64_to_cpu(e->end)) { + e->start = cpu_to_le64(start); + e->end = cpu_to_le64(end); + + if (i + 1 < nr) + bl = blacklist_entry_try_merge(c, + bl, i); + if (i) + bl = blacklist_entry_try_merge(c, + bl, i - 1); + goto out_write_sb; + } + } + } + + bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + sb_blacklist_u64s(nr + 1)); + if (!bl) { + ret = -ENOMEM; + goto out; + } + + bl->start[nr].start = cpu_to_le64(start); + bl->start[nr].end = cpu_to_le64(end); +out_write_sb: + c->disk_sb.sb->features[0] |= + 1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3; + + ret = bch2_write_super(c); +out: + mutex_unlock(&c->sb_lock); + + return ret; +} + +static int journal_seq_blacklist_table_cmp(const void *_l, + const void *_r, size_t size) +{ + const struct journal_seq_blacklist_table_entry *l = _l; + const struct journal_seq_blacklist_table_entry *r = _r; + + return (l->start > r->start) - (l->start < r->start); +} + +bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, + bool dirty) +{ + struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; + struct journal_seq_blacklist_table_entry search = { .start = seq }; + int idx; + + if (!t) + return false; + + idx = eytzinger0_find_le(t->entries, t->nr, + sizeof(t->entries[0]), + journal_seq_blacklist_table_cmp, + &search); + if (idx < 0) + return false; + + BUG_ON(t->entries[idx].start > seq); + + if (seq >= t->entries[idx].end) + return false; + + if (dirty) + t->entries[idx].dirty = true; + return true; +} + +int bch2_blacklist_table_initialize(struct bch_fs *c) +{ + struct bch_sb_field_journal_seq_blacklist *bl = + bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + struct journal_seq_blacklist_table *t; + unsigned i, nr = blacklist_nr_entries(bl); + + BUG_ON(c->journal_seq_blacklist_table); + + if (!bl) + return 0; + + t = 
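The lookup above answers "is seq inside any blacklisted range?" by locating the entry with the greatest start that is still <= seq, then testing seq against that entry's end; the in-tree version runs that query over an eytzinger-ordered table via eytzinger0_find_le(). Purely as an illustration — bl_entry and bl_seq_is_blacklisted() are hypothetical names, and a plain sorted array stands in for the eytzinger layout — the same query as an ordinary binary search:

    /*
     * Sketch only, not the in-tree implementation: ranges are half-open
     * [start, end), sorted by start and non-overlapping, matching what
     * bch2_sb_journal_seq_blacklist_validate() enforces.
     */
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    struct bl_entry {
            uint64_t start;
            uint64_t end;
    };

    static bool bl_seq_is_blacklisted(const struct bl_entry *tbl, size_t nr,
                                      uint64_t seq)
    {
            size_t l = 0, r = nr;

            /* binary search: l ends up as the count of entries with start <= seq */
            while (l < r) {
                    size_t m = l + (r - l) / 2;

                    if (tbl[m].start <= seq)
                            l = m + 1;
                    else
                            r = m;
            }

            /* l == 0 means every start is > seq: seq cannot be blacklisted */
            return l && seq < tbl[l - 1].end;
    }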
+
+int bch2_blacklist_table_initialize(struct bch_fs *c)
+{
+	struct bch_sb_field_journal_seq_blacklist *bl =
+		bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
+	struct journal_seq_blacklist_table *t;
+	unsigned i, nr = blacklist_nr_entries(bl);
+
+	BUG_ON(c->journal_seq_blacklist_table);
+
+	if (!bl)
+		return 0;
+
+	t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr,
+		    GFP_KERNEL);
+	if (!t)
+		return -ENOMEM;
+
+	t->nr = nr;
+
+	for (i = 0; i < nr; i++) {
+		t->entries[i].start = le64_to_cpu(bl->start[i].start);
+		t->entries[i].end = le64_to_cpu(bl->start[i].end);
+	}
+
+	eytzinger0_sort(t->entries,
+			t->nr,
+			sizeof(t->entries[0]),
+			journal_seq_blacklist_table_cmp,
+			NULL);
+
+	c->journal_seq_blacklist_table = t;
+	return 0;
+}
+
+static const char *
+bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb,
+				       struct bch_sb_field *f)
+{
+	struct bch_sb_field_journal_seq_blacklist *bl =
+		field_to_type(f, journal_seq_blacklist);
+	struct journal_seq_blacklist_entry *i;
+	unsigned nr = blacklist_nr_entries(bl);
+
+	for (i = bl->start; i < bl->start + nr; i++) {
+		if (le64_to_cpu(i->start) >=
+		    le64_to_cpu(i->end))
+			return "entry start >= end";
+
+		if (i + 1 < bl->start + nr &&
+		    le64_to_cpu(i[0].end) >
+		    le64_to_cpu(i[1].start))
+			return "entries out of order";
+	}
 
 	return NULL;
 }
 
-/*
- * Allocate a new, in memory blacklist entry:
- */
-static struct journal_seq_blacklist *
-bch2_journal_seq_blacklisted_new(struct journal *j, u64 start, u64 end)
+static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out,
+						  struct bch_sb *sb,
+						  struct bch_sb_field *f)
 {
-	struct journal_seq_blacklist *bl;
+	struct bch_sb_field_journal_seq_blacklist *bl =
+		field_to_type(f, journal_seq_blacklist);
+	struct journal_seq_blacklist_entry *i;
+	unsigned nr = blacklist_nr_entries(bl);
 
-	lockdep_assert_held(&j->blacklist_lock);
+	for (i = bl->start; i < bl->start + nr; i++) {
+		if (i != bl->start)
+			pr_buf(out, " ");
 
-	/*
-	 * When we start the journal, bch2_journal_start() will skip over @seq:
-	 */
-
-	bl = kzalloc(sizeof(*bl), GFP_KERNEL);
-	if (!bl)
-		return NULL;
-
-	bl->start = start;
-	bl->end = end;
-
-	list_add_tail(&bl->list, &j->seq_blacklist);
-	return bl;
+		pr_buf(out, "%llu-%llu",
+		       le64_to_cpu(i->start),
+		       le64_to_cpu(i->end));
+	}
 }
 
-/*
- * Returns true if @seq is newer than the most recent journal entry that got
- * written, and data corresponding to @seq should be ignored - also marks @seq
- * as blacklisted so that on future restarts the corresponding data will still
- * be ignored:
- */
-int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
+const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
+	.validate	= bch2_sb_journal_seq_blacklist_validate,
+	.to_text	= bch2_sb_journal_seq_blacklist_to_text
+};
+
+void bch2_blacklist_entries_gc(struct work_struct *work)
 {
-	struct journal *j = &c->journal;
-	struct journal_seq_blacklist *bl = NULL;
-	struct blacklisted_node *n;
-	u64 journal_seq;
-	int ret = 0;
+	struct bch_fs *c = container_of(work, struct bch_fs,
+					journal_seq_blacklist_gc_work);
+	struct journal_seq_blacklist_table *t;
+	struct bch_sb_field_journal_seq_blacklist *bl;
+	struct journal_seq_blacklist_entry *src, *dst;
+	struct btree_trans trans;
+	unsigned i, nr, new_nr;
+	int ret;
 
-	if (!seq)
-		return 0;
+	bch2_trans_init(&trans, c);
 
-	spin_lock(&j->lock);
-	journal_seq = journal_cur_seq(j);
-	spin_unlock(&j->lock);
+	for (i = 0; i < BTREE_ID_NR; i++) {
+		struct btree_iter *iter;
+		struct btree *b;
 
-	/* Interier updates aren't journalled: */
-	BUG_ON(b->level);
-	BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags));
-
-	/*
-	 * Decrease this back to j->seq + 2 when we next rev the on disk format:
-	 * increasing it temporarily to work around bug in old kernels
-	 */
-	fsck_err_on(seq > journal_seq + 4, c,
-		    "bset journal seq too far in the future: %llu > %llu",
-		    seq, journal_seq);
-
-	if (seq <= journal_seq &&
-	    list_empty_careful(&j->seq_blacklist))
-		return 0;
-
-	mutex_lock(&j->blacklist_lock);
-
-	if (seq <= journal_seq) {
-		bl = bch2_journal_seq_blacklist_find(j, seq);
-		if (!bl)
-			goto out;
-	} else {
-		bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting",
-			    b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq);
-
-		if (!j->new_blacklist) {
-			j->new_blacklist = bch2_journal_seq_blacklisted_new(j,
-						journal_seq + 1,
-						journal_seq + 1);
-			if (!j->new_blacklist) {
-				ret = -ENOMEM;
-				goto out;
+		for_each_btree_node(&trans, iter, i, POS_MIN,
+				    BTREE_ITER_PREFETCH, b)
+			if (test_bit(BCH_FS_STOPPING, &c->flags)) {
+				bch2_trans_exit(&trans);
+				return;
 			}
-		}
-		bl = j->new_blacklist;
-		bl->end = max(bl->end, seq);
+		bch2_trans_iter_free(&trans, iter);
 	}
 
-	for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
-		if (b->data->keys.seq == n->seq &&
-		    b->btree_id == n->btree_id &&
-		    !bkey_cmp(b->key.k.p, n->pos))
-			goto found_entry;
-
-	if (!bl->nr_entries ||
-	    is_power_of_2(bl->nr_entries)) {
-		n = krealloc(bl->entries,
-			     max_t(size_t, bl->nr_entries * 2, 8) * sizeof(*n),
-			     GFP_KERNEL);
-		if (!n) {
-			ret = -ENOMEM;
-			goto out;
-		}
-		bl->entries = n;
-	}
-
-	bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
-		.seq		= b->data->keys.seq,
-		.btree_id	= b->btree_id,
-		.pos		= b->key.k.p,
-	};
-found_entry:
-	ret = 1;
-out:
-fsck_err:
-	mutex_unlock(&j->blacklist_lock);
-	return ret;
-}
-
-static int __bch2_journal_seq_blacklist_read(struct journal *j,
-					     struct journal_replay *i,
-					     u64 start, u64 end)
-{
-	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	struct journal_seq_blacklist *bl;
-
-	bch_verbose(c, "blacklisting existing journal seq %llu-%llu",
-		    start, end);
-
-	bl = bch2_journal_seq_blacklisted_new(j, start, end);
-	if (!bl)
-		return -ENOMEM;
-
-	bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin,
-			     journal_seq_blacklist_flush);
-	return 0;
-}
-
-/*
- * After reading the journal, find existing journal seq blacklist entries and
- * read them into memory:
- */
-int bch2_journal_seq_blacklist_read(struct journal *j,
-				    struct journal_replay *i)
-{
-	struct jset_entry *entry;
-	int ret = 0;
-
-	vstruct_for_each(&i->j, entry) {
-		switch (entry->type) {
-		case BCH_JSET_ENTRY_blacklist: {
-			struct jset_entry_blacklist *bl_entry =
-				container_of(entry, struct jset_entry_blacklist, entry);
-
-			ret = __bch2_journal_seq_blacklist_read(j, i,
-					le64_to_cpu(bl_entry->seq),
-					le64_to_cpu(bl_entry->seq));
-			break;
-		}
-		case BCH_JSET_ENTRY_blacklist_v2: {
-			struct jset_entry_blacklist_v2 *bl_entry =
-				container_of(entry, struct jset_entry_blacklist_v2, entry);
-
-			ret = __bch2_journal_seq_blacklist_read(j, i,
-					le64_to_cpu(bl_entry->start),
-					le64_to_cpu(bl_entry->end));
-			break;
-		}
-		}
-
-		if (ret)
-			break;
-	}
-
-	return ret;
-}
-
-/*
- * After reading the journal and walking the btree, we might have new journal
- * sequence numbers to blacklist - add entries to the next journal entry to be
- * written:
- */
-void bch2_journal_seq_blacklist_write(struct journal *j)
-{
-	struct journal_seq_blacklist *bl = j->new_blacklist;
-	struct jset_entry_blacklist_v2 *bl_entry;
-	struct jset_entry *entry;
-
-	if (!bl)
+	ret = bch2_trans_exit(&trans);
+	if (ret)
 		return;
 
-	entry = bch2_journal_add_entry_noreservation(journal_cur_buf(j),
-			(sizeof(*bl_entry) - sizeof(*entry)) / sizeof(u64));
+	mutex_lock(&c->sb_lock);
+	bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
+	if (!bl)
+		goto out;
 
-	bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
-	bl_entry->entry.type	= BCH_JSET_ENTRY_blacklist_v2;
-	bl_entry->start		= cpu_to_le64(bl->start);
-	bl_entry->end		= cpu_to_le64(bl->end);
+	nr = blacklist_nr_entries(bl);
+	dst = bl->start;
 
-	bch2_journal_pin_add(j,
-			     journal_cur_seq(j),
-			     &bl->pin,
-			     journal_seq_blacklist_flush);
+	t = c->journal_seq_blacklist_table;
+	BUG_ON(nr != t->nr);
 
-	j->new_blacklist = NULL;
+	for (src = bl->start, i = eytzinger0_first(t->nr);
+	     src < bl->start + nr;
+	     src++, i = eytzinger0_next(i, nr)) {
+		BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
+		BUG_ON(t->entries[i].end != le64_to_cpu(src->end));
+
+		if (t->entries[i].dirty)
+			*dst++ = *src;
+	}
+
+	new_nr = dst - bl->start;
+
+	bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr);
+
+	if (new_nr != nr) {
+		bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
+				new_nr ? sb_blacklist_u64s(new_nr) : 0);
+		BUG_ON(new_nr && !bl);
+
+		if (!new_nr)
+			c->disk_sb.sb->features[0] &=
+				~(1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3);
+
+		bch2_write_super(c);
+	}
+out:
+	mutex_unlock(&c->sb_lock);
 }
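bch2_blacklist_entries_gc() above walks the sorted superblock array and the eytzinger table in lockstep, which only works because eytzinger0_first()/eytzinger0_next() visit the BFS-ordered array's slots in ascending key order. Those helpers come from eytzinger.h; purely as a sketch of what such an in-order traversal does for a zero-based layout (root at index 0, children of i at 2i+1 and 2i+2 — an assumption stated here, not taken from this patch), an equivalent walk can be written as:

    #include <stddef.h>

    /* leftmost node = smallest element: keep taking left children */
    static size_t eytz0_first(size_t nr)
    {
            size_t i = 0;

            while (2 * i + 1 < nr)
                    i = 2 * i + 1;
            return i;
    }

    /* in-order successor; returns nr when the iteration is done */
    static size_t eytz0_next(size_t i, size_t nr)
    {
            if (2 * i + 2 < nr) {           /* right subtree: take its leftmost node */
                    i = 2 * i + 2;
                    while (2 * i + 1 < nr)
                            i = 2 * i + 1;
                    return i;
            }

            while (i && !(i & 1))           /* climb while we are a right child */
                    i = (i - 1) / 2;

            return i ? (i - 1) / 2 : nr;    /* parent of a left child, or done */
    }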
diff --git a/libbcachefs/journal_seq_blacklist.h b/libbcachefs/journal_seq_blacklist.h
index 95ea6e90..b1ad591d 100644
--- a/libbcachefs/journal_seq_blacklist.h
+++ b/libbcachefs/journal_seq_blacklist.h
@@ -1,13 +1,12 @@
 #ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
 #define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
 
-struct journal_replay;
+bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool);
+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64);
+int bch2_blacklist_table_initialize(struct bch_fs *);
 
-struct journal_seq_blacklist *
-bch2_journal_seq_blacklist_find(struct journal *, u64);
-int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *);
-int bch2_journal_seq_blacklist_read(struct journal *,
-				    struct journal_replay *);
-void bch2_journal_seq_blacklist_write(struct journal *);
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist;
+
+void bch2_blacklist_entries_gc(struct work_struct *);
 
 #endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 4685cf67..922fb5ca 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -53,24 +53,6 @@ struct journal_entry_pin {
 	u64			seq;
 };
 
-/* corresponds to a btree node with a blacklisted bset: */
-struct blacklisted_node {
-	__le64			seq;
-	enum btree_id		btree_id;
-	struct bpos		pos;
-};
-
-struct journal_seq_blacklist {
-	struct list_head	list;
-	u64			start;
-	u64			end;
-
-	struct journal_entry_pin pin;
-
-	struct blacklisted_node	*entries;
-	size_t			nr_entries;
-};
-
 struct journal_res {
 	bool			ref;
 	u8			idx;
@@ -221,10 +203,6 @@ struct journal {
 
 	u64			replay_journal_seq;
 
-	struct mutex		blacklist_lock;
-	struct list_head	seq_blacklist;
-	struct journal_seq_blacklist *new_blacklist;
-
 	struct write_point	wp;
 	spinlock_t		err_lock;
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index d6890824..12d33119 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -208,7 +208,8 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
 	up_read(&ca->bucket_lock);
 
 	if (sectors_not_moved && !ret)
-		bch_warn(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved",
+		bch_warn_ratelimited(c,
+			"copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved",
 			 sectors_not_moved, sectors_to_move,
 			 buckets_not_moved, buckets_to_move);
 
diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c
index b988a565..5e7df0bb 100644
--- a/libbcachefs/quota.c
+++ b/libbcachefs/quota.c
@@ -457,7 +457,7 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
 {
 	struct bch_fs *c = sb->s_fs_info;
 
-	if (sb->s_flags & MS_RDONLY)
+	if (sb->s_flags & SB_RDONLY)
 		return -EROFS;
 
 	/* Accounting must be enabled at mount time: */
@@ -494,7 +494,7 @@ static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
 {
 	struct bch_fs *c = sb->s_fs_info;
 
-	if (sb->s_flags & MS_RDONLY)
+	if (sb->s_flags & SB_RDONLY)
 		return -EROFS;
 
 	mutex_lock(&c->sb_lock);
@@ -518,7 +518,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
 	struct bch_fs *c = sb->s_fs_info;
 	int ret;
 
-	if (sb->s_flags & MS_RDONLY)
+	if (sb->s_flags & SB_RDONLY)
 		return -EROFS;
 
 	if (uflags & FS_USER_QUOTA) {
@@ -600,7 +600,7 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
 	struct bch_sb_field_quota *sb_quota;
 	struct bch_memquota_type *q;
 
-	if (sb->s_flags & MS_RDONLY)
+	if (sb->s_flags & SB_RDONLY)
 		return -EROFS;
 
 	if (type >= QTYP_NR)
@@ -719,7 +719,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
 	struct bkey_i_quota new_quota;
 	int ret;
 
-	if (sb->s_flags & MS_RDONLY)
+	if (sb->s_flags & SB_RDONLY)
 		return -EROFS;
 
 	bkey_quota_init(&new_quota.k_i);
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 00161e05..a5651a9c 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -11,6 +11,7 @@
 #include "error.h"
 #include "fsck.h"
 #include "journal_io.h"
+#include "journal_seq_blacklist.h"
 #include "quota.h"
 #include "recovery.h"
 #include "replicas.h"
@@ -51,6 +52,118 @@ found:
 	return k;
 }
 
+static int verify_superblock_clean(struct bch_fs *c,
+				   struct bch_sb_field_clean **cleanp,
+				   struct jset *j)
+{
+	unsigned i;
+	struct bch_sb_field_clean *clean = *cleanp;
+	int ret = 0;
+
+	if (!clean || !j)
+		return 0;
+
+	if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
+			"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
+			le64_to_cpu(clean->journal_seq),
+			le64_to_cpu(j->seq))) {
+		kfree(clean);
+		*cleanp = NULL;
+		return 0;
+	}
+
+	mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
+			"superblock read clock doesn't match journal after clean shutdown");
+	mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
+			"superblock write clock doesn't match journal after clean shutdown");
+
+	for (i = 0; i < BTREE_ID_NR; i++) {
+		struct bkey_i *k1, *k2;
+		unsigned l1 = 0, l2 = 0;
+
+		k1 = btree_root_find(c, clean, NULL, i, &l1);
+		k2 = btree_root_find(c, NULL, j, i, &l2);
+
+		if (!k1 && !k2)
+			continue;
+
+		mustfix_fsck_err_on(!k1 || !k2 ||
+				    IS_ERR(k1) ||
+				    IS_ERR(k2) ||
+				    k1->k.u64s != k2->k.u64s ||
+				    memcmp(k1, k2, bkey_bytes(k1)) ||
+				    l1 != l2, c,
+			"superblock btree root doesn't match journal after clean shutdown");
+	}
+fsck_err:
+	return ret;
+}
+
+static int
+verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
+						  struct list_head *journal)
+{
+	struct journal_replay *i =
+		list_last_entry(journal, struct journal_replay, list);
+	u64 start_seq = le64_to_cpu(i->j.last_seq);
+	u64 end_seq = le64_to_cpu(i->j.seq);
+	u64 seq = start_seq;
+	int ret = 0;
+
+	list_for_each_entry(i, journal, list) {
+		fsck_err_on(seq != le64_to_cpu(i->j.seq), c,
+			"journal entries %llu-%llu missing! (replaying %llu-%llu)",
+			seq, le64_to_cpu(i->j.seq) - 1,
+			start_seq, end_seq);
+
+		seq = le64_to_cpu(i->j.seq);
+
+		fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c,
+			    "found blacklisted journal entry %llu", seq);
+
+		do {
+			seq++;
+		} while (bch2_journal_seq_is_blacklisted(c, seq, false));
+	}
+fsck_err:
+	return ret;
+}
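The loop above advances an expected sequence number across the journal entries that were read, treating blacklisted numbers as legitimately absent and flagging anything else that is missing. A simplified, self-contained model of that invariant — journal_seqs_complete() and is_blacklisted() are hypothetical stand-ins, with bch2_journal_seq_is_blacklisted() playing the latter role in the real code — might look like:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /*
     * Sketch: seqs[] holds the sequence numbers of the journal entries
     * that were read, in ascending order. Every sequence number in the
     * range must either be present or be blacklisted.
     */
    static bool journal_seqs_complete(const uint64_t *seqs, size_t nr,
                                      bool (*is_blacklisted)(uint64_t))
    {
            uint64_t expect;

            if (!nr)
                    return true;

            expect = seqs[0];

            for (size_t i = 0; i < nr; i++) {
                    if (seqs[i] != expect)
                            return false;   /* gap: journal entries missing */
                    if (is_blacklisted(seqs[i]))
                            return false;   /* blacklisted entry shouldn't exist */

                    /* next expected seq, skipping blacklisted numbers */
                    do {
                            expect++;
                    } while (is_blacklisted(expect));
            }

            return true;
    }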
+
+static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
+{
+	struct bch_sb_field_clean *clean, *sb_clean;
+	int ret;
+
+	mutex_lock(&c->sb_lock);
+	sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
+
+	if (fsck_err_on(!sb_clean, c,
+			"superblock marked clean but clean section not present")) {
+		SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+		c->sb.clean = false;
+		mutex_unlock(&c->sb_lock);
+		return NULL;
+	}
+
+	clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
+			GFP_KERNEL);
+	if (!clean) {
+		mutex_unlock(&c->sb_lock);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	if (le16_to_cpu(c->disk_sb.sb->version) <
+	    bcachefs_metadata_version_bkey_renumber)
+		bch2_sb_clean_renumber(clean, READ);
+
+	mutex_unlock(&c->sb_lock);
+
+	return clean;
+fsck_err:
+	mutex_unlock(&c->sb_lock);
+	return ERR_PTR(ret);
+}
+
 static int journal_replay_entry_early(struct bch_fs *c,
 				      struct jset_entry *entry)
 {
@@ -100,54 +213,108 @@ static int journal_replay_entry_early(struct bch_fs *c,
 			le64_to_cpu(u->v));
 		break;
 	}
+	case BCH_JSET_ENTRY_blacklist: {
+		struct jset_entry_blacklist *bl_entry =
+			container_of(entry, struct jset_entry_blacklist, entry);
+
+		ret = bch2_journal_seq_blacklist_add(c,
+				le64_to_cpu(bl_entry->seq),
+				le64_to_cpu(bl_entry->seq) + 1);
+		break;
+	}
+	case BCH_JSET_ENTRY_blacklist_v2: {
+		struct jset_entry_blacklist_v2 *bl_entry =
+			container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+		ret = bch2_journal_seq_blacklist_add(c,
+				le64_to_cpu(bl_entry->start),
+				le64_to_cpu(bl_entry->end) + 1);
+		break;
+	}
 	}
 
 	return ret;
 }
 
-static int verify_superblock_clean(struct bch_fs *c,
-				   struct bch_sb_field_clean **cleanp,
-				   struct jset *j)
+static int journal_replay_early(struct bch_fs *c,
+				struct bch_sb_field_clean *clean,
+				struct list_head *journal)
+{
+	struct jset_entry *entry;
+	int ret;
+
+	if (clean) {
+		c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
+		c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
+
+		for (entry = clean->start;
+		     entry != vstruct_end(&clean->field);
+		     entry = vstruct_next(entry)) {
+			ret = journal_replay_entry_early(c, entry);
+			if (ret)
+				return ret;
+		}
+	} else {
+		struct journal_replay *i =
+			list_last_entry(journal, struct journal_replay, list);
+
+		c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
+		c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
+
+		list_for_each_entry(i, journal, list)
+			vstruct_for_each(&i->j, entry) {
+				ret = journal_replay_entry_early(c, entry);
+				if (ret)
+					return ret;
+			}
+	}
+
+	bch2_fs_usage_initialize(c);
+
+	return 0;
+}
+
+static int read_btree_roots(struct bch_fs *c)
 {
 	unsigned i;
-	struct bch_sb_field_clean *clean = *cleanp;
 	int ret = 0;
 
-	if (!clean || !j)
-		return 0;
-
-	if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
-			"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
-			le64_to_cpu(clean->journal_seq),
-			le64_to_cpu(j->seq))) {
-		kfree(clean);
-		*cleanp = NULL;
-		return 0;
-	}
-
-	mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
-			"superblock read clock doesn't match journal after clean shutdown");
-	mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
-			"superblock read clock doesn't match journal after clean shutdown");
-
 	for (i = 0; i < BTREE_ID_NR; i++) {
-		struct bkey_i *k1, *k2;
-		unsigned l1 = 0, l2 = 0;
+		struct btree_root *r = &c->btree_roots[i];
 
-		k1 = btree_root_find(c, clean, NULL, i, &l1);
-		k2 = btree_root_find(c, NULL, j, i, &l2);
-
-		if (!k1 && !k2)
+		if (!r->alive)
 			continue;
 
-		mustfix_fsck_err_on(!k1 || !k2 ||
-				    IS_ERR(k1) ||
-				    IS_ERR(k2) ||
-				    k1->k.u64s != k2->k.u64s ||
-				    memcmp(k1, k2, bkey_bytes(k1)) ||
-				    l1 != l2, c,
-			"superblock btree root doesn't match journal after clean shutdown");
+		if (i == BTREE_ID_ALLOC &&
+		    test_reconstruct_alloc(c)) {
+			c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+			continue;
+		}
+
+		if (r->error) {
+			__fsck_err(c, i == BTREE_ID_ALLOC
+				   ? FSCK_CAN_IGNORE : 0,
+				   "invalid btree root %s",
+				   bch2_btree_ids[i]);
+			if (i == BTREE_ID_ALLOC)
+				c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+		}
+
+		ret = bch2_btree_root_read(c, i, &r->key, r->level);
+		if (ret) {
+			__fsck_err(c, i == BTREE_ID_ALLOC
+				   ? FSCK_CAN_IGNORE : 0,
+				   "error reading btree root %s",
+				   bch2_btree_ids[i]);
+			if (i == BTREE_ID_ALLOC)
+				c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+		}
 	}
+
+	for (i = 0; i < BTREE_ID_NR; i++)
+		if (!c->btree_roots[i].b)
+			bch2_btree_root_alloc(c, i);
 fsck_err:
 	return ret;
 }
@@ -185,119 +352,82 @@ static bool journal_empty(struct list_head *journal)
 int bch2_fs_recovery(struct bch_fs *c)
 {
 	const char *err = "cannot allocate memory";
-	struct bch_sb_field_clean *clean = NULL, *sb_clean = NULL;
-	struct jset_entry *entry;
+	struct bch_sb_field_clean *clean = NULL;
+	u64 journal_seq;
 	LIST_HEAD(journal);
-	struct jset *j = NULL;
-	unsigned i;
-	bool run_gc = c->opts.fsck ||
-		!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO));
 	int ret;
 
-	mutex_lock(&c->sb_lock);
+	if (c->sb.clean)
+		clean = read_superblock_clean(c);
+	ret = PTR_ERR_OR_ZERO(clean);
+	if (ret)
+		goto err;
+
+	if (c->sb.clean)
+		bch_info(c, "recovering from clean shutdown, journal seq %llu",
+			 le64_to_cpu(clean->journal_seq));
+
 	if (!c->replicas.entries) {
 		bch_info(c, "building replicas info");
 		set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
 	}
 
-	if (c->sb.clean)
-		sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
-	if (sb_clean) {
-		clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
-				GFP_KERNEL);
-		if (!clean) {
-			ret = -ENOMEM;
-			mutex_unlock(&c->sb_lock);
-			goto err;
-		}
+	if (!c->sb.clean || c->opts.fsck) {
+		struct jset *j;
 
-		if (le16_to_cpu(c->disk_sb.sb->version) <
-		    bcachefs_metadata_version_bkey_renumber)
-			bch2_sb_clean_renumber(clean, READ);
-	}
-	mutex_unlock(&c->sb_lock);
-
-	if (clean)
-		bch_info(c, "recovering from clean shutdown, journal seq %llu",
-			 le64_to_cpu(clean->journal_seq));
-
-	if (!clean || c->opts.fsck) {
 		ret = bch2_journal_read(c, &journal);
 		if (ret)
 			goto err;
 
-		j = &list_entry(journal.prev, struct journal_replay, list)->j;
+		fsck_err_on(c->sb.clean && !journal_empty(&journal), c,
+			    "filesystem marked clean but journal not empty");
+
+		if (!c->sb.clean && list_empty(&journal)) {
+			bch_err(c, "no journal entries found");
+			ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
+			goto err;
+		}
+
+		j = &list_last_entry(&journal, struct journal_replay, list)->j;
+
+		ret = verify_superblock_clean(c, &clean, j);
+		if (ret)
+			goto err;
+
+		journal_seq = le64_to_cpu(j->seq) + 1;
 	} else {
-		ret = bch2_journal_set_seq(c,
-					   le64_to_cpu(clean->journal_seq),
-					   le64_to_cpu(clean->journal_seq));
-		BUG_ON(ret);
+		journal_seq = le64_to_cpu(clean->journal_seq) + 1;
 	}
 
-	ret = verify_superblock_clean(c, &clean, j);
+	ret = journal_replay_early(c, clean, &journal);
 	if (ret)
 		goto err;
 
-	fsck_err_on(clean && !journal_empty(&journal), c,
-		    "filesystem marked clean but journal not empty");
-
-	err = "insufficient memory";
-	if (clean) {
-		c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
-		c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
-
-		for (entry = clean->start;
-		     entry != vstruct_end(&clean->field);
-		     entry = vstruct_next(entry)) {
-			ret = journal_replay_entry_early(c, entry);
-			if (ret)
-				goto err;
-		}
-	} else {
-		struct journal_replay *i;
-
-		c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock);
-		c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock);
-
-		list_for_each_entry(i, &journal, list)
-			vstruct_for_each(&i->j, entry) {
-				ret = journal_replay_entry_early(c, entry);
-				if (ret)
-					goto err;
-			}
-	}
-
-	bch2_fs_usage_initialize(c);
-
-	for (i = 0; i < BTREE_ID_NR; i++) {
-		struct btree_root *r = &c->btree_roots[i];
-
-		if (!r->alive)
-			continue;
-
-		err = "invalid btree root pointer";
-		ret = -1;
-		if (r->error)
-			goto err;
-
-		if (i == BTREE_ID_ALLOC &&
-		    test_reconstruct_alloc(c))
-			continue;
-
-		err = "error reading btree root";
-		ret = bch2_btree_root_read(c, i, &r->key, r->level);
+	if (!c->sb.clean) {
+		ret = bch2_journal_seq_blacklist_add(c,
+						     journal_seq,
+						     journal_seq + 4);
 		if (ret) {
-			if (i != BTREE_ID_ALLOC)
-				goto err;
-
-			mustfix_fsck_err(c, "error reading btree root");
-			run_gc = true;
+			bch_err(c, "error creating new journal seq blacklist entry");
+			goto err;
 		}
+
+		journal_seq += 4;
 	}
 
-	for (i = 0; i < BTREE_ID_NR; i++)
-		if (!c->btree_roots[i].b)
-			bch2_btree_root_alloc(c, i);
+	ret = bch2_blacklist_table_initialize(c);
+
+	ret = verify_journal_entries_not_blacklisted_or_missing(c, &journal);
+	if (ret)
+		goto err;
+
+	ret = bch2_fs_journal_start(&c->journal, journal_seq, &journal);
+	if (ret)
+		goto err;
+
+	ret = read_btree_roots(c);
+	if (ret)
+		goto err;
 
 	err = "error reading allocation information";
 	ret = bch2_alloc_read(c, &journal);
@@ -312,10 +442,12 @@ int bch2_fs_recovery(struct bch_fs *c)
 
 	set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
 
-	if (run_gc) {
+	if (c->opts.fsck ||
+	    !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) ||
+	    test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
 		bch_verbose(c, "starting mark and sweep:");
 		err = "error in recovery";
-		ret = bch2_gc(c, &journal, true);
+		ret = bch2_gc(c, &journal, true, false);
 		if (ret)
 			goto err;
 		bch_verbose(c, "mark and sweep done");
@@ -334,13 +466,6 @@ int bch2_fs_recovery(struct bch_fs *c)
 	if (c->opts.noreplay)
 		goto out;
 
-	/*
-	 * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
-	 * will give spurious errors about oldest_gen > bucket_gen -
-	 * this is a hack but oh well.
-	 */
-	bch2_fs_journal_start(&c->journal);
-
 	bch_verbose(c, "starting journal replay:");
 	err = "journal replay failed";
 	ret = bch2_journal_replay(c, &journal);
@@ -356,6 +481,14 @@ int bch2_fs_recovery(struct bch_fs *c)
 	if (ret)
 		goto err;
 
+	if (enabled_qtypes(c)) {
+		bch_verbose(c, "reading quotas:");
+		ret = bch2_fs_quota_read(c);
+		if (ret)
+			goto err;
+		bch_verbose(c, "quotas done");
+	}
+
 	mutex_lock(&c->sb_lock);
 	if (c->opts.version_upgrade) {
 		if (c->sb.version < bcachefs_metadata_version_new_versioning)
@@ -371,14 +504,9 @@ int bch2_fs_recovery(struct bch_fs *c)
 	}
 	mutex_unlock(&c->sb_lock);
 
-	if (enabled_qtypes(c)) {
-		bch_verbose(c, "reading quotas:");
-		ret = bch2_fs_quota_read(c);
-		if (ret)
-			goto err;
-		bch_verbose(c, "quotas done");
-	}
-
+	if (c->journal_seq_blacklist_table &&
+	    c->journal_seq_blacklist_table->nr > 128)
+		queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
 out:
 	bch2_journal_entries_free(&journal);
 	kfree(clean);
@@ -427,7 +555,7 @@ int bch2_fs_initialize(struct bch_fs *c)
 	 * journal_res_get() will crash if called before this has
 	 * set up the journal.pin FIFO and journal.cur pointer:
 	 */
-	bch2_fs_journal_start(&c->journal);
+	bch2_fs_journal_start(&c->journal, 1, &journal);
 	bch2_journal_set_replay_done(&c->journal);
 
 	err = "error going read write";
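A detail of journal_replay_entry_early() above worth noting: both legacy journal entry formats are normalized to the half-open [start, end) convention used by the new superblock field — a v1 entry blacklists the single sequence number it carries, a v2 entry an inclusive range. A minimal sketch of that mapping (bl_range and the helpers are hypothetical names, not from this patch):

    #include <stdint.h>

    /* half-open range, matching how the superblock entries are interpreted */
    struct bl_range {
            uint64_t start;
            uint64_t end;
    };

    /* v1 entries carry a single blacklisted sequence number */
    static struct bl_range from_v1(uint64_t seq)
    {
            return (struct bl_range){ .start = seq, .end = seq + 1 };
    }

    /* v2 entries carry an inclusive [start, end] range */
    static struct bl_range from_v2(uint64_t start, uint64_t end_inclusive)
    {
            return (struct bl_range){ .start = start, .end = end_inclusive + 1 };
    }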
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index bcb6e3fb..83c74af4 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -6,6 +6,7 @@
 #include "error.h"
 #include "io.h"
 #include "journal.h"
+#include "journal_seq_blacklist.h"
 #include "replicas.h"
 #include "quota.h"
 #include "super-io.h"
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 7748dafb..9dc201ab 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -29,6 +29,7 @@
 #include "io.h"
 #include "journal.h"
 #include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
 #include "move.h"
 #include "migrate.h"
 #include "movinggc.h"
@@ -499,6 +500,7 @@ static void bch2_fs_free(struct bch_fs *c)
 	kfree(c->replicas.entries);
 	kfree(c->replicas_gc.entries);
 	kfree(rcu_dereference_protected(c->disk_groups, 1));
+	kfree(c->journal_seq_blacklist_table);
 
 	if (c->journal_reclaim_wq)
 		destroy_workqueue(c->journal_reclaim_wq);
@@ -527,6 +529,10 @@ void bch2_fs_stop(struct bch_fs *c)
 
 	bch_verbose(c, "shutting down");
 
+	set_bit(BCH_FS_STOPPING, &c->flags);
+
+	cancel_work_sync(&c->journal_seq_blacklist_gc_work);
+
 	for_each_member_device(ca, c, i)
 		if (ca->kobj.state_in_sysfs &&
 		    ca->disk_sb.bdev)
@@ -663,6 +669,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	spin_lock_init(&c->btree_write_error_lock);
 	INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work);
 
+	INIT_WORK(&c->journal_seq_blacklist_gc_work,
+		  bch2_blacklist_entries_gc);
+
 	INIT_LIST_HEAD(&c->fsck_errors);
 	mutex_init(&c->fsck_error_lock);
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index f9731513..7069bea5 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -496,7 +496,7 @@ STORE(__bch2_fs)
 		bch2_coalesce(c);
 
 	if (attr == &sysfs_trigger_gc)
-		bch2_gc(c, NULL, false);
+		bch2_gc(c, NULL, false, false);
 
 	if (attr == &sysfs_trigger_alloc_write) {
 		bool wrote;
diff --git a/linux/crypto/chacha20_generic.c b/linux/crypto/chacha20_generic.c
index c6f14945..914189e7 100644
--- a/linux/crypto/chacha20_generic.c
+++ b/linux/crypto/chacha20_generic.c
@@ -17,7 +17,7 @@
 #include
 #include
 
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
 #include
 #include
 
@@ -36,7 +36,7 @@ static int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
 		container_of(tfm, struct chacha20_tfm, tfm);
 	int i;
 
-	if (keysize != CHACHA20_KEY_SIZE)
+	if (keysize != CHACHA_KEY_SIZE)
 		return -EINVAL;
 
 	for (i = 0; i < ARRAY_SIZE(ctx->key); i++)
@@ -72,8 +72,8 @@ static int crypto_chacha20_crypt(struct skcipher_request *req)
 		if (sg_is_last(sg))
 			break;
 
-		BUG_ON(sg->length % CHACHA20_BLOCK_SIZE);
-		iv[0] += sg->length / CHACHA20_BLOCK_SIZE;
+		BUG_ON(sg->length % CHACHA_BLOCK_SIZE);
+		iv[0] += sg->length / CHACHA_BLOCK_SIZE;
 		sg = sg_next(sg);
 	};
 
@@ -93,8 +93,8 @@ static void *crypto_chacha20_alloc_tfm(void)
 	tfm->tfm.setkey		= crypto_chacha20_setkey;
 	tfm->tfm.encrypt	= crypto_chacha20_crypt;
 	tfm->tfm.decrypt	= crypto_chacha20_crypt;
-	tfm->tfm.ivsize		= CHACHA20_IV_SIZE;
-	tfm->tfm.keysize	= CHACHA20_KEY_SIZE;
+	tfm->tfm.ivsize		= CHACHA_IV_SIZE;
+	tfm->tfm.keysize	= CHACHA_KEY_SIZE;
 
 	return tfm;
 }
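For context on the CHACHA_BLOCK_SIZE arithmetic above: ChaCha20's first IV word acts as a 32-bit block counter, so after a segment whose length is a whole number of 64-byte blocks (which the BUG_ON enforces) the counter advances by length / CHACHA_BLOCK_SIZE. A trivial standalone illustration — advance_block_counter() is a made-up helper, not a kernel API:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    #define CHACHA_BLOCK_SIZE 64

    /*
     * iv[0] is the 32-bit block counter; a segment's length, guaranteed
     * by the caller to be a multiple of the 64-byte block size, advances
     * it by the number of blocks consumed.
     */
    static void advance_block_counter(uint32_t iv[4], size_t seg_len)
    {
            assert(seg_len % CHACHA_BLOCK_SIZE == 0);
            iv[0] += (uint32_t)(seg_len / CHACHA_BLOCK_SIZE);
    }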